added youtube as music src

This commit is contained in:
Lars Noack
2022-10-20 15:13:19 +02:00
parent 851f79b424
commit 609041c50f
10 changed files with 176 additions and 68 deletions

Binary file not shown.

Binary file not shown.

View File

@@ -3,9 +3,11 @@ import os.path
import pandas as pd
from mutagen.easyid3 import EasyID3
import json
import logging
import musify
import youtube_music
"""
https://en.wikipedia.org/wiki/ID3
https://mutagen.readthedocs.io/en/latest/user/id3.html
@@ -87,15 +89,25 @@ class Download:
for idx, row in self.dataframe.iterrows():
row['artist'] = json.loads(row['artist'].replace("'", '"'))
self.download(row['path'], row['file'], row['url'])
if self.path_stuff(row['path'], row['file']):
continue
src = row['src']
if src == 'musify':
self.download_from_musify(row['path'], row['file'], row['url'])
elif src == 'youtube':
youtube_music.download(row)
self.write_metadata(row, row['file'])
def download(self, path, file, url):
if os.path.exists(file):
logging.info(f"'{file}' does already exist, thus not downloading.")
return
def path_stuff(self, path: str, file_: str) -> bool:
    """Decide whether a track must be downloaded and prepare its folder.

    :param path: directory the track file is going to live in.
    :param file_: full path of the track file.
    :return: ``True`` if ``file_`` already exists (the download should be
        skipped), ``False`` otherwise. In the ``False`` case ``path`` has
        been created if it was missing.
    """
    if os.path.exists(file_):
        logging.info(f"'{file_}' does already exist, thus not downloading.")
        return True

    # os.makedirs("") raises FileNotFoundError. An empty directory name
    # refers to the current working directory, which needs no creation.
    if path:
        os.makedirs(path, exist_ok=True)
    return False
def download_from_musify(self, path, file, url):
logging.info(f"downloading: '{url}'")
r = self.session.get(url)
if r.status_code != 200:
@@ -118,16 +130,6 @@ class Download:
row[key] = str(row[key])
audiofile[key] = row[key]
"""
audiofile["artist"] = row['artist']
audiofile["albumartist"] = row['album_artist']
audiofile["date"] = str(row['year'])
audiofile["genre"] = row['genre']
audiofile["title"] = row['title']
audiofile["album"] = row['album']
audiofile["tracknumber"] = str(row['track'])
"""
audiofile.save()

View File

@@ -1,69 +1,47 @@
import json
import os.path
import pandas as pd
import requests
import logging
import musify
import youtube_music
class Download:
def __init__(self, metadata_csv: str = ".cache1.csv", session: requests.Session = requests.Session(),
file: str = ".cache2.csv", temp: str = "temp") -> None:
self.temp = temp
self.session = session
self.session.headers = {
"Connection": "keep-alive",
"Referer": "https://musify.club/"
}
self.metadata = pd.read_csv(os.path.join(self.temp, metadata_csv), index_col=0)
self.urls = []
missing_urls, self.urls = self.check_musify()
self.dump_urls(file)
def check_musify_track(self, row):
    """Search the musify.club suggestion endpoint for the track in *row*.

    :param row: mapping with a ``'title'`` string and an ``'artist'`` string
        holding a list literal of artist names.
    :return: the first suggestion entry whose ``'label'`` contains one of
        the artists and whose ``'url'`` contains ``"/track"``; ``None`` if
        the request did not return status 200 or nothing matched.
    """
    # NOTE(review): swapping quotes before json.loads presumably converts a
    # Python-style list repr to JSON; artist names containing an apostrophe
    # would break this. Confirm against the data that is actually stored.
    artist = json.loads(row['artist'].replace("'", '"'))
    track = row['title']

    # Let requests build the query string so characters such as '&', '#'
    # or '+' inside the title are encoded instead of corrupting the query.
    r = self.session.get(
        url="https://musify.club/search/suggestions",
        params={"term": track},
    )
    if r.status_code != 200:
        return None

    # A distinct loop name keeps the ``row`` parameter from being shadowed.
    for suggestion in r.json():
        if any(a in suggestion['label'] for a in artist) and "/track" in suggestion['url']:
            return suggestion
    return None
def check_musify(self, urls: list = []):
missing_urls = []
def get_download_link(default_url):
# https://musify.club/track/dl/18567672/rauw-alejandro-te-felicito-feat-shakira.mp3
# /track/sundenklang-wenn-mein-herz-schreit-3883217'
file_ = default_url.split("/")[-1]
musify_id = file_.split("-")[-1]
musify_name = "-".join(file_.split("-")[:-1])
logging.info(f"https://musify.club/track/dl/{musify_id}/{musify_name}.mp3")
return f"https://musify.club/track/dl/{musify_id}/{musify_name}.mp3"
for idx, row in self.metadata.iterrows():
url = self.check_musify_track(row)
if url is None:
missing_urls.append(row['id'])
row['artist'] = json.loads(row['artist'].replace("'", '"'))
# check musify
musify_url = musify.get_musify_url(row)
if musify_url is not None:
self.add_url(musify_url, 'musify', dict(row))
continue
data = dict(row)
data['url'] = get_download_link(url['url'])
urls.append(data)
return missing_urls, urls
# check youtube
youtube_url = youtube_music.get_youtube_url(row)
if youtube_url is not None:
self.add_url(youtube_url, 'youtube', dict(row))
continue
logging.warning(f"Didn't find any sources for {row['title']}")
self.dump_urls(file)
def add_url(self, url: str, src: str, row: dict):
    """Tag *row* with its download source and queue it in ``self.urls``.

    :param url: download url that was found for the track.
    :param src: name of the source the url belongs to.
    :param row: track data; it is modified in place and then appended.
    """
    row.update({'url': url, 'src': src})
    self.urls.append(row)
def dump_urls(self, file: str = ".cache2.csv"):
df = pd.DataFrame(self.urls)

35
src/musify.py Normal file
View File

@@ -0,0 +1,35 @@
import logging
import requests
# Module-wide HTTP session used for the musify.club requests in this file.
session = requests.Session()
# NOTE(review): this assignment replaces the session's header mapping with a
# plain dict instead of extending it - confirm that dropping any defaults
# requests would otherwise send is intended.
session.headers = {
"Connection": "keep-alive",
"Referer": "https://musify.club/"
}
def get_musify_url(row):
    """Look up a musify.club download link for the track described by *row*.

    :param row: mapping with a ``'title'`` string and an ``'artist'`` entry;
        the artist entry is expected to be an iterable of artist-name
        strings (presumably already parsed by the caller - confirm).
    :return: the download link built by ``get_download_link`` for the first
        suggestion whose ``'label'`` contains one of the artists and whose
        ``'url'`` contains ``"/track"``; ``None`` if the request did not
        return status 200 or nothing matched.
    """
    title = row['title']
    artists = row['artist']

    # Let requests build the query string so characters such as '&', '#'
    # or '+' inside the title are encoded instead of corrupting the query.
    r = session.get(
        url="https://musify.club/search/suggestions",
        params={"term": title},
    )
    if r.status_code != 200:
        return None

    # A distinct loop name keeps the ``row`` parameter from being shadowed.
    for suggestion in r.json():
        if any(a in suggestion['label'] for a in artists) and "/track" in suggestion['url']:
            return get_download_link(suggestion['url'])
    return None
def get_download_link(default_url):
    """Turn a musify.club track page url into its mp3 download url.

    The last path segment of a track page looks like ``<name>-<id>``, e.g.
    ``/track/sundenklang-wenn-mein-herz-schreit-3883217``. The download url
    is ``https://musify.club/track/dl/<id>/<name>.mp3``, e.g.
    ``https://musify.club/track/dl/18567672/rauw-alejandro-te-felicito-feat-shakira.mp3``.

    :param default_url: absolute or site-relative url of the track page.
    :return: the download url as a string.
    """
    # Drop trailing slashes first; otherwise ".../name-123/" would yield an
    # empty last segment and therefore an empty id and name.
    file_ = default_url.rstrip("/").split("/")[-1]

    # Everything after the last '-' is the id, everything before it the name.
    musify_name, _, musify_id = file_.rpartition("-")

    download_url = f"https://musify.club/track/dl/{musify_id}/{musify_name}.mp3"
    logging.info(download_url)
    return download_url

56
src/youtube_music.py Normal file
View File

@@ -0,0 +1,56 @@
import youtube_dl
import pandas as pd
import jellyfish
import logging
# Options passed to youtube_dl.YoutubeDL when searching by ISRC.
# NOTE(review): 'noplaylist' is the string 'True', not the boolean True -
# presumably only its truthiness matters; confirm.
YDL_OPTIONS = {'format': 'bestaudio', 'noplaylist':'True'}
# Key of the extracted video info that holds the url returned to callers.
YOUTUBE_URL_KEY = 'webpage_url'
def get_youtube_from_isrc(isrc: str):
    """Search YouTube for *isrc* and describe the first hit.

    :param isrc: the ISRC code used as the search term.
    :return: ``{'url': ..., 'title': ...}`` for the first search result, or
        ``None`` if the search produced no result or the result carries no
        ``YOUTUBE_URL_KEY`` entry.
    """
    # https://stackoverflow.com/questions/63388364/searching-youtube-videos-using-youtube-dl
    with youtube_dl.YoutubeDL(YDL_OPTIONS) as ydl:
        search_result = ydl.extract_info(f"ytsearch:{isrc}", download=False)

        # A search without hits has no first entry; indexing [0] directly
        # would raise IndexError instead of reporting "not found".
        entries = (search_result or {}).get('entries') or ()
        video = next(iter(entries), None)
        if video is None:
            return None

        logging.debug("youtube search result type: %s", type(video))

        if YOUTUBE_URL_KEY not in video:
            return None

        return {
            'url': video[YOUTUBE_URL_KEY],
            'title': video['title']
        }
def get_youtube_url(row, max_distance: int = 1):
    """Return a YouTube url for the track in *row* if the titles match.

    The video is looked up through the track's ISRC and accepted only when
    its lowercased title is close enough to the lowercased track title.

    :param row: mapping with an ``'isrc'`` and a ``'title'`` entry.
    :param max_distance: largest accepted distance between the two titles;
        the default of 1 is the threshold that was previously hard-coded.
    :return: the video url, or ``None`` if the row has no ISRC, the search
        found nothing, or the titles differ too much.
    """
    if pd.isna(row['isrc']):
        return None
    real_title = row['title'].lower()

    result = get_youtube_from_isrc(row['isrc'])
    # The lookup returns None when nothing usable was found; subscripting
    # it would raise TypeError.
    if result is None:
        return None
    video_title = result['title'].lower()

    # Despite the variable name this is the Levenshtein edit distance.
    phonetic_distance = jellyfish.levenshtein_distance(real_title, video_title)
    logging.debug("%s %s %s", real_title, video_title, phonetic_distance)

    if phonetic_distance > max_distance:
        logging.warning(f"dont downloading {result['url']} cuz the phonetic distance ({phonetic_distance}) between {real_title} and {video_title} is to high.")
        return None
    return result['url']
def download(row):
    """Fetch the audio for *row* with youtube_dl.

    :param row: mapping whose ``'url'`` entry is the video to download and
        whose ``'file'`` entry is used as the output template.
    """
    source_url, target = row['url'], row['file']

    ydl_settings = dict(
        format='bestaudio/best',
        keepvideo=False,
        outtmpl=target,
    )
    with youtube_dl.YoutubeDL(ydl_settings) as downloader:
        downloader.download([source_url])
if __name__ == "__main__":
    # Manual smoke test: look up two search terms and print what comes back.
    # example isrc that exists on youtube music
    ISRC = "DEUM71500715"
    # The second term is a random string, presumably to show the behaviour
    # for a search without a sensible match.
    for query in (ISRC, "aslhfklasdhfjklasdfjkhasdjlfhlasdjfkuuiueiw"):
        result = get_youtube_from_isrc(query)
        print(result)