From 5084534cb85d87cf98174cb3c0cdc99257a0917e Mon Sep 17 00:00:00 2001
From: lars
Date: Thu, 17 Nov 2022 13:23:27 +0100
Subject: [PATCH] continued

---
Review notes (text between the "---" line and the first "diff --git" line is
commentary only and is not part of the applied change):

 * This patch was recovered from a whitespace-collapsed copy. Added and
   removed lines follow the recovered text; the position of blank context
   lines was inferred from the hunk line counts.
   NOTE(review): confirm the context lines against the tree before applying.
 * musify.py, get_soup_of_search: the 503 retry now recurses through
   cls.get_soup_of_search; the bare name get_soup_of_search is not defined
   anywhere in this patch.
 * musify.py, get_soup_of_search / fetch_source_from_search: messages go
   through the module-level `logger` (MUSIFY_LOGGER) instead of the root
   `logging` module, and the 503 warning names musify rather than youtube.
 * musify.py, fetch_source_from_search: the query uses the whole `artist`
   string; `artist[0]` selected only its first character.
 * set_target.py: the stray print(row) is now logger.debug(row).
 * Only added lines were touched and no line was added or dropped, so the
   hunk headers and the diffstat below are unchanged. The blob ids on the
   "index" lines still describe the pre-review content.

 src/music_kraken/__init__.py                  | 10 --
 .../audio_source/sources/musify.py            | 99 ++++++++++++++++++-
 src/music_kraken/scraping/__init__.py         |  0
 .../scraping/phonetic_compares.py             | 22 -----
 src/music_kraken/target/set_target.py         |  2 +
 src/music_kraken/utils/database.py            | 60 ++++++++++-
 src/music_kraken/utils/shared.py              | 30 ++++--
 7 files changed, 175 insertions(+), 48 deletions(-)
 delete mode 100644 src/music_kraken/scraping/__init__.py
 delete mode 100644 src/music_kraken/scraping/phonetic_compares.py

diff --git a/src/music_kraken/__init__.py b/src/music_kraken/__init__.py
index 3587685..c0f0df0 100644
--- a/src/music_kraken/__init__.py
+++ b/src/music_kraken/__init__.py
@@ -26,16 +26,6 @@ from .lyrics import lyrics
 import logging
 import os
 
-# configure logger default
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s [%(levelname)s] %(message)s",
-    handlers=[
-        logging.FileHandler(os.path.join(temp_dir, LOG_FILE)),
-        logging.StreamHandler()
-    ]
-)
-
 
 def get_existing_genre():
     valid_directories = []
diff --git a/src/music_kraken/audio_source/sources/musify.py b/src/music_kraken/audio_source/sources/musify.py
index 5811638..fd3e986 100644
--- a/src/music_kraken/audio_source/sources/musify.py
+++ b/src/music_kraken/audio_source/sources/musify.py
@@ -11,6 +11,8 @@ from .source import AudioSource
 TRIES = 5
 TIMEOUT = 10
 
+logger = MUSIFY_LOGGER
+
 session = requests.Session()
 session.headers = {
     "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0",
@@ -22,23 +24,111 @@ session.proxies = proxies
 
 class Musify(AudioSource):
     @classmethod
-    def fetch_source(cls, row: dict):
+    def fetch_source(cls, row: dict) -> str | None:
         super().fetch_source(row)
 
         title = row['title']
         artists = row['artists']
 
-        url = f"https://musify.club/search/suggestions?term={artists[0]} - {title}"
+        # trying to get a download link via the autocomplete api
+        for artist in artists:
+            url = cls.fetch_source_from_autocomplete(title=title, artist=artist)
+            if url is not None:
+                logger.info(f"found download link {url}")
+                return url
+
+        # trying to get a download link via the html of the direct search page
+        for artist in artists:
+            url = cls.fetch_source_from_search(title=title, artist=artist)
+            if url is not None:
+                logger.info(f"found download link {url}")
+                return url
+
+        logger.warning(f"Didn't find the audio on {cls.__name__}")
+
+    @classmethod
+    def get_download_link(cls, track_url: str) -> str | None:
+        # https://musify.club/track/dl/18567672/rauw-alejandro-te-felicito-feat-shakira.mp3
+        # /track/sundenklang-wenn-mein-herz-schreit-3883217'
+
+        file_ = track_url.split("/")[-1]
+        if len(file_) == 0:
+            return None
+        musify_id = file_.split("-")[-1]
+        musify_name = "-".join(file_.split("-")[:-1])
+
+        return f"https://musify.club/track/dl/{musify_id}/{musify_name}.mp3"
+
+    @classmethod
+    def fetch_source_from_autocomplete(cls, title: str, artist: str) -> str | None:
+        url = f"https://musify.club/search/suggestions?term={artist} - {title}"
 
         try:
+            logger.info(f"calling {url}")
             r = session.get(url=url)
         except requests.exceptions.ConnectionError:
+            logger.info("connection error occurred")
             return None
         if r.status_code == 200:
             autocomplete = r.json()
             for row in autocomplete:
-                if any(a in row['label'] for a in artists) and "/track" in row['url']:
-                    return get_download_link(row['url'])
+                if artist in row['label'] and "/track" in row['url']:
+                    return cls.get_download_link(row['url'])
+
+        return None
+
+    @classmethod
+    def get_soup_of_search(cls, query: str, trie=0) -> bs4.BeautifulSoup | None:
+        url = f"https://musify.club/search?searchText={query}"
+        logger.debug(f"Trying to get soup from {url}")
+        r = session.get(url)
+        if r.status_code != 200:
+            if r.status_code in [503] and trie < TRIES:
+                logger.warning(f"musify blocked downloading. ({trie}-{TRIES})")
+                logger.warning(f"retrying in {TIMEOUT} seconds again")
+                time.sleep(TIMEOUT)
+                return cls.get_soup_of_search(query, trie=trie + 1)
+
+            logger.warning("too many tries, returning")
+            return None
+        return bs4.BeautifulSoup(r.content, features="html.parser")
+
+    @classmethod
+    def fetch_source_from_search(cls, title: str, artist: str) -> str | None:
+        query: str = f"{artist} - {title}"
+        search_soup = cls.get_soup_of_search(query=query)
+        if search_soup is None:
+            return None
+
+        # get the soup of the container with all track results
+        tracklist_container_soup = search_soup.find_all("div", {"class": "playlist"})
+        if len(tracklist_container_soup) == 0:
+            return None
+        if len(tracklist_container_soup) != 1:
+            logger.warning("HTML Layout of https://musify.club changed. (or bug)")
+        tracklist_container_soup = tracklist_container_soup[0]
+
+        tracklist_soup = tracklist_container_soup.find_all("div", {"class": "playlist__details"})
+
+        def parse_track_soup(_track_soup):
+            anchor_soups = _track_soup.find_all("a")
+            artist_ = anchor_soups[0].text.strip()
+            track_ = anchor_soups[1].text.strip()
+            url_ = anchor_soups[1]['href']
+            return artist_, track_, url_
+
+        # check each track in the container, if they match
+        for track_soup in tracklist_soup:
+            artist_option, title_option, track_url = parse_track_soup(track_soup)
+
+            title_match, title_distance = phonetic_compares.match_titles(title, title_option)
+            artist_match, artist_distance = phonetic_compares.match_artists(artist, artist_option)
+
+            logger.debug(f"{(title, title_option, title_match, title_distance)}")
+            logger.debug(f"{(artist, artist_option, artist_match, artist_distance)}")
+
+            if not title_match and not artist_match:
+                return cls.get_download_link(track_url)
 
         return None
 
@@ -51,7 +141,6 @@ class Musify(AudioSource):
 
         return download_from_musify(file_, url)
 
-
 def get_musify_url(row):
     title = row['title']
     artists = row['artists']
diff --git a/src/music_kraken/scraping/__init__.py b/src/music_kraken/scraping/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/music_kraken/scraping/phonetic_compares.py b/src/music_kraken/scraping/phonetic_compares.py
deleted file mode 100644
index facb851..0000000
--- a/src/music_kraken/scraping/phonetic_compares.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import jellyfish
-
-TITLE_THRESHOLD_LEVENSHTEIN = 2
-
-
-def match_titles(title_1: str, title_2: str) -> (bool, int):
-    distance = jellyfish.levenshtein_distance(title_1, title_2)
-    return distance > TITLE_THRESHOLD_LEVENSHTEIN, distance
-
-
-def match_artists(artist_1, artist_2: str) -> (bool, int):
-    if type(artist_1) == list:
-        distances = []
-
-        for artist_1_ in artist_1:
-            match, distance = match_titles(artist_1_, artist_2)
-            if not match:
-                return match, distance
-
-            distances.append(distance)
-        return True, min(distances)
-    return match_titles(artist_1, artist_2)
diff --git a/src/music_kraken/target/set_target.py b/src/music_kraken/target/set_target.py
index 4ea77c2..deaa7de 100644
--- a/src/music_kraken/target/set_target.py
+++ b/src/music_kraken/target/set_target.py
@@ -22,7 +22,9 @@ class UrlPath:
         self.genre = genre
 
         for row in database.get_tracks_without_filepath():
+            logger.debug(row)
             file, path = self.get_path_from_row(row)
+            logger.info(f"setting target to {file}")
             database.set_filepath(row['id'], file, path, genre)
 
     def get_path_from_row(self, row):
diff --git a/src/music_kraken/utils/database.py b/src/music_kraken/utils/database.py
index c5d7d6f..662edaf 100644
--- a/src/music_kraken/utils/database.py
+++ b/src/music_kraken/utils/database.py
@@ -132,7 +132,7 @@ class Database:
             "release_group.id == release_.release_group_id",
             "artist_track.artist_id == artist.id",
             "artist_track.track_id == track.id",
-            "source.track_id == track.id"
+            "(track.id == source.track_id OR track.id NOT IN (SELECT track_id FROM source))"
         ]
         where_args.extend(custom_where)
 
@@ -274,3 +274,61 @@ if __name__ == "__main__":
 
     database = Database(os.path.join(temp_dir, "metadata.db"), os.path.join(temp_dir, "database_structure.sql"), logger,
                         reset_anyways=True)
+
+    """
+
+SELECT DISTINCT
+    json_object(
+        'artists', json_group_array(
+            (
+            SELECT DISTINCT json_object(
+                'id', artist.id,
+                'name', artist.name
+                )
+            )
+        ),
+        'source', json_group_array(
+            (
+            SELECT json_object(
+                'src_', src_.src,
+                'url', src_.url,
+                'valid', src_.valid
+                )
+            )
+        ),
+        'id', track.id,
+        'tracknumber', track.tracknumber,
+        'titlesort ', track.tracknumber,
+        'musicbrainz_releasetrackid', track.id,
+        'musicbrainz_albumid', release_.id,
+        'title', track.track,
+        'isrc', track.isrc,
+        'album', release_.title,
+        'copyright', release_.copyright,
+        'album_status', release_.album_status,
+        'language', release_.language,
+        'year', release_.year,
+        'date', release_.date,
+        'country', release_.country,
+        'barcode', release_.barcode,
+        'albumartist', release_group.albumartist,
+        'albumsort', release_group.albumsort,
+        'musicbrainz_albumtype', release_group.musicbrainz_albumtype,
+        'compilation', release_group.compilation,
+        'album_artist_id', release_group.album_artist_id,
+        'path', track.path,
+        'file', track.file,
+        'genre', track.genre,
+        'url', track.url,
+        'src', track.src,
+        'lyrics', track.lyrics
+    )
+FROM track, release_, release_group, artist, artist_track
+LEFT JOIN release_ id ON track.release_id = release_.id
+LEFT JOIN release_group id ON release_.id = release_group.id
+LEFT JOIN artist_track track_id ON track.id = artist_track.track_id
+LEFT JOIN artist id ON artist_track.artist_id = artist.id
+LEFT JOIN source src_ ON track.id = src_.track_id
+GROUP BY track.id;
+
+    """
diff --git a/src/music_kraken/utils/shared.py b/src/music_kraken/utils/shared.py
index c0469b3..31acb18 100644
--- a/src/music_kraken/utils/shared.py
+++ b/src/music_kraken/utils/shared.py
@@ -10,33 +10,43 @@ LOG_FILE = "download_logs.log"
 DATABASE_FILE = "metadata.db"
 DATABASE_STRUCTURE_FILE = "database_structure.sql"
 DATABASE_STRUCTURE_FALLBACK = "https://raw.githubusercontent.com/HeIIow2/music-downloader/master/assets/database_structure.sql"
+temp_dir = os.path.join(tempfile.gettempdir(), TEMP_FOLDER)
+if not os.path.exists(temp_dir):
+    os.mkdir(temp_dir)
+
+# configure logger default
+logging.basicConfig(
+    level=logging.INFO,
+    format=logging.BASIC_FORMAT,
+    handlers=[
+        logging.FileHandler(os.path.join(temp_dir, LOG_FILE)),
+        logging.StreamHandler()
+    ]
+)
 
 
 SEARCH_LOGGER = logging.getLogger("mb-cli")
 DATABASE_LOGGER = logging.getLogger("database")
-METADATA_DOWNLOAD_LOGGER = logging.getLogger("metadata-download")
+METADATA_DOWNLOAD_LOGGER = logging.getLogger("metadata")
 URL_DOWNLOAD_LOGGER = logging.getLogger("AudioSource")
 YOUTUBE_LOGGER = logging.getLogger("Youtube")
+MUSIFY_LOGGER = logging.getLogger("Musify")
 PATH_LOGGER = logging.getLogger("create-paths")
 DOWNLOAD_LOGGER = logging.getLogger("download")
 LYRICS_LOGGER = logging.getLogger("lyrics")
 GENIUS_LOGGER = logging.getLogger("genius")
 
-NOT_A_GENRE = ".", "..", "misc_scripts", "Music", "script", ".git", ".idea"
-MUSIC_DIR = os.path.expanduser('~/Music')
-
-temp_dir = os.path.join(tempfile.gettempdir(), TEMP_FOLDER)
-if not os.path.exists(temp_dir):
-    os.mkdir(temp_dir)
-
 logging.getLogger("musicbrainzngs").setLevel(logging.WARNING)
 musicbrainzngs.set_useragent("metadata receiver", "0.1", "https://github.com/HeIIow2/music-downloader")
 
+NOT_A_GENRE = ".", "..", "misc_scripts", "Music", "script", ".git", ".idea"
+MUSIC_DIR = os.path.expanduser('~/Music')
+
 database = Database(os.path.join(temp_dir, DATABASE_FILE),
                     os.path.join(temp_dir, DATABASE_STRUCTURE_FILE),
                     DATABASE_STRUCTURE_FALLBACK,
                     DATABASE_LOGGER,
-                    reset_anyways=True)
+                    reset_anyways=False)
 
 TOR = False
 
@@ -46,4 +56,4 @@ proxies = {
 } if TOR else {}
 
 # only the sources here will get downloaded, in the order the list is ordered
-AUDIO_SOURCES = ["Youtube", "Musify"]
+AUDIO_SOURCES = ["Musify", "Youtube"]