From ecc0c72cea5671114a52a1fdb7b60ce111d82252 Mon Sep 17 00:00:00 2001 From: lars Date: Sun, 6 Nov 2022 23:01:03 +0100 Subject: [PATCH] bugfixes --- README.md | 3 ++ src/download.py | 80 +++++++++++++++-------------- src/download_links.py | 11 ++-- src/main.py | 39 ++++++++++---- src/metadata/database.py | 7 ++- src/metadata/database_structure.sql | 56 ++++++++++++++++++++ src/metadata/download.py | 59 ++++++++++++++++----- src/metadata/object_handeling.py | 2 + src/metadata/options.py | 2 - src/musify.py | 14 ++++- src/url_to_path.py | 19 ++++--- 11 files changed, 210 insertions(+), 82 deletions(-) create mode 100644 src/metadata/database_structure.sql diff --git a/README.md b/README.md index 26f48dc..d777837 100644 --- a/README.md +++ b/README.md @@ -106,3 +106,6 @@ There are two bottlenecks with this approach though: **Progress** - There is a great site whith a huge isrc database [https://isrc.soundexchange.com/](https://isrc.soundexchange.com/). + + +https://slavart.gamesdrive.net/ \ No newline at end of file diff --git a/src/download.py b/src/download.py index 126c900..b53b13b 100644 --- a/src/download.py +++ b/src/download.py @@ -5,7 +5,6 @@ from mutagen.easyid3 import EasyID3 from pydub import AudioSegment import logging -from metadata import database import musify import youtube_music @@ -15,57 +14,29 @@ https://mutagen.readthedocs.io/en/latest/user/id3.html # to get all valid keys from mutagen.easyid3 import EasyID3 +print("\n".join(EasyID3.valid_keys.keys())) print(EasyID3.valid_keys.keys()) """ -def write_metadata(row, file_path): - if not os.path.exists(file_path): - logging.warning("something went really wrong") - return False - # only convert the file to the proper format if mutagen doesn't work with it due to time - try: - audiofile = EasyID3(file_path) - except mutagen.id3.ID3NoHeaderError: - AudioSegment.from_file(file_path).export(file_path, format="mp3") - audiofile = EasyID3(file_path) - - valid_keys = list(EasyID3.valid_keys.keys()) - - for key in list(row.keys()): - if key in valid_keys and row[key] is not None: - if type(row[key]) != list: - row[key] = str(row[key]) - audiofile[key] = row[key] - else: - logging.warning(key) - - logging.info("saving") - audiofile.save(file_path, v1=2) - - -def path_stuff(path: str, file_: str): - # returns true if it shouldn't be downloaded - if os.path.exists(file_): - logging.info(f"'{file_}' does already exist, thus not downloading.") - return True - os.makedirs(path, exist_ok=True) - return False class Download: - def __init__(self, proxies: dict = None, base_path: str = ""): + def __init__(self, database, logger: logging.Logger, proxies: dict = None, base_path: str = ""): if proxies is not None: musify.set_proxy(proxies) + self.database = database + self.logger = logger + for row in database.get_tracks_to_download(): row['artist'] = [i['name'] for i in row['artists']] row['file'] = os.path.join(base_path, row['file']) row['path'] = os.path.join(base_path, row['path']) - if path_stuff(row['path'], row['file']): - write_metadata(row, row['file']) + if self.path_stuff(row['path'], row['file']): + self.write_metadata(row, row['file']) continue download_success = None @@ -76,10 +47,43 @@ class Download: download_success = youtube_music.download(row) if download_success == -1: - logging.warning(f"couldn't download {row.url} from {row.src}") + self.logger.warning(f"couldn't download {row['url']} from {row['src']}") continue - write_metadata(row, row['file']) + self.write_metadata(row, row['file']) + + def write_metadata(self, row, 
file_path): + if not os.path.exists(file_path): + self.logger.warning("something went really wrong") + return False + + # only convert the file to the proper format if mutagen doesn't work with it due to time + try: + audiofile = EasyID3(file_path) + except mutagen.id3.ID3NoHeaderError: + AudioSegment.from_file(file_path).export(file_path, format="mp3") + audiofile = EasyID3(file_path) + + valid_keys = list(EasyID3.valid_keys.keys()) + + for key in list(row.keys()): + if key in valid_keys and row[key] is not None: + if type(row[key]) != list: + row[key] = str(row[key]) + audiofile[key] = row[key] + else: + self.logger.warning(key) + + self.logger.info("saving") + audiofile.save(file_path, v1=2) + + def path_stuff(self, path: str, file_: str): + # returns true if it shouldn't be downloaded + if os.path.exists(file_): + self.logger.info(f"'{file_}' does already exist, thus not downloading.") + return True + os.makedirs(path, exist_ok=True) + return False if __name__ == "__main__": diff --git a/src/download_links.py b/src/download_links.py index d9fceb8..16a8c0d 100644 --- a/src/download_links.py +++ b/src/download_links.py @@ -3,17 +3,18 @@ import logging import musify import youtube_music -from metadata import database class Download: - def __init__(self, metadata_csv: str = ".cache1.csv", proxies: dict = None) -> None: + def __init__(self, database, logger: logging.Logger, proxies: dict = None) -> None: + self.database = database + self.logger = logger if proxies is not None: musify.set_proxy(proxies) self.urls = [] - for row in database.get_tracks_to_download(): + for row in self.database.get_tracks_to_download(): row['artists'] = [artist['name'] for artist in row['artists']] id_ = row['id'] @@ -36,10 +37,10 @@ class Download: self.add_url(musify_url, 'musify', id_) continue - logging.warning(f"Didn't find any sources for {row['title']}") + self.logger.warning(f"Didn't find any sources for {row['title']}") def add_url(self, url: str, src: str, id_: str): - database.set_download_data(id_, url, src) + self.database.set_download_data(id_, url, src) if __name__ == "__main__": diff --git a/src/main.py b/src/main.py index 6cbdc20..5162a32 100644 --- a/src/main.py +++ b/src/main.py @@ -1,3 +1,5 @@ +from metadata.database import Database +from metadata.download import MetadataDownloader import metadata.download import metadata.metadata import download_links @@ -6,12 +8,19 @@ import download import logging import os +import tempfile +logging.basicConfig(level=logging.INFO) -TEMP = "temp" -STEP_ONE_CACHE = ".cache1.csv" -STEP_TWO_CACHE = ".cache2.csv" -STEP_THREE_CACHE = ".cache3.csv" +TEMP_FOLDER = "music-downloader" +DATABASE_FILE = "metadata.db" +DATABASE_STRUCTURE_FILE = "database_structure.sql" + +DATABASE_LOGGER = logging.getLogger("database") +METADATA_DOWNLOAD_LOGGER = logging.getLogger("metadata-download") +URL_DOWNLOAD_LOGGER = logging.getLogger("ling-download") +PATH_LOGGER = logging.getLogger("create-paths") +DOWNLOAD_LOGGER = logging.getLogger("download") NOT_A_GENRE = ".", "..", "misc_scripts", "Music", "script", ".git", ".idea" MUSIC_DIR = os.path.expanduser('~/Music') @@ -20,6 +29,16 @@ TOR = False logger = logging.getLogger() logger.level = logging.DEBUG +temp_dir = os.path.join(tempfile.gettempdir(), TEMP_FOLDER) +if not os.path.exists(temp_dir): + os.mkdir(temp_dir) + +database = Database(os.path.join(temp_dir, DATABASE_FILE), + os.path.join(temp_dir, DATABASE_STRUCTURE_FILE), DATABASE_LOGGER, + reset_anyways=False) + + + def get_existing_genre(): valid_directories = [] @@ -31,7 
+50,7 @@ def get_existing_genre(): def search_for_metadata(query: str): - search = metadata.metadata.Search(query=query, temp=TEMP) + search = metadata.metadata.Search(query=query) print(search.options) while True: @@ -86,20 +105,20 @@ def cli(start_at: int = 0): if start_at <= 0: search = search_for_metadata(query=input("initial query: ")) logging.info("Starting Downloading of metadata") - metadata.download.download(search) + metadata_downloader = MetadataDownloader(database, METADATA_DOWNLOAD_LOGGER) + metadata_downloader.download(search) if start_at <= 1: logging.info("Fetching Download Links") - download_links.Download(proxies=proxies) + download_links.Download(database, METADATA_DOWNLOAD_LOGGER, proxies=proxies) if start_at <= 2: logging.info("creating Paths") - print(genre) - url_to_path.UrlPath(genre=genre) + url_to_path.UrlPath(database, PATH_LOGGER, genre=genre) if start_at <= 3: logging.info("starting to download the mp3's") - download.Download(proxies=proxies, base_path=MUSIC_DIR) + download.Download(database, DOWNLOAD_LOGGER, proxies=proxies, base_path=MUSIC_DIR) if __name__ == "__main__": diff --git a/src/metadata/database.py b/src/metadata/database.py index 3a0ce5e..df5f7b4 100644 --- a/src/metadata/database.py +++ b/src/metadata/database.py @@ -98,6 +98,7 @@ class Database: musicbrainz_releasetrackid: str, musicbrainz_albumid: str, feature_aritsts: list, + tracknumber: str = None, track: str = None, isrc: str = None ): @@ -111,8 +112,8 @@ class Database: self.connection.commit() # add track - query = "INSERT OR REPLACE INTO track (id, release_id, track, isrc) VALUES (?, ?, ?, ?);" - values = musicbrainz_releasetrackid, musicbrainz_albumid, track, isrc + query = "INSERT OR REPLACE INTO track (id, release_id, track, isrc, tracknumber) VALUES (?, ?, ?, ?, ?);" + values = musicbrainz_releasetrackid, musicbrainz_albumid, track, isrc, tracknumber self.cursor.execute(query, values) self.connection.commit() @@ -139,6 +140,8 @@ SELECT DISTINCT ) ), 'id', track.id, + 'tracknumber', track.tracknumber, + 'titlesort ', track.tracknumber, 'musicbrainz_releasetrackid', track.id, 'musicbrainz_albumid', release_.id, 'title', track.track, diff --git a/src/metadata/database_structure.sql b/src/metadata/database_structure.sql new file mode 100644 index 0000000..d5262c7 --- /dev/null +++ b/src/metadata/database_structure.sql @@ -0,0 +1,56 @@ +DROP TABLE IF EXISTS artist; +CREATE TABLE artist ( + id TEXT PRIMARY KEY NOT NULL, + name TEXT +); + +DROP TABLE IF EXISTS artist_release_group; +CREATE TABLE artist_release_group ( + artist_id TEXT NOT NULL, + release_group_id TEXT NOT NULL +); + +DROP TABLE IF EXISTS artist_track; +CREATE TABLE artist_track ( + artist_id TEXT NOT NULL, + track_id TEXT NOT NULL +); + +DROP TABLE IF EXISTS release_group; +CREATE TABLE release_group ( + id TEXT PRIMARY KEY NOT NULL, + albumartist TEXT, + albumsort INT, + musicbrainz_albumtype TEXT, + compilation TEXT, + album_artist_id TEXT +); + +DROP TABLE IF EXISTS release_; +CREATE TABLE release_ ( + id TEXT PRIMARY KEY NOT NULL, + release_group_id TEXT NOT NULL, + title TEXT, + copyright TEXT, + album_status TEXT, + language TEXT, + year TEXT, + date TEXT, + country TEXT, + barcode TEXT +); + +DROP TABLE IF EXISTS track; +CREATE TABLE track ( + id TEXT PRIMARY KEY NOT NULL, + downloaded BOOLEAN NOT NULL DEFAULT 0, + release_id TEXT NOT NULL, + track TEXT, + tracknumber TEXT, + isrc TEXT, + genre TEXT, + path TEXT, + file TEXT, + url TEXT, + src TEXT +); diff --git a/src/metadata/download.py b/src/metadata/download.py 
index 330687f..9b7d888 100644 --- a/src/metadata/download.py +++ b/src/metadata/download.py @@ -2,7 +2,11 @@ from typing import List import musicbrainzngs import logging -from object_handeling import get_elem_from_obj, parse_music_brainz_date +try: + from object_handeling import get_elem_from_obj, parse_music_brainz_date + +except ModuleNotFoundError: + from metadata.object_handeling import get_elem_from_obj, parse_music_brainz_date # I don't know if it would be feesable to set up my own mb instance # https://github.com/metabrainz/musicbrainz-docker @@ -81,9 +85,10 @@ class MetadataDownloader: database, logger, musicbrainz_releasegroupid: str, - artists = [], + artists=[], albumsort: int = None, - only_download_distinct_releases: bool = True + only_download_distinct_releases: bool = True, + fetch_further: bool = True ): self.database = database self.logger = logger @@ -117,6 +122,9 @@ class MetadataDownloader: self.save() + if not fetch_further: + return + if only_download_distinct_releases: self.append_distinct_releases(release_datas) else: @@ -142,7 +150,8 @@ class MetadataDownloader: for existing_artist in self.artists: if artist_id == existing_artist.musicbrainz_artistid: return existing_artist - new_artist = Artist(artist_id, release_groups=[self], new_release_groups=False) + new_artist = MetadataDownloader.Artist(self.database, self.logger, artist_id, release_groups=[self], + new_release_groups=False) self.artists.append(new_artist) return new_artist @@ -150,7 +159,8 @@ class MetadataDownloader: musicbrainz_albumid = get_elem_from_obj(release_data, ['id']) if musicbrainz_albumid is None: return - self.releases.append(MetadataDownloader.Release(self.database, self.logger, musicbrainz_albumid, release_group=self)) + self.releases.append( + MetadataDownloader.Release(self.database, self.logger, musicbrainz_albumid, release_group=self)) def append_distinct_releases(self, release_datas: List[dict]): titles = {} @@ -174,7 +184,8 @@ class MetadataDownloader: database, logger, musicbrainz_albumid: str, - release_group = None + release_group=None, + fetch_furter: bool = True ): self.database = database self.logger = logger @@ -186,10 +197,16 @@ class MetadataDownloader: self.release_group = release_group self.tracklist = [] - result = musicbrainzngs.get_release_by_id(self.musicbrainz_albumid, includes=["recordings", "labels"]) + result = musicbrainzngs.get_release_by_id(self.musicbrainz_albumid, + includes=["recordings", "labels", "release-groups"]) release_data = get_elem_from_obj(result, ['release'], return_if_none={}) label_data = get_elem_from_obj(release_data, ['label-info-list'], return_if_none={}) recording_datas = get_elem_from_obj(release_data, ['medium-list', 0, 'track-list'], return_if_none=[]) + release_group_data = get_elem_from_obj(release_data, ['release-group'], return_if_none={}) + if self.release_group is None: + self.release_group = MetadataDownloader.ReleaseGroup(self.database, self.logger, + musicbrainz_releasegroupid=get_elem_from_obj( + release_group_data, ['id']), fetch_further=False) self.title = get_elem_from_obj(release_data, ['title']) self.copyright = get_elem_from_obj(label_data, [0, 'label', 'name']) @@ -202,7 +219,8 @@ class MetadataDownloader: self.barcode = get_elem_from_obj(release_data, ['barcode']) self.save() - self.append_recordings(recording_datas) + if fetch_furter: + self.append_recordings(recording_datas) def __str__(self): return f"{self.title} ©{self.copyright} {self.album_status}" @@ -223,12 +241,14 @@ class MetadataDownloader: ) def 
append_recordings(self, recording_datas: dict): - for recording_data in recording_datas: + for i, recording_data in enumerate(recording_datas): musicbrainz_releasetrackid = get_elem_from_obj(recording_data, ['recording', 'id']) if musicbrainz_releasetrackid is None: continue - self.tracklist.append(MetadataDownloader.Track(self.database, self.logger, musicbrainz_releasetrackid, self)) + self.tracklist.append( + MetadataDownloader.Track(self.database, self.logger, musicbrainz_releasetrackid, self, + track_number=str(i + 1))) class Track: def __init__( @@ -236,7 +256,8 @@ class MetadataDownloader: database, logger, musicbrainz_releasetrackid: str, - release = None + release=None, + track_number: str = None ): self.database = database self.logger = logger @@ -249,10 +270,18 @@ class MetadataDownloader: self.release = release self.artists = [] + self.track_number = track_number + result = musicbrainzngs.get_recording_by_id(self.musicbrainz_releasetrackid, includes=["artists", "releases", "recording-rels", "isrcs", "work-level-rels"]) recording_data = result['recording'] + release_data = get_elem_from_obj(recording_data, ['release-list', -1]) + if self.release is None: + self.release = MetadataDownloader.Release(self.database, self.logger, + get_elem_from_obj(release_data, ['id']), fetch_furter=False) + + for artist_data in get_elem_from_obj(recording_data, ['artist-credit'], return_if_none=[]): self.append_artist(get_elem_from_obj(artist_data, ['artist', 'id'])) @@ -271,6 +300,7 @@ class MetadataDownloader: musicbrainz_releasetrackid=self.musicbrainz_releasetrackid, musicbrainz_albumid=self.release.musicbrainz_albumid, feature_aritsts=[artist.musicbrainz_artistid for artist in self.artists], + tracknumber=self.track_number, track=self.title, isrc=self.isrc ) @@ -316,14 +346,15 @@ if __name__ == "__main__": import database database_ = database.Database(os.path.join(temp_dir, "metadata.db"), - os.path.join(temp_dir, "database_structure.sql"), db_logger, - reset_anyways=True) + os.path.join(temp_dir, "database_structure.sql"), db_logger, + reset_anyways=True) download_logger = logging.getLogger("metadata downloader") download_logger.setLevel(logging.INFO) downloader = MetadataDownloader(database_, download_logger) - downloader.download({'id': '5cfecbe4-f600-45e5-9038-ce820eedf3d1', 'type': 'artist'}) + downloader.download({'id': 'd2006339-9e98-4624-a386-d503328eb854', 'type': 'track'}) + # downloader.download({'id': 'cdd16860-35fd-46af-bd8c-5de7b15ebc31', 'type': 'release'}) # download({'id': '4b9af532-ef7e-42ab-8b26-c466327cb5e0', 'type': 'release'}) # download({'id': 'c24ed9e7-6df9-44de-8570-975f1a5a75d1', 'type': 'track'}) diff --git a/src/metadata/object_handeling.py b/src/metadata/object_handeling.py index 3830c8e..57a3b4d 100644 --- a/src/metadata/object_handeling.py +++ b/src/metadata/object_handeling.py @@ -1,5 +1,6 @@ from datetime import date + def get_elem_from_obj(current_object, keys: list, after_process=lambda x: x, return_if_none=None): current_object = current_object for key in keys: @@ -9,6 +10,7 @@ def get_elem_from_obj(current_object, keys: list, after_process=lambda x: x, ret return return_if_none return after_process(current_object) + def parse_music_brainz_date(mb_date: str) -> date: year = 1 month = 1 diff --git a/src/metadata/options.py b/src/metadata/options.py index 4a385cf..dea4882 100644 --- a/src/metadata/options.py +++ b/src/metadata/options.py @@ -37,7 +37,6 @@ def get_string_for_option(option: dict) -> str: class Options: def __init__(self, results: list): 
self.results = results - print(results) self.artist_count = 0 self.release_count = 0 @@ -69,7 +68,6 @@ class Options: def __str__(self) -> str: string = f"artists: {self.artist_count}; releases {self.release_count}; tracks {self.track_count}\n" for i, option in enumerate(self.result_list): - print(option) string += f"{i})\t{option['type']}:\t" + get_string_for_option(option) return string diff --git a/src/musify.py b/src/musify.py index 69d534a..7a0e23b 100644 --- a/src/musify.py +++ b/src/musify.py @@ -1,9 +1,14 @@ import logging +import time + import requests import bs4 import phonetic_compares +TRIES = 5 +TIMEOUT = 10 + session = requests.Session() session.headers = { "Connection": "keep-alive", @@ -64,11 +69,18 @@ def download(row): return download_from_musify(file_, url) -def get_soup_of_search(query: str): +def get_soup_of_search(query: str, trie=0): url = f"https://musify.club/search?searchText={query}" logging.debug(f"Trying to get soup from {url}") r = session.get(url) if r.status_code != 200: + if r.status_code in [503] and trie < TRIES: + logging.warning(f"youtube blocked downloading. ({trie}-{TRIES})") + logging.warning(f"retrying in {TIMEOUT} seconds again") + time.sleep(TIMEOUT) + return get_soup_of_search(query, trie=trie+1) + + logging.warning("too many tries, returning") raise ConnectionError(f"{r.url} returned {r.status_code}:\n{r.content}") return bs4.BeautifulSoup(r.content, features="html.parser") diff --git a/src/url_to_path.py b/src/url_to_path.py index 3862df5..dd4a28f 100644 --- a/src/url_to_path.py +++ b/src/url_to_path.py @@ -1,20 +1,17 @@ import os.path -import json - -from metadata import database +import logging class UrlPath: - def __init__(self, genre: str, temp: str = "temp", file: str = ".cache3.csv", step_two_file: str = ".cache2.csv"): - self.temp = temp - self.file = file + def __init__(self, database, logger: logging.Logger, genre: str): + self.database = database + self.logger = logger self.genre = genre - for row in database.get_tracks_without_filepath(): + for row in self.database.get_tracks_without_filepath(): file, path = self.get_path_from_row(row) - database.set_filepath(row['id'], file, path, genre) - + self.database.set_filepath(row['id'], file, path, genre) def get_path_from_row(self, row): """ @@ -23,7 +20,9 @@ class UrlPath: :param row: :return: path: """ - return os.path.join(self.get_genre(), self.get_artist(row), self.get_album(row), f"{self.get_song(row)}.mp3"), os.path.join(self.get_genre(), self.get_artist(row), self.get_album(row)) + return os.path.join(self.get_genre(), self.get_artist(row), self.get_album(row), + f"{self.get_song(row)}.mp3"), os.path.join(self.get_genre(), self.get_artist(row), + self.get_album(row)) def escape_part(self, part: str): return part.replace("/", " ")
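
A rough usage sketch of how the refactored, dependency-injected pipeline fits together after this patch, mirroring the new wiring in src/main.py. This is a sketch only: the logger name, genre value, and MusicBrainz id below are illustrative assumptions, and database_structure.sql is assumed to be readable at the path handed to Database (main.py resolves it inside the temp dir, while the new file lives under src/metadata/, so it may need to be copied or the path adjusted).

    # sketch only: wiring the Database-backed pipeline introduced by this patch
    # (logger name, genre, id, and file locations are assumptions, not part of the patch)
    import logging
    import os
    import tempfile

    from metadata.database import Database
    from metadata.download import MetadataDownloader
    import download_links
    import url_to_path
    import download

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("pipeline")  # hypothetical logger name

    temp_dir = os.path.join(tempfile.gettempdir(), "music-downloader")
    os.makedirs(temp_dir, exist_ok=True)

    db = Database(os.path.join(temp_dir, "metadata.db"),
                  os.path.join(temp_dir, "database_structure.sql"),  # assumes the .sql file is present here
                  logger,
                  reset_anyways=False)

    # step 1: fetch metadata for a MusicBrainz object (id/type are placeholders)
    MetadataDownloader(db, logger).download({'id': '<musicbrainz id>', 'type': 'release'})

    # step 2: resolve download URLs (musify / youtube-music) into the database
    download_links.Download(db, logger)

    # step 3: derive target file paths for tracks that have none yet
    url_to_path.UrlPath(db, logger, genre="some-genre")

    # step 4: download the files and write ID3 metadata
    download.Download(db, logger, base_path=os.path.expanduser("~/Music"))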