Merge branch 'fix/musify_scrapes_year_as_artist' into experimental

This commit is contained in:
Hazel 2024-05-10 17:52:11 +02:00
commit 04405f88eb
17 changed files with 134 additions and 85 deletions

22
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,22 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
},
{
"name": "Python Debugger: Download script",
"type": "debugpy",
"request": "launch",
"program": "development/actual_donwload.py",
"console": "integratedTerminal"
}
]
}

View File

@ -19,6 +19,7 @@
"albumsort", "albumsort",
"APIC", "APIC",
"Bandcamp", "Bandcamp",
"bitrate",
"dotenv", "dotenv",
"encyclopaedia", "encyclopaedia",
"ENDC", "ENDC",
@ -27,6 +28,7 @@
"metallum", "metallum",
"musify", "musify",
"OKBLUE", "OKBLUE",
"OKGREEN",
"pathvalidate", "pathvalidate",
"Referer", "Referer",
"sponsorblock", "sponsorblock",

View File

@ -6,9 +6,8 @@ logging.getLogger().setLevel(logging.DEBUG)
if __name__ == "__main__": if __name__ == "__main__":
commands = [ commands = [
"s: #a Crystal F", "s: #a Psychonaut 4",
"10", "d: 0",
"2",
] ]

View File

@ -47,7 +47,7 @@ def correct_codec(target: Target, bitrate_kb: int = main_settings["bitrate"], au
# run the ffmpeg command with a progressbar # run the ffmpeg command with a progressbar
ff = FfmpegProgress(ffmpeg_command) ff = FfmpegProgress(ffmpeg_command)
with tqdm(total=100, desc=f"removing {len(interval_list)} segments") as pbar: with tqdm(total=100, desc=f"processing") as pbar:
for progress in ff.run_command_with_progress(): for progress in ff.run_command_with_progress():
pbar.update(progress-pbar.n) pbar.update(progress-pbar.n)

View File

@ -166,9 +166,9 @@ class Downloader:
self.genre = genre or get_genre() self.genre = genre or get_genre()
self.process_metadata_anyway = process_metadata_anyway self.process_metadata_anyway = process_metadata_anyway
print() output()
print(f"Downloading to: \"{self.genre}\"") output(f"Downloading to: \"{self.genre}\"", color=BColors.HEADER)
print() output()
def print_current_options(self): def print_current_options(self):
self.page_dict = dict() self.page_dict = dict()
@ -312,10 +312,8 @@ class Downloader:
def download(self, data_objects: List[DatabaseObject], **kwargs) -> bool: def download(self, data_objects: List[DatabaseObject], **kwargs) -> bool:
output() output()
if len(data_objects) == 1: if len(data_objects) > 1:
output(f"Downloading {data_objects[0].option_string}...", color=BColors.BOLD) output(f"Downloading {len(data_objects)} objects...", *("- " + o.option_string for o in data_objects), color=BColors.BOLD, sep="\n")
else:
output(f"Downloading {len(data_objects)} objects...", *("- " + o.option_string for o in data_objects), color=BColors.BOLD, sep="\n")
_result_map: Dict[DatabaseObject, DownloadResult] = dict() _result_map: Dict[DatabaseObject, DownloadResult] = dict()

View File

@ -118,7 +118,7 @@ class Pages:
audio_pages = self._audio_pages_set.intersection(_page_types) audio_pages = self._audio_pages_set.intersection(_page_types)
for download_page in audio_pages: for download_page in audio_pages:
return self._page_instances[download_page].download(music_object=music_object, genre=genre, download_all=download_all, process_metadata_anyway=process_metadata_anyway) return self._page_instances[download_page].download(music_object=music_object, genre=genre)
return DownloadResult(error_message=f"No audio source has been found for {music_object}.") return DownloadResult(error_message=f"No audio source has been found for {music_object}.")

View File

@ -153,6 +153,8 @@ class Collection(Generic[T]):
if other is None: if other is None:
return return
if not other._inner._has_data:
return
if other.id in self._indexed_from_id: if other.id in self._indexed_from_id:
return return

View File

@ -32,7 +32,7 @@ class FormattedText:
if self.is_empty and other.is_empty: if self.is_empty and other.is_empty:
return True return True
return self.doc == other.doc return self.html == other.html
@property @property
def markdown(self) -> str: def markdown(self) -> str:

View File

@ -30,6 +30,8 @@ class InnerData:
_refers_to_instances: set = None _refers_to_instances: set = None
_is_in_collection: set = None _is_in_collection: set = None
_has_data: bool = False
""" """
Attribute versions keep track of whether the attribute has been changed. Attribute versions keep track of whether the attribute has been changed.
""" """
@ -48,9 +50,19 @@ class InnerData:
for key, value in kwargs.items(): for key, value in kwargs.items():
if hasattr(value, "__is_collection__"): if hasattr(value, "__is_collection__"):
value._collection_for[self] = key value._collection_for[self] = key
self.__setattr__(key, value) self.__setattr__(key, value)
if self._has_data:
continue
def __setattr__(self, key: str, value):
    """
    Assign ``value`` to the attribute ``key`` and keep ``_has_data`` current.

    While ``_default_values`` does not exist on the instance yet, or once
    ``_has_data`` is already truthy, the assignment is forwarded unchanged.
    Otherwise ``_has_data`` is updated first: it becomes ``False`` only if
    ``key`` is listed in ``_default_values`` with a value equal to ``value``,
    and ``True`` in every other case.
    """
    tracking_defaults = not self._has_data and hasattr(self, "_default_values")
    if tracking_defaults:
        defaults = self._default_values
        matches_default = key in defaults and defaults[key] == value
        # bypass this override so the flag update is not tracked itself
        super().__setattr__("_has_data", not matches_default)

    return super().__setattr__(key, value)
def __hash__(self): def __hash__(self):
return self.id return self.id

View File

@ -111,7 +111,7 @@ class Song(Base):
"album_collection": Collection, "album_collection": Collection,
"feature_artist_collection": Collection, "feature_artist_collection": Collection,
"title": lambda: "", "title": lambda: None,
"unified_title": lambda: None, "unified_title": lambda: None,
"isrc": lambda: None, "isrc": lambda: None,
"genre": lambda: None, "genre": lambda: None,

View File

@ -3,8 +3,9 @@ import random
import re import re
from copy import copy from copy import copy
from pathlib import Path from pathlib import Path
from typing import Optional, Union, Type, Dict, Set, List, Tuple from typing import Optional, Union, Type, Dict, Set, List, Tuple, TypedDict
from string import Formatter from string import Formatter
from dataclasses import dataclass, field
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -28,11 +29,23 @@ from ..utils.config import main_settings
from ..utils.support_classes.query import Query from ..utils.support_classes.query import Query
from ..utils.support_classes.download_result import DownloadResult from ..utils.support_classes.download_result import DownloadResult
from ..utils.string_processing import fit_to_file_system from ..utils.string_processing import fit_to_file_system
from ..utils import trace from ..utils import trace, output, BColors
INDEPENDENT_DB_OBJECTS = Union[Label, Album, Artist, Song] INDEPENDENT_DB_OBJECTS = Union[Label, Album, Artist, Song]
INDEPENDENT_DB_TYPES = Union[Type[Song], Type[Album], Type[Artist], Type[Label]] INDEPENDENT_DB_TYPES = Union[Type[Song], Type[Album], Type[Artist], Type[Label]]
@dataclass
class FetchOptions:
    """
    Options that are consulted while fetching data from a page.

    ``album_type_blacklist`` defaults to the album types listed under
    ``main_settings["album_type_blacklist"]``; the set is built anew for
    every instance that does not pass its own.
    """
    download_all: bool = False
    album_type_blacklist: Set[AlbumType] = field(
        default_factory=lambda: {AlbumType(entry) for entry in main_settings["album_type_blacklist"]}
    )
@dataclass
class DownloadOptions:
    """
    Options that are consulted while downloading music objects.

    ``album_type_blacklist`` defaults to the album types listed under
    ``main_settings["album_type_blacklist"]``; the set is built anew for
    every instance that does not pass its own.
    """
    download_all: bool = False
    album_type_blacklist: Set[AlbumType] = field(
        default_factory=lambda: {AlbumType(entry) for entry in main_settings["album_type_blacklist"]}
    )

    # whether audio / metadata is processed again for a song already found on disk
    process_audio_if_found: bool = False
    process_metadata_if_found: bool = True
class NamingDict(dict): class NamingDict(dict):
CUSTOM_KEYS: Dict[str, str] = { CUSTOM_KEYS: Dict[str, str] = {
@ -101,6 +114,10 @@ class Page:
# set this to true, if all song details can also be fetched by fetching album details # set this to true, if all song details can also be fetched by fetching album details
NO_ADDITIONAL_DATA_FROM_SONG = False NO_ADDITIONAL_DATA_FROM_SONG = False
def __init__(self, download_options: DownloadOptions = None, fetch_options: FetchOptions = None):
    """
    Store the option objects of this page.

    For each argument that is falsy (e.g. left at ``None``) a new instance
    with default values is created and stored instead.
    """
    if not download_options:
        download_options = DownloadOptions()
    self.download_options: DownloadOptions = download_options

    if not fetch_options:
        fetch_options = FetchOptions()
    self.fetch_options: FetchOptions = fetch_options
def _search_regex(self, pattern, string, default=None, fatal=True, flags=0, group=None): def _search_regex(self, pattern, string, default=None, fatal=True, flags=0, group=None):
""" """
Perform a regex search on the given string, using a single or a list of Perform a regex search on the given string, using a single or a list of
@ -176,7 +193,6 @@ class Page:
self, self,
music_object: DatabaseObject, music_object: DatabaseObject,
stop_at_level: int = 1, stop_at_level: int = 1,
post_process: bool = True
) -> DatabaseObject: ) -> DatabaseObject:
""" """
when a music object with lacking data is passed in, it returns when a music object with lacking data is passed in, it returns
@ -208,7 +224,6 @@ class Page:
source=source, source=source,
enforce_type=type(music_object), enforce_type=type(music_object),
stop_at_level=stop_at_level, stop_at_level=stop_at_level,
post_process=False,
type_string=type(music_object).__name__, type_string=type(music_object).__name__,
entity_string=music_object.option_string, entity_string=music_object.option_string,
) )
@ -230,7 +245,6 @@ class Page:
source: Source, source: Source,
stop_at_level: int = 2, stop_at_level: int = 2,
enforce_type: Type[DatabaseObject] = None, enforce_type: Type[DatabaseObject] = None,
post_process: bool = True,
type_string: str = "", type_string: str = "",
entity_string: str = "", entity_string: str = "",
) -> Optional[DatabaseObject]: ) -> Optional[DatabaseObject]:
@ -268,7 +282,7 @@ class Page:
for sub_element in collection: for sub_element in collection:
sub_element.merge( sub_element.merge(
self.fetch_details(sub_element, stop_at_level=stop_at_level - 1, post_process=False)) self.fetch_details(sub_element, stop_at_level=stop_at_level - 1))
return music_object return music_object
@ -288,8 +302,6 @@ class Page:
self, self,
music_object: DatabaseObject, music_object: DatabaseObject,
genre: str, genre: str,
download_all: bool = False,
process_metadata_anyway: bool = True
) -> DownloadResult: ) -> DownloadResult:
naming_dict: NamingDict = NamingDict({"genre": genre}) naming_dict: NamingDict = NamingDict({"genre": genre})
@ -308,25 +320,22 @@ class Page:
fill_naming_objects(music_object) fill_naming_objects(music_object)
return self._download(music_object, naming_dict, download_all, process_metadata_anyway=process_metadata_anyway) return self._download(music_object, naming_dict)
def _download( def _download(
self, self,
music_object: DatabaseObject, music_object: DatabaseObject,
naming_dict: NamingDict, naming_dict: NamingDict,
download_all: bool = False, **kwargs
skip_details: bool = False,
process_metadata_anyway: bool = True
) -> DownloadResult: ) -> DownloadResult:
trace(f"downloading {type(music_object).__name__} [{music_object.option_string}]") if isinstance(music_object, Song):
skip_next_details = skip_details output(f"Downloading {music_object.option_string} to:", color=BColors.BOLD)
else:
output(f"Downloading {music_object.option_string}...", color=BColors.BOLD)
# Skips all releases, that are defined in shared.ALBUM_TYPE_BLACKLIST, if download_all is False # Skips all releases, that are defined in shared.ALBUM_TYPE_BLACKLIST, if download_all is False
if isinstance(music_object, Album): if isinstance(music_object, Album):
if self.NO_ADDITIONAL_DATA_FROM_SONG: if not self.download_options.download_all and music_object.album_type in self.download_options.album_type_blacklist:
skip_next_details = True
if not download_all and music_object.album_type.value in main_settings["album_type_blacklist"]:
return DownloadResult() return DownloadResult()
if not (isinstance(music_object, Song) and self.NO_ADDITIONAL_DATA_FROM_SONG): if not (isinstance(music_object, Song) and self.NO_ADDITIONAL_DATA_FROM_SONG):
@ -338,7 +347,7 @@ class Page:
naming_dict.add_object(music_object) naming_dict.add_object(music_object)
if isinstance(music_object, Song): if isinstance(music_object, Song):
return self._download_song(music_object, naming_dict, process_metadata_anyway=process_metadata_anyway) return self._download_song(music_object, naming_dict)
download_result: DownloadResult = DownloadResult() download_result: DownloadResult = DownloadResult()
@ -347,13 +356,11 @@ class Page:
sub_ordered_music_object: DatabaseObject sub_ordered_music_object: DatabaseObject
for sub_ordered_music_object in collection: for sub_ordered_music_object in collection:
download_result.merge(self._download(sub_ordered_music_object, naming_dict.copy(), download_all, download_result.merge(self._download(sub_ordered_music_object, naming_dict.copy()))
skip_details=skip_next_details,
process_metadata_anyway=process_metadata_anyway))
return download_result return download_result
def _download_song(self, song: Song, naming_dict: NamingDict, process_metadata_anyway: bool = True): def _download_song(self, song: Song, naming_dict: NamingDict):
if "genre" not in naming_dict and song.genre is not None: if "genre" not in naming_dict and song.genre is not None:
naming_dict["genre"] = song.genre naming_dict["genre"] = song.genre
@ -373,40 +380,33 @@ class Page:
if song.target_collection.empty: if song.target_collection.empty:
song.target_collection.append(new_target) song.target_collection.append(new_target)
r = DownloadResult(1)
temp_target: Target = Target.temp()
found_on_disc = False
target: Target
for target in song.target_collection:
current_exists = target.exists
if current_exists:
output(f'- {target.file_path} {BColors.OKGREEN.value}[already exists]', color=BColors.GREY)
target.copy_content(temp_target)
found_on_disc = True
r.found_on_disk += 1
r.add_target(target)
else:
output(f'- {target.file_path}', color=BColors.GREY)
if not song.source_collection.has_source_page(self.SOURCE_TYPE): if not song.source_collection.has_source_page(self.SOURCE_TYPE):
return DownloadResult(error_message=f"No {self.__class__.__name__} source found for {song.option_string}.") return DownloadResult(error_message=f"No {self.__class__.__name__} source found for {song.option_string}.")
sources = song.source_collection.get_sources(self.SOURCE_TYPE) sources = song.source_collection.get_sources(self.SOURCE_TYPE)
temp_target: Target = Target(
relative_to_music_dir=False,
file_path=Path(
main_settings["temp_directory"],
str(song.id)
)
)
r = DownloadResult(1)
found_on_disc = False
target: Target
for target in song.target_collection:
if target.exists:
if process_metadata_anyway:
target.copy_content(temp_target)
found_on_disc = True
r.found_on_disk += 1
r.add_target(target)
if found_on_disc and not process_metadata_anyway:
self.LOGGER.info(f"{song.option_string} already exists, thus not downloading again.")
return r
skip_intervals = [] skip_intervals = []
if not found_on_disc: if not found_on_disc:
for source in sources: for source in sources:
r = self.download_song_to_target(source=source, target=temp_target, desc=song.option_string) r = self.download_song_to_target(source=source, target=temp_target, desc="downloading")
if not r.is_fatal_error: if not r.is_fatal_error:
skip_intervals = self.get_skip_intervals(song, source) skip_intervals = self.get_skip_intervals(song, source)
@ -417,16 +417,19 @@ class Page:
song=song, song=song,
temp_target=temp_target, temp_target=temp_target,
interval_list=skip_intervals, interval_list=skip_intervals,
found_on_disc=found_on_disc,
)) ))
return r return r
def _post_process_targets(self, song: Song, temp_target: Target, interval_list: List) -> DownloadResult: def _post_process_targets(self, song: Song, temp_target: Target, interval_list: List, found_on_disc: bool) -> DownloadResult:
correct_codec(temp_target, interval_list=interval_list) if not found_on_disc or self.download_options.process_audio_if_found:
correct_codec(temp_target, interval_list=interval_list)
self.post_process_hook(song, temp_target) self.post_process_hook(song, temp_target)
write_metadata_to_target(song.metadata, temp_target, song) if not found_on_disc or self.download_options.process_metadata_if_found:
write_metadata_to_target(song.metadata, temp_target, song)
r = DownloadResult() r = DownloadResult()

View File

@ -22,6 +22,7 @@ from ..objects import (
Artwork, Artwork,
) )
from ..connection import Connection from ..connection import Connection
from ..utils import dump_to_file
from ..utils.support_classes.download_result import DownloadResult from ..utils.support_classes.download_result import DownloadResult
from ..utils.string_processing import clean_song_title from ..utils.string_processing import clean_song_title
from ..utils.config import main_settings, logging_settings from ..utils.config import main_settings, logging_settings

View File

@ -418,6 +418,10 @@ class Musify(Page):
href = artist_soup["href"] href = artist_soup["href"]
if href is not None: if href is not None:
href_parts = href.split("/")
if len(href_parts) <= 1 or href_parts[-2] != "artist":
return
artist_src_list.append(Source(self.SOURCE_TYPE, self.HOST + href)) artist_src_list.append(Source(self.SOURCE_TYPE, self.HOST + href))
name_elem: BeautifulSoup = artist_soup.find("span", {"itemprop": "name"}) name_elem: BeautifulSoup = artist_soup.find("span", {"itemprop": "name"})
@ -681,17 +685,20 @@ class Musify(Page):
anchor: BeautifulSoup = artist_crumb.find("a") anchor: BeautifulSoup = artist_crumb.find("a")
if anchor is not None: if anchor is not None:
href = anchor.get("href") href = anchor.get("href")
artist_source_list: List[Source] = []
if href is not None: href_parts = href.split("/")
artist_source_list.append(Source(self.SOURCE_TYPE, self.HOST + href.strip())) if not(len(href_parts) <= 1 or href_parts[-2] != "artist"):
artist_source_list: List[Source] = []
span: BeautifulSoup = anchor.find("span") if href is not None:
if span is not None: artist_source_list.append(Source(self.SOURCE_TYPE, self.HOST + href.strip()))
artist_list.append(Artist(
name=span.get_text(strip=True), span: BeautifulSoup = anchor.find("span")
source_list=artist_source_list if span is not None:
)) artist_list.append(Artist(
name=span.get_text(strip=True),
source_list=artist_source_list
))
else: else:
self.LOGGER.debug("there are not 4 breadcrumb items, which shouldn't be the case") self.LOGGER.debug("there are not 4 breadcrumb items, which shouldn't be the case")
@ -938,10 +945,10 @@ class Musify(Page):
album_status_id = album_card.get("data-type") album_status_id = album_card.get("data-type")
if album_status_id.isdigit(): if album_status_id.isdigit():
album_status_id = int(album_status_id) album_status_id = int(album_status_id)
album_type = ALBUM_TYPE_MAP[album_status_id] album_kwargs["album_type"] = ALBUM_TYPE_MAP[album_status_id]
if album_status_id == 5: if album_status_id == 5:
album_status = AlbumStatus.BOOTLEG album_kwargs["album_status"] = AlbumStatus.BOOTLEG
def parse_release_anchor(_anchor: BeautifulSoup, text_is_name=False): def parse_release_anchor(_anchor: BeautifulSoup, text_is_name=False):
nonlocal album_kwargs nonlocal album_kwargs
@ -1037,7 +1044,7 @@ class Musify(Page):
for card_soup in soup.find_all("div", {"class": "card"}): for card_soup in soup.find_all("div", {"class": "card"}):
album = self._parse_album_card(card_soup, artist_name, **kwargs) album = self._parse_album_card(card_soup, artist_name, **kwargs)
if album.album_type in _album_type_blacklist: if not self.fetch_options.download_all and album.album_type in self.fetch_options.album_type_blacklist:
continue continue
artist.main_album_collection.append(album) artist.main_album_collection.append(album)

View File

@ -42,7 +42,7 @@ class YouTube(SuperYouTube):
SOURCE_TYPE = SourcePages.YOUTUBE SOURCE_TYPE = SourcePages.YOUTUBE
LOGGER = logging_settings["youtube_logger"] LOGGER = logging_settings["youtube_logger"]
NO_ADDITIONAL_DATA_FROM_SONG = True NO_ADDITIONAL_DATA_FROM_SONG = False
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
self.connection: Connection = Connection( self.connection: Connection = Connection(

View File

@ -12,7 +12,7 @@ if not load_dotenv(Path(__file__).parent.parent.parent / ".env"):
__stage__ = os.getenv("STAGE", "prod") __stage__ = os.getenv("STAGE", "prod")
DEBUG = (__stage__ == "dev") and True DEBUG = (__stage__ == "dev") and False
DEBUG_LOGGING = DEBUG and False DEBUG_LOGGING = DEBUG and False
DEBUG_TRACE = DEBUG and True DEBUG_TRACE = DEBUG and True
DEBUG_OBJECT_TRACE = DEBUG and False DEBUG_OBJECT_TRACE = DEBUG and False

View File

@ -116,10 +116,13 @@ def clean_song_title(raw_song_title: str, artist_name: Optional[str] = None) ->
# Remove artist from the start of the title # Remove artist from the start of the title
if raw_song_title.lower().startswith(artist_name.lower()): if raw_song_title.lower().startswith(artist_name.lower()):
raw_song_title = raw_song_title[len(artist_name):].strip()
if raw_song_title.startswith("-"): possible_new_name = raw_song_title[len(artist_name):].strip()
raw_song_title = raw_song_title[1:].strip()
for char in ("-", "", ":", "|"):
if possible_new_name.startswith(char):
raw_song_title = possible_new_name[1:].strip()
break
return raw_song_title.strip() return raw_song_title.strip()

View File

@ -69,7 +69,7 @@ dependencies = [
"toml~=0.10.2", "toml~=0.10.2",
"typing_extensions~=4.7.1", "typing_extensions~=4.7.1",
"python-sponsorblock~=0.0.0", "python-sponsorblock~=0.0.dev1",
"youtube_dl", "youtube_dl",
] ]
dynamic = [ dynamic = [