From cd2e7d7173c9e297b1a06d8bebe28d22e7d89290 Mon Sep 17 00:00:00 2001 From: Lars Noack Date: Thu, 23 May 2024 16:33:40 +0200 Subject: [PATCH] draft: moving page interface to downloader module --- music_kraken/cli/main_downloader.py | 103 ++----- music_kraken/download/__init__.py | 432 +++++++++++++++++++++++++++- 2 files changed, 448 insertions(+), 87 deletions(-) diff --git a/music_kraken/cli/main_downloader.py b/music_kraken/cli/main_downloader.py index c5ba8a9..2356f8e 100644 --- a/music_kraken/cli/main_downloader.py +++ b/music_kraken/cli/main_downloader.py @@ -1,89 +1,24 @@ import random -from typing import Set, Type, Dict, List -from pathlib import Path import re +from pathlib import Path +from typing import Dict, List, Set, Type -from .utils import cli_function -from .options.first_config import initial_config - -from ..utils import output, BColors -from ..utils.config import write_config, main_settings -from ..utils.shared import URL_PATTERN -from ..utils.string_processing import fit_to_file_system -from ..utils.support_classes.query import Query -from ..utils.support_classes.download_result import DownloadResult +from .. import console +from ..download import Downloader +from ..download.results import GoToResults, Option, PageResults, Results +from ..objects import Album, Artist, DatabaseObject, Song +from ..pages import Page +from ..utils import BColors, output +from ..utils.config import main_settings, write_config +from ..utils.enums.colors import BColors from ..utils.exception import MKInvalidInputException from ..utils.exception.download import UrlNotFoundException -from ..utils.enums.colors import BColors -from .. 
import console - -from ..download.results import Results, Option, PageResults, GoToResults -from ..download.page_attributes import Pages -from ..pages import Page -from ..objects import Song, Album, Artist, DatabaseObject - -""" -This is the implementation of the Shell - -# Behaviour - -## Searching - -```mkshell -> s: {querry or url} - -# examples -> s: https://musify.club/release/some-random-release-183028492 -> s: r: #a an Artist #r some random Release -``` - -Searches for an url, or an query - -### Query Syntax - -``` -#a {artist} #r {release} #t {track} -``` - -You can escape stuff like `#` doing this: `\#` - -## Downloading - -To download something, you either need a direct link, or you need to have already searched for options - -```mkshell -> d: {option ids or direct url} - -# examples -> d: 0, 3, 4 -> d: 1 -> d: https://musify.club/release/some-random-release-183028492 -``` - -## Misc - -### Exit - -```mkshell -> q -> quit -> exit -> abort -``` - -### Current Options - -```mkshell -> . -``` - -### Previous Options - -``` -> .. 
-``` - -""" +from ..utils.shared import URL_PATTERN +from ..utils.string_processing import fit_to_file_system +from ..utils.support_classes.download_result import DownloadResult +from ..utils.support_classes.query import Query +from .options.first_config import initial_config +from .utils import cli_function EXIT_COMMANDS = {"q", "quit", "exit", "abort"} ALPHABET = "abcdefghijklmnopqrstuvwxyz" @@ -143,7 +78,7 @@ def help_message(): print() -class Downloader: +class CliDownloader: def __init__( self, exclude_pages: Set[Type[Page]] = None, @@ -153,7 +88,7 @@ class Downloader: genre: str = None, process_metadata_anyway: bool = False, ) -> None: - self.pages: Pages = Pages(exclude_pages=exclude_pages, exclude_shady=exclude_shady) + self.pages: Downloader = Downloader(exclude_pages=exclude_pages, exclude_shady=exclude_shady) self.page_dict: Dict[str, Type[Page]] = dict() @@ -446,7 +381,7 @@ def download( else: print(f"{BColors.FAIL.value}Something went wrong configuring.{BColors.ENDC.value}") - shell = Downloader(genre=genre, process_metadata_anyway=process_metadata_anyway) + shell = CliDownloader(genre=genre, process_metadata_anyway=process_metadata_anyway) if command_list is not None: for command in command_list: diff --git a/music_kraken/download/__init__.py b/music_kraken/download/__init__.py index 7ca0086..e9f0fd0 100644 --- a/music_kraken/download/__init__.py +++ b/music_kraken/download/__init__.py @@ -1,8 +1,36 @@ -from dataclasses import dataclass, field -from typing import Set +from __future__ import annotations -from ..utils.config import main_settings +import logging +import random +import re +from collections import defaultdict +from copy import copy +from dataclasses import dataclass, field +from pathlib import Path +from string import Formatter +from typing import (TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Type, + TypedDict, Union) + +import requests +from bs4 import BeautifulSoup + +from ..audio import correct_codec, write_metadata_to_target +from 
from ..connection import Connection
from ..objects import Album, Artist, Collection
from ..objects import DatabaseObject as DataObject
from ..objects import Label, Options, Song, Source, Target
from ..utils import BColors, output, trace
from ..utils.config import main_settings, youtube_settings
from ..utils.enums import ALL_SOURCE_TYPES, SourceType
from ..utils.enums.album import AlbumType
from ..utils.exception import MKMissingNameException
from ..utils.exception.download import UrlNotFoundException
from ..utils.path_manager import LOCATIONS
from ..utils.shared import DEBUG_PAGES
from ..utils.string_processing import fit_to_file_system
from ..utils.support_classes.download_result import DownloadResult
from ..utils.support_classes.query import Query
from .results import SearchResults


# Maps a data-object type to the name of the Page method that fetches it.
fetch_map = {
    Song: "fetch_song",
    Album: "fetch_album",
    Artist: "fetch_artist",
    Label: "fetch_label",
}


class Downloader:
    """
    Orchestrates all registered Page implementations.

    Responsibilities: discovering/registering concrete Page subclasses,
    searching across them, fetching metadata for data objects, and
    downloading songs (including codec correction and tagging).
    """

    def __init__(
        self,
        auto_register_pages: bool = True,
        download_options: DownloadOptions = None,
        fetch_options: FetchOptions = None,
        **kwargs
    ):
        """
        :param auto_register_pages: if True, scan for and register every
            concrete Page subclass immediately.
        :param download_options: shared download configuration; defaults are
            created when None is passed.
        :param fetch_options: shared fetch configuration; defaults are
            created when None is passed.
        :param kwargs: forwarded to each page constructor during scanning.
        """
        self.LOGGER = logging.getLogger("download")

        self.download_options: DownloadOptions = download_options or DownloadOptions()
        self.fetch_options: FetchOptions = fetch_options or FetchOptions()

        # One set of live page instances per page class.
        self._registered_pages: Dict[Type[Page], Set[Page]] = defaultdict(set)
        if auto_register_pages:
            self.scan_for_pages(**kwargs)

    def register_page(self, page_type: Type[Page], **kwargs):
        """Instantiate and register a page class; no-op if already registered."""
        # BUGFIX: was bare `_registered_pages`, which raised NameError at runtime.
        if page_type in self._registered_pages:
            return

        self._registered_pages[page_type].add(page_type(
            download_options=self.download_options,
            fetch_options=self.fetch_options,
            **kwargs
        ))

    def deregister_page(self, page_type: Type[Page]):
        """Tear down and forget every instance of the given page class."""
        # BUGFIX: was bare `_registered_pages`, which raised NameError at runtime.
        if page_type not in self._registered_pages:
            return

        for p in self._registered_pages[page_type]:
            # NOTE(review): invoking __del__ directly means it can run a second
            # time on garbage collection — a dedicated close() hook would be safer.
            p.__del__()
        del self._registered_pages[page_type]

    def scan_for_pages(self, **kwargs):
        """
        Register every concrete Page implementation.

        Assumes the wanted pages are the leaf classes of the Page interface;
        leaves with REGISTER = False are skipped.
        """
        leaf_classes = []

        class_list = [Page]
        while len(class_list):
            _class = class_list.pop()
            class_subclasses = _class.__subclasses__()

            if len(class_subclasses) == 0:
                if _class.REGISTER:
                    leaf_classes.append(_class)
            else:
                class_list.extend(class_subclasses)

        for leaf_class in leaf_classes:
            self.register_page(leaf_class, **kwargs)

    def get_pages(self, *page_types: Type[Page]) -> Generator[Page, None, None]:
        """
        Yield every registered page instance, optionally filtered by class.

        NOTE(review): `Generator` is not in this module's typing import; the
        annotation only works because of `from __future__ import annotations`
        (lazy evaluation) — consider adding it to the import explicitly.
        """
        # BUGFIX: was bare `_registered_pages`, which raised NameError at runtime.
        if len(page_types) == 0:
            page_types = self._registered_pages.keys()

        for page_type in page_types:
            yield from self._registered_pages[page_type]

    def search(self, query: Query) -> SearchResults:
        """Run the query against every registered page and aggregate the results."""
        result = SearchResults()

        for page in self.get_pages():
            result.add(
                page=type(page),
                search_result=page.search(query=query)
            )

        return result

    def fetch_details(self, data_object: DataObject, stop_at_level: int = 1, **kwargs) -> DataObject:
        """
        Fetch and merge details for the object from every source that has a page.

        :param stop_at_level: forwarded to the page fetch functions to bound
            recursion depth.
        """
        source: Source
        for source in data_object.source_collection.get_sources(source_type_sorting={
            "only_with_page": True,
        }):
            new_data_object = self.fetch_from_source(source=source, stop_at_level=stop_at_level)
            if new_data_object is not None:
                data_object.merge(new_data_object)

        return data_object

    def fetch_from_source(self, source: Source, **kwargs) -> Optional[DataObject]:
        """
        Fetch the data object a single source points at, or None when the
        source has no page or its type cannot be determined.
        """
        if not source.has_page:
            return None

        source_type = source.page.get_source_type(source=source)
        if source_type is None:
            self.LOGGER.debug(f"Could not determine source type for {source}.")
            return None

        # Dispatch to the page's fetch_song/fetch_album/fetch_artist/fetch_label.
        func = getattr(source.page, fetch_map[source_type])

        # Fetching the data object and marking it as fetched.
        data_object: DataObject = func(source=source, **kwargs)
        data_object.mark_as_fetched(source.hash_url)
        return data_object

    def fetch_from_url(self, url: str) -> Optional[DataObject]:
        """
        Resolve a raw URL to a data object, or None if no source matches.

        NOTE(review): overlaps with fetch_url below, which raises instead of
        returning None — consider consolidating the two.
        """
        source = Source.match_url(url, ALL_SOURCE_TYPES.MANUAL)
        if source is None:
            return None

        return self.fetch_from_source(source=source)

    def _skip_object(self, data_object: DataObject) -> bool:
        """Return True when the object should be excluded from downloading."""
        if isinstance(data_object, Album):
            if not self.download_options.download_all and data_object.album_type in self.download_options.album_type_blacklist:
                return True

        return False

    def download(self, data_object: DataObject, genre: str, **kwargs) -> DownloadResult:
        """
        Recursively download the object: fetch it, fetch its parents once,
        recurse into its children, and download the audio when it is a Song.
        """
        # Fetch the given object.
        self.fetch_details(data_object)
        output(f"\nDownloading {data_object.option_string}...", color=BColors.BOLD)

        # Fetching all parent objects (e.g. if you only download a song).
        if not kwargs.get("fetched_upwards", False):
            to_fetch: List[DataObject] = [data_object]

            while len(to_fetch) > 0:
                new_to_fetch = []
                for d in to_fetch:
                    if self._skip_object(d):
                        continue

                    self.fetch_details(d)

                    for c in d.get_parent_collections():
                        new_to_fetch.extend(c)

                to_fetch = new_to_fetch

            kwargs["fetched_upwards"] = True

        # Download all children.
        download_result: DownloadResult = DownloadResult()
        for c in data_object.get_child_collections():
            for d in c:
                if self._skip_object(d):
                    continue

                download_result.merge(self.download(d, genre, **kwargs))

        # Actually download if the object is a song.
        if isinstance(data_object, Song):
            # TODO: add the traced artist and album to the naming; duplicate
            # values are removed later on.
            # BUGFIX: the song's own DownloadResult was previously discarded,
            # losing error messages and found-on-disk/sponsor-segment counters.
            download_result.merge(self._download_song(data_object, naming={
                "genre": [genre],
                "audio_format": [main_settings["audio_format"]],
            }))

        return download_result

    def _extract_fields_from_template(self, path_template: str) -> Set[str]:
        """Return the set of `{field}` placeholder names in a path template."""
        return set(re.findall(r"{([^}]+)}", path_template))

    def _parse_path_template(self, path_template: str, naming: Dict[str, List[str]]) -> str:
        """
        Substitute each `{field}` in the template with the first naming value.

        :raises MKMissingNameException: when a referenced field has no values.
        """
        field_names: Set[str] = self._extract_fields_from_template(path_template)

        for field in field_names:
            if len(naming[field]) == 0:
                raise MKMissingNameException(f"Missing field for {field}.")

            path_template = path_template.replace(f"{{{field}}}", naming[field][0])

        return path_template

    def _download_song(self, song: Song, naming: dict) -> DownloadResult:
        """
        Download a single song to its targets, correcting the codec and
        writing metadata as configured.

        BUGFIX: return annotation was `DownloadOptions`, but this returns a
        DownloadResult.

        TODO: search the song in the file system first.
        """
        r = DownloadResult(total=1)

        # Pre-process the data recursively.
        song.compile()

        # Manage the naming.
        naming: Dict[str, List[str]] = defaultdict(list, naming)
        naming["song"].append(song.title_value)
        # NOTE(review): if song.isrc is None and a template uses {isrc},
        # str.replace below would fail — verify isrc is always set.
        naming["isrc"].append(song.isrc)
        naming["album"].extend(a.title_value for a in song.album_collection)
        naming["album_type"].extend(a.album_type.value for a in song.album_collection)
        naming["artist"].extend(a.name for a in song.artist_collection)
        naming["artist"].extend(a.name for a in song.feature_artist_collection)
        for a in song.album_collection:
            naming["label"].extend([l.title_value for l in a.label_collection])
        # Removing duplicates from the naming, and process the strings.
        for key, value in naming.items():
            # https://stackoverflow.com/a/17016257
            naming[key] = list(dict.fromkeys(value))
        song.genre = naming["genre"][0]

        # Manage the targets.
        tmp: Target = Target.temp(file_extension=main_settings["audio_format"])

        song.target_collection.append(Target(
            relative_to_music_dir=True,
            file_path=Path(
                self._parse_path_template(main_settings["download_path"], naming=naming),
                self._parse_path_template(main_settings["download_file"], naming=naming),
            )
        ))
        for target in song.target_collection:
            if target.exists:
                output(f'{target.file_path} {BColors.OKGREEN.value}[already exists]', color=BColors.GREY)
                r.found_on_disk += 1

                if not self.download_options.download_again_if_found:
                    # Reuse the file already on disk as the working copy.
                    target.copy_content(tmp)
            else:
                target.create_path()
                output(f'{target.file_path}', color=BColors.GREY)

        # This streams from every available source until something succeeds,
        # setting the skip intervals to the values of the according source.
        used_source: Optional[Source] = None
        skip_intervals: List[Tuple[float, float]] = []
        for source in song.source_collection.get_sources(source_type_sorting={
            "only_with_page": True,
            "sort_key": lambda page: page.download_priority,
            "reverse": True,
        }):
            if tmp.exists:
                break

            used_source = source
            streaming_results = source.page.download_song_to_target(source=source, target=tmp, desc="download")
            skip_intervals = source.page.get_skip_intervals(song=song, source=source)

            # If something has been downloaded but it somehow failed, delete the file.
            if streaming_results.is_fatal_error and tmp.exists:
                tmp.delete()

        # If everything went right, the file should exist now.
        if not tmp.exists:
            if used_source is None:
                r.error_message = f"No source found for {song.option_string}."
            else:
                r.error_message = f"Something went wrong downloading {song.option_string}."
            return r

        # Post-process the audio.
        found_on_disk = used_source is None
        if not found_on_disk or self.download_options.process_audio_if_found:
            correct_codec(target=tmp, skip_intervals=skip_intervals)
            r.sponsor_segments = len(skip_intervals)

        if used_source is not None:
            used_source.page.post_process_hook(song=song, temp_target=tmp)

        if not found_on_disk or self.download_options.process_metadata_if_found:
            write_metadata_to_target(metadata=song.metadata, target=tmp, song=song)

        # Copy the tmp target to the final locations.
        for target in song.target_collection:
            tmp.copy_content(target)

        tmp.delete()
        return r

    def fetch_url(self, url: str, **kwargs) -> DataObject:
        """
        Resolve a raw URL to a data object via the matching page.

        :raises UrlNotFoundException: when no source/page matches the URL.
        """
        source = Source.match_url(url, ALL_SOURCE_TYPES.MANUAL)

        if source is None or source.page is None:
            raise UrlNotFoundException(url=url)

        return source.page.fetch_object_from_source(source=source, **kwargs)


class Page:
    """
    Base class for every scraping/download backend.

    Concrete leaf subclasses are auto-registered by Downloader.scan_for_pages;
    set REGISTER = False on a leaf to opt out.
    """
    REGISTER = True
    SOURCE_TYPE: SourceType
    LOGGER: logging.Logger

    def __new__(cls, *args, **kwargs):
        # Give every subclass its own logger, named after the class.
        cls.LOGGER = logging.getLogger(cls.__name__)
        return super().__new__(cls)

    def __init__(self, download_options: DownloadOptions = None, fetch_options: FetchOptions = None, **kwargs):
        # NOTE(review): assumes every concrete subclass sets SOURCE_TYPE;
        # otherwise this raises AttributeError on instantiation.
        self.SOURCE_TYPE.register_page(self)

        self.download_options: DownloadOptions = download_options or DownloadOptions()
        self.fetch_options: FetchOptions = fetch_options or FetchOptions()

    def __del__(self):
        self.SOURCE_TYPE.deregister_page()

    def _search_regex(self, pattern, string, default=None, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        # BUGFIX: mobj was unbound (NameError) when `pattern` was an empty list.
        mobj = None

        if isinstance(pattern, str):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if mobj:
            if group is None:
                # Return the first matching group.
                return next(g for g in mobj.groups() if g is not None)
            elif isinstance(group, (list, tuple)):
                return tuple(mobj.group(g) for g in group)
            else:
                return mobj.group(group)

        return default

    def get_source_type(self, source: Source) -> Optional[Type[DataObject]]:
        """Return which data-object type a source points at; None when unknown."""
        return None

    def get_soup_from_response(self, r: requests.Response) -> BeautifulSoup:
        """Parse an HTTP response body into a BeautifulSoup document."""
        return BeautifulSoup(r.content, "html.parser")

    # to search stuff
    def search(self, query: Query) -> List[DataObject]:
        """
        Search this page: first with the type-specific function for the query's
        music object, falling back to the general search strings.
        """
        music_object = query.music_object

        search_functions = {
            Song: self.song_search,
            Album: self.album_search,
            Artist: self.artist_search,
            Label: self.label_search
        }

        if type(music_object) in search_functions:
            r = search_functions[type(music_object)](music_object)
            if r is not None and len(r) > 0:
                return r

        r = []
        for default_query in query.default_search:
            for single_option in self.general_search(default_query):
                r.append(single_option)

        return r

    def general_search(self, search_query: str) -> List[DataObject]:
        return []

    def label_search(self, label: Label) -> List[Label]:
        return []

    def artist_search(self, artist: Artist) -> List[Artist]:
        return []

    def album_search(self, album: Album) -> List[Album]:
        return []

    def song_search(self, song: Song) -> List[Song]:
        return []

    # to fetch stuff
    def fetch_song(self, source: Source, stop_at_level: int = 1) -> Song:
        return Song()

    def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album:
        return Album()

    def fetch_artist(self, source: Source, stop_at_level: int = 1) -> Artist:
        return Artist()

    def fetch_label(self, source: Source, stop_at_level: int = 1) -> Label:
        return Label()

    # to download stuff
    def get_skip_intervals(self, song: Song, source: Source) -> List[Tuple[float, float]]:
        return []

    def post_process_hook(self, song: Song, temp_target: Target, **kwargs):
        pass

    def download_song_to_target(self, source: Source, target: Target, desc: str = None) -> DownloadResult:
        return DownloadResult()