From e3e547c2328a4edabc1f0fea82010f7a6da2321e Mon Sep 17 00:00:00 2001 From: Lars Noack Date: Wed, 8 May 2024 09:15:41 +0200 Subject: [PATCH] feat: improved musify --- music_kraken/cli/main_downloader.py | 4 +- music_kraken/objects/collection.py | 8 +- music_kraken/objects/song.py | 14 +- music_kraken/pages/musify.py | 261 +++++++++++++--------------- 4 files changed, 130 insertions(+), 157 deletions(-) diff --git a/music_kraken/cli/main_downloader.py b/music_kraken/cli/main_downloader.py index d66da6f..73812cd 100644 --- a/music_kraken/cli/main_downloader.py +++ b/music_kraken/cli/main_downloader.py @@ -304,10 +304,8 @@ class Downloader: def goto(self, data_object: DatabaseObject): page: Type[Page] - self.pages.fetch_details(data_object) + self.pages.fetch_details(data_object, stop_at_level=1) - print(data_object) - print(data_object.options) self.set_current_options(GoToResults(data_object.options, max_items_per_page=self.max_displayed_options)) self.print_current_options() diff --git a/music_kraken/objects/collection.py b/music_kraken/objects/collection.py index b8b2d4a..255caaa 100644 --- a/music_kraken/objects/collection.py +++ b/music_kraken/objects/collection.py @@ -164,6 +164,8 @@ class Collection(Generic[T]): r = c._find_object(other) if r is not None: output("found push to", r, other, self, color=BColors.RED, sep="\t") + if existing_object is not None: + self.remove(existing_object) return c.append(other, **kwargs) if existing_object is None: @@ -179,13 +181,11 @@ class Collection(Generic[T]): raise ValueError(f"Object {other} not found in {self}") return other - """ for collection_attribute, generator in self.extend_object_to_attribute.items(): - other.__getattribute__(collection_attribute).remove(*generator, silent=silent, **kwargs) + other.__getattribute__(collection_attribute).remove(*generator, silent=True, **kwargs) for attribute, new_object in self.append_object_to_attribute.items(): - other.__getattribute__(attribute).remove(new_object, silent=silent, **kwargs) - """ + other.__getattribute__(attribute).remove(new_object, silent=True, **kwargs) self._data.remove(existing) self._unmap_element(existing) diff --git a/music_kraken/objects/song.py b/music_kraken/objects/song.py index 9f9ba7e..d7924bd 100644 --- a/music_kraken/objects/song.py +++ b/music_kraken/objects/song.py @@ -207,7 +207,7 @@ class Song(Base): r = OPTION_FOREGROUND.value + self.title_string + BColors.ENDC.value + OPTION_BACKGROUND.value r += get_collection_string(self.album_collection, " from {}", ignore_titles={self.title}) r += get_collection_string(self.main_artist_collection, " by {}") - r += get_collection_string(self.feature_artist_collection, " feat. {}") + r += get_collection_string(self.feature_artist_collection, " feat. {}" if not self.main_artist_collection.empty or True else " by {}") return r @property @@ -413,11 +413,6 @@ class Album(Base): return self.album_type.value -""" -All objects dependent on Artist -""" - - class Artist(Base): name: str unified_name: str @@ -462,7 +457,7 @@ class Artist(Base): general_genre: str = None, unformatted_location: str = None, source_list: List[Source] = None, contact_list: List[Contact] = None, feature_song_list: List[Song] = None, main_album_list: List[Album] = None, label_list: List[Label] = None, **kwargs) -> None: - + super().__init__(name=name, unified_name=unified_name, country=country, formed_in=formed_in, notes=notes, lyrical_themes=lyrical_themes, general_genre=general_genre, unformatted_location=unformatted_location, source_list=source_list, contact_list=contact_list, @@ -593,11 +588,6 @@ class Artist(Base): return r -""" -Label -""" - - class Label(Base): COLLECTION_STRING_ATTRIBUTES = ("album_collection", "current_artist_collection") diff --git a/music_kraken/pages/musify.py b/music_kraken/pages/musify.py index 84a4453..242d0fa 100644 --- a/music_kraken/pages/musify.py +++ b/music_kraken/pages/musify.py @@ -1,7 +1,7 @@ from collections import defaultdict from dataclasses import dataclass from enum import Enum -from typing import List, Optional, Type, Union, Generator +from typing import List, Optional, Type, Union, Generator, Dict, Any from urllib.parse import urlparse import pycountry @@ -24,7 +24,7 @@ from ..objects import ( Lyrics, Artwork ) -from ..utils.config import logging_settings +from ..utils.config import logging_settings, main_settings from ..utils import string_processing, shared from ..utils.string_processing import clean_song_title from ..utils.support_classes.query import Query @@ -361,7 +361,7 @@ class Musify(Page): return Song( title=clean_song_title(song_title, artist_name=artist_list[0].name if len(artist_list) > 0 else None), - main_artist_list=artist_list, + feature_artist_list=artist_list, source_list=source_list ) @@ -510,7 +510,7 @@ class Musify(Page): title=clean_song_title(track_name, artist_name=artist_list[0].name if len(artist_list) > 0 else None), source_list=source_list, lyrics_list=lyrics_list, - main_artist_list=artist_list, + feature_artist_list=artist_list, album_list=album_list, artwork=artwork, ) @@ -652,10 +652,101 @@ class Musify(Page): return Song( title=clean_song_title(song_name, artist_name=artist_list[0].name if len(artist_list) > 0 else None), tracksort=tracksort, - main_artist_list=artist_list, + feature_artist_list=artist_list, source_list=source_list ) + + def _parse_album(self, soup: BeautifulSoup) -> Album: + name: str = None + source_list: List[Source] = [] + artist_list: List[Artist] = [] + date: ID3Timestamp = None + + """ + if breadcrumb list has 4 elements, then + the -2 is the artist link, + the -1 is the album + """ + # breadcrumb + breadcrumb_soup: BeautifulSoup = soup.find("ol", {"class", "breadcrumb"}) + breadcrumb_elements: List[BeautifulSoup] = breadcrumb_soup.find_all("li", {"class": "breadcrumb-item"}) + if len(breadcrumb_elements) == 4: + # album + album_crumb: BeautifulSoup = breadcrumb_elements[-1] + name = album_crumb.text.strip() + + # artist + artist_crumb: BeautifulSoup = breadcrumb_elements[-2] + anchor: BeautifulSoup = artist_crumb.find("a") + if anchor is not None: + href = anchor.get("href") + artist_source_list: List[Source] = [] + + if href is not None: + artist_source_list.append(Source(self.SOURCE_TYPE, self.HOST + href.strip())) + + span: BeautifulSoup = anchor.find("span") + if span is not None: + artist_list.append(Artist( + name=span.get_text(strip=True), + source_list=artist_source_list + )) + else: + self.LOGGER.debug("there are not 4 breadcrumb items, which shouldn't be the case") + + # meta + meta_url: BeautifulSoup = soup.find("meta", {"itemprop": "url"}) + if meta_url is not None: + url = meta_url.get("content") + if url is not None: + source_list.append(Source(self.SOURCE_TYPE, self.HOST + url)) + + meta_name: BeautifulSoup = soup.find("meta", {"itemprop": "name"}) + if meta_name is not None: + _name = meta_name.get("content") + if _name is not None: + name = _name + + # album info + album_info_ul: BeautifulSoup = soup.find("ul", {"class": "album-info"}) + if album_info_ul is not None: + artist_anchor: BeautifulSoup + for artist_anchor in album_info_ul.find_all("a", {"itemprop": "byArtist"}): + # line 98 + artist_source_list: List[Source] = [] + + artist_url_meta = artist_anchor.find("meta", {"itemprop": "url"}) + if artist_url_meta is not None: + artist_href = artist_url_meta.get("content") + if artist_href is not None: + artist_source_list.append(Source(self.SOURCE_TYPE, url=self.HOST + artist_href)) + + artist_meta_name = artist_anchor.find("meta", {"itemprop": "name"}) + if artist_meta_name is not None: + artist_name = artist_meta_name.get("content") + if artist_name is not None: + artist_list.append(Artist( + name=artist_name, + source_list=artist_source_list + )) + + time_soup: BeautifulSoup = album_info_ul.find("time", {"itemprop": "datePublished"}) + if time_soup is not None: + raw_datetime = time_soup.get("datetime") + if raw_datetime is not None: + try: + date = ID3Timestamp.strptime(raw_datetime, "%Y-%m-%d") + except ValueError: + self.LOGGER.debug(f"Raw datetime doesn't match time format %Y-%m-%d: {raw_datetime}") + + return Album( + title=name, + source_list=source_list, + artist_list=artist_list, + date=date + ) + def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album: """ fetches album from source: @@ -694,19 +785,14 @@ class Musify(Page): return album - def _get_artist_attributes(self, url: MusifyUrl) -> Artist: + def _fetch_initial_artist(self, url: MusifyUrl, source: Source) -> Artist: """ - fetches the main Artist attributes from this endpoint https://musify.club/artist/ghost-bath-280348?_pjax=#bodyContent - it needs to parse html - - :param url: - :return: """ r = self.connection.get(f"https://musify.club/{url.source_type.value}/{url.name_with_id}?_pjax=#bodyContent", name="artist_attributes_" + url.name_with_id) if r is None: - return Artist() + return Artist(source_list=[source]) soup = self.get_soup_from_response(r) @@ -821,7 +907,7 @@ class Musify(Page): notes=notes ) - def _parse_album_card(self, album_card: BeautifulSoup, artist_name: str = None) -> Album: + def _parse_album_card(self, album_card: BeautifulSoup, source: Source, artist_name: str = None, **kwargs) -> Album: """
@@ -845,13 +931,17 @@ class Musify(Page):
""" - _id: Optional[str] = None + album_kwargs: Dict[str, Any] = { + "source_list": [source], + } + name: str = None source_list: List[Source] = [] timestamp: Optional[ID3Timestamp] = None album_status = None def set_name(new_name: str): + nonlocal album_kwargs nonlocal name nonlocal artist_name @@ -882,7 +972,7 @@ class Musify(Page): album_status = AlbumStatus.BOOTLEG def parse_release_anchor(_anchor: BeautifulSoup, text_is_name=False): - nonlocal _id + nonlocal album_kwargs nonlocal name nonlocal source_list @@ -892,21 +982,11 @@ class Musify(Page): href = _anchor.get("href") if href is not None: # add url to sources - source_list.append(Source( + album_kwargs["source_list"].append(Source( self.SOURCE_TYPE, self.HOST + href )) - # split id from url - split_href = href.split("-") - if len(split_href) > 1: - _id = split_href[-1] - - if not text_is_name: - return - - set_name(_anchor.text) - anchor_list = album_card.find_all("a", recursive=False) if len(anchor_list) > 0: anchor = anchor_list[0] @@ -964,104 +1044,16 @@ class Musify(Page): album_status=album_status, ) - def _parse_album(self, soup: BeautifulSoup) -> Album: - name: str = None - source_list: List[Source] = [] - artist_list: List[Artist] = [] - date: ID3Timestamp = None - - """ - if breadcrumb list has 4 elements, then - the -2 is the artist link, - the -1 is the album - """ - # breadcrumb - breadcrumb_soup: BeautifulSoup = soup.find("ol", {"class", "breadcrumb"}) - breadcrumb_elements: List[BeautifulSoup] = breadcrumb_soup.find_all("li", {"class": "breadcrumb-item"}) - if len(breadcrumb_elements) == 4: - # album - album_crumb: BeautifulSoup = breadcrumb_elements[-1] - name = album_crumb.text.strip() - - # artist - artist_crumb: BeautifulSoup = breadcrumb_elements[-2] - anchor: BeautifulSoup = artist_crumb.find("a") - if anchor is not None: - href = anchor.get("href") - artist_source_list: List[Source] = [] - - if href is not None: - artist_source_list.append(Source(self.SOURCE_TYPE, self.HOST + href.strip())) - - span: BeautifulSoup = anchor.find("span") - if span is not None: - artist_list.append(Artist( - name=span.get_text(strip=True), - source_list=artist_source_list - )) - else: - self.LOGGER.debug("there are not 4 breadcrumb items, which shouldn't be the case") - - # meta - meta_url: BeautifulSoup = soup.find("meta", {"itemprop": "url"}) - if meta_url is not None: - url = meta_url.get("content") - if url is not None: - source_list.append(Source(self.SOURCE_TYPE, self.HOST + url)) - - meta_name: BeautifulSoup = soup.find("meta", {"itemprop": "name"}) - if meta_name is not None: - _name = meta_name.get("content") - if _name is not None: - name = _name - - # album info - album_info_ul: BeautifulSoup = soup.find("ul", {"class": "album-info"}) - if album_info_ul is not None: - artist_anchor: BeautifulSoup - for artist_anchor in album_info_ul.find_all("a", {"itemprop": "byArtist"}): - # line 98 - artist_source_list: List[Source] = [] - - artist_url_meta = artist_anchor.find("meta", {"itemprop": "url"}) - if artist_url_meta is not None: - artist_href = artist_url_meta.get("content") - if artist_href is not None: - artist_source_list.append(Source(self.SOURCE_TYPE, url=self.HOST + artist_href)) - - artist_meta_name = artist_anchor.find("meta", {"itemprop": "name"}) - if artist_meta_name is not None: - artist_name = artist_meta_name.get("content") - if artist_name is not None: - artist_list.append(Artist( - name=artist_name, - source_list=artist_source_list - )) - - time_soup: BeautifulSoup = album_info_ul.find("time", {"itemprop": "datePublished"}) - if time_soup is not None: - raw_datetime = time_soup.get("datetime") - if raw_datetime is not None: - try: - date = ID3Timestamp.strptime(raw_datetime, "%Y-%m-%d") - except ValueError: - self.LOGGER.debug(f"Raw datetime doesn't match time format %Y-%m-%d: {raw_datetime}") - - return Album( - title=name, - source_list=source_list, - artist_list=artist_list, - date=date - ) - - def _get_discography(self, artist: Artist, url: MusifyUrl, artist_name: str = None, stop_at_level: int = 1) -> Generator[Album, None, None]: + def _fetch_artist_discography(self, artist: Artist, url: MusifyUrl, artist_name: str = None, **kwargs): """ POST https://musify.club/artist/filteralbums - ArtistID: 280348 - SortOrder.Property: dateCreated - SortOrder.IsAscending: false - X-Requested-With: XMLHttpRequest + ArtistID: 280348 + SortOrder.Property: dateCreated + SortOrder.IsAscending: false + X-Requested-With: XMLHttpRequest """ + _download_all = kwargs.get("download_all", False) + _album_type_blacklist = kwargs.get("album_type_blacklist", main_settings["album_type_blacklist"]) endpoint = self.HOST + "/" + url.source_type.value + "/filteralbums" @@ -1072,36 +1064,29 @@ class Musify(Page): "X-Requested-With": "XMLHttpRequest" }, name="discography_" + url.name_with_id) if r is None: - return [] - soup: BeautifulSoup = BeautifulSoup(r.content, features="html.parser") + return + + soup: BeautifulSoup = self.get_soup_from_response(r) for card_soup in soup.find_all("div", {"class": "card"}): - album = self._parse_album_card(card_soup, artist_name) - if album.album_type is AlbumType.COMPILATION_ALBUM or album.album_type is AlbumType.MIXTAPE: + album = self._parse_album_card(card_soup, source, artist_name, **kwargs) + if album.album_type in _album_type_blacklist: continue + artist.main_album_collection.append(album) - def fetch_artist(self, source: Source, stop_at_level: int = 1) -> Artist: + def fetch_artist(self, source: Source, **kwargs) -> Artist: """ - fetches artist from source - + TODO [x] discography [x] attributes [] picture gallery - - Args: - source (Source): the source to fetch - stop_at_level: int = 1: if it is false, every album from discograohy will be fetched. Defaults to False. - - Returns: - Artist: the artist fetched """ url = parse_url(source.url) - artist = self._get_artist_attributes(url) - - self._get_discography(artist, url, artist.name) + artist = self._fetch_initial_artist(url, source=source, **kwargs) + self._fetch_artist_discography(artist, url, artist.name, **kwargs) return artist