From 068c749c385b51479912bf4cb65c7b005efd5cca Mon Sep 17 00:00:00 2001 From: Lars Noack Date: Tue, 21 May 2024 15:27:10 +0200 Subject: [PATCH 01/13] feat: implemented artist search --- music_kraken/download/page_attributes.py | 3 +- music_kraken/pages/__init__.py | 1 + music_kraken/pages/genius.py | 312 +++++++++++++++++++++++ music_kraken/utils/shared.py | 4 +- 4 files changed, 317 insertions(+), 3 deletions(-) create mode 100644 music_kraken/pages/genius.py diff --git a/music_kraken/download/page_attributes.py b/music_kraken/download/page_attributes.py index 997960d..1db24be 100644 --- a/music_kraken/download/page_attributes.py +++ b/music_kraken/download/page_attributes.py @@ -30,11 +30,12 @@ from ..utils.exception import MKMissingNameException from ..utils.exception.download import UrlNotFoundException from ..utils.shared import DEBUG_PAGES -from ..pages import Page, EncyclopaediaMetallum, Musify, YouTube, YoutubeMusic, Bandcamp, INDEPENDENT_DB_OBJECTS +from ..pages import Page, EncyclopaediaMetallum, Musify, YouTube, YoutubeMusic, Bandcamp, Genius, INDEPENDENT_DB_OBJECTS ALL_PAGES: Set[Type[Page]] = { # EncyclopaediaMetallum, + Genius, Musify, YoutubeMusic, Bandcamp diff --git a/music_kraken/pages/__init__.py b/music_kraken/pages/__init__.py index 5757a2c..ba24501 100644 --- a/music_kraken/pages/__init__.py +++ b/music_kraken/pages/__init__.py @@ -3,5 +3,6 @@ from .musify import Musify from .youtube import YouTube from .youtube_music import YoutubeMusic from .bandcamp import Bandcamp +from .genius import Genius from .abstract import Page, INDEPENDENT_DB_OBJECTS diff --git a/music_kraken/pages/genius.py b/music_kraken/pages/genius.py new file mode 100644 index 0000000..7a609c8 --- /dev/null +++ b/music_kraken/pages/genius.py @@ -0,0 +1,312 @@ +from typing import List, Optional, Type +from urllib.parse import urlparse, urlunparse, urlencode +import json +from enum import Enum +from bs4 import BeautifulSoup +import pycountry + +from ..objects import Source, DatabaseObject +from .abstract import Page +from ..objects import ( + Artist, + Source, + SourceType, + Song, + Album, + Label, + Target, + Contact, + ID3Timestamp, + Lyrics, + FormattedText, + Artwork, +) +from ..connection import Connection +from ..utils import dump_to_file, traverse_json_path +from ..utils.enums import SourceType, ALL_SOURCE_TYPES +from ..utils.support_classes.download_result import DownloadResult +from ..utils.string_processing import clean_song_title +from ..utils.config import main_settings, logging_settings +from ..utils.shared import DEBUG + +if DEBUG: + from ..utils import dump_to_file + + +class Genius(Page): + SOURCE_TYPE = ALL_SOURCE_TYPES.GENIUS + HOST = "genius.com" + + def __init__(self, *args, **kwargs): + self.connection: Connection = Connection( + host="https://genius.com/", + logger=self.LOGGER, + module="genius", + ) + + super().__init__(*args, **kwargs) + + def get_source_type(self, source: Source) -> Optional[Type[DatabaseObject]]: + path = source.parsed_url.path.replace("/", "") + + return super().get_source_type(source) + + def general_search(self, search_query: str, **kwargs) -> List[DatabaseObject]: + results = [] + + search_params = { + "q": search_query, + } + + r = self.connection.get("https://genius.com/api/search/multi?" + urlencode(search_params), name=f"search_{search_query}") + if r is None: + return results + + dump_to_file("search_genius.json", r.text, is_json=True, exit_after_dump=False) + data = r.json() + + for elements in traverse_json_path(data, "response.sections", default=[]): + hits = elements.get("hits", []) + for hit in hits: + result = hit.get("result", {}) + hit_type = hit.get("type", result.get("_type")) + + name = result.get("name") + if name is None: + continue + source = Source(self.SOURCE_TYPE, result.get("url"), additional_data={ + "id": result.get("id"), + }) + if source.url is None: + continue + image_url = result.get("header_image_url") + + if hit_type == "artist": + results.append(Artist( + name=name, + source_list=[source] + )) + continue + + return results + + def fetch_label(self, source: Source, stop_at_level: int = 1) -> Label: + return Label() + + def _parse_artist_details(self, soup: BeautifulSoup) -> Artist: + name: str = None + source_list: List[Source] = [] + contact_list: List[Contact] = [] + + band_name_location: BeautifulSoup = soup.find("p", {"id": "band-name-location"}) + if band_name_location is not None: + title_span = band_name_location.find("span", {"class": "title"}) + if title_span is not None: + name = title_span.text.strip() + + link_container: BeautifulSoup = soup.find("ol", {"id": "band-links"}) + if link_container is not None: + li: BeautifulSoup + for li in link_container.find_all("a"): + if li is None and li['href'] is not None: + continue + + source_list.append(Source.match_url(_parse_artist_url(li['href']), referrer_page=self.SOURCE_TYPE)) + + return Artist( + name=name, + source_list=source_list + ) + + def _parse_album(self, soup: BeautifulSoup, initial_source: Source) -> List[Album]: + title = None + source_list: List[Source] = [] + + a = soup.find("a") + if a is not None and a["href"] is not None: + source_list.append(Source(self.SOURCE_TYPE, _get_host(initial_source) + a["href"])) + + title_p = soup.find("p", {"class": "title"}) + if title_p is not None: + title = title_p.text.strip() + + return Album(title=title, source_list=source_list) + + def _parse_artist_data_blob(self, data_blob: dict, artist_url: str): + parsed_artist_url = urlparse(artist_url) + album_list: List[Album] = [] + + for album_json in data_blob.get("buyfulldisco", {}).get("tralbums", []): + album_list.append(Album( + title=album_json["title"].strip(), + source_list=[Source( + self.SOURCE_TYPE, + urlunparse((parsed_artist_url.scheme, parsed_artist_url.netloc, album_json["page_url"], "", "", "")) + )] + )) + + return album_list + + def fetch_artist(self, source: Source, stop_at_level: int = 1) -> Artist: + artist = Artist() + + r = self.connection.get(_parse_artist_url(source.url), name=f"artist_{urlparse(source.url).scheme}_{urlparse(source.url).netloc}") + if r is None: + return artist + + soup = self.get_soup_from_response(r) + + if DEBUG: + dump_to_file("artist_page.html", r.text, exit_after_dump=False) + + artist = self._parse_artist_details(soup=soup.find("div", {"id": "bio-container"})) + + html_music_grid = soup.find("ol", {"id": "music-grid"}) + if html_music_grid is not None: + for subsoup in html_music_grid.find_all("li"): + artist.album_collection.append(self._parse_album(soup=subsoup, initial_source=source)) + + for i, data_blob_soup in enumerate(soup.find_all("div", {"id": ["pagedata", "collectors-data"]})): + data_blob = data_blob_soup["data-blob"] + + if DEBUG: + dump_to_file(f"bandcamp_artist_data_blob_{i}.json", data_blob, is_json=True, exit_after_dump=False) + + if data_blob is not None: + artist.album_collection.extend( + self._parse_artist_data_blob(json.loads(data_blob), source.url) + ) + + artist.source_collection.append(source) + return artist + + def _parse_track_element(self, track: dict, artwork: Artwork) -> Optional[Song]: + lyrics_list: List[Lyrics] = [] + + _lyrics: Optional[str] = track.get("item", {}).get("recordingOf", {}).get("lyrics", {}).get("text") + if _lyrics is not None: + lyrics_list.append(Lyrics(text=FormattedText(plain=_lyrics))) + + return Song( + title=clean_song_title(track["item"]["name"]), + source_list=[Source(self.SOURCE_TYPE, track["item"]["mainEntityOfPage"])], + tracksort=int(track["position"]), + artwork=artwork, + ) + + def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album: + album = Album() + + r = self.connection.get(source.url, name=f"album_{urlparse(source.url).netloc.split('.')[0]}_{urlparse(source.url).path.replace('/', '').replace('album', '')}") + if r is None: + return album + + soup = self.get_soup_from_response(r) + + data_container = soup.find("script", {"type": "application/ld+json"}) + + if DEBUG: + dump_to_file("album_data.json", data_container.text, is_json=True, exit_after_dump=False) + + data = json.loads(data_container.text) + artist_data = data["byArtist"] + + artist_source_list = [] + if "@id" in artist_data: + artist_source_list = [Source(self.SOURCE_TYPE, _parse_artist_url(artist_data["@id"]))] + album = Album( + title=data["name"].strip(), + source_list=[Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]))], + date=ID3Timestamp.strptime(data["datePublished"], "%d %b %Y %H:%M:%S %Z"), + artist_list=[Artist( + name=artist_data["name"].strip(), + source_list=artist_source_list + )] + ) + + artwork: Artwork = Artwork() + + def _get_artwork_url(_data: dict) -> Optional[str]: + if "image" in _data: + return _data["image"] + for _property in _data.get("additionalProperty", []): + if _property.get("name") == "art_id": + return f"https://f4.bcbits.com/img/a{_property.get('value')}_2.jpg" + + _artwork_url = _get_artwork_url(data) + if _artwork_url is not None: + artwork.append(url=_artwork_url, width=350, height=350) + else: + for album_release in data.get("albumRelease", []): + _artwork_url = _get_artwork_url(album_release) + if _artwork_url is not None: + artwork.append(url=_artwork_url, width=350, height=350) + break + + + for i, track_json in enumerate(data.get("track", {}).get("itemListElement", [])): + if DEBUG: + dump_to_file(f"album_track_{i}.json", json.dumps(track_json), is_json=True, exit_after_dump=False) + + try: + album.song_collection.append(self._parse_track_element(track_json, artwork=artwork)) + except KeyError: + continue + + album.source_collection.append(source) + return album + + def _fetch_lyrics(self, soup: BeautifulSoup) -> List[Lyrics]: + track_lyrics = soup.find("div", {"class": "lyricsText"}) + if track_lyrics: + return [Lyrics(text=FormattedText(html=track_lyrics.prettify()))] + + return [] + + def fetch_song(self, source: Source, stop_at_level: int = 1) -> Song: + r = self.connection.get(source.url, name=f"song_{urlparse(source.url).netloc.split('.')[0]}_{urlparse(source.url).path.replace('/', '').replace('track', '')}") + if r is None: + return Song() + + soup = self.get_soup_from_response(r) + + data_container = soup.find("script", {"type": "application/ld+json"}) + other_data = {} + + other_data_list = soup.select("script[data-tralbum]") + if len(other_data_list) > 0: + other_data = json.loads(other_data_list[0]["data-tralbum"]) + + dump_to_file("bandcamp_song_data.json", data_container.text, is_json=True, exit_after_dump=False) + dump_to_file("bandcamp_song_data_other.json", json.dumps(other_data), is_json=True, exit_after_dump=False) + dump_to_file("bandcamp_song_page.html", r.text, exit_after_dump=False) + + data = json.loads(data_container.text) + album_data = data["inAlbum"] + artist_data = data["byArtist"] + + mp3_url = None + for key, value in other_data.get("trackinfo", [{}])[0].get("file", {"": None}).items(): + mp3_url = value + + song = Song( + title=clean_song_title(data["name"], artist_name=artist_data["name"]), + source_list=[source, Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]), audio_url=mp3_url)], + album_list=[Album( + title=album_data["name"].strip(), + date=ID3Timestamp.strptime(data["datePublished"], "%d %b %Y %H:%M:%S %Z"), + source_list=[Source(self.SOURCE_TYPE, album_data["@id"])] + )], + artist_list=[Artist( + name=artist_data["name"].strip(), + source_list=[Source(self.SOURCE_TYPE, _parse_artist_url(artist_data["@id"]))] + )], + lyrics_list=self._fetch_lyrics(soup=soup) + ) + + return song + + def download_song_to_target(self, source: Source, target: Target, desc: str = None) -> DownloadResult: + if source.audio_url is None: + return DownloadResult(error_message="Couldn't find download link.") + return self.connection.stream_into(url=source.audio_url, target=target, description=desc) diff --git a/music_kraken/utils/shared.py b/music_kraken/utils/shared.py index 2a5d4a4..b75cf7f 100644 --- a/music_kraken/utils/shared.py +++ b/music_kraken/utils/shared.py @@ -15,11 +15,11 @@ __stage__ = os.getenv("STAGE", "prod") DEBUG = (__stage__ == "dev") and True DEBUG_LOGGING = DEBUG and False DEBUG_TRACE = DEBUG and True -DEBUG_OBJECT_TRACE = DEBUG and True +DEBUG_OBJECT_TRACE = DEBUG and False DEBUG_OBJECT_TRACE_CALLSTACK = DEBUG_OBJECT_TRACE and False DEBUG_YOUTUBE_INITIALIZING = DEBUG and False DEBUG_PAGES = DEBUG and False -DEBUG_DUMP = DEBUG and False +DEBUG_DUMP = DEBUG and True DEBUG_PRINT_ID = DEBUG and True if DEBUG: From f6caee41a87b7977809f9ea1e1e086477e82974d Mon Sep 17 00:00:00 2001 From: Lars Noack Date: Tue, 21 May 2024 15:52:41 +0200 Subject: [PATCH 02/13] feat: finished searching genious --- music_kraken/pages/genius.py | 106 ++++++++++++++++++++++++++++------- 1 file changed, 87 insertions(+), 19 deletions(-) diff --git a/music_kraken/pages/genius.py b/music_kraken/pages/genius.py index 7a609c8..1ff2eda 100644 --- a/music_kraken/pages/genius.py +++ b/music_kraken/pages/genius.py @@ -51,6 +51,90 @@ class Genius(Page): return super().get_source_type(source) + def add_to_artwork(self, artwork: Artwork, url: str): + if url is None: + return + + url_frags = url.split(".") + if len(url_frags) < 2: + artwork.append(url=url) + return + + dimensions = url_frags[-2].split("x") + if len(dimensions) < 2: + artwork.append(url=url) + return + + if len(dimensions) == 3: + dimensions = dimensions[:-1] + + try: + artwork.append(url=url, width=int(dimensions[0]), height=int(dimensions[1])) + except ValueError: + artwork.append(url=url) + + def parse_api_object(self, data: dict) -> Optional[DatabaseObject]: + object_type = data.get("_type") + + artwork = Artwork() + self.add_to_artwork(artwork, data.get("header_image_url")) + self.add_to_artwork(artwork, data.get("image_url")) + + source: Source = Source(self.SOURCE_TYPE, data.get("url"), additional_data={ + "id": data.get("id"), + "slug": data.get("slug"), + "api_path": data.get("api_path"), + }) + + if source.url is None: + return None + + if object_type == "artist": + return Artist( + name=data.get("name"), + source_list=[source], + artwork=artwork, + ) + + if object_type == "album": + self.add_to_artwork(artwork, data.get("cover_art_thumbnail_url")) + self.add_to_artwork(artwork, data.get("cover_art_url")) + + return Album( + title=data.get("name"), + source_list=[source], + artist_list=[self.parse_api_object(data.get("artist"))], + artwork=artwork, + date=ID3Timestamp(**data.get("release_date_components", {})), + ) + + if object_type == "song": + self.add_to_artwork(artwork, data.get("song_art_image_thumbnail_url")) + self.add_to_artwork(artwork, data.get("song_art_image_url")) + + main_artist_list = [] + featured_artist_list = [] + + _artist_name = None + primary_artist = self.parse_api_object(data.get("primary_artist")) + if primary_artist is not None: + _artist_name = primary_artist.name + main_artist_list.append(primary_artist) + for feature_artist in data.get("featured_artists", []): + artist = self.parse_api_object(feature_artist) + if artist is not None: + featured_artist_list.append(artist) + + return Song( + title=clean_song_title(data.get("title"), artist_name=_artist_name), + source_list=[source], + artwork=artwork, + feature_artist_list=featured_artist_list, + artist_list=main_artist_list, + ) + + return None + def general_search(self, search_query: str, **kwargs) -> List[DatabaseObject]: results = [] @@ -68,25 +152,9 @@ class Genius(Page): for elements in traverse_json_path(data, "response.sections", default=[]): hits = elements.get("hits", []) for hit in hits: - result = hit.get("result", {}) - hit_type = hit.get("type", result.get("_type")) - - name = result.get("name") - if name is None: - continue - source = Source(self.SOURCE_TYPE, result.get("url"), additional_data={ - "id": result.get("id"), - }) - if source.url is None: - continue - image_url = result.get("header_image_url") - - if hit_type == "artist": - results.append(Artist( - name=name, - source_list=[source] - )) - continue + parsed = self.parse_api_object(hit.get("result")) + if parsed is not None: + results.append(parsed) return results From e4fd9faf12a0bc3a9506106c3505a59bab57292d Mon Sep 17 00:00:00 2001 From: Lars Noack Date: Tue, 21 May 2024 15:57:09 +0200 Subject: [PATCH 03/13] feat: detecting url type --- development/actual_donwload.py | 5 ++- music_kraken/pages/genius.py | 64 ++++------------------------------ 2 files changed, 8 insertions(+), 61 deletions(-) diff --git a/development/actual_donwload.py b/development/actual_donwload.py index d91876e..ad4e316 100644 --- a/development/actual_donwload.py +++ b/development/actual_donwload.py @@ -6,9 +6,8 @@ logging.getLogger().setLevel(logging.DEBUG) if __name__ == "__main__": commands = [ - "s: #a I'm in a coffin", - "0", - "d: 0", + "s: #a Crystal F", + "10", ] diff --git a/music_kraken/pages/genius.py b/music_kraken/pages/genius.py index 1ff2eda..e6fa86f 100644 --- a/music_kraken/pages/genius.py +++ b/music_kraken/pages/genius.py @@ -48,8 +48,13 @@ class Genius(Page): def get_source_type(self, source: Source) -> Optional[Type[DatabaseObject]]: path = source.parsed_url.path.replace("/", "") + + if path.startswith("artists"): + return Artist + if path.startswith("albums"): + return Album - return super().get_source_type(source) + return Song def add_to_artwork(self, artwork: Artwork, url: str): if url is None: @@ -158,63 +163,6 @@ class Genius(Page): return results - def fetch_label(self, source: Source, stop_at_level: int = 1) -> Label: - return Label() - - def _parse_artist_details(self, soup: BeautifulSoup) -> Artist: - name: str = None - source_list: List[Source] = [] - contact_list: List[Contact] = [] - - band_name_location: BeautifulSoup = soup.find("p", {"id": "band-name-location"}) - if band_name_location is not None: - title_span = band_name_location.find("span", {"class": "title"}) - if title_span is not None: - name = title_span.text.strip() - - link_container: BeautifulSoup = soup.find("ol", {"id": "band-links"}) - if link_container is not None: - li: BeautifulSoup - for li in link_container.find_all("a"): - if li is None and li['href'] is not None: - continue - - source_list.append(Source.match_url(_parse_artist_url(li['href']), referrer_page=self.SOURCE_TYPE)) - - return Artist( - name=name, - source_list=source_list - ) - - def _parse_album(self, soup: BeautifulSoup, initial_source: Source) -> List[Album]: - title = None - source_list: List[Source] = [] - - a = soup.find("a") - if a is not None and a["href"] is not None: - source_list.append(Source(self.SOURCE_TYPE, _get_host(initial_source) + a["href"])) - - title_p = soup.find("p", {"class": "title"}) - if title_p is not None: - title = title_p.text.strip() - - return Album(title=title, source_list=source_list) - - def _parse_artist_data_blob(self, data_blob: dict, artist_url: str): - parsed_artist_url = urlparse(artist_url) - album_list: List[Album] = [] - - for album_json in data_blob.get("buyfulldisco", {}).get("tralbums", []): - album_list.append(Album( - title=album_json["title"].strip(), - source_list=[Source( - self.SOURCE_TYPE, - urlunparse((parsed_artist_url.scheme, parsed_artist_url.netloc, album_json["page_url"], "", "", "")) - )] - )) - - return album_list - def fetch_artist(self, source: Source, stop_at_level: int = 1) -> Artist: artist = Artist() From adfce16d2a2654a665a5198c3355e6ae4c714ea3 Mon Sep 17 00:00:00 2001 From: Lars Noack Date: Tue, 21 May 2024 16:21:58 +0200 Subject: [PATCH 04/13] feat: fetched the flat artist details --- .vscode/settings.json | 1 + music_kraken/connection/cache.py | 6 ++-- music_kraken/objects/artwork.py | 2 ++ music_kraken/pages/genius.py | 49 ++++++++++++++++++-------------- 4 files changed, 33 insertions(+), 25 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index bea0c42..fbc21fa 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -27,6 +27,7 @@ "Gitea", "iframe", "isrc", + "itemprop", "levenshtein", "metallum", "MUSICBRAINZ", diff --git a/music_kraken/connection/cache.py b/music_kraken/connection/cache.py index 004a6ba..4c40fe9 100644 --- a/music_kraken/connection/cache.py +++ b/music_kraken/connection/cache.py @@ -136,13 +136,13 @@ class Cache: ) self._write_attribute(cache_attribute) - cache_path = fit_to_file_system(Path(module_path, name), hidden_ok=True) + cache_path = fit_to_file_system(Path(module_path, name.replace("/", "_")), hidden_ok=True) with cache_path.open("wb") as content_file: self.logger.debug(f"writing cache to {cache_path}") content_file.write(content) def get(self, name: str) -> Optional[CacheResult]: - path = fit_to_file_system(Path(self._dir, self.module, name), hidden_ok=True) + path = fit_to_file_system(Path(self._dir, self.module, name.replace("/", "_")), hidden_ok=True) if not path.is_file(): return None @@ -165,7 +165,7 @@ class Cache: if ca.name == "": continue - file = fit_to_file_system(Path(self._dir, ca.module, ca.name), hidden_ok=True) + file = fit_to_file_system(Path(self._dir, ca.module, ca.name.replace("/", "_")), hidden_ok=True) if not ca.is_valid: self.logger.debug(f"deleting cache {ca.id}") diff --git a/music_kraken/objects/artwork.py b/music_kraken/objects/artwork.py index d5ba54b..178edf6 100644 --- a/music_kraken/objects/artwork.py +++ b/music_kraken/objects/artwork.py @@ -59,4 +59,6 @@ class Artwork: self._variant_mapping[key] = value def __eq__(self, other: Artwork) -> bool: + if not isinstance(other, Artwork): + return False return any(a == b for a, b in zip(self._variant_mapping.keys(), other._variant_mapping.keys())) diff --git a/music_kraken/pages/genius.py b/music_kraken/pages/genius.py index e6fa86f..7c3af73 100644 --- a/music_kraken/pages/genius.py +++ b/music_kraken/pages/genius.py @@ -85,20 +85,38 @@ class Genius(Page): self.add_to_artwork(artwork, data.get("header_image_url")) self.add_to_artwork(artwork, data.get("image_url")) + additional_sources: List[Source] = [] source: Source = Source(self.SOURCE_TYPE, data.get("url"), additional_data={ "id": data.get("id"), "slug": data.get("slug"), "api_path": data.get("api_path"), }) + notes = FormattedText() + description = data.get("description", {}) + if "html" in description: + notes.html = description["html"] + elif "markdown" in description: + notes.markdown = description["markdown"] + elif "description_preview" in data: + notes.plain = data["description_preview"] + if source.url is None: return None if object_type == "artist": + if data.get("instagram_name") is not None: + additional_sources.append(Source(ALL_SOURCE_TYPES.INSTAGRAM, f"https://www.instagram.com/{data['instagram_name']}/")) + if data.get("facebook_name") is not None: + additional_sources.append(Source(ALL_SOURCE_TYPES.FACEBOOK, f"https://www.facebook.com/{data['facebook_name']}/")) + if data.get("twitter_name") is not None: + additional_sources.append(Source(ALL_SOURCE_TYPES.TWITTER, f"https://x.com/{data['twitter_name']}/")) + return Artist( name=data.get("name"), source_list=[source], artwork=artwork, + notes=notes, ) if object_type == "album": @@ -165,35 +183,22 @@ class Genius(Page): def fetch_artist(self, source: Source, stop_at_level: int = 1) -> Artist: artist = Artist() + # https://genius.com/api/artists/24527/albums?page=1 - r = self.connection.get(_parse_artist_url(source.url), name=f"artist_{urlparse(source.url).scheme}_{urlparse(source.url).netloc}") + r = self.connection.get(source.url, name=source.url) if r is None: return artist - soup = self.get_soup_from_response(r) - if DEBUG: - dump_to_file("artist_page.html", r.text, exit_after_dump=False) + # find the content attribute in the meta tag which is contained in the head + data_container = soup.find("meta", {"itemprop": "page_data"}) + if data_container is not None: + content = data_container["content"] + dump_to_file("genius_itemprop_artist.json", content, is_json=True, exit_after_dump=False) + data = json.loads(content) - artist = self._parse_artist_details(soup=soup.find("div", {"id": "bio-container"})) + artist = self.parse_api_object(data.get("artist", {})) - html_music_grid = soup.find("ol", {"id": "music-grid"}) - if html_music_grid is not None: - for subsoup in html_music_grid.find_all("li"): - artist.album_collection.append(self._parse_album(soup=subsoup, initial_source=source)) - - for i, data_blob_soup in enumerate(soup.find_all("div", {"id": ["pagedata", "collectors-data"]})): - data_blob = data_blob_soup["data-blob"] - - if DEBUG: - dump_to_file(f"bandcamp_artist_data_blob_{i}.json", data_blob, is_json=True, exit_after_dump=False) - - if data_blob is not None: - artist.album_collection.extend( - self._parse_artist_data_blob(json.loads(data_blob), source.url) - ) - - artist.source_collection.append(source) return artist def _parse_track_element(self, track: dict, artwork: Artwork) -> Optional[Song]: From 46b64b8f8dfa2436c9ed103e19c243fef264747b Mon Sep 17 00:00:00 2001 From: Lars Noack Date: Tue, 21 May 2024 16:23:05 +0200 Subject: [PATCH 05/13] feat: fetched the flat artist details --- music_kraken/pages/genius.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/music_kraken/pages/genius.py b/music_kraken/pages/genius.py index 7c3af73..f37715b 100644 --- a/music_kraken/pages/genius.py +++ b/music_kraken/pages/genius.py @@ -198,6 +198,9 @@ class Genius(Page): data = json.loads(content) artist = self.parse_api_object(data.get("artist", {})) + + for e in data.get("artist_albums", []): + artist.album_collection.append(self.parse_api_object(e)) return artist From f5d953d9ce6f844d7107e1b946bf2ef0558aca32 Mon Sep 17 00:00:00 2001 From: Lars Noack Date: Tue, 21 May 2024 16:34:04 +0200 Subject: [PATCH 06/13] feat: theoretically fetching feature songs --- music_kraken/pages/genius.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/music_kraken/pages/genius.py b/music_kraken/pages/genius.py index f37715b..15f1ac2 100644 --- a/music_kraken/pages/genius.py +++ b/music_kraken/pages/genius.py @@ -182,7 +182,7 @@ class Genius(Page): return results def fetch_artist(self, source: Source, stop_at_level: int = 1) -> Artist: - artist = Artist() + artist: Artist = Artist() # https://genius.com/api/artists/24527/albums?page=1 r = self.connection.get(source.url, name=source.url) @@ -200,7 +200,26 @@ class Genius(Page): artist = self.parse_api_object(data.get("artist", {})) for e in data.get("artist_albums", []): - artist.album_collection.append(self.parse_api_object(e)) + r = self.parse_api_object(e) + if not isinstance(r, Album): + continue + + artist.album_collection.append(r) + + for e in data.get("artist_songs", []): + r = self.parse_api_object(e) + if not isinstance(r, Song): + continue + + """ + TODO + fetch the album for these songs, because the api doesn't + return them + """ + + artist.album_collection.extend(r.album_collection) + + artist.source_collection.append(source) return artist From 769d27dc5c6174c1c9a6d676cb4a56977fe5b9bf Mon Sep 17 00:00:00 2001 From: Lars Noack Date: Tue, 21 May 2024 16:43:52 +0200 Subject: [PATCH 07/13] feat: album details --- development/actual_donwload.py | 1 + music_kraken/pages/genius.py | 134 +++++---------------------------- 2 files changed, 20 insertions(+), 115 deletions(-) diff --git a/development/actual_donwload.py b/development/actual_donwload.py index ad4e316..1ee7563 100644 --- a/development/actual_donwload.py +++ b/development/actual_donwload.py @@ -8,6 +8,7 @@ if __name__ == "__main__": commands = [ "s: #a Crystal F", "10", + "1", ] diff --git a/music_kraken/pages/genius.py b/music_kraken/pages/genius.py index 15f1ac2..e17aea9 100644 --- a/music_kraken/pages/genius.py +++ b/music_kraken/pages/genius.py @@ -113,7 +113,7 @@ class Genius(Page): additional_sources.append(Source(ALL_SOURCE_TYPES.TWITTER, f"https://x.com/{data['twitter_name']}/")) return Artist( - name=data.get("name"), + name=data.get("name").strip(), source_list=[source], artwork=artwork, notes=notes, @@ -123,8 +123,12 @@ class Genius(Page): self.add_to_artwork(artwork, data.get("cover_art_thumbnail_url")) self.add_to_artwork(artwork, data.get("cover_art_url")) + for cover_art in data.get("cover_arts", []): + self.add_to_artwork(artwork, cover_art.get("image_url")) + self.add_to_artwork(artwork, cover_art.get("thumbnail_image_url")) + return Album( - title=data.get("name"), + title=data.get("name").strip(), source_list=[source], artist_list=[self.parse_api_object(data.get("artist"))], artwork=artwork, @@ -223,130 +227,30 @@ class Genius(Page): return artist - def _parse_track_element(self, track: dict, artwork: Artwork) -> Optional[Song]: - lyrics_list: List[Lyrics] = [] - - _lyrics: Optional[str] = track.get("item", {}).get("recordingOf", {}).get("lyrics", {}).get("text") - if _lyrics is not None: - lyrics_list.append(Lyrics(text=FormattedText(plain=_lyrics))) - - return Song( - title=clean_song_title(track["item"]["name"]), - source_list=[Source(self.SOURCE_TYPE, track["item"]["mainEntityOfPage"])], - tracksort=int(track["position"]), - artwork=artwork, - ) - def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album: - album = Album() + album: Album = Album() + # https://genius.com/api/artists/24527/albums?page=1 - r = self.connection.get(source.url, name=f"album_{urlparse(source.url).netloc.split('.')[0]}_{urlparse(source.url).path.replace('/', '').replace('album', '')}") + r = self.connection.get(source.url, name=source.url) if r is None: return album - soup = self.get_soup_from_response(r) - data_container = soup.find("script", {"type": "application/ld+json"}) - - if DEBUG: - dump_to_file("album_data.json", data_container.text, is_json=True, exit_after_dump=False) - - data = json.loads(data_container.text) - artist_data = data["byArtist"] - - artist_source_list = [] - if "@id" in artist_data: - artist_source_list = [Source(self.SOURCE_TYPE, _parse_artist_url(artist_data["@id"]))] - album = Album( - title=data["name"].strip(), - source_list=[Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]))], - date=ID3Timestamp.strptime(data["datePublished"], "%d %b %Y %H:%M:%S %Z"), - artist_list=[Artist( - name=artist_data["name"].strip(), - source_list=artist_source_list - )] - ) - - artwork: Artwork = Artwork() - - def _get_artwork_url(_data: dict) -> Optional[str]: - if "image" in _data: - return _data["image"] - for _property in _data.get("additionalProperty", []): - if _property.get("name") == "art_id": - return f"https://f4.bcbits.com/img/a{_property.get('value')}_2.jpg" - - _artwork_url = _get_artwork_url(data) - if _artwork_url is not None: - artwork.append(url=_artwork_url, width=350, height=350) - else: - for album_release in data.get("albumRelease", []): - _artwork_url = _get_artwork_url(album_release) - if _artwork_url is not None: - artwork.append(url=_artwork_url, width=350, height=350) - break - - - for i, track_json in enumerate(data.get("track", {}).get("itemListElement", [])): - if DEBUG: - dump_to_file(f"album_track_{i}.json", json.dumps(track_json), is_json=True, exit_after_dump=False) - - try: - album.song_collection.append(self._parse_track_element(track_json, artwork=artwork)) - except KeyError: - continue + # find the content attribute in the meta tag which is contained in the head + data_container = soup.find("meta", {"itemprop": "page_data"}) + if data_container is not None: + content = data_container["content"] + dump_to_file("genius_itemprop_album.json", content, is_json=True, exit_after_dump=False) + data = json.loads(content) + album = self.parse_api_object(data.get("album", {})) + album.source_collection.append(source) + return album - def _fetch_lyrics(self, soup: BeautifulSoup) -> List[Lyrics]: - track_lyrics = soup.find("div", {"class": "lyricsText"}) - if track_lyrics: - return [Lyrics(text=FormattedText(html=track_lyrics.prettify()))] - - return [] - def fetch_song(self, source: Source, stop_at_level: int = 1) -> Song: - r = self.connection.get(source.url, name=f"song_{urlparse(source.url).netloc.split('.')[0]}_{urlparse(source.url).path.replace('/', '').replace('track', '')}") - if r is None: - return Song() - - soup = self.get_soup_from_response(r) - - data_container = soup.find("script", {"type": "application/ld+json"}) - other_data = {} - - other_data_list = soup.select("script[data-tralbum]") - if len(other_data_list) > 0: - other_data = json.loads(other_data_list[0]["data-tralbum"]) - - dump_to_file("bandcamp_song_data.json", data_container.text, is_json=True, exit_after_dump=False) - dump_to_file("bandcamp_song_data_other.json", json.dumps(other_data), is_json=True, exit_after_dump=False) - dump_to_file("bandcamp_song_page.html", r.text, exit_after_dump=False) - - data = json.loads(data_container.text) - album_data = data["inAlbum"] - artist_data = data["byArtist"] - - mp3_url = None - for key, value in other_data.get("trackinfo", [{}])[0].get("file", {"": None}).items(): - mp3_url = value - - song = Song( - title=clean_song_title(data["name"], artist_name=artist_data["name"]), - source_list=[source, Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]), audio_url=mp3_url)], - album_list=[Album( - title=album_data["name"].strip(), - date=ID3Timestamp.strptime(data["datePublished"], "%d %b %Y %H:%M:%S %Z"), - source_list=[Source(self.SOURCE_TYPE, album_data["@id"])] - )], - artist_list=[Artist( - name=artist_data["name"].strip(), - source_list=[Source(self.SOURCE_TYPE, _parse_artist_url(artist_data["@id"]))] - )], - lyrics_list=self._fetch_lyrics(soup=soup) - ) - + song = Song() return song def download_song_to_target(self, source: Source, target: Target, desc: str = None) -> DownloadResult: From 688b4fd357b2d927af78e2b5731b270977a15038 Mon Sep 17 00:00:00 2001 From: Lars Noack Date: Tue, 21 May 2024 16:47:38 +0200 Subject: [PATCH 08/13] feat: getting the album tracklist --- music_kraken/objects/formatted_text.py | 8 ++++++++ music_kraken/pages/genius.py | 13 ++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/music_kraken/objects/formatted_text.py b/music_kraken/objects/formatted_text.py index 99e9ae2..6acb86e 100644 --- a/music_kraken/objects/formatted_text.py +++ b/music_kraken/objects/formatted_text.py @@ -37,11 +37,19 @@ class FormattedText: @property def markdown(self) -> str: return md(self.html).strip() + + @markdown.setter + def markdown(self, value: str) -> None: + self.html = mistune.markdown(value) @property def plain(self) -> str: md = self.markdown return md.replace("\n\n", "\n") + + @plain.setter + def plain(self, value: str) -> None: + self.html = mistune.markdown(plain_to_markdown(value)) def __str__(self) -> str: return self.markdown diff --git a/music_kraken/pages/genius.py b/music_kraken/pages/genius.py index e17aea9..8c8a15f 100644 --- a/music_kraken/pages/genius.py +++ b/music_kraken/pages/genius.py @@ -99,7 +99,7 @@ class Genius(Page): elif "markdown" in description: notes.markdown = description["markdown"] elif "description_preview" in data: - notes.plain = data["description_preview"] + notes.plaintext = data["description_preview"] if source.url is None: return None @@ -126,7 +126,7 @@ class Genius(Page): for cover_art in data.get("cover_arts", []): self.add_to_artwork(artwork, cover_art.get("image_url")) self.add_to_artwork(artwork, cover_art.get("thumbnail_image_url")) - + return Album( title=data.get("name").strip(), source_list=[source], @@ -244,7 +244,14 @@ class Genius(Page): data = json.loads(content) album = self.parse_api_object(data.get("album", {})) - + + for e in data.get("album_appearances", []): + r = self.parse_api_object(e.get("song")) + if not isinstance(r, Song): + continue + + album.song_collection.append(r) + album.source_collection.append(source) return album From f61b34dd40ec0727e00f75fd0f306e31651b2b00 Mon Sep 17 00:00:00 2001 From: Lars Noack Date: Tue, 21 May 2024 16:52:01 +0200 Subject: [PATCH 09/13] feat: improved feature artists by also adding writer and producer to it --- music_kraken/pages/genius.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/music_kraken/pages/genius.py b/music_kraken/pages/genius.py index 8c8a15f..afead40 100644 --- a/music_kraken/pages/genius.py +++ b/music_kraken/pages/genius.py @@ -147,7 +147,7 @@ class Genius(Page): if primary_artist is not None: _artist_name = primary_artist.name main_artist_list.append(primary_artist) - for feature_artist in data.get("featured_artists", []): + for feature_artist in (*data.get("featured_artists", []), *data.get("producer_artists", []), *data.get("writer_artists", [])): artist = self.parse_api_object(feature_artist) if artist is not None: featured_artist_list.append(artist) From 7b4eee858ab264e264f697bebebf68e295f28a7c Mon Sep 17 00:00:00 2001 From: Lars Noack Date: Tue, 21 May 2024 17:14:58 +0200 Subject: [PATCH 10/13] feat: parsed script json --- development/actual_donwload.py | 1 + music_kraken/pages/genius.py | 32 +++++++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/development/actual_donwload.py b/development/actual_donwload.py index 1ee7563..ad8f1d0 100644 --- a/development/actual_donwload.py +++ b/development/actual_donwload.py @@ -9,6 +9,7 @@ if __name__ == "__main__": "s: #a Crystal F", "10", "1", + "3", ] diff --git a/music_kraken/pages/genius.py b/music_kraken/pages/genius.py index afead40..885aed8 100644 --- a/music_kraken/pages/genius.py +++ b/music_kraken/pages/genius.py @@ -256,8 +256,38 @@ class Genius(Page): return album + def get_json_content_from_response(self, response, start: str, end: str) -> Optional[str]: + content = response.text + start_index = content.find(start) + if start_index < 0: + return None + start_index += len(start) + end_index = content.find(end, start_index) + if end_index < 0: + return None + return content[start_index:end_index] + def fetch_song(self, source: Source, stop_at_level: int = 1) -> Song: - song = Song() + song: Song = Song() + + r = self.connection.get(source.url, name=source.url) + if r is None: + return song + + # get the contents that are between `JSON.parse('` and `');` + content = self.get_json_content_from_response(r, start="window.__PRELOADED_STATE__ = JSON.parse('", end="');\n window.__APP_CONFIG__ = ") + if content is not None: + content = content.replace("\\\\", "\\").replace('\\"', '"').replace("\\'", "'") + data = json.loads(content) + + dump_to_file("genius_song_script_json.json", content, is_json=True, exit_after_dump=False) + + soup = self.get_soup_from_response(r) + for lyrics in soup.find_all("div", {"data-lyrics-container": "true"}): + lyrics_object = Lyrics(FormattedText(html=lyrics.prettify())) + song.lyrics_collection.append(lyrics_object) + + song.source_collection.append(source) return song def download_song_to_target(self, source: Source, target: Target, desc: str = None) -> DownloadResult: From 90f70638b4b288421be9d646ddc38dd1a9d7a979 Mon Sep 17 00:00:00 2001 From: Lars Noack Date: Tue, 21 May 2024 17:55:08 +0200 Subject: [PATCH 11/13] feat: better lyrics support --- music_kraken/pages/genius.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/music_kraken/pages/genius.py b/music_kraken/pages/genius.py index 885aed8..0664ac4 100644 --- a/music_kraken/pages/genius.py +++ b/music_kraken/pages/genius.py @@ -280,6 +280,10 @@ class Genius(Page): content = content.replace("\\\\", "\\").replace('\\"', '"').replace("\\'", "'") data = json.loads(content) + lyrics_html = traverse_json_path(data, "songPage.lyricsData.body.html", default=None) + if lyrics_html is not None: + song.lyrics_collection.append(Lyrics(FormattedText(html=lyrics_html))) + dump_to_file("genius_song_script_json.json", content, is_json=True, exit_after_dump=False) soup = self.get_soup_from_response(r) @@ -289,8 +293,3 @@ class Genius(Page): song.source_collection.append(source) return song - - def download_song_to_target(self, source: Source, target: Target, desc: str = None) -> DownloadResult: - if source.audio_url is None: - return DownloadResult(error_message="Couldn't find download link.") - return self.connection.stream_into(url=source.audio_url, target=target, description=desc) From 49dc7093c893749544246ae76d05406b64e3f991 Mon Sep 17 00:00:00 2001 From: Lars Noack Date: Wed, 22 May 2024 15:18:43 +0200 Subject: [PATCH 12/13] fix: genius fallback --- music_kraken/pages/genius.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/music_kraken/pages/genius.py b/music_kraken/pages/genius.py index 0664ac4..1719025 100644 --- a/music_kraken/pages/genius.py +++ b/music_kraken/pages/genius.py @@ -79,6 +79,8 @@ class Genius(Page): artwork.append(url=url) def parse_api_object(self, data: dict) -> Optional[DatabaseObject]: + if data is None: + return None object_type = data.get("_type") artwork = Artwork() @@ -93,7 +95,7 @@ class Genius(Page): }) notes = FormattedText() - description = data.get("description", {}) + description = data.get("description") or {} if "html" in description: notes.html = description["html"] elif "markdown" in description: @@ -113,7 +115,7 @@ class Genius(Page): additional_sources.append(Source(ALL_SOURCE_TYPES.TWITTER, f"https://x.com/{data['twitter_name']}/")) return Artist( - name=data.get("name").strip(), + name=data["name"].strip() if data.get("name") is not None else None, source_list=[source], artwork=artwork, notes=notes, @@ -147,7 +149,7 @@ class Genius(Page): if primary_artist is not None: _artist_name = primary_artist.name main_artist_list.append(primary_artist) - for feature_artist in (*data.get("featured_artists", []), *data.get("producer_artists", []), *data.get("writer_artists", [])): + for feature_artist in (*(data.get("featured_artists") or []), *(data.get("producer_artists") or []), *(data.get("writer_artists") or [])): artist = self.parse_api_object(feature_artist) if artist is not None: featured_artist_list.append(artist) @@ -201,16 +203,16 @@ class Genius(Page): dump_to_file("genius_itemprop_artist.json", content, is_json=True, exit_after_dump=False) data = json.loads(content) - artist = self.parse_api_object(data.get("artist", {})) + artist = self.parse_api_object(data.get("artist")) - for e in data.get("artist_albums", []): + for e in (data.get("artist_albums") or []): r = self.parse_api_object(e) if not isinstance(r, Album): continue artist.album_collection.append(r) - for e in data.get("artist_songs", []): + for e in (data.get("artist_songs") or []): r = self.parse_api_object(e) if not isinstance(r, Song): continue @@ -243,7 +245,7 @@ class Genius(Page): dump_to_file("genius_itemprop_album.json", content, is_json=True, exit_after_dump=False) data = json.loads(content) - album = self.parse_api_object(data.get("album", {})) + album = self.parse_api_object(data.get("album")) for e in data.get("album_appearances", []): r = self.parse_api_object(e.get("song")) From 344da0a0bf8cdfaea474d8999a7cbfddee09a350 Mon Sep 17 00:00:00 2001 From: Lars Noack Date: Wed, 22 May 2024 15:20:26 +0200 Subject: [PATCH 13/13] fix: converting pictures to rgb before saving --- music_kraken/audio/metadata.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/music_kraken/audio/metadata.py b/music_kraken/audio/metadata.py index d3f450e..bceb775 100644 --- a/music_kraken/audio/metadata.py +++ b/music_kraken/audio/metadata.py @@ -93,6 +93,10 @@ def write_metadata_to_target(metadata: Metadata, target: Target, song: Song): # resize the image to the preferred resolution img.thumbnail((main_settings["preferred_artwork_resolution"], main_settings["preferred_artwork_resolution"])) + # https://stackoverflow.com/a/59476938/16804841 + if img.mode != 'RGB': + img = img.convert('RGB') + img.save(converted_target.file_path, "JPEG") # https://stackoverflow.com/questions/70228440/mutagen-how-can-i-correctly-embed-album-art-into-mp3-file-so-that-i-can-see-t