feat: musicbrainz overall search

feat: Musicbrainz orientation and class creation
hotfix
2024-09-16 13:12:50 +02:00 · 2024-07-03 10:48:44 +02:00 · 2024-06-03 10:19:32 +02:00 · 2024-05-23 13:36:10 +02:00 · 2024-05-23 13:24:25 +02:00 · 2024-05-23 13:20:34 +02:00
10 changed files with 240 additions and 123 deletions
@@ -8,6 +8,8 @@ if __name__ == "__main__":
    commands = [
        "s: #a Crystal F",
        "10",
        "1",
        "3",
    ]
@@ -93,6 +93,10 @@ def write_metadata_to_target(metadata: Metadata, target: Target, song: Song):
            # resize the image to the preferred resolution
            img.thumbnail((main_settings["preferred_artwork_resolution"], main_settings["preferred_artwork_resolution"]))
            # https://stackoverflow.com/a/59476938/16804841
            if img.mode != 'RGB':
                img = img.convert('RGB')
            img.save(converted_target.file_path, "JPEG")
        # https://stackoverflow.com/questions/70228440/mutagen-how-can-i-correctly-embed-album-art-into-mp3-file-so-that-i-can-see-t
@@ -6,6 +6,7 @@ from typing import List, Optional
 from functools import lru_cache
 import logging
 from ..utils import output, BColors
 from ..utils.config import main_settings
 from ..utils.string_processing import fit_to_file_system
@@ -204,9 +205,12 @@ class Cache:
        for path in self._dir.iterdir():
            if path.is_dir():
                for file in path.iterdir():
                    output(f"Deleting file {file}", color=BColors.GREY)
                    file.unlink()
                output(f"Deleting folder {path}", color=BColors.HEADER)
                path.rmdir()
            else:
                output(f"Deleting folder {path}", color=BColors.HEADER)
                path.unlink()
        self.cached_attributes.clear()
@@ -30,7 +30,7 @@ from ..utils.exception import MKMissingNameException
 from ..utils.exception.download import UrlNotFoundException
 from ..utils.shared import DEBUG_PAGES
-from ..pages import Page, EncyclopaediaMetallum, Musify, YouTube, YoutubeMusic, Bandcamp, Genius, INDEPENDENT_DB_OBJECTS
+from ..pages import Page, EncyclopaediaMetallum, Musify, YouTube, YoutubeMusic, Bandcamp, Musicbrainz, Genius, INDEPENDENT_DB_OBJECTS
 ALL_PAGES: Set[Type[Page]] = {
@@ -38,7 +38,8 @@ ALL_PAGES: Set[Type[Page]] = {
    Genius,
    Musify,
    YoutubeMusic,
-    Bandcamp
+    Bandcamp,
    Musicbrainz
 }
 if youtube_settings["use_youtube_alongside_youtube_music"]:
@@ -38,11 +38,19 @@ class FormattedText:
    def markdown(self) -> str:
        return md(self.html).strip()
    @markdown.setter
    def markdown(self, value: str) -> None:
        self.html = mistune.markdown(value)
    @property
    def plain(self) -> str:
        """Return the markdown rendering with every double newline collapsed into a single one."""
        return self.markdown.replace("\n\n", "\n")
    @plain.setter
    def plain(self, value: str) -> None:
        self.html = mistune.markdown(plain_to_markdown(value))
    def __str__(self) -> str:
        return self.markdown
@@ -1,5 +1,6 @@
 from .encyclopaedia_metallum import EncyclopaediaMetallum
 from .musify import Musify
 from .musicbrainz import Musicbrainz
 from .youtube import YouTube
 from .youtube_music import YoutubeMusic
 from .bandcamp import Bandcamp
@@ -79,6 +79,8 @@ class Genius(Page):
            artwork.append(url=url)
    def parse_api_object(self, data: dict) -> Optional[DatabaseObject]:
        if data is None:
            return None
        object_type = data.get("_type")
        artwork = Artwork()
@@ -93,13 +95,13 @@ class Genius(Page):
        })
        notes = FormattedText()
-        description = data.get("description", {})
+        description = data.get("description") or {}
        if "html" in description:
            notes.html = description["html"]
        elif "markdown" in description:
            notes.markdown = description["markdown"]
        elif "description_preview" in data:
-            notes.plain = data["description_preview"]
+            notes.plaintext = data["description_preview"]
        if source.url is None:
            return None
@@ -113,7 +115,7 @@ class Genius(Page):
                additional_sources.append(Source(ALL_SOURCE_TYPES.TWITTER, f"https://x.com/{data['twitter_name']}/"))
            return Artist(
-                name=data.get("name"),
+                name=data["name"].strip() if data.get("name") is not None else None,
                source_list=[source],
                artwork=artwork,
                notes=notes,
@@ -123,12 +125,16 @@ class Genius(Page):
            self.add_to_artwork(artwork, data.get("cover_art_thumbnail_url"))
            self.add_to_artwork(artwork, data.get("cover_art_url"))
            for cover_art in data.get("cover_arts", []):
                self.add_to_artwork(artwork, cover_art.get("image_url"))
                self.add_to_artwork(artwork, cover_art.get("thumbnail_image_url"))
            return Album(
-                title=data.get("name"),
+                title=data.get("name").strip(),
                source_list=[source],
                artist_list=[self.parse_api_object(data.get("artist"))],
                artwork=artwork,
-                date=ID3Timestamp(**data.get("release_date_components", {})),
+                date=ID3Timestamp(**(data.get("release_date_components") or {})),
            )
        if object_type == "song":
@@ -143,7 +149,7 @@ class Genius(Page):
            if primary_artist is not None:
                _artist_name = primary_artist.name
                main_artist_list.append(primary_artist)
-            for feature_artist in data.get("featured_artists", []):
+            for feature_artist in (*(data.get("featured_artists") or []), *(data.get("producer_artists") or []), *(data.get("writer_artists") or [])):
                artist = self.parse_api_object(feature_artist)
                if artist is not None:
                    featured_artist_list.append(artist)
@@ -197,16 +203,16 @@ class Genius(Page):
            dump_to_file("genius_itemprop_artist.json", content, is_json=True, exit_after_dump=False)
            data = json.loads(content)
-            artist = self.parse_api_object(data.get("artist", {}))
+            artist = self.parse_api_object(data.get("artist"))
-            for e in data.get("artist_albums", []):
+            for e in (data.get("artist_albums") or []):
                r = self.parse_api_object(e)
                if not isinstance(r, Album):
                    continue
                artist.album_collection.append(r)
-            for e in data.get("artist_songs", []):
+            for e in (data.get("artist_songs") or []):
                r = self.parse_api_object(e)
                if not isinstance(r, Song):
                    continue
@@ -223,133 +229,69 @@ class Genius(Page):
        return artist
    def _parse_track_element(self, track: dict, artwork: Artwork) -> Optional[Song]:
        """
        Build a Song from one track entry of an album's track list.

        :param track: track entry; must provide ``track["item"]["name"]``,
            ``track["item"]["mainEntityOfPage"]`` and ``track["position"]``.
            Lyrics are read from ``item.recordingOf.lyrics.text`` when present.
        :param artwork: artwork object attached to the created song.
        :return: the parsed Song.
        :raises KeyError: if one of the required keys is missing.
        """
        lyrics_list: List[Lyrics] = []

        lyrics_text: Optional[str] = track.get("item", {}).get("recordingOf", {}).get("lyrics", {}).get("text")
        if lyrics_text is not None:
            lyrics_list.append(Lyrics(text=FormattedText(plain=lyrics_text)))

        return Song(
            title=clean_song_title(track["item"]["name"]),
            source_list=[Source(self.SOURCE_TYPE, track["item"]["mainEntityOfPage"])],
            tracksort=int(track["position"]),
            artwork=artwork,
            # the lyrics were parsed above but never handed to the song, so they got lost
            lyrics_list=lyrics_list,
        )
    def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album:
-        album = Album()
+        album: Album = Album()
        # https://genius.com/api/artists/24527/albums?page=1
-        r = self.connection.get(source.url, name=f"album_{urlparse(source.url).netloc.split('.')[0]}_{urlparse(source.url).path.replace('/', '').replace('album', '')}")
+        r = self.connection.get(source.url, name=source.url)
        if r is None:
            return album
        soup = self.get_soup_from_response(r)
-        data_container = soup.find("script", {"type": "application/ld+json"})
+        # find the content attribute in the meta tag which is contained in the head
        data_container = soup.find("meta", {"itemprop": "page_data"})
        if data_container is not None:
            content = data_container["content"]
            dump_to_file("genius_itemprop_album.json", content, is_json=True, exit_after_dump=False)
            data = json.loads(content)
-        if DEBUG:
+            album = self.parse_api_object(data.get("album"))
            dump_to_file("album_data.json", data_container.text, is_json=True, exit_after_dump=False)
-        data = json.loads(data_container.text)
+            for e in data.get("album_appearances", []):
-        artist_data = data["byArtist"]
+                r = self.parse_api_object(e.get("song"))
-
+                if not isinstance(r, Song):
        artist_source_list = []
        if "@id" in artist_data:
            artist_source_list = [Source(self.SOURCE_TYPE, _parse_artist_url(artist_data["@id"]))]
        album = Album(
            title=data["name"].strip(),
            source_list=[Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]))],
            date=ID3Timestamp.strptime(data["datePublished"], "%d %b %Y %H:%M:%S %Z"),
            artist_list=[Artist(
                name=artist_data["name"].strip(),
                source_list=artist_source_list
            )]
        )
        artwork: Artwork = Artwork()
        def _get_artwork_url(_data: dict) -> Optional[str]:
            if "image" in _data:
                return _data["image"]
            for _property in _data.get("additionalProperty", []):
                if _property.get("name") == "art_id":
                    return f"https://f4.bcbits.com/img/a{_property.get('value')}_2.jpg"
        _artwork_url = _get_artwork_url(data)
        if _artwork_url is not None:
            artwork.append(url=_artwork_url, width=350, height=350)
        else:
            for album_release in data.get("albumRelease", []):
                _artwork_url = _get_artwork_url(album_release)
                if _artwork_url is not None:
                    artwork.append(url=_artwork_url, width=350, height=350)
                    break
        for i, track_json in enumerate(data.get("track", {}).get("itemListElement", [])):
            if DEBUG:
                dump_to_file(f"album_track_{i}.json", json.dumps(track_json), is_json=True, exit_after_dump=False)
            try:
                album.song_collection.append(self._parse_track_element(track_json, artwork=artwork))
            except KeyError:
                    continue
                album.song_collection.append(r)
        album.source_collection.append(source)
        return album
-    def _fetch_lyrics(self, soup: BeautifulSoup) -> List[Lyrics]:
+    def get_json_content_from_response(self, response, start: str, end: str) -> Optional[str]:
-        track_lyrics = soup.find("div", {"class": "lyricsText"})
+        content = response.text
-        if track_lyrics:
+        start_index = content.find(start)
-            return [Lyrics(text=FormattedText(html=track_lyrics.prettify()))]
+        if start_index < 0:
-
+            return None
-        return []
+        start_index += len(start)
        end_index = content.find(end, start_index)
        if end_index < 0:
            return None
        return content[start_index:end_index]
    def fetch_song(self, source: Source, stop_at_level: int = 1) -> Song:
-        r = self.connection.get(source.url, name=f"song_{urlparse(source.url).netloc.split('.')[0]}_{urlparse(source.url).path.replace('/', '').replace('track', '')}")
+        song: Song = Song()
        r = self.connection.get(source.url, name=source.url)
        if r is None:
            return Song()
        soup = self.get_soup_from_response(r)
        data_container = soup.find("script", {"type": "application/ld+json"})
        other_data = {}
        other_data_list = soup.select("script[data-tralbum]")
        if len(other_data_list) > 0:
            other_data = json.loads(other_data_list[0]["data-tralbum"])
        dump_to_file("bandcamp_song_data.json", data_container.text, is_json=True, exit_after_dump=False)
        dump_to_file("bandcamp_song_data_other.json", json.dumps(other_data), is_json=True, exit_after_dump=False)
        dump_to_file("bandcamp_song_page.html", r.text, exit_after_dump=False)
        data = json.loads(data_container.text)
        album_data = data["inAlbum"]
        artist_data = data["byArtist"]
        mp3_url = None
        for key, value in other_data.get("trackinfo", [{}])[0].get("file", {"": None}).items():
            mp3_url = value
        song = Song(
            title=clean_song_title(data["name"], artist_name=artist_data["name"]),
            source_list=[source, Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]), audio_url=mp3_url)],
            album_list=[Album(
                title=album_data["name"].strip(),
                date=ID3Timestamp.strptime(data["datePublished"], "%d %b %Y %H:%M:%S %Z"),
                source_list=[Source(self.SOURCE_TYPE, album_data["@id"])]
            )],
            artist_list=[Artist(
                name=artist_data["name"].strip(),
                source_list=[Source(self.SOURCE_TYPE, _parse_artist_url(artist_data["@id"]))]
            )],
            lyrics_list=self._fetch_lyrics(soup=soup)
        )
            return song
-    def download_song_to_target(self, source: Source, target: Target, desc: str = None) -> DownloadResult:
+        # get the contents that are between `JSON.parse('` and `');`
-        if source.audio_url is None:
+        content = self.get_json_content_from_response(r, start="window.__PRELOADED_STATE__ = JSON.parse('", end="');\n      window.__APP_CONFIG__ = ")
-            return DownloadResult(error_message="Couldn't find download link.")
+        if content is not None:
-        return self.connection.stream_into(url=source.audio_url, target=target, description=desc)
+            content = content.replace("\\\\", "\\").replace('\\"', '"').replace("\\'", "'")
            data = json.loads(content)
            lyrics_html = traverse_json_path(data, "songPage.lyricsData.body.html", default=None)
            if lyrics_html is not None:
                song.lyrics_collection.append(Lyrics(FormattedText(html=lyrics_html)))
            dump_to_file("genius_song_script_json.json", content, is_json=True, exit_after_dump=False)
        soup = self.get_soup_from_response(r)
        for lyrics in soup.find_all("div", {"data-lyrics-container": "true"}):
            lyrics_object = Lyrics(FormattedText(html=lyrics.prettify()))
            song.lyrics_collection.append(lyrics_object)
        song.source_collection.append(source)
        return song
@@ -0,0 +1,145 @@
 from collections import defaultdict
 from dataclasses import dataclass
 from enum import Enum
 from typing import List, Optional, Type, Union, Generator, Dict, Any
 from urllib.parse import urlparse
 import pycountry
 import musicbrainzngs
 from bs4 import BeautifulSoup
 from ..connection import Connection
 from .abstract import Page
 from ..utils.enums import SourceType, ALL_SOURCE_TYPES
 from ..utils.enums.album import AlbumType, AlbumStatus
 from ..objects import (
    Artist,
    Source,
    Song,
    Album,
    ID3Timestamp,
    FormattedText,
    Label,
    Target,
    DatabaseObject,
    Lyrics,
    Artwork
 )
 from ..utils.config import logging_settings, main_settings
 from ..utils import string_processing, shared
 from ..utils.string_processing import clean_song_title
 from ..utils.support_classes.query import Query
 from ..utils.support_classes.download_result import DownloadResult
 class Musicbrainz(Page):
    """
    Page backed by the MusicBrainz web service, accessed through ``musicbrainzngs``.

    Searching returns artists, release groups (as albums) and recordings (as songs).
    Every result carries exactly one source of the form ``HOST/<entity>/<mbid>``;
    the ``fetch_*`` methods read the MBID back from that url to look the entity up.
    """
    SOURCE_TYPE = ALL_SOURCE_TYPES.MUSICBRAINZ
    HOST = "https://musicbrainz.org"

    def __init__(self, *args, **kwargs):
        # musicbrainzngs needs a user agent before the first request is sent
        musicbrainzngs.set_useragent("mk", "1")
        super().__init__(*args, **kwargs)

    def _entity_source(self, entity: str, mbid: str) -> Source:
        """Build the source pointing at ``HOST/<entity>/<mbid>``."""
        return Source(self.SOURCE_TYPE, f"{self.HOST}/{entity}/{mbid}")

    @staticmethod
    def _mbid_from_source(source: Source) -> Optional[str]:
        """Return the last path segment of ``source.url``, or None if the url is missing or has no path."""
        if source.url is None:
            return None
        mbid = urlparse(source.url).path.rstrip("/").rsplit("/", 1)[-1]
        return mbid or None

    def general_search(self, search_query: str) -> List[DatabaseObject]:
        """Search artists, albums and songs (in that order) and return all results in one list."""
        return [
            *self.artist_search(search_query),
            *self.album_search(search_query),
            *self.song_search(search_query),
        ]

    def artist_search(self, search_query: str) -> List[Artist]:
        """Return one Artist per entry of the MusicBrainz artist search."""
        artist_dict_list: list = musicbrainzngs.search_artists(search_query)['artist-list']
        # every artist gets its own source list; sharing one list between all
        # results made each artist accumulate the sources of the other results
        return [
            Artist(
                name=artist_dict['name'],
                source_list=[self._entity_source("artist", artist_dict['id'])],
            )
            for artist_dict in artist_dict_list
        ]

    def song_search(self, search_query: str) -> List[Song]:
        """Return one Song per entry of the MusicBrainz recording search."""
        song_dict_list: list = musicbrainzngs.search_recordings(search_query)['recording-list']
        return [
            Song(
                title=song_dict['title'],
                source_list=[self._entity_source("recording", song_dict['id'])],
            )
            for song_dict in song_dict_list
        ]

    def album_search(self, search_query: str) -> List[Album]:
        """Return one Album per entry of the MusicBrainz release group search."""
        album_dict_list: list = musicbrainzngs.search_release_groups(search_query)['release-group-list']
        return [
            Album(
                title=album_dict['title'],
                source_list=[self._entity_source("release-group", album_dict['id'])],
            )
            for album_dict in album_dict_list
        ]

    def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album:
        """
        Look up the release group referenced by ``source`` and return it as an Album.

        :param source: source whose url ends with the release group MBID.
        :param stop_at_level: accepted for interface compatibility; not used here.
        :return: the album with its title and ``source``, or an empty Album if no MBID can be read from the url.
        """
        mbid = self._mbid_from_source(source)
        if mbid is None:
            return Album()

        # there is no search query in a fetch, the entity has to be looked up by its id
        album_dict: dict = musicbrainzngs.get_release_group_by_id(mbid)['release-group']
        return Album(
            title=album_dict['title'],
            source_list=[source],
        )

    def fetch_artist(self, source: Source, stop_at_level: int = 1) -> Artist:
        """
        Look up the artist referenced by ``source`` and return it as an Artist.

        :param source: source whose url ends with the artist MBID.
        :param stop_at_level: accepted for interface compatibility; not used here.
        :return: the artist with its name and ``source``, or an empty Artist if no MBID can be read from the url.
        """
        mbid = self._mbid_from_source(source)
        if mbid is None:
            return Artist()

        artist_dict: dict = musicbrainzngs.get_artist_by_id(mbid)['artist']
        return Artist(
            name=artist_dict['name'],
            source_list=[source],
        )

    def fetch_song(self, source: Source, stop_at_level: int = 1) -> Song:
        """
        Look up the recording referenced by ``source`` and return it as a Song.

        :param source: source whose url ends with the recording MBID.
        :param stop_at_level: accepted for interface compatibility; not used here.
        :return: the song with its title and ``source``, or an empty Song if no MBID can be read from the url.
        """
        mbid = self._mbid_from_source(source)
        if mbid is None:
            return Song()

        song_dict: dict = musicbrainzngs.get_recording_by_id(mbid)['recording']
        return Song(
            title=song_dict['title'],
            source_list=[source],
        )
@@ -549,6 +549,11 @@ class YoutubeMusic(SuperYouTube):
        return album
    def fetch_lyrics(self, video_id: str, playlist_id: str = None) -> str:
        """
        1. fetches the tabs of a song, to get the browse id
        2. finds the browse id of the lyrics
        3. fetches the lyrics with the browse id
        """
        request_data = {
            "context": {**self.credentials.context, "adSignalsInfo": {"params": []}},
            "videoId": video_id,
@@ -575,6 +580,7 @@ class YoutubeMusic(SuperYouTube):
            pageType = traverse_json_path(tab, "tabRenderer.endpoint.browseEndpoint.browseEndpointContextSupportedConfigs.browseEndpointContextMusicConfig.pageType", default="")
            if pageType in ("MUSIC_TAB_TYPE_LYRICS", "MUSIC_PAGE_TYPE_TRACK_LYRICS") or "lyrics" in pageType.lower():
                browse_id = traverse_json_path(tab, "tabRenderer.endpoint.browseEndpoint.browseId", default=None)
                if browse_id is not None:
                    break
        if browse_id is None:
@@ -721,7 +727,6 @@ class YoutubeMusic(SuperYouTube):
        self.download_values_by_url[source.url] = {
            "url": _best_format.get("url"),
            "chunk_size": _best_format.get("downloader_options", {}).get("http_chunk_size", main_settings["chunk_size"]),
            "headers": _best_format.get("http_headers", {}),
        }
@@ -59,6 +59,11 @@ Reference for the logging formats: https://docs.python.org/3/library/logging.htm
        description="The logger for the musify scraper.",
        default_value="musify"
    ),
    LoggerAttribute(
        name="musicbrainz_logger",
        description="The logger for the musicbrainz scraper.",
        default_value="musicbrainz"
    ),
    LoggerAttribute(
        name="youtube_logger",
        description="The logger for the youtube scraper.",
Author	SHA1	Message	Date
Luna	265c9f462f	feat: musicbrainz overall search ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-09-16 13:12:50 +02:00
Luna	780daac0ef	feat: Musicbrainz orientation and class creation ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-07-03 10:48:44 +02:00
Luna	465af49057	hotfix ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-06-03 10:19:32 +02:00
Hazel	2aa0f02fa5	Merge branch 'adding_genius' into experimental ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-05-23 13:36:10 +02:00
Hazel	7b0b830d64	feat: removed legacy key ci/woodpecker/push/woodpecker Pipeline failed Details	2024-05-23 13:24:25 +02:00
Hazel	1ba6c97f5a	feat: more extensive browse id	2024-05-23 13:20:34 +02:00
Hazel	c8cbfc7cb9	feat: improved output of clearing the cache	2024-05-23 13:17:14 +02:00
Hazel	344da0a0bf	fix: converting pictures to rgb before saving ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-05-22 15:20:26 +02:00
Hazel	49dc7093c8	fix: genius fallback ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-05-22 15:18:43 +02:00
Hazel	90f70638b4	feat: better lyrics support ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-05-21 17:55:08 +02:00
Hazel	7b4eee858a	feat: parsed script json ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-05-21 17:14:58 +02:00
Hazel	f61b34dd40	feat: improved feature artists by also adding writer and producer to it ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-05-21 16:52:01 +02:00
Hazel	688b4fd357	feat: getting the album tracklist ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-05-21 16:47:38 +02:00
Hazel	769d27dc5c	feat: album details ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-05-21 16:43:52 +02:00