fix: converting pictures to rgb before saving

fix: genius fallback
feat: better lyrics support
2024-05-22 15:20:26 +02:00 · 2024-05-22 15:18:43 +02:00 · 2024-05-21 17:55:08 +02:00 · 2024-05-21 17:14:58 +02:00 · 2024-05-21 16:52:01 +02:00 · 2024-05-21 16:47:38 +02:00
7 changed files with 129 additions and 196 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -27,6 +27,7 @@
        "Gitea",
        "iframe",
        "isrc",
+        "itemprop",
        "levenshtein",
        "metallum",
        "MUSICBRAINZ",
--- a/development/actual_donwload.py
+++ b/development/actual_donwload.py
@@ -6,9 +6,10 @@ logging.getLogger().setLevel(logging.DEBUG)

 if __name__ == "__main__":
    commands = [
-        "s: #a I'm in a coffin",
-        "0",
-        "d: 0",
+        "s: #a Crystal F",
+        "10",
+        "1",
+        "3",
    ]

    
--- a/music_kraken/audio/metadata.py
+++ b/music_kraken/audio/metadata.py
@@ -93,6 +93,10 @@ def write_metadata_to_target(metadata: Metadata, target: Target, song: Song):
            # resize the image to the preferred resolution
            img.thumbnail((main_settings["preferred_artwork_resolution"], main_settings["preferred_artwork_resolution"]))

+            # https://stackoverflow.com/a/59476938/16804841
+            if img.mode != 'RGB':
+                img = img.convert('RGB')
+
            img.save(converted_target.file_path, "JPEG")

        # https://stackoverflow.com/questions/70228440/mutagen-how-can-i-correctly-embed-album-art-into-mp3-file-so-that-i-can-see-t
--- a/music_kraken/connection/cache.py
+++ b/music_kraken/connection/cache.py
@@ -136,13 +136,13 @@ class Cache:
        )
        self._write_attribute(cache_attribute)

-        cache_path = fit_to_file_system(Path(module_path, name), hidden_ok=True)
+        cache_path = fit_to_file_system(Path(module_path, name.replace("/", "_")), hidden_ok=True)
        with cache_path.open("wb") as content_file:
            self.logger.debug(f"writing cache to {cache_path}")
            content_file.write(content)

    def get(self, name: str) -> Optional[CacheResult]:
-        path = fit_to_file_system(Path(self._dir, self.module, name), hidden_ok=True)
+        path = fit_to_file_system(Path(self._dir, self.module, name.replace("/", "_")), hidden_ok=True)

        if not path.is_file():
            return None
@@ -165,7 +165,7 @@ class Cache:
            if ca.name == "":
                continue

-            file = fit_to_file_system(Path(self._dir, ca.module, ca.name), hidden_ok=True)
+            file = fit_to_file_system(Path(self._dir, ca.module, ca.name.replace("/", "_")), hidden_ok=True)

            if not ca.is_valid:
                self.logger.debug(f"deleting cache {ca.id}")
--- a/music_kraken/objects/artwork.py
+++ b/music_kraken/objects/artwork.py
@@ -59,4 +59,6 @@ class Artwork:
                self._variant_mapping[key] = value

    def __eq__(self, other: Artwork) -> bool:
+        if not isinstance(other, Artwork):
+            return False
        return any(a == b for a, b in zip(self._variant_mapping.keys(), other._variant_mapping.keys()))
--- a/music_kraken/objects/formatted_text.py
+++ b/music_kraken/objects/formatted_text.py
@@ -38,11 +38,19 @@ class FormattedText:
    def markdown(self) -> str:
        return md(self.html).strip()
    
+    @markdown.setter
+    def markdown(self, value: str) -> None:
+        self.html = mistune.markdown(value)
+
    @property
    def plain(self) -> str:
        md = self.markdown
        return md.replace("\n\n", "\n")
    
+    @plain.setter
+    def plain(self, value: str) -> None:
+        self.html = mistune.markdown(plain_to_markdown(value))
+
    def __str__(self) -> str:
        return self.markdown

--- a/music_kraken/pages/genius.py
+++ b/music_kraken/pages/genius.py
@@ -49,7 +49,12 @@ class Genius(Page):
    def get_source_type(self, source: Source) -> Optional[Type[DatabaseObject]]:
        path = source.parsed_url.path.replace("/", "")
        
-        return super().get_source_type(source)
+        if path.startswith("artists"):
+            return Artist
+        if path.startswith("albums"):
+            return Album
+
+        return Song

    def add_to_artwork(self, artwork: Artwork, url: str):
        if url is None:
@@ -74,34 +79,58 @@ class Genius(Page):
            artwork.append(url=url)

    def parse_api_object(self, data: dict) -> Optional[DatabaseObject]:
+        if data is None:
+            return None
        object_type = data.get("_type")

        artwork = Artwork()
        self.add_to_artwork(artwork, data.get("header_image_url"))
        self.add_to_artwork(artwork, data.get("image_url"))
        
+        additional_sources: List[Source] = []
        source: Source = Source(self.SOURCE_TYPE, data.get("url"), additional_data={
            "id": data.get("id"),
            "slug": data.get("slug"),
            "api_path": data.get("api_path"),
        })

+        notes = FormattedText()
+        description = data.get("description") or {}
+        if "html" in description:
+            notes.html = description["html"]
+        elif "markdown" in description:
+            notes.markdown = description["markdown"]
+        elif "description_preview" in data:
+            notes.plaintext = data["description_preview"]
+
        if source.url is None:
            return None

        if object_type == "artist":
+            if data.get("instagram_name") is not None:
+                additional_sources.append(Source(ALL_SOURCE_TYPES.INSTAGRAM, f"https://www.instagram.com/{data['instagram_name']}/"))
+            if data.get("facebook_name") is not None:
+                additional_sources.append(Source(ALL_SOURCE_TYPES.FACEBOOK, f"https://www.facebook.com/{data['facebook_name']}/"))
+            if data.get("twitter_name") is not None:
+                additional_sources.append(Source(ALL_SOURCE_TYPES.TWITTER, f"https://x.com/{data['twitter_name']}/"))
+
            return Artist(
-                name=data.get("name"),
+                name=data["name"].strip() if data.get("name") is not None else None,
                source_list=[source],
                artwork=artwork,
+                notes=notes,
            )

        if object_type == "album":
            self.add_to_artwork(artwork, data.get("cover_art_thumbnail_url"))
            self.add_to_artwork(artwork, data.get("cover_art_url"))

+            for cover_art in data.get("cover_arts", []):
+                self.add_to_artwork(artwork, cover_art.get("image_url"))
+                self.add_to_artwork(artwork, cover_art.get("thumbnail_image_url"))
+
            return Album(
-                title=data.get("name"),
+                title=data.get("name").strip(),
                source_list=[source],
                artist_list=[self.parse_api_object(data.get("artist"))],
                artwork=artwork,
@@ -120,7 +149,7 @@ class Genius(Page):
            if primary_artist is not None:
                _artist_name = primary_artist.name
                main_artist_list.append(primary_artist)
-            for feature_artist in data.get("featured_artists", []):
+            for feature_artist in (*(data.get("featured_artists") or []), *(data.get("producer_artists") or []), *(data.get("writer_artists") or [])):
                artist = self.parse_api_object(feature_artist)
                if artist is not None:
                    featured_artist_list.append(artist)
@@ -158,223 +187,111 @@ class Genius(Page):

        return results

-    def fetch_label(self, source: Source, stop_at_level: int = 1) -> Label:
-        return Label()
-
-    def _parse_artist_details(self, soup: BeautifulSoup) -> Artist:
-        name: str = None
-        source_list: List[Source] = []
-        contact_list: List[Contact] = []
-
-        band_name_location: BeautifulSoup = soup.find("p", {"id": "band-name-location"})
-        if band_name_location is not None:
-            title_span = band_name_location.find("span", {"class": "title"})
-            if title_span is not None:
-                name = title_span.text.strip()
-
-        link_container: BeautifulSoup = soup.find("ol", {"id": "band-links"})
-        if link_container is not None:
-            li: BeautifulSoup
-            for li in link_container.find_all("a"):
-                if li is None and li['href'] is not None:
-                    continue
-
-                source_list.append(Source.match_url(_parse_artist_url(li['href']), referrer_page=self.SOURCE_TYPE))
-
-        return Artist(
-            name=name,
-            source_list=source_list
-        )
-
-    def _parse_album(self, soup: BeautifulSoup, initial_source: Source) -> List[Album]:
-        title = None
-        source_list: List[Source] = []
-
-        a = soup.find("a")
-        if a is not None and a["href"] is not None:
-            source_list.append(Source(self.SOURCE_TYPE, _get_host(initial_source) + a["href"]))
-
-        title_p = soup.find("p", {"class": "title"})
-        if title_p is not None:
-            title = title_p.text.strip()
-
-        return Album(title=title, source_list=source_list)
-
-    def _parse_artist_data_blob(self, data_blob: dict, artist_url: str):
-        parsed_artist_url = urlparse(artist_url)
-        album_list: List[Album] = []
-
-        for album_json in data_blob.get("buyfulldisco", {}).get("tralbums", []):
-            album_list.append(Album(
-                title=album_json["title"].strip(),
-                source_list=[Source(
-                    self.SOURCE_TYPE,
-                    urlunparse((parsed_artist_url.scheme, parsed_artist_url.netloc, album_json["page_url"], "", "", ""))
-                )]
-            ))
-
-        return album_list
-
    def fetch_artist(self, source: Source, stop_at_level: int = 1) -> Artist:
-        artist = Artist()
+        artist: Artist = Artist()
+        # https://genius.com/api/artists/24527/albums?page=1

-        r = self.connection.get(_parse_artist_url(source.url), name=f"artist_{urlparse(source.url).scheme}_{urlparse(source.url).netloc}")
+        r = self.connection.get(source.url, name=source.url)
        if r is None:
            return artist
-
        soup = self.get_soup_from_response(r)

-        if DEBUG:
-            dump_to_file("artist_page.html", r.text, exit_after_dump=False)
+        # find the content attribute in the meta tag which is contained in the head
+        data_container = soup.find("meta", {"itemprop": "page_data"})
+        if data_container is not None:
+            content = data_container["content"]
+            dump_to_file("genius_itemprop_artist.json", content, is_json=True, exit_after_dump=False)
+            data = json.loads(content)

-        artist = self._parse_artist_details(soup=soup.find("div", {"id": "bio-container"}))
+            artist = self.parse_api_object(data.get("artist"))
            
-        html_music_grid = soup.find("ol", {"id": "music-grid"})
-        if html_music_grid is not None:
-            for subsoup in html_music_grid.find_all("li"):
-                artist.album_collection.append(self._parse_album(soup=subsoup, initial_source=source))
+            for e in (data.get("artist_albums") or []):
+                r = self.parse_api_object(e)
+                if not isinstance(r, Album):
+                    continue

-        for i, data_blob_soup in enumerate(soup.find_all("div", {"id": ["pagedata", "collectors-data"]})):
-            data_blob = data_blob_soup["data-blob"]
+                artist.album_collection.append(r)
            
-            if DEBUG:
-                dump_to_file(f"bandcamp_artist_data_blob_{i}.json", data_blob, is_json=True, exit_after_dump=False)
+            for e in (data.get("artist_songs") or []):
+                r = self.parse_api_object(e)
+                if not isinstance(r, Song):
+                    continue

-            if data_blob is not None:
-                artist.album_collection.extend(
-                    self._parse_artist_data_blob(json.loads(data_blob), source.url)
-                )
+                """
+                TODO
+                fetch the album for these songs, because the api doesn't 
+                return them
+                """
+
+                artist.album_collection.extend(r.album_collection)

        artist.source_collection.append(source)
+
        return artist

-    def _parse_track_element(self, track: dict, artwork: Artwork) -> Optional[Song]:
-        lyrics_list: List[Lyrics] = []
-
-        _lyrics: Optional[str] = track.get("item", {}).get("recordingOf", {}).get("lyrics", {}).get("text")
-        if _lyrics is not None:
-            lyrics_list.append(Lyrics(text=FormattedText(plain=_lyrics)))
-
-        return Song(
-            title=clean_song_title(track["item"]["name"]),
-            source_list=[Source(self.SOURCE_TYPE, track["item"]["mainEntityOfPage"])],
-            tracksort=int(track["position"]),
-            artwork=artwork,
-        )
-
    def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album:
-        album = Album()
+        album: Album = Album()
+        # https://genius.com/api/artists/24527/albums?page=1

-        r = self.connection.get(source.url, name=f"album_{urlparse(source.url).netloc.split('.')[0]}_{urlparse(source.url).path.replace('/', '').replace('album', '')}")
+        r = self.connection.get(source.url, name=source.url)
        if r is None:
            return album
-
        soup = self.get_soup_from_response(r)

-        data_container = soup.find("script", {"type": "application/ld+json"})
+        # find the content attribute in the meta tag which is contained in the head
+        data_container = soup.find("meta", {"itemprop": "page_data"})
+        if data_container is not None:
+            content = data_container["content"]
+            dump_to_file("genius_itemprop_album.json", content, is_json=True, exit_after_dump=False)
+            data = json.loads(content)

-        if DEBUG:
-            dump_to_file("album_data.json", data_container.text, is_json=True, exit_after_dump=False)
+            album = self.parse_api_object(data.get("album"))

-        data = json.loads(data_container.text)
-        artist_data = data["byArtist"]
+            for e in data.get("album_appearances", []):
+                r = self.parse_api_object(e.get("song"))
+                if not isinstance(r, Song):
+                    continue

-        artist_source_list = []
-        if "@id" in artist_data:
-            artist_source_list = [Source(self.SOURCE_TYPE, _parse_artist_url(artist_data["@id"]))]
-        album = Album(
-            title=data["name"].strip(),
-            source_list=[Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]))],
-            date=ID3Timestamp.strptime(data["datePublished"], "%d %b %Y %H:%M:%S %Z"),
-            artist_list=[Artist(
-                name=artist_data["name"].strip(),
-                source_list=artist_source_list
-            )]
-        )
-
-        artwork: Artwork = Artwork()
-
-        def _get_artwork_url(_data: dict) -> Optional[str]:
-            if "image" in _data:
-                return _data["image"]
-            for _property in _data.get("additionalProperty", []):
-                if _property.get("name") == "art_id":
-                    return f"https://f4.bcbits.com/img/a{_property.get('value')}_2.jpg"
-
-        _artwork_url = _get_artwork_url(data)
-        if _artwork_url is not None:
-            artwork.append(url=_artwork_url, width=350, height=350)
-        else:
-            for album_release in data.get("albumRelease", []):
-                _artwork_url = _get_artwork_url(album_release)
-                if _artwork_url is not None:
-                    artwork.append(url=_artwork_url, width=350, height=350)
-                    break
-
-
-        for i, track_json in enumerate(data.get("track", {}).get("itemListElement", [])):
-            if DEBUG:
-                dump_to_file(f"album_track_{i}.json", json.dumps(track_json), is_json=True, exit_after_dump=False)
-
-            try:
-                album.song_collection.append(self._parse_track_element(track_json, artwork=artwork))
-            except KeyError:
-                continue
+                album.song_collection.append(r)

        album.source_collection.append(source)
+        
        return album

-    def _fetch_lyrics(self, soup: BeautifulSoup) -> List[Lyrics]:
-        track_lyrics = soup.find("div", {"class": "lyricsText"})
-        if track_lyrics:
-            return [Lyrics(text=FormattedText(html=track_lyrics.prettify()))]
-
-        return []
+    def get_json_content_from_response(self, response, start: str, end: str) -> Optional[str]:
+        content = response.text
+        start_index = content.find(start)
+        if start_index < 0:
+            return None
+        start_index += len(start)
+        end_index = content.find(end, start_index)
+        if end_index < 0:
+            return None
+        return content[start_index:end_index]

    def fetch_song(self, source: Source, stop_at_level: int = 1) -> Song:
-        r = self.connection.get(source.url, name=f"song_{urlparse(source.url).netloc.split('.')[0]}_{urlparse(source.url).path.replace('/', '').replace('track', '')}")
+        song: Song = Song()
+
+        r = self.connection.get(source.url, name=source.url)
        if r is None:
-            return Song()
+            return song
+
+        # get the contents that are between `JSON.parse('` and `');`
+        content = self.get_json_content_from_response(r, start="window.__PRELOADED_STATE__ = JSON.parse('", end="');\n      window.__APP_CONFIG__ = ")
+        if content is not None:
+            content = content.replace("\\\\", "\\").replace('\\"', '"').replace("\\'", "'")
+            data = json.loads(content)
+
+            lyrics_html = traverse_json_path(data, "songPage.lyricsData.body.html", default=None)
+            if lyrics_html is not None:
+                song.lyrics_collection.append(Lyrics(FormattedText(html=lyrics_html)))
+
+            dump_to_file("genius_song_script_json.json", content, is_json=True, exit_after_dump=False)

        soup = self.get_soup_from_response(r)
+        for lyrics in soup.find_all("div", {"data-lyrics-container": "true"}):
+            lyrics_object = Lyrics(FormattedText(html=lyrics.prettify()))
+            song.lyrics_collection.append(lyrics_object)

-        data_container = soup.find("script", {"type": "application/ld+json"})
-        other_data = {}
-
-        other_data_list = soup.select("script[data-tralbum]")
-        if len(other_data_list) > 0:
-            other_data = json.loads(other_data_list[0]["data-tralbum"])
-
-        dump_to_file("bandcamp_song_data.json", data_container.text, is_json=True, exit_after_dump=False)
-        dump_to_file("bandcamp_song_data_other.json", json.dumps(other_data), is_json=True, exit_after_dump=False)
-        dump_to_file("bandcamp_song_page.html", r.text, exit_after_dump=False)
-
-        data = json.loads(data_container.text)
-        album_data = data["inAlbum"]
-        artist_data = data["byArtist"]
-
-        mp3_url = None
-        for key, value in other_data.get("trackinfo", [{}])[0].get("file", {"": None}).items():
-            mp3_url = value
-
-        song = Song(
-            title=clean_song_title(data["name"], artist_name=artist_data["name"]),
-            source_list=[source, Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]), audio_url=mp3_url)],
-            album_list=[Album(
-                title=album_data["name"].strip(),
-                date=ID3Timestamp.strptime(data["datePublished"], "%d %b %Y %H:%M:%S %Z"),
-                source_list=[Source(self.SOURCE_TYPE, album_data["@id"])]
-            )],
-            artist_list=[Artist(
-                name=artist_data["name"].strip(),
-                source_list=[Source(self.SOURCE_TYPE, _parse_artist_url(artist_data["@id"]))]
-            )],
-            lyrics_list=self._fetch_lyrics(soup=soup)
-        )
-
+        song.source_collection.append(source)
        return song
-
-    def download_song_to_target(self, source: Source, target: Target, desc: str = None) -> DownloadResult:
-        if source.audio_url is None:
-            return DownloadResult(error_message="Couldn't find download link.")
-        return self.connection.stream_into(url=source.audio_url, target=target, description=desc)
Author	SHA1	Message	Date
Lars Noack	344da0a0bf	fix: converting pictures to rgb before saving [CI: all checks were successful; ci/woodpecker/push/woodpecker: pipeline was successful]	2024-05-22 15:20:26 +02:00
Lars Noack	49dc7093c8	fix: genius fallback [CI: all checks were successful; ci/woodpecker/push/woodpecker: pipeline was successful]	2024-05-22 15:18:43 +02:00
Lars Noack	90f70638b4	feat: better lyrics support [CI: all checks were successful; ci/woodpecker/push/woodpecker: pipeline was successful]	2024-05-21 17:55:08 +02:00
Lars Noack	7b4eee858a	feat: parsed script json [CI: all checks were successful; ci/woodpecker/push/woodpecker: pipeline was successful]	2024-05-21 17:14:58 +02:00
Lars Noack	f61b34dd40	feat: improved feature artists by also adding writer and producer to it [CI: all checks were successful; ci/woodpecker/push/woodpecker: pipeline was successful]	2024-05-21 16:52:01 +02:00
Lars Noack	688b4fd357	feat: getting the album tracklist [CI: all checks were successful; ci/woodpecker/push/woodpecker: pipeline was successful]	2024-05-21 16:47:38 +02:00
Lars Noack	769d27dc5c	feat: album details [CI: all checks were successful; ci/woodpecker/push/woodpecker: pipeline was successful]	2024-05-21 16:43:52 +02:00
Lars Noack	f5d953d9ce	feat: theoretically fetching feature songs [CI: all checks were successful; ci/woodpecker/push/woodpecker: pipeline was successful]	2024-05-21 16:34:04 +02:00
Lars Noack	46b64b8f8d	feat: fetched the flat artist details	2024-05-21 16:23:05 +02:00
Lars Noack	adfce16d2a	feat: fetched the flat artist details	2024-05-21 16:21:58 +02:00
Lars Noack	e4fd9faf12	feat: detecting url type	2024-05-21 15:57:09 +02:00