fix: converting pictures to rgb before saving

fix: genius fallback
feat: better lyrics support
2024-05-22 15:20:26 +02:00 · 2024-05-22 15:18:43 +02:00 · 2024-05-21 17:55:08 +02:00 · 2024-05-21 17:14:58 +02:00 · 2024-05-21 16:52:01 +02:00 · 2024-05-21 16:47:38 +02:00
4 changed files with 74 additions and 118 deletions
@@ -8,6 +8,8 @@ if __name__ == "__main__":
    commands = [
        "s: #a Crystal F",
        "10",
        "1",
        "3",
    ]
@@ -93,6 +93,10 @@ def write_metadata_to_target(metadata: Metadata, target: Target, song: Song):
            # resize the image to the preferred resolution
            img.thumbnail((main_settings["preferred_artwork_resolution"], main_settings["preferred_artwork_resolution"]))
            # https://stackoverflow.com/a/59476938/16804841
            if img.mode != 'RGB':
                img = img.convert('RGB')
            img.save(converted_target.file_path, "JPEG")
        # https://stackoverflow.com/questions/70228440/mutagen-how-can-i-correctly-embed-album-art-into-mp3-file-so-that-i-can-see-t
@@ -37,11 +37,19 @@ class FormattedText:
    @property
    def markdown(self) -> str:
        """Return ``self.html`` passed through ``md``, with surrounding whitespace stripped."""
        # NOTE(review): `md` is presumably an HTML-to-Markdown converter imported at module level — confirm.
        converted = md(self.html)
        return converted.strip()
    @markdown.setter
    def markdown(self, value: str) -> None:
        """Render ``value`` with ``mistune.markdown`` and store the result in ``self.html``."""
        rendered = mistune.markdown(value)
        self.html = rendered
    @property
    def plain(self) -> str:
        """Return the markdown form with each double newline replaced by a single newline."""
        markdown_text = self.markdown
        # Splitting on the double newline and re-joining with a single one is
        # the same left-to-right, non-overlapping substitution as str.replace.
        return "\n".join(markdown_text.split("\n\n"))
    @plain.setter
    def plain(self, value: str) -> None:
        """Convert ``value`` with ``plain_to_markdown``, render it via ``mistune.markdown`` and store it in ``self.html``."""
        as_markdown = plain_to_markdown(value)
        self.html = mistune.markdown(as_markdown)
    def __str__(self) -> str:
        """Return the markdown representation (the value of the ``markdown`` property)."""
        return self.markdown
@@ -79,6 +79,8 @@ class Genius(Page):
            artwork.append(url=url)
    def parse_api_object(self, data: dict) -> Optional[DatabaseObject]:
        if data is None:
            return None
        object_type = data.get("_type")
        artwork = Artwork()
@@ -93,13 +95,13 @@ class Genius(Page):
        })
        notes = FormattedText()
-        description = data.get("description", {})
+        description = data.get("description") or {}
        if "html" in description:
            notes.html = description["html"]
        elif "markdown" in description:
            notes.markdown = description["markdown"]
        elif "description_preview" in data:
-            notes.plain = data["description_preview"]
+            notes.plaintext = data["description_preview"]
        if source.url is None:
            return None
@@ -113,7 +115,7 @@ class Genius(Page):
                additional_sources.append(Source(ALL_SOURCE_TYPES.TWITTER, f"https://x.com/{data['twitter_name']}/"))
            return Artist(
-                name=data.get("name"),
+                name=data["name"].strip() if data.get("name") is not None else None,
                source_list=[source],
                artwork=artwork,
                notes=notes,
@@ -123,8 +125,12 @@ class Genius(Page):
            self.add_to_artwork(artwork, data.get("cover_art_thumbnail_url"))
            self.add_to_artwork(artwork, data.get("cover_art_url"))
            for cover_art in data.get("cover_arts", []):
                self.add_to_artwork(artwork, cover_art.get("image_url"))
                self.add_to_artwork(artwork, cover_art.get("thumbnail_image_url"))
            return Album(
-                title=data.get("name"),
+                title=data.get("name").strip(),
                source_list=[source],
                artist_list=[self.parse_api_object(data.get("artist"))],
                artwork=artwork,
@@ -143,7 +149,7 @@ class Genius(Page):
            if primary_artist is not None:
                _artist_name = primary_artist.name
                main_artist_list.append(primary_artist)
-            for feature_artist in data.get("featured_artists", []):
+            for feature_artist in (*(data.get("featured_artists") or []), *(data.get("producer_artists") or []), *(data.get("writer_artists") or [])):
                artist = self.parse_api_object(feature_artist)
                if artist is not None:
                    featured_artist_list.append(artist)
@@ -197,16 +203,16 @@ class Genius(Page):
            dump_to_file("genius_itemprop_artist.json", content, is_json=True, exit_after_dump=False)
            data = json.loads(content)
-            artist = self.parse_api_object(data.get("artist", {}))
+            artist = self.parse_api_object(data.get("artist"))
-            for e in data.get("artist_albums", []):
+            for e in (data.get("artist_albums") or []):
                r = self.parse_api_object(e)
                if not isinstance(r, Album):
                    continue
                artist.album_collection.append(r)
-            for e in data.get("artist_songs", []):
+            for e in (data.get("artist_songs") or []):
                r = self.parse_api_object(e)
                if not isinstance(r, Song):
                    continue
@@ -223,133 +229,69 @@ class Genius(Page):
        return artist
    def _parse_track_element(self, track: dict, artwork: Artwork) -> Optional[Song]:
        """
        Build a Song from one track entry of a track listing.

        :param track: mapping with an "item" mapping (reads "name", "mainEntityOfPage" and,
            optionally, "recordingOf" -> "lyrics" -> "text") and a "position" value.
        :param artwork: Artwork handed unchanged to the created Song.
        :return: the Song for this entry, carrying the entry's lyrics when present.
        :raises KeyError: if "item", its "name" / "mainEntityOfPage", or "position" is missing.
        """
        item: dict = track["item"]

        # Optional chain item -> recordingOf -> lyrics -> text. `or {}` (instead of a
        # `.get(key, {})` default) also tolerates keys that are present but null.
        lyrics_text: Optional[str] = ((item.get("recordingOf") or {}).get("lyrics") or {}).get("text")

        lyrics_list: List[Lyrics] = []
        if lyrics_text is not None:
            lyrics_list.append(Lyrics(text=FormattedText(plain=lyrics_text)))

        return Song(
            title=clean_song_title(item["name"]),
            source_list=[Source(self.SOURCE_TYPE, item["mainEntityOfPage"])],
            tracksort=int(track["position"]),
            artwork=artwork,
            # Previously the parsed lyrics were collected and then silently dropped.
            lyrics_list=lyrics_list,
        )
    def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album:
-        album = Album()
+        album: Album = Album()
        # https://genius.com/api/artists/24527/albums?page=1
-        r = self.connection.get(source.url, name=f"album_{urlparse(source.url).netloc.split('.')[0]}_{urlparse(source.url).path.replace('/', '').replace('album', '')}")
+        r = self.connection.get(source.url, name=source.url)
        if r is None:
            return album
        soup = self.get_soup_from_response(r)
-        data_container = soup.find("script", {"type": "application/ld+json"})
+        # find the content attribute in the meta tag which is contained in the head
        data_container = soup.find("meta", {"itemprop": "page_data"})
        if data_container is not None:
            content = data_container["content"]
            dump_to_file("genius_itemprop_album.json", content, is_json=True, exit_after_dump=False)
            data = json.loads(content)
-        if DEBUG:
+            album = self.parse_api_object(data.get("album"))
            dump_to_file("album_data.json", data_container.text, is_json=True, exit_after_dump=False)
-        data = json.loads(data_container.text)
+            for e in data.get("album_appearances", []):
-        artist_data = data["byArtist"]
+                r = self.parse_api_object(e.get("song"))
                if not isinstance(r, Song):
                    continue
-        artist_source_list = []
+                album.song_collection.append(r)
        if "@id" in artist_data:
            artist_source_list = [Source(self.SOURCE_TYPE, _parse_artist_url(artist_data["@id"]))]
        album = Album(
            title=data["name"].strip(),
            source_list=[Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]))],
            date=ID3Timestamp.strptime(data["datePublished"], "%d %b %Y %H:%M:%S %Z"),
            artist_list=[Artist(
                name=artist_data["name"].strip(),
                source_list=artist_source_list
            )]
        )
        artwork: Artwork = Artwork()
        def _get_artwork_url(_data: dict) -> Optional[str]:
            if "image" in _data:
                return _data["image"]
            for _property in _data.get("additionalProperty", []):
                if _property.get("name") == "art_id":
                    return f"https://f4.bcbits.com/img/a{_property.get('value')}_2.jpg"
        _artwork_url = _get_artwork_url(data)
        if _artwork_url is not None:
            artwork.append(url=_artwork_url, width=350, height=350)
        else:
            for album_release in data.get("albumRelease", []):
                _artwork_url = _get_artwork_url(album_release)
                if _artwork_url is not None:
                    artwork.append(url=_artwork_url, width=350, height=350)
                    break
        for i, track_json in enumerate(data.get("track", {}).get("itemListElement", [])):
            if DEBUG:
                dump_to_file(f"album_track_{i}.json", json.dumps(track_json), is_json=True, exit_after_dump=False)
            try:
                album.song_collection.append(self._parse_track_element(track_json, artwork=artwork))
            except KeyError:
                continue
        album.source_collection.append(source)
        return album
-    def _fetch_lyrics(self, soup: BeautifulSoup) -> List[Lyrics]:
+    def get_json_content_from_response(self, response, start: str, end: str) -> Optional[str]:
-        track_lyrics = soup.find("div", {"class": "lyricsText"})
+        content = response.text
-        if track_lyrics:
+        start_index = content.find(start)
-            return [Lyrics(text=FormattedText(html=track_lyrics.prettify()))]
+        if start_index < 0:
-
+            return None
-        return []
+        start_index += len(start)
        end_index = content.find(end, start_index)
        if end_index < 0:
            return None
        return content[start_index:end_index]
    def fetch_song(self, source: Source, stop_at_level: int = 1) -> Song:
-        r = self.connection.get(source.url, name=f"song_{urlparse(source.url).netloc.split('.')[0]}_{urlparse(source.url).path.replace('/', '').replace('track', '')}")
+        song: Song = Song()
        r = self.connection.get(source.url, name=source.url)
        if r is None:
-            return Song()
+            return song
        # get the contents that are between `JSON.parse('` and `');`
        content = self.get_json_content_from_response(r, start="window.__PRELOADED_STATE__ = JSON.parse('", end="');\n      window.__APP_CONFIG__ = ")
        if content is not None:
            content = content.replace("\\\\", "\\").replace('\\"', '"').replace("\\'", "'")
            data = json.loads(content)
            lyrics_html = traverse_json_path(data, "songPage.lyricsData.body.html", default=None)
            if lyrics_html is not None:
                song.lyrics_collection.append(Lyrics(FormattedText(html=lyrics_html)))
            dump_to_file("genius_song_script_json.json", content, is_json=True, exit_after_dump=False)
        soup = self.get_soup_from_response(r)
        for lyrics in soup.find_all("div", {"data-lyrics-container": "true"}):
            lyrics_object = Lyrics(FormattedText(html=lyrics.prettify()))
            song.lyrics_collection.append(lyrics_object)
-        data_container = soup.find("script", {"type": "application/ld+json"})
+        song.source_collection.append(source)
        other_data = {}
        other_data_list = soup.select("script[data-tralbum]")
        if len(other_data_list) > 0:
            other_data = json.loads(other_data_list[0]["data-tralbum"])
        dump_to_file("bandcamp_song_data.json", data_container.text, is_json=True, exit_after_dump=False)
        dump_to_file("bandcamp_song_data_other.json", json.dumps(other_data), is_json=True, exit_after_dump=False)
        dump_to_file("bandcamp_song_page.html", r.text, exit_after_dump=False)
        data = json.loads(data_container.text)
        album_data = data["inAlbum"]
        artist_data = data["byArtist"]
        mp3_url = None
        for key, value in other_data.get("trackinfo", [{}])[0].get("file", {"": None}).items():
            mp3_url = value
        song = Song(
            title=clean_song_title(data["name"], artist_name=artist_data["name"]),
            source_list=[source, Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]), audio_url=mp3_url)],
            album_list=[Album(
                title=album_data["name"].strip(),
                date=ID3Timestamp.strptime(data["datePublished"], "%d %b %Y %H:%M:%S %Z"),
                source_list=[Source(self.SOURCE_TYPE, album_data["@id"])]
            )],
            artist_list=[Artist(
                name=artist_data["name"].strip(),
                source_list=[Source(self.SOURCE_TYPE, _parse_artist_url(artist_data["@id"]))]
            )],
            lyrics_list=self._fetch_lyrics(soup=soup)
        )
        return song
    def download_song_to_target(self, source: Source, target: Target, desc: str = None) -> DownloadResult:
        """
        Stream the audio referenced by ``source`` into ``target``.

        :param source: source whose ``audio_url`` is streamed.
        :param target: destination handed to ``self.connection.stream_into``.
        :param desc: forwarded to ``stream_into`` as ``description``.
        :return: the result of ``stream_into``, or a DownloadResult carrying an
            error message when ``source.audio_url`` is None.
        """
        audio_url = source.audio_url
        if audio_url is not None:
            return self.connection.stream_into(url=audio_url, target=target, description=desc)

        return DownloadResult(error_message="Couldn't find download link.")
Author	SHA1	Message	Date
Hazel	344da0a0bf	fix: converting pictures to rgb before saving ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-05-22 15:20:26 +02:00
Hazel	49dc7093c8	fix: genius fallback ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-05-22 15:18:43 +02:00
Hazel	90f70638b4	feat: better lyrics support ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-05-21 17:55:08 +02:00
Hazel	7b4eee858a	feat: parsed script json ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-05-21 17:14:58 +02:00
Hazel	f61b34dd40	feat: improved feature artists by also adding writer and producer to it ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-05-21 16:52:01 +02:00
Hazel	688b4fd357	feat: getting the album tracklist ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-05-21 16:47:38 +02:00
Hazel	769d27dc5c	feat: album details ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-05-21 16:43:52 +02:00