music-kraken-core/music_kraken/pages/genius.py

from typing import List, Optional, Type
from urllib.parse import urlparse, urlunparse, urlencode
import json
from enum import Enum
from bs4 import BeautifulSoup
import pycountry

from ..objects import Source, DatabaseObject
from .abstract import Page
from ..objects import (
    Artist,
    Source,
    SourceType,
    Song,
    Album,
    Label,
    Target,
    Contact,
    ID3Timestamp,
    Lyrics,
    FormattedText,
    Artwork,
)
from ..connection import Connection
from ..utils import dump_to_file, traverse_json_path
from ..utils.enums import SourceType, ALL_SOURCE_TYPES
from ..utils.support_classes.download_result import DownloadResult
from ..utils.string_processing import clean_song_title
from ..utils.config import main_settings, logging_settings
from ..utils.shared import DEBUG

if DEBUG:
    from ..utils import dump_to_file


class Genius(Page):
    SOURCE_TYPE = ALL_SOURCE_TYPES.GENIUS
    HOST = "genius.com"

    def __init__(self, *args, **kwargs):
        """
        Set up the connection used for every request to genius and then hand
        all arguments over to the constructor of the parent class.
        """
        genius_connection = Connection(
            module="genius",
            logger=self.LOGGER,
            host="https://genius.com/",
        )
        self.connection: Connection = genius_connection

        super().__init__(*args, **kwargs)

    def get_source_type(self, source: Source) -> Optional[Type[DatabaseObject]]:
        """
        Guess which kind of object a genius url points to.

        All slashes are removed from the url path and the remainder is matched
        by its prefix: "artists" yields Artist, "albums" yields Album and
        everything else is treated as a Song.
        """
        flattened_path = "".join(source.parsed_url.path.split("/"))

        for prefix, object_type in (("artists", Artist), ("albums", Album)):
            if flattened_path.startswith(prefix):
                return object_type

        return Song

    def add_to_artwork(self, artwork: Artwork, url: str):
        if url is None:
            return
        
        url_frags = url.split(".")
        if len(url_frags) < 2:
            artwork.append(url=url)
            return

        dimensions = url_frags[-2].split("x")
        if len(dimensions) < 2:
            artwork.append(url=url)
            return

        if len(dimensions) == 3:
            dimensions = dimensions[:-1]
        
        try:
            artwork.append(url=url, width=int(dimensions[0]), height=int(dimensions[1]))
        except ValueError:
            artwork.append(url=url)

    def parse_api_object(self, data: dict) -> Optional[DatabaseObject]:
        """
        Convert one object of the genius api into the matching data object.

        The kind of object is read from the "_type" field of data. The types
        "artist", "album" and "song" are supported.

        :param data: the decoded json object from genius, may be None
        :return: an Artist, Album or Song; None if data is None, if the
            resulting source has no url, or if the type is not supported
        """
        if data is None:
            return None
        object_type = data.get("_type")

        artwork = Artwork()
        self.add_to_artwork(artwork, data.get("header_image_url"))
        self.add_to_artwork(artwork, data.get("image_url"))

        source: Source = Source(self.SOURCE_TYPE, data.get("url"), additional_data={
            "id": data.get("id"),
            "slug": data.get("slug"),
            "api_path": data.get("api_path"),
        })

        if source.url is None:
            return None

        if object_type == "artist":
            # the notes are only attached to artists, so they are only built here
            notes = FormattedText()
            description = data.get("description") or {}
            if "html" in description:
                notes.html = description["html"]
            elif "markdown" in description:
                notes.markdown = description["markdown"]
            elif "description_preview" in data:
                notes.plaintext = data["description_preview"]

            # the social media handles become sources of the artist themselves
            social_profiles = (
                ("instagram_name", ALL_SOURCE_TYPES.INSTAGRAM, "https://www.instagram.com/{}/"),
                ("facebook_name", ALL_SOURCE_TYPES.FACEBOOK, "https://www.facebook.com/{}/"),
                ("twitter_name", ALL_SOURCE_TYPES.TWITTER, "https://x.com/{}/"),
            )
            additional_sources: List[Source] = []
            for key, source_type, url_template in social_profiles:
                handle = data.get(key)
                if handle is not None:
                    additional_sources.append(Source(source_type, url_template.format(handle)))

            return Artist(
                name=data["name"].strip() if data.get("name") is not None else None,
                source_list=[source, *additional_sources],
                artwork=artwork,
                notes=notes,
            )

        if object_type == "album":
            self.add_to_artwork(artwork, data.get("cover_art_thumbnail_url"))
            self.add_to_artwork(artwork, data.get("cover_art_url"))

            # "cover_arts" may be missing as well as explicitly null
            for cover_art in data.get("cover_arts") or []:
                self.add_to_artwork(artwork, cover_art.get("image_url"))
                self.add_to_artwork(artwork, cover_art.get("thumbnail_image_url"))

            album_name = data.get("name")
            # the artist can't be parsed if it is missing or has no url
            album_artist = self.parse_api_object(data.get("artist"))

            return Album(
                title=album_name.strip() if album_name is not None else None,
                source_list=[source],
                artist_list=[album_artist] if album_artist is not None else [],
                artwork=artwork,
                date=ID3Timestamp(**(data.get("release_date_components") or {})),
            )

        if object_type == "song":
            self.add_to_artwork(artwork, data.get("song_art_image_thumbnail_url"))
            self.add_to_artwork(artwork, data.get("song_art_image_url"))

            main_artist_list = []
            featured_artist_list = []

            _artist_name = None
            primary_artist = self.parse_api_object(data.get("primary_artist"))
            if primary_artist is not None:
                _artist_name = primary_artist.name
                main_artist_list.append(primary_artist)

            # featured artists, producers and writers all end up as feature artists
            for feature_artist in (*(data.get("featured_artists") or []), *(data.get("producer_artists") or []), *(data.get("writer_artists") or [])):
                artist = self.parse_api_object(feature_artist)
                if artist is not None:
                    featured_artist_list.append(artist)

            return Song(
                title=clean_song_title(data.get("title"), artist_name=_artist_name),
                source_list=[source],
                artwork=artwork,
                feature_artist_list=featured_artist_list,
                artist_list=main_artist_list,
            )

        return None

    def general_search(self, search_query: str, **kwargs) -> List[DatabaseObject]:
        """
        Query the multi search endpoint of genius and parse every hit.

        :param search_query: the free text that is searched for
        :return: every hit that parse_api_object could convert, in the order
            of the response; an empty list if the request yielded no response
        """
        query_string = urlencode({"q": search_query})
        response = self.connection.get("https://genius.com/api/search/multi?" + query_string, name=f"search_{search_query}")
        if response is None:
            return []

        dump_to_file("search_genius.json", response.text, is_json=True, exit_after_dump=False)
        payload = response.json()

        found = []
        for section in traverse_json_path(payload, "response.sections", default=[]):
            for hit in section.get("hits", []):
                candidate = self.parse_api_object(hit.get("result"))
                if candidate is None:
                    continue
                found.append(candidate)

        return found

    def fetch_artist(self, source: Source, stop_at_level: int = 1) -> Artist:
        """
        Fetch an artist from its genius page.

        The data is read from the json in the content attribute of the meta
        tag with itemprop="page_data".

        :param source: the source pointing to the artist page
        :param stop_at_level: not used by this method
        :return: the fetched artist; an empty Artist without any source if the
            request yielded no response
        """
        artist: Artist = Artist()
        # https://genius.com/api/artists/24527/albums?page=1

        response = self.connection.get(source.url, name=source.url)
        if response is None:
            return artist
        soup = self.get_soup_from_response(response)

        # find the content attribute in the meta tag which is contained in the head
        data_container = soup.find("meta", {"itemprop": "page_data"})
        if data_container is not None:
            content = data_container["content"]
            dump_to_file("genius_itemprop_artist.json", content, is_json=True, exit_after_dump=False)
            data = json.loads(content)

            # parse_api_object returns None if the artist is missing or has no
            # url; the empty artist from above is kept in that case
            parsed_artist = self.parse_api_object(data.get("artist"))
            if isinstance(parsed_artist, Artist):
                artist = parsed_artist

            for raw_album in (data.get("artist_albums") or []):
                parsed_album = self.parse_api_object(raw_album)
                if not isinstance(parsed_album, Album):
                    continue

                artist.album_collection.append(parsed_album)

            for raw_song in (data.get("artist_songs") or []):
                parsed_song = self.parse_api_object(raw_song)
                if not isinstance(parsed_song, Song):
                    continue

                # TODO
                # fetch the album for these songs, because the api doesn't
                # return them
                artist.album_collection.extend(parsed_song.album_collection)

        artist.source_collection.append(source)

        return artist

    def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album:
        """
        Fetch an album and its tracklist from its genius page.

        The data is read from the json in the content attribute of the meta
        tag with itemprop="page_data".

        :param source: the source pointing to the album page
        :param stop_at_level: not used by this method
        :return: the fetched album; an empty Album without any source if the
            request yielded no response
        """
        album: Album = Album()
        # https://genius.com/api/artists/24527/albums?page=1

        response = self.connection.get(source.url, name=source.url)
        if response is None:
            return album
        soup = self.get_soup_from_response(response)

        # find the content attribute in the meta tag which is contained in the head
        data_container = soup.find("meta", {"itemprop": "page_data"})
        if data_container is not None:
            content = data_container["content"]
            dump_to_file("genius_itemprop_album.json", content, is_json=True, exit_after_dump=False)
            data = json.loads(content)

            # parse_api_object returns None if the album is missing or has no
            # url; the empty album from above is kept in that case
            parsed_album = self.parse_api_object(data.get("album"))
            if isinstance(parsed_album, Album):
                album = parsed_album

            # "album_appearances" may be missing as well as explicitly null
            for appearance in (data.get("album_appearances") or []):
                parsed_song = self.parse_api_object(appearance.get("song"))
                if not isinstance(parsed_song, Song):
                    continue

                album.song_collection.append(parsed_song)

        album.source_collection.append(source)

        return album

    def get_json_content_from_response(self, response, start: str, end: str) -> Optional[str]:
        """
        Cut the text between two markers out of the response body.

        :param response: an object whose text attribute holds the body
        :param start: the marker directly in front of the wanted text
        :param end: the marker directly behind the wanted text, it is only
            searched for after the start marker
        :return: the text between the first occurrence of start and the next
            occurrence of end; None if either marker can't be found
        """
        body = response.text

        try:
            begin = body.index(start) + len(start)
            stop = body.index(end, begin)
        except ValueError:
            # str.index raises where str.find would return -1
            return None

        return body[begin:stop]

    def fetch_song(self, source: Source, stop_at_level: int = 1) -> Song:
        """
        Fetch the lyrics of a song from its genius page.

        The lyrics are read from two places, and everything that is found gets
        appended to the lyrics of the song:
        the preloaded state, which is embedded in the page as a javascript
        string, and the div elements with data-lyrics-container="true".

        :param source: the source pointing to the song page
        :param stop_at_level: not used by this method
        :return: the fetched song; an empty Song without any source if the
            request yielded no response
        """
        song: Song = Song()

        r = self.connection.get(source.url, name=source.url)
        if r is None:
            return song

        # get the contents that are between `JSON.parse('` and `');`
        content = self.get_json_content_from_response(r, start="window.__PRELOADED_STATE__ = JSON.parse('", end="');\n      window.__APP_CONFIG__ = ")
        if content is not None:
            # undo the escaping of the javascript string literal
            content = content.replace("\\\\", "\\").replace('\\"', '"').replace("\\'", "'")

            try:
                data = json.loads(content)
            except json.JSONDecodeError as error:
                # the unescaping above is only an approximation, so the result
                # might not be valid json; the lyrics containers in the html
                # are still parsed below
                self.LOGGER.warning("couldn't parse the preloaded state of %s: %s", source.url, error)
            else:
                lyrics_html = traverse_json_path(data, "songPage.lyricsData.body.html", default=None)
                if lyrics_html is not None:
                    song.lyrics_collection.append(Lyrics(FormattedText(html=lyrics_html)))

                dump_to_file("genius_song_script_json.json", content, is_json=True, exit_after_dump=False)

        soup = self.get_soup_from_response(r)
        for lyrics in soup.find_all("div", {"data-lyrics-container": "true"}):
            lyrics_object = Lyrics(FormattedText(html=lyrics.prettify()))
            song.lyrics_collection.append(lyrics_object)

        song.source_collection.append(source)
        return song
feat: implemented artist search 2024-05-21 13:27:10 +00:00			`from typing import List, Optional, Type`
			`from urllib.parse import urlparse, urlunparse, urlencode`
			`import json`
			`from enum import Enum`
			`from bs4 import BeautifulSoup`
			`import pycountry`

			`from ..objects import Source, DatabaseObject`
			`from .abstract import Page`
			`from ..objects import (`
			`Artist,`
			`Source,`
			`SourceType,`
			`Song,`
			`Album,`
			`Label,`
			`Target,`
			`Contact,`
			`ID3Timestamp,`
			`Lyrics,`
			`FormattedText,`
			`Artwork,`
			`)`
			`from ..connection import Connection`
			`from ..utils import dump_to_file, traverse_json_path`
			`from ..utils.enums import SourceType, ALL_SOURCE_TYPES`
			`from ..utils.support_classes.download_result import DownloadResult`
			`from ..utils.string_processing import clean_song_title`
			`from ..utils.config import main_settings, logging_settings`
			`from ..utils.shared import DEBUG`

			`if DEBUG:`
			`from ..utils import dump_to_file`


			`class Genius(Page):`
			`SOURCE_TYPE = ALL_SOURCE_TYPES.GENIUS`
			`HOST = "genius.com"`

			`def __init__(self, args, *kwargs):`
			`self.connection: Connection = Connection(`
			`host="https://genius.com/",`
			`logger=self.LOGGER,`
			`module="genius",`
			`)`

			`super().__init__(args, *kwargs)`

			`def get_source_type(self, source: Source) -> Optional[Type[DatabaseObject]]:`
			`path = source.parsed_url.path.replace("/", "")`
feat: detecting url type 2024-05-21 13:57:09 +00:00
			`if path.startswith("artists"):`
			`return Artist`
			`if path.startswith("albums"):`
			`return Album`
feat: implemented artist search 2024-05-21 13:27:10 +00:00
feat: detecting url type 2024-05-21 13:57:09 +00:00			`return Song`
feat: implemented artist search 2024-05-21 13:27:10 +00:00
feat: finished searching genious 2024-05-21 13:52:41 +00:00			`def add_to_artwork(self, artwork: Artwork, url: str):`
			`if url is None:`
			`return`

			`url_frags = url.split(".")`
			`if len(url_frags) < 2:`
			`artwork.append(url=url)`
			`return`

			`dimensions = url_frags[-2].split("x")`
			`if len(dimensions) < 2:`
			`artwork.append(url=url)`
			`return`

			`if len(dimensions) == 3:`
			`dimensions = dimensions[:-1]`

			`try:`
			`artwork.append(url=url, width=int(dimensions[0]), height=int(dimensions[1]))`
			`except ValueError:`
			`artwork.append(url=url)`

			`def parse_api_object(self, data: dict) -> Optional[DatabaseObject]:`
fix: genius fallback 2024-05-22 13:18:43 +00:00			`if data is None:`
			`return None`
feat: finished searching genious 2024-05-21 13:52:41 +00:00			`object_type = data.get("_type")`

			`artwork = Artwork()`
			`self.add_to_artwork(artwork, data.get("header_image_url"))`
			`self.add_to_artwork(artwork, data.get("image_url"))`

feat: fetched the flat artist details 2024-05-21 14:21:58 +00:00			`additional_sources: List[Source] = []`
feat: finished searching genious 2024-05-21 13:52:41 +00:00			`source: Source = Source(self.SOURCE_TYPE, data.get("url"), additional_data={`
			`"id": data.get("id"),`
			`"slug": data.get("slug"),`
			`"api_path": data.get("api_path"),`
			`})`

feat: fetched the flat artist details 2024-05-21 14:21:58 +00:00			`notes = FormattedText()`
fix: genius fallback 2024-05-22 13:18:43 +00:00			`description = data.get("description") or {}`
feat: fetched the flat artist details 2024-05-21 14:21:58 +00:00			`if "html" in description:`
			`notes.html = description["html"]`
			`elif "markdown" in description:`
			`notes.markdown = description["markdown"]`
			`elif "description_preview" in data:`
feat: getting the album tracklist 2024-05-21 14:47:38 +00:00			`notes.plaintext = data["description_preview"]`
feat: fetched the flat artist details 2024-05-21 14:21:58 +00:00
feat: finished searching genious 2024-05-21 13:52:41 +00:00			`if source.url is None:`
			`return None`

			`if object_type == "artist":`
feat: fetched the flat artist details 2024-05-21 14:21:58 +00:00			`if data.get("instagram_name") is not None:`
			`additional_sources.append(Source(ALL_SOURCE_TYPES.INSTAGRAM, f"https://www.instagram.com/{data['instagram_name']}/"))`
			`if data.get("facebook_name") is not None:`
			`additional_sources.append(Source(ALL_SOURCE_TYPES.FACEBOOK, f"https://www.facebook.com/{data['facebook_name']}/"))`
			`if data.get("twitter_name") is not None:`
			`additional_sources.append(Source(ALL_SOURCE_TYPES.TWITTER, f"https://x.com/{data['twitter_name']}/"))`

feat: finished searching genious 2024-05-21 13:52:41 +00:00			`return Artist(`
fix: genius fallback 2024-05-22 13:18:43 +00:00			`name=data["name"].strip() if data.get("name") is not None else None,`
feat: finished searching genious 2024-05-21 13:52:41 +00:00			`source_list=[source],`
			`artwork=artwork,`
feat: fetched the flat artist details 2024-05-21 14:21:58 +00:00			`notes=notes,`
feat: finished searching genious 2024-05-21 13:52:41 +00:00			`)`

			`if object_type == "album":`
			`self.add_to_artwork(artwork, data.get("cover_art_thumbnail_url"))`
			`self.add_to_artwork(artwork, data.get("cover_art_url"))`

feat: album details 2024-05-21 14:43:52 +00:00			`for cover_art in data.get("cover_arts", []):`
			`self.add_to_artwork(artwork, cover_art.get("image_url"))`
			`self.add_to_artwork(artwork, cover_art.get("thumbnail_image_url"))`
feat: getting the album tracklist 2024-05-21 14:47:38 +00:00
feat: finished searching genious 2024-05-21 13:52:41 +00:00			`return Album(`
feat: album details 2024-05-21 14:43:52 +00:00			`title=data.get("name").strip(),`
feat: finished searching genious 2024-05-21 13:52:41 +00:00			`source_list=[source],`
			`artist_list=[self.parse_api_object(data.get("artist"))],`
			`artwork=artwork,`
hotfix 2024-06-03 08:19:32 +00:00			`date=ID3Timestamp(**(data.get("release_date_components") or {})),`
feat: finished searching genious 2024-05-21 13:52:41 +00:00			`)`

			`if object_type == "song":`
			`self.add_to_artwork(artwork, data.get("song_art_image_thumbnail_url"))`
			`self.add_to_artwork(artwork, data.get("song_art_image_url"))`

			`main_artist_list = []`
			`featured_artist_list = []`

			`_artist_name = None`
			`primary_artist = self.parse_api_object(data.get("primary_artist"))`
			`if primary_artist is not None:`
			`_artist_name = primary_artist.name`
			`main_artist_list.append(primary_artist)`
fix: genius fallback 2024-05-22 13:18:43 +00:00			`for feature_artist in ((data.get("featured_artists") or []), (data.get("producer_artists") or []), *(data.get("writer_artists") or [])):`
feat: finished searching genious 2024-05-21 13:52:41 +00:00			`artist = self.parse_api_object(feature_artist)`
			`if artist is not None:`
			`featured_artist_list.append(artist)`

			`return Song(`
			`title=clean_song_title(data.get("title"), artist_name=_artist_name),`
			`source_list=[source],`
			`artwork=artwork,`
			`feature_artist_list=featured_artist_list,`
			`artist_list=main_artist_list,`
			`)`

			`return None`

feat: implemented artist search 2024-05-21 13:27:10 +00:00			`def general_search(self, search_query: str, **kwargs) -> List[DatabaseObject]:`
			`results = []`

			`search_params = {`
			`"q": search_query,`
			`}`

			`r = self.connection.get("https://genius.com/api/search/multi?" + urlencode(search_params), name=f"search_{search_query}")`
			`if r is None:`
			`return results`

			`dump_to_file("search_genius.json", r.text, is_json=True, exit_after_dump=False)`
			`data = r.json()`

			`for elements in traverse_json_path(data, "response.sections", default=[]):`
			`hits = elements.get("hits", [])`
			`for hit in hits:`
feat: finished searching genious 2024-05-21 13:52:41 +00:00			`parsed = self.parse_api_object(hit.get("result"))`
			`if parsed is not None:`
			`results.append(parsed)`
feat: implemented artist search 2024-05-21 13:27:10 +00:00
			`return results`

			`def fetch_artist(self, source: Source, stop_at_level: int = 1) -> Artist:`
feat: theoretically fetching feature songs 2024-05-21 14:34:04 +00:00			`artist: Artist = Artist()`
feat: fetched the flat artist details 2024-05-21 14:21:58 +00:00			`# https://genius.com/api/artists/24527/albums?page=1`
feat: implemented artist search 2024-05-21 13:27:10 +00:00
feat: fetched the flat artist details 2024-05-21 14:21:58 +00:00			`r = self.connection.get(source.url, name=source.url)`
feat: implemented artist search 2024-05-21 13:27:10 +00:00			`if r is None:`
			`return artist`
			`soup = self.get_soup_from_response(r)`

feat: fetched the flat artist details 2024-05-21 14:21:58 +00:00			`# find the content attribute in the meta tag which is contained in the head`
			`data_container = soup.find("meta", {"itemprop": "page_data"})`
			`if data_container is not None:`
			`content = data_container["content"]`
			`dump_to_file("genius_itemprop_artist.json", content, is_json=True, exit_after_dump=False)`
			`data = json.loads(content)`
feat: implemented artist search 2024-05-21 13:27:10 +00:00
fix: genius fallback 2024-05-22 13:18:43 +00:00			`artist = self.parse_api_object(data.get("artist"))`
feat: fetched the flat artist details 2024-05-21 14:23:05 +00:00
fix: genius fallback 2024-05-22 13:18:43 +00:00			`for e in (data.get("artist_albums") or []):`
feat: theoretically fetching feature songs 2024-05-21 14:34:04 +00:00			`r = self.parse_api_object(e)`
			`if not isinstance(r, Album):`
			`continue`

			`artist.album_collection.append(r)`

fix: genius fallback 2024-05-22 13:18:43 +00:00			`for e in (data.get("artist_songs") or []):`
feat: theoretically fetching feature songs 2024-05-21 14:34:04 +00:00			`r = self.parse_api_object(e)`
			`if not isinstance(r, Song):`
			`continue`

			`"""`
			`TODO`
			`fetch the album for these songs, because the api doesn't`
			`return them`
			`"""`

			`artist.album_collection.extend(r.album_collection)`

			`artist.source_collection.append(source)`
feat: implemented artist search 2024-05-21 13:27:10 +00:00
			`return artist`

			`def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album:`
feat: album details 2024-05-21 14:43:52 +00:00			`album: Album = Album()`
			`# https://genius.com/api/artists/24527/albums?page=1`
feat: implemented artist search 2024-05-21 13:27:10 +00:00
feat: album details 2024-05-21 14:43:52 +00:00			`r = self.connection.get(source.url, name=source.url)`
feat: implemented artist search 2024-05-21 13:27:10 +00:00			`if r is None:`
			`return album`
			`soup = self.get_soup_from_response(r)`

feat: album details 2024-05-21 14:43:52 +00:00			`# find the content attribute in the meta tag which is contained in the head`
			`data_container = soup.find("meta", {"itemprop": "page_data"})`
			`if data_container is not None:`
			`content = data_container["content"]`
			`dump_to_file("genius_itemprop_album.json", content, is_json=True, exit_after_dump=False)`
			`data = json.loads(content)`
feat: implemented artist search 2024-05-21 13:27:10 +00:00
fix: genius fallback 2024-05-22 13:18:43 +00:00			`album = self.parse_api_object(data.get("album"))`
feat: getting the album tracklist 2024-05-21 14:47:38 +00:00
			`for e in data.get("album_appearances", []):`
			`r = self.parse_api_object(e.get("song"))`
			`if not isinstance(r, Song):`
			`continue`

			`album.song_collection.append(r)`

feat: implemented artist search 2024-05-21 13:27:10 +00:00			`album.source_collection.append(source)`
feat: album details 2024-05-21 14:43:52 +00:00
feat: implemented artist search 2024-05-21 13:27:10 +00:00			`return album`

feat: parsed script json 2024-05-21 15:14:58 +00:00			`def get_json_content_from_response(self, response, start: str, end: str) -> Optional[str]:`
			`content = response.text`
			`start_index = content.find(start)`
			`if start_index < 0:`
			`return None`
			`start_index += len(start)`
			`end_index = content.find(end, start_index)`
			`if end_index < 0:`
			`return None`
			`return content[start_index:end_index]`

feat: implemented artist search 2024-05-21 13:27:10 +00:00			`def fetch_song(self, source: Source, stop_at_level: int = 1) -> Song:`
feat: parsed script json 2024-05-21 15:14:58 +00:00			`song: Song = Song()`

			`r = self.connection.get(source.url, name=source.url)`
			`if r is None:`
			`return song`

			# get the contents that are between `JSON.parse('` and `');`
			`content = self.get_json_content_from_response(r, start="window.__PRELOADED_STATE__ = JSON.parse('", end="');\n window.__APP_CONFIG__ = ")`
			`if content is not None:`
			`content = content.replace("\\\\", "\\").replace('\\"', '"').replace("\\'", "'")`
			`data = json.loads(content)`

feat: better lyrics support 2024-05-21 15:55:08 +00:00			`lyrics_html = traverse_json_path(data, "songPage.lyricsData.body.html", default=None)`
			`if lyrics_html is not None:`
			`song.lyrics_collection.append(Lyrics(FormattedText(html=lyrics_html)))`

feat: parsed script json 2024-05-21 15:14:58 +00:00			`dump_to_file("genius_song_script_json.json", content, is_json=True, exit_after_dump=False)`

			`soup = self.get_soup_from_response(r)`
			`for lyrics in soup.find_all("div", {"data-lyrics-container": "true"}):`
			`lyrics_object = Lyrics(FormattedText(html=lyrics.prettify()))`
			`song.lyrics_collection.append(lyrics_object)`

			`song.source_collection.append(source)`
feat: implemented artist search 2024-05-21 13:27:10 +00:00			`return song`