feat: theoretically fetching feature songs

feat: fetched the flat artist details
2024-05-21 16:34:04 +02:00 · 2024-05-21 16:23:05 +02:00 · 2024-05-21 16:21:58 +02:00 · 2024-05-21 15:57:09 +02:00
5 changed files with 60 additions and 83 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -27,6 +27,7 @@
        "Gitea",
        "iframe",
        "isrc",
+        "itemprop",
        "levenshtein",
        "metallum",
        "MUSICBRAINZ",
--- a/development/actual_donwload.py
+++ b/development/actual_donwload.py
@@ -6,9 +6,8 @@ logging.getLogger().setLevel(logging.DEBUG)

 if __name__ == "__main__":
    commands = [
-        "s: #a I'm in a coffin",
-        "0",
-        "d: 0",
+        "s: #a Crystal F",
+        "10",
    ]

    
--- a/music_kraken/connection/cache.py
+++ b/music_kraken/connection/cache.py
@@ -136,13 +136,13 @@ class Cache:
        )
        self._write_attribute(cache_attribute)

-        cache_path = fit_to_file_system(Path(module_path, name), hidden_ok=True)
+        cache_path = fit_to_file_system(Path(module_path, name.replace("/", "_")), hidden_ok=True)
        with cache_path.open("wb") as content_file:
            self.logger.debug(f"writing cache to {cache_path}")
            content_file.write(content)

    def get(self, name: str) -> Optional[CacheResult]:
-        path = fit_to_file_system(Path(self._dir, self.module, name), hidden_ok=True)
+        path = fit_to_file_system(Path(self._dir, self.module, name.replace("/", "_")), hidden_ok=True)

        if not path.is_file():
            return None
@@ -165,7 +165,7 @@ class Cache:
            if ca.name == "":
                continue

-            file = fit_to_file_system(Path(self._dir, ca.module, ca.name), hidden_ok=True)
+            file = fit_to_file_system(Path(self._dir, ca.module, ca.name.replace("/", "_")), hidden_ok=True)

            if not ca.is_valid:
                self.logger.debug(f"deleting cache {ca.id}")
--- a/music_kraken/objects/artwork.py
+++ b/music_kraken/objects/artwork.py
@@ -59,4 +59,6 @@ class Artwork:
                self._variant_mapping[key] = value

    def __eq__(self, other: Artwork) -> bool:
+        if not isinstance(other, Artwork):
+            return False
        return any(a == b for a, b in zip(self._variant_mapping.keys(), other._variant_mapping.keys()))
--- a/music_kraken/pages/genius.py
+++ b/music_kraken/pages/genius.py
@@ -48,8 +48,13 @@ class Genius(Page):

    def get_source_type(self, source: Source) -> Optional[Type[DatabaseObject]]:
        path = source.parsed_url.path.replace("/", "")
+        
+        if path.startswith("artists"):
+            return Artist
+        if path.startswith("albums"):
+            return Album

-        return super().get_source_type(source)
+        return Song

    def add_to_artwork(self, artwork: Artwork, url: str):
        if url is None:
@@ -80,20 +85,38 @@ class Genius(Page):
        self.add_to_artwork(artwork, data.get("header_image_url"))
        self.add_to_artwork(artwork, data.get("image_url"))
        
+        additional_sources: List[Source] = []
        source: Source = Source(self.SOURCE_TYPE, data.get("url"), additional_data={
            "id": data.get("id"),
            "slug": data.get("slug"),
            "api_path": data.get("api_path"),
        })

+        notes = FormattedText()
+        description = data.get("description", {})
+        if "html" in description:
+            notes.html = description["html"]
+        elif "markdown" in description:
+            notes.markdown = description["markdown"]
+        elif "description_preview" in data:
+            notes.plain = data["description_preview"]
+
        if source.url is None:
            return None

        if object_type == "artist":
+            if data.get("instagram_name") is not None:
+                additional_sources.append(Source(ALL_SOURCE_TYPES.INSTAGRAM, f"https://www.instagram.com/{data['instagram_name']}/"))
+            if data.get("facebook_name") is not None:
+                additional_sources.append(Source(ALL_SOURCE_TYPES.FACEBOOK, f"https://www.facebook.com/{data['facebook_name']}/"))
+            if data.get("twitter_name") is not None:
+                additional_sources.append(Source(ALL_SOURCE_TYPES.TWITTER, f"https://x.com/{data['twitter_name']}/"))
+
            return Artist(
                name=data.get("name"),
                source_list=[source],
                artwork=artwork,
+                notes=notes,
            )

        if object_type == "album":
@@ -158,94 +181,46 @@ class Genius(Page):

        return results

-    def fetch_label(self, source: Source, stop_at_level: int = 1) -> Label:
-        return Label()
-
-    def _parse_artist_details(self, soup: BeautifulSoup) -> Artist:
-        name: str = None
-        source_list: List[Source] = []
-        contact_list: List[Contact] = []
-
-        band_name_location: BeautifulSoup = soup.find("p", {"id": "band-name-location"})
-        if band_name_location is not None:
-            title_span = band_name_location.find("span", {"class": "title"})
-            if title_span is not None:
-                name = title_span.text.strip()
-
-        link_container: BeautifulSoup = soup.find("ol", {"id": "band-links"})
-        if link_container is not None:
-            li: BeautifulSoup
-            for li in link_container.find_all("a"):
-                if li is None and li['href'] is not None:
-                    continue
-
-                source_list.append(Source.match_url(_parse_artist_url(li['href']), referrer_page=self.SOURCE_TYPE))
-
-        return Artist(
-            name=name,
-            source_list=source_list
-        )
-
-    def _parse_album(self, soup: BeautifulSoup, initial_source: Source) -> List[Album]:
-        title = None
-        source_list: List[Source] = []
-
-        a = soup.find("a")
-        if a is not None and a["href"] is not None:
-            source_list.append(Source(self.SOURCE_TYPE, _get_host(initial_source) + a["href"]))
-
-        title_p = soup.find("p", {"class": "title"})
-        if title_p is not None:
-            title = title_p.text.strip()
-
-        return Album(title=title, source_list=source_list)
-
-    def _parse_artist_data_blob(self, data_blob: dict, artist_url: str):
-        parsed_artist_url = urlparse(artist_url)
-        album_list: List[Album] = []
-
-        for album_json in data_blob.get("buyfulldisco", {}).get("tralbums", []):
-            album_list.append(Album(
-                title=album_json["title"].strip(),
-                source_list=[Source(
-                    self.SOURCE_TYPE,
-                    urlunparse((parsed_artist_url.scheme, parsed_artist_url.netloc, album_json["page_url"], "", "", ""))
-                )]
-            ))
-
-        return album_list
-
    def fetch_artist(self, source: Source, stop_at_level: int = 1) -> Artist:
-        artist = Artist()
+        artist: Artist = Artist()
+        # https://genius.com/api/artists/24527/albums?page=1

-        r = self.connection.get(_parse_artist_url(source.url), name=f"artist_{urlparse(source.url).scheme}_{urlparse(source.url).netloc}")
+        r = self.connection.get(source.url, name=source.url)
        if r is None:
            return artist
-
        soup = self.get_soup_from_response(r)

-        if DEBUG:
-            dump_to_file("artist_page.html", r.text, exit_after_dump=False)
+        # find the content attribute in the meta tag which is contained in the head
+        data_container = soup.find("meta", {"itemprop": "page_data"})
+        if data_container is not None:
+            content = data_container["content"]
+            dump_to_file("genius_itemprop_artist.json", content, is_json=True, exit_after_dump=False)
+            data = json.loads(content)

-        artist = self._parse_artist_details(soup=soup.find("div", {"id": "bio-container"}))
+            artist = self.parse_api_object(data.get("artist", {}))
+            
+            for e in data.get("artist_albums", []):
+                r = self.parse_api_object(e)
+                if not isinstance(r, Album):
+                    continue

-        html_music_grid = soup.find("ol", {"id": "music-grid"})
-        if html_music_grid is not None:
-            for subsoup in html_music_grid.find_all("li"):
-                artist.album_collection.append(self._parse_album(soup=subsoup, initial_source=source))
+                artist.album_collection.append(r)
+            
+            for e in data.get("artist_songs", []):
+                r = self.parse_api_object(e)
+                if not isinstance(r, Song):
+                    continue

-        for i, data_blob_soup in enumerate(soup.find_all("div", {"id": ["pagedata", "collectors-data"]})):
-            data_blob = data_blob_soup["data-blob"]
+                """
+                TODO
+                fetch the album for these songs, because the api doesn't 
+                return them
+                """

-            if DEBUG:
-                dump_to_file(f"bandcamp_artist_data_blob_{i}.json", data_blob, is_json=True, exit_after_dump=False)
-
-            if data_blob is not None:
-                artist.album_collection.extend(
-                    self._parse_artist_data_blob(json.loads(data_blob), source.url)
-                )
+                artist.album_collection.extend(r.album_collection)

        artist.source_collection.append(source)
+
        return artist

    def _parse_track_element(self, track: dict, artwork: Artwork) -> Optional[Song]:
Author	SHA1	Message	Date
Lars Noack	f5d953d9ce	feat: theoretically fetching feature songs (CI: ci/woodpecker/push/woodpecker — all checks were successful, pipeline was successful)	2024-05-21 16:34:04 +02:00
Lars Noack	46b64b8f8d	feat: fetched the flat artist details	2024-05-21 16:23:05 +02:00
Lars Noack	adfce16d2a	feat: fetched the flat artist details	2024-05-21 16:21:58 +02:00
Lars Noack	e4fd9faf12	feat: detecting url type	2024-05-21 15:57:09 +02:00