pretty much finished web scraping on metal archives besides lyrics

2023-03-28 08:27:21 +02:00 · 2023-03-28 08:27:21 +02:00 · 34baf75d2e
commit 34baf75d2e
parent 4340ca3b32
3 changed files with 117 additions and 46 deletions
--- a/documentation/html/metal-archives/album.html
+++ b/documentation/html/metal-archives/album.html
@ -123,9 +123,10 @@
            </div>

            <div id="album_info">
+                <h1 class="album_name">
+                    <a href="https://www.metal-archives.com/albums/Ghost_Bath/Ghost_Bath/392863">Ghost Bath</a>
+                </h1>

-                <h1 class="album_name"><a href="https://www.metal-archives.com/albums/Ghost_Bath/Ghost_Bath/392863">Ghost
-                    Bath</a></h1>
                <h2 class="band_name">
                    <a href="https://www.metal-archives.com/bands/Ghost_Bath/3540372489">Ghost Bath</a>
                </h2>
--- a/src/metal_archives.py
+++ b/src/metal_archives.py
@ -16,9 +16,12 @@ def fetch_artist():
        ]
    )

-    artist = EncyclopaediaMetallum.fetch_details(artist)
+    artist: objects.Artist = EncyclopaediaMetallum.fetch_details(artist, stop_at_level=2)
    print(artist.options)
    
+    for album in artist.main_album_collection:
+        print(album.options)
+

 def fetch_album():
    album = objects.Album(
--- a/src/music_kraken/pages/encyclopaedia_metallum.py
+++ b/src/music_kraken/pages/encyclopaedia_metallum.py
@ -461,30 +461,8 @@ class EncyclopaediaMetallum(Page):
        return artist

    @classmethod
-    def _fetch_album_from_source(cls, source: Source, stop_at_level: int = 1) -> Album:
+    def _parse_album_track_row(cls, track_row: BeautifulSoup) -> Song:
        """
-        I am preeeety sure I can get way more data than... nothing from there
-
-        :param source:
-        :param stop_at_level:
-        :return:
-        """
-
-        # <table class="display table_lyrics
-
-        album = Album()
-
-        r = cls.get_request(source.url)
-        if r is None:
-            return album
-
-        soup = cls.get_soup_from_response(r)
-
-        tracklist_soup = soup.find("table", {"class": "table_lyrics"}).find("tbody")
-        for row in tracklist_soup.find_all("tr", {"class": ["even", "odd"]}):
-            """
-            example of row:
-
        <tr class="even">
            <td width="20"><a class="anchor" name="5948442"> </a>1.</td>        # id and tracksort
            <td class="wrapWords">Convince Me to Bleed</td>                     # name
@ -494,7 +472,8 @@ class EncyclopaediaMetallum(Page):
            </td>
        </tr>
        """
-            row_list = row.find_all(recursive=False)
+        
+        row_list = track_row.find_all(recursive=False)

        track_sort_soup = row_list[0]
        track_sort = int(track_sort_soup.text[:-1])
@ -509,14 +488,102 @@ class EncyclopaediaMetallum(Page):
            minutes, seconds = duration_stamp.split(":")
            length = (int(minutes) * 60 + int(seconds)) * 1000  # in milliseconds

-            album.song_collection.append(
-                Song(
-                    id_=track_id,
+        return Song(
            title=title,
            length=length,
            tracksort=track_sort,
            source_list=[Source(cls.SOURCE_TYPE, track_id)]
        )
+        
+
+    @classmethod
+    def _parse_album_attributes(cls, album_soup: BeautifulSoup, stop_at_level: int = 1) -> Album:
+        tracklist: List[Song] = []  # one Song per row of the track table
+        artist_list = []  # Artist objects built from the band_name anchors
+        album_name: str = None  # stays None unless exactly one album_name heading is found
+        source_list: List[Source] = []  # album sources taken from the heading link
+        # NOTE(review): stop_at_level is accepted but not used anywhere in this method.
+        def _parse_album_info(album_info_soup: BeautifulSoup):
+            nonlocal artist_list
+            nonlocal album_name
+            nonlocal source_list
+            # Nothing to parse when the page has no "album_info" element.
+            if album_info_soup is None:
+                return
+            # Album title and link: only used when exactly one h1.album_name exists.
+            album_soup_list = album_info_soup.find_all("h1", {"class": "album_name"})
+            if len(album_soup_list) == 1:
+                anchor: BeautifulSoup = album_soup_list[0].find("a")
+                
+                href = anchor.get("href")
+                if href is not None:
+                    source_list.append(Source(cls.SOURCE_TYPE, href.strip()))
+                    
+                album_name = anchor.get_text(strip=True)
+                
+            elif len(album_soup_list) > 1:
+                LOGGER.debug("there are more than 1 album soups")
+                
+            # Every anchor inside the single h2.band_name becomes one Artist (name + optional link).
+            artist_soup_list = album_info_soup.find_all("h2", {"class": "band_name"})
+            if len(artist_soup_list) == 1:
+                for anchor in artist_soup_list[0].find_all("a"):
+                    artist_sources: List[Source] = []
+                    
+                    href = anchor.get("href")
+                    if href is not None:
+                        artist_sources.append(Source(cls.SOURCE_TYPE, href.strip()))
+                        
+                    artist_name = anchor.get_text(strip=True)
+                    
+                    artist_list.append(Artist(
+                        name=artist_name,
+                        source_list=artist_sources
+                    ))
+                
+            elif len(artist_soup_list) > 1:
+                LOGGER.debug("there are more than 1 artist soups")
+        
+        _parse_album_info(album_info_soup=album_soup.find(id="album_info"))
+        # Track rows are the tr.even / tr.odd rows of the table_lyrics body. NOTE(review): no None check on either find() below.
+        tracklist_soup = album_soup.find("table", {"class": "table_lyrics"}).find("tbody")
+        for track_soup in tracklist_soup.find_all("tr", {"class": ["even", "odd"]}):
+            tracklist.append(cls._parse_album_track_row(track_row=track_soup))
+
+        return Album(
+            title=album_name,
+            source_list=source_list,
+            artist_list=artist_list,
+            song_list=tracklist
        )

+    @classmethod
+    def _fetch_album_from_source(cls, source: Source, stop_at_level: int = 1) -> Album:
+        """
+        Request the album page behind ``source`` and parse it into an Album.
+
+        :param source: source whose ``url`` is requested
+        :param stop_at_level: if > 1, each song is merged with the result of fetching its own sources
+        :return: the parsed Album, or an empty Album when ``get_request`` returns None
+        """
+
+        # the track list is the <table class="display table_lyrics"> element of the page
+
+        r = cls.get_request(source.url)
+        if r is None:
+            return Album()
+
+        soup = cls.get_soup_from_response(r)
+
+        album = cls._parse_album_attributes(soup, stop_at_level=stop_at_level)
+        # follow each song's own sources (not the album's) and keep the ``source`` parameter unshadowed
+        if stop_at_level > 1:
+            for song in album.song_collection:
+                for song_source in song.source_collection.get_sources_from_page(cls.SOURCE_TYPE):
+                    song.merge(cls._fetch_song_from_source(source=song_source, stop_at_level=stop_at_level-1))
+
        return album
+
+    @classmethod
+    def _fetch_song_from_source(cls, source: Source, stop_at_level: int = 1) -> Song:
+        return Song()  # placeholder: ignores both arguments and always returns an empty Song