From 34baf75d2e74caf610c0980d24021b6dafa3a439 Mon Sep 17 00:00:00 2001 From: Hellow2 Date: Tue, 28 Mar 2023 08:27:21 +0200 Subject: [PATCH] pretty much finished web scraping on metal archives besides lyrics --- documentation/html/metal-archives/album.html | 5 +- src/metal_archives.py | 5 +- .../pages/encyclopaedia_metallum.py | 153 +++++++++++++----- 3 files changed, 117 insertions(+), 46 deletions(-) diff --git a/documentation/html/metal-archives/album.html b/documentation/html/metal-archives/album.html index f236fe3..0b80eab 100644 --- a/documentation/html/metal-archives/album.html +++ b/documentation/html/metal-archives/album.html @@ -123,9 +123,10 @@
+

+ Ghost Bath +

-

Ghost - Bath

Ghost Bath

diff --git a/src/metal_archives.py b/src/metal_archives.py index b3d5ceb..4694120 100644 --- a/src/metal_archives.py +++ b/src/metal_archives.py @@ -16,8 +16,11 @@ def fetch_artist(): ] ) - artist = EncyclopaediaMetallum.fetch_details(artist) + artist: objects.Artist = EncyclopaediaMetallum.fetch_details(artist, stop_at_level=2) print(artist.options) + + for album in artist.main_album_collection: + print(album.options) def fetch_album(): diff --git a/src/music_kraken/pages/encyclopaedia_metallum.py b/src/music_kraken/pages/encyclopaedia_metallum.py index d5dbe0c..07cedec 100644 --- a/src/music_kraken/pages/encyclopaedia_metallum.py +++ b/src/music_kraken/pages/encyclopaedia_metallum.py @@ -460,6 +460,103 @@ class EncyclopaediaMetallum(Page): return artist + @classmethod + def _parse_album_track_row(cls, track_row: BeautifulSoup) -> Song: + """ + + 1. # id and tracksort + Convince Me to Bleed # name + 03:40 # length +   + Show lyrics + + + """ + + row_list = track_row.find_all(recursive=False) + + track_sort_soup = row_list[0] + track_sort = int(track_sort_soup.text[:-1]) + track_id = track_sort_soup.find("a").get("name") + + title = row_list[1].text.strip() + + length = None + + duration_stamp = row_list[2].text + if ":" in duration_stamp: + minutes, seconds = duration_stamp.split(":") + length = (int(minutes) * 60 + int(seconds)) * 1000 # in milliseconds + + return Song( + title=title, + length=length, + tracksort=track_sort, + source_list=[Source(cls.SOURCE_TYPE, track_id)] + ) + + + @classmethod + def _parse_album_attributes(cls, album_soup: BeautifulSoup, stop_at_level: int = 1) -> Album: + tracklist: List[Song] = [] + artist_list = [] + album_name: str = None + source_list: List[Source] = [] + + def _parse_album_info(album_info_soup: BeautifulSoup): + nonlocal artist_list + nonlocal album_name + nonlocal source_list + + if album_info_soup is None: + return + + album_soup_list = album_info_soup.find_all("h1", {"class": "album_name"}) + if len(album_soup_list) == 1: + anchor: BeautifulSoup = album_soup_list[0].find("a") + + href = anchor.get("href") + if href is not None: + source_list.append(Source(cls.SOURCE_TYPE, href.strip())) + + album_name = anchor.get_text(strip=True) + + elif len(album_soup_list) > 1: + LOGGER.debug("there are more than 1 album soups") + + + artist_soup_list = album_info_soup.find_all("h2", {"class": "band_name"}) + if len(artist_soup_list) == 1: + for anchor in artist_soup_list[0].find_all("a"): + artist_sources: List[Source] = [] + + href = anchor.get("href") + if href is not None: + artist_sources.append(Source(cls.SOURCE_TYPE, href.strip())) + + artist_name = anchor.get_text(strip=True) + + artist_list.append(Artist( + name=artist_name, + source_list=artist_sources + )) + + elif len(artist_soup_list) > 1: + LOGGER.debug("there are more than 1 artist soups") + + _parse_album_info(album_info_soup=album_soup.find(id="album_info")) + + tracklist_soup = album_soup.find("table", {"class": "table_lyrics"}).find("tbody") + for track_soup in tracklist_soup.find_all("tr", {"class": ["even", "odd"]}): + tracklist.append(cls._parse_album_track_row(track_row=track_soup)) + + return Album( + title=album_name, + source_list=source_list, + artist_list=artist_list, + song_list=tracklist + ) + @classmethod def _fetch_album_from_source(cls, source: Source, stop_at_level: int = 1) -> Album: """ @@ -472,51 +569,21 @@ class EncyclopaediaMetallum(Page): # - # id and tracksort - # name - # length - - - """ - row_list = row.find_all(recursive=False) - - track_sort_soup = row_list[0] - track_sort = int(track_sort_soup.text[:-1]) - track_id = track_sort_soup.find("a").get("name") - - title = row_list[1].text.strip() - - length = None - - duration_stamp = row_list[2].text - if ":" in duration_stamp: - minutes, seconds = duration_stamp.split(":") - length = (int(minutes) * 60 + int(seconds)) * 1000 # in milliseconds - - album.song_collection.append( - Song( - id_=track_id, - title=title, - length=length, - tracksort=track_sort, - source_list=[Source(cls.SOURCE_TYPE, track_id)] - ) - ) - + + album = cls._parse_album_attributes(soup, stop_at_level=stop_at_level) + + if stop_at_level > 1: + for song in album.song_collection: + for source in album.source_collection.get_sources_from_page(cls.SOURCE_TYPE): + song.merge(cls._fetch_song_from_source(source=source, stop_at_level=stop_at_level-1)) + return album + + @classmethod + def _fetch_song_from_source(cls, source: Source, stop_at_level: int = 1) -> Song: + return Song()
1.Convince Me to Bleed03:40  - Show lyrics -