diff --git a/src/music_kraken/pages/encyclopaedia_metallum.py b/src/music_kraken/pages/encyclopaedia_metallum.py index 9210deb..d5dbe0c 100644 --- a/src/music_kraken/pages/encyclopaedia_metallum.py +++ b/src/music_kraken/pages/encyclopaedia_metallum.py @@ -1,4 +1,5 @@ -from typing import List +from collections import defaultdict +from typing import List, Optional, Dict import requests from bs4 import BeautifulSoup import pycountry @@ -18,7 +19,8 @@ from ..objects import ( ID3Timestamp, FormattedText, Label, - Options + Options, + AlbumType ) from ..utils import ( string_processing @@ -34,6 +36,12 @@ class EncyclopaediaMetallum(Page): SOURCE_TYPE = SourcePages.ENCYCLOPAEDIA_METALLUM + ALBUM_TYPE_MAP: Dict[str, AlbumType] = defaultdict(lambda: AlbumType.OTHER, { + "EP": AlbumType.EP, + "Full-length": AlbumType.STUDIO_ALBUM, + "Single": AlbumType.SINGLE + }) + @classmethod def search_by_query(cls, query: str) -> Options: query_obj = cls.Query(query) @@ -211,22 +219,16 @@ class EncyclopaediaMetallum(Page): ) @classmethod - def fetch_artist_discography(cls, artist: Artist, ma_artist_id: str, flat: bool = False) -> Artist: - """ - TODO - I'd guess this funktion has quite some possibility for optimizations - in form of performance and clean code - """ + def _fetch_artist_discography(cls, ma_artist_id: str) -> List[Album]: discography_url = "https://www.metal-archives.com/band/discography/id/{}/tab/all" # make the request - r = cls.API_SESSION.get(discography_url.format(ma_artist_id)) - if r.status_code != 200: - LOGGER.warning(f"code {r.status_code} at {discography_url.format(ma_artist_id)}") - return artist + r = cls.get_request(discography_url.format(ma_artist_id)) + if r is None: + return [] + soup = cls.get_soup_from_response(r) - # parse the html - soup = BeautifulSoup(r.text, 'html.parser') + discography = [] tbody_soup = soup.find('tbody') for tr_soup in tbody_soup.find_all('tr'): @@ -236,7 +238,7 @@ class EncyclopaediaMetallum(Page): album_name = album_soup.text album_url = album_soup.find('a').get('href') album_id = album_url.split('/')[-1] - album_type = td_list[1].text + raw_album_type = td_list[1].text album_year = td_list[2].text date_obj = None try: @@ -244,36 +246,29 @@ class EncyclopaediaMetallum(Page): except ValueError(): pass - artist.main_album_collection.append( + discography.append( Album( id_=album_id, title=album_name, - album_type=album_type, date=date_obj, + album_type=cls.ALBUM_TYPE_MAP[raw_album_type], source_list=[Source(SourcePages.ENCYCLOPAEDIA_METALLUM, album_url)] ) ) - if not flat: - for album in artist.main_album_collection: - cls.fetch_album_details(album, flat=flat) - - return artist + return discography @classmethod - def fetch_artist_sources(cls, artist: Artist, ma_artist_id: str) -> Artist: + def _fetch_artist_sources(cls, ma_artist_id: str) -> List[Source]: sources_url = "https://www.metal-archives.com/link/ajax-list/type/band/id/{}" + r = cls.get_request(sources_url.format(ma_artist_id)) + if r is None: + return [] - # make the request - r = cls.API_SESSION.get(sources_url.format(ma_artist_id)) - if r.status_code != 200: - LOGGER.warning(f"code {r.status_code} at {sources_url.format(ma_artist_id)}") - return artist + soup = cls.get_soup_from_response(r) - soup = BeautifulSoup(r.text, 'html.parser') - - if soup.find("span",{"id": "noLinks"}) is not None: - return artist + if soup.find("span", {"id": "noLinks"}) is not None: + return [] artist_source = soup.find("div", {"id": "band_links_Official"}) """ @@ -285,18 +280,18 @@ class EncyclopaediaMetallum(Page): merchandice_source = soup.find("div", {"id": "band_links_Official_merchandise"}) label_source = soup.find("div", {"id": "band_links_Labels"}) + source_list = [] + if artist_source is not None: for tr in artist_source.find_all("td"): a = tr.find("a") url = a.get("href") - - source = Source.match_url(url) - if source is None: + if url is None: continue - artist.add_source(source) + source_list.append(Source.match_url(url)) - return artist + return source_list @classmethod def _parse_artist_attributes(cls, artist_soup: BeautifulSoup) -> Artist: @@ -350,7 +345,6 @@ class EncyclopaediaMetallum(Page): name = anchor.get_text(strip=True) - band_stat_soup = artist_soup.find("div", {"id": "band_stats"}) for dl_soup in band_stat_soup.find_all("dl"): for title, data in zip(dl_soup.find_all("dt"), dl_soup.find_all("dd")): @@ -423,17 +417,15 @@ class EncyclopaediaMetallum(Page): return cls._parse_artist_attributes(artist_soup=soup) @classmethod - def fetch_band_notes(cls, artist: Artist, ma_artist_id: str) -> Artist: + def _fetch_band_notes(cls, ma_artist_id: str) -> Optional[FormattedText]: endpoint = "https://www.metal-archives.com/band/read-more/id/{}" # make the request - r = cls.API_SESSION.get(endpoint.format(ma_artist_id)) - if r.status_code != 200: - LOGGER.warning(f"code {r.status_code} at {endpoint.format(ma_artist_id)}") - return artist + r = cls.get_request(endpoint.format(ma_artist_id)) + if r is None: + return FormattedText() - artist.notes.html = r.text - return artist + return FormattedText(html=r.text) @classmethod def _fetch_artist_from_source(cls, source: Source, stop_at_level: int = 1) -> Artist: @@ -450,66 +442,49 @@ class EncyclopaediaMetallum(Page): artist = cls._fetch_artist_attributes(source.url) - return artist - - @classmethod - def fetch_artist_details(cls, artist: Artist, flat: bool = False) -> Artist: - source_list = artist.source_collection.get_sources_from_page(cls.SOURCE_TYPE) - if len(source_list) == 0: - return artist - - # taking the fist source, cuz I only need one and multiple sources don't make that much sense - source = source_list[0] artist_id = source.url.split("/")[-1] - """ - TODO - [x] https://www.metal-archives.com/bands/Ghost_Bath/3540372489 - [x] https://www.metal-archives.com/band/discography/id/3540372489/tab/all - [] reviews: https://www.metal-archives.com/review/ajax-list-band/id/3540372489/json/1?sEcho=1&iColumns=4&sColumns=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&iSortCol_0=3&sSortDir_0=desc&iSortingCols=1&bSortable_0=true&bSortable_1=true&bSortable_2=true&bSortable_3=true&_=1675155257133 - [] simmilar: https://www.metal-archives.com/band/ajax-recommendations/id/3540372489 - [x] sources: https://www.metal-archives.com/link/ajax-list/type/band/id/3540372489 - [x] band notes: https://www.metal-archives.com/band/read-more/id/3540372489 - """ + artist_sources = cls._fetch_artist_sources(artist_id) + artist.source_collection.extend(artist_sources) - # SIMPLE METADATA - artist = cls._fetch_artist_attributes(artist, source.url) + band_notes = cls._fetch_band_notes(artist_id) + if band_notes is not None: + artist.notes = band_notes - # DISCOGRAPHY - artist = cls.fetch_artist_discography(artist, artist_id, flat=flat) - - # EXTERNAL SOURCES - artist = cls.fetch_artist_sources(artist, artist_id) - - # ARTIST NOTES - artist = cls.fetch_band_notes(artist, artist_id) + discography: List[Album] = cls._fetch_artist_discography(artist_id) + if stop_at_level > 1: + for album in discography: + for source in album.source_collection.get_sources_from_page(cls.SOURCE_TYPE): + album.merge(cls._fetch_album_from_source(source, stop_at_level=stop_at_level-1)) + artist.main_album_collection.extend(discography) return artist @classmethod - def fetch_album_details(cls, album: Album, flat: bool = False) -> Album: - source_list = album.source_collection.get_sources_from_page(cls.SOURCE_TYPE) - if len(source_list) == 0: - return album - - source = source_list[0] - album_id = source.url.split("/")[-1] + def _fetch_album_from_source(cls, source: Source, stop_at_level: int = 1) -> Album: + """ + I am preeeety sure I can get way more data than... nothing from there + + :param source: + :param stop_at_level: + :return: + """ #