"""Page implementation that searches and scrapes Encyclopaedia Metallum (metal-archives.com)."""

from collections import defaultdict
from typing import List, Optional, Dict, Type, Union
from urllib.parse import quote, urlparse

import requests
from bs4 import BeautifulSoup
import pycountry

from ..connection import Connection
from ..utils.shared import ENCYCLOPAEDIA_METALLUM_LOGGER, proxies
from ..utils import string_processing
from .abstract import Page
from ..utils.enums.source import SourcePages
from ..utils.enums.album import AlbumType
from ..objects import (
    Lyrics,
    Artist,
    Source,
    Song,
    Album,
    ID3Timestamp,
    FormattedText,
    Label,
    Options,
)


def _quote_param(value) -> str:
    """Percent-encode ``value`` so it can be used as one URL query-parameter value.

    Without this, characters such as ``&``, ``#`` or ``+`` in a band or song
    name would be interpreted as URL syntax and corrupt the search request.
    """
    return quote("" if value is None else str(value), safe="")


class EncyclopaediaMetallum(Page):
    """Search and fetch artists, albums and songs from metal-archives.com."""

    CONNECTION: Connection = Connection(
        host="https://www.metal-archives.com/",
        logger=ENCYCLOPAEDIA_METALLUM_LOGGER
    )

    SOURCE_TYPE = SourcePages.ENCYCLOPAEDIA_METALLUM

    # Maps the release-type label shown by the site to the internal album type.
    # Unknown labels fall back to AlbumType.OTHER through the defaultdict factory.
    ALBUM_TYPE_MAP: Dict[str, AlbumType] = defaultdict(lambda: AlbumType.OTHER, {
        "Full-length": AlbumType.STUDIO_ALBUM,
        "Single": AlbumType.SINGLE,
        "EP": AlbumType.EP,
        "Demo": AlbumType.DEMO,
        "Video": AlbumType.OTHER,
        "Live album": AlbumType.LIVE_ALBUM,
        "Compilation": AlbumType.COMPILATION_ALBUM
    })

    LOGGER = ENCYCLOPAEDIA_METALLUM_LOGGER

    @classmethod
    def search_by_query(cls, query: str) -> Options:
        """Search the site for ``query`` and return the matches as ``Options``.

        Raw queries go through the simple band search, parsed queries through
        the advanced search. Both paths return an ``Options`` object; the raw
        path previously leaked the bare list produced by ``simple_search``.
        """
        query_obj = cls.Query(query)

        if query_obj.is_raw:
            return Options(cls.simple_search(query_obj))
        return cls.advanced_search(query_obj)

    @classmethod
    def advanced_search(cls, query: Page.Query) -> Options:
        """Dispatch to the most specific search the parsed query allows.

        Precedence is song, then album, then artist. If none of the three is
        set, an empty ``Options`` instance is returned.
        """
        if query.song is not None:
            return Options(cls.search_for_song(query=query))
        if query.album is not None:
            return Options(cls.search_for_album(query=query))
        if query.artist is not None:
            return Options(cls.search_for_artist(query=query))
        # Return an instance, not the Options class itself.
        return Options([])

    @classmethod
    def _fetch_search_rows(cls, url: str) -> list:
        """Request a search endpoint and return the rows stored under ``aaData``.

        Returns an empty list when the request fails (``CONNECTION.get``
        returned ``None``) or when the JSON payload has no ``aaData`` key.
        """
        r = cls.CONNECTION.get(url)
        if r is None:
            return []
        return r.json().get('aaData', [])

    @classmethod
    def search_for_song(cls, query: Page.Query) -> List[Song]:
        """Run the advanced song search using the song, artist and album of ``query``."""
        endpoint = (
            "https://www.metal-archives.com/search/ajax-advanced/searching/songs/?songTitle={song}&bandName={artist}"
            "&releaseTitle={album}&lyrics=&genre=&sEcho=1&iColumns=5&sColumns=&iDisplayStart=0"
            "&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&mDataProp_4=4"
            "&_=1674550595663"
        )

        rows = cls._fetch_search_rows(endpoint.format(
            song=_quote_param(query.song_str),
            artist=_quote_param(query.artist_str),
            album=_quote_param(query.album_str),
        ))

        # Each row is indexed as: artist html, album html, release type, title, lyrics html.
        return [
            cls.get_song_from_json(
                artist_html=raw_song[0],
                album_html=raw_song[1],
                release_type=raw_song[2],
                title=raw_song[3],
                lyrics_html=raw_song[4]
            )
            for raw_song in rows
        ]

    @classmethod
    def search_for_album(cls, query: Page.Query) -> List[Album]:
        """Run the advanced album search using the artist and album of ``query``."""
        endpoint = (
            "https://www.metal-archives.com/search/ajax-advanced/searching/albums/?bandName={artist}"
            "&releaseTitle={album}&releaseYearFrom=&releaseMonthFrom=&releaseYearTo=&releaseMonthTo="
            "&country=&location=&releaseLabelName=&releaseCatalogNumber=&releaseIdentifiers="
            "&releaseRecordingInfo=&releaseDescription=&releaseNotes=&genre=&sEcho=1&iColumns=3&sColumns="
            "&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&_=1674563943747"
        )

        rows = cls._fetch_search_rows(endpoint.format(
            artist=_quote_param(query.artist_str),
            album=_quote_param(query.album_str),
        ))

        # Each row is indexed as: artist html, album html, release type.
        return [
            cls.get_album_from_json(
                artist_html=raw_album[0],
                album_html=raw_album[1],
                release_type=raw_album[2]
            )
            for raw_album in rows
        ]

    @classmethod
    def search_for_artist(cls, query: Page.Query) -> List[Artist]:
        """Run the advanced band search using the artist of ``query``."""
        endpoint = (
            "https://www.metal-archives.com/search/ajax-advanced/searching/bands/?bandName={artist}"
            "&genre=&country=&yearCreationFrom=&yearCreationTo=&bandNotes=&status=&themes=&location="
            "&bandLabelName=&sEcho=1&iColumns=3&sColumns=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0"
            "&mDataProp_1=1&mDataProp_2=2&_=1674565459976"
        )

        # Uses artist_str like the song and album searches do.
        rows = cls._fetch_search_rows(endpoint.format(artist=_quote_param(query.artist_str)))

        # Each row is indexed as: artist html, genre, country.
        return [
            cls.get_artist_from_json(artist_html=raw_artist[0], genre=raw_artist[1], country=raw_artist[2])
            for raw_artist in rows
        ]

    @classmethod
    def simple_search(cls, query: Page.Query) -> List[Artist]:
        """
        Searches the default endpoint of metal archives, which in turn only
        searches for bands. It is the site's default, so it is used for raw queries.
        """
        endpoint = (
            "https://www.metal-archives.com/search/ajax-band-search/?field=name&query={query}"
            "&sEcho=1&iColumns=3&sColumns=&iDisplayStart=0&iDisplayLength=200"
            "&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2"
        )

        # The query object is rendered with str(), exactly as str.format did before.
        rows = cls._fetch_search_rows(endpoint.format(query=_quote_param(query)))

        return [
            cls.get_artist_from_json(artist_html=raw_artist[0], genre=raw_artist[1], country=raw_artist[2])
            for raw_artist in rows
        ]

    @classmethod
    def get_artist_from_json(cls, artist_html=None, genre=None, country=None) -> Artist:
        """Build an ``Artist`` from the artist cell of a search-result row.

        :param artist_html: html snippet containing the link to the band page
        :param genre: accepted for the callers' convenience, currently unused
        :param country: accepted for the callers' convenience, currently unused
        :return: artist carrying the link text as name and the link target as source

        TODO parse the country to a standard
        """
        # The band name is the anchor text and the band page is its href.
        soup = BeautifulSoup(artist_html, 'html.parser')
        anchor = soup.find('a')

        # NOTE(review): the original also split the text remaining after the
        # link and the <strong> element into a list (apparently alias names)
        # but never used the result, so that dead code was dropped.
        return Artist(
            name=anchor.text,
            source_list=[
                Source(SourcePages.ENCYCLOPAEDIA_METALLUM, anchor.get('href'))
            ]
        )

    @classmethod
    def get_album_from_json(cls, album_html=None, release_type=None, artist_html=None) -> Album:
        """Build an ``Album`` from the album, release-type and artist cells of a search-result row."""
        # The album title is the anchor text and the album page is its href.
        soup = BeautifulSoup(album_html, 'html.parser')
        anchor = soup.find('a')

        # release_type defaults to None, which must not crash on .strip().
        album_type = cls.ALBUM_TYPE_MAP[(release_type or "").strip()]

        return Album(
            title=anchor.text,
            album_type=album_type,
            source_list=[
                Source(SourcePages.ENCYCLOPAEDIA_METALLUM, anchor.get('href'))
            ],
            artist_list=[
                cls.get_artist_from_json(artist_html=artist_html)
            ]
        )

    @classmethod
    def get_song_from_json(cls, artist_html=None, album_html=None, release_type=None, title=None,
                           lyrics_html=None) -> Song:
        """Build a ``Song`` from the cells of a song search-result row.

        The song id is taken from the ``id`` attribute of the link in the lyrics
        cell, with the ``lyricsLink_`` prefix removed. A source is only attached
        when such an id exists.
        """
        song_id = None
        if lyrics_html is not None:
            anchor = BeautifulSoup(lyrics_html, 'html.parser').find('a')
            raw_song_id = anchor.get('id') if anchor is not None else None
            if raw_song_id is not None:
                song_id = raw_song_id.replace("lyricsLink_", "")

        source_list = []
        if song_id is not None:
            source_list.append(Source(SourcePages.ENCYCLOPAEDIA_METALLUM, song_id))

        return Song(
            title=title,
            main_artist_list=[
                cls.get_artist_from_json(artist_html=artist_html)
            ],
            album_list=[
                cls.get_album_from_json(album_html=album_html, release_type=release_type, artist_html=artist_html)
            ],
            source_list=source_list
        )

    @classmethod
    def _fetch_artist_discography(cls, ma_artist_id: str) -> List[Album]:
        """Fetch the discography table of a band and return one ``Album`` per usable row.

        Rows with fewer than three cells or without a link in the first cell
        are skipped. A year that is not a number yields an album without a date.
        """
        discography_url = "https://www.metal-archives.com/band/discography/id/{}/tab/all"

        r = cls.CONNECTION.get(discography_url.format(ma_artist_id))
        if r is None:
            return []

        soup = cls.get_soup_from_response(r)
        tbody_soup = soup.find('tbody')
        if tbody_soup is None:
            return []

        discography = []
        for tr_soup in tbody_soup.find_all('tr'):
            # Direct children only: name cell, type cell, year cell, ...
            td_list = tr_soup.find_all(recursive=False)
            if len(td_list) < 3:
                continue

            album_soup = td_list[0]
            album_anchor = album_soup.find('a')
            if album_anchor is None:
                continue

            try:
                date_obj = ID3Timestamp(year=int(td_list[2].text))
            except ValueError:
                date_obj = None

            discography.append(
                Album(
                    title=album_soup.text,
                    date=date_obj,
                    album_type=cls.ALBUM_TYPE_MAP[td_list[1].text.strip()],
                    source_list=[Source(SourcePages.ENCYCLOPAEDIA_METALLUM, album_anchor.get('href'))]
                )
            )

        return discography

    @classmethod
    def _fetch_artist_sources(cls, ma_artist_id: str) -> List[Source]:
        """Fetch the links listed in the "Official" section of a band's link page."""
        sources_url = "https://www.metal-archives.com/link/ajax-list/type/band/id/{}"

        r = cls.CONNECTION.get(sources_url.format(ma_artist_id))
        if r is None:
            return []

        soup = cls.get_soup_from_response(r)

        if soup.find("span", {"id": "noLinks"}) is not None:
            return []

        # TODO add a Label object to add the label sources from (div id "band_links_Labels")
        # TODO maybe do merchandise stuff (div id "band_links_Official_merchandise")
        artist_source = soup.find("div", {"id": "band_links_Official"})
        if artist_source is None:
            return []

        source_list = []
        for td in artist_source.find_all("td"):
            a = td.find("a")
            if a is None:
                # cell without a link
                continue
            url = a.get("href")
            if url is None:
                continue

            source_list.append(Source.match_url(url, referer_page=cls.SOURCE_TYPE))

        return source_list

    @classmethod
    def _parse_artist_attributes(cls, artist_soup: BeautifulSoup) -> Artist:
        """Parse a band page into an ``Artist``.

        NOTE(review): part of this method was lost in the copy of the file this
        rewrite was made from (text between html-like tags was stripped). Only
        the name parsing was visible; restore the remaining attribute parsing
        from version control.
        """
        name: Optional[str] = None
        # NOTE(review): the locals below were declared by the original but their
        # use sat in the lost part of the body. The original annotation of
        # `country` was `pycountry.Countrie`, which pycountry does not provide.
        country = None
        formed_in_year: Optional[int] = None
        genre: Optional[str] = None
        lyrical_themes: List[str] = []
        label_name: Optional[str] = None
        label_url: Optional[str] = None
        source_list: List[Source] = []

        title_soup = artist_soup.find("title")
        if title_soup is not None:
            bad_name_substring = " - Encyclopaedia Metallum: The Metal Archives"
            title_text = title_soup.get_text()
            # Only trust the title when the site suffix occurs exactly once.
            if title_text.count(bad_name_substring) == 1:
                name = title_text.replace(bad_name_substring, "")
            else:
                cls.LOGGER.debug(f"the title of the page is \"{title_text}\"")

        # TODO Implement the band pictures and logos
        # (the element names this note referred to were lost with the stripped text)

        # NOTE(review): reconstructed return value, limited to what was parsed above.
        return Artist(
            name=name,
            source_list=source_list
        )

    @classmethod
    def _fetch_artist_attributes(cls, url: str) -> Artist:
        """Request the band page at ``url`` and parse it into an ``Artist``.

        NOTE(review): the signature line was lost in the copy of the file this
        rewrite was made from; it is reconstructed from the body and the call
        in ``_fetch_artist_from_source``.
        """
        r = cls.CONNECTION.get(url)
        if r is None:
            return Artist()

        soup: BeautifulSoup = cls.get_soup_from_response(r)

        return cls._parse_artist_attributes(artist_soup=soup)

    @classmethod
    def _fetch_band_notes(cls, ma_artist_id: str) -> Optional[FormattedText]:
        """Fetch the "read more" notes of a band; empty ``FormattedText`` if the request fails."""
        endpoint = "https://www.metal-archives.com/band/read-more/id/{}"

        r = cls.CONNECTION.get(endpoint.format(ma_artist_id))
        if r is None:
            return FormattedText()

        return FormattedText(html=r.text)

    @classmethod
    def _fetch_artist_from_source(cls, source: Source, stop_at_level: int = 1) -> Artist:
        """
        What it could fetch, and what is implemented:

        [x] https://www.metal-archives.com/bands/Ghost_Bath/3540372489
        [x] https://www.metal-archives.com/band/discography/id/3540372489/tab/all
        [] reviews: https://www.metal-archives.com/review/ajax-list-band/id/3540372489/json/1?sEcho=1&iColumns=4&sColumns=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&iSortCol_0=3&sSortDir_0=desc&iSortingCols=1&bSortable_0=true&bSortable_1=true&bSortable_2=true&bSortable_3=true&_=1675155257133
        [] similar: https://www.metal-archives.com/band/ajax-recommendations/id/3540372489
        [x] sources: https://www.metal-archives.com/link/ajax-list/type/band/id/3540372489
        [x] band notes: https://www.metal-archives.com/band/read-more/id/3540372489

        :param source: source whose url points to the band page
        :param stop_at_level: albums are fetched in detail only when this is greater than 1
        :return: the artist with sources, notes and discography attached
        """
        artist = cls._fetch_artist_attributes(source.url)

        # The band id is the last path segment of the band url.
        artist_id = source.url.split("/")[-1]

        artist_sources = cls._fetch_artist_sources(artist_id)
        artist.source_collection.extend(artist_sources)

        band_notes = cls._fetch_band_notes(artist_id)
        if band_notes is not None:
            artist.notes = band_notes

        discography: List[Album] = cls._fetch_artist_discography(artist_id)
        if stop_at_level > 1:
            for album in discography:
                # Named album_source so the `source` parameter is not shadowed.
                for album_source in album.source_collection.get_sources_from_page(cls.SOURCE_TYPE):
                    album.merge(cls._fetch_album_from_source(album_source, stop_at_level=stop_at_level-1))
        artist.main_album_collection.extend(discography)

        return artist

    @classmethod
    def _parse_album_track_row(cls, track_row: BeautifulSoup) -> Song:
        """Parse one row of an album's track table into a ``Song``.

        The direct children of the row are read as:
        0: track number followed by a dot, containing an anchor whose ``name`` is the track id
        1: title
        2: length as ``MM:SS`` (``H:MM:SS`` is accepted as well)

        A source is only attached when the row contains a link to ``#<track id>``.
        """
        row_list = track_row.find_all(recursive=False)

        source_list: List[Source] = []

        track_sort_soup = row_list[0]
        # "1." -> 1
        track_sort = int(track_sort_soup.text.strip().rstrip("."))

        id_anchor = track_sort_soup.find("a")
        track_id = (id_anchor.get("name") or "").strip() if id_anchor is not None else ""
        if track_id and track_row.find("a", {"href": f"#{track_id}"}) is not None:
            source_list.append(Source(cls.SOURCE_TYPE, track_id))

        title = row_list[1].text.strip()

        length = None
        duration_stamp = row_list[2].text.strip()
        if ":" in duration_stamp:
            # Fold the colon separated parts so hours are handled too.
            total_seconds = 0
            for part in duration_stamp.split(":"):
                total_seconds = total_seconds * 60 + int(part)
            length = total_seconds * 1000  # in milliseconds

        return Song(
            title=title,
            length=length,
            tracksort=track_sort,
            source_list=source_list
        )

    @classmethod
    def _parse_album_attributes(cls, album_soup: BeautifulSoup, stop_at_level: int = 1) -> Album:
        """Parse an album page into an ``Album`` with title, sources, artists and tracks.

        :param album_soup: parsed album page
        :param stop_at_level: currently unused, kept for the callers
        """
        tracklist: List[Song] = []
        artist_list: List[Artist] = []
        album_name: Optional[str] = None
        source_list: List[Source] = []

        album_info_soup = album_soup.find(id="album_info")
        if album_info_soup is not None:
            album_soup_list = album_info_soup.find_all("h1", {"class": "album_name"})
            if len(album_soup_list) == 1:
                anchor = album_soup_list[0].find("a")
                if anchor is not None:
                    href = anchor.get("href")
                    if href is not None:
                        source_list.append(Source(cls.SOURCE_TYPE, href.strip()))
                    album_name = anchor.get_text(strip=True)
            elif len(album_soup_list) > 1:
                cls.LOGGER.debug("there are more than 1 album soups")

            artist_soup_list = album_info_soup.find_all("h2", {"class": "band_name"})
            if len(artist_soup_list) == 1:
                # One anchor per band; a release can list several bands.
                for anchor in artist_soup_list[0].find_all("a"):
                    artist_sources: List[Source] = []

                    href = anchor.get("href")
                    if href is not None:
                        artist_sources.append(Source(cls.SOURCE_TYPE, href.strip()))

                    artist_list.append(Artist(
                        name=anchor.get_text(strip=True),
                        source_list=artist_sources
                    ))
            elif len(artist_soup_list) > 1:
                cls.LOGGER.debug("there are more than 1 artist soups")

        table_soup = album_soup.find("table", {"class": "table_lyrics"})
        if table_soup is not None:
            # Fall back to the table itself if the markup has no <tbody>.
            tracklist_soup = table_soup.find("tbody")
            if tracklist_soup is None:
                tracklist_soup = table_soup
            for track_soup in tracklist_soup.find_all("tr", {"class": ["even", "odd"]}):
                tracklist.append(cls._parse_album_track_row(track_row=track_soup))

        return Album(
            title=album_name,
            source_list=source_list,
            artist_list=artist_list,
            song_list=tracklist
        )

    @classmethod
    def _fetch_album_from_source(cls, source: Source, stop_at_level: int = 1) -> Album:
        """
        I am pretty sure I can get way more data than... nothing from there

        NOTE(review): the body of this method was lost in the copy of the file
        this rewrite was made from. It is reconstructed after the pattern of
        ``_fetch_artist_attributes`` (request, parse); any deeper fetching of
        the songs was not visible and is not reconstructed.

        :param source: source whose url points to the album page
        :param stop_at_level: passed on to the album parser
        :return: the parsed album, or an empty album if the request fails
        """
        r = cls.CONNECTION.get(source.url)
        if r is None:
            # NOTE(review): assumes Album() accepts no arguments, as Artist() does above.
            return Album()

        soup = cls.get_soup_from_response(r)

        return cls._parse_album_attributes(soup, stop_at_level=stop_at_level)

    @classmethod
    def _fetch_song_from_source(cls, source: Source, stop_at_level: int = 1) -> Song:
        """Build a ``Song`` that only carries the lyrics of the song behind ``source``.

        NOTE(review): the signature line was lost in the copy of the file this
        rewrite was made from; name and parameters follow the sibling
        ``_fetch_*_from_source`` methods. ``_fetch_lyrics`` was not visible
        either and has to be provided elsewhere.
        """
        # For songs the source url holds the song id.
        song_id = source.url

        return Song(
            lyrics_list=[
                cls._fetch_lyrics(song_id=song_id)
            ]
        )

    @classmethod
    def _get_type_of_url(cls, url: str) -> Optional[Union[Type[Song], Type[Album], Type[Artist], Type[Label]]]:
        """Guess the object type a metal-archives url points to from its path segments.

        Returns ``None`` when no known segment is present.
        """
        path: List[str] = urlparse(url).path.split("/")

        if "band" in path or "bands" in path:
            return Artist
        if "albums" in path:
            return Album
        if "labels" in path:
            return Label

        return None