from collections import defaultdict
from typing import List, Optional, Dict, Type, Union

from bs4 import BeautifulSoup
import pycountry
from urllib.parse import urlparse

from ..connection import Connection
from ..utils.config import logging_settings
from .abstract import Page
from ..utils.enums.source import SourcePages
from ..utils.enums.album import AlbumType
from ..utils.support_classes import Query
from ..objects import (
    Lyrics,
    Artist,
    Source,
    Song,
    Album,
    ID3Timestamp,
    FormattedText,
    Label,
    Options,
    DatabaseObject
)
from ..utils.shared import DEBUG

if DEBUG:
    from ..utils.debug_utils import dump_to_file


# Maps the release-type label used by metal-archives to the internal album
# type. Unknown labels fall back to AlbumType.OTHER via the defaultdict.
ALBUM_TYPE_MAP: Dict[str, AlbumType] = defaultdict(lambda: AlbumType.OTHER, {
    "Full-length": AlbumType.STUDIO_ALBUM,
    "Single": AlbumType.SINGLE,
    "EP": AlbumType.EP,
    "Demo": AlbumType.DEMO,
    "Video": AlbumType.OTHER,
    "Live album": AlbumType.LIVE_ALBUM,
    "Compilation": AlbumType.COMPILATION_ALBUM
})


def _search_rows(response) -> list:
    """
    Return the result rows of a search response.

    The search endpoints used in this module answer with a JSON object whose
    rows are stored under "aaData"; a response without that key yields [].

    :param response: response object returned by Connection.get (not None)
    :return: list of rows, each row being a list of cells
    """
    return response.json().get("aaData", [])


def _duration_to_milliseconds(duration_stamp: str) -> Optional[int]:
    """
    Convert a "mm:ss" (or "hh:mm:ss") stamp to milliseconds.

    :param duration_stamp: text of the length cell of a track row
    :return: the length in milliseconds, or None if the text is no time stamp
    """
    parts = duration_stamp.strip().split(":")
    if len(parts) < 2:
        return None

    total_seconds = 0
    try:
        # every further field is worth 60 times less than the previous one
        for part in parts:
            total_seconds = total_seconds * 60 + int(part)
    except ValueError:
        return None

    return total_seconds * 1000


def _song_from_json(artist_html=None, album_html=None, release_type=None, title=None, lyrics_html=None) -> Song:
    """
    Build a Song from one row of the advanced song search.

    :param artist_html: html of the band cell
    :param album_html: html of the release cell
    :param release_type: release-type label, looked up in ALBUM_TYPE_MAP
    :param title: song title
    :param lyrics_html: html of the lyrics cell; the id of its anchor is
        "lyricsLink_" followed by the song id
    :return: the Song with its artist, album and a metal-archives source
    """
    song_id = None
    if lyrics_html is not None:
        anchor = BeautifulSoup(lyrics_html, 'html.parser').find('a')
        raw_song_id = anchor.get('id') if anchor is not None else None
        if raw_song_id is not None:
            song_id = raw_song_id.replace("lyricsLink_", "")

    return Song(
        title=title,
        main_artist_list=[
            _artist_from_json(artist_html=artist_html)
        ],
        album_list=[
            _album_from_json(album_html=album_html, release_type=release_type, artist_html=artist_html)
        ],
        source_list=[
            Source(SourcePages.ENCYCLOPAEDIA_METALLUM, song_id)
        ]
    )


def _artist_from_json(artist_html=None, genre=None, country=None) -> Artist:
    """
    Build an Artist from the band cell of a search row.

    TODO parse the country to a standard

    :param artist_html: html of the band cell; its first anchor carries the
        band name as text and the band page as href
    :param genre: currently unused
    :param country: currently unused
    :return: the Artist with its name and a metal-archives source
    """
    anchor = BeautifulSoup(artist_html, 'html.parser').find('a')

    # The text following the anchor was split into a list of alternative
    # names here before, but that list was never used, so it is not computed.
    return Artist(
        name=anchor.text,
        source_list=[
            Source(SourcePages.ENCYCLOPAEDIA_METALLUM, anchor.get('href'))
        ]
    )


def _album_from_json(album_html=None, release_type=None, artist_html=None) -> Album:
    """
    Build an Album from the release cell of a search row.

    :param album_html: html of the release cell; its first anchor carries the
        album title as text and the release page as href
    :param release_type: release-type label, looked up in ALBUM_TYPE_MAP
    :param artist_html: html of the band cell, passed to _artist_from_json
    :return: the Album with its type, artist and a metal-archives source
    """
    anchor = BeautifulSoup(album_html, 'html.parser').find('a')

    # a missing release type is treated like an unknown one
    album_type = AlbumType.OTHER if release_type is None else ALBUM_TYPE_MAP[release_type.strip()]

    return Album(
        title=anchor.text.strip(),
        album_type=album_type,
        source_list=[
            Source(SourcePages.ENCYCLOPAEDIA_METALLUM, anchor.get('href'))
        ],
        artist_list=[
            _artist_from_json(artist_html=artist_html)
        ]
    )


class EncyclopaediaMetallum(Page):
    """Page implementation that searches and scrapes www.metal-archives.com."""

    SOURCE_TYPE = SourcePages.ENCYCLOPAEDIA_METALLUM
    LOGGER = logging_settings["metal_archives_logger"]

    def __init__(self, **kwargs):
        self.connection: Connection = Connection(
            host="https://www.metal-archives.com/",
            logger=self.LOGGER
        )

        super().__init__(**kwargs)

    def song_search(self, song: Song) -> List[Song]:
        """
        Search songs by title, once per combination of artist and album.

        :param song: song whose title, albums and main artists form the query;
            an empty album or artist collection is replaced by the wildcard "*"
        :return: all songs found, or [] as soon as one request fails
        """
        endpoint = (
            "https://www.metal-archives.com/search/ajax-advanced/searching/songs/?songTitle={song}&bandName={"
            "artist}&releaseTitle={album}&lyrics=&genre=&sEcho=1&iColumns=5&sColumns=&iDisplayStart=0"
            "&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&mDataProp_4=4&_"
            "=1674550595663"
        )

        # Trying every artist with every album costs one web request per
        # combination, so the number of requests grows quadratically.
        # This is not good, but there is no better way known yet.
        song_title = song.title
        album_titles = ["*"] if song.album_collection.empty else [album.title for album in song.album_collection]
        artist_titles = ["*"] if song.main_artist_collection.empty else [artist.name for artist in song.main_artist_collection]

        search_results = []
        for artist in artist_titles:
            for album in album_titles:
                r = self.connection.get(endpoint.format(song=song_title, artist=artist, album=album))
                if r is None:
                    return []

                search_results.extend(
                    _song_from_json(
                        artist_html=raw_song[0],
                        album_html=raw_song[1],
                        release_type=raw_song[2],
                        title=raw_song[3],
                        lyrics_html=raw_song[4]
                    )
                    for raw_song in _search_rows(r)
                )

        return search_results

    def album_search(self, album: Album) -> List[Album]:
        """
        Search albums by title, once per artist of the album.

        :param album: album whose title and artists form the query; an empty
            artist collection is replaced by the wildcard "*"
        :return: all albums found, or [] as soon as one request fails
        """
        endpoint = (
            "https://www.metal-archives.com/search/ajax-advanced/searching/albums/?bandName={"
            "artist}&releaseTitle={album}&releaseYearFrom=&releaseMonthFrom=&releaseYearTo=&releaseMonthTo"
            "=&country=&location=&releaseLabelName=&releaseCatalogNumber=&releaseIdentifiers"
            "=&releaseRecordingInfo=&releaseDescription=&releaseNotes=&genre=&sEcho=1&iColumns=3&sColumns"
            "=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&_=1674563943747"
        )

        album_title = album.title
        artist_titles = ["*"] if album.artist_collection.empty else [artist.name for artist in album.artist_collection]

        search_results = []
        for artist in artist_titles:
            r = self.connection.get(endpoint.format(artist=artist, album=album_title))
            if r is None:
                return []

            search_results.extend(
                _album_from_json(
                    artist_html=raw_album[0],
                    album_html=raw_album[1],
                    release_type=raw_album[2]
                )
                for raw_album in _search_rows(r)
            )

        # the results were collected but never returned before
        return search_results

    def artist_search(self, artist: Artist) -> List[Artist]:
        """
        Search bands by the name of the given artist.

        :param artist: artist whose name is the query
        :return: all artists found, or [] if the request fails
        """
        endpoint = (
            "https://www.metal-archives.com/search/ajax-advanced/searching/bands/?bandName={"
            "artist}&genre=&country=&yearCreationFrom=&yearCreationTo=&bandNotes=&status=&themes=&location"
            "=&bandLabelName=&sEcho=1&iColumns=3&sColumns=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0"
            "&mDataProp_1=1&mDataProp_2=2&_=1674565459976"
        )

        r = self.connection.get(endpoint.format(artist=artist.name))
        if r is None:
            return []

        return [
            _artist_from_json(artist_html=raw_artist[0], genre=raw_artist[1], country=raw_artist[2])
            for raw_artist in _search_rows(r)
        ]

    def general_search(self, query: str) -> List[DatabaseObject]:
        """
        Searches the default endpoint of metal archives, which in turn searches
        only for bands, but it is the default, thus I am rolling with it.

        :param query: free text that is matched against band names
        :return: all artists found, or [] if the request fails
        """
        endpoint = "https://www.metal-archives.com/search/ajax-band-search/?field=name&query={query}&sEcho=1&iColumns=3&sColumns=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2"

        r = self.connection.get(endpoint.format(query=query))
        if r is None:
            return []

        return [
            _artist_from_json(artist_html=raw_artist[0], genre=raw_artist[1], country=raw_artist[2])
            for raw_artist in _search_rows(r)
        ]

    def _fetch_artist_discography(self, ma_artist_id: str) -> List[Album]:
        """
        Fetch the albums listed on the discography tab of a band.

        :param ma_artist_id: band id as used in metal-archives urls
        :return: one Album per table row (title, type, year and source), or []
            if the request fails or the page contains no table body
        """
        discography_url = "https://www.metal-archives.com/band/discography/id/{}/tab/all"

        # make the request
        r = self.connection.get(discography_url.format(ma_artist_id))
        if r is None:
            return []
        soup = self.get_soup_from_response(r)

        tbody_soup = soup.find('tbody')
        if tbody_soup is None:
            return []

        discography = []
        for tr_soup in tbody_soup.find_all('tr'):
            td_list = tr_soup.findChildren(recursive=False)
            # a row needs the cells name, type and year to describe an album
            if len(td_list) < 3:
                continue

            album_soup = td_list[0]
            album_anchor = album_soup.find('a')
            if album_anchor is None:
                continue

            raw_album_type = td_list[1].text
            album_year = td_list[2].text

            date_obj = None
            try:
                date_obj = ID3Timestamp(year=int(album_year))
            except ValueError:
                # the year cell holds no number, so the album gets no date
                self.LOGGER.debug(f"could not parse the release year \"{album_year}\"")

            discography.append(
                Album(
                    title=album_soup.text,
                    date=date_obj,
                    album_type=ALBUM_TYPE_MAP[raw_album_type],
                    source_list=[Source(self.SOURCE_TYPE, album_anchor.get('href'))]
                )
            )

        return discography

    def _fetch_artist_sources(self, ma_artist_id: str) -> List[Source]:
        """
        Fetch the external links listed for a band.

        :param ma_artist_id: band id as used in metal-archives urls
        :return: one Source per link found, or [] if the request fails or the
            page states that there are no links
        """
        sources_url = "https://www.metal-archives.com/link/ajax-list/type/band/id/{}"
        r = self.connection.get(sources_url.format(ma_artist_id))
        if r is None:
            return []

        soup = self.get_soup_from_response(r)

        if DEBUG:
            dump_to_file(f"ma_artist_sources_{ma_artist_id}.html", soup.prettify(), exit_after_dump=False)

        if soup.find("span", {"id": "noLinks"}) is not None:
            return []

        source_list = []

        link_table: BeautifulSoup = soup.find("table", {"id": "linksTablemain"})
        if link_table is not None:
            for tr in link_table.find_all("tr"):
                anchor: BeautifulSoup = tr.find("a")
                if anchor is None:
                    continue

                # .get instead of indexing, so a missing href is skipped
                href = anchor.get("href")
                if href is not None:
                    source_list.append(Source.match_url(href, referer_page=self.SOURCE_TYPE))

        # The following code is only legacy code, which is just kept because it doesn't harm.
        # The way metal-archives returns sources changed.
        artist_source = soup.find("div", {"id": "band_links"})
        if artist_source is not None:
            for td in artist_source.find_all("td"):
                anchor = td.find("a")
                if anchor is None:
                    continue

                url = anchor.get("href")
                if url is None:
                    continue

                source_list.append(Source.match_url(url, referer_page=self.SOURCE_TYPE))

        return source_list

    def _parse_artist_attributes(self, artist_soup: BeautifulSoup) -> Artist:
        """
        Parse the band page into an Artist.

        :param artist_soup: soup of the band page
        :return: the parsed Artist
        """
        name: Optional[str] = None
        # NOTE(review): this was annotated as pycountry.Countrie, a name that
        # pycountry does not seem to provide - confirm the intended type
        country = None
        formed_in_year: Optional[int] = None
        genre: Optional[str] = None
        lyrical_themes: List[str] = []
        label_name: Optional[str] = None
        label_url: Optional[str] = None
        source_list: List[Source] = []

        title_soup: BeautifulSoup = artist_soup.find("title")
        if title_soup is not None:
            bad_name_substring = " - Encyclopaedia Metallum: The Metal Archives"
            title_text = title_soup.get_text()
            if title_text.count(bad_name_substring) == 1:
                name = title_text.replace(bad_name_substring, "")
            else:
                self.LOGGER.debug(f"the title of the page is \"{title_text}\"")

        # TODO
        # Implement the band pictures and logos that can be gotten with the
        # image elements of the page.

        # NOTE(review): the source text available for this method breaks off
        # inside the TODO note above. The markup it quoted and all parsing that
        # followed (presumably the one filling country, formed_in_year, genre,
        # lyrical_themes and the label fields) is missing and has to be
        # restored from version control. Until then only the name is returned.
        return Artist(
            name=name,
            source_list=source_list
        )

    # NOTE(review): the signature below is reconstructed from the call in
    # fetch_artist and from the body; the original line is missing - confirm.
    def _fetch_artist_attributes(self, url: str) -> Artist:
        """
        Request the band page and parse it.

        :param url: url of the band page
        :return: the parsed Artist, or an empty Artist if the request fails
        """
        r = self.connection.get(url)
        if r is None:
            return Artist()

        soup: BeautifulSoup = self.get_soup_from_response(r)

        return self._parse_artist_attributes(artist_soup=soup)

    def _fetch_band_notes(self, ma_artist_id: str) -> Optional[FormattedText]:
        """
        Fetch the "read more" notes of a band.

        :param ma_artist_id: band id as used in metal-archives urls
        :return: the notes as html, or an empty FormattedText if the request fails
        """
        endpoint = "https://www.metal-archives.com/band/read-more/id/{}"

        # make the request
        r = self.connection.get(endpoint.format(ma_artist_id))
        if r is None:
            return FormattedText()

        return FormattedText(html=r.text)

    def fetch_artist(self, source: Source, stop_at_level: int = 1) -> Artist:
        """
        What it could fetch, and what is implemented:

        [x] https://www.metal-archives.com/bands/Ghost_Bath/3540372489
        [x] https://www.metal-archives.com/band/discography/id/3540372489/tab/all
        [] reviews: https://www.metal-archives.com/review/ajax-list-band/id/3540372489/json/1?sEcho=1&iColumns=4&sColumns=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&iSortCol_0=3&sSortDir_0=desc&iSortingCols=1&bSortable_0=true&bSortable_1=true&bSortable_2=true&bSortable_3=true&_=1675155257133
        [] similar: https://www.metal-archives.com/band/ajax-recommendations/id/3540372489
        [x] sources: https://www.metal-archives.com/link/ajax-list/type/band/id/3540372489
        [x] band notes: https://www.metal-archives.com/band/read-more/id/3540372489

        :param source: source whose url points at the band page; the last path
            segment of the url is used as the band id
        :param stop_at_level: if greater than 1, every album of the discography
            is fetched as well, with the level reduced by one
        :return: the Artist with sources, notes and discography
        """
        artist = self._fetch_artist_attributes(source.url)

        artist_id = source.url.split("/")[-1]

        artist_sources = self._fetch_artist_sources(artist_id)
        artist.source_collection.extend(artist_sources)

        band_notes = self._fetch_band_notes(artist_id)
        if band_notes is not None:
            artist.notes = band_notes

        discography: List[Album] = self._fetch_artist_discography(artist_id)
        if stop_at_level > 1:
            for album in discography:
                # named album_source so that the parameter `source` is not shadowed
                for album_source in album.source_collection.get_sources_from_page(self.SOURCE_TYPE):
                    # NOTE(review): _fetch_album_from_source is not defined in
                    # the visible part of this file - confirm where it lives
                    album.merge(self._fetch_album_from_source(album_source, stop_at_level=stop_at_level-1))
        artist.main_album_collection.extend(discography)

        return artist

    def _parse_album_track_row(self, track_row: BeautifulSoup) -> Song:
        """
        Parse one row of the track list of a release page.

        The direct children of the row are read as:
        0: track number followed by one character (e.g. "1."), containing an
           anchor whose name attribute is the track id
        1: title
        2: length as "mm:ss"

        :param track_row: soup of the table row
        :return: the Song; it only gets a source if the row contains a link to
            "#<track id>"
        """
        row_list = track_row.find_all(recursive=False)

        source_list: List[Source] = []

        track_sort_soup = row_list[0]
        # strip first, so surrounding whitespace can't hide the trailing dot
        track_sort = int(track_sort_soup.text.strip().rstrip("."))

        id_anchor = track_sort_soup.find("a")
        raw_track_id = id_anchor.get("name") if id_anchor is not None else None
        if raw_track_id is not None:
            track_id = raw_track_id.strip()
            if track_row.find("a", {"href": f"#{track_id}"}) is not None:
                source_list.append(Source(self.SOURCE_TYPE, track_id))

        title = row_list[1].text.strip()
        length = _duration_to_milliseconds(row_list[2].text)

        return Song(
            title=title,
            length=length,
            tracksort=track_sort,
            source_list=source_list
        )

    def _parse_album_info(self, album_info_soup: Optional[BeautifulSoup]) -> tuple:
        """
        Parse the info box of a release page.

        :param album_info_soup: soup of the element with the id "album_info",
            or None if the page has none
        :return: tuple of album name (or None), album sources and artists
        """
        album_name: Optional[str] = None
        source_list: List[Source] = []
        artist_list: List[Artist] = []

        if album_info_soup is None:
            return album_name, source_list, artist_list

        album_soup_list = album_info_soup.find_all("h1", {"class": "album_name"})
        if len(album_soup_list) == 1:
            anchor: BeautifulSoup = album_soup_list[0].find("a")
            if anchor is not None:
                href = anchor.get("href")
                if href is not None:
                    source_list.append(Source(self.SOURCE_TYPE, href.strip()))

                album_name = anchor.get_text(strip=True)
        elif len(album_soup_list) > 1:
            self.LOGGER.debug("there are more than 1 album soups")

        artist_soup_list = album_info_soup.find_all("h2", {"class": "band_name"})
        if len(artist_soup_list) == 1:
            for anchor in artist_soup_list[0].find_all("a"):
                artist_sources: List[Source] = []

                href = anchor.get("href")
                if href is not None:
                    artist_sources.append(Source(self.SOURCE_TYPE, href.strip()))

                artist_list.append(Artist(
                    name=anchor.get_text(strip=True),
                    source_list=artist_sources
                ))
        elif len(artist_soup_list) > 1:
            self.LOGGER.debug("there are more than 1 artist soups")

        return album_name, source_list, artist_list

    def _parse_album_attributes(self, album_soup: BeautifulSoup, stop_at_level: int = 1) -> Album:
        """
        Parse a release page into an Album.

        :param album_soup: soup of the release page
        :param stop_at_level: currently unused
        :return: the Album with title, sources, artists and track list
        """
        album_name, source_list, artist_list = self._parse_album_info(
            album_info_soup=album_soup.find(id="album_info")
        )

        tracklist: List[Song] = []
        table_soup = album_soup.find("table", {"class": "table_lyrics"})
        tracklist_soup = table_soup.find("tbody") if table_soup is not None else None
        if tracklist_soup is not None:
            # only the rows of the classes "even" and "odd" are parsed as tracks
            for track_soup in tracklist_soup.find_all("tr", {"class": ["even", "odd"]}):
                tracklist.append(self._parse_album_track_row(track_row=track_soup))

        return Album(
            title=album_name,
            source_list=source_list,
            artist_list=artist_list,
            song_list=tracklist
        )

    def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album:
        """
        Request a release page and parse it.

        I am pretty sure I can get way more data than... nothing from there.

        :param source: source whose url points at the release page
        :param stop_at_level: passed on to _parse_album_attributes
        :return: the parsed Album
        """
        # NOTE(review): the original body of this method is missing from the
        # source text available here. The lines below are reconstructed by
        # analogy with _fetch_artist_attributes and have to be checked against
        # version control, including whether Album() is a valid empty album.
        r = self.connection.get(source.url)
        if r is None:
            return Album()

        soup: BeautifulSoup = self.get_soup_from_response(r)

        return self._parse_album_attributes(soup, stop_at_level=stop_at_level)

    # NOTE(review): the name and the parameters of this method are inferred from
    # fetch_artist and fetch_album; only "-> Song:" and the body were visible.
    # _fetch_lyrics is called below, but it is not defined in the visible part
    # of this file - restore it from version control.
    def fetch_song(self, source: Source, stop_at_level: int = 1) -> Song:
        """
        Fetch the lyrics of a song.

        :param source: source whose url is passed to _fetch_lyrics as song id
        :param stop_at_level: unused
        :return: a Song that only carries the fetched lyrics
        """
        song_id = source.url

        return Song(
            lyrics_list=[
                self._fetch_lyrics(song_id=song_id)
            ]
        )

    def get_source_type(self, source: Source) -> Optional[Type[Union[Artist, Album, Label]]]:
        """
        Tell which kind of object a metal-archives url points at.

        :param source: the source to classify
        :return: Artist, Album or Label depending on the path segments of the
            url; None for sources of another page, sources without url and
            unknown paths
        """
        if self.SOURCE_TYPE != source.page_enum:
            return None

        url = source.url
        if url is None:
            return None

        path: List[str] = urlparse(url).path.split("/")

        # checked in this order, the first matching segment decides
        type_by_segment = (
            ("band", Artist),
            ("bands", Artist),
            ("albums", Album),
            ("labels", Label),
        )
        for segment, object_type in type_by_segment:
            if segment in path:
                return object_type

        return None