from collections import defaultdict
from typing import List, Optional, Dict, Type, Union
import requests
from bs4 import BeautifulSoup
import pycountry
from urllib.parse import urlparse
from ..connection import Connection
from ..utils.shared import ENCYCLOPAEDIA_METALLUM_LOGGER, proxies
from ..utils import string_processing
from .abstract import Page
from ..utils.enums.source import SourcePages
from ..utils.enums.album import AlbumType
from ..objects import (
Lyrics,
Artist,
Source,
Song,
Album,
ID3Timestamp,
FormattedText,
Label,
Options,
)
class EncyclopaediaMetallum(Page):
CONNECTION: Connection = Connection(
host="https://www.metal-archives.com/",
logger=ENCYCLOPAEDIA_METALLUM_LOGGER
)
SOURCE_TYPE = SourcePages.ENCYCLOPAEDIA_METALLUM
ALBUM_TYPE_MAP: Dict[str, AlbumType] = defaultdict(lambda: AlbumType.OTHER, {
"Full-length": AlbumType.STUDIO_ALBUM,
"Single": AlbumType.SINGLE,
"EP": AlbumType.EP,
"Demo": AlbumType.DEMO,
"Video": AlbumType.OTHER,
"Live album": AlbumType.LIVE_ALBUM,
"Compilation": AlbumType.COMPILATION_ALBUM
})
LOGGER = ENCYCLOPAEDIA_METALLUM_LOGGER
@classmethod
def search_by_query(cls, query: str) -> Options:
query_obj = cls.Query(query)
if query_obj.is_raw:
return cls.simple_search(query_obj)
return cls.advanced_search(query_obj)
@classmethod
def advanced_search(cls, query: Page.Query) -> Options:
if query.song is not None:
return Options(cls.search_for_song(query=query))
if query.album is not None:
return Options(cls.search_for_album(query=query))
if query.artist is not None:
return Options(cls.search_for_artist(query=query))
return Options
@classmethod
def search_for_song(cls, query: Page.Query) -> List[Song]:
endpoint = "https://www.metal-archives.com/search/ajax-advanced/searching/songs/?songTitle={song}&bandName={" \
"artist}&releaseTitle={album}&lyrics=&genre=&sEcho=1&iColumns=5&sColumns=&iDisplayStart=0" \
"&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&mDataProp_4=4&_" \
"=1674550595663"
r = cls.CONNECTION.get(
endpoint.format(song=query.song_str, artist=query.artist_str, album=query.album_str)
)
if r is None:
return []
return [cls.get_song_from_json(
artist_html=raw_song[0],
album_html=raw_song[1],
release_type=raw_song[2],
title=raw_song[3],
lyrics_html=raw_song[4]
) for raw_song in r.json()['aaData']]
@classmethod
def search_for_album(cls, query: Page.Query) -> List[Album]:
endpoint = "https://www.metal-archives.com/search/ajax-advanced/searching/albums/?bandName={" \
"artist}&releaseTitle={album}&releaseYearFrom=&releaseMonthFrom=&releaseYearTo=&releaseMonthTo" \
"=&country=&location=&releaseLabelName=&releaseCatalogNumber=&releaseIdentifiers" \
"=&releaseRecordingInfo=&releaseDescription=&releaseNotes=&genre=&sEcho=1&iColumns=3&sColumns" \
"=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&_=1674563943747"
r = cls.CONNECTION.get(endpoint.format(artist=query.artist_str, album=query.album_str))
if r is None:
return []
return [cls.get_album_from_json(
artist_html=raw_album[0],
album_html=raw_album[1],
release_type=raw_album[2]
) for raw_album in r.json()['aaData']]
@classmethod
def search_for_artist(cls, query: Page.Query) -> List[Artist]:
endpoint = "https://www.metal-archives.com/search/ajax-advanced/searching/bands/?bandName={" \
"artist}&genre=&country=&yearCreationFrom=&yearCreationTo=&bandNotes=&status=&themes=&location" \
"=&bandLabelName=&sEcho=1&iColumns=3&sColumns=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0" \
"&mDataProp_1=1&mDataProp_2=2&_=1674565459976"
r = cls.CONNECTION.get(endpoint.format(artist=query.artist))
if r is None:
return []
data_key = 'aaData'
parsed_data = r.json()
if data_key not in parsed_data:
return []
return [
cls.get_artist_from_json(artist_html=raw_artist[0], genre=raw_artist[1], country=raw_artist[2])
for raw_artist in r.json()['aaData']
]
@classmethod
def simple_search(cls, query: Page.Query) -> List[Artist]:
"""
Searches the default endpoint from metal archives, which intern searches only
for bands, but it is the default, thus I am rolling with it
"""
endpoint = "https://www.metal-archives.com/search/ajax-band-search/?field=name&query={query}&sEcho=1&iColumns=3&sColumns=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2"
r = cls.CONNECTION.get(endpoint.format(query=query))
if r is None:
return []
return [
cls.get_artist_from_json(artist_html=raw_artist[0], genre=raw_artist[1], country=raw_artist[2])
for raw_artist in r.json()['aaData']
]
@classmethod
def get_artist_from_json(cls, artist_html=None, genre=None, country=None) -> Artist:
"""
TODO parse the country to a standart
"""
# parse the html
# parse the html for the band name and link on metal-archives
soup = BeautifulSoup(artist_html, 'html.parser')
anchor = soup.find('a')
artist_name = anchor.text
artist_url = anchor.get('href')
artist_id = artist_url.split("/")[-1]
anchor.decompose()
strong = soup.find('strong')
if strong is not None:
strong.decompose()
akronyms_ = soup.text[2:-2].split(', ')
return Artist(
name=artist_name,
source_list=[
Source(SourcePages.ENCYCLOPAEDIA_METALLUM, artist_url)
]
)
@classmethod
def get_album_from_json(cls, album_html=None, release_type=None, artist_html=None) -> Album:
# parse the html
# Self Loather'
soup = BeautifulSoup(album_html, 'html.parser')
anchor = soup.find('a')
album_name = anchor.text
album_url = anchor.get('href')
album_id = album_url.split("/")[-1]
album_type = cls.ALBUM_TYPE_MAP[release_type.strip()]
return Album(
title=album_name,
album_type=album_type,
source_list=[
Source(SourcePages.ENCYCLOPAEDIA_METALLUM, album_url)
],
artist_list=[
cls.get_artist_from_json(artist_html=artist_html)
]
)
@classmethod
def get_song_from_json(cls, artist_html=None, album_html=None, release_type=None, title=None,
lyrics_html=None) -> Song:
song_id = None
if lyrics_html is not None:
soup = BeautifulSoup(lyrics_html, 'html.parser')
anchor = soup.find('a')
raw_song_id = anchor.get('id')
song_id = raw_song_id.replace("lyricsLink_", "")
return Song(
title=title,
main_artist_list=[
cls.get_artist_from_json(artist_html=artist_html)
],
album_list=[
cls.get_album_from_json(album_html=album_html, release_type=release_type, artist_html=artist_html)
],
source_list=[
Source(SourcePages.ENCYCLOPAEDIA_METALLUM, song_id)
]
)
@classmethod
def _fetch_artist_discography(cls, ma_artist_id: str) -> List[Album]:
discography_url = "https://www.metal-archives.com/band/discography/id/{}/tab/all"
# make the request
r = cls.CONNECTION.get(discography_url.format(ma_artist_id))
if r is None:
return []
soup = cls.get_soup_from_response(r)
discography = []
tbody_soup = soup.find('tbody')
for tr_soup in tbody_soup.find_all('tr'):
td_list = tr_soup.findChildren(recursive=False)
album_soup = td_list[0]
album_name = album_soup.text
album_url = album_soup.find('a').get('href')
album_id = album_url.split('/')[-1]
raw_album_type = td_list[1].text
album_year = td_list[2].text
date_obj = None
try:
date_obj = ID3Timestamp(year=int(album_year))
except ValueError():
pass
discography.append(
Album(
title=album_name,
date=date_obj,
album_type=cls.ALBUM_TYPE_MAP[raw_album_type],
source_list=[Source(SourcePages.ENCYCLOPAEDIA_METALLUM, album_url)]
)
)
return discography
@classmethod
def _fetch_artist_sources(cls, ma_artist_id: str) -> List[Source]:
sources_url = "https://www.metal-archives.com/link/ajax-list/type/band/id/{}"
r = cls.CONNECTION.get(sources_url.format(ma_artist_id))
if r is None:
return []
soup = cls.get_soup_from_response(r)
if soup.find("span", {"id": "noLinks"}) is not None:
return []
artist_source = soup.find("div", {"id": "band_links_Official"})
"""
TODO
add a Label object to add the label sources from
TODO
maybe do merchandice stuff
"""
merchandice_source = soup.find("div", {"id": "band_links_Official_merchandise"})
label_source = soup.find("div", {"id": "band_links_Labels"})
source_list = []
if artist_source is not None:
for tr in artist_source.find_all("td"):
a = tr.find("a")
url = a.get("href")
if url is None:
continue
source_list.append(Source.match_url(url, referer_page=cls.SOURCE_TYPE))
return source_list
@classmethod
def _parse_artist_attributes(cls, artist_soup: BeautifulSoup) -> Artist:
name: str = None
country: pycountry.Countrie = None
formed_in_year: int = None
genre: str = None
lyrical_themes: List[str] = []
label_name: str = None
label_url: str = None
source_list: List[Source] = []
title_soup: BeautifulSoup = artist_soup.find("title")
if title_soup is not None:
bad_name_substring = " - Encyclopaedia Metallum: The Metal Archives"
title_text = title_soup.get_text()
if title_text.count(bad_name_substring) == 1:
name = title_text.replace(bad_name_substring, "")
else:
cls.LOGGER.debug(f"the title of the page is \"{title_text}\"")
"""
TODO
Implement the bandpictures and logos that can be gotten with the elements
Artist:
r = cls.CONNECTION.get(url)
if r is None:
return Artist()
soup: BeautifulSoup = cls.get_soup_from_response(r)
return cls._parse_artist_attributes(artist_soup=soup)
@classmethod
def _fetch_band_notes(cls, ma_artist_id: str) -> Optional[FormattedText]:
endpoint = "https://www.metal-archives.com/band/read-more/id/{}"
# make the request
r = cls.CONNECTION.get(endpoint.format(ma_artist_id))
if r is None:
return FormattedText()
return FormattedText(html=r.text)
@classmethod
def _fetch_artist_from_source(cls, source: Source, stop_at_level: int = 1) -> Artist:
"""
What it could fetch, and what is implemented:
[x] https://www.metal-archives.com/bands/Ghost_Bath/3540372489
[x] https://www.metal-archives.com/band/discography/id/3540372489/tab/all
[] reviews: https://www.metal-archives.com/review/ajax-list-band/id/3540372489/json/1?sEcho=1&iColumns=4&sColumns=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&iSortCol_0=3&sSortDir_0=desc&iSortingCols=1&bSortable_0=true&bSortable_1=true&bSortable_2=true&bSortable_3=true&_=1675155257133
[] simmilar: https://www.metal-archives.com/band/ajax-recommendations/id/3540372489
[x] sources: https://www.metal-archives.com/link/ajax-list/type/band/id/3540372489
[x] band notes: https://www.metal-archives.com/band/read-more/id/3540372489
"""
artist = cls._fetch_artist_attributes(source.url)
artist_id = source.url.split("/")[-1]
artist_sources = cls._fetch_artist_sources(artist_id)
artist.source_collection.extend(artist_sources)
band_notes = cls._fetch_band_notes(artist_id)
if band_notes is not None:
artist.notes = band_notes
discography: List[Album] = cls._fetch_artist_discography(artist_id)
if stop_at_level > 1:
for album in discography:
for source in album.source_collection.get_sources_from_page(cls.SOURCE_TYPE):
album.merge(cls._fetch_album_from_source(source, stop_at_level=stop_at_level-1))
artist.main_album_collection.extend(discography)
return artist
@classmethod
def _parse_album_track_row(cls, track_row: BeautifulSoup) -> Song:
"""
"""
row_list = track_row.find_all(recursive=False)
source_list: List[Source] = []
track_sort_soup = row_list[0]
track_sort = int(track_sort_soup.text[:-1])
track_id = track_sort_soup.find("a").get("name").strip()
if track_row.find("a", {"href": f"#{track_id}"}) is not None:
source_list.append(Source(cls.SOURCE_TYPE, track_id))
title = row_list[1].text.strip()
length = None
duration_stamp = row_list[2].text
if ":" in duration_stamp:
minutes, seconds = duration_stamp.split(":")
length = (int(minutes) * 60 + int(seconds)) * 1000 # in milliseconds
return Song(
title=title,
length=length,
tracksort=track_sort,
source_list=source_list
)
@classmethod
def _parse_album_attributes(cls, album_soup: BeautifulSoup, stop_at_level: int = 1) -> Album:
tracklist: List[Song] = []
artist_list = []
album_name: str = None
source_list: List[Source] = []
def _parse_album_info(album_info_soup: BeautifulSoup):
nonlocal artist_list
nonlocal album_name
nonlocal source_list
if album_info_soup is None:
return
album_soup_list = album_info_soup.find_all("h1", {"class": "album_name"})
if len(album_soup_list) == 1:
anchor: BeautifulSoup = album_soup_list[0].find("a")
href = anchor.get("href")
if href is not None:
source_list.append(Source(cls.SOURCE_TYPE, href.strip()))
album_name = anchor.get_text(strip=True)
elif len(album_soup_list) > 1:
cls.LOGGER.debug("there are more than 1 album soups")
artist_soup_list = album_info_soup.find_all("h2", {"class": "band_name"})
if len(artist_soup_list) == 1:
for anchor in artist_soup_list[0].find_all("a"):
artist_sources: List[Source] = []
href = anchor.get("href")
if href is not None:
artist_sources.append(Source(cls.SOURCE_TYPE, href.strip()))
artist_name = anchor.get_text(strip=True)
artist_list.append(Artist(
name=artist_name,
source_list=artist_sources
))
elif len(artist_soup_list) > 1:
cls.LOGGER.debug("there are more than 1 artist soups")
_parse_album_info(album_info_soup=album_soup.find(id="album_info"))
tracklist_soup = album_soup.find("table", {"class": "table_lyrics"}).find("tbody")
for track_soup in tracklist_soup.find_all("tr", {"class": ["even", "odd"]}):
tracklist.append(cls._parse_album_track_row(track_row=track_soup))
return Album(
title=album_name,
source_list=source_list,
artist_list=artist_list,
song_list=tracklist
)
@classmethod
def _fetch_album_from_source(cls, source: Source, stop_at_level: int = 1) -> Album:
"""
I am preeeety sure I can get way more data than... nothing from there
:param source:
:param stop_at_level:
:return:
"""
# 1. # id and tracksort
Convince Me to Bleed # name
03:40 # length
Show lyrics