music-kraken-core/src/music_kraken/pages/encyclopaedia_metallum.py
2023-03-27 20:02:17 +02:00

561 lines
21 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from typing import List
import requests
from bs4 import BeautifulSoup
import pycountry
from ..utils.shared import (
ENCYCLOPAEDIA_METALLUM_LOGGER as LOGGER
)
from .abstract import Page
from ..objects import (
DatabaseObject,
Artist,
Source,
SourcePages,
Song,
Album,
ID3Timestamp,
FormattedText,
Label,
Options
)
from ..utils import (
string_processing
)
class EncyclopaediaMetallum(Page):
API_SESSION: requests.Session = requests.Session()
API_SESSION.headers = {
"Host": "www.metal-archives.com",
"Connection": "keep-alive"
}
SOURCE_TYPE = SourcePages.ENCYCLOPAEDIA_METALLUM
@classmethod
def search_by_query(cls, query: str) -> Options:
query_obj = cls.Query(query)
if query_obj.is_raw:
return cls.simple_search(query_obj)
return cls.advanced_search(query_obj)
@classmethod
def advanced_search(cls, query: Page.Query) -> Options:
if query.song is not None:
return Options(cls.search_for_song(query=query))
if query.album is not None:
return Options(cls.search_for_album(query=query))
if query.artist is not None:
return Options(cls.search_for_artist(query=query))
return Options
@classmethod
def search_for_song(cls, query: Page.Query) -> List[Song]:
endpoint = "https://www.metal-archives.com/search/ajax-advanced/searching/songs/?songTitle={song}&bandName={" \
"artist}&releaseTitle={album}&lyrics=&genre=&sEcho=1&iColumns=5&sColumns=&iDisplayStart=0" \
"&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&mDataProp_4=4&_" \
"=1674550595663"
r = cls.API_SESSION.get(endpoint.format(song=query.song_str, artist=query.artist_str, album=query.album_str))
if r.status_code != 200:
LOGGER.warning(
f"code {r.status_code} at {endpoint.format(song=query.song_str, artist=query.artist_str, album=query.album_str)}")
return []
return [cls.get_song_from_json(
artist_html=raw_song[0],
album_html=raw_song[1],
release_type=raw_song[2],
title=raw_song[3],
lyrics_html=raw_song[4]
) for raw_song in r.json()['aaData']]
@classmethod
def search_for_album(cls, query: Page.Query) -> List[Album]:
endpoint = "https://www.metal-archives.com/search/ajax-advanced/searching/albums/?bandName={" \
"artist}&releaseTitle={album}&releaseYearFrom=&releaseMonthFrom=&releaseYearTo=&releaseMonthTo" \
"=&country=&location=&releaseLabelName=&releaseCatalogNumber=&releaseIdentifiers" \
"=&releaseRecordingInfo=&releaseDescription=&releaseNotes=&genre=&sEcho=1&iColumns=3&sColumns" \
"=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&_=1674563943747"
r = cls.API_SESSION.get(endpoint.format(artist=query.artist_str, album=query.album_str))
if r.status_code != 200:
LOGGER.warning(
f"code {r.status_code} at {endpoint.format(song=query.song_str, artist=query.artist_str, album=query.album_str)}")
return []
return [cls.get_album_from_json(
artist_html=raw_album[0],
album_html=raw_album[1],
release_type=[2]
) for raw_album in r.json()['aaData']]
@classmethod
def search_for_artist(cls, query: Page.Query) -> List[Artist]:
endpoint = "https://www.metal-archives.com/search/ajax-advanced/searching/bands/?bandName={" \
"artist}&genre=&country=&yearCreationFrom=&yearCreationTo=&bandNotes=&status=&themes=&location" \
"=&bandLabelName=&sEcho=1&iColumns=3&sColumns=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0" \
"&mDataProp_1=1&mDataProp_2=2&_=1674565459976"
r = cls.API_SESSION.get(endpoint.format(artist=query.artist))
data_key = 'aaData'
parsed_data = r.json()
if data_key not in parsed_data:
return []
return [
cls.get_artist_from_json(artist_html=raw_artist[0], genre=raw_artist[1], country=raw_artist[2])
for raw_artist in r.json()['aaData']
]
@classmethod
def simple_search(cls, query: Page.Query) -> List[Artist]:
"""
Searches the default endpoint from metal archives, which intern searches only
for bands, but it is the default, thus I am rolling with it
"""
endpoint = "https://www.metal-archives.com/search/ajax-band-search/?field=name&query={query}&sEcho=1&iColumns=3&sColumns=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2"
r = cls.API_SESSION.get(endpoint.format(query=query))
if r.status_code != 200:
LOGGER.warning(f"code {r.status_code} at {endpoint.format(query=query.query)}")
return []
return [
cls.get_artist_from_json(artist_html=raw_artist[0], genre=raw_artist[1], country=raw_artist[2])
for raw_artist in r.json()['aaData']
]
@classmethod
def get_artist_from_json(cls, artist_html=None, genre=None, country=None) -> Artist:
"""
TODO parse the country to a standart
"""
# parse the html
# parse the html for the band name and link on metal-archives
soup = BeautifulSoup(artist_html, 'html.parser')
anchor = soup.find('a')
artist_name = anchor.text
artist_url = anchor.get('href')
artist_id = artist_url.split("/")[-1]
notes = f"{artist_name} is a {genre} band from {country}"
anchor.decompose()
strong = soup.find('strong')
if strong is not None:
strong.decompose()
akronyms_ = soup.text[2:-2].split(', ')
notes += f"aka {akronyms_}"
notes += "."
return Artist(
id_=artist_id,
name=artist_name,
source_list=[
Source(SourcePages.ENCYCLOPAEDIA_METALLUM, artist_url)
],
notes=FormattedText(plaintext=notes)
)
@classmethod
def get_album_from_json(cls, album_html=None, release_type=None, artist_html=None) -> Album:
# parse the html
# <a href="https://www.metal-archives.com/albums/Ghost_Bath/Self_Loather/970834">Self Loather</a>'
soup = BeautifulSoup(album_html, 'html.parser')
anchor = soup.find('a')
album_name = anchor.text
album_url = anchor.get('href')
album_id = album_url.split("/")[-1]
"""
TODO implement release type
"""
return Album(
id_=album_id,
title=album_name,
source_list=[
Source(SourcePages.ENCYCLOPAEDIA_METALLUM, album_url)
],
artist_list=[
cls.get_artist_from_json(artist_html=artist_html)
]
)
@classmethod
def get_song_from_json(cls, artist_html=None, album_html=None, release_type=None, title=None,
lyrics_html=None) -> Song:
song_id = None
if lyrics_html is not None:
soup = BeautifulSoup(lyrics_html, 'html.parser')
anchor = soup.find('a')
raw_song_id = anchor.get('id')
song_id = raw_song_id.replace("lyricsLink_", "")
return Song(
id_=song_id,
title=title,
main_artist_list=[
cls.get_artist_from_json(artist_html=artist_html)
],
album_list=[
cls.get_album_from_json(album_html=album_html, release_type=release_type, artist_html=artist_html)
],
source_list=[
Source(SourcePages.ENCYCLOPAEDIA_METALLUM, song_id)
]
)
@classmethod
def fetch_artist_discography(cls, artist: Artist, ma_artist_id: str, flat: bool = False) -> Artist:
"""
TODO
I'd guess this funktion has quite some possibility for optimizations
in form of performance and clean code
"""
discography_url = "https://www.metal-archives.com/band/discography/id/{}/tab/all"
# make the request
r = cls.API_SESSION.get(discography_url.format(ma_artist_id))
if r.status_code != 200:
LOGGER.warning(f"code {r.status_code} at {discography_url.format(ma_artist_id)}")
return artist
# parse the html
soup = BeautifulSoup(r.text, 'html.parser')
tbody_soup = soup.find('tbody')
for tr_soup in tbody_soup.find_all('tr'):
td_list = tr_soup.findChildren(recursive=False)
album_soup = td_list[0]
album_name = album_soup.text
album_url = album_soup.find('a').get('href')
album_id = album_url.split('/')[-1]
album_type = td_list[1].text
album_year = td_list[2].text
date_obj = None
try:
date_obj = ID3Timestamp(year=int(album_year))
except ValueError():
pass
artist.main_album_collection.append(
Album(
id_=album_id,
title=album_name,
album_type=album_type,
date=date_obj,
source_list=[Source(SourcePages.ENCYCLOPAEDIA_METALLUM, album_url)]
)
)
if not flat:
for album in artist.main_album_collection:
cls.fetch_album_details(album, flat=flat)
return artist
@classmethod
def fetch_artist_sources(cls, artist: Artist, ma_artist_id: str) -> Artist:
sources_url = "https://www.metal-archives.com/link/ajax-list/type/band/id/{}"
# make the request
r = cls.API_SESSION.get(sources_url.format(ma_artist_id))
if r.status_code != 200:
LOGGER.warning(f"code {r.status_code} at {sources_url.format(ma_artist_id)}")
return artist
soup = BeautifulSoup(r.text, 'html.parser')
if soup.find("span",{"id": "noLinks"}) is not None:
return artist
artist_source = soup.find("div", {"id": "band_links_Official"})
"""
TODO
add a Label object to add the label sources from
TODO
maybe do merchandice stuff
"""
merchandice_source = soup.find("div", {"id": "band_links_Official_merchandise"})
label_source = soup.find("div", {"id": "band_links_Labels"})
if artist_source is not None:
for tr in artist_source.find_all("td"):
a = tr.find("a")
url = a.get("href")
source = Source.match_url(url)
if source is None:
continue
artist.add_source(source)
return artist
@classmethod
def _parse_artist_attributes(cls, artist_soup: BeautifulSoup) -> Artist:
name: str = None
country: pycountry.Countrie = None
formed_in_year: int = None
genre: str = None
lyrical_themes: List[str] = []
label_name: str = None
label_url: str = None
source_list: List[Source] = []
title_soup: BeautifulSoup = artist_soup.find("title")
if title_soup is not None:
bad_name_substring = " - Encyclopaedia Metallum: The Metal Archives"
title_text = title_soup.get_text()
if title_text.count(bad_name_substring) == 1:
name = title_text.replace(bad_name_substring, "")
else:
LOGGER.debug(f"the title of the page is \"{title_text}\"")
"""
TODO
Implement the bandpictures and logos that can be gotten with the elements
<a class="image" id="photo" title="Ghost Bath"...
<a class="image" id="logo" title="Ghost Bath"...
where the titles are the band name
"""
image_container_soup: BeautifulSoup = artist_soup.find(id="band_sidebar")
if image_container_soup is not None:
logo_soup = image_container_soup.find(id="logo")
if logo_soup is not None:
logo_title = logo_soup.get("title")
if logo_title is not None:
name = logo_title.strip()
band_pictures = image_container_soup.find(id="photo")
if band_pictures is not None:
band_picture_title = logo_soup.get("title")
if band_picture_title is not None:
name = band_picture_title.strip()
for h1_band_name_soup in artist_soup.find_all("h1", {"class": "band_name"}):
anchor: BeautifulSoup = h1_band_name_soup.find("a")
if anchor is None:
continue
href = anchor.get("href")
if href is not None:
source_list.append(Source(cls.SOURCE_TYPE, href))
name = anchor.get_text(strip=True)
band_stat_soup = artist_soup.find("div", {"id": "band_stats"})
for dl_soup in band_stat_soup.find_all("dl"):
for title, data in zip(dl_soup.find_all("dt"), dl_soup.find_all("dd")):
title_text = title.text
if "Country of origin:" == title_text:
href = data.find('a').get('href')
country = pycountry.countries.get(alpha_2=href.split("/")[-1])
continue
# not needed: Location: Minot, North Dakota
"""
TODO
status: active
need to do enums for that and add it to object
"""
if "Formed in:" == title_text:
if not data.text.isnumeric():
continue
formed_in_year = int(data.text)
continue
if "Genre:" == title_text:
genre = data.text
continue
if "Lyrical themes:" == title_text:
lyrical_themes = data.text.split(", ")
continue
if "Current label:" == title_text:
label_name = data.text
label_anchor = data.find("a")
label_url = None
if label_anchor is not None:
label_url = label_anchor.get("href")
label_id = None
if type(label_url) is str and "/" in label_url:
label_id = label_url.split("/")[-1]
"""
TODO
years active: 2012-present
process this and add field to class
"""
return Artist(
name=name,
country=country,
formed_in=ID3Timestamp(year=formed_in_year),
general_genre=genre,
lyrical_themes=lyrical_themes,
label_list=[
Label(
name=label_name,
source_list=[
Source(cls.SOURCE_TYPE, label_url)
]
)
],
source_list=source_list
)
@classmethod
def _fetch_artist_attributes(cls, url: str) -> Artist:
r = cls.get_request(url)
if r is None:
return Artist()
soup: BeautifulSoup = cls.get_soup_from_response(r)
return cls._parse_artist_attributes(artist_soup=soup)
@classmethod
def fetch_band_notes(cls, artist: Artist, ma_artist_id: str) -> Artist:
endpoint = "https://www.metal-archives.com/band/read-more/id/{}"
# make the request
r = cls.API_SESSION.get(endpoint.format(ma_artist_id))
if r.status_code != 200:
LOGGER.warning(f"code {r.status_code} at {endpoint.format(ma_artist_id)}")
return artist
artist.notes.html = r.text
return artist
@classmethod
def _fetch_artist_from_source(cls, source: Source, stop_at_level: int = 1) -> Artist:
"""
What it could fetch, and what is implemented:
[x] https://www.metal-archives.com/bands/Ghost_Bath/3540372489
[x] https://www.metal-archives.com/band/discography/id/3540372489/tab/all
[] reviews: https://www.metal-archives.com/review/ajax-list-band/id/3540372489/json/1?sEcho=1&iColumns=4&sColumns=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&iSortCol_0=3&sSortDir_0=desc&iSortingCols=1&bSortable_0=true&bSortable_1=true&bSortable_2=true&bSortable_3=true&_=1675155257133
[] simmilar: https://www.metal-archives.com/band/ajax-recommendations/id/3540372489
[x] sources: https://www.metal-archives.com/link/ajax-list/type/band/id/3540372489
[x] band notes: https://www.metal-archives.com/band/read-more/id/3540372489
"""
artist = cls._fetch_artist_attributes(source.url)
return artist
@classmethod
def fetch_artist_details(cls, artist: Artist, flat: bool = False) -> Artist:
source_list = artist.source_collection.get_sources_from_page(cls.SOURCE_TYPE)
if len(source_list) == 0:
return artist
# taking the fist source, cuz I only need one and multiple sources don't make that much sense
source = source_list[0]
artist_id = source.url.split("/")[-1]
"""
TODO
[x] https://www.metal-archives.com/bands/Ghost_Bath/3540372489
[x] https://www.metal-archives.com/band/discography/id/3540372489/tab/all
[] reviews: https://www.metal-archives.com/review/ajax-list-band/id/3540372489/json/1?sEcho=1&iColumns=4&sColumns=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&iSortCol_0=3&sSortDir_0=desc&iSortingCols=1&bSortable_0=true&bSortable_1=true&bSortable_2=true&bSortable_3=true&_=1675155257133
[] simmilar: https://www.metal-archives.com/band/ajax-recommendations/id/3540372489
[x] sources: https://www.metal-archives.com/link/ajax-list/type/band/id/3540372489
[x] band notes: https://www.metal-archives.com/band/read-more/id/3540372489
"""
# SIMPLE METADATA
artist = cls._fetch_artist_attributes(artist, source.url)
# DISCOGRAPHY
artist = cls.fetch_artist_discography(artist, artist_id, flat=flat)
# EXTERNAL SOURCES
artist = cls.fetch_artist_sources(artist, artist_id)
# ARTIST NOTES
artist = cls.fetch_band_notes(artist, artist_id)
return artist
@classmethod
def fetch_album_details(cls, album: Album, flat: bool = False) -> Album:
source_list = album.source_collection.get_sources_from_page(cls.SOURCE_TYPE)
if len(source_list) == 0:
return album
source = source_list[0]
album_id = source.url.split("/")[-1]
# <table class="display table_lyrics
r = cls.API_SESSION.get(source.url)
if r.status_code != 200:
LOGGER.warning(f"code {r.status_code} at {source.url}")
return album
soup = BeautifulSoup(r.text, 'html.parser')
tracklist_soup = soup.find("table", {"class": "table_lyrics"}).find("tbody")
for row in tracklist_soup.find_all("tr", {"class": ["even", "odd"]}):
"""
example of row:
<tr class="even">
<td width="20"><a class="anchor" name="5948442"> </a>1.</td> # id and tracksort
<td class="wrapWords">Convince Me to Bleed</td> # name
<td align="right">03:40</td> # length
<td nowrap="nowrap"> 
<a href="#5948442" id="lyricsButton5948442" onclick="toggleLyrics('5948442'); return false;">Show lyrics</a>
</td>
</tr>
"""
row_list = row.find_all(recursive=False)
track_sort_soup = row_list[0]
track_sort = int(track_sort_soup.text[:-1])
track_id = track_sort_soup.find("a").get("name")
title = row_list[1].text.strip()
length = None
duration_stamp = row_list[2].text
if ":" in duration_stamp:
minutes, seconds = duration_stamp.split(":")
length = (int(minutes) * 60 + int(seconds))*1000 # in milliseconds
album.song_collection.append(
Song(
id_=track_id,
title=title,
length=length,
tracksort=track_sort,
source_list=[Source(cls.SOURCE_TYPE, track_id)]
)
)
return album
@classmethod
def fetch_song_details(cls, song: Song, flat: bool = False) -> Song:
source_list = song.source_collection.get_sources_from_page(cls.SOURCE_TYPE)
if len(source_list) == 0:
return song
"""
TODO
lyrics
"""
return song