music-kraken-core/src/music_kraken/pages/encyclopaedia_metallum.py

489 lines
18 KiB
Python
Raw Normal View History

2023-01-23 23:16:10 +00:00
from typing import List
2023-01-24 08:40:01 +00:00
import requests
from bs4 import BeautifulSoup
2023-02-01 12:18:09 +00:00
import pycountry
2023-01-24 08:40:01 +00:00
from ..utils.shared import (
ENCYCLOPAEDIA_METALLUM_LOGGER as LOGGER
)
2023-01-23 23:16:10 +00:00
from .abstract import Page
2023-03-02 15:23:02 +00:00
from ..objects import (
2023-01-24 08:40:01 +00:00
MusicObject,
Artist,
Source,
2023-01-24 09:51:41 +00:00
SourcePages,
2023-01-24 11:09:47 +00:00
Song,
Album,
2023-02-06 08:16:28 +00:00
ID3Timestamp,
2023-03-02 15:23:02 +00:00
FormattedText,
2023-03-13 13:39:46 +00:00
Label,
Options
)
from ..utils import (
string_processing
2023-01-24 08:40:01 +00:00
)
2023-01-23 23:16:10 +00:00
class EncyclopaediaMetallum(Page):
    """
    Page implementation that talks to www.metal-archives.com.

    All requests go through the class-level ``API_SESSION``; ``SOURCE_TYPE``
    is the ``SourcePages`` member used for the sources this page creates
    and looks up.
    """

    # one session object shared by every request made through this class
    API_SESSION: requests.Session = requests.Session()
    # assigning (instead of updating) replaces the header mapping of the session
    API_SESSION.headers = {
        "Host": "www.metal-archives.com",
        "Connection": "keep-alive"
    }

    SOURCE_TYPE = SourcePages.ENCYCLOPAEDIA_METALLUM
2023-01-23 23:16:10 +00:00
@classmethod
def search_by_query(cls, query: str) -> Options:
    """
    Search metal-archives with a query string.

    The string is parsed into a ``cls.Query``. A raw query is handed to
    ``simple_search``, every other query to ``advanced_search``.

    :param query: the search string
    :return: the search results wrapped in ``Options``
    """
    query_obj = cls.Query(query)

    if query_obj.is_raw:
        # simple_search returns a plain list; wrap it so both branches
        # return the annotated type, the same way advanced_search does
        return Options(cls.simple_search(query_obj))

    return cls.advanced_search(query_obj)
2023-01-23 23:16:10 +00:00
@classmethod
def advanced_search(cls, query: Page.Query) -> Options:
    """
    Run the most specific search the parsed query allows.

    The first field that is set wins, checked in the order
    song, album, artist.

    :param query: the parsed query
    :return: the results wrapped in ``Options``; empty if no field is set
    """
    if query.song is not None:
        return Options(cls.search_for_song(query=query))

    if query.album is not None:
        return Options(cls.search_for_album(query=query))

    if query.artist is not None:
        return Options(cls.search_for_artist(query=query))

    # nothing to search for: return an empty instance, not the Options class itself
    return Options([])
2023-01-24 09:51:41 +00:00
@classmethod
def search_for_song(cls, query: Page.Query) -> List[Song]:
    """
    Query the advanced song search of metal-archives.

    :param query: parsed query; its ``song_str``, ``artist_str`` and
        ``album_str`` are inserted into the request url
    :return: one ``Song`` per row of ``aaData`` in the json response,
        or an empty list if the status code is not 200
    """
    endpoint = (
        "https://www.metal-archives.com/search/ajax-advanced/searching/songs/?songTitle={song}"
        "&bandName={artist}&releaseTitle={album}&lyrics=&genre=&sEcho=1&iColumns=5&sColumns="
        "&iDisplayStart=0&iDisplayLength=200"
        "&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&mDataProp_4=4"
        "&_=1674550595663"
    )
    url = endpoint.format(song=query.song_str, artist=query.artist_str, album=query.album_str)

    r = cls.API_SESSION.get(url)
    if r.status_code != 200:
        LOGGER.warning(f"code {r.status_code} at {url}")
        return []

    # every row is a list of 5 columns: artist, album, release type, title, lyrics
    found_songs = []
    for raw_song in r.json()['aaData']:
        found_songs.append(cls.get_song_from_json(
            artist_html=raw_song[0],
            album_html=raw_song[1],
            release_type=raw_song[2],
            title=raw_song[3],
            lyrics_html=raw_song[4]
        ))
    return found_songs
2023-01-24 09:51:41 +00:00
2023-01-24 13:29:23 +00:00
@classmethod
def search_for_album(cls, query: Page.Query) -> List[Album]:
    """
    Query the advanced album search of metal-archives.

    :param query: parsed query; its ``artist_str`` and ``album_str``
        are inserted into the request url
    :return: one ``Album`` per row of ``aaData`` in the json response,
        or an empty list if the status code is not 200
    """
    endpoint = "https://www.metal-archives.com/search/ajax-advanced/searching/albums/?bandName={" \
               "artist}&releaseTitle={album}&releaseYearFrom=&releaseMonthFrom=&releaseYearTo=&releaseMonthTo" \
               "=&country=&location=&releaseLabelName=&releaseCatalogNumber=&releaseIdentifiers" \
               "=&releaseRecordingInfo=&releaseDescription=&releaseNotes=&genre=&sEcho=1&iColumns=3&sColumns" \
               "=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&_=1674563943747"
    # build the url once, so the logged url is exactly the requested one
    url = endpoint.format(artist=query.artist_str, album=query.album_str)

    r = cls.API_SESSION.get(url)
    if r.status_code != 200:
        LOGGER.warning(f"code {r.status_code} at {url}")
        return []

    # every row is a list of 3 columns: artist, album, release type
    return [cls.get_album_from_json(
        artist_html=raw_album[0],
        album_html=raw_album[1],
        release_type=raw_album[2]
    ) for raw_album in r.json()['aaData']]
@classmethod
def search_for_artist(cls, query: Page.Query) -> List[Artist]:
    """
    Query the advanced band search of metal-archives.

    :param query: parsed query; its ``artist_str`` is inserted into the request url
    :return: one ``Artist`` per row of ``aaData`` in the json response,
        or an empty list if the status code is not 200
    """
    endpoint = "https://www.metal-archives.com/search/ajax-advanced/searching/bands/?bandName={artist}&genre=&country=&yearCreationFrom=&yearCreationTo=&bandNotes=&status=&themes=&location=&bandLabelName=&sEcho=1&iColumns=3&sColumns=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&_=1674565459976"
    # use artist_str like the song and album search do
    # NOTE(review): presumably artist_str is the url-ready form of query.artist - confirm in Page.Query
    url = endpoint.format(artist=query.artist_str)

    r = cls.API_SESSION.get(url)
    if r.status_code != 200:
        LOGGER.warning(f"code {r.status_code} at {url}")
        return []

    # every row is a list of 3 columns: artist, genre, country
    return [
        cls.get_artist_from_json(artist_html=raw_artist[0], genre=raw_artist[1], country=raw_artist[2])
        for raw_artist in r.json()['aaData']
    ]
2023-01-24 09:51:41 +00:00
@classmethod
def simple_search(cls, query: Page.Query) -> List[Artist]:
    """
    Searches the default endpoint of metal archives, which in turn only
    searches for bands. It is the default of the site, so it is used here too.

    :param query: parsed query; its raw ``query`` string is inserted into the request url
    :return: one ``Artist`` per row of ``aaData`` in the json response,
        or an empty list if the status code is not 200
    """
    endpoint = "https://www.metal-archives.com/search/ajax-band-search/?field=name&query={query}&sEcho=1&iColumns=3&sColumns=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2"
    # format with the query string, not with the Query object itself,
    # so the requested url is the same one that gets logged
    url = endpoint.format(query=query.query)

    r = cls.API_SESSION.get(url)
    if r.status_code != 200:
        LOGGER.warning(f"code {r.status_code} at {url}")
        return []

    # every row is a list of 3 columns: artist, genre, country
    return [
        cls.get_artist_from_json(artist_html=raw_artist[0], genre=raw_artist[1], country=raw_artist[2])
        for raw_artist in r.json()['aaData']
    ]
2023-01-24 08:40:01 +00:00
@classmethod
def get_artist_from_json(cls, artist_html=None, genre=None, country=None) -> Artist:
    """
    Build an ``Artist`` from the columns of one search result row.

    TODO parse the country to a standard

    :param artist_html: html snippet whose first anchor holds the band name
        and the link to the band page
    :param genre: genre text of the row, only used for the notes
    :param country: country text of the row, only used for the notes
    :return: the artist with its id, name, a metal-archives source and plaintext notes
    """
    # parse the html for the band name and link on metal-archives
    soup = BeautifulSoup(artist_html, 'html.parser')
    anchor = soup.find('a')
    artist_name = anchor.text
    artist_url = anchor.get('href')
    # the last path segment of the band url is used as id
    artist_id = artist_url.split("/")[-1]

    # only mention genre and country if they were passed,
    # the album and song parsers call this with the html alone
    notes = f"{artist_name} is a"
    if genre is not None:
        notes += f" {genre}"
    notes += " band"
    if country is not None:
        notes += f" from {country}"

    # whatever text is left after removing the anchor and the <strong> tag
    # is treated as the list of alternative names
    anchor.decompose()
    strong = soup.find('strong')
    if strong is not None:
        strong.decompose()
        # drops two characters at each end of the remaining text
        # NOTE(review): presumably the surrounding brackets and spaces - confirm against a real response
        alias_text = soup.text[2:-2]
        notes += f" aka {alias_text}"
    notes += "."

    return Artist(
        id_=artist_id,
        name=artist_name,
        source_list=[
            Source(SourcePages.ENCYCLOPAEDIA_METALLUM, artist_url)
        ],
        notes=FormattedText(plaintext=notes)
    )
2023-01-24 11:09:47 +00:00
@classmethod
def get_album_from_json(cls, album_html=None, release_type=None, artist_html=None) -> Album:
    """
    Build an ``Album`` from the columns of one search result row.

    TODO implement release type

    :param album_html: html snippet whose first anchor holds the album title and link, e.g.
        <a href="https://www.metal-archives.com/albums/Ghost_Bath/Self_Loather/970834">Self Loather</a>
    :param release_type: currently unused
    :param artist_html: optional html snippet of the artist, parsed with ``get_artist_from_json``
    :return: the album with its id, title, a metal-archives source and its artists
    """
    # parse the html
    soup = BeautifulSoup(album_html, 'html.parser')
    anchor = soup.find('a')
    album_name = anchor.text
    album_url = anchor.get('href')
    # the last path segment of the album url is used as id
    album_id = album_url.split("/")[-1]

    # artist_html is optional in the signature, so don't try to parse None
    artists = []
    if artist_html is not None:
        artists.append(cls.get_artist_from_json(artist_html=artist_html))

    return Album(
        id_=album_id,
        title=album_name,
        source_list=[
            Source(SourcePages.ENCYCLOPAEDIA_METALLUM, album_url)
        ],
        artists=artists
    )
@classmethod
def get_song_from_json(cls, artist_html=None, album_html=None, release_type=None, title=None,
                       lyrics_html=None) -> Song:
    """
    Build a ``Song`` from the columns of one search result row.

    The song id is taken from the ``id`` attribute of the first anchor in
    ``lyrics_html`` with the prefix ``lyricsLink_`` removed. If there is no
    such anchor or attribute, the song gets no id and no source.

    :param artist_html: html snippet of the artist, parsed with ``get_artist_from_json``
    :param album_html: html snippet of the album, parsed with ``get_album_from_json``
    :param release_type: passed on to ``get_album_from_json``
    :param title: title of the song
    :param lyrics_html: html snippet of the lyrics column
    :return: the song with its main artist and album attached
    """
    song_id = None
    if lyrics_html is not None:
        anchor = BeautifulSoup(lyrics_html, 'html.parser').find('a')
        # the column doesn't necessarily contain a link with an id
        raw_song_id = anchor.get('id') if anchor is not None else None
        if raw_song_id is not None:
            song_id = raw_song_id.replace("lyricsLink_", "")

    # a source pointing at nothing is of no use, so only add one if the id is known
    source_list = []
    if song_id is not None:
        source_list.append(Source(SourcePages.ENCYCLOPAEDIA_METALLUM, song_id))

    return Song(
        id_=song_id,
        title=title,
        main_artist_list=[
            cls.get_artist_from_json(artist_html=artist_html)
        ],
        album=cls.get_album_from_json(album_html=album_html, release_type=release_type, artist_html=artist_html),
        source_list=source_list
    )
2023-01-24 17:15:07 +00:00
@classmethod
def fetch_artist_discography(cls, artist: Artist, ma_artist_id: str, flat: bool = False) -> Artist:
    """
    Fetch the discography of a band and append every release to
    ``artist.main_album_collection``.

    TODO
    I'd guess this function has quite some possibility for optimizations
    in form of performance and clean code

    :param artist: the artist the albums are appended to
    :param ma_artist_id: the id of the band on metal-archives
    :param flat: if False, ``fetch_album_details`` is called afterwards
        for every album in ``artist.main_album_collection``
    :return: the same artist object; unchanged if the status code is not 200
    """
    discography_url = "https://www.metal-archives.com/band/discography/id/{}/tab/all"
    url = discography_url.format(ma_artist_id)

    # make the request
    r = cls.API_SESSION.get(url)
    if r.status_code != 200:
        LOGGER.warning(f"code {r.status_code} at {url}")
        return artist

    # parse the html
    soup = BeautifulSoup(r.text, 'html.parser')
    tbody_soup = soup.find('tbody')
    # without a table body there is simply nothing to add
    tr_list = tbody_soup.find_all('tr') if tbody_soup is not None else []

    for tr_soup in tr_list:
        td_list = tr_soup.find_all(recursive=False)
        # columns used below: 0 = album link, 1 = type, 2 = year
        if len(td_list) < 3:
            continue

        album_soup = td_list[0]
        album_anchor = album_soup.find('a')
        if album_anchor is None:
            continue

        album_name = album_soup.text
        album_url = album_anchor.get('href')
        # the last path segment of the album url is used as id
        album_id = album_url.split('/')[-1]
        album_type = td_list[1].text
        album_year = td_list[2].text

        date_obj = None
        try:
            date_obj = ID3Timestamp(year=int(album_year))
        except ValueError:
            # the year column is not a number, so the album gets no date
            pass

        artist.main_album_collection.append(
            Album(
                id_=album_id,
                title=album_name,
                album_type=album_type,
                date=date_obj,
                source_list=[Source(SourcePages.ENCYCLOPAEDIA_METALLUM, album_url)]
            )
        )

    if not flat:
        for album in artist.main_album_collection:
            cls.fetch_album_details(album, flat=flat)

    return artist
@classmethod
def fetch_artist_sources(cls, artist: Artist, ma_artist_id: str) -> Artist:
    """
    Fetch the external links of a band and add every link that
    ``Source.match_url`` recognises to the artist.

    Only the links inside the div with the id ``band_links_Official`` are used.

    TODO
    add a Label object to add the label sources from (div id "band_links_Labels")
    TODO
    maybe do merchandise stuff (div id "band_links_Official_merchandise")

    :param artist: the artist the sources are added to
    :param ma_artist_id: the id of the band on metal-archives
    :return: the same artist object; unchanged if the status code is not 200
        or the page contains the ``noLinks`` span
    """
    sources_url = "https://www.metal-archives.com/link/ajax-list/type/band/id/{}"
    url = sources_url.format(ma_artist_id)

    # make the request
    r = cls.API_SESSION.get(url)
    if r.status_code != 200:
        LOGGER.warning(f"code {r.status_code} at {url}")
        return artist

    soup = BeautifulSoup(r.text, 'html.parser')

    if soup.find("span", {"id": "noLinks"}) is not None:
        return artist

    artist_source = soup.find("div", {"id": "band_links_Official"})
    if artist_source is None:
        return artist

    for td in artist_source.find_all("td"):
        a = td.find("a")
        # a cell doesn't have to contain a link
        if a is None:
            continue

        source = Source.match_url(a.get("href"))
        if source is None:
            continue

        artist.add_source(source)

    return artist
2023-02-01 08:18:06 +00:00
@classmethod
def fetch_artist_attributes(cls, artist: Artist, url: str) -> Artist:
    """
    Fetch the band page and copy the values of the ``band_stats`` block
    onto the artist.

    Handled entries: country of origin, formed in, genre, lyrical themes
    and current label.

    TODO
    status: active
    need to do enums for that and add it to object
    TODO
    years active: 2012-present
    process this and add field to class

    not needed: Location: Minot, North Dakota

    :param artist: the artist that gets the attributes
    :param url: url of the band page
    :return: the same artist object; unchanged if the status code is not 200
        or the page has no ``band_stats`` block
    """
    r = cls.API_SESSION.get(url)
    if r.status_code != 200:
        LOGGER.warning(f"code {r.status_code} at {url}")
        return artist

    soup = BeautifulSoup(r.text, 'html.parser')

    band_stat_soup = soup.find("div", {"id": "band_stats"})
    if band_stat_soup is None:
        LOGGER.warning(f"no band_stats found at {url}")
        return artist

    for dl_soup in band_stat_soup.find_all("dl"):
        # the dt tags hold the titles, the dd tags the matching values
        for title, data in zip(dl_soup.find_all("dt"), dl_soup.find_all("dd")):
            title_text = title.text

            if title_text == "Country of origin:":
                country_anchor = data.find('a')
                if country_anchor is None:
                    continue
                # the last path segment of the country link is looked up as alpha_2 code
                href = country_anchor.get('href')
                artist.country = pycountry.countries.get(alpha_2=href.split("/")[-1])
                continue

            if title_text == "Formed in:":
                formed_in_text = data.text.strip()
                # the year is skipped if the value is not a plain number
                if not formed_in_text.isnumeric():
                    continue
                artist.formed_in = ID3Timestamp(year=int(formed_in_text))
                continue

            if title_text == "Genre:":
                artist.general_genre = data.text
                continue

            if title_text == "Lyrical themes:":
                artist.lyrical_themes = data.text.split(", ")
                continue

            if title_text == "Current label:":
                label_name = data.text
                label_anchor = data.find("a")
                label_url = label_anchor.get("href") if label_anchor is not None else None

                # id and source can only exist if the label is linked
                label_id = None
                label_source_list = []
                if isinstance(label_url, str):
                    label_source_list.append(Source(cls.SOURCE_TYPE, label_url))
                    if "/" in label_url:
                        label_id = label_url.split("/")[-1]

                # NOTE(review): Label is built with `_id=` while Artist, Album and Song
                # in this file are built with `id_=` - confirm which keyword Label expects
                artist.label_collection.append(
                    Label(
                        _id=label_id,
                        name=label_name,
                        source_list=label_source_list
                    ))

    return artist
2023-02-06 08:16:28 +00:00
@classmethod
def fetch_band_notes(cls, artist: Artist, ma_artist_id: str) -> Artist:
    """
    Request the "read more" text of a band and store the response body
    in ``artist.notes.html``.

    :param artist: the artist whose notes get the html
    :param ma_artist_id: the id of the band on metal-archives
    :return: the same artist object; unchanged if the status code is not 200
    """
    notes_url = "https://www.metal-archives.com/band/read-more/id/{}".format(ma_artist_id)

    # make the request
    response = cls.API_SESSION.get(notes_url)

    if response.status_code == 200:
        artist.notes.html = response.text
    else:
        LOGGER.warning(f"code {response.status_code} at {notes_url}")

    return artist
2023-01-24 17:15:07 +00:00
@classmethod
def fetch_artist_details(cls, artist: Artist, flat: bool = False) -> Artist:
    """
    Fill an artist with everything this page fetches for a band: attributes,
    discography, external sources and band notes (in that order).

    The artist is returned untouched if it has no source of ``cls.SOURCE_TYPE``.

    :param artist: the artist to fill
    :param flat: passed on to ``fetch_artist_discography``
    :return: the artist after all fetch steps
    """
    ma_sources = artist.source_collection.get_sources_from_page(cls.SOURCE_TYPE)
    if len(ma_sources) == 0:
        return artist

    # taking the first source, because only one is needed and multiple sources don't make that much sense
    ma_source = ma_sources[0]
    # the last path segment of the band url is the id
    ma_artist_id = ma_source.url.rsplit("/", 1)[-1]

    # TODO
    # [x] https://www.metal-archives.com/bands/Ghost_Bath/3540372489
    # [x] https://www.metal-archives.com/band/discography/id/3540372489/tab/all
    # [] reviews: https://www.metal-archives.com/review/ajax-list-band/id/3540372489/json/1?sEcho=1&iColumns=4&sColumns=&iDisplayStart=0&iDisplayLength=200&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&iSortCol_0=3&sSortDir_0=desc&iSortingCols=1&bSortable_0=true&bSortable_1=true&bSortable_2=true&bSortable_3=true&_=1675155257133
    # [] similar: https://www.metal-archives.com/band/ajax-recommendations/id/3540372489
    # [x] sources: https://www.metal-archives.com/link/ajax-list/type/band/id/3540372489
    # [x] band notes: https://www.metal-archives.com/band/read-more/id/3540372489

    # SIMPLE METADATA
    artist = cls.fetch_artist_attributes(artist, ma_source.url)

    # DISCOGRAPHY
    artist = cls.fetch_artist_discography(artist, ma_artist_id, flat=flat)

    # EXTERNAL SOURCES
    artist = cls.fetch_artist_sources(artist, ma_artist_id)

    # ARTIST NOTES
    artist = cls.fetch_band_notes(artist, ma_artist_id)

    return artist
2023-02-06 14:06:38 +00:00
@classmethod
def fetch_album_details(cls, album: Album, flat: bool = False) -> Album:
    """
    Fetch the album page and append every track of the tracklist
    to ``album.song_collection``.

    example of a tracklist row:
    <tr class="even">
        <td width="20"><a class="anchor" name="5948442"> </a>1.</td>   # id and tracksort
        <td class="wrapWords">Convince Me to Bleed</td>                # name
        <td align="right">03:40</td>                                   # length
        <td nowrap="nowrap">
            <a href="#5948442" id="lyricsButton5948442" onclick="toggleLyrics('5948442'); return false;">Show lyrics</a>
        </td>
    </tr>

    :param album: the album the songs are appended to
    :param flat: currently unused
    :return: the same album object; unchanged if it has no source of ``cls.SOURCE_TYPE``,
        the status code is not 200 or the page has no tracklist
    """
    source_list = album.source_collection.get_sources_from_page(cls.SOURCE_TYPE)
    if len(source_list) == 0:
        return album

    source = source_list[0]

    r = cls.API_SESSION.get(source.url)
    if r.status_code != 200:
        LOGGER.warning(f"code {r.status_code} at {source.url}")
        return album

    soup = BeautifulSoup(r.text, 'html.parser')

    # <table class="display table_lyrics
    table_soup = soup.find("table", {"class": "table_lyrics"})
    tracklist_soup = table_soup.find("tbody") if table_soup is not None else None
    if tracklist_soup is None:
        LOGGER.warning(f"no tracklist found at {source.url}")
        return album

    for row in tracklist_soup.find_all("tr", {"class": ["even", "odd"]}):
        row_list = row.find_all(recursive=False)
        # cells used below: 0 = id and tracksort, 1 = name, 2 = length
        if len(row_list) < 3:
            continue

        track_sort_soup = row_list[0]
        # the cell text is the track number followed by a dot
        track_sort = int(track_sort_soup.text.strip().rstrip("."))

        track_anchor = track_sort_soup.find("a")
        track_id = track_anchor.get("name") if track_anchor is not None else None

        title = row_list[1].text.strip()

        length = None
        duration_stamp = row_list[2].text.strip()
        if ":" in duration_stamp:
            try:
                # works for mm:ss as well as h:mm:ss
                seconds = 0
                for part in duration_stamp.split(":"):
                    seconds = seconds * 60 + int(part)
                length = seconds * 1000  # in milliseconds
            except ValueError:
                LOGGER.warning(f"can't parse the duration {duration_stamp} at {source.url}")

        # a source pointing at nothing is of no use, so only add one if the id is known
        song_source_list = []
        if track_id is not None:
            song_source_list.append(Source(cls.SOURCE_TYPE, track_id))

        album.song_collection.append(
            Song(
                id_=track_id,
                title=title,
                length=length,
                tracksort=track_sort,
                source_list=song_source_list
            )
        )

    return album
@classmethod
def fetch_song_details(cls, song: Song, flat: bool = False) -> Song:
    """
    Placeholder for fetching the details of a song.

    Right now it only looks up the sources of ``cls.SOURCE_TYPE``
    and returns the song as it is.

    TODO
    lyrics

    :param song: the song to fill
    :param flat: currently unused
    :return: the same song object
    """
    ma_sources = song.source_collection.get_sources_from_page(cls.SOURCE_TYPE)

    if len(ma_sources) > 0:
        # nothing is fetched yet, see the TODO in the docstring
        pass

    return song