pretty much finished web scraping on metal archives besides lyrics

This commit is contained in:
Hellow2 2023-03-28 08:27:21 +02:00
parent 4340ca3b32
commit 34baf75d2e
3 changed files with 117 additions and 46 deletions

View File

@ -123,9 +123,10 @@
</div> </div>
<div id="album_info"> <div id="album_info">
<h1 class="album_name">
<a href="https://www.metal-archives.com/albums/Ghost_Bath/Ghost_Bath/392863">Ghost Bath</a>
</h1>
<h1 class="album_name"><a href="https://www.metal-archives.com/albums/Ghost_Bath/Ghost_Bath/392863">Ghost
Bath</a></h1>
<h2 class="band_name"> <h2 class="band_name">
<a href="https://www.metal-archives.com/bands/Ghost_Bath/3540372489">Ghost Bath</a> <a href="https://www.metal-archives.com/bands/Ghost_Bath/3540372489">Ghost Bath</a>
</h2> </h2>

View File

@ -16,9 +16,12 @@ def fetch_artist():
] ]
) )
artist = EncyclopaediaMetallum.fetch_details(artist) artist: objects.Artist = EncyclopaediaMetallum.fetch_details(artist, stop_at_level=2)
print(artist.options) print(artist.options)
for album in artist.main_album_collection:
print(album.options)
def fetch_album(): def fetch_album():
album = objects.Album( album = objects.Album(

View File

@ -461,30 +461,8 @@ class EncyclopaediaMetallum(Page):
return artist return artist
@classmethod @classmethod
def _fetch_album_from_source(cls, source: Source, stop_at_level: int = 1) -> Album: def _parse_album_track_row(cls, track_row: BeautifulSoup) -> Song:
""" """
I am preeeety sure I can get way more data than... nothing from there
:param source:
:param stop_at_level:
:return:
"""
# <table class="display table_lyrics
album = Album()
r = cls.get_request(source.url)
if r is None:
return album
soup = cls.get_soup_from_response(r)
tracklist_soup = soup.find("table", {"class": "table_lyrics"}).find("tbody")
for row in tracklist_soup.find_all("tr", {"class": ["even", "odd"]}):
"""
example of row:
<tr class="even"> <tr class="even">
<td width="20"><a class="anchor" name="5948442"> </a>1.</td> # id and tracksort <td width="20"><a class="anchor" name="5948442"> </a>1.</td> # id and tracksort
<td class="wrapWords">Convince Me to Bleed</td> # name <td class="wrapWords">Convince Me to Bleed</td> # name
@ -494,7 +472,8 @@ class EncyclopaediaMetallum(Page):
</td> </td>
</tr> </tr>
""" """
row_list = row.find_all(recursive=False)
row_list = track_row.find_all(recursive=False)
track_sort_soup = row_list[0] track_sort_soup = row_list[0]
track_sort = int(track_sort_soup.text[:-1]) track_sort = int(track_sort_soup.text[:-1])
@ -509,14 +488,102 @@ class EncyclopaediaMetallum(Page):
minutes, seconds = duration_stamp.split(":") minutes, seconds = duration_stamp.split(":")
length = (int(minutes) * 60 + int(seconds)) * 1000 # in milliseconds length = (int(minutes) * 60 + int(seconds)) * 1000 # in milliseconds
album.song_collection.append( return Song(
Song(
id_=track_id,
title=title, title=title,
length=length, length=length,
tracksort=track_sort, tracksort=track_sort,
source_list=[Source(cls.SOURCE_TYPE, track_id)] source_list=[Source(cls.SOURCE_TYPE, track_id)]
) )
@classmethod
def _parse_album_attributes(cls, album_soup: BeautifulSoup, stop_at_level: int = 1) -> Album:
    """
    Parse the soup of an album page into an Album.

    The title and the link of the album are read from the ``h1`` heading with
    the class ``album_name`` and the artists from the ``h2`` heading with the
    class ``band_name``, both looked up inside the element with the id
    ``album_info``. Every ``even``/``odd`` row of the ``table_lyrics`` table
    is turned into a Song.

    Parts of the page that are missing are skipped instead of raising, so the
    returned Album may be only partially filled.

    :param album_soup: soup of the complete album page
    :param stop_at_level: not used by the parsing itself
    :return: Album built from everything that could be found
    """
    tracklist: List[Song] = []
    artist_list: List[Artist] = []
    album_name = None
    source_list: List[Source] = []

    def _parse_album_info(album_info_soup: BeautifulSoup):
        # only the name is rebound; the lists are mutated in place
        nonlocal album_name

        if album_info_soup is None:
            return

        album_soup_list = album_info_soup.find_all("h1", {"class": "album_name"})
        if len(album_soup_list) == 1:
            heading = album_soup_list[0]
            anchor = heading.find("a")
            if anchor is None:
                # a heading without a link gives no source, but still a title
                album_name = heading.get_text(strip=True)
            else:
                href = anchor.get("href")
                if href is not None:
                    source_list.append(Source(cls.SOURCE_TYPE, href.strip()))
                album_name = anchor.get_text(strip=True)
        elif len(album_soup_list) > 1:
            LOGGER.debug("there are more than 1 album soups")

        artist_soup_list = album_info_soup.find_all("h2", {"class": "band_name"})
        if len(artist_soup_list) == 1:
            for anchor in artist_soup_list[0].find_all("a"):
                artist_sources: List[Source] = []

                href = anchor.get("href")
                if href is not None:
                    artist_sources.append(Source(cls.SOURCE_TYPE, href.strip()))

                artist_list.append(Artist(
                    name=anchor.get_text(strip=True),
                    source_list=artist_sources
                ))
        elif len(artist_soup_list) > 1:
            LOGGER.debug("there are more than 1 artist soups")

    _parse_album_info(album_info_soup=album_soup.find(id="album_info"))

    tracklist_table = album_soup.find("table", {"class": "table_lyrics"})
    if tracklist_table is None:
        LOGGER.debug("there is no tracklist table")
    else:
        # fall back to the table itself if it has no tbody;
        # find_all searches all descendants, so the rows are found either way
        tracklist_soup = tracklist_table.find("tbody")
        if tracklist_soup is None:
            tracklist_soup = tracklist_table

        for track_soup in tracklist_soup.find_all("tr", {"class": ["even", "odd"]}):
            tracklist.append(cls._parse_album_track_row(track_row=track_soup))

    return Album(
        title=album_name,
        source_list=source_list,
        artist_list=artist_list,
        song_list=tracklist
    )
@classmethod
def _fetch_album_from_source(cls, source: Source, stop_at_level: int = 1) -> Album:
    """
    Request the album page behind the source and parse it into an Album.

    If stop_at_level is greater than 1, every song of the album is merged
    with the result of fetching each of its own sources of this page, with
    the level reduced by one.

    :param source: source whose url is requested
    :param stop_at_level: values above 1 also fetch the songs of the album
    :return: the parsed Album, or an empty Album if the request returned None
    """
    r = cls.get_request(source.url)
    if r is None:
        return Album()

    soup = cls.get_soup_from_response(r)
    album = cls._parse_album_attributes(soup, stop_at_level=stop_at_level)

    if stop_at_level > 1:
        for song in album.song_collection:
            # iterate the sources of the song, not those of the album, otherwise
            # every song would be fetched from the url of the album
            # NOTE(review): assumes Song exposes source_collection like Album does
            for song_source in song.source_collection.get_sources_from_page(cls.SOURCE_TYPE):
                song.merge(cls._fetch_song_from_source(source=song_source, stop_at_level=stop_at_level - 1))

    return album
@classmethod
def _fetch_song_from_source(cls, source: Source, stop_at_level: int = 1) -> Song:
    """
    Placeholder for fetching a song from its source.

    Nothing is requested yet; both arguments are ignored.

    :param source: source of the song, currently unused
    :param stop_at_level: currently unused
    :return: a new Song created without any arguments
    """
    empty_song = Song()
    return empty_song