pretty much finished web scraping on metal archives besides lyrics
This commit is contained in:
parent
4340ca3b32
commit
34baf75d2e
@ -123,9 +123,10 @@
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div id="album_info">
|
<div id="album_info">
|
||||||
|
<h1 class="album_name">
|
||||||
|
<a href="https://www.metal-archives.com/albums/Ghost_Bath/Ghost_Bath/392863">Ghost Bath</a>
|
||||||
|
</h1>
|
||||||
|
|
||||||
<h1 class="album_name"><a href="https://www.metal-archives.com/albums/Ghost_Bath/Ghost_Bath/392863">Ghost
|
|
||||||
Bath</a></h1>
|
|
||||||
<h2 class="band_name">
|
<h2 class="band_name">
|
||||||
<a href="https://www.metal-archives.com/bands/Ghost_Bath/3540372489">Ghost Bath</a>
|
<a href="https://www.metal-archives.com/bands/Ghost_Bath/3540372489">Ghost Bath</a>
|
||||||
</h2>
|
</h2>
|
||||||
|
@ -16,9 +16,12 @@ def fetch_artist():
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
artist = EncyclopaediaMetallum.fetch_details(artist)
|
artist: objects.Artist = EncyclopaediaMetallum.fetch_details(artist, stop_at_level=2)
|
||||||
print(artist.options)
|
print(artist.options)
|
||||||
|
|
||||||
|
for album in artist.main_album_collection:
|
||||||
|
print(album.options)
|
||||||
|
|
||||||
|
|
||||||
def fetch_album():
|
def fetch_album():
|
||||||
album = objects.Album(
|
album = objects.Album(
|
||||||
|
@ -460,6 +460,103 @@ class EncyclopaediaMetallum(Page):
|
|||||||
|
|
||||||
return artist
|
return artist
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _parse_album_track_row(cls, track_row: BeautifulSoup) -> Song:
|
||||||
|
"""
|
||||||
|
<tr class="even">
|
||||||
|
<td width="20"><a class="anchor" name="5948442"> </a>1.</td> # id and tracksort
|
||||||
|
<td class="wrapWords">Convince Me to Bleed</td> # name
|
||||||
|
<td align="right">03:40</td> # length
|
||||||
|
<td nowrap="nowrap">
|
||||||
|
<a href="#5948442" id="lyricsButton5948442" onclick="toggleLyrics('5948442'); return false;">Show lyrics</a>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
"""
|
||||||
|
|
||||||
|
row_list = track_row.find_all(recursive=False)
|
||||||
|
|
||||||
|
track_sort_soup = row_list[0]
|
||||||
|
track_sort = int(track_sort_soup.text[:-1])
|
||||||
|
track_id = track_sort_soup.find("a").get("name")
|
||||||
|
|
||||||
|
title = row_list[1].text.strip()
|
||||||
|
|
||||||
|
length = None
|
||||||
|
|
||||||
|
duration_stamp = row_list[2].text
|
||||||
|
if ":" in duration_stamp:
|
||||||
|
minutes, seconds = duration_stamp.split(":")
|
||||||
|
length = (int(minutes) * 60 + int(seconds)) * 1000 # in milliseconds
|
||||||
|
|
||||||
|
return Song(
|
||||||
|
title=title,
|
||||||
|
length=length,
|
||||||
|
tracksort=track_sort,
|
||||||
|
source_list=[Source(cls.SOURCE_TYPE, track_id)]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _parse_album_attributes(cls, album_soup: BeautifulSoup, stop_at_level: int = 1) -> Album:
|
||||||
|
tracklist: List[Song] = []
|
||||||
|
artist_list = []
|
||||||
|
album_name: str = None
|
||||||
|
source_list: List[Source] = []
|
||||||
|
|
||||||
|
def _parse_album_info(album_info_soup: BeautifulSoup):
|
||||||
|
nonlocal artist_list
|
||||||
|
nonlocal album_name
|
||||||
|
nonlocal source_list
|
||||||
|
|
||||||
|
if album_info_soup is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
album_soup_list = album_info_soup.find_all("h1", {"class": "album_name"})
|
||||||
|
if len(album_soup_list) == 1:
|
||||||
|
anchor: BeautifulSoup = album_soup_list[0].find("a")
|
||||||
|
|
||||||
|
href = anchor.get("href")
|
||||||
|
if href is not None:
|
||||||
|
source_list.append(Source(cls.SOURCE_TYPE, href.strip()))
|
||||||
|
|
||||||
|
album_name = anchor.get_text(strip=True)
|
||||||
|
|
||||||
|
elif len(album_soup_list) > 1:
|
||||||
|
LOGGER.debug("there are more than 1 album soups")
|
||||||
|
|
||||||
|
|
||||||
|
artist_soup_list = album_info_soup.find_all("h2", {"class": "band_name"})
|
||||||
|
if len(artist_soup_list) == 1:
|
||||||
|
for anchor in artist_soup_list[0].find_all("a"):
|
||||||
|
artist_sources: List[Source] = []
|
||||||
|
|
||||||
|
href = anchor.get("href")
|
||||||
|
if href is not None:
|
||||||
|
artist_sources.append(Source(cls.SOURCE_TYPE, href.strip()))
|
||||||
|
|
||||||
|
artist_name = anchor.get_text(strip=True)
|
||||||
|
|
||||||
|
artist_list.append(Artist(
|
||||||
|
name=artist_name,
|
||||||
|
source_list=artist_sources
|
||||||
|
))
|
||||||
|
|
||||||
|
elif len(artist_soup_list) > 1:
|
||||||
|
LOGGER.debug("there are more than 1 artist soups")
|
||||||
|
|
||||||
|
_parse_album_info(album_info_soup=album_soup.find(id="album_info"))
|
||||||
|
|
||||||
|
tracklist_soup = album_soup.find("table", {"class": "table_lyrics"}).find("tbody")
|
||||||
|
for track_soup in tracklist_soup.find_all("tr", {"class": ["even", "odd"]}):
|
||||||
|
tracklist.append(cls._parse_album_track_row(track_row=track_soup))
|
||||||
|
|
||||||
|
return Album(
|
||||||
|
title=album_name,
|
||||||
|
source_list=source_list,
|
||||||
|
artist_list=artist_list,
|
||||||
|
song_list=tracklist
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _fetch_album_from_source(cls, source: Source, stop_at_level: int = 1) -> Album:
|
def _fetch_album_from_source(cls, source: Source, stop_at_level: int = 1) -> Album:
|
||||||
"""
|
"""
|
||||||
@ -472,51 +569,21 @@ class EncyclopaediaMetallum(Page):
|
|||||||
|
|
||||||
# <table class="display table_lyrics
|
# <table class="display table_lyrics
|
||||||
|
|
||||||
album = Album()
|
|
||||||
|
|
||||||
r = cls.get_request(source.url)
|
r = cls.get_request(source.url)
|
||||||
if r is None:
|
if r is None:
|
||||||
return album
|
return Album()
|
||||||
|
|
||||||
soup = cls.get_soup_from_response(r)
|
soup = cls.get_soup_from_response(r)
|
||||||
|
|
||||||
tracklist_soup = soup.find("table", {"class": "table_lyrics"}).find("tbody")
|
album = cls._parse_album_attributes(soup, stop_at_level=stop_at_level)
|
||||||
for row in tracklist_soup.find_all("tr", {"class": ["even", "odd"]}):
|
|
||||||
"""
|
|
||||||
example of row:
|
|
||||||
|
|
||||||
<tr class="even">
|
if stop_at_level > 1:
|
||||||
<td width="20"><a class="anchor" name="5948442"> </a>1.</td> # id and tracksort
|
for song in album.song_collection:
|
||||||
<td class="wrapWords">Convince Me to Bleed</td> # name
|
for source in album.source_collection.get_sources_from_page(cls.SOURCE_TYPE):
|
||||||
<td align="right">03:40</td> # length
|
song.merge(cls._fetch_song_from_source(source=source, stop_at_level=stop_at_level-1))
|
||||||
<td nowrap="nowrap">
|
|
||||||
<a href="#5948442" id="lyricsButton5948442" onclick="toggleLyrics('5948442'); return false;">Show lyrics</a>
|
|
||||||
</td>
|
|
||||||
</tr>
|
|
||||||
"""
|
|
||||||
row_list = row.find_all(recursive=False)
|
|
||||||
|
|
||||||
track_sort_soup = row_list[0]
|
|
||||||
track_sort = int(track_sort_soup.text[:-1])
|
|
||||||
track_id = track_sort_soup.find("a").get("name")
|
|
||||||
|
|
||||||
title = row_list[1].text.strip()
|
|
||||||
|
|
||||||
length = None
|
|
||||||
|
|
||||||
duration_stamp = row_list[2].text
|
|
||||||
if ":" in duration_stamp:
|
|
||||||
minutes, seconds = duration_stamp.split(":")
|
|
||||||
length = (int(minutes) * 60 + int(seconds)) * 1000 # in milliseconds
|
|
||||||
|
|
||||||
album.song_collection.append(
|
|
||||||
Song(
|
|
||||||
id_=track_id,
|
|
||||||
title=title,
|
|
||||||
length=length,
|
|
||||||
tracksort=track_sort,
|
|
||||||
source_list=[Source(cls.SOURCE_TYPE, track_id)]
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
return album
|
return album
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _fetch_song_from_source(cls, source: Source, stop_at_level: int = 1) -> Song:
|
||||||
|
return Song()
|
||||||
|
Loading…
Reference in New Issue
Block a user