fixed a bit of the scraping

This commit is contained in:
Hellow 2023-03-27 20:02:17 +02:00
parent 2bae6e1cbd
commit 7ae7aa87fd
3 changed files with 588 additions and 2 deletions

View File

@ -0,0 +1,537 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<!--The band name could also be taken from the title; that would actually work quite well-->
<title>Ghost Bath - Encyclopaedia Metallum: The Metal Archives</title>
</head>
<body>
<div id="wrapper">
<div id="header">
<!--
contains:
- Logo
- Search Box
- Top Menu box (help/rules/...)
-->
<a href="https://www.metal-archives.com/" id="MA_logo">Metal Archives</a>
<div id="search_box"></div>
<div id="top_menu_box"></div>
</div>
<div id="left_col">
<!--Some stuff which is relevant for the global site, not the artist-->
</div>
<div id="content_wrapper">
<script type="8655e3a8e5105845a7a3628f-text/javascript" src="https://www.metal-archives.com/js/jquery/jquery.form.js"></script>
<script type="8655e3a8e5105845a7a3628f-text/javascript">
var bandId = 3540372489;
var bandName = "Ghost Bath";
var minRecScore = 3;
// The original script continues with code that is irrelevant here; the original is also better indented.
</script>
<div id="band_sidebar">
<div class="band_name_img">
<a class="image" id="logo" title="Ghost Bath" href="https://www.metal-archives.com/images/3/5/4/0/3540372489_logo.jpg?1647">
<img src="https://www.metal-archives.com/images/3/5/4/0/3540372489_logo.jpg?1647" title="Click to zoom" alt="Ghost Bath - Logo" border="0"/>
</a>
</div>
<div class="band_img">
<a class="image" id="photo" title="Ghost Bath"
href="https://www.metal-archives.com/images/3/5/4/0/3540372489_photo.jpg?1647"><img
src="https://www.metal-archives.com/images/3/5/4/0/3540372489_photo.jpg?1647"
title="Click to zoom" alt="Ghost Bath - Photo" border="0"/></a>
</div>
<div id="affiliation-links">
<!--Links to an ebay search of the band-->
</div>
</div>
<div id="band_content">
<div id="band_info">
<h1 class="band_name">
<a href="https://www.metal-archives.com/bands/Ghost_Bath/3540372489">Ghost Bath</a>
</h1>
<div class="clear block_spacer_5"></div>
<div id="band_stats">
<dl class="float_left">
<dt>Country of origin:</dt>
<dd><a href="https://www.metal-archives.com/lists/US">United States</a></dd>
<dt>Location:</dt>
<dd>Minot, North Dakota</dd>
<dt>Status:</dt>
<dd class="active">Active</dd>
<dt>Formed in:</dt>
<dd>2012</dd>
</dl>
<dl class="float_right">
<dt>Genre:</dt>
<dd>Depressive/Post-Black Metal</dd>
<dt>Themes:</dt>
<dd>Depression, Loneliness, Death</dd>
<dt>Current label:</dt>
<dd>
<a href="https://www.metal-archives.com/labels/Nuclear_Blast_Entertainment/43073">
Nuclear Blast Entertainment
</a>
</dd>
</dl>
<dl style="width: 100%;" class="clear">
<dt>Years active:</dt>
<dd>
2012-present
</dd>
</dl>
</div>
<div class="band_comment clear">
<!--This doesn't contain the complete notes; fetching them requires a separate request, so they are ignored here-->
</div>
</div>
<div id="message"></div>
<div class="clear block_spacer_5"></div>
<div id="band_tabs" class="tabs no-js">
<ul>
<li><a href="#band_tab_discography">Discography</a></li>
<li><a href="#band_tab_members">Members</a></li>
<li><a href="https://www.metal-archives.com/band/ajax-reviews/id/3540372489" title="Reviews"><span>Reviews</span></a>
</li>
<li><a href="https://www.metal-archives.com/band/ajax-recommendations/id/3540372489"
title="Similar artists"><span>Similar Artists</span></a></li>
<li><a href="https://www.metal-archives.com/link/ajax-list/type/band/id/3540372489"
title="Related links"><span>Related Links</span></a></li>
</ul>
<div id="band_tab_discography">
<div id="band_disco" class="tabs2lvl">
<ul>
<li><a href="https://www.metal-archives.com/band/discography/id/3540372489/tab/all"><span>Complete discography</span></a>
</li>
<li><a href="https://www.metal-archives.com/band/discography/id/3540372489/tab/main"><span>Main</span></a>
</li>
<li><a href="https://www.metal-archives.com/band/discography/id/3540372489/tab/lives"><span>Lives</span></a>
</li>
<li><a href="https://www.metal-archives.com/band/discography/id/3540372489/tab/demos"><span>Demos</span></a>
</li>
<li><a href="https://www.metal-archives.com/band/discography/id/3540372489/tab/misc"><span>Misc.</span></a>
</li>
</ul>
</div>
</div>
<div id="band_tab_members">
<div id="band_members" class="tabs2lvl">
<ul>
<li><a href="#band_tab_members_all">Complete lineup</a></li>
<li><a href="#band_tab_members_current">Current lineup</a></li>
<li><a href="#band_tab_members_past">Past members</a></li>
<li><a href="#band_tab_members_live">Live musicians</a></li>
</ul>
<div id="band_tab_members_all">
<div class="ui-tabs-panel-content">
<table class="display lineupTable" cellpadding="0" cellspacing="0">
<tr class="lineupHeaders">
<td colspan="2" align="right">
Current
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/Dennis_Mikula/532536"
class="bold">Nameless</a>
</td>
<td>
Vocals (lead), Guitars, Piano&nbsp;(2012-present)
</td>
</tr>
<tr class="lineupBandsRow">
<td colspan="2">
See also:
<a href="https://www.metal-archives.com/bands/If_I_Could_Kill_Myself/3540422793">If
I Could Kill Myself</a>, Electric Specter 電妖怪, ex-I, Apparatus
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/Josh_Jaye/825936"
class="bold">Josh Jaye</a>
</td>
<td>
Bass&nbsp;(2016-present)
</td>
</tr>
<tr class="lineupBandsRow">
<td colspan="2">
See also:
<a href="https://www.metal-archives.com/bands/Stone_Marrow/3540460997">Stone
Marrow</a>, Alistair Hennessey, Death House
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/Jason_Hirt/808786"
class="bold">Jason Hirt</a>
</td>
<td>
Drums&nbsp;(2016-present)
</td>
</tr>
<tr class="lineupBandsRow">
<td colspan="2">
See also:
<a href="https://www.metal-archives.com/bands/Nothingness/3540457359">Nothingness</a>,
Conduit, ex-StillBreather
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/Tim_Church/676576"
class="bold">Tim Church</a>
</td>
<td>
Guitars&nbsp;(2016-present)
</td>
</tr>
<tr class="lineupBandsRow">
<td colspan="2">
See also:
<a href="https://www.metal-archives.com/bands/Stone_Marrow/3540460997">Stone
Marrow</a>, Death House, ex-Alistair Hennessey
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/John_Olivier/544828"
class="bold">John Olivier</a>
</td>
<td>
Guitars&nbsp;(2016-present)
</td>
</tr>
<tr class="lineupBandsRow">
<td colspan="2">
See also:
<a href="https://www.metal-archives.com/bands/Stone_Marrow/3540460997">Stone
Marrow</a>, Death House, ex-<a
href="https://www.metal-archives.com/bands/Lungs/3540375799">Lungs</a>,
ex-Alistair Hennessey
</td>
</tr>
<tr class="lineupHeaders">
<td colspan="2" align="right">
Past
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/%E6%9D%B0%E7%B1%B3/532538"
class="bold">杰米</a>
</td>
<td>
Bass
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/%E6%B3%B0%E5%8B%92/532539"
class="bold">泰勒</a>
</td>
<td>
Drums
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/%E5%A4%9A%E8%AF%BA%E4%B8%87/532537"
class="bold">多诺万</a>
</td>
<td>
Guitars, Vocals (backing)
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/D.J._Gunnarson/554293"
class="bold">D.J. Gunnarson</a>
</td>
<td>
Vocals
</td>
</tr>
<tr class="lineupBandsRow">
<td colspan="2">
See also:
ex-<a href="https://www.metal-archives.com/bands/Thee_Massacre/3540381468">Thee
Massacre</a>, Buried Above Ground, ex-<a
href="https://www.metal-archives.com/bands/White_Empress/3540376401">White
Empress</a></td>
</tr>
<tr class="lineupHeaders">
<td colspan="2" align="right">
Current (Live)
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/Chris_Piette/566172"
class="bold">Chris Piette</a>
</td>
<td>
Drums&nbsp;(2022-present)
</td>
</tr>
<tr class="lineupBandsRow">
<td colspan="2">
See also:
<a href="https://www.metal-archives.com/bands/Amiensus/3540347443">Amiensus</a>,
<a href="https://www.metal-archives.com/bands/Choke/3540512518">Choke</a>,
ex-<a href="https://www.metal-archives.com/bands/Thee_Massacre/3540381468">Thee
Massacre</a>, ex-McNally Smith Extreme Metal Ensemble (live)
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/Caleb_Cheslock/775455"
class="bold">Caleb Cheslock</a>
</td>
<td>
Guitars&nbsp;(2022-present)
</td>
</tr>
<tr class="lineupBandsRow">
<td colspan="2">
See also:
<a href="https://www.metal-archives.com/bands/Cavernlight/3540428434">Cavernlight</a>,
<a href="https://www.metal-archives.com/bands/Choke/3540512518">Choke</a>,
Christopher Gold and The New Old Things, Servare
</td>
</tr>
</table>
</div>
</div>
<div id="band_tab_members_current">
<div class="ui-tabs-panel-content">
<table class="display lineupTable" cellpadding="0" cellspacing="0">
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/Dennis_Mikula/532536"
class="bold">Nameless</a>
</td>
<td>
Vocals (lead), Guitars, Piano&nbsp;(2012-present)
</td>
</tr>
<tr class="lineupBandsRow">
<td colspan="2">
See also:
<a href="https://www.metal-archives.com/bands/If_I_Could_Kill_Myself/3540422793">If
I Could Kill Myself</a>, Electric Specter 電妖怪, ex-I, Apparatus
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/Josh_Jaye/825936"
class="bold">Josh Jaye</a>
</td>
<td>
Bass&nbsp;(2016-present)
</td>
</tr>
<tr class="lineupBandsRow">
<td colspan="2">
See also:
<a href="https://www.metal-archives.com/bands/Stone_Marrow/3540460997">Stone
Marrow</a>, Alistair Hennessey, Death House
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/Jason_Hirt/808786"
class="bold">Jason Hirt</a>
</td>
<td>
Drums&nbsp;(2016-present)
</td>
</tr>
<tr class="lineupBandsRow">
<td colspan="2">
See also:
<a href="https://www.metal-archives.com/bands/Nothingness/3540457359">Nothingness</a>,
Conduit, ex-StillBreather
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/Tim_Church/676576"
class="bold">Tim Church</a>
</td>
<td>
Guitars&nbsp;(2016-present)
</td>
</tr>
<tr class="lineupBandsRow">
<td colspan="2">
See also:
<a href="https://www.metal-archives.com/bands/Stone_Marrow/3540460997">Stone
Marrow</a>, Death House, ex-Alistair Hennessey
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/John_Olivier/544828"
class="bold">John Olivier</a>
</td>
<td>
Guitars&nbsp;(2016-present)
</td>
</tr>
<tr class="lineupBandsRow">
<td colspan="2">
See also:
<a href="https://www.metal-archives.com/bands/Stone_Marrow/3540460997">Stone
Marrow</a>, Death House, ex-<a
href="https://www.metal-archives.com/bands/Lungs/3540375799">Lungs</a>,
ex-Alistair Hennessey
</td>
</tr>
</table>
</div>
</div>
<div id="band_tab_members_past">
<div class="ui-tabs-panel-content">
<table class="display lineupTable" cellpadding="0" cellspacing="0">
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/%E6%9D%B0%E7%B1%B3/532538"
class="bold">杰米</a>
</td>
<td>
Bass
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/%E6%B3%B0%E5%8B%92/532539"
class="bold">泰勒</a>
</td>
<td>
Drums
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/%E5%A4%9A%E8%AF%BA%E4%B8%87/532537"
class="bold">多诺万</a>
</td>
<td>
Guitars, Vocals (backing)
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/D.J._Gunnarson/554293"
class="bold">D.J. Gunnarson</a>
</td>
<td>
Vocals
</td>
</tr>
<tr class="lineupBandsRow">
<td colspan="2">
See also:
ex-<a href="https://www.metal-archives.com/bands/Thee_Massacre/3540381468">Thee
Massacre</a>, Buried Above Ground, ex-<a
href="https://www.metal-archives.com/bands/White_Empress/3540376401">White
Empress</a></td>
</tr>
</table>
</div>
</div>
<div id="band_tab_members_live">
<div class="ui-tabs-panel-content">
<table class="display lineupTable" cellpadding="0" cellspacing="0">
<tr class="lineupHeaders">
<td colspan="2" align="right">
Current
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/Chris_Piette/566172"
class="bold">Chris Piette</a>
</td>
<td>
Drums&nbsp;(2022-present)
</td>
</tr>
<tr class="lineupBandsRow">
<td colspan="2">
See also:
<a href="https://www.metal-archives.com/bands/Amiensus/3540347443">Amiensus</a>,
<a href="https://www.metal-archives.com/bands/Choke/3540512518">Choke</a>,
ex-<a href="https://www.metal-archives.com/bands/Thee_Massacre/3540381468">Thee
Massacre</a>, ex-McNally Smith Extreme Metal Ensemble (live)
</td>
</tr>
<tr class="lineupRow">
<td width="200" valign="top">
<a href="https://www.metal-archives.com/artists/Caleb_Cheslock/775455"
class="bold">Caleb Cheslock</a>
</td>
<td>
Guitars&nbsp;(2022-present)
</td>
</tr>
<tr class="lineupBandsRow">
<td colspan="2">
See also:
<a href="https://www.metal-archives.com/bands/Cavernlight/3540428434">Cavernlight</a>,
<a href="https://www.metal-archives.com/bands/Choke/3540512518">Choke</a>,
Christopher Gold and The New Old Things, Servare
</td>
</tr>
</table>
</div>
</div>
</div>
</div>
</div>
<div id="auditTrail">
<table>
<tr>
<td>Added by: <a href="https://www.metal-archives.com/users/Teglement" class="profileMenu">Teglement</a>
</td>
<td align="right">Modified by: <a href="https://www.metal-archives.com/users/Mole666"
class="profileMenu">Mole666</a></td>
</tr>
<tr>
<td>Added on: 2013-11-06 23:08:16</td>
<td align="right">Last modified on: 2023-02-11 04:24:36</td>
</tr>
<tr>
<td valign="top">
&nbsp;
</td>
<td align="right" valign="top">
</td>
</tr>
</table>
</div>
</div>
<div id="readMoreDialog" title="Additional notes" class="displayNone"></div>
</div>
</div>
<script data-cfasync="false" src="/cdn-cgi/scripts/5c5dd728/cloudflare-static/email-decode.min.js"></script>
<script src="/cdn-cgi/scripts/7d0fa10a/cloudflare-static/rocket-loader.min.js"
data-cf-settings="8655e3a8e5105845a7a3628f-|49" defer=""></script>
</body>
</html>

View File

@ -0,0 +1,4 @@
# Metal Archives
https://www.metal-archives.com/
- [Artist page (https://www.metal-archives.com/)](artist.html)

View File

@ -300,12 +300,56 @@ class EncyclopaediaMetallum(Page):
@classmethod @classmethod
def _parse_artist_attributes(cls, artist_soup: BeautifulSoup) -> Artist: def _parse_artist_attributes(cls, artist_soup: BeautifulSoup) -> Artist:
name: str = None
country: pycountry.Countrie = None country: pycountry.Countrie = None
formed_in_year: int = None formed_in_year: int = None
genre: str = None genre: str = None
lyrical_themes: List[str] = [] lyrical_themes: List[str] = []
label_name: str = None label_name: str = None
label_url: str = None label_url: str = None
source_list: List[Source] = []
title_soup: BeautifulSoup = artist_soup.find("title")
if title_soup is not None:
bad_name_substring = " - Encyclopaedia Metallum: The Metal Archives"
title_text = title_soup.get_text()
if title_text.count(bad_name_substring) == 1:
name = title_text.replace(bad_name_substring, "")
else:
LOGGER.debug(f"the title of the page is \"{title_text}\"")
"""
TODO
Implement the band pictures and logos, which can be obtained from the elements
<a class="image" id="photo" title="Ghost Bath"...
<a class="image" id="logo" title="Ghost Bath"...
where the titles are the band name
"""
image_container_soup: BeautifulSoup = artist_soup.find(id="band_sidebar")
if image_container_soup is not None:
logo_soup = image_container_soup.find(id="logo")
if logo_soup is not None:
logo_title = logo_soup.get("title")
if logo_title is not None:
name = logo_title.strip()
band_pictures = image_container_soup.find(id="photo")
if band_pictures is not None:
band_picture_title = logo_soup.get("title")
if band_picture_title is not None:
name = band_picture_title.strip()
for h1_band_name_soup in artist_soup.find_all("h1", {"class": "band_name"}):
anchor: BeautifulSoup = h1_band_name_soup.find("a")
if anchor is None:
continue
href = anchor.get("href")
if href is not None:
source_list.append(Source(cls.SOURCE_TYPE, href))
name = anchor.get_text(strip=True)
band_stat_soup = artist_soup.find("div", {"id": "band_stats"}) band_stat_soup = artist_soup.find("div", {"id": "band_stats"})
for dl_soup in band_stat_soup.find_all("dl"): for dl_soup in band_stat_soup.find_all("dl"):
@ -353,6 +397,7 @@ class EncyclopaediaMetallum(Page):
""" """
return Artist( return Artist(
name=name,
country=country, country=country,
formed_in=ID3Timestamp(year=formed_in_year), formed_in=ID3Timestamp(year=formed_in_year),
general_genre=genre, general_genre=genre,
@ -364,12 +409,12 @@ class EncyclopaediaMetallum(Page):
Source(cls.SOURCE_TYPE, label_url) Source(cls.SOURCE_TYPE, label_url)
] ]
) )
] ],
source_list=source_list
) )
@classmethod @classmethod
def _fetch_artist_attributes(cls, url: str) -> Artist: def _fetch_artist_attributes(cls, url: str) -> Artist:
print(url)
r = cls.get_request(url) r = cls.get_request(url)
if r is None: if r is None:
return Artist() return Artist()