Did a bit of refactoring and implemented some more scraping

This commit is contained in:
Hellow 2023-02-06 23:47:13 +01:00
parent 8362e2d5eb
commit ec9bb69069
2 changed files with 40 additions and 9 deletions

View File

@ -8,15 +8,16 @@ https://pandoc.org/installing.html
!!!!!!!!!!!!!!!!!!IMPORTANT!!!!!!!!!!!!!!!!!! !!!!!!!!!!!!!!!!!!IMPORTANT!!!!!!!!!!!!!!!!!!
""" """
class FormattedText: class FormattedText:
doc = None doc = None
def __init__( def __init__(
self, self,
plaintext: str = None, plaintext: str = None,
markdown: str = None, markdown: str = None,
html: str = None html: str = None
) -> None: ) -> None:
self.set_plaintext(plaintext) self.set_plaintext(plaintext)
self.set_markdown(markdown) self.set_markdown(markdown)
self.set_html(html) self.set_html(html)
@ -46,7 +47,7 @@ class FormattedText:
return None return None
return pandoc.write(self.doc, format="html").strip() return pandoc.write(self.doc, format="html").strip()
def get_plaintext(self)-> str: def get_plaintext(self) -> str:
if self.doc is None: if self.doc is None:
return None return None
return pandoc.write(self.doc, format="plain").strip() return pandoc.write(self.doc, format="plain").strip()
@ -60,6 +61,7 @@ class NotesAttributes:
def __init__(self) -> None: def __init__(self) -> None:
pass pass
if __name__ == "__main__": if __name__ == "__main__":
_plaintext = """ _plaintext = """
World of Work World of Work
@ -118,4 +120,4 @@ According to the band, "Ghost Bath" refers to "the act of committing suicide by
# print("-"*30) # print("-"*30)
# print(notes.get_markdown()) # print(notes.get_markdown())
print(notes.get_markdown()) print(notes.get_markdown())

View File

@ -448,8 +448,37 @@ class EncyclopaediaMetallum(Page):
soup = BeautifulSoup(r.text, 'html.parser') soup = BeautifulSoup(r.text, 'html.parser')
tracklist_soup = soup.find("table", {"class": "table_lyrics"}) tracklist_soup = soup.find("table", {"class": "table_lyrics"}).find("tbody")
print(tracklist_soup.prettify)
for row in tracklist_soup.find_all("tr", {"class": ["even", "odd"]}):
"""
example of row:
<tr class="even">
<td width="20"><a class="anchor" name="5948442"> </a>1.</td> # id and tracksort
<td class="wrapWords">Convince Me to Bleed</td> # name
<td align="right">03:40</td> # length
<td nowrap="nowrap"> 
<a href="#5948442" id="lyricsButton5948442" onclick="toggleLyrics('5948442'); return false;">Show lyrics</a>
</td>
</tr>
"""
row_list = row.find_all(recursive=False)
track_sort_soup = row_list[0]
track_sort = int(track_sort_soup.text[:-1])
track_id = track_sort_soup.find("a").get("name")
title = row_list[1].text.strip()
duration_stamp = row_list[2].text
minutes, seconds = duration_stamp.split(":")
duration_in_seconds = int(minutes) * 60 + int(seconds)
print(track_sort, track_id)
print(title)
print(duration_in_seconds)
print("-"*20)
# print(row)
return album return album