a bit of refactoring and implemented some more scraping
This commit is contained in:
parent
8362e2d5eb
commit
ec9bb69069
@ -8,15 +8,16 @@ https://pandoc.org/installing.html
|
|||||||
!!!!!!!!!!!!!!!!!!IMPORTANT!!!!!!!!!!!!!!!!!!
|
!!!!!!!!!!!!!!!!!!IMPORTANT!!!!!!!!!!!!!!!!!!
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
class FormattedText:
|
class FormattedText:
|
||||||
doc = None
|
doc = None
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
plaintext: str = None,
|
plaintext: str = None,
|
||||||
markdown: str = None,
|
markdown: str = None,
|
||||||
html: str = None
|
html: str = None
|
||||||
) -> None:
|
) -> None:
|
||||||
self.set_plaintext(plaintext)
|
self.set_plaintext(plaintext)
|
||||||
self.set_markdown(markdown)
|
self.set_markdown(markdown)
|
||||||
self.set_html(html)
|
self.set_html(html)
|
||||||
@ -46,7 +47,7 @@ class FormattedText:
|
|||||||
return None
|
return None
|
||||||
return pandoc.write(self.doc, format="html").strip()
|
return pandoc.write(self.doc, format="html").strip()
|
||||||
|
|
||||||
def get_plaintext(self)-> str:
|
def get_plaintext(self) -> str:
|
||||||
if self.doc is None:
|
if self.doc is None:
|
||||||
return None
|
return None
|
||||||
return pandoc.write(self.doc, format="plain").strip()
|
return pandoc.write(self.doc, format="plain").strip()
|
||||||
@ -60,6 +61,7 @@ class NotesAttributes:
|
|||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
_plaintext = """
|
_plaintext = """
|
||||||
World of Work
|
World of Work
|
||||||
@ -118,4 +120,4 @@ According to the band, "Ghost Bath" refers to "the act of committing suicide by
|
|||||||
# print("-"*30)
|
# print("-"*30)
|
||||||
# print(notes.get_markdown())
|
# print(notes.get_markdown())
|
||||||
|
|
||||||
print(notes.get_markdown())
|
print(notes.get_markdown())
|
||||||
|
@ -448,8 +448,37 @@ class EncyclopaediaMetallum(Page):
|
|||||||
|
|
||||||
soup = BeautifulSoup(r.text, 'html.parser')
|
soup = BeautifulSoup(r.text, 'html.parser')
|
||||||
|
|
||||||
tracklist_soup = soup.find("table", {"class": "table_lyrics"})
|
tracklist_soup = soup.find("table", {"class": "table_lyrics"}).find("tbody")
|
||||||
print(tracklist_soup.prettify)
|
|
||||||
|
for row in tracklist_soup.find_all("tr", {"class": ["even", "odd"]}):
|
||||||
|
"""
|
||||||
|
example of row:
|
||||||
|
|
||||||
|
<tr class="even">
|
||||||
|
<td width="20"><a class="anchor" name="5948442"> </a>1.</td> # id and tracksort
|
||||||
|
<td class="wrapWords">Convince Me to Bleed</td> # name
|
||||||
|
<td align="right">03:40</td> # length
|
||||||
|
<td nowrap="nowrap">
|
||||||
|
<a href="#5948442" id="lyricsButton5948442" onclick="toggleLyrics('5948442'); return false;">Show lyrics</a>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
"""
|
||||||
|
row_list = row.find_all(recursive=False)
|
||||||
|
|
||||||
|
track_sort_soup = row_list[0]
|
||||||
|
track_sort = int(track_sort_soup.text[:-1])
|
||||||
|
track_id = track_sort_soup.find("a").get("name")
|
||||||
|
|
||||||
|
title = row_list[1].text.strip()
|
||||||
|
|
||||||
|
duration_stamp = row_list[2].text
|
||||||
|
minutes, seconds = duration_stamp.split(":")
|
||||||
|
duration_in_seconds = int(minutes) * 60 + int(seconds)
|
||||||
|
print(track_sort, track_id)
|
||||||
|
print(title)
|
||||||
|
print(duration_in_seconds)
|
||||||
|
print("-"*20)
|
||||||
|
# print(row)
|
||||||
|
|
||||||
return album
|
return album
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user