a bit of refactoring and implemented some more scraping
This commit is contained in:
		@@ -8,6 +8,7 @@ https://pandoc.org/installing.html
 | 
				
			|||||||
!!!!!!!!!!!!!!!!!!IMPORTANT!!!!!!!!!!!!!!!!!!
 | 
					!!!!!!!!!!!!!!!!!!IMPORTANT!!!!!!!!!!!!!!!!!!
 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class FormattedText:
 | 
					class FormattedText:
 | 
				
			||||||
    doc = None
 | 
					    doc = None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -60,6 +61,7 @@ class NotesAttributes:
 | 
				
			|||||||
    def __init__(self) -> None:
 | 
					    def __init__(self) -> None:
 | 
				
			||||||
        pass
 | 
					        pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if __name__ == "__main__":
 | 
					if __name__ == "__main__":
 | 
				
			||||||
    _plaintext = """
 | 
					    _plaintext = """
 | 
				
			||||||
World of Work
 | 
					World of Work
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -448,8 +448,37 @@ class EncyclopaediaMetallum(Page):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        soup = BeautifulSoup(r.text, 'html.parser')
 | 
					        soup = BeautifulSoup(r.text, 'html.parser')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        tracklist_soup = soup.find("table", {"class": "table_lyrics"})
 | 
					        tracklist_soup = soup.find("table", {"class": "table_lyrics"}).find("tbody")
 | 
				
			||||||
        print(tracklist_soup.prettify)
 | 
					
 | 
				
			||||||
 | 
					        for row in tracklist_soup.find_all("tr", {"class": ["even", "odd"]}):
 | 
				
			||||||
 | 
					            """
 | 
				
			||||||
 | 
					            example of row:
 | 
				
			||||||
 | 
					                        
 | 
				
			||||||
 | 
					            <tr class="even">
 | 
				
			||||||
 | 
					                <td width="20"><a class="anchor" name="5948442"> </a>1.</td>        # id and tracksort
 | 
				
			||||||
 | 
					                <td class="wrapWords">Convince Me to Bleed</td>                     # name
 | 
				
			||||||
 | 
					                <td align="right">03:40</td>                                        # length
 | 
				
			||||||
 | 
					                <td nowrap="nowrap"> 
 | 
				
			||||||
 | 
					                <a href="#5948442" id="lyricsButton5948442" onclick="toggleLyrics('5948442'); return false;">Show lyrics</a>
 | 
				
			||||||
 | 
					                </td>
 | 
				
			||||||
 | 
					            </tr>
 | 
				
			||||||
 | 
					            """
 | 
				
			||||||
 | 
					            row_list = row.find_all(recursive=False)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            track_sort_soup = row_list[0]
 | 
				
			||||||
 | 
					            track_sort = int(track_sort_soup.text[:-1])
 | 
				
			||||||
 | 
					            track_id = track_sort_soup.find("a").get("name")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            title = row_list[1].text.strip()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            duration_stamp = row_list[2].text
 | 
				
			||||||
 | 
					            minutes, seconds = duration_stamp.split(":")
 | 
				
			||||||
 | 
					            duration_in_seconds = int(minutes) * 60 + int(seconds)
 | 
				
			||||||
 | 
					            print(track_sort, track_id)
 | 
				
			||||||
 | 
					            print(title)
 | 
				
			||||||
 | 
					            print(duration_in_seconds)
 | 
				
			||||||
 | 
					            print("-"*20)
 | 
				
			||||||
 | 
					            # print(row)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        return album
 | 
					        return album
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user