much progress

This commit is contained in:
Hellow2 2023-05-24 08:50:56 +02:00
parent 51770ec0a9
commit 0fd743e81f
2 changed files with 87 additions and 102 deletions

View File

@ -29,6 +29,7 @@ from ..utils.support_classes import Query, DownloadResult, DefaultTarget
INDEPENDENT_DB_OBJECTS = Union[Label, Album, Artist, Song] INDEPENDENT_DB_OBJECTS = Union[Label, Album, Artist, Song]
INDEPENDENT_DB_TYPES = Union[Type[Song], Type[Album], Type[Artist], Type[Label]] INDEPENDENT_DB_TYPES = Union[Type[Song], Type[Album], Type[Artist], Type[Label]]
def _clean_music_object(music_object: INDEPENDENT_DB_OBJECTS, collections: Dict[INDEPENDENT_DB_TYPES, Collection]): def _clean_music_object(music_object: INDEPENDENT_DB_OBJECTS, collections: Dict[INDEPENDENT_DB_TYPES, Collection]):
if type(music_object) == Label: if type(music_object) == Label:
return _clean_label(label=music_object, collections=collections) return _clean_label(label=music_object, collections=collections)
@ -105,8 +106,10 @@ class Page(threading.Thread):
def run(self) -> None:
    """Thread entry point; the implementation shown here does nothing."""
    pass
def get_source_type(self, source: Source) -> Optional[INDEPENDENT_DB_TYPES]:
    """
    Report which kind of music object ``source`` refers to.

    This implementation ignores ``source`` and always returns ``None``.

    :param source: the source whose target type is wanted
    :return: one of the classes in ``INDEPENDENT_DB_TYPES``, or ``None`` when the type is unknown
    """
    return None
def get_soup_from_response(self, r: requests.Response) -> BeautifulSoup:
    """
    Parse the raw body of a response into a ``BeautifulSoup`` tree.

    :param r: the response whose ``content`` is parsed
    :return: the tree produced by the ``"html.parser"`` backend
    """
    return BeautifulSoup(r.content, "html.parser")
# to search stuff # to search stuff
@ -168,7 +171,7 @@ class Page(threading.Thread):
new_music_object: DatabaseObject = type(music_object)() new_music_object: DatabaseObject = type(music_object)()
# only certain database objects, have a source list # only certain database objects, have a source list
if isinstance(music_object, INDEPENDENT_DB_TYPES): if isinstance(music_object, INDEPENDENT_DB_OBJECTS):
source: Source source: Source
for source in music_object.source_collection.get_sources_from_page(self.SOURCE_TYPE): for source in music_object.source_collection.get_sources_from_page(self.SOURCE_TYPE):
new_music_object.merge( new_music_object.merge(
@ -181,34 +184,43 @@ class Page(threading.Thread):
return music_object return music_object
def fetch_object_from_source(
    self,
    source: Source,
    stop_at_level: int = 2,
    enforce_type: Optional[Type[DatabaseObject]] = None,
    post_process: bool = True,
) -> Optional[DatabaseObject]:
    """
    Fetch the music object that ``source`` points to.

    The object type is taken from ``self.get_source_type(source)`` and the
    matching ``fetch_song`` / ``fetch_album`` / ``fetch_artist`` /
    ``fetch_label`` method is called with ``source`` and ``stop_at_level``.

    :param source: the source to fetch from
    :param stop_at_level: passed through unchanged to the fetch method
    :param enforce_type: when given, the detected type has to equal it,
        otherwise a warning is logged and nothing is fetched
    :param post_process: when true, a fetched object is passed through
        ``post_process_object`` before being returned
    :return: the fetched object, or ``None`` when the type is unknown,
        does not equal ``enforce_type``, or has no fetch method
    """
    obj_type = self.get_source_type(source)
    if obj_type is None:
        return None

    if enforce_type is not None and enforce_type != obj_type:
        self.LOGGER.warning(f"Object type isn't type to enforce: {enforce_type}, {obj_type}")
        return None

    fetch_map = {
        Song: self.fetch_song,
        Album: self.fetch_album,
        Artist: self.fetch_artist,
        Label: self.fetch_label,
    }

    fetch = fetch_map.get(obj_type)
    if fetch is None:
        return None

    music_object = fetch(source, stop_at_level)
    if post_process and music_object is not None:
        return post_process_object(music_object)

    return music_object
def fetch_song(self, source: Source, stop_at_level: int = 1) -> Song:
    """
    Base implementation of the song fetcher.

    Both arguments are ignored here; an empty ``Song`` is returned.
    ``EncyclopaediaMetallum`` provides its own version.

    :param source: the source to fetch the song from
    :param stop_at_level: unused in this implementation
    :return: a new, empty ``Song``
    """
    return Song()
def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album:
    """
    Base implementation of the album fetcher.

    Both arguments are ignored here; an empty ``Album`` is returned.
    ``EncyclopaediaMetallum`` provides its own version.

    :param source: the source to fetch the album from
    :param stop_at_level: unused in this implementation
    :return: a new, empty ``Album``
    """
    return Album()
def fetch_artist(self, source: Source, stop_at_level: int = 1) -> Artist:
    """
    Base implementation of the artist fetcher.

    Both arguments are ignored here; an empty ``Artist`` is returned.
    ``EncyclopaediaMetallum`` provides its own version.

    :param source: the source to fetch the artist from
    :param stop_at_level: unused in this implementation
    :return: a new, empty ``Artist``
    """
    return Artist()
def fetch_label(self, source: Source, stop_at_level: int = 1) -> Label:
    """
    Base implementation of the label fetcher.

    Both arguments are ignored here; an empty ``Label`` is returned.

    :param source: the source to fetch the label from
    :param stop_at_level: unused in this implementation
    :return: a new, empty ``Label``
    """
    return Label()
@classmethod @classmethod
def download( def download(
@ -461,26 +473,6 @@ class Page(threading.Thread):
return r return r
@classmethod
def _fetch_song_from_source(cls, source: Source, stop_at_level: int = 1) -> Song:
return Song()
@classmethod
def _fetch_album_from_source(cls, source: Source, stop_at_level: int = 1) -> Album:
return Album()
@classmethod
def _fetch_artist_from_source(cls, source: Source, stop_at_level: int = 1) -> Artist:
return Artist()
@classmethod
def _fetch_label_from_source(cls, source: Source, stop_at_level: int = 1) -> Label:
return Label()
@classmethod
def _get_type_of_url(cls, url: str) -> Optional[Union[Type[Song], Type[Album], Type[Artist], Type[Label]]]:
return None
@classmethod
def _download_song_to_targets(cls, source: Source, target: Target, desc: Optional[str] = None) -> DownloadResult:
    """
    Base implementation of the per-song download step.

    All arguments are ignored here; an empty ``DownloadResult`` is returned.

    :param source: the source the song would be downloaded from
    :param target: the target the song would be written to
    :param desc: optional description; unused in this implementation
    :return: a new, empty ``DownloadResult``
    """
    return DownloadResult()

View File

@ -218,16 +218,14 @@ class EncyclopaediaMetallum(Page):
for raw_artist in r.json()['aaData'] for raw_artist in r.json()['aaData']
] ]
def _fetch_artist_discography(self, ma_artist_id: str) -> List[Album]:
@classmethod
def _fetch_artist_discography(cls, ma_artist_id: str) -> List[Album]:
discography_url = "https://www.metal-archives.com/band/discography/id/{}/tab/all" discography_url = "https://www.metal-archives.com/band/discography/id/{}/tab/all"
# make the request # make the request
r = cls.CONNECTION.get(discography_url.format(ma_artist_id)) r = self.connection.get(discography_url.format(ma_artist_id))
if r is None: if r is None:
return [] return []
soup = cls.get_soup_from_response(r) soup = self.get_soup_from_response(r)
discography = [] discography = []
@ -251,21 +249,20 @@ class EncyclopaediaMetallum(Page):
Album( Album(
title=album_name, title=album_name,
date=date_obj, date=date_obj,
album_type=cls.ALBUM_TYPE_MAP[raw_album_type], album_type=ALBUM_TYPE_MAP[raw_album_type],
source_list=[Source(SourcePages.ENCYCLOPAEDIA_METALLUM, album_url)] source_list=[Source(self.SOURCE_TYPE, album_url)]
) )
) )
return discography return discography
@classmethod def _fetch_artist_sources(self, ma_artist_id: str) -> List[Source]:
def _fetch_artist_sources(cls, ma_artist_id: str) -> List[Source]:
sources_url = "https://www.metal-archives.com/link/ajax-list/type/band/id/{}" sources_url = "https://www.metal-archives.com/link/ajax-list/type/band/id/{}"
r = cls.CONNECTION.get(sources_url.format(ma_artist_id)) r = self.connection.get(sources_url.format(ma_artist_id))
if r is None: if r is None:
return [] return []
soup = cls.get_soup_from_response(r) soup = self.get_soup_from_response(r)
if soup.find("span", {"id": "noLinks"}) is not None: if soup.find("span", {"id": "noLinks"}) is not None:
return [] return []
@ -289,12 +286,11 @@ class EncyclopaediaMetallum(Page):
if url is None: if url is None:
continue continue
source_list.append(Source.match_url(url, referer_page=cls.SOURCE_TYPE)) source_list.append(Source.match_url(url, referer_page=self.SOURCE_TYPE))
return source_list return source_list
@classmethod def _parse_artist_attributes(self, artist_soup: BeautifulSoup) -> Artist:
def _parse_artist_attributes(cls, artist_soup: BeautifulSoup) -> Artist:
name: str = None name: str = None
country: pycountry.Countrie = None country: pycountry.Countrie = None
formed_in_year: int = None formed_in_year: int = None
@ -311,7 +307,7 @@ class EncyclopaediaMetallum(Page):
if title_text.count(bad_name_substring) == 1: if title_text.count(bad_name_substring) == 1:
name = title_text.replace(bad_name_substring, "") name = title_text.replace(bad_name_substring, "")
else: else:
cls.LOGGER.debug(f"the title of the page is \"{title_text}\"") self.LOGGER.debug(f"the title of the page is \"{title_text}\"")
""" """
TODO TODO
@ -341,7 +337,7 @@ class EncyclopaediaMetallum(Page):
href = anchor.get("href") href = anchor.get("href")
if href is not None: if href is not None:
source_list.append(Source(cls.SOURCE_TYPE, href)) source_list.append(Source(self.SOURCE_TYPE, href))
name = anchor.get_text(strip=True) name = anchor.get_text(strip=True)
@ -400,35 +396,32 @@ class EncyclopaediaMetallum(Page):
Label( Label(
name=label_name, name=label_name,
source_list=[ source_list=[
Source(cls.SOURCE_TYPE, label_url) Source(self.SOURCE_TYPE, label_url)
] ]
) )
], ],
source_list=source_list source_list=source_list
) )
def _fetch_artist_attributes(self, url: str) -> Artist:
    """
    Download an artist page and parse it into an ``Artist``.

    :param url: address of the artist page
    :return: the result of ``_parse_artist_attributes`` on the downloaded
        page, or an empty ``Artist`` when the connection returned ``None``
    """
    r = self.connection.get(url)
    if r is None:
        return Artist()

    soup: BeautifulSoup = self.get_soup_from_response(r)
    return self._parse_artist_attributes(artist_soup=soup)
def _fetch_band_notes(self, ma_artist_id: str) -> Optional[FormattedText]:
    """
    Download the "read more" notes of a band.

    :param ma_artist_id: band id, inserted into the read-more endpoint
    :return: ``FormattedText`` built from the response body as html, or an
        empty ``FormattedText`` when the connection returned ``None``
    """
    endpoint = "https://www.metal-archives.com/band/read-more/id/{}"

    # make the request
    r = self.connection.get(endpoint.format(ma_artist_id))
    if r is None:
        return FormattedText()

    return FormattedText(html=r.text)
def fetch_artist(self, source: Source, stop_at_level: int = 1) -> Artist:
    """
    Fetch an artist (band) together with its sources, notes and discography.

    What it could fetch, and what is implemented:

    [x] band notes: https://www.metal-archives.com/band/read-more/id/3540372489

    The artist id is the last ``/``-separated segment of ``source.url``.

    :param source: source pointing at the artist page
    :param stop_at_level: when greater than 1, every album of the
        discography is additionally fetched through ``fetch_album`` with
        ``stop_at_level - 1`` and merged into the discography entry
    :return: the populated ``Artist``
    """
    artist = self._fetch_artist_attributes(source.url)

    artist_id = source.url.split("/")[-1]

    artist_sources = self._fetch_artist_sources(artist_id)
    artist.source_collection.extend(artist_sources)

    band_notes = self._fetch_band_notes(artist_id)
    if band_notes is not None:
        artist.notes = band_notes

    discography: List[Album] = self._fetch_artist_discography(artist_id)
    if stop_at_level > 1:
        for album in discography:
            # a separate name keeps the ``source`` parameter from being shadowed
            for album_source in album.source_collection.get_sources_from_page(self.SOURCE_TYPE):
                # the album fetcher is called ``fetch_album``; the former
                # ``_fetch_album_from_source`` name no longer exists
                album.merge(self.fetch_album(album_source, stop_at_level=stop_at_level - 1))
    artist.main_album_collection.extend(discography)

    return artist
@classmethod def _parse_album_track_row(self, track_row: BeautifulSoup) -> Song:
def _parse_album_track_row(cls, track_row: BeautifulSoup) -> Song:
""" """
<tr class="even"> <tr class="even">
<td width="20"><a class="anchor" name="5948442"> </a>1.</td> # id and tracksort <td width="20"><a class="anchor" name="5948442"> </a>1.</td> # id and tracksort
@ -482,7 +474,7 @@ class EncyclopaediaMetallum(Page):
track_id = track_sort_soup.find("a").get("name").strip() track_id = track_sort_soup.find("a").get("name").strip()
if track_row.find("a", {"href": f"#{track_id}"}) is not None: if track_row.find("a", {"href": f"#{track_id}"}) is not None:
source_list.append(Source(cls.SOURCE_TYPE, track_id)) source_list.append(Source(self.SOURCE_TYPE, track_id))
title = row_list[1].text.strip() title = row_list[1].text.strip()
@ -500,9 +492,7 @@ class EncyclopaediaMetallum(Page):
source_list=source_list source_list=source_list
) )
def _parse_album_attributes(self, album_soup: BeautifulSoup, stop_at_level: int = 1) -> Album:
@classmethod
def _parse_album_attributes(cls, album_soup: BeautifulSoup, stop_at_level: int = 1) -> Album:
tracklist: List[Song] = [] tracklist: List[Song] = []
artist_list = [] artist_list = []
album_name: str = None album_name: str = None
@ -522,12 +512,12 @@ class EncyclopaediaMetallum(Page):
href = anchor.get("href") href = anchor.get("href")
if href is not None: if href is not None:
source_list.append(Source(cls.SOURCE_TYPE, href.strip())) source_list.append(Source(self.SOURCE_TYPE, href.strip()))
album_name = anchor.get_text(strip=True) album_name = anchor.get_text(strip=True)
elif len(album_soup_list) > 1: elif len(album_soup_list) > 1:
cls.LOGGER.debug("there are more than 1 album soups") self.LOGGER.debug("there are more than 1 album soups")
artist_soup_list = album_info_soup.find_all("h2", {"class": "band_name"}) artist_soup_list = album_info_soup.find_all("h2", {"class": "band_name"})
@ -537,7 +527,7 @@ class EncyclopaediaMetallum(Page):
href = anchor.get("href") href = anchor.get("href")
if href is not None: if href is not None:
artist_sources.append(Source(cls.SOURCE_TYPE, href.strip())) artist_sources.append(Source(self.SOURCE_TYPE, href.strip()))
artist_name = anchor.get_text(strip=True) artist_name = anchor.get_text(strip=True)
@ -547,13 +537,13 @@ class EncyclopaediaMetallum(Page):
)) ))
elif len(artist_soup_list) > 1: elif len(artist_soup_list) > 1:
cls.LOGGER.debug("there are more than 1 artist soups") self.LOGGER.debug("there are more than 1 artist soups")
_parse_album_info(album_info_soup=album_soup.find(id="album_info")) _parse_album_info(album_info_soup=album_soup.find(id="album_info"))
tracklist_soup = album_soup.find("table", {"class": "table_lyrics"}).find("tbody") tracklist_soup = album_soup.find("table", {"class": "table_lyrics"}).find("tbody")
for track_soup in tracklist_soup.find_all("tr", {"class": ["even", "odd"]}): for track_soup in tracklist_soup.find_all("tr", {"class": ["even", "odd"]}):
tracklist.append(cls._parse_album_track_row(track_row=track_soup)) tracklist.append(self._parse_album_track_row(track_row=track_soup))
return Album( return Album(
title=album_name, title=album_name,
@ -562,8 +552,7 @@ class EncyclopaediaMetallum(Page):
song_list=tracklist song_list=tracklist
) )
def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album:
    """
    Fetch an album from its page.

    I am preeeety sure I can get way more data than... nothing from there

    :param source: source pointing at the album page
    :param stop_at_level: forwarded to ``_parse_album_attributes``; when
        greater than 1, every song of the album is additionally fetched
        through ``fetch_song`` with ``stop_at_level - 1`` and merged in
    :return: the parsed ``Album``, or an empty ``Album`` when the
        connection returned ``None``
    """

    # <table class="display table_lyrics

    r = self.connection.get(source.url)
    if r is None:
        return Album()

    soup = self.get_soup_from_response(r)

    album = self._parse_album_attributes(soup, stop_at_level=stop_at_level)

    if stop_at_level > 1:
        for song in album.song_collection:
            # a separate name keeps the ``source`` parameter from being shadowed
            for song_source in song.source_collection.get_sources_from_page(self.SOURCE_TYPE):
                # the song fetcher is called ``fetch_song``; the former
                # ``_fetch_song_from_source`` name no longer exists
                song.merge(self.fetch_song(source=song_source, stop_at_level=stop_at_level - 1))

    return album
@classmethod def _fetch_lyrics(self, song_id: str) -> Optional[Lyrics]:
def _fetch_lyrics(cls, song_id: str) -> Optional[Lyrics]:
""" """
function toggleLyrics(songId) { function toggleLyrics(songId) {
var lyricsRow = $('#song' + songId); var lyricsRow = $('#song' + songId);
@ -614,7 +602,7 @@ class EncyclopaediaMetallum(Page):
endpoint = "https://www.metal-archives.com/release/ajax-view-lyrics/id/{id}".format(id=song_id) endpoint = "https://www.metal-archives.com/release/ajax-view-lyrics/id/{id}".format(id=song_id)
r = cls.CONNECTION.get(endpoint) r = self.connection.get(endpoint)
if r is None: if r is None:
return None return None
@ -622,22 +610,27 @@ class EncyclopaediaMetallum(Page):
text=FormattedText(html=r.text), text=FormattedText(html=r.text),
language=pycountry.languages.get(alpha_2="en"), language=pycountry.languages.get(alpha_2="en"),
source_list=[ source_list=[
Source(cls.SOURCE_TYPE, endpoint) Source(self.SOURCE_TYPE, endpoint)
] ]
) )
def fetch_song(self, source: Source, stop_at_level: int = 1) -> Song:
    """
    Fetch a song; the only data retrieved is its lyrics.

    :param source: song source; its ``url`` is passed to ``_fetch_lyrics``
        as the song id
    :param stop_at_level: unused in this implementation
    :return: a ``Song`` holding the fetched lyrics, or no lyrics at all
        when ``_fetch_lyrics`` returned ``None``
    """
    song_id = source.url

    # _fetch_lyrics returns None when the request fails; keep that None out
    # of the lyrics list instead of storing it as an entry
    lyrics = self._fetch_lyrics(song_id=song_id)
    lyrics_list = [] if lyrics is None else [lyrics]

    return Song(lyrics_list=lyrics_list)
@classmethod def get_source_type(self, source: Source):
def _get_type_of_url(cls, url: str) -> Optional[Union[Type[Song], Type[Album], Type[Artist], Type[Label]]]: if self.SOURCE_TYPE != source.page_enum:
return None
url = source.url
if url is None:
return None
parsed_url = urlparse(url) parsed_url = urlparse(url)
path: List[str] = parsed_url.path.split("/") path: List[str] = parsed_url.path.split("/")