much progress

This commit is contained in:
Hellow2 2023-05-24 08:50:56 +02:00
parent 51770ec0a9
commit 0fd743e81f
2 changed files with 87 additions and 102 deletions

View File

@ -29,6 +29,7 @@ from ..utils.support_classes import Query, DownloadResult, DefaultTarget
INDEPENDENT_DB_OBJECTS = Union[Label, Album, Artist, Song]
INDEPENDENT_DB_TYPES = Union[Type[Song], Type[Album], Type[Artist], Type[Label]]
def _clean_music_object(music_object: INDEPENDENT_DB_OBJECTS, collections: Dict[INDEPENDENT_DB_TYPES, Collection]):
if type(music_object) == Label:
return _clean_label(label=music_object, collections=collections)
@ -104,9 +105,11 @@ class Page(threading.Thread):
# Thread entry point for Page (a threading.Thread subclass). The body is
# only `pass`, so running this base page as a thread performs no work.
def run(self) -> None:
pass
# Default source classifier: this base implementation returns None for every
# source, i.e. it recognises no type. fetch_object_from_source() returns None
# early when this method hands back None.
def get_source_type(self, source: Source) -> Optional[INDEPENDENT_DB_TYPES]:
return None
# Builds a BeautifulSoup tree from `r.content`, passing "html.parser" as the
# parser argument.
# NOTE(review): this span comes from a rendered diff whose +/- markers were
# stripped, so two signatures are listed back to back (a `cls` form and a
# `self` form). If the `@classmethod` decorator is still applied to the
# `self` form, `self` would be bound to the class, not an instance -- confirm
# the decorator was dropped together with the `cls` signature.
@classmethod
def get_soup_from_response(cls, r: requests.Response) -> BeautifulSoup:
def get_soup_from_response(self, r: requests.Response) -> BeautifulSoup:
return BeautifulSoup(r.content, "html.parser")
# to search stuff
@ -168,7 +171,7 @@ class Page(threading.Thread):
new_music_object: DatabaseObject = type(music_object)()
# only certain database objects, have a source list
if isinstance(music_object, INDEPENDENT_DB_TYPES):
if isinstance(music_object, INDEPENDENT_DB_OBJECTS):
source: Source
for source in music_object.source_collection.get_sources_from_page(self.SOURCE_TYPE):
new_music_object.merge(
@ -181,34 +184,43 @@ class Page(threading.Thread):
return music_object
# Fetch the music object that `source` points at.
#
# The object type is detected first; the method returns None when no type is
# detected, or when `enforce_type` is given and differs from the detected
# type (a warning is logged in that case). Otherwise the fetcher is looked up
# in `fetch_map` and called with (source, stop_at_level). The result is passed
# through post_process_object() only when `post_process` is true and a result
# exists.
#
# NOTE(review): this span comes from a rendered diff whose +/- markers were
# stripped; two forms of the signature, of the type lookup, of the dispatch
# and of the `post_process` test are listed back to back.
def fetch_object_from_source(self, source: Source, stop_at_level: int = 2, enforce_type: Type[DatabaseObject] = None, post_process: bool = True) -> DatabaseObject:
obj_type = self._get_type_of_url(source.url)
def fetch_object_from_source(self, source: Source, stop_at_level: int = 2, enforce_type: Type[DatabaseObject] = None, post_process: bool = True) -> Optional[DatabaseObject]:
obj_type = self.get_source_type(source)
if obj_type is None:
return None
# A mismatch only matters when the caller actually asked for a type.
if enforce_type != obj_type and enforce_type is not None:
self.LOGGER.warning(f"Object type isn't type to enforce: {enforce_type}, {obj_type}")
return None
# Stays None when obj_type has no entry in fetch_map below.
music_object: DatabaseObject = None
fetch_map = {
Song: self.fetch_song,
Album: self.fetch_album,
Artist: self.fetch_artist,
Label: self.fetch_label
}
if obj_type in fetch_map:
music_object = fetch_map[obj_type](source, stop_at_level)
# NOTE(review): `cls` is not a parameter of either signature above (both
# take `self`), so this call form cannot resolve `cls` from the visible code.
music_object = cls._fetch_object_from_source(source=source, obj_type=obj_type, stop_at_level=stop_at_level)
if post_process:
if post_process and music_object is not None:
return post_process_object(music_object)
return music_object
# NOTE(review): this span comes from a rendered diff whose +/- markers were
# stripped. Four default fetch hooks (fetch_song, fetch_album, fetch_artist,
# fetch_label) are interleaved with the body of the classmethod dispatcher
# `_fetch_object_from_source`.
#
# Each fetch_* default ignores its arguments and returns an object of its
# type constructed with no arguments.
def fetch_song(self, source: Source, stop_at_level: int = 1) -> Song:
return Song()
# Dispatcher: compares `obj_type` against Artist, Song, Album and Label and
# forwards to the matching cls._fetch_*_from_source(source, stop_at_level).
# No branch handles any other value, so the function then falls off the end
# and returns None implicitly.
@classmethod
def _fetch_object_from_source(cls, source: Source,
obj_type: Union[Type[Song], Type[Album], Type[Artist], Type[Label]],
stop_at_level: int = 1) -> Union[Song, Album, Artist, Label]:
if obj_type == Artist:
return cls._fetch_artist_from_source(source=source, stop_at_level=stop_at_level)
def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album:
return Album()
if obj_type == Song:
return cls._fetch_song_from_source(source=source, stop_at_level=stop_at_level)
if obj_type == Album:
return cls._fetch_album_from_source(source=source, stop_at_level=stop_at_level)
if obj_type == Label:
return cls._fetch_label_from_source(source=source, stop_at_level=stop_at_level)
def fetch_artist(self, source: Source, stop_at_level: int = 1) -> Artist:
return Artist()
def fetch_label(self, source: Source, stop_at_level: int = 1) -> Label:
return Label()
@classmethod
def download(
@ -461,26 +473,6 @@ class Page(threading.Thread):
return r
# Default song fetcher: ignores `source` and `stop_at_level` and returns a
# Song constructed with no arguments.
# NOTE(review): judging by the hunk header line counts (26 old lines, 6 new),
# this stub appears to be deleted by the commit -- confirm against the repo.
@classmethod
def _fetch_song_from_source(cls, source: Source, stop_at_level: int = 1) -> Song:
return Song()
# Default album fetcher: ignores `source` and `stop_at_level` and returns an
# Album constructed with no arguments.
# NOTE(review): judging by the hunk header line counts (26 old lines, 6 new),
# this stub appears to be deleted by the commit -- confirm against the repo.
@classmethod
def _fetch_album_from_source(cls, source: Source, stop_at_level: int = 1) -> Album:
return Album()
# Default artist fetcher: ignores `source` and `stop_at_level` and returns an
# Artist constructed with no arguments.
# NOTE(review): judging by the hunk header line counts (26 old lines, 6 new),
# this stub appears to be deleted by the commit -- confirm against the repo.
@classmethod
def _fetch_artist_from_source(cls, source: Source, stop_at_level: int = 1) -> Artist:
return Artist()
# Default label fetcher: ignores `source` and `stop_at_level` and returns a
# Label constructed with no arguments.
# NOTE(review): judging by the hunk header line counts (26 old lines, 6 new),
# this stub appears to be deleted by the commit -- confirm against the repo.
@classmethod
def _fetch_label_from_source(cls, source: Source, stop_at_level: int = 1) -> Label:
return Label()
# Default URL classifier: returns None for every url, i.e. no type is
# recognised.
# NOTE(review): judging by the hunk header line counts (26 old lines, 6 new),
# this stub appears to be deleted by the commit, with get_source_type(source)
# taking over its role -- confirm against the repo.
@classmethod
def _get_type_of_url(cls, url: str) -> Optional[Union[Type[Song], Type[Album], Type[Artist], Type[Label]]]:
return None
# Default download hook: ignores `source`, `target` and `desc` and returns a
# DownloadResult constructed with no arguments.
@classmethod
def _download_song_to_targets(cls, source: Source, target: Target, desc: str = None) -> DownloadResult:
return DownloadResult()

View File

@ -218,16 +218,14 @@ class EncyclopaediaMetallum(Page):
for raw_artist in r.json()['aaData']
]
@classmethod
def _fetch_artist_discography(cls, ma_artist_id: str) -> List[Album]:
def _fetch_artist_discography(self, ma_artist_id: str) -> List[Album]:
discography_url = "https://www.metal-archives.com/band/discography/id/{}/tab/all"
# make the request
r = cls.CONNECTION.get(discography_url.format(ma_artist_id))
r = self.connection.get(discography_url.format(ma_artist_id))
if r is None:
return []
soup = cls.get_soup_from_response(r)
soup = self.get_soup_from_response(r)
discography = []
@ -251,21 +249,20 @@ class EncyclopaediaMetallum(Page):
Album(
title=album_name,
date=date_obj,
album_type=cls.ALBUM_TYPE_MAP[raw_album_type],
source_list=[Source(SourcePages.ENCYCLOPAEDIA_METALLUM, album_url)]
album_type=ALBUM_TYPE_MAP[raw_album_type],
source_list=[Source(self.SOURCE_TYPE, album_url)]
)
)
return discography
@classmethod
def _fetch_artist_sources(cls, ma_artist_id: str) -> List[Source]:
def _fetch_artist_sources(self, ma_artist_id: str) -> List[Source]:
sources_url = "https://www.metal-archives.com/link/ajax-list/type/band/id/{}"
r = cls.CONNECTION.get(sources_url.format(ma_artist_id))
r = self.connection.get(sources_url.format(ma_artist_id))
if r is None:
return []
soup = cls.get_soup_from_response(r)
soup = self.get_soup_from_response(r)
if soup.find("span", {"id": "noLinks"}) is not None:
return []
@ -289,12 +286,11 @@ class EncyclopaediaMetallum(Page):
if url is None:
continue
source_list.append(Source.match_url(url, referer_page=cls.SOURCE_TYPE))
source_list.append(Source.match_url(url, referer_page=self.SOURCE_TYPE))
return source_list
@classmethod
def _parse_artist_attributes(cls, artist_soup: BeautifulSoup) -> Artist:
def _parse_artist_attributes(self, artist_soup: BeautifulSoup) -> Artist:
name: str = None
country: pycountry.Countrie = None
formed_in_year: int = None
@ -311,7 +307,7 @@ class EncyclopaediaMetallum(Page):
if title_text.count(bad_name_substring) == 1:
name = title_text.replace(bad_name_substring, "")
else:
cls.LOGGER.debug(f"the title of the page is \"{title_text}\"")
self.LOGGER.debug(f"the title of the page is \"{title_text}\"")
"""
TODO
@ -341,7 +337,7 @@ class EncyclopaediaMetallum(Page):
href = anchor.get("href")
if href is not None:
source_list.append(Source(cls.SOURCE_TYPE, href))
source_list.append(Source(self.SOURCE_TYPE, href))
name = anchor.get_text(strip=True)
@ -400,35 +396,32 @@ class EncyclopaediaMetallum(Page):
Label(
name=label_name,
source_list=[
Source(cls.SOURCE_TYPE, label_url)
Source(self.SOURCE_TYPE, label_url)
]
)
],
source_list=source_list
)
# Request `url` and parse the response into an Artist.
#
# When the connection's get() yields None, an Artist constructed with no
# arguments is returned instead. Otherwise the response is turned into a soup
# via get_soup_from_response() and handed to _parse_artist_attributes().
#
# NOTE(review): this span comes from a rendered diff whose +/- markers were
# stripped; each statement is listed in a `cls`/`CONNECTION` form and a
# `self`/`connection` form back to back.
@classmethod
def _fetch_artist_attributes(cls, url: str) -> Artist:
r = cls.CONNECTION.get(url)
def _fetch_artist_attributes(self, url: str) -> Artist:
r = self.connection.get(url)
if r is None:
return Artist()
soup: BeautifulSoup = cls.get_soup_from_response(r)
soup: BeautifulSoup = self.get_soup_from_response(r)
return cls._parse_artist_attributes(artist_soup=soup)
return self._parse_artist_attributes(artist_soup=soup)
# Request the "read more" endpoint for the band id `ma_artist_id` and wrap
# the response text in a FormattedText (passed as `html=`).
#
# NOTE(review): the return annotation is Optional[FormattedText], but no
# visible path returns None. A failed request (get() yielding None) returns
# a FormattedText constructed with no arguments, so a caller's
# `is not None` check on the result always passes as far as this code shows.
#
# NOTE(review): this span comes from a rendered diff whose +/- markers were
# stripped; the signature and the request line are each listed in a `cls`
# form and a `self` form.
@classmethod
def _fetch_band_notes(cls, ma_artist_id: str) -> Optional[FormattedText]:
def _fetch_band_notes(self, ma_artist_id: str) -> Optional[FormattedText]:
endpoint = "https://www.metal-archives.com/band/read-more/id/{}"
# make the request
r = cls.CONNECTION.get(endpoint.format(ma_artist_id))
r = self.connection.get(endpoint.format(ma_artist_id))
if r is None:
return FormattedText()
return FormattedText(html=r.text)
# Build an Artist from `source`: page attributes, external sources, band
# notes and discography, in that order.
#
# The discography is always requested. Only when `stop_at_level` > 1 is each
# album additionally merged with a deeper fetch (stop_at_level - 1) for every
# source of this page's SOURCE_TYPE that the album carries.
#
# NOTE(review): this span comes from a rendered diff whose +/- markers were
# stripped; statements are listed in a `cls` form and a `self` form back to
# back, and a hunk header sits inside the docstring where lines were elided.
@classmethod
def _fetch_artist_from_source(cls, source: Source, stop_at_level: int = 1) -> Artist:
def fetch_artist(self, source: Source, stop_at_level: int = 1) -> Artist:
"""
What it could fetch, and what is implemented:
@ -440,28 +433,27 @@ class EncyclopaediaMetallum(Page):
[x] band notes: https://www.metal-archives.com/band/read-more/id/3540372489
"""
artist = cls._fetch_artist_attributes(source.url)
artist = self._fetch_artist_attributes(source.url)
# The id is whatever follows the last "/" of the url; a url ending in "/"
# would therefore produce an empty id.
artist_id = source.url.split("/")[-1]
artist_sources = cls._fetch_artist_sources(artist_id)
artist_sources = self._fetch_artist_sources(artist_id)
artist.source_collection.extend(artist_sources)
band_notes = cls._fetch_band_notes(artist_id)
band_notes = self._fetch_band_notes(artist_id)
if band_notes is not None:
artist.notes = band_notes
discography: List[Album] = cls._fetch_artist_discography(artist_id)
discography: List[Album] = self._fetch_artist_discography(artist_id)
if stop_at_level > 1:
for album in discography:
# The loop variable below rebinds the `source` parameter; the parameter is
# not read again after this point.
for source in album.source_collection.get_sources_from_page(cls.SOURCE_TYPE):
album.merge(cls._fetch_album_from_source(source, stop_at_level=stop_at_level-1))
for source in album.source_collection.get_sources_from_page(self.SOURCE_TYPE):
# NOTE(review): the `self` form still calls `_fetch_album_from_source`, while
# the album fetcher in this class appears to be renamed to `fetch_album` and
# the base-class `_fetch_album_from_source` stub appears to be removed in the
# same commit. Unless that name is defined elsewhere, this call would raise
# AttributeError -- it probably should be `self.fetch_album(...)`. The song
# loop in the album fetcher calls `self._fetch_song_from_source` the same way.
album.merge(self._fetch_album_from_source(source, stop_at_level=stop_at_level-1))
artist.main_album_collection.extend(discography)
return artist
@classmethod
def _parse_album_track_row(cls, track_row: BeautifulSoup) -> Song:
def _parse_album_track_row(self, track_row: BeautifulSoup) -> Song:
"""
<tr class="even">
<td width="20"><a class="anchor" name="5948442"> </a>1.</td> # id and tracksort
@ -482,7 +474,7 @@ class EncyclopaediaMetallum(Page):
track_id = track_sort_soup.find("a").get("name").strip()
if track_row.find("a", {"href": f"#{track_id}"}) is not None:
source_list.append(Source(cls.SOURCE_TYPE, track_id))
source_list.append(Source(self.SOURCE_TYPE, track_id))
title = row_list[1].text.strip()
@ -500,9 +492,7 @@ class EncyclopaediaMetallum(Page):
source_list=source_list
)
@classmethod
def _parse_album_attributes(cls, album_soup: BeautifulSoup, stop_at_level: int = 1) -> Album:
def _parse_album_attributes(self, album_soup: BeautifulSoup, stop_at_level: int = 1) -> Album:
tracklist: List[Song] = []
artist_list = []
album_name: str = None
@ -522,12 +512,12 @@ class EncyclopaediaMetallum(Page):
href = anchor.get("href")
if href is not None:
source_list.append(Source(cls.SOURCE_TYPE, href.strip()))
source_list.append(Source(self.SOURCE_TYPE, href.strip()))
album_name = anchor.get_text(strip=True)
elif len(album_soup_list) > 1:
cls.LOGGER.debug("there are more than 1 album soups")
self.LOGGER.debug("there are more than 1 album soups")
artist_soup_list = album_info_soup.find_all("h2", {"class": "band_name"})
@ -537,7 +527,7 @@ class EncyclopaediaMetallum(Page):
href = anchor.get("href")
if href is not None:
artist_sources.append(Source(cls.SOURCE_TYPE, href.strip()))
artist_sources.append(Source(self.SOURCE_TYPE, href.strip()))
artist_name = anchor.get_text(strip=True)
@ -547,13 +537,13 @@ class EncyclopaediaMetallum(Page):
))
elif len(artist_soup_list) > 1:
cls.LOGGER.debug("there are more than 1 artist soups")
self.LOGGER.debug("there are more than 1 artist soups")
_parse_album_info(album_info_soup=album_soup.find(id="album_info"))
tracklist_soup = album_soup.find("table", {"class": "table_lyrics"}).find("tbody")
for track_soup in tracklist_soup.find_all("tr", {"class": ["even", "odd"]}):
tracklist.append(cls._parse_album_track_row(track_row=track_soup))
tracklist.append(self._parse_album_track_row(track_row=track_soup))
return Album(
title=album_name,
@ -562,8 +552,7 @@ class EncyclopaediaMetallum(Page):
song_list=tracklist
)
@classmethod
def _fetch_album_from_source(cls, source: Source, stop_at_level: int = 1) -> Album:
def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album:
"""
I am preeeety sure I can get way more data than... nothing from there
@ -574,23 +563,22 @@ class EncyclopaediaMetallum(Page):
# <table class="display table_lyrics
r = cls.CONNECTION.get(source.url)
r = self.connection.get(source.url)
if r is None:
return Album()
soup = cls.get_soup_from_response(r)
soup = self.get_soup_from_response(r)
album = cls._parse_album_attributes(soup, stop_at_level=stop_at_level)
album = self._parse_album_attributes(soup, stop_at_level=stop_at_level)
if stop_at_level > 1:
for song in album.song_collection:
for source in song.source_collection.get_sources_from_page(cls.SOURCE_TYPE):
song.merge(cls._fetch_song_from_source(source=source, stop_at_level=stop_at_level-1))
for source in song.source_collection.get_sources_from_page(self.SOURCE_TYPE):
song.merge(self._fetch_song_from_source(source=source, stop_at_level=stop_at_level-1))
return album
@classmethod
def _fetch_lyrics(cls, song_id: str) -> Optional[Lyrics]:
def _fetch_lyrics(self, song_id: str) -> Optional[Lyrics]:
"""
function toggleLyrics(songId) {
var lyricsRow = $('#song' + songId);
@ -614,7 +602,7 @@ class EncyclopaediaMetallum(Page):
endpoint = "https://www.metal-archives.com/release/ajax-view-lyrics/id/{id}".format(id=song_id)
r = cls.CONNECTION.get(endpoint)
r = self.connection.get(endpoint)
if r is None:
return None
@ -622,22 +610,27 @@ class EncyclopaediaMetallum(Page):
text=FormattedText(html=r.text),
language=pycountry.languages.get(alpha_2="en"),
source_list=[
Source(cls.SOURCE_TYPE, endpoint)
Source(self.SOURCE_TYPE, endpoint)
]
)
# Build a Song whose only populated field is `lyrics_list`, holding the
# single result of _fetch_lyrics() for this source.
#
# The source's `url` is used directly as the song id. This matches the track
# row parser in this file, which creates song sources with the track id in
# the url position.
#
# NOTE(review): _fetch_lyrics() is annotated Optional[Lyrics] and returns
# None when its request fails, so `lyrics_list` can end up as [None] --
# confirm Song tolerates that.
#
# NOTE(review): this span comes from a rendered diff whose +/- markers were
# stripped; the signature and the lyrics call are each listed in a `cls`
# form and a `self` form.
@classmethod
def _fetch_song_from_source(cls, source: Source, stop_at_level: int = 1) -> Song:
def fetch_song(self, source: Source, stop_at_level: int = 1) -> Song:
song_id = source.url
return Song(
lyrics_list=[
cls._fetch_lyrics(song_id=song_id)
self._fetch_lyrics(song_id=song_id)
]
)
@classmethod
def _get_type_of_url(cls, url: str) -> Optional[Union[Type[Song], Type[Album], Type[Artist], Type[Label]]]:
def get_source_type(self, source: Source):
if self.SOURCE_TYPE != source.page_enum:
return None
url = source.url
if url is None:
return None
parsed_url = urlparse(url)
path: List[str] = parsed_url.path.split("/")