continued musify scraper
This commit is contained in:
parent
9c438588a5
commit
c43ff6df69
@ -14,12 +14,15 @@ class DatabaseObject:
|
|||||||
SIMPLE_ATTRIBUTES: tuple = tuple()
|
SIMPLE_ATTRIBUTES: tuple = tuple()
|
||||||
|
|
||||||
def __init__(self, _id: str = None, dynamic: bool = False, **kwargs) -> None:
|
def __init__(self, _id: str = None, dynamic: bool = False, **kwargs) -> None:
|
||||||
|
self.automatic_id: bool = False
|
||||||
|
|
||||||
if _id is None and not dynamic:
|
if _id is None and not dynamic:
|
||||||
"""
|
"""
|
||||||
generates a random UUID
|
generates a random UUID
|
||||||
https://docs.python.org/3/library/uuid.html
|
https://docs.python.org/3/library/uuid.html
|
||||||
"""
|
"""
|
||||||
_id = str(uuid.uuid4())
|
_id = str(uuid.uuid4())
|
||||||
|
self.automatic_id = True
|
||||||
LOGGER.debug(f"id for {type(self).__name__} isn't set. Setting to {_id}")
|
LOGGER.debug(f"id for {type(self).__name__} isn't set. Setting to {_id}")
|
||||||
|
|
||||||
# The id can only be None, if the object is dynamic (self.dynamic = True)
|
# The id can only be None, if the object is dynamic (self.dynamic = True)
|
||||||
@ -71,6 +74,7 @@ class DatabaseObject:
|
|||||||
if override or getattr(self, simple_attribute) is None:
|
if override or getattr(self, simple_attribute) is None:
|
||||||
setattr(self, simple_attribute, getattr(other, simple_attribute))
|
setattr(self, simple_attribute, getattr(other, simple_attribute))
|
||||||
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def metadata(self) -> Metadata:
    """
    Return a new, empty ``Metadata`` object.

    NOTE(review): presumably subclasses override this to expose their
    own tags - confirm against the subclasses.
    """
    return Metadata()
|
||||||
|
@ -55,7 +55,7 @@ class Musify(Page):
|
|||||||
def get_plaintext_query(cls, query: Page.Query) -> str:
    """
    Build the plaintext search string for a query.

    Every missing (``None`` or empty) part is replaced by the wildcard
    ``*``. The album part is only included if ``query.album`` is not
    ``None``.

    :param query: object exposing ``artist``, ``album`` and ``song``
    :return: ``"artist - song"`` or ``"artist - album - song"``
    """
    artist = query.artist or "*"
    song = query.song or "*"

    if query.album is None:
        return f"{artist} - {song}"

    # `or`, not `*`: multiplying a str by a str raises a TypeError
    return f"{artist} - {query.album or '*'} - {song}"
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_soup_of_search(cls, query: str, trie=0) -> Optional[BeautifulSoup]:
|
def get_soup_of_search(cls, query: str, trie=0) -> Optional[BeautifulSoup]:
|
||||||
@ -110,30 +110,134 @@ class Musify(Page):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def parse_album_contact(cls, contact: BeautifulSoup) -> Album:
    """
    Build an Album from a single "contacts__item" element of a search result.

    parsing following html:

    ```html
    <div class="contacts__item">
        <a href="/release/ghost-bath-ghost-bath-2013-602489" title="Ghost Bath - 2013">

            <div class="contacts__img release">
                <img alt="Ghost Bath" class="lozad" data-src="https://37s.musify.club/img/69/9060265/24178833.jpg"/>
                <noscript><img alt="Ghost Bath" src="https://37s.musify.club/img/69/9060265/24178833.jpg"/></noscript>
            </div>

            <div class="contacts__info">
                <strong>Ghost Bath - 2013</strong>
                <small>Ghost Bath</small>
                <small>Треков: 4</small> <!--tracks-->
                <small><i class="zmdi zmdi-star zmdi-hc-fw"></i> 9,04</small>
            </div>
        </a>
    </div>
    ```

    The title is read from several places; a later source overrides an
    earlier one: anchor ``title`` attribute, then the ``alt`` of the first
    image, then the ``<strong>`` element.

    :param contact: soup of one ``<div class="contacts__item">``
    :return: Album with whatever id, title, year, source and artists were found
    """

    source_list: List[Source] = []
    title = ""
    _id = None
    year = None
    artist_list: List[Artist] = []

    def parse_title_date(title_date: Optional[str], delimiter: str = " - "):
        # Without nonlocal the assignments below would only create locals
        # and the parsed values would be thrown away.
        nonlocal title, year

        if title_date is None:
            return

        split_attr = title_date.strip().split(delimiter)

        # Only accept "<title> - <year>"; anything else leaves title/year untouched
        if len(split_attr) < 2 or not split_attr[-1].isdigit():
            return

        year = int(split_attr[-1])
        title = delimiter.join(split_attr[:-1])

    # source
    anchor = contact.find("a")
    if anchor is not None:
        # get the title and year
        parse_title_date(anchor.get("title"))

        href = anchor.get("href")
        # An anchor without href yields None; skip instead of raising a TypeError
        if href is not None:
            if "-" in href:
                # the last dash separated part of the url is used as id
                _id = href.split("-")[-1]

            source_list.append(Source(cls.SOURCE_TYPE, cls.HOST + href))

    # cover art
    # TODO: the image url (src / data-src) is not used yet
    image_soup = contact.find("img")
    if image_soup is not None:
        alt = image_soup.get("alt")
        if alt is not None:
            title = alt

    contact_info_soup = contact.find("div", {"class": "contacts__info"})
    if contact_info_soup is not None:
        # <strong>Ghost Bath - 2013</strong>
        # <small>Ghost Bath</small>
        # <small>Треков: 4</small> <!--tracks-->
        # <small><i class="zmdi zmdi-star zmdi-hc-fw"></i> 9,04</small>

        title_soup = contact_info_soup.find("strong")
        if title_soup is not None:
            parse_title_date(title_soup.text)

        small_list = contact_info_soup.find_all("small")
        if len(small_list) == 3:
            # artist; small_list[1] (track count) and small_list[2] (rating)
            # are not evaluated yet
            raw_artist_str = small_list[0].text

            for artist_str in raw_artist_str.split("&\r\n"):
                artist_str = artist_str.rstrip("& ...\r\n")
                artist_str = artist_str.strip()

                # don't create nameless artists out of empty fragments
                if artist_str:
                    artist_list.append(Artist(name=artist_str))
        else:
            LOGGER.warning("got an unequal ammount than 3 small elements")

    return Album(
        _id=_id,
        title=title,
        source_list=source_list,
        date=ID3Timestamp(year=year),
        artist_list=artist_list
    )
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def parse_contact_container(cls, contact_container_soup: BeautifulSoup) -> List[Union[Artist, Album]]:
    """
    Parse every "contacts__item" inside a contact container.

    Each item is dispatched on the href of its first anchor: urls
    containing "artist" go to ``parse_artist_contact``, urls containing
    "release" go to ``parse_album_contact``. Items without an anchor,
    without an href, or matching neither are skipped.

    :param contact_container_soup: soup containing the contact items
    :return: the parsed artists and albums in document order
    """
    contacts = []

    contact: BeautifulSoup
    for contact in contact_container_soup.find_all("div", {"class": "contacts__item"}):
        anchor_soup = contact.find("a")
        if anchor_soup is None:
            continue

        url = anchor_soup.get("href")
        if url is None:
            continue

        # NOTE(review): plain substring checks; a release whose slug contains
        # "artist" would be handled as an artist - confirm the url layout.
        if "artist" in url:
            contacts.append(cls.parse_artist_contact(contact))
        elif "release" in url:
            contacts.append(cls.parse_album_contact(contact))

    return contacts
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@ -142,7 +246,7 @@ class Musify(Page):
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def plaintext_search(cls, query: str) -> List[MusicObject]:
|
def plaintext_search(cls, query: str) -> Options:
|
||||||
search_results = []
|
search_results = []
|
||||||
|
|
||||||
search_soup = cls.get_soup_of_search(query=query)
|
search_soup = cls.get_soup_of_search(query=query)
|
||||||
@ -159,39 +263,7 @@ class Musify(Page):
|
|||||||
for playlist_soup in search_soup.find_all("div", {"class": "playlist"}):
|
for playlist_soup in search_soup.find_all("div", {"class": "playlist"}):
|
||||||
search_results.extend(cls.parse_playlist_soup(playlist_soup))
|
search_results.extend(cls.parse_playlist_soup(playlist_soup))
|
||||||
|
|
||||||
"""
|
return Options(search_results)
|
||||||
# get the soup of the container with all track results
|
|
||||||
tracklist_container_soup = search_soup.find_all("div", {"class": "playlist"})
|
|
||||||
if len(tracklist_container_soup) == 0:
|
|
||||||
return []
|
|
||||||
if len(tracklist_container_soup) != 1:
|
|
||||||
LOGGER.warning("HTML Layout of https://musify.club/ changed. (or bug)")
|
|
||||||
tracklist_container_soup = tracklist_container_soup[0]
|
|
||||||
|
|
||||||
tracklist_soup = tracklist_container_soup.find_all("div", {"class": "playlist__details"})
|
|
||||||
|
|
||||||
def parse_track_soup(_track_soup):
|
|
||||||
anchor_soups = _track_soup.find_all("a")
|
|
||||||
artist_ = anchor_soups[0].text.strip()
|
|
||||||
track_ = anchor_soups[1].text.strip()
|
|
||||||
url_ = anchor_soups[1]['href']
|
|
||||||
return artist_, track_, url_
|
|
||||||
|
|
||||||
# check each track in the container, if they match
|
|
||||||
for track_soup in tracklist_soup:
|
|
||||||
artist_option, title_option, track_url = parse_track_soup(track_soup)
|
|
||||||
|
|
||||||
title_match, title_distance = phonetic_compares.match_titles(title, title_option)
|
|
||||||
artist_match, artist_distance = phonetic_compares.match_artists(artist, artist_option)
|
|
||||||
|
|
||||||
logging.debug(f"{(title, title_option, title_match, title_distance)}")
|
|
||||||
logging.debug(f"{(artist, artist_option, artist_match, artist_distance)}")
|
|
||||||
|
|
||||||
if not title_match and not artist_match:
|
|
||||||
return cls.get_download_link(track_url)
|
|
||||||
"""
|
|
||||||
|
|
||||||
return search_results
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def fetch_album_details(cls, album: Album, flat: bool = False) -> Album:
|
def fetch_album_details(cls, album: Album, flat: bool = False) -> Album:
|
||||||
|
Loading…
Reference in New Issue
Block a user