from collections import defaultdict from dataclasses import dataclass from enum import Enum from typing import Any, Dict, Generator, List, Optional, Type, Union from urllib.parse import urlparse import pycountry from bs4 import BeautifulSoup from ..connection import Connection from ..objects import (Album, Artist, ArtworkCollection, DatabaseObject, FormattedText, ID3Timestamp, Label, Lyrics, Song, Source, Target) from ..utils import shared, string_processing from ..utils.config import logging_settings, main_settings from ..utils.enums import ALL_SOURCE_TYPES, SourceType from ..utils.enums.album import AlbumStatus, AlbumType from ..utils.string_processing import clean_song_title from ..utils.support_classes.download_result import DownloadResult from ..utils.support_classes.query import Query from .abstract import Page """ https://musify.club/artist/ghost-bath-280348?_pjax=#bodyContent https://musify.club/artist/ghost-bath-280348/releases?_pjax=#bodyContent https://musify.club/artist/ghost-bath-280348/clips?_pjax=#bodyContent https://musify.club/artist/ghost-bath-280348/photos?_pjax=#bodyContent POST https://musify.club/artist/filtersongs ID: 280348 NameForUrl: ghost-bath Page: 1 IsAllowed: True SortOrder.Property: dateCreated SortOrder.IsAscending: false X-Requested-With: XMLHttpRequest POST https://musify.club/artist/filteralbums ArtistID: 280348 SortOrder.Property: dateCreated SortOrder.IsAscending: false X-Requested-With: XMLHttpRequest """ class MusifyTypes(Enum): ARTIST = "artist" RELEASE = "release" SONG = "track" @dataclass class MusifyUrl: source_type: MusifyTypes name_without_id: str name_with_id: str musify_id: str url: str ALBUM_TYPE_MAP = defaultdict(lambda: AlbumType.OTHER, { 1: AlbumType.OTHER, # literally other xD 2: AlbumType.STUDIO_ALBUM, 3: AlbumType.EP, 4: AlbumType.SINGLE, 5: AlbumType.OTHER, # BOOTLEG 6: AlbumType.LIVE_ALBUM, 7: AlbumType.COMPILATION_ALBUM, # compilation of different artists 8: AlbumType.MIXTAPE, 9: AlbumType.DEMO, 10: AlbumType.MIXTAPE, # DJ Mixes 11: AlbumType.COMPILATION_ALBUM, # compilation of only this artist 12: AlbumType.STUDIO_ALBUM, # split 13: AlbumType.COMPILATION_ALBUM, # unofficial 14: AlbumType.MIXTAPE # "Soundtracks" }) def parse_url(url: str) -> MusifyUrl: parsed = urlparse(url) path = parsed.path.split("/") split_name = path[2].split("-") url_id = split_name[-1] name_for_url = "-".join(split_name[:-1]) try: type_enum = MusifyTypes(path[1]) except ValueError as e: logging_settings["musify_logger"].warning(f"{path[1]} is not yet implemented, add it to MusifyTypes") raise e return MusifyUrl( source_type=type_enum, name_without_id=name_for_url, name_with_id=path[2], musify_id=url_id, url=url ) class Musify(Page): SOURCE_TYPE = ALL_SOURCE_TYPES.MUSIFY HOST = "https://musify.club" def __init__(self, *args, **kwargs): self.connection: Connection = Connection( host="https://musify.club/", logger=self.LOGGER, module="musify", ) self.stream_connection: Connection = Connection( host="https://musify.club/", logger=self.LOGGER, semantic_not_found=False, ) super().__init__(*args, **kwargs) def get_source_type(self, source: Source) -> Optional[Type[DatabaseObject]]: if source.url is None: return None musify_url = parse_url(source.url) # Has no labels, because afaik musify has no Labels musify_type_to_database_type = { MusifyTypes.SONG: Song, MusifyTypes.RELEASE: Album, MusifyTypes.ARTIST: Artist } return musify_type_to_database_type.get(musify_url.source_type) def _parse_artist_contact(self, contact: BeautifulSoup) -> Artist: source_list: List[Source] = [] name = None _id = None # source anchor = contact.find("a") if anchor is not None: href = anchor.get("href") name = anchor.get("title") if "-" in href: _id = href.split("-")[-1] source_list.append(Source(self.SOURCE_TYPE, self.HOST + href)) # artist image image_soup = contact.find("img") if image_soup is not None: alt = image_soup.get("alt") if alt is not None: name = alt artist_thumbnail = image_soup.get("src") return Artist( name=name, source_list=source_list ) def _parse_album_contact(self, contact: BeautifulSoup) -> Album: """
""" source_list: List[Source] = [] title = None _id = None year = None artist_list: List[Artist] = [] def parse_title_date(title_date: Optional[str], delimiter: str = " - "): nonlocal year nonlocal title if title_date is None: return title_date = title_date.strip() split_attr = title_date.split(delimiter) if len(split_attr) < 2: return if not split_attr[-1].isdigit(): return year = int(split_attr[-1]) title = delimiter.join(split_attr[:-1]) # source anchor = contact.find("a") if anchor is not None: href = anchor.get("href") # get the title and year parse_title_date(anchor.get("title")) if "-" in href: _id = href.split("-")[-1] source_list.append(Source(self.SOURCE_TYPE, self.HOST + href)) # cover art image_soup = contact.find("img") if image_soup is not None: alt = image_soup.get("alt") if alt is not None: title = alt cover_art = image_soup.get("src") contact_info_soup = contact.find("div", {"class": "contacts__info"}) if contact_info_soup is not None: """ Ghost Bath - 2013 Ghost Bath Треков: 4 9,04 """ title_soup = contact_info_soup.find("strong") if title_soup is None: parse_title_date(title_soup) small_list = contact_info_soup.find_all("small") if len(small_list) == 3: # artist artist_soup: BeautifulSoup = small_list[0] raw_artist_str = artist_soup.text for artist_str in raw_artist_str.split("&\r\n"): artist_str = artist_str.rstrip("& ...\r\n") artist_str = artist_str.strip() if artist_str.endswith("]") and "[" in artist_str: artist_str = artist_str.rsplit("[", maxsplit=1)[0] artist_list.append(Artist(name=artist_str)) track_count_soup: BeautifulSoup = small_list[1] rating_soup: BeautifulSoup = small_list[2] else: self.LOGGER.warning("got an unequal ammount than 3 small elements") return Album( title=title, source_list=source_list, date=ID3Timestamp(year=year), artist_list=artist_list ) def _parse_contact_container(self, contact_container_soup: BeautifulSoup) -> List[Union[Artist, Album]]: contacts = [] contact: BeautifulSoup for contact in contact_container_soup.find_all("div", {"class": "contacts__item"}): anchor_soup = contact.find("a") if anchor_soup is not None: url = anchor_soup.get("href") if url is not None: if "artist" in url: contacts.append(self._parse_artist_contact(contact)) elif "release" in url: contacts.append(self._parse_album_contact(contact)) return contacts def _parse_playlist_item(self, playlist_item_soup: BeautifulSoup) -> Song: _id = None song_title = playlist_item_soup.get("data-name") artist_list: List[Artist] = [] source_list: List[Source] = [] # details playlist_details: BeautifulSoup = playlist_item_soup.find("div", {"class", "playlist__heading"}) if playlist_details is not None: anchor_list = playlist_details.find_all("a") if len(anchor_list) >= 2: # artists artist_anchor: BeautifulSoup for artist_anchor in anchor_list[:-1]: _id = None href = artist_anchor.get("href") artist_source: Source = Source(self.SOURCE_TYPE, self.HOST + href) if "-" in href: _id = href.split("-")[-1] artist_list.append(Artist( name=artist_anchor.get_text(strip=True), source_list=[artist_source] )) # track track_soup: BeautifulSoup = anchor_list[-1] """ TODO this anchor text may have something like (feat. some artist) which is not acceptable """ href = track_soup.get("href") if href is not None: if "-" in href: raw_id: str = href.split("-")[-1] if raw_id.isdigit(): _id = raw_id source_list.append(Source(self.SOURCE_TYPE, self.HOST + href)) else: self.LOGGER.debug("there are not enough anchors (2) for artist and track") self.LOGGER.debug(str(artist_list)) """ artist_name = playlist_item_soup.get("data-artist") if artist_name is not None: artist_list.append(Artist(name=artist_name)) """ id_attribute = playlist_item_soup.get("id") if id_attribute is not None: raw_id = id_attribute.replace("playerDiv", "") if raw_id.isdigit(): _id = raw_id return Song( title=clean_song_title(song_title, artist_name=artist_list[0].name if len(artist_list) > 0 else None), feature_artist_list=artist_list, source_list=source_list ) def _parse_playlist_soup(self, playlist_soup: BeautifulSoup) -> List[Song]: song_list = [] for playlist_item_soup in playlist_soup.find_all("div", {"class": "playlist__item"}): song_list.append(self._parse_playlist_item(playlist_item_soup)) return song_list def general_search(self, search_query: str) -> List[DatabaseObject]: search_results = [] r = self.connection.get(f"https://musify.club/search?searchText={search_query}", name="search_" + search_query) if r is None: return [] search_soup: BeautifulSoup = self.get_soup_from_response(r) # album and songs # child of div class: contacts row for contact_container_soup in search_soup.find_all("div", {"class": "contacts"}): search_results.extend(self._parse_contact_container(contact_container_soup)) # song # div class: playlist__item for playlist_soup in search_soup.find_all("div", {"class": "playlist"}): search_results.extend(self._parse_playlist_soup(playlist_soup)) return search_results def fetch_song(self, source: Source, stop_at_level: int = 1) -> Song: musify_url = parse_url(source.url) r = self.connection.get(source.url, name="track_" + musify_url.name_with_id) if r is None: return Song() soup = self.get_soup_from_response(r) track_name: str = None source_list: List[Source] = [source] lyrics_list: List[Lyrics] = [] artist_list: List[Artist] = [] album_list: List[Album] = [] def _parse_artist_anchor(artist_soup: BeautifulSoup): nonlocal artist_list if artist_soup is None: return artist_src_list = [] artist_name = None href = artist_soup["href"] if href is not None: href_parts = href.split("/") if len(href_parts) <= 1 or href_parts[-2] != "artist": return artist_src_list.append(Source(self.SOURCE_TYPE, self.HOST + href)) name_elem: BeautifulSoup = artist_soup.find("span", {"itemprop": "name"}) if name_elem is not None: artist_name = name_elem.text.strip() artist_list.append(Artist(name=artist_name, source_list=artist_src_list)) def _parse_album_anchor(album_soup: BeautifulSoup): nonlocal album_list if album_anchor is None: return album_source_list = [] album_name = None href = album_soup["href"] if href is not None: album_source_list.append(Source(self.SOURCE_TYPE, self.HOST + href)) name_elem: BeautifulSoup = album_soup.find("span", {"itemprop": "name"}) if name_elem is not None: album_name = name_elem.text.strip() album_list.append(Album(title=album_name, source_list=album_source_list)) # download url anchor: BeautifulSoup for anchor in soup.find_all("a", {"itemprop": "audio"}): href = anchor["href"] if href is not None: source.audio_url = self.HOST + href # song detail album_info: BeautifulSoup for album_info in soup.find_all("ul", {"class": "album-info"}): list_element: BeautifulSoup = album_info.find("li") if list_element is not None: artist_soup: BeautifulSoup for artist_soup in list_element.find_all("a"): artist_source_list = [] href = artist_soup["href"] if href is not None: artist_source_list = [Source(self.SOURCE_TYPE, self.HOST + href)] artist_list.append(Artist( name=artist_soup.text.strip(), source_list=artist_source_list )) # breadcrums breadcrumb_list_element_list: List[BeautifulSoup] = soup.find_all("ol", {"class": "breadcrumb"}) for breadcrumb_list_element in breadcrumb_list_element_list: list_points: List[BeautifulSoup] = breadcrumb_list_element.find_all("li", "breadcrumb-item") if len(list_points) != 5: self.LOGGER.warning(f"breadcrumbs of song doesn't have 5 items: {breadcrumb_list_element.prettify()}") break artist_anchor: BeautifulSoup = list_points[2].find("a") _parse_artist_anchor(artist_anchor) album_anchor: BeautifulSoup = list_points[3].find("a") _parse_album_anchor(album_anchor) track_name = list_points[4].text.strip() # album artwork artwork: ArtworkCollection = ArtworkCollection() album_image_element_list: List[BeautifulSoup] = soup.find_all("img", {"class": "album-img"}) for album_image_element in album_image_element_list: artwork.add_data(url=album_image_element.get("data-src", album_image_element.get("src"))) # lyrics lyrics_container: List[BeautifulSoup] = soup.find_all("div", {"id": "tabLyrics"}) for lyrics in lyrics_container: lyrics_text = lyrics.find("div", {"style": "white-space: pre-line"}).text.strip() lyrics_list.append(Lyrics(text=FormattedText(html=lyrics_text))) # youtube video video_container_list: List[BeautifulSoup] = soup.find_all("div", {"id": "tabVideo"}) for video_container in video_container_list: iframe_list: List[BeautifulSoup] = video_container.findAll("iframe") for iframe in iframe_list: """ the url could look like this https://www.youtube.com/embed/sNObCkhzOYA?si=dNVgnZMBNVlNb0P_ """ parsed_url = urlparse(iframe["src"]) path_parts = parsed_url.path.strip("/").split("/") if path_parts[0] != "embed" or len(path_parts) < 2: continue source_list.append(Source( ALL_SOURCE_TYPES.YOUTUBE, f"https://music.youtube.com/watch?v={path_parts[1]}", referrer_page=self.SOURCE_TYPE )) return Song( title=clean_song_title(track_name, artist_name=artist_list[0].name if len(artist_list) > 0 else None), source_list=source_list, lyrics_list=lyrics_list, feature_artist_list=artist_list, album_list=album_list, artwork=artwork, ) def _parse_song_card(self, song_card: BeautifulSoup) -> Song: """ """ song_name = song_card.get("data-name") artist_list: List[Artist] = [] source_list: List[Source] = [] tracksort = None current_url = None def parse_title(_title: str) -> str: return _title """ # get from parent div _artist_name = song_card.get("data-artist") if _artist_name is not None: artist_list.append(Artist(name=_artist_name)) """ # get tracksort tracksort_soup: BeautifulSoup = song_card.find("div", {"class": "playlist__position"}) if tracksort_soup is not None: raw_tracksort: str = tracksort_soup.get_text(strip=True) if raw_tracksort.isdigit(): tracksort = int(raw_tracksort) # playlist details playlist_details: BeautifulSoup = song_card.find("div", {"class": "playlist__details"}) if playlist_details is not None: """ """ # track anchor_list: List[BeautifulSoup] = playlist_details.find_all("a") if len(anchor_list) > 1: track_anchor: BeautifulSoup = anchor_list[-1] href: str = track_anchor.get("href") if href is not None: current_url = self.HOST + href source_list.append(Source(self.SOURCE_TYPE, self.HOST + href)) song_name = parse_title(track_anchor.get_text(strip=True)) # artist artist_span: BeautifulSoup for artist_span in playlist_details.find_all("span", {"itemprop": "byArtist"}): _artist_src = None _artist_name = None meta_artist_src = artist_span.find("meta", {"itemprop": "url"}) if meta_artist_src is not None: meta_artist_url = meta_artist_src.get("content") if meta_artist_url is not None: _artist_src = [Source(self.SOURCE_TYPE, self.HOST + meta_artist_url)] meta_artist_name = artist_span.find("meta", {"itemprop": "name"}) if meta_artist_name is not None: meta_artist_name_text = meta_artist_name.get("content") _artist_name = meta_artist_name_text if _artist_name is not None or _artist_src is not None: artist_list.append(Artist(name=_artist_name, source_list=_artist_src)) # playlist actions playlist_actions: BeautifulSoup = song_card.find("div", {"class": "playlist__actions"}) if playlist_actions is not None: """ """ # getting the actual download link: download_anchor = playlist_actions.find("a", {"itemprop": "audio"}) if download_anchor is not None: download_href = download_anchor.get("href") if download_href is not None and current_url is not None: source_list.append(Source( self.SOURCE_TYPE, url=current_url, audio_url=self.HOST + download_href )) return Song( title=clean_song_title(song_name, artist_name=artist_list[0].name if len(artist_list) > 0 else None), tracksort=tracksort, feature_artist_list=artist_list, source_list=source_list ) def _parse_album(self, soup: BeautifulSoup) -> Album: name: str = None source_list: List[Source] = [] artist_list: List[Artist] = [] date: ID3Timestamp = None """ if breadcrumb list has 4 elements, then the -2 is the artist link, the -1 is the album """ # breadcrumb breadcrumb_soup: BeautifulSoup = soup.find("ol", {"class", "breadcrumb"}) breadcrumb_elements: List[BeautifulSoup] = breadcrumb_soup.find_all("li", {"class": "breadcrumb-item"}) if len(breadcrumb_elements) == 4: # album album_crumb: BeautifulSoup = breadcrumb_elements[-1] name = album_crumb.text.strip() # artist artist_crumb: BeautifulSoup = breadcrumb_elements[-2] anchor: BeautifulSoup = artist_crumb.find("a") if anchor is not None: href = anchor.get("href") href_parts = href.split("/") if not(len(href_parts) <= 1 or href_parts[-2] != "artist"): artist_source_list: List[Source] = [] if href is not None: artist_source_list.append(Source(self.SOURCE_TYPE, self.HOST + href.strip())) span: BeautifulSoup = anchor.find("span") if span is not None: artist_list.append(Artist( name=span.get_text(strip=True), source_list=artist_source_list )) else: self.LOGGER.debug("there are not 4 breadcrumb items, which shouldn't be the case") # meta meta_url: BeautifulSoup = soup.find("meta", {"itemprop": "url"}) if meta_url is not None: url = meta_url.get("content") if url is not None: source_list.append(Source(self.SOURCE_TYPE, self.HOST + url)) meta_name: BeautifulSoup = soup.find("meta", {"itemprop": "name"}) if meta_name is not None: _name = meta_name.get("content") if _name is not None: name = _name # album info album_info_ul: BeautifulSoup = soup.find("ul", {"class": "album-info"}) if album_info_ul is not None: artist_anchor: BeautifulSoup for artist_anchor in album_info_ul.find_all("a", {"itemprop": "byArtist"}): # line 98 artist_source_list: List[Source] = [] artist_url_meta = artist_anchor.find("meta", {"itemprop": "url"}) if artist_url_meta is not None: artist_href = artist_url_meta.get("content") if artist_href is not None: artist_source_list.append(Source(self.SOURCE_TYPE, url=self.HOST + artist_href)) artist_meta_name = artist_anchor.find("meta", {"itemprop": "name"}) if artist_meta_name is not None: artist_name = artist_meta_name.get("content") if artist_name is not None: artist_list.append(Artist( name=artist_name, source_list=artist_source_list )) time_soup: BeautifulSoup = album_info_ul.find("time", {"itemprop": "datePublished"}) if time_soup is not None: raw_datetime = time_soup.get("datetime") if raw_datetime is not None: try: date = ID3Timestamp.strptime(raw_datetime, "%Y-%m-%d") except ValueError: self.LOGGER.debug(f"Raw datetime doesn't match time format %Y-%m-%d: {raw_datetime}") # album artwork album_artwork: ArtworkCollection = ArtworkCollection() album_artwork_list: List[BeautifulSoup] = soup.find_all("img", {"class":"artist-img"}) for album_artwork in album_artwork_list: album_artwork.add_data(url=album_artwork.get("data-src", album_artwork.get("src"))) return Album( title=name, source_list=source_list, artist_list=artist_list, date=date, artwork=album_artwork ) def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album: """ fetches album from source: eg. 'https://musify.club/release/linkin-park-hybrid-theory-2000-188' /html/musify/album_overview.html - [x] tracklist - [x] attributes - [ ] ratings :param stop_at_level: :param source: :return: """ url = parse_url(source.url) endpoint = self.HOST + "/release/" + url.name_with_id r = self.connection.get(endpoint, name=url.name_with_id) if r is None: return Album() soup = BeautifulSoup(r.content, "html.parser") album = self._parse_album(soup) #