music-kraken-core/src/music_kraken/pages/musify.py

1049 lines
40 KiB
Python
Raw Normal View History

2023-03-17 12:58:58 +00:00
from collections import defaultdict
2023-04-04 18:58:22 +00:00
from dataclasses import dataclass
from enum import Enum
2023-03-30 10:00:39 +00:00
from typing import List, Optional, Type, Union
2023-04-04 18:58:22 +00:00
from urllib.parse import urlparse
import pycountry
2023-03-13 14:47:38 +00:00
import requests
from bs4 import BeautifulSoup
2023-04-20 20:30:45 +00:00
from ..connection import Connection
2023-03-13 14:47:38 +00:00
from .abstract import Page
2023-04-18 10:00:25 +00:00
from ..utils.enums.source import SourcePages
2023-04-18 10:14:34 +00:00
from ..utils.enums.album import AlbumType, AlbumStatus
2023-03-13 14:47:38 +00:00
from ..objects import (
Artist,
Source,
Song,
Album,
ID3Timestamp,
FormattedText,
Label,
2023-03-17 12:58:58 +00:00
Options,
2023-03-30 14:10:48 +00:00
Target
2023-03-13 14:47:38 +00:00
)
2023-04-20 20:30:45 +00:00
from ..utils.shared import MUSIFY_LOGGER
2023-04-04 18:58:22 +00:00
from ..utils import string_processing, shared
from .support_classes.download_result import DownloadResult
2023-03-13 14:47:38 +00:00
2023-03-16 21:52:47 +00:00
"""
https://musify.club/artist/ghost-bath-280348?_pjax=#bodyContent
https://musify.club/artist/ghost-bath-280348/releases?_pjax=#bodyContent
https://musify.club/artist/ghost-bath-280348/clips?_pjax=#bodyContent
https://musify.club/artist/ghost-bath-280348/photos?_pjax=#bodyContent
POST https://musify.club/artist/filtersongs
ID: 280348
NameForUrl: ghost-bath
Page: 1
IsAllowed: True
SortOrder.Property: dateCreated
SortOrder.IsAscending: false
X-Requested-With: XMLHttpRequest
POST https://musify.club/artist/filteralbums
ArtistID: 280348
SortOrder.Property: dateCreated
SortOrder.IsAscending: false
X-Requested-With: XMLHttpRequest
"""
2023-03-17 12:11:18 +00:00
2023-03-17 11:31:56 +00:00
class MusifyTypes(Enum):
    """
    The kinds of musify pages this module knows how to address.

    Each value is the first path segment of the matching musify url,
    e.g. ``https://musify.club/artist/...`` -> ``ARTIST``.
    """

    ARTIST = "artist"
    RELEASE = "release"
    SONG = "track"
2023-03-17 12:11:18 +00:00
2023-03-17 11:31:56 +00:00
@dataclass
class MusifyUrl:
    """
    The components of a musify url, as produced by ``Musify.parse_url``.

    For ``https://musify.club/artist/ghost-bath-280348`` the fields are
    ``ARTIST``, ``ghost-bath``, ``ghost-bath-280348``, ``280348`` and the
    url itself.
    """

    # which kind of page the url points to (first path segment)
    source_type: MusifyTypes
    # second path segment with the trailing "-<id>" removed
    name_without_id: str
    # second path segment, unmodified
    name_with_id: str
    # the part of the second path segment after the last "-"
    musify_id: str
    # the url exactly as it was passed in
    url: str
2023-03-13 14:47:38 +00:00
2023-03-15 19:55:28 +00:00
class Musify(Page):
2023-03-13 14:47:38 +00:00
# --- page identity -------------------------------------------------------
HOST = "https://musify.club"
SOURCE_TYPE = SourcePages.MUSIFY
LOGGER = shared.MUSIFY_LOGGER

# --- request tuning ------------------------------------------------------
# NOTE(review): presumably seconds / number of attempts; they are not read
# anywhere in the part of the file visible here - confirm against Page.
TIMEOUT = 7
POST_TIMEOUT = 15
TRIES = 5

# --- plain requests session ----------------------------------------------
# The header mapping replaces the default headers of the session wholesale.
API_SESSION: requests.Session = requests.Session()
API_SESSION.headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0",
    "Connection": "keep-alive",
    "Referer": "https://musify.club/"
}
API_SESSION.proxies = shared.proxies

# --- project connection wrapper used for the GET requests ----------------
CONNECTION = Connection(host="https://musify.club/", logger=MUSIFY_LOGGER)
2023-03-17 12:11:18 +00:00
@classmethod
def parse_url(cls, url: str) -> MusifyUrl:
    """
    Break a musify url down into a MusifyUrl.

    The path is read as ``/<type>/<name>-<id>[/...]``; everything after the
    last ``-`` of the second segment is taken as the id.

    :param url: the musify url to parse
    :return: the parsed components
    :raises ValueError: if ``<type>`` is not a value of MusifyTypes
    :raises IndexError: if the path has fewer than two segments
    """
    segments = urlparse(url).path.split("/")
    type_segment, name_with_id = segments[1], segments[2]

    # "ghost-bath-280348" -> ("ghost-bath", "-", "280348")
    name_without_id, _, musify_id = name_with_id.rpartition("-")

    try:
        source_type = MusifyTypes(type_segment)
    except ValueError:
        cls.LOGGER.warning(f"{type_segment} is not yet implemented, add it to MusifyTypes")
        raise

    return MusifyUrl(
        source_type=source_type,
        name_without_id=name_without_id,
        name_with_id=name_with_id,
        musify_id=musify_id,
        url=url
    )
2023-03-13 14:47:38 +00:00
@classmethod
def search_by_query(cls, query: str) -> Options:
    """
    Search musify for the given query string.

    A raw query is sent as is, any other query is first turned into the
    plaintext form built by ``get_plaintext_query``.

    :param query: the query as entered by the user
    :return: the options found by ``plaintext_search``
    """
    parsed_query = cls.Query(query)

    if parsed_query.is_raw:
        search_text = parsed_query.query
    else:
        search_text = cls.get_plaintext_query(parsed_query)

    return cls.plaintext_search(search_text)
2023-03-13 14:47:38 +00:00
@classmethod
def get_plaintext_query(cls, query: Page.Query) -> str:
    """
    Build the search text for a structured query.

    The result is ``artist - song`` or, if the query has an album,
    ``artist - album - song``. Every missing (falsy) part becomes ``*``.

    :param query: the structured query
    :return: the text to put into the search
    """
    has_album = query.album is not None

    parts = [f"{query.artist or '*'}"]
    if has_album:
        parts.append(f"{query.album or '*'}")
    parts.append(f"{query.song or '*'}")

    return " - ".join(parts)
2023-03-13 14:47:38 +00:00
2023-03-14 13:58:54 +00:00
@classmethod
def parse_artist_contact(cls, contact: BeautifulSoup) -> Artist:
    """
    Build an Artist from one ``contacts__item`` element of the search page.

    The name is taken from the ``title`` of the first anchor and, if the
    first image has an ``alt`` attribute, overridden by that. The id is the
    part of the anchor href after the last ``-``.

    :param contact: the soup of the contact element
    :return: the artist; fields that could not be found stay None / empty
    """
    source_list: List[Source] = []
    name = None
    _id = None

    # source
    anchor = contact.find("a")
    if anchor is not None:
        name = anchor.get("title")

        href = anchor.get("href")
        # an anchor without href can neither yield an id nor a source
        if href is not None:
            if "-" in href:
                _id = href.split("-")[-1]

            source_list.append(Source(cls.SOURCE_TYPE, cls.HOST + href))

    # artist image, only its alt text is used
    image_soup = contact.find("img")
    if image_soup is not None:
        alt = image_soup.get("alt")
        if alt is not None:
            name = alt

    return Artist(
        _id=_id,
        name=name,
        source_list=source_list
    )
2023-03-17 12:11:18 +00:00
2023-03-15 19:55:28 +00:00
@classmethod
def parse_album_contact(cls, contact: BeautifulSoup) -> Album:
    """
    Build an Album from one ``contacts__item`` element of the search page.

    <div class="contacts__item">
        <a href="/release/ghost-bath-ghost-bath-2013-602489" title="Ghost Bath - 2013">
            <div class="contacts__img release">
                <img alt="Ghost Bath" class="lozad" data-src="https://37s.musify.club/img/69/9060265/24178833.jpg"/>
                <noscript><img alt="Ghost Bath" src="https://37s.musify.club/img/69/9060265/24178833.jpg"/></noscript>
            </div>
            <div class="contacts__info">
                <strong>Ghost Bath - 2013</strong>
                <small>Ghost Bath</small>
                <small>Треков: 4</small> <!--tracks-->
                <small><i class="zmdi zmdi-star zmdi-hc-fw"></i> 9,04</small>
            </div>
        </a>
    </div>

    :param contact: the soup of the contact element
    :return: the album; fields that could not be found stay None / empty
    """
    source_list: List[Source] = []
    title = None
    _id = None
    year = None
    artist_list: List[Artist] = []

    def parse_title_date(title_date: Optional[str], delimiter: str = " - "):
        """Split "<title> - <year>" into title and year, if it has that form."""
        nonlocal year
        nonlocal title

        if title_date is None:
            return

        split_attr = title_date.strip().split(delimiter)

        # only accept the text if its last part really is a year
        if len(split_attr) < 2 or not split_attr[-1].isdigit():
            return

        year = int(split_attr[-1])
        title = delimiter.join(split_attr[:-1])

    # source
    anchor = contact.find("a")
    if anchor is not None:
        # get the title and year
        parse_title_date(anchor.get("title"))

        href = anchor.get("href")
        # an anchor without href can neither yield an id nor a source
        if href is not None:
            if "-" in href:
                _id = href.split("-")[-1]

            source_list.append(Source(cls.SOURCE_TYPE, cls.HOST + href))

    # cover art, only its alt text is used
    image_soup = contact.find("img")
    if image_soup is not None:
        alt = image_soup.get("alt")
        if alt is not None:
            title = alt

    contact_info_soup = contact.find("div", {"class": "contacts__info"})
    if contact_info_soup is not None:
        # <strong>Ghost Bath - 2013</strong>
        # <small>Ghost Bath</small>
        # <small>Треков: 4</small> <!--tracks-->
        # <small><i class="zmdi zmdi-star zmdi-hc-fw"></i> 9,04</small>

        # The check used to be inverted (`is None`) and handed over the tag
        # instead of its text, so the <strong> element was never evaluated.
        title_soup = contact_info_soup.find("strong")
        if title_soup is not None:
            parse_title_date(title_soup.text)

        small_list = contact_info_soup.find_all("small")
        if len(small_list) == 3:
            # the first <small> holds the artists, the track count and the
            # rating in the other two are not used
            for artist_str in small_list[0].text.split("&\r\n"):
                artist_str = artist_str.rstrip("& ...\r\n")
                artist_str = artist_str.strip()

                # cut off a trailing "[...]" annotation
                if artist_str.endswith("]") and "[" in artist_str:
                    artist_str = artist_str.rsplit("[", maxsplit=1)[0]

                artist_list.append(Artist(name=artist_str))
        else:
            cls.LOGGER.warning("got an unequal ammount than 3 small elements")

    return Album(
        _id=_id,
        title=title,
        source_list=source_list,
        date=ID3Timestamp(year=year),
        artist_list=artist_list
    )
2023-03-17 12:11:18 +00:00
2023-03-15 19:55:28 +00:00
@classmethod
def parse_contact_container(cls, contact_container_soup: BeautifulSoup) -> List[Union[Artist, Album]]:
    """
    Parse every ``contacts__item`` inside a contact container.

    Whether an item is an artist or an album is decided by looking for
    "artist" or "release" in the href of its first anchor; items that match
    neither, or have no anchor / href, are skipped.

    :param contact_container_soup: the soup of the container
    :return: the parsed artists and albums in document order
    """
    parsed_contacts = []

    item: BeautifulSoup
    for item in contact_container_soup.find_all("div", {"class": "contacts__item"}):
        first_anchor = item.find("a")
        if first_anchor is None:
            continue

        target = first_anchor.get("href")
        if target is None:
            continue

        if "artist" in target:
            parsed_contacts.append(cls.parse_artist_contact(item))
        elif "release" in target:
            parsed_contacts.append(cls.parse_album_contact(item))

    return parsed_contacts
2023-03-17 12:11:18 +00:00
2023-03-16 15:57:43 +00:00
@classmethod
def parse_playlist_item(cls, playlist_item_soup: BeautifulSoup) -> Song:
    """
    Build a Song from one ``playlist__item`` element of the search page.

    The title comes from the ``data-name`` attribute. Inside the
    ``playlist__heading`` every anchor but the last is read as an artist and
    the last one as the track. The song id is taken from the track href and,
    if the element id has the form ``playerDiv<digits>``, overridden by that.

    :param playlist_item_soup: the soup of the playlist item
    :return: the song; fields that could not be found stay None / empty
    """
    _id = None
    song_title = playlist_item_soup.get("data-name")
    artist_list: List[Artist] = []
    source_list: List[Source] = []

    # details
    # (the attribute filter used to be a set literal instead of a dict)
    playlist_details: BeautifulSoup = playlist_item_soup.find("div", {"class": "playlist__heading"})
    if playlist_details is not None:
        anchor_list = playlist_details.find_all("a")

        if len(anchor_list) >= 2:
            # artists
            artist_anchor: BeautifulSoup
            for artist_anchor in anchor_list[:-1]:
                # kept apart from the song id, so an artist id can never
                # end up as the id of the song
                artist_id = None
                artist_source_list: List[Source] = []

                href = artist_anchor.get("href")
                if href is not None:
                    artist_source_list.append(Source(cls.SOURCE_TYPE, cls.HOST + href))
                    if "-" in href:
                        artist_id = href.split("-")[-1]

                artist_list.append(Artist(
                    _id=artist_id,
                    name=artist_anchor.get_text(strip=True),
                    source_list=artist_source_list
                ))

            # track
            # TODO: this anchor text may have something like
            # (feat. some artist) which is not acceptable
            track_soup: BeautifulSoup = anchor_list[-1]
            href = track_soup.get("href")
            if href is not None:
                if "-" in href:
                    raw_id: str = href.split("-")[-1]
                    if raw_id.isdigit():
                        _id = raw_id
                source_list.append(Source(cls.SOURCE_TYPE, cls.HOST + href))

        else:
            cls.LOGGER.debug("there are not enough anchors (2) for artist and track")
            cls.LOGGER.debug(str(artist_list))

    # the "data-artist" attribute of the item is deliberately not used

    id_attribute = playlist_item_soup.get("id")
    if id_attribute is not None:
        raw_id = id_attribute.replace("playerDiv", "")
        if raw_id.isdigit():
            _id = raw_id

    return Song(
        _id=_id,
        title=song_title,
        main_artist_list=artist_list,
        source_list=source_list
    )
2023-03-17 12:11:18 +00:00
2023-03-15 19:55:28 +00:00
@classmethod
def parse_playlist_soup(cls, playlist_soup: BeautifulSoup) -> List[Song]:
    """
    Parse every ``playlist__item`` inside a playlist element.

    :param playlist_soup: the soup of the playlist
    :return: one song per playlist item, in document order
    """
    return [
        cls.parse_playlist_item(item_soup)
        for item_soup in playlist_soup.find_all("div", {"class": "playlist__item"})
    ]
2023-03-13 14:47:38 +00:00
@classmethod
def plaintext_search(cls, query: str) -> Options:
    """
    Run a plaintext search on musify and parse the result page.

    Artists and albums are read from the ``contacts`` containers, songs
    from the ``playlist`` containers.

    :param query: the search text; it is percent-encoded before being sent
    :return: the found objects, or empty Options if the request failed
    """
    from urllib.parse import quote

    # Without escaping, characters like "&", "#" or "+" in the query would
    # be interpreted as url syntax and cut off / alter the search text.
    encoded_query = quote(query, safe="")

    r = cls.CONNECTION.get(f"https://musify.club/search?searchText={encoded_query}")
    if r is None:
        return Options()
    search_soup: BeautifulSoup = BeautifulSoup(r.content, features="html.parser")

    search_results = []

    # artists and albums
    # div class: contacts
    for contact_container_soup in search_soup.find_all("div", {"class": "contacts"}):
        search_results.extend(cls.parse_contact_container(contact_container_soup))

    # songs
    # div class: playlist
    for playlist_soup in search_soup.find_all("div", {"class": "playlist"}):
        search_results.extend(cls.parse_playlist_soup(playlist_soup))

    return Options(search_results)
2023-03-17 12:11:18 +00:00
2023-03-17 11:31:56 +00:00
@classmethod
def parse_album_card(cls, album_card: BeautifulSoup, artist_name: str = None) -> Album:
    """
    Build an Album from one card of the discography listing.

    <div class="card release-thumbnail" data-type="2">
        <a href="/release/ghost-bath-self-loather-2021-1554266">
            <img alt="Self Loather" class="card-img-top lozad" data-src="https://40s-a.musify.club/img/70/24826582/62624396.jpg"/>
            <noscript><img alt="Self Loather" src="https://40s-a.musify.club/img/70/24826582/62624396.jpg"/></noscript>
        </a>
        <div class="card-body">
            <h4 class="card-subtitle">
                <a href="/release/ghost-bath-self-loather-2021-1554266">Self Loather</a>
            </h4>
        </div>
        <div class="card-footer"><p class="card-text"><a href="/albums/2021">2021</a></p></div>
        <div class="card-footer">
            <p class="card-text genre__labels">
                <a href="/genre/depressive-black-132">Depressive Black</a><a href="/genre/post-black-metal-295">Post-Black Metal</a> </p>
        </div>
        <div class="card-footer">
            <small><i class="zmdi zmdi-calendar" title="Добавлено"></i> 13.11.2021</small>
            <small><i class="zmdi zmdi-star zmdi-hc-fw" title="Рейтинг"></i> 5,88</small>
        </div>
    </div>

    :param album_card: the soup of the card
    :param artist_name: if given, a leading "<artist> - " is removed from
        the album name when the artist matches
    :return: the album; fields that could not be found stay None / empty
    """

    # musify "data-type" -> album type; ids not listed map to OTHER
    album_type_map = {
        1: AlbumType.OTHER,  # literally other xD
        2: AlbumType.STUDIO_ALBUM,
        3: AlbumType.EP,
        4: AlbumType.SINGLE,
        5: AlbumType.OTHER,  # BOOTLEG
        6: AlbumType.LIVE_ALBUM,
        7: AlbumType.COMPILATION_ALBUM,  # compilation of different artists
        8: AlbumType.MIXTAPE,
        9: AlbumType.DEMO,
        10: AlbumType.MIXTAPE,  # DJ Mixes
        11: AlbumType.COMPILATION_ALBUM,  # compilation of only this artist
        12: AlbumType.STUDIO_ALBUM,  # split
        13: AlbumType.COMPILATION_ALBUM,  # unofficial
        14: AlbumType.MIXTAPE  # "Soundtracks"
    }

    _id: Optional[str] = None
    name: str = None
    source_list: List[Source] = []
    timestamp: Optional[ID3Timestamp] = None
    album_status = None
    album_type = AlbumType.OTHER

    def set_name(new_name: str):
        """Set the album name, stripping a leading "<artist> - " if it matches artist_name."""
        nonlocal name

        # example of just setting not working:
        # https://musify.club/release/unjoy-eurythmie-psychonaut-4-tired-numb-still-alive-2012-324067
        if new_name.count(" - ") != 1:
            name = new_name
            return

        potential_artist_list, potential_name = new_name.split(" - ")
        unified_artist_list = string_processing.unify(potential_artist_list)

        # the prefix is only dropped if it is known to be the artist
        if artist_name is not None and string_processing.unify(artist_name) in unified_artist_list:
            name = potential_name
            return

        name = new_name

    def parse_release_anchor(_anchor: BeautifulSoup, text_is_name=False):
        """Read source and id from a release anchor, optionally its text as the name."""
        nonlocal _id

        if _anchor is None:
            return

        href = _anchor.get("href")
        if href is not None:
            # add url to sources
            source_list.append(Source(
                cls.SOURCE_TYPE,
                cls.HOST + href
            ))

            # split id from url
            split_href = href.split("-")
            if len(split_href) > 1:
                _id = split_href[-1]

        if text_is_name:
            set_name(_anchor.text)

    def parse_small_date(small_soup: BeautifulSoup):
        """
        <small>
            <i class="zmdi zmdi-calendar" title="Добавлено"></i>
            13.11.2021
        </small>
        """
        nonlocal timestamp

        italic_tagging_soup: BeautifulSoup = small_soup.find("i")
        if italic_tagging_soup is None:
            return

        if italic_tagging_soup.get("title") != "Добавлено":
            # "Добавлено" can be translated to "Added (at)"
            return

        raw_time = small_soup.text.strip()
        timestamp = ID3Timestamp.strptime(raw_time, "%d.%m.%Y")

    # album type and status
    # A card without a "data-type" attribute used to raise an
    # AttributeError here; it now falls back to AlbumType.OTHER.
    raw_type_id = album_card.get("data-type")
    if raw_type_id is not None and raw_type_id.isdigit():
        type_id = int(raw_type_id)
        album_type = album_type_map.get(type_id, AlbumType.OTHER)
        if type_id == 5:
            album_status = AlbumStatus.BOOTLEG

    # thumbnail anchor: direct child of the card
    anchor_list = album_card.find_all("a", recursive=False)
    if len(anchor_list) > 0:
        anchor = anchor_list[0]
        parse_release_anchor(anchor)

        # only the alt text of the thumbnail is used
        thumbnail: BeautifulSoup = anchor.find("img")
        if thumbnail is not None:
            alt = thumbnail.get("alt")
            if alt is not None:
                set_name(alt)
    else:
        cls.LOGGER.debug("the card has no thumbnail or url")

    # title anchor inside the card body
    card_body = album_card.find("div", {"class": "card-body"})
    if card_body is not None:
        parse_release_anchor(card_body.find("a"), text_is_name=True)

    # the date is searched in the last footer
    card_footer_list = album_card.find_all("div", {"class": "card-footer"})
    if len(card_footer_list) != 3:
        cls.LOGGER.debug("there are not exactly 3 card footers in a card")

    if len(card_footer_list) > 0:
        for any_small_soup in card_footer_list[-1].find_all("small"):
            parse_small_date(any_small_soup)
    else:
        cls.LOGGER.debug("there is not even 1 footer in the album card")

    return Album(
        _id=_id,
        title=name,
        source_list=source_list,
        date=timestamp,
        album_type=album_type,
        album_status=album_status
    )
2023-03-17 12:11:18 +00:00
2023-03-17 11:31:56 +00:00
@classmethod
def get_discography(cls, url: MusifyUrl, artist_name: str = None, stop_at_level: int = 1) -> List[Album]:
    """
    Fetch the albums listed for an artist.

    POST https://musify.club/artist/filteralbums
    ArtistID: 280348
    SortOrder.Property: dateCreated
    SortOrder.IsAscending: false
    X-Requested-With: XMLHttpRequest

    :param url: the parsed url of the artist
    :param artist_name: passed on to ``parse_album_card``
    :param stop_at_level: if greater than 1, every album is additionally
        fetched from its musify sources with ``stop_at_level - 1``
    :return: the albums, or an empty list if the request failed
    """
    payload = {
        "ArtistID": str(url.musify_id),
        "SortOrder.Property": "dateCreated",
        "SortOrder.IsAscending": False,
        "X-Requested-With": "XMLHttpRequest"
    }

    r = cls.post_request(url=cls.HOST + "/" + url.source_type.value + "/filteralbums", json=payload)
    if r is None:
        return []

    fetch_details = stop_at_level > 1
    albums: List[Album] = []

    listing_soup: BeautifulSoup = BeautifulSoup(r.content, features="html.parser")
    for card in listing_soup.find_all("div", {"class": "card"}):
        album: Album = cls.parse_album_card(card, artist_name)

        if fetch_details:
            musify_source: Source
            for musify_source in album.source_collection.get_sources_from_page(cls.SOURCE_TYPE):
                album.merge(cls._fetch_album_from_source(musify_source, stop_at_level=stop_at_level-1))

        albums.append(album)

    return albums
2023-03-17 12:11:18 +00:00
@classmethod
def get_artist_attributes(cls, url: MusifyUrl) -> Artist:
    """
    fetches the main Artist attributes from this endpoint
    https://musify.club/artist/ghost-bath-280348?_pjax=#bodyContent
    it needs to parse html

    The parts of the page that are read:

    <ol class="breadcrumb" ...>
        <li class="breadcrumb-item" ...><a href="/" ...>Главная</a></li>
        <li class="breadcrumb-item" ...><a href="/artist" ...>Исполнители</a></li>
        <li class="breadcrumb-item active">Ghost Bath</li>
    </ol>
    <ul class="nav nav-tabs nav-fill">
        <li class="nav-item"><a class="active nav-link" href="/artist/ghost-bath-280348">песни (41)</a></li>
        <li class="nav-item"><a class="nav-link" href="/artist/ghost-bath-280348/releases">альбомы (12)</a></li>
        <li class="nav-item"><a class="nav-link" href="/artist/ghost-bath-280348/clips">видеоклипы (23)</a></li>
        <li class="nav-item"><a class="nav-link" href="/artist/ghost-bath-280348/photos">фото (38)</a></li>
    </ul>
    <header class="content__title">
        <h1>Ghost Bath</h1>
        <div class="actions">
            ...
        </div>
    </header>
    <ul class="icon-list">
        <li>
            <i class="zmdi zmdi-globe zmdi-hc-fw" title="Страна"></i>
            <i class="flag-icon US shadow"></i>
            Соединенные Штаты
        </li>
    </ul>

    :param url: the parsed url of the artist
    :return: the artist; if the request failed only its id is set
    """
    r = cls.CONNECTION.get(f"https://musify.club/{url.source_type.value}/{url.name_with_id}?_pjax=#bodyContent")
    if r is None:
        return Artist(_id=url.musify_id)

    soup = BeautifulSoup(r.content, "html.parser")

    name = None
    source_list: List[Source] = []
    country = None
    notes: FormattedText = FormattedText()

    # name from the last breadcrumb
    breadcrumbs: BeautifulSoup = soup.find("ol", {"class": "breadcrumb"})
    if breadcrumbs is not None:
        breadcrumb_list: List[BeautifulSoup] = breadcrumbs.find_all("li", {"class": "breadcrumb-item"}, recursive=False)
        if len(breadcrumb_list) == 3:
            name = breadcrumb_list[-1].get_text(strip=True)
        else:
            cls.LOGGER.debug("breadcrumb layout on artist page changed")

    # the url of the songs tab is added as source
    nav_tabs: BeautifulSoup = soup.find("ul", {"class": "nav-tabs"})
    if nav_tabs is not None:
        list_item: BeautifulSoup
        for list_item in nav_tabs.find_all("li", {"class": "nav-item"}, recursive=False):
            if not list_item.get_text(strip=True).startswith("песни"):
                # "песни" translates to "songs"
                continue

            anchor: BeautifulSoup = list_item.find("a")
            if anchor is None:
                continue

            href = anchor.get("href")
            if href is None:
                continue

            source_list.append(Source(
                cls.SOURCE_TYPE,
                cls.HOST + href
            ))

    # the headline, if present, takes precedence over the breadcrumb
    content_title: BeautifulSoup = soup.find("header", {"class": "content__title"})
    if content_title is not None:
        h1_name: BeautifulSoup = content_title.find("h1", recursive=False)
        if h1_name is not None:
            name = h1_name.get_text(strip=True)

    # country and sources
    icon_list: BeautifulSoup = soup.find("ul", {"class": "icon-list"})
    if icon_list is not None:
        # (both attribute filters below used to be set literals instead of dicts)
        country_italic: BeautifulSoup = icon_list.find("i", {"class": "flag-icon"})
        if country_italic is not None:
            # whatever class is left besides the styling ones is the country code
            style_classes: set = {'flag-icon', 'shadow'}
            classes: set = set(country_italic.get("class"))
            country_set: set = classes.difference(style_classes)

            if len(country_set) != 1:
                cls.LOGGER.debug("the country set contains multiple values")

            if len(country_set) != 0:
                # This is the css file, where all flags that can be used on
                # musify are laid out and styled. Every flag has two upper
                # case letters, thus I assume they follow the alpha_2
                # https://musify.club/content/flags.min.css
                country = pycountry.countries.get(alpha_2=list(country_set)[0])

        # get all additional sources
        additional_source: BeautifulSoup
        for additional_source in icon_list.find_all("a", {"class": "link"}):
            href = additional_source.get("href")
            if href is None:
                continue

            new_src = Source.match_url(href, referer_page=cls.SOURCE_TYPE)
            if new_src is None:
                continue

            source_list.append(new_src)

    note_soup: BeautifulSoup = soup.find(id="text-main")
    if note_soup is not None:
        notes.html = note_soup.decode_contents()

    return Artist(
        _id=url.musify_id,
        name=name,
        country=country,
        source_list=source_list,
        notes=notes
    )
2023-03-17 11:31:56 +00:00
@classmethod
def _fetch_artist_from_source(cls, source: Source, stop_at_level: int = 1) -> Artist:
    """
    fetches artist from source

    [x] discography
    [x] attributes
    [] picture gallery

    Args:
        source (Source): the source to fetch
        stop_at_level (int): handed to ``get_discography``; with a value
            greater than 1 every album of the discography is fetched from
            its own page as well. Defaults to 1.

    Returns:
        Artist: the artist fetched
    """
    url = cls.parse_url(source.url)

    artist = cls.get_artist_attributes(url)

    # stop_at_level used to be dropped here, so the discography was always
    # fetched with the default level no matter what the caller asked for
    discography: List[Album] = cls.get_discography(url, artist.name, stop_at_level=stop_at_level)
    artist.main_album_collection.extend(discography)

    return artist
2023-03-20 21:27:05 +00:00
@classmethod
def parse_song_card(cls, song_card: BeautifulSoup) -> Song:
    """
    Build a Song from one ``playlist__item`` element of an album page.

    The parts of the card that are read:

    <div id="playerDiv3051" class="playlist__item" ... data-artist="Linkin Park" data-name="Papercut">
        <div class="playlist__position">
            1
        </div>
        <div class="playlist__details">
            <div class="playlist__heading">
                <a href="/artist/linkin-park-5" rel="nofollow">Linkin Park</a> - <a class="strong" href="/track/linkin-park-papercut-3051">Papercut</a>
                <span itemprop="byArtist" itemscope="itemscope" itemtype="http://schema.org/MusicGroup">
                    <meta content="/artist/linkin-park-5" itemprop="url" />
                    <meta content="Linkin Park" itemprop="name" />
                </span>
            </div>
        </div>
        <div class="playlist__actions">
            <a target="_blank" itemprop="audio" download="Linkin Park - Papercut.mp3" href="/track/dl/3051/linkin-park-papercut.mp3" class="no-ajaxy yaBrowser" id="dl_3051" title='Скачать Linkin Park - Papercut'>
                <span><i class="zmdi zmdi-download zmdi-hc-2-5x"></i></span>
            </a>
        </div>
    </div>

    A heading can contain several artists, each with its own
    ``byArtist`` span:

        <a href="/artist/tamas-141317" rel="nofollow">Tamas</a> ft.<a href="/artist/zombiez-630767" rel="nofollow">Zombiez</a> - <a class="strong" href="/track/tamas-zombiez-voodoo-feat-zombiez-16185276">Voodoo (Feat. Zombiez)</a>

    :param song_card: the soup of the card
    :return: the song; fields that could not be found stay None / empty
    """
    title = song_card.get("data-name")
    artists: List[Artist] = []
    sources: List[Source] = []
    position = None
    track_url = None

    # the "data-artist" attribute of the card is deliberately not used,
    # the artists are read from the byArtist spans instead

    # position of the track within the listing
    position_soup: BeautifulSoup = song_card.find("div", {"class": "playlist__position"})
    if position_soup is not None:
        position_text: str = position_soup.get_text(strip=True)
        if position_text.isdigit():
            position = int(position_text)

    # playlist details
    details_soup: BeautifulSoup = song_card.find("div", {"class": "playlist__details"})
    if details_soup is not None:
        # track: the last anchor, but only if there is at least one more
        detail_anchors: List[BeautifulSoup] = details_soup.find_all("a")
        if len(detail_anchors) > 1:
            track_anchor: BeautifulSoup = detail_anchors[-1]

            track_href: str = track_anchor.get("href")
            if track_href is not None:
                track_url = cls.HOST + track_href
                sources.append(Source(cls.SOURCE_TYPE, cls.HOST + track_href))

            # the anchor text replaces the "data-name" title unmodified
            title = track_anchor.get_text(strip=True)

        # artists
        by_artist: BeautifulSoup
        for by_artist in details_soup.find_all("span", {"itemprop": "byArtist"}):
            artist_sources = None
            artist_name = None

            url_meta = by_artist.find("meta", {"itemprop": "url"})
            if url_meta is not None:
                artist_path = url_meta.get("content")
                if artist_path is not None:
                    artist_sources = [Source(cls.SOURCE_TYPE, cls.HOST + artist_path)]

            name_meta = by_artist.find("meta", {"itemprop": "name"})
            if name_meta is not None:
                artist_name = name_meta.get("content")

            # a span that yielded neither a name nor a source is skipped
            if artist_name is None and artist_sources is None:
                continue

            artists.append(Artist(name=artist_name, source_list=artist_sources))

    # playlist actions
    actions_soup: BeautifulSoup = song_card.find("div", {"class": "playlist__actions"})
    if actions_soup is not None:
        # getting the actual download link:
        audio_anchor = actions_soup.find("a", {"itemprop": "audio"})
        if audio_anchor is not None:
            audio_href = audio_anchor.get("href")
            if audio_href is not None and track_url is not None:
                # NOTE(review): "adio_url" looks like a misspelling of
                # "audio_url" - confirm against the signature of Source
                sources.append(Source(
                    cls.SOURCE_TYPE,
                    url=track_url,
                    adio_url=cls.HOST + audio_href
                ))

    return Song(
        title=title,
        tracksort=position,
        main_artist_list=artists,
        source_list=sources
    )
2023-04-03 17:59:31 +00:00
@classmethod
def _parse_album(cls, soup: BeautifulSoup) -> Album:
    """
    Builds an Album from the parsed html of a musify release page.

    Three parts of the page are read, each of them is optional:
    - the breadcrumb list: album title and the linked artist
    - the meta tags with the itemprops "url" and "name": album source and
      album title (this title overrides the one from the breadcrumb)
    - the "album-info" list: artists and the publishing date

    Artists found in the breadcrumb and in the "album-info" list are both
    appended to the artist list, no deduplication is done here.

    :param soup: the parsed html of the release page
    :return: an Album with the title, sources, artists and date that were found
    """
    name: Optional[str] = None
    source_list: List[Source] = []
    artist_list: List[Artist] = []
    date: Optional[ID3Timestamp] = None

    # breadcrumb
    # if the breadcrumb list has 4 elements, then
    #   the -2 is the artist link,
    #   the -1 is the album
    breadcrumb_soup: BeautifulSoup = soup.find("ol", {"class": "breadcrumb"})
    breadcrumb_elements: List[BeautifulSoup] = []
    if breadcrumb_soup is not None:
        # a page without breadcrumb falls through to the debug message below
        # instead of raising an AttributeError
        breadcrumb_elements = breadcrumb_soup.find_all("li", {"class": "breadcrumb-item"})

    if len(breadcrumb_elements) == 4:
        # album
        album_crumb: BeautifulSoup = breadcrumb_elements[-1]
        name = album_crumb.text.strip()

        # artist
        artist_crumb: BeautifulSoup = breadcrumb_elements[-2]
        anchor: BeautifulSoup = artist_crumb.find("a")
        if anchor is not None:
            href = anchor.get("href")
            artist_source_list: List[Source] = []

            if href is not None:
                artist_source_list.append(Source(cls.SOURCE_TYPE, cls.HOST + href.strip()))

            # the artist is only added if its name is present in the anchor
            span: BeautifulSoup = anchor.find("span")
            if span is not None:
                artist_list.append(Artist(
                    name=span.get_text(strip=True),
                    source_list=artist_source_list
                ))
    else:
        cls.LOGGER.debug("there are not 4 breadcrumb items, which shouldn't be the case")

    # meta
    meta_url: BeautifulSoup = soup.find("meta", {"itemprop": "url"})
    if meta_url is not None:
        url = meta_url.get("content")
        if url is not None:
            source_list.append(Source(cls.SOURCE_TYPE, cls.HOST + url))

    meta_name: BeautifulSoup = soup.find("meta", {"itemprop": "name"})
    if meta_name is not None:
        _name = meta_name.get("content")
        if _name is not None:
            # takes precedence over the title read from the breadcrumb
            name = _name

    # album info
    album_info_ul: BeautifulSoup = soup.find("ul", {"class": "album-info"})
    if album_info_ul is not None:
        artist_anchor: BeautifulSoup
        for artist_anchor in album_info_ul.find_all("a", {"itemprop": "byArtist"}):
            artist_source_list: List[Source] = []

            artist_url_meta = artist_anchor.find("meta", {"itemprop": "url"})
            if artist_url_meta is not None:
                artist_href = artist_url_meta.get("content")
                if artist_href is not None:
                    artist_source_list.append(Source(cls.SOURCE_TYPE, url=cls.HOST + artist_href))

            # an artist without a name is skipped, even if a source was found
            artist_meta_name = artist_anchor.find("meta", {"itemprop": "name"})
            if artist_meta_name is not None:
                artist_name = artist_meta_name.get("content")
                if artist_name is not None:
                    artist_list.append(Artist(
                        name=artist_name,
                        source_list=artist_source_list
                    ))

        time_soup: BeautifulSoup = album_info_ul.find("time", {"itemprop": "datePublished"})
        if time_soup is not None:
            raw_datetime = time_soup.get("datetime")
            if raw_datetime is not None:
                try:
                    date = ID3Timestamp.strptime(raw_datetime, "%Y-%m-%d")
                except ValueError:
                    # an unparsable date is not fatal, the album just gets no date
                    cls.LOGGER.debug(f"Raw datetime doesn't match time format %Y-%m-%d: {raw_datetime}")

    return Album(
        title=name,
        source_list=source_list,
        artist_list=artist_list,
        date=date
    )
2023-03-17 11:31:56 +00:00
@classmethod
def _fetch_album_from_source(cls, source: Source, stop_at_level: int = 1) -> Album:
    """
    Fetches an album (release) page and turns it into an Album.
    eg. 'https://musify.club/release/linkin-park-hybrid-theory-2000-188'

    /html/musify/album_overview.html
    - [x] tracklist
    - [x] attributes
    - [ ] ratings

    :param source: the source pointing to the release on musify
    :param stop_at_level: if greater than 1, every song of the album is
        additionally fetched from its own musify sources and merged
    :return: the fetched Album, or an empty Album if the request gave no response
    """
    parsed_url = cls.parse_url(source.url)
    release_endpoint = cls.HOST + "/release/" + parsed_url.name_with_id

    response = cls.CONNECTION.get(release_endpoint)
    if response is None:
        return Album()

    page_soup = BeautifulSoup(response.content, "html.parser")
    album = cls._parse_album(page_soup)

    # the tracklist lives in: <div class="card"><div class="card-body">...</div></div>
    card_body = page_soup.find("div", {"class": "card-body"})
    track_cards = [] if card_body is None else card_body.find_all("div", {"class": "playlist__item"})
    for track_card in track_cards:
        album.song_collection.append(cls.parse_song_card(track_card))

    if stop_at_level > 1:
        for track in album.song_collection:
            # enrich every song with the details of its own page(s)
            for track_source in track.source_collection.get_sources_from_page(cls.SOURCE_TYPE):
                track.merge(cls._fetch_song_from_source(source=track_source))

    album.update_tracksort()
    return album
2023-03-30 10:00:39 +00:00
@classmethod
def _get_type_of_url(cls, url: str) -> Optional[Union[Type[Song], Type[Album], Type[Artist], Type[Label]]]:
    """
    Maps a musify url to the class of the music object it points to.

    :param url: the url to classify
    :return: Artist, Album or Song, depending on the source type of the parsed
        url; None if the source type is none of those
    """
    source_type = cls.parse_url(url).source_type

    # checked in this order, the first matching type wins
    type_map = (
        (MusifyTypes.ARTIST, Artist),
        (MusifyTypes.RELEASE, Album),
        (MusifyTypes.SONG, Song),
    )
    return next(
        (object_type for musify_type, object_type in type_map if source_type == musify_type),
        None
    )
2023-03-30 14:10:48 +00:00
@classmethod
def _download_song_to_targets(cls, source: Source, target: Target, desc: str = None) -> DownloadResult:
    """
    Streams the audio file of a song source into the given target.

    If the source carries no audio url, the download url is derived from the
    url of the song page:
    https://musify.club/track/im-in-a-coffin-life-never-was-waste-of-skin-16360302
    https://musify.club/track/dl/16360302/im-in-a-coffin-life-never-was-waste-of-skin.mp3

    :param source: the source of the song, ideally with an audio url
    :param target: the target the audio stream is written into
    :param desc: passed through to the stream_into call of the target
    :return: a DownloadResult with total=1 on success, otherwise a
        DownloadResult carrying an error message
    """
    download_url = source.audio_url

    if source.audio_url is None:
        song_url: MusifyUrl = cls.parse_url(source.url)
        if song_url.source_type != MusifyTypes.SONG:
            return DownloadResult(error_message=f"The url is not of the type Song: {source.url}")

        download_url = f"https://musify.club/track/dl/{song_url.musify_id}/{song_url.name_without_id}.mp3"
        cls.LOGGER.warning(f"The source has no audio link. Falling back to {download_url}.")

    # NOTE(review): a url as the value of the "Connection" header looks
    # unintended, possibly "Referer" was meant - confirm before changing
    request_headers = {"Connection": "https://musify.club/"}
    response = cls.CONNECTION.get(download_url, stream=True, allow_redirects=True, headers=request_headers)
    if response is None:
        return DownloadResult(error_message=f"couldn't connect to {download_url}")

    if not target.stream_into(response, desc=desc):
        return DownloadResult(error_message=f"Streaming to the file went wrong: {download_url}, {str(target.file_path)}")

    return DownloadResult(total=1)