from collections import defaultdict
from typing import List, Optional, Union
from urllib.parse import urlparse
from enum import Enum
from dataclasses import dataclass
import time

import requests
from bs4 import BeautifulSoup
import pycountry

from .abstract import Page
from ..objects import (
    MusicObject,
    Artist,
    Source,
    SourcePages,
    Song,
    Album,
    ID3Timestamp,
    FormattedText,
    Label,
    Options,
    AlbumType,
    AlbumStatus
)
from ..utils import (
    string_processing,
    shared
)
from ..utils.shared import (
    MUSIFY_LOGGER as LOGGER
)

"""
https://musify.club/artist/ghost-bath-280348?_pjax=#bodyContent
https://musify.club/artist/ghost-bath-280348/releases?_pjax=#bodyContent
https://musify.club/artist/ghost-bath-280348/clips?_pjax=#bodyContent
https://musify.club/artist/ghost-bath-280348/photos?_pjax=#bodyContent

POST https://musify.club/artist/filtersongs
ID: 280348
NameForUrl: ghost-bath
Page: 1
IsAllowed: True
SortOrder.Property: dateCreated
SortOrder.IsAscending: false
X-Requested-With: XMLHttpRequest

POST https://musify.club/artist/filteralbums
ArtistID: 280348
SortOrder.Property: dateCreated
SortOrder.IsAscending: false
X-Requested-With: XMLHttpRequest
"""


class MusifyTypes(Enum):
    ARTIST = "artist"
    RELEASE = "release"


@dataclass
class MusifyUrl:
    source_type: MusifyTypes
    name_without_id: str
    name_with_id: str
    musify_id: str
    url: str


class Musify(Page):
    API_SESSION: requests.Session = requests.Session()
    API_SESSION.headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0",
        "Connection": "keep-alive",
        "Referer": "https://musify.club/"
    }
    API_SESSION.proxies = shared.proxies

    TIMEOUT = 5
    TRIES = 5
    HOST = "https://musify.club"

    SOURCE_TYPE = SourcePages.MUSIFY

    @classmethod
    def parse_url(cls, url: str) -> MusifyUrl:
        parsed = urlparse(url)

        path = parsed.path.split("/")

        split_name = path[2].split("-")
        url_id = split_name[-1]
        name_for_url = "-".join(split_name[:-1])

        try:
            type_enum = MusifyTypes(path[1])
        except ValueError as e:
            print(f"{path[1]} is not yet implemented, add it to MusifyTypes")
            raise e

        return MusifyUrl(
            source_type=type_enum,
            name_without_id=name_for_url,
            name_with_id=path[2],
            musify_id=url_id,
            url=url
        )
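
    # A minimal sketch of what parse_url is expected to return for one of the
    # artist URLs from the module docstring; the concrete values are derived from
    # the splitting logic above, not from a verified run.
    #
    #   Musify.parse_url("https://musify.club/artist/ghost-bath-280348")
    #   -> MusifyUrl(source_type=MusifyTypes.ARTIST,
    #                name_without_id="ghost-bath",
    #                name_with_id="ghost-bath-280348",
    #                musify_id="280348",
    #                url="https://musify.club/artist/ghost-bath-280348")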

    @classmethod
    def search_by_query(cls, query: str) -> Options:
        query_obj = cls.Query(query)

        if query_obj.is_raw:
            return cls.plaintext_search(query_obj.query)

        return cls.plaintext_search(cls.get_plaintext_query(query_obj))

    @classmethod
    def get_plaintext_query(cls, query: Page.Query) -> str:
        if query.album is None:
            return f"{query.artist or '*'} - {query.song or '*'}"

        return f"{query.artist or '*'} - {query.album or '*'} - {query.song or '*'}"
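
    # For orientation, assuming Page.Query exposes the `artist`, `album` and `song`
    # attributes used in the f-strings above: a query carrying only an artist name,
    # e.g. "Ghost Bath", would become the plaintext search string "Ghost Bath - *",
    # with "*" standing in for every part that was not given.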

    @classmethod
    def parse_artist_contact(cls, contact: BeautifulSoup) -> Artist:
        source_list: List[Source] = []
        name = None
        _id = None

        # source
        anchor = contact.find("a")
        if anchor is not None:
            href = anchor.get("href")
            name = anchor.get("title")

            if "-" in href:
                _id = href.split("-")[-1]

            source_list.append(Source(cls.SOURCE_TYPE, cls.HOST + href))

        # artist image
        image_soup = contact.find("img")
        if image_soup is not None:
            alt = image_soup.get("alt")
            if alt is not None:
                name = alt

            artist_thumbnail = image_soup.get("src")

        return Artist(
            _id=_id,
            name=name,
            source_list=source_list
        )

    @classmethod
    def parse_album_contact(cls, contact: BeautifulSoup) -> Album:
        """
        <div class="contacts__item">
            <a href="/release/ghost-bath-ghost-bath-2013-602489" title="Ghost Bath - 2013">

                <div class="contacts__img release">
                    <img alt="Ghost Bath" class="lozad" data-src="https://37s.musify.club/img/69/9060265/24178833.jpg"/>
                    <noscript><img alt="Ghost Bath" src="https://37s.musify.club/img/69/9060265/24178833.jpg"/></noscript>
                </div>

                <div class="contacts__info">
                    <strong>Ghost Bath - 2013</strong>
                    <small>Ghost Bath</small>
                    <small>Треков: 4</small> <!--tracks-->
                    <small><i class="zmdi zmdi-star zmdi-hc-fw"></i> 9,04</small>
                </div>
            </a>
        </div>
        """

        source_list: List[Source] = []
        title = None
        _id = None
        year = None
        artist_list: List[Artist] = []

        def parse_title_date(title_date: Optional[str], delimiter: str = " - "):
            nonlocal year
            nonlocal title

            if title_date is None:
                return

            title_date = title_date.strip()
            split_attr = title_date.split(delimiter)

            if len(split_attr) < 2:
                return
            if not split_attr[-1].isdigit():
                return

            year = int(split_attr[-1])
            title = delimiter.join(split_attr[:-1])

        # source
        anchor = contact.find("a")
        if anchor is not None:
            href = anchor.get("href")

            # get the title and year
            parse_title_date(anchor.get("title"))

            if "-" in href:
                _id = href.split("-")[-1]

            source_list.append(Source(cls.SOURCE_TYPE, cls.HOST + href))

        # cover art
        image_soup = contact.find("img")
        if image_soup is not None:
            alt = image_soup.get("alt")
            if alt is not None:
                title = alt

            cover_art = image_soup.get("src")

        contact_info_soup = contact.find("div", {"class": "contacts__info"})
        if contact_info_soup is not None:
            """
            <strong>Ghost Bath - 2013</strong>
            <small>Ghost Bath</small>
            <small>Треков: 4</small> <!--tracks-->
            <small><i class="zmdi zmdi-star zmdi-hc-fw"></i> 9,04</small>
            """

            title_soup = contact_info_soup.find("strong")
            if title_soup is not None:
                parse_title_date(title_soup.text)

            small_list = contact_info_soup.find_all("small")
            if len(small_list) == 3:
                # artist
                artist_soup: BeautifulSoup = small_list[0]
                raw_artist_str = artist_soup.text

                for artist_str in raw_artist_str.split("&\r\n"):
                    artist_str = artist_str.rstrip("& ...\r\n")
                    artist_str = artist_str.strip()

                    if artist_str.endswith("]") and "[" in artist_str:
                        artist_str = artist_str.rsplit("[", maxsplit=1)[0]

                    artist_list.append(Artist(name=artist_str))

                # track count and rating are currently not parsed any further
                track_count_soup: BeautifulSoup = small_list[1]
                rating_soup: BeautifulSoup = small_list[2]
            else:
                LOGGER.warning("expected 3 <small> elements in the contact info, got a different amount")

        return Album(
            _id=_id,
            title=title,
            source_list=source_list,
            date=ID3Timestamp(year=year),
            artist_list=artist_list
        )
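
    # A minimal usage sketch, assuming `contacts_item_html` holds the
    # "contacts__item" snippet from the docstring above: the resulting Album should
    # carry the title "Ghost Bath", the year 2013 and one Musify source. The exact
    # Album constructor semantics are assumed, not verified here.
    #
    #   contact = BeautifulSoup(contacts_item_html, "html.parser").find("div", {"class": "contacts__item"})
    #   album = Musify.parse_album_contact(contact)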

    @classmethod
    def parse_contact_container(cls, contact_container_soup: BeautifulSoup) -> List[Union[Artist, Album]]:
        contacts = []

        contact: BeautifulSoup
        for contact in contact_container_soup.find_all("div", {"class": "contacts__item"}):

            anchor_soup = contact.find("a")

            if anchor_soup is not None:
                url = anchor_soup.get("href")

                if url is not None:
                    if "artist" in url:
                        contacts.append(cls.parse_artist_contact(contact))
                    elif "release" in url:
                        contacts.append(cls.parse_album_contact(contact))

        return contacts

    @classmethod
    def parse_playlist_item(cls, playlist_item_soup: BeautifulSoup) -> Song:
        _id = None
        song_title = playlist_item_soup.get("data-name")
        artist_list: List[Artist] = []
        source_list: List[Source] = []

        # details
        playlist_details: BeautifulSoup = playlist_item_soup.find("div", {"class": "playlist__heading"})
        if playlist_details is not None:
            anchor_list = playlist_details.find_all("a")

            if len(anchor_list) >= 2:
                # artists
                artist_anchor: BeautifulSoup
                for artist_anchor in anchor_list[:-1]:
                    _id = None
                    href = artist_anchor.get("href")
                    artist_source: Source = Source(cls.SOURCE_TYPE, cls.HOST + href)
                    if "-" in href:
                        _id = href.split("-")[-1]

                    artist_list.append(Artist(
                        _id=_id,
                        name=artist_anchor.get_text(strip=True),
                        source_list=[artist_source]
                    ))

                # track
                track_soup: BeautifulSoup = anchor_list[-1]
                """
                TODO
                this anchor text may contain something like "(feat. some artist)",
                which is not acceptable
                """
                href = track_soup.get("href")
                if href is not None:
                    if "-" in href:
                        raw_id: str = href.split("-")[-1]
                        if raw_id.isdigit():
                            _id = raw_id
                    source_list.append(Source(cls.SOURCE_TYPE, cls.HOST + href))

            else:
                LOGGER.warning("expected at least 2 anchors (artist and track), but found fewer")
                LOGGER.warning(str(artist_list))

        """
        artist_name = playlist_item_soup.get("data-artist")
        if artist_name is not None:
            artist_list.append(Artist(name=artist_name))
        """
        id_attribute = playlist_item_soup.get("id")
        if id_attribute is not None:
            raw_id = id_attribute.replace("playerDiv", "")
            if raw_id.isdigit():
                _id = raw_id

        return Song(
            _id=_id,
            title=song_title,
            main_artist_list=artist_list,
            source_list=source_list
        )

    @classmethod
    def parse_playlist_soup(cls, playlist_soup: BeautifulSoup) -> List[Song]:
        song_list = []

        for playlist_item_soup in playlist_soup.find_all("div", {"class": "playlist__item"}):
            song_list.append(cls.parse_playlist_item(playlist_item_soup))

        return song_list

    @classmethod
    def plaintext_search(cls, query: str) -> Options:
        search_results = []

        r = cls.get_request(f"https://musify.club/search?searchText={query}")
        if r is None:
            return Options()
        search_soup: BeautifulSoup = BeautifulSoup(r.content, features="html.parser")

        # artists and albums
        # children of div class: contacts row
        for contact_container_soup in search_soup.find_all("div", {"class": "contacts"}):
            search_results.extend(cls.parse_contact_container(contact_container_soup))

        # songs
        # div class: playlist__item
        for playlist_soup in search_soup.find_all("div", {"class": "playlist"}):
            search_results.extend(cls.parse_playlist_soup(playlist_soup))

        return Options(search_results)

    @classmethod
    def parse_album_card(cls, album_card: BeautifulSoup, artist_name: str = None) -> Album:
        """
        <div class="card release-thumbnail" data-type="2">
            <a href="/release/ghost-bath-self-loather-2021-1554266">
                <img alt="Self Loather" class="card-img-top lozad" data-src="https://40s-a.musify.club/img/70/24826582/62624396.jpg"/>
                <noscript><img alt="Self Loather" src="https://40s-a.musify.club/img/70/24826582/62624396.jpg"/></noscript>
            </a>
            <div class="card-body">
                <h4 class="card-subtitle">
                    <a href="/release/ghost-bath-self-loather-2021-1554266">Self Loather</a>
                </h4>
            </div>
            <div class="card-footer"><p class="card-text"><a href="/albums/2021">2021</a></p></div>
            <div class="card-footer">
                <p class="card-text genre__labels">
                    <a href="/genre/depressive-black-132">Depressive Black</a><a href="/genre/post-black-metal-295">Post-Black Metal</a> </p>
            </div>
            <div class="card-footer">
                <small><i class="zmdi zmdi-calendar" title="Добавлено"></i> 13.11.2021</small>
                <small><i class="zmdi zmdi-star zmdi-hc-fw" title="Рейтинг"></i> 5,88</small>
            </div>
        </div>
        """

        album_type_map = defaultdict(lambda: AlbumType.OTHER, {
            1: AlbumType.OTHER,                 # literally other xD
            2: AlbumType.STUDIO_ALBUM,
            3: AlbumType.EP,
            4: AlbumType.SINGLE,
            5: AlbumType.OTHER,                 # BOOTLEG
            6: AlbumType.LIVE_ALBUM,
            7: AlbumType.COMPILATION_ALBUM,     # compilation of different artists
            8: AlbumType.MIXTAPE,
            9: AlbumType.DEMO,
            10: AlbumType.MIXTAPE,              # DJ Mixes
            11: AlbumType.COMPILATION_ALBUM,    # compilation of only this artist
            13: AlbumType.COMPILATION_ALBUM,    # unofficial
            14: AlbumType.MIXTAPE               # "Soundtracks"
        })

        _id: Optional[str] = None
        name: Optional[str] = None
        source_list: List[Source] = []
        timestamp: Optional[ID3Timestamp] = None
        album_type: AlbumType = AlbumType.OTHER
        album_status = None

        def set_name(new_name: str):
            nonlocal name
            nonlocal artist_name

            # example of just setting not working: https://musify.club/release/unjoy-eurythmie-psychonaut-4-tired-numb-still-alive-2012-324067
            if new_name.count(" - ") != 1:
                name = new_name
                return

            potential_artist_list, potential_name = new_name.split(" - ")
            unified_artist_list = string_processing.unify(potential_artist_list)
            if artist_name is not None:
                if string_processing.unify(artist_name) not in unified_artist_list:
                    name = new_name
                    return

                name = potential_name
                return

            name = new_name

        album_status_id = album_card.get("data-type")
        if album_status_id is not None and album_status_id.isdigit():
            album_status_id = int(album_status_id)
            album_type = album_type_map[album_status_id]

            if album_status_id == 5:
                album_status = AlbumStatus.BOOTLEG

        def parse_release_anchor(_anchor: BeautifulSoup, text_is_name=False):
            nonlocal _id
            nonlocal name
            nonlocal source_list

            if _anchor is None:
                return

            href = _anchor.get("href")
            if href is not None:
                # add url to sources
                source_list.append(Source(
                    cls.SOURCE_TYPE,
                    cls.HOST + href
                ))

                # split id from url
                split_href = href.split("-")
                if len(split_href) > 1:
                    _id = split_href[-1]

            if not text_is_name:
                return

            set_name(_anchor.text)

        anchor_list = album_card.find_all("a", recursive=False)
        if len(anchor_list) > 0:
            anchor = anchor_list[0]
            parse_release_anchor(anchor)

            thumbnail: BeautifulSoup = anchor.find("img")
            if thumbnail is not None:
                alt = thumbnail.get("alt")
                if alt is not None:
                    set_name(alt)

                image_url = thumbnail.get("src")
        else:
            LOGGER.debug("the card has no thumbnail or url")

        card_body = album_card.find("div", {"class": "card-body"})
        if card_body is not None:
            parse_release_anchor(card_body.find("a"), text_is_name=True)

        def parse_small_date(small_soup: BeautifulSoup):
            """
            <small>
                <i class="zmdi zmdi-calendar" title="Добавлено"></i>
                13.11.2021
            </small>
            """
            nonlocal timestamp

            italic_tagging_soup: BeautifulSoup = small_soup.find("i")
            if italic_tagging_soup is None:
                return
            if italic_tagging_soup.get("title") != "Добавлено":
                # "Добавлено" can be translated to "Added (at)"
                return

            raw_time = small_soup.text.strip()
            timestamp = ID3Timestamp.strptime(raw_time, "%d.%m.%Y")

        # parse small date
        card_footer_list = album_card.find_all("div", {"class": "card-footer"})
        if len(card_footer_list) != 3:
            LOGGER.debug("there are not exactly 3 card footers in a card")

        if len(card_footer_list) > 0:
            for any_small_soup in card_footer_list[-1].find_all("small"):
                parse_small_date(any_small_soup)
        else:
            LOGGER.debug("there is not even 1 footer in the album card")

        return Album(
            _id=_id,
            title=name,
            source_list=source_list,
            date=timestamp,
            album_type=album_type,
            album_status=album_status
        )
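
    # A minimal usage sketch, assuming `card_html` holds the release card sampled
    # in the docstring above (data-type="2", added 13.11.2021): the resulting Album
    # should be titled "Self Loather" with album_type AlbumType.STUDIO_ALBUM and a
    # date parsed from "13.11.2021". Album/ID3Timestamp semantics are assumed here.
    #
    #   card = BeautifulSoup(card_html, "html.parser").find("div", {"class": "card"})
    #   album = Musify.parse_album_card(card, artist_name="Ghost Bath")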

    @classmethod
    def get_discography(cls, url: MusifyUrl, artist_name: str = None, flat=False) -> List[Album]:
        """
        POST https://musify.club/artist/filteralbums
        ArtistID: 280348
        SortOrder.Property: dateCreated
        SortOrder.IsAscending: false
        X-Requested-With: XMLHttpRequest
        """

        endpoint = cls.HOST + "/" + url.source_type.value + "/filteralbums"

        r = cls.post_request(url=endpoint, json={
            "ArtistID": str(url.musify_id),
            "SortOrder.Property": "dateCreated",
            "SortOrder.IsAscending": False,
            "X-Requested-With": "XMLHttpRequest"
        })
        if r is None:
            return []
        soup: BeautifulSoup = BeautifulSoup(r.content, features="html.parser")

        discography: List[Album] = []
        for card_soup in soup.find_all("div", {"class": "card"}):
            new_album: Album = cls.parse_album_card(card_soup, artist_name)

            album_source: Source
            if not flat:
                for album_source in new_album.source_collection.get_sources_from_page(cls.SOURCE_TYPE):
                    new_album.merge(cls.fetch_album_from_source(album_source))

            discography.append(new_album)

        return discography

    @classmethod
    def get_artist_attributes(cls, url: MusifyUrl) -> Artist:
        """
        fetches the main Artist attributes from this endpoint
        https://musify.club/artist/ghost-bath-280348?_pjax=#bodyContent
        it needs to parse html

        :param url:
        :return:
        """

        r = cls.get_request(f"https://musify.club/{url.source_type.value}/{url.name_with_id}?_pjax=#bodyContent")
        if r is None:
            return Artist(_id=url.musify_id)

        soup = BeautifulSoup(r.content, "html.parser")

        """
        <ol class="breadcrumb" itemscope="" itemtype="http://schema.org/BreadcrumbList">
            <li class="breadcrumb-item" itemprop="itemListElement" itemscope="" itemtype="http://schema.org/ListItem"><a href="/" itemprop="item"><span itemprop="name">Главная</span><meta content="1" itemprop="position"/></a></li>
            <li class="breadcrumb-item" itemprop="itemListElement" itemscope="" itemtype="http://schema.org/ListItem"><a href="/artist" itemprop="item"><span itemprop="name">Исполнители</span><meta content="2" itemprop="position"/></a></li>
            <li class="breadcrumb-item active">Ghost Bath</li>
        </ol>

        <ul class="nav nav-tabs nav-fill">
            <li class="nav-item"><a class="active nav-link" href="/artist/ghost-bath-280348">песни (41)</a></li>
            <li class="nav-item"><a class="nav-link" href="/artist/ghost-bath-280348/releases">альбомы (12)</a></li>
            <li class="nav-item"><a class="nav-link" href="/artist/ghost-bath-280348/clips">видеоклипы (23)</a></li>
            <li class="nav-item"><a class="nav-link" href="/artist/ghost-bath-280348/photos">фото (38)</a></li>
        </ul>

        <header class="content__title">
            <h1>Ghost Bath</h1>
            <div class="actions">
            ...
            </div>
        </header>

        <ul class="icon-list">
            <li>
                <i class="zmdi zmdi-globe zmdi-hc-fw" title="Страна"></i>
                <i class="flag-icon US shadow"></i>
                Соединенные Штаты
            </li>
        </ul>
        """

        name = None
        source_list: List[Source] = []
        country = None
        notes: FormattedText = FormattedText()

        breadcrumbs: BeautifulSoup = soup.find("ol", {"class": "breadcrumb"})
        if breadcrumbs is not None:
            breadcrumb_list: List[BeautifulSoup] = breadcrumbs.find_all("li", {"class": "breadcrumb-item"}, recursive=False)
            if len(breadcrumb_list) == 3:
                name = breadcrumb_list[-1].get_text(strip=True)
            else:
                LOGGER.debug("breadcrumb layout on artist page changed")

        nav_tabs: BeautifulSoup = soup.find("ul", {"class": "nav-tabs"})
        if nav_tabs is not None:
            list_item: BeautifulSoup
            for list_item in nav_tabs.find_all("li", {"class": "nav-item"}, recursive=False):
                if not list_item.get_text(strip=True).startswith("песни"):
                    # "песни" translates to "songs"
                    continue

                anchor: BeautifulSoup = list_item.find("a")
                if anchor is None:
                    continue
                href = anchor.get("href")
                if href is None:
                    continue

                source_list.append(Source(
                    cls.SOURCE_TYPE,
                    cls.HOST + href
                ))

        content_title: BeautifulSoup = soup.find("header", {"class": "content__title"})
        if content_title is not None:
            h1_name: BeautifulSoup = content_title.find("h1", recursive=False)
            if h1_name is not None:
                name = h1_name.get_text(strip=True)

        # country and sources
        icon_list: BeautifulSoup = soup.find("ul", {"class": "icon-list"})
        if icon_list is not None:
            country_italic: BeautifulSoup = icon_list.find("i", {"class": "flag-icon"})
            if country_italic is not None:
                style_classes: set = {'flag-icon', 'shadow'}
                classes: set = set(country_italic.get("class"))

                country_set: set = classes.difference(style_classes)
                if len(country_set) != 1:
                    LOGGER.debug("the country set contains multiple values")
                if len(country_set) != 0:
                    """
                    This is the css file where all flags that can be used on musify
                    are laid out and styled.
                    Every flag has two upper-case letters, thus I assume they follow the alpha_2
                    standard, though I haven't checked.
                    https://musify.club/content/flags.min.css
                    """

                    country = pycountry.countries.get(alpha_2=list(country_set)[0])

            # get all additional sources
            additional_source: BeautifulSoup
            for additional_source in icon_list.find_all("a", {"class": "link"}):
                href = additional_source.get("href")
                if href is None:
                    continue
                new_src = Source.match_url(href)
                if new_src is None:
                    continue
                source_list.append(new_src)

        note_soup: BeautifulSoup = soup.find(id="text-main")
        if note_soup is not None:
            notes.html = note_soup.decode_contents()

        return Artist(
            _id=url.musify_id,
            name=name,
            country=country,
            source_list=source_list,
            notes=notes
        )

    @classmethod
    def fetch_artist_from_source(cls, source: Source, flat: bool = False) -> Artist:
        """
        fetches artist from source

        [x] discography
        [x] attributes
        [] picture gallery

        Args:
            source (Source): the source to fetch
            flat (bool, optional): if False, every album of the discography is fetched in detail. Defaults to False.

        Returns:
            Artist: the artist fetched
        """

        url = cls.parse_url(source.url)

        artist = cls.get_artist_attributes(url)

        discography: List[Album] = cls.get_discography(url, artist.name, flat)
        artist.main_album_collection.extend(discography)

        return artist

    @classmethod
    def fetch_album_from_source(cls, source: Source, flat: bool = False) -> Album:
        """
        fetches an album from its source,
        e.g. 'https://musify.club/release/linkin-park-hybrid-theory-2000-188'

        /html/musify/album_overview.html
        [] tracklist
        [] attributes *(name and country... wooooow and I waste one request for this)*
        [] ratings

        :param source:
        :param flat:
        :return:
        """
        url = cls.parse_url(source.url)

        endpoint = cls.HOST + "/release/" + url.name_with_id
        r = cls.get_request(endpoint)
        if r is None:
            return Album()

        soup = BeautifulSoup(r.content, "html.parser")

        # TODO: the actual parsing of the album page is not implemented yet
        return Album(title="works")