added fetching of linked sources to musify

This commit is contained in:
Hellow 2023-03-18 13:01:27 +01:00
parent 862c25dd35
commit 2724eb3e36
4 changed files with 35 additions and 13 deletions

View File

@ -377,7 +377,7 @@ All objects dependent on Artist
class Artist(MainObject): class Artist(MainObject):
COLLECTION_ATTRIBUTES = ("feature_song_collection", "main_album_collection", "label_collection") COLLECTION_ATTRIBUTES = ("feature_song_collection", "main_album_collection", "label_collection", "source_collection")
SIMPLE_ATTRIBUTES = { SIMPLE_ATTRIBUTES = {
"name": None, "name": None,
"unified_name": None, "unified_name": None,

View File

@ -1,6 +1,7 @@
from collections import defaultdict from collections import defaultdict
from enum import Enum from enum import Enum
from typing import List, Dict, Tuple from typing import List, Dict, Tuple, Optional
from urllib.parse import urlparse
from .metadata import Mapping, Metadata from .metadata import Mapping, Metadata
from .parents import DatabaseObject from .parents import DatabaseObject
@ -28,6 +29,7 @@ class SourcePages(Enum):
INSTAGRAM = "instagram" INSTAGRAM = "instagram"
FACEBOOK = "facebook" FACEBOOK = "facebook"
TWITTER = "twitter" # I will use nitter though lol TWITTER = "twitter" # I will use nitter though lol
MYSPACE = "myspace" # Yes somehow this ancient site is linked EVERYWHERE
@classmethod @classmethod
def get_homepage(cls, attribute) -> str: def get_homepage(cls, attribute) -> str:
@ -42,7 +44,8 @@ class SourcePages(Enum):
cls.INSTAGRAM: "https://www.instagram.com/", cls.INSTAGRAM: "https://www.instagram.com/",
cls.FACEBOOK: "https://www.facebook.com/", cls.FACEBOOK: "https://www.facebook.com/",
cls.SPOTIFY: "https://open.spotify.com/", cls.SPOTIFY: "https://open.spotify.com/",
cls.TWITTER: "https://twitter.com/" cls.TWITTER: "https://twitter.com/",
cls.MYSPACE: "https://myspace.com/"
} }
return homepage_map[attribute] return homepage_map[attribute]
@ -71,11 +74,14 @@ class Source(DatabaseObject):
self.url = url self.url = url
@classmethod @classmethod
def match_url(cls, url: str): def match_url(cls, url: str) -> Optional["Source"]:
""" """
this shouldn't be used, unless you are not certain what the source is for this shouldn't be used, unless you are not certain what the source is for
the reason is that it is less efficient the reason is that it is less efficient
""" """
parsed = urlparse(url)
url = parsed.geturl()
if url.startswith("https://www.youtube"): if url.startswith("https://www.youtube"):
return cls(SourcePages.YOUTUBE, url) return cls(SourcePages.YOUTUBE, url)
@ -101,6 +107,9 @@ class Source(DatabaseObject):
if url.startswith("https://twitter"): if url.startswith("https://twitter"):
return cls(SourcePages.TWITTER, url) return cls(SourcePages.TWITTER, url)
if url.startswith("https://myspace.com"):
return cls(SourcePages.MYSPACE, url)
def get_song_metadata(self) -> Metadata: def get_song_metadata(self) -> Metadata:
return Metadata({ return Metadata({
Mapping.FILE_WEBPAGE_URL: [self.url], Mapping.FILE_WEBPAGE_URL: [self.url],
@ -157,4 +166,4 @@ class SourceCollection(Collection):
getting the sources for a specific page like getting the sources for a specific page like
YouTube or musify YouTube or musify
""" """
return self._page_to_source_list[source_page] return self._page_to_source_list[source_page].copy()

View File

@ -102,7 +102,7 @@ class Musify(Page):
@classmethod @classmethod
def parse_artist_contact(cls, contact: BeautifulSoup) -> Artist: def parse_artist_contact(cls, contact: BeautifulSoup) -> Artist:
source_list: List[Source] = [] source_list: List[Source] = []
name = "" name = None
_id = None _id = None
# source # source
@ -153,12 +153,15 @@ class Musify(Page):
""" """
source_list: List[Source] = [] source_list: List[Source] = []
title = "" title = None
_id = None _id = None
year = None year = None
artist_list: List[Artist] = [] artist_list: List[Artist] = []
def parse_title_date(title_date: Optional[str], delimiter: str = " - "): def parse_title_date(title_date: Optional[str], delimiter: str = " - "):
nonlocal year
nonlocal title
if title_date is None: if title_date is None:
return return
@ -262,7 +265,7 @@ class Musify(Page):
@classmethod @classmethod
def parse_playlist_item(cls, playlist_item_soup: BeautifulSoup) -> Song: def parse_playlist_item(cls, playlist_item_soup: BeautifulSoup) -> Song:
_id = None _id = None
song_title = playlist_item_soup.get("data-name") or "" song_title = playlist_item_soup.get("data-name")
artist_list: List[Artist] = [] artist_list: List[Artist] = []
source_list: List[Source] = [] source_list: List[Source] = []
@ -415,7 +418,7 @@ class Musify(Page):
}) })
_id: Optional[str] = None _id: Optional[str] = None
name: str = "" name: str = None
source_list: List[Source] = [] source_list: List[Source] = []
timestamp: Optional[ID3Timestamp] = None timestamp: Optional[ID3Timestamp] = None
album_status = None album_status = None
@ -586,7 +589,7 @@ class Musify(Page):
</li> </li>
</ul> </ul>
""" """
name = "" name = None
source_list: List[Source] = [] source_list: List[Source] = []
country = None country = None
notes: FormattedText = None notes: FormattedText = None
@ -625,6 +628,7 @@ class Musify(Page):
if h1_name is not None: if h1_name is not None:
name = h1_name.get_text(strip=True) name = h1_name.get_text(strip=True)
# country and sources
icon_list: BeautifulSoup = soup.find("ul", {"class": "icon-list"}) icon_list: BeautifulSoup = soup.find("ul", {"class": "icon-list"})
if icon_list is not None: if icon_list is not None:
country_italic: BeautifulSoup = icon_list.find("i", {"class", "flag-icon"}) country_italic: BeautifulSoup = icon_list.find("i", {"class", "flag-icon"})
@ -646,10 +650,20 @@ class Musify(Page):
country = pycountry.countries.get(alpha_2=list(country_set)[0]) country = pycountry.countries.get(alpha_2=list(country_set)[0])
# get all additional sources
additional_source: BeautifulSoup
for additional_source in icon_list.find_all("a", {"class", "link"}):
href = additional_source.get("href")
if href is None:
continue
new_src = Source.match_url(href)
if new_src is None:
continue
source_list.append(new_src)
note_soup: BeautifulSoup = soup.find(id="text-main") note_soup: BeautifulSoup = soup.find(id="text-main")
if note_soup is not None: if note_soup is not None:
notes = FormattedText(html=note_soup.decode_contents()) notes = FormattedText(html=note_soup.decode_contents())
print(notes.plaintext)
return Artist( return Artist(
_id=url.musify_id, _id=url.musify_id,

View File

@ -9,7 +9,6 @@ def search():
def fetch_artist(): def fetch_artist():
artist = objects.Artist( artist = objects.Artist(
name="Ghost Bath",
source_list=[objects.Source(objects.SourcePages.MUSIFY, "https://musify.club/artist/psychonaut-4-83193")] source_list=[objects.Source(objects.SourcePages.MUSIFY, "https://musify.club/artist/psychonaut-4-83193")]
) )