added fetching of linked sources to musify

This commit is contained in:
Hellow 2023-03-18 13:01:27 +01:00
parent 862c25dd35
commit 2724eb3e36
4 changed files with 35 additions and 13 deletions

View File

@ -377,7 +377,7 @@ All objects dependent on Artist
class Artist(MainObject):
COLLECTION_ATTRIBUTES = ("feature_song_collection", "main_album_collection", "label_collection")
COLLECTION_ATTRIBUTES = ("feature_song_collection", "main_album_collection", "label_collection", "source_collection")
SIMPLE_ATTRIBUTES = {
"name": None,
"unified_name": None,

View File

@ -1,6 +1,7 @@
from collections import defaultdict
from enum import Enum
from typing import List, Dict, Tuple
from typing import List, Dict, Tuple, Optional
from urllib.parse import urlparse
from .metadata import Mapping, Metadata
from .parents import DatabaseObject
@ -28,6 +29,7 @@ class SourcePages(Enum):
INSTAGRAM = "instagram"
FACEBOOK = "facebook"
TWITTER = "twitter" # I will use nitter though lol
MYSPACE = "myspace" # Yes somehow this ancient site is linked EVERYWHERE
@classmethod
def get_homepage(cls, attribute) -> str:
@ -42,7 +44,8 @@ class SourcePages(Enum):
cls.INSTAGRAM: "https://www.instagram.com/",
cls.FACEBOOK: "https://www.facebook.com/",
cls.SPOTIFY: "https://open.spotify.com/",
cls.TWITTER: "https://twitter.com/"
cls.TWITTER: "https://twitter.com/",
cls.MYSPACE: "https://myspace.com/"
}
return homepage_map[attribute]
@ -71,11 +74,14 @@ class Source(DatabaseObject):
self.url = url
@classmethod
def match_url(cls, url: str):
def match_url(cls, url: str) -> Optional["Source"]:
"""
this shouldn't be used unless you are not certain which page the url belongs to
the reason is that it is less efficient
"""
parsed = urlparse(url)
url = parsed.geturl()
if url.startswith("https://www.youtube"):
return cls(SourcePages.YOUTUBE, url)
@ -101,6 +107,9 @@ class Source(DatabaseObject):
if url.startswith("https://twitter"):
return cls(SourcePages.TWITTER, url)
if url.startswith("https://myspace.com"):
return cls(SourcePages.MYSPACE, url)
def get_song_metadata(self) -> Metadata:
return Metadata({
Mapping.FILE_WEBPAGE_URL: [self.url],
@ -157,4 +166,4 @@ class SourceCollection(Collection):
getting the sources for a specific page like
YouTube or musify
"""
return self._page_to_source_list[source_page]
return self._page_to_source_list[source_page].copy()

View File

@ -102,7 +102,7 @@ class Musify(Page):
@classmethod
def parse_artist_contact(cls, contact: BeautifulSoup) -> Artist:
source_list: List[Source] = []
name = ""
name = None
_id = None
# source
@ -153,12 +153,15 @@ class Musify(Page):
"""
source_list: List[Source] = []
title = ""
title = None
_id = None
year = None
artist_list: List[Artist] = []
def parse_title_date(title_date: Optional[str], delimiter: str = " - "):
nonlocal year
nonlocal title
if title_date is None:
return
@ -262,7 +265,7 @@ class Musify(Page):
@classmethod
def parse_playlist_item(cls, playlist_item_soup: BeautifulSoup) -> Song:
_id = None
song_title = playlist_item_soup.get("data-name") or ""
song_title = playlist_item_soup.get("data-name")
artist_list: List[Artist] = []
source_list: List[Source] = []
@ -415,7 +418,7 @@ class Musify(Page):
})
_id: Optional[str] = None
name: str = ""
name: str = None
source_list: List[Source] = []
timestamp: Optional[ID3Timestamp] = None
album_status = None
@ -586,7 +589,7 @@ class Musify(Page):
</li>
</ul>
"""
name = ""
name = None
source_list: List[Source] = []
country = None
notes: FormattedText = None
@ -625,6 +628,7 @@ class Musify(Page):
if h1_name is not None:
name = h1_name.get_text(strip=True)
# country and sources
icon_list: BeautifulSoup = soup.find("ul", {"class": "icon-list"})
if icon_list is not None:
country_italic: BeautifulSoup = icon_list.find("i", {"class", "flag-icon"})
@ -646,10 +650,20 @@ class Musify(Page):
country = pycountry.countries.get(alpha_2=list(country_set)[0])
# get all additional sources
additional_source: BeautifulSoup
for additional_source in icon_list.find_all("a", {"class", "link"}):
href = additional_source.get("href")
if href is None:
continue
new_src = Source.match_url(href)
if new_src is None:
continue
source_list.append(new_src)
note_soup: BeautifulSoup = soup.find(id="text-main")
if note_soup is not None:
notes = FormattedText(html=note_soup.decode_contents())
print(notes.plaintext)
return Artist(
_id=url.musify_id,

View File

@ -9,7 +9,6 @@ def search():
def fetch_artist():
artist = objects.Artist(
name="Ghost Bath",
source_list=[objects.Source(objects.SourcePages.MUSIFY, "https://musify.club/artist/psychonaut-4-83193")]
)