From 2724eb3e365483f9586b4ab96eeeb02c689ac545 Mon Sep 17 00:00:00 2001 From: Hellow Date: Sat, 18 Mar 2023 13:01:27 +0100 Subject: [PATCH] added fetching of linked sources to musify --- src/music_kraken/objects/song.py | 2 +- src/music_kraken/objects/source.py | 19 ++++++++++++++----- src/music_kraken/pages/musify.py | 26 ++++++++++++++++++++------ src/musify_search.py | 1 - 4 files changed, 35 insertions(+), 13 deletions(-) diff --git a/src/music_kraken/objects/song.py b/src/music_kraken/objects/song.py index 4169994..82a2cf0 100644 --- a/src/music_kraken/objects/song.py +++ b/src/music_kraken/objects/song.py @@ -377,7 +377,7 @@ All objects dependent on Artist class Artist(MainObject): - COLLECTION_ATTRIBUTES = ("feature_song_collection", "main_album_collection", "label_collection") + COLLECTION_ATTRIBUTES = ("feature_song_collection", "main_album_collection", "label_collection", "source_collection") SIMPLE_ATTRIBUTES = { "name": None, "unified_name": None, diff --git a/src/music_kraken/objects/source.py b/src/music_kraken/objects/source.py index 7213575..3214513 100644 --- a/src/music_kraken/objects/source.py +++ b/src/music_kraken/objects/source.py @@ -1,6 +1,7 @@ from collections import defaultdict from enum import Enum -from typing import List, Dict, Tuple +from typing import List, Dict, Tuple, Optional +from urllib.parse import urlparse from .metadata import Mapping, Metadata from .parents import DatabaseObject @@ -27,7 +28,8 @@ class SourcePages(Enum): # This has nothing to do with audio, but bands can be here INSTAGRAM = "instagram" FACEBOOK = "facebook" - TWITTER = "twitter" # I will use nitter though lol + TWITTER = "twitter" # I will use nitter though lol + MYSPACE = "myspace" # Yes somehow this ancient site is linked EVERYWHERE @classmethod def get_homepage(cls, attribute) -> str: @@ -42,7 +44,8 @@ class SourcePages(Enum): cls.INSTAGRAM: "https://www.instagram.com/", cls.FACEBOOK: "https://www.facebook.com/", cls.SPOTIFY: "https://open.spotify.com/", - cls.TWITTER: "https://twitter.com/" + cls.TWITTER: "https://twitter.com/", + cls.MYSPACE: "https://myspace.com/" } return homepage_map[attribute] @@ -71,11 +74,14 @@ class Source(DatabaseObject): self.url = url @classmethod - def match_url(cls, url: str): + def match_url(cls, url: str) -> Optional["Source"]: """ this shouldn't be used, unlesse you are not certain what the source is for the reason is that it is more inefficient """ + parsed = urlparse(url) + url = parsed.geturl() + if url.startswith("https://www.youtube"): return cls(SourcePages.YOUTUBE, url) @@ -101,6 +107,9 @@ class Source(DatabaseObject): if url.startswith("https://twitter"): return cls(SourcePages.TWITTER, url) + if url.startswith("https://myspace.com"): + return cls(SourcePages.MYSPACE, url) + def get_song_metadata(self) -> Metadata: return Metadata({ Mapping.FILE_WEBPAGE_URL: [self.url], @@ -157,4 +166,4 @@ class SourceCollection(Collection): getting the sources for a specific page like YouTube or musify """ - return self._page_to_source_list[source_page] + return self._page_to_source_list[source_page].copy() diff --git a/src/music_kraken/pages/musify.py b/src/music_kraken/pages/musify.py index 1dfe02f..6059495 100644 --- a/src/music_kraken/pages/musify.py +++ b/src/music_kraken/pages/musify.py @@ -102,7 +102,7 @@ class Musify(Page): @classmethod def parse_artist_contact(cls, contact: BeautifulSoup) -> Artist: source_list: List[Source] = [] - name = "" + name = None _id = None # source @@ -153,12 +153,15 @@ class Musify(Page): """ source_list: List[Source] = [] - title = "" + title = None _id = None year = None artist_list: List[Artist] = [] def parse_title_date(title_date: Optional[str], delimiter: str = " - "): + nonlocal year + nonlocal title + if title_date is None: return @@ -262,7 +265,7 @@ class Musify(Page): @classmethod def parse_playlist_item(cls, playlist_item_soup: BeautifulSoup) -> Song: _id = None - song_title = playlist_item_soup.get("data-name") or "" + song_title = playlist_item_soup.get("data-name") artist_list: List[Artist] = [] source_list: List[Source] = [] @@ -415,7 +418,7 @@ class Musify(Page): }) _id: Optional[str] = None - name: str = "" + name: str = None source_list: List[Source] = [] timestamp: Optional[ID3Timestamp] = None album_status = None @@ -586,7 +589,7 @@ class Musify(Page): """ - name = "" + name = None source_list: List[Source] = [] country = None notes: FormattedText = None @@ -625,6 +628,7 @@ class Musify(Page): if h1_name is not None: name = h1_name.get_text(strip=True) + # country and sources icon_list: BeautifulSoup = soup.find("ul", {"class": "icon-list"}) if icon_list is not None: country_italic: BeautifulSoup = icon_list.find("i", {"class", "flag-icon"}) @@ -646,10 +650,20 @@ class Musify(Page): country = pycountry.countries.get(alpha_2=list(country_set)[0]) + # get all additional sources + additional_source: BeautifulSoup + for additional_source in icon_list.find_all("a", {"class", "link"}): + href = additional_source.get("href") + if href is None: + continue + new_src = Source.match_url(href) + if new_src is None: + continue + source_list.append(new_src) + note_soup: BeautifulSoup = soup.find(id="text-main") if note_soup is not None: notes = FormattedText(html=note_soup.decode_contents()) - print(notes.plaintext) return Artist( _id=url.musify_id, diff --git a/src/musify_search.py b/src/musify_search.py index a95e172..bb80746 100644 --- a/src/musify_search.py +++ b/src/musify_search.py @@ -9,7 +9,6 @@ def search(): def fetch_artist(): artist = objects.Artist( - name="Ghost Bath", source_list=[objects.Source(objects.SourcePages.MUSIFY, "https://musify.club/artist/psychonaut-4-83193")] )