13 Commits

Author SHA1 Message Date
be09562632 feat: stripped whitespace from lyrics
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 14:05:05 +02:00
29770825a4 fix: unified wrong attribute in song, causing many duplicates
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 13:54:08 +02:00
81708ba100 feat: switched to a more readable markdown converter
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 13:51:08 +02:00
301ff82bcf feat: implemented merging of the record of where an object has been fetched from
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 13:37:12 +02:00
06ffae06a6 fix: lyrics should be embedded in the markdown format rather than html 2024-04-19 12:57:34 +02:00
919a99885c feat: disabled debugging pages
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 12:47:36 +02:00
e20b14a9df feat: added fetching artworks to bandcamp
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 12:37:14 +02:00
b933c6ac14 feat: improved the lyrics support for bandcamp
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 12:17:08 +02:00
3c5bbc19af feat: some slight performance improvements
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 12:04:13 +02:00
06acf22abb feat: improved the song title cleaning function to remove redundant brackets
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 12:02:54 +02:00
1e62d371cd feat: cleaned bandcamp songs
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 11:46:56 +02:00
24a90f1cdf feat: artist name in clean song title is optional 2024-04-19 11:43:21 +02:00
d9c711a2f8 feat: added lru cache to unify function to speed up indexing 2024-04-19 11:40:00 +02:00
10 changed files with 103 additions and 35 deletions

View File

@@ -7,7 +7,8 @@ logging.getLogger().setLevel(logging.DEBUG)
if __name__ == "__main__": if __name__ == "__main__":
commands = [ commands = [
"s: #a Ghost Bath", "s: #a Ghost Bath",
"4", "0",
"d: 1",
] ]

View File

@@ -14,7 +14,7 @@ from ..pages import Page, EncyclopaediaMetallum, Musify, YouTube, YoutubeMusic,
ALL_PAGES: Set[Type[Page]] = { ALL_PAGES: Set[Type[Page]] = {
EncyclopaediaMetallum, # EncyclopaediaMetallum,
Musify, Musify,
YoutubeMusic, YoutubeMusic,
Bandcamp Bandcamp

View File

@@ -1,5 +1,10 @@
import mistune import mistune
import html2markdown from markdownify import markdownify as md
def plain_to_markdown(plain: str) -> str:
return plain.replace("\n", " \n")
class FormattedText: class FormattedText:
html = "" html = ""
@@ -7,12 +12,15 @@ class FormattedText:
def __init__( def __init__(
self, self,
markdown: str = None, markdown: str = None,
html: str = None html: str = None,
plain: str = None,
) -> None: ) -> None:
if html is not None: if html is not None:
self.html = html self.html = html
elif markdown is not None: elif markdown is not None:
self.html = mistune.markdown(markdown) self.html = mistune.markdown(markdown)
elif plain is not None:
self.html = mistune.markdown(plain_to_markdown(plain))
@property @property
def is_empty(self) -> bool: def is_empty(self) -> bool:
@@ -28,7 +36,7 @@ class FormattedText:
@property @property
def markdown(self) -> str: def markdown(self) -> str:
return html2markdown.convert(self.html) return md(self.html).strip()
def __str__(self) -> str: def __str__(self) -> str:
return self.markdown return self.markdown

View File

@@ -34,6 +34,6 @@ class Lyrics(OuterProxy):
@property @property
def metadata(self) -> Metadata: def metadata(self) -> Metadata:
return Metadata({ return Metadata({
id3Mapping.UNSYNCED_LYRICS: [self.text.html] id3Mapping.UNSYNCED_LYRICS: [self.text.markdown]
}) })

View File

@@ -32,6 +32,7 @@ class InnerData:
def __init__(self, object_type, **kwargs): def __init__(self, object_type, **kwargs):
self._refers_to_instances = set() self._refers_to_instances = set()
self._fetched_from: dict = {}
# collection : collection that is a collection of self # collection : collection that is a collection of self
self._is_collection_child: Dict[Collection, Collection] = {} self._is_collection_child: Dict[Collection, Collection] = {}
@@ -52,6 +53,8 @@ class InnerData:
:return: :return:
""" """
self._fetched_from.update(__other._fetched_from)
for key, value in __other.__dict__.copy().items(): for key, value in __other.__dict__.copy().items():
# just set the other value if self doesn't already have it # just set the other value if self doesn't already have it
if key not in self.__dict__ or (key in self.__dict__ and self.__dict__[key] == self._default_values.get(key)): if key not in self.__dict__ or (key in self.__dict__ and self.__dict__[key] == self._default_values.get(key)):
@@ -109,7 +112,6 @@ class OuterProxy:
del kwargs[name] del kwargs[name]
self._fetched_from: dict = {}
self._inner: InnerData = InnerData(type(self), **kwargs) self._inner: InnerData = InnerData(type(self), **kwargs)
self._inner._refers_to_instances.add(self) self._inner._refers_to_instances.add(self)
@@ -220,13 +222,13 @@ class OuterProxy:
def mark_as_fetched(self, *url_hash_list: List[str]): def mark_as_fetched(self, *url_hash_list: List[str]):
for url_hash in url_hash_list: for url_hash in url_hash_list:
self._fetched_from[url_hash] = { self._inner._fetched_from[url_hash] = {
"time": get_unix_time(), "time": get_unix_time(),
"url": url_hash, "url": url_hash,
} }
def already_fetched_from(self, url_hash: str) -> bool: def already_fetched_from(self, url_hash: str) -> bool:
res = self._fetched_from.get(url_hash, None) res = self._inner._fetched_from.get(url_hash, None)
if res is None: if res is None:
return False return False

View File

@@ -126,7 +126,7 @@ class Song(Base):
def indexing_values(self) -> List[Tuple[str, object]]: def indexing_values(self) -> List[Tuple[str, object]]:
return [ return [
('id', self.id), ('id', self.id),
('title', unify(self.unified_title)), ('title', unify(self.title)),
('isrc', self.isrc), ('isrc', self.isrc),
*[('url', source.url) for source in self.source_collection] *[('url', source.url) for source in self.source_collection]
] ]

View File

@@ -18,10 +18,12 @@ from ..objects import (
Contact, Contact,
ID3Timestamp, ID3Timestamp,
Lyrics, Lyrics,
FormattedText FormattedText,
Artwork,
) )
from ..connection import Connection from ..connection import Connection
from ..utils.support_classes.download_result import DownloadResult from ..utils.support_classes.download_result import DownloadResult
from ..utils.string_processing import clean_song_title
from ..utils.config import main_settings, logging_settings from ..utils.config import main_settings, logging_settings
from ..utils.shared import DEBUG from ..utils.shared import DEBUG
@@ -114,7 +116,7 @@ class Bandcamp(Page):
if object_type is BandcampTypes.SONG: if object_type is BandcampTypes.SONG:
return Song( return Song(
title=name.strip(), title=clean_song_title(name, artist_name=data["band_name"]),
source_list=source_list, source_list=source_list,
main_artist_list=[ main_artist_list=[
Artist( Artist(
@@ -252,11 +254,18 @@ class Bandcamp(Page):
artist.source_collection.append(source) artist.source_collection.append(source)
return artist return artist
def _parse_track_element(self, track: dict) -> Optional[Song]: def _parse_track_element(self, track: dict, artwork: Artwork) -> Optional[Song]:
lyrics_list: List[Lyrics] = []
_lyrics: Optional[str] = track.get("item", {}).get("recordingOf", {}).get("lyrics", {}).get("text")
if _lyrics is not None:
lyrics_list.append(Lyrics(text=FormattedText(plain=_lyrics)))
return Song( return Song(
title=track["item"]["name"].strip(), title=clean_song_title(track["item"]["name"]),
source_list=[Source(self.SOURCE_TYPE, track["item"]["mainEntityOfPage"])], source_list=[Source(self.SOURCE_TYPE, track["item"]["mainEntityOfPage"])],
tracksort=int(track["position"]) tracksort=int(track["position"]),
artwork=artwork,
) )
def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album: def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album:
@@ -289,12 +298,32 @@ class Bandcamp(Page):
)] )]
) )
artwork: Artwork = Artwork()
def _get_artwork_url(_data: dict) -> Optional[str]:
if "image" in _data:
return _data["image"]
for _property in _data.get("additionalProperty", []):
if _property.get("name") == "art_id":
return f"https://f4.bcbits.com/img/a{_property.get('value')}_2.jpg"
_artwork_url = _get_artwork_url(data)
if _artwork_url is not None:
artwork.append(url=_artwork_url, width=350, height=350)
else:
for album_release in data.get("albumRelease", []):
_artwork_url = _get_artwork_url(album_release)
if _artwork_url is not None:
artwork.append(url=_artwork_url, width=350, height=350)
break
for i, track_json in enumerate(data.get("track", {}).get("itemListElement", [])): for i, track_json in enumerate(data.get("track", {}).get("itemListElement", [])):
if DEBUG: if DEBUG:
dump_to_file(f"album_track_{i}.json", json.dumps(track_json), is_json=True, exit_after_dump=False) dump_to_file(f"album_track_{i}.json", json.dumps(track_json), is_json=True, exit_after_dump=False)
try: try:
album.song_collection.append(self._parse_track_element(track_json)) album.song_collection.append(self._parse_track_element(track_json, artwork=artwork))
except KeyError: except KeyError:
continue continue
@@ -304,7 +333,6 @@ class Bandcamp(Page):
def _fetch_lyrics(self, soup: BeautifulSoup) -> List[Lyrics]: def _fetch_lyrics(self, soup: BeautifulSoup) -> List[Lyrics]:
track_lyrics = soup.find("div", {"class": "lyricsText"}) track_lyrics = soup.find("div", {"class": "lyricsText"})
if track_lyrics: if track_lyrics:
self.LOGGER.debug(" Lyrics retrieved..")
return [Lyrics(text=FormattedText(html=track_lyrics.prettify()))] return [Lyrics(text=FormattedText(html=track_lyrics.prettify()))]
return [] return []
@@ -323,10 +351,9 @@ class Bandcamp(Page):
if len(other_data_list) > 0: if len(other_data_list) > 0:
other_data = json.loads(other_data_list[0]["data-tralbum"]) other_data = json.loads(other_data_list[0]["data-tralbum"])
if DEBUG: dump_to_file("bandcamp_song_data.json", data_container.text, is_json=True, exit_after_dump=False)
dump_to_file("bandcamp_song_data.json", data_container.text, is_json=True, exit_after_dump=False) dump_to_file("bandcamp_song_data_other.json", json.dumps(other_data), is_json=True, exit_after_dump=False)
dump_to_file("bandcamp_song_data_other.json", json.dumps(other_data), is_json=True, exit_after_dump=False) dump_to_file("bandcamp_song_page.html", r.text, exit_after_dump=False)
dump_to_file("bandcamp_song_page.html", r.text, exit_after_dump=False)
data = json.loads(data_container.text) data = json.loads(data_container.text)
album_data = data["inAlbum"] album_data = data["inAlbum"]
@@ -337,8 +364,8 @@ class Bandcamp(Page):
mp3_url = value mp3_url = value
song = Song( song = Song(
title=data["name"].strip(), title=clean_song_title(data["name"], artist_name=artist_data["name"]),
source_list=[Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]), audio_url=mp3_url)], source_list=[source, Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]), audio_url=mp3_url)],
album_list=[Album( album_list=[Album(
title=album_data["name"].strip(), title=album_data["name"].strip(),
date=ID3Timestamp.strptime(data["datePublished"], "%d %b %Y %H:%M:%S %Z"), date=ID3Timestamp.strptime(data["datePublished"], "%d %b %Y %H:%M:%S %Z"),
@@ -351,8 +378,6 @@ class Bandcamp(Page):
lyrics_list=self._fetch_lyrics(soup=soup) lyrics_list=self._fetch_lyrics(soup=soup)
) )
song.source_collection.append(source)
return song return song
def download_song_to_target(self, source: Source, target: Target, desc: str = None) -> DownloadResult: def download_song_to_target(self, source: Source, target: Target, desc: str = None) -> DownloadResult:

View File

@@ -13,7 +13,7 @@ if not load_dotenv(Path(__file__).parent.parent.parent / ".env"):
__stage__ = os.getenv("STAGE", "prod") __stage__ = os.getenv("STAGE", "prod")
DEBUG = (__stage__ == "dev") and True DEBUG = (__stage__ == "dev") and True
DEBUG_LOGGING = DEBUG and True DEBUG_LOGGING = DEBUG and False
DEBUG_TRACE = DEBUG and True DEBUG_TRACE = DEBUG and True
DEBUG_OBJECT_TRACE = DEBUG and False DEBUG_OBJECT_TRACE = DEBUG and False
DEBUG_YOUTUBE_INITIALIZING = DEBUG and False DEBUG_YOUTUBE_INITIALIZING = DEBUG and False

View File

@@ -1,6 +1,7 @@
from typing import Tuple, Union from typing import Tuple, Union, Optional
from pathlib import Path from pathlib import Path
import string import string
from functools import lru_cache
from transliterate.exceptions import LanguageDetectionError from transliterate.exceptions import LanguageDetectionError
from transliterate import translit from transliterate import translit
@@ -10,8 +11,11 @@ from pathvalidate import sanitize_filename
COMMON_TITLE_APPENDIX_LIST: Tuple[str, ...] = ( COMMON_TITLE_APPENDIX_LIST: Tuple[str, ...] = (
"(official video)", "(official video)",
) )
OPEN_BRACKETS = "(["
CLOSE_BRACKETS = ")]"
DISALLOWED_SUBSTRING_IN_BRACKETS = ("official", "video", "audio", "lyrics", "prod", "remix", "ft", "feat", "ft.", "feat.")
@lru_cache
def unify(string: str) -> str: def unify(string: str) -> str:
""" """
returns a unified str, to make comparisons easy. returns a unified str, to make comparisons easy.
@@ -52,7 +56,8 @@ def fit_to_file_system(string: Union[str, Path]) -> Union[str, Path]:
return fit_string(string) return fit_string(string)
def clean_song_title(raw_song_title: str, artist_name: str) -> str: @lru_cache(maxsize=128)
def clean_song_title(raw_song_title: str, artist_name: Optional[str] = None) -> str:
""" """
This function cleans common naming "conventions" for non clean song titles, like the title of youtube videos This function cleans common naming "conventions" for non clean song titles, like the title of youtube videos
@@ -64,19 +69,45 @@ def clean_song_title(raw_song_title: str, artist_name: str) -> str:
- `song (prod. some producer)` - `song (prod. some producer)`
""" """
raw_song_title = raw_song_title.strip() raw_song_title = raw_song_title.strip()
artist_name = artist_name.strip()
# Clean official Video appendix # Clean official Video appendix
for dirty_appendix in COMMON_TITLE_APPENDIX_LIST: for dirty_appendix in COMMON_TITLE_APPENDIX_LIST:
if raw_song_title.lower().endswith(dirty_appendix): if raw_song_title.lower().endswith(dirty_appendix):
raw_song_title = raw_song_title[:-len(dirty_appendix)].strip() raw_song_title = raw_song_title[:-len(dirty_appendix)].strip()
# Remove artist from the start of the title # remove brackets and their content if they contain disallowed substrings
if raw_song_title.lower().startswith(artist_name.lower()): for open_bracket, close_bracket in zip(OPEN_BRACKETS, CLOSE_BRACKETS):
raw_song_title = raw_song_title[len(artist_name):].strip() if open_bracket not in raw_song_title or close_bracket not in raw_song_title:
continue
start = 0
if raw_song_title.startswith("-"): while True:
raw_song_title = raw_song_title[1:].strip() try:
open_bracket_index = raw_song_title.index(open_bracket, start)
except ValueError:
break
try:
close_bracket_index = raw_song_title.index(close_bracket, open_bracket_index + 1)
except ValueError:
break
substring = raw_song_title[open_bracket_index + 1:close_bracket_index]
if any(disallowed_substring in substring for disallowed_substring in DISALLOWED_SUBSTRING_IN_BRACKETS):
raw_song_title = raw_song_title[:open_bracket_index] + raw_song_title[close_bracket_index + 1:]
else:
start = close_bracket_index + 1
# everything that requires the artist name
if artist_name is not None:
artist_name = artist_name.strip()
# Remove artist from the start of the title
if raw_song_title.lower().startswith(artist_name.lower()):
raw_song_title = raw_song_title[len(artist_name):].strip()
if raw_song_title.startswith("-"):
raw_song_title = raw_song_title[1:].strip()
return raw_song_title.strip() return raw_song_title.strip()

View File

@@ -56,6 +56,7 @@ dependencies = [
"rich~=13.7.1", "rich~=13.7.1",
"mistune~=3.0.2", "mistune~=3.0.2",
"markdownify~=0.12.1",
"html2markdown~=0.1.7", "html2markdown~=0.1.7",
"jellyfish~=0.9.0", "jellyfish~=0.9.0",
"transliterate~=1.10.2", "transliterate~=1.10.2",