feat: disabled debugging pages

feat: added fetching artworks to bandcamp
feat: improved the lyrics support for bandcamp
2024-04-19 12:47:36 +02:00 · 2024-04-19 12:37:14 +02:00 · 2024-04-19 12:17:08 +02:00 · 2024-04-19 12:04:13 +02:00 · 2024-04-19 12:02:54 +02:00 · 2024-04-19 11:46:56 +02:00
5 changed files with 88 additions and 19 deletions
--- a/development/actual_donwload.py
+++ b/development/actual_donwload.py
@@ -7,7 +7,8 @@ logging.getLogger().setLevel(logging.DEBUG)
 if __name__ == "__main__":
    commands = [
        "s: #a Ghost Bath",
-        "4",
+        "0",
        "d: 1",
    ]
--- a/music_kraken/download/page_attributes.py
+++ b/music_kraken/download/page_attributes.py
@@ -14,7 +14,7 @@ from ..pages import Page, EncyclopaediaMetallum, Musify, YouTube, YoutubeMusic,
 ALL_PAGES: Set[Type[Page]] = {
-    EncyclopaediaMetallum,
+    # EncyclopaediaMetallum,
    Musify,
    YoutubeMusic,
    Bandcamp
--- a/music_kraken/objects/formatted_text.py
+++ b/music_kraken/objects/formatted_text.py
@@ -1,18 +1,26 @@
 import mistune
 import html2markdown
 def plain_to_markdown(plain: str) -> str:
    return plain.replace("\n", "  \n")
 class FormattedText:    
    html = ""
    def __init__(
            self,
            markdown: str = None,
-            html: str = None
+            html: str = None,
            plain: str = None,
    ) -> None:
        if html is not None:
            self.html = html
        elif markdown is not None:
            self.html = mistune.markdown(markdown)
        elif plain is not None:
            self.html = mistune.markdown(plain_to_markdown(plain))
    @property
    def is_empty(self) -> bool:
--- a/music_kraken/pages/bandcamp.py
+++ b/music_kraken/pages/bandcamp.py
@@ -18,10 +18,12 @@ from ..objects import (
    Contact,
    ID3Timestamp,
    Lyrics,
-    FormattedText
+    FormattedText,
    Artwork,
 )
 from ..connection import Connection
 from ..utils.support_classes.download_result import DownloadResult
 from ..utils.string_processing import clean_song_title
 from ..utils.config import main_settings, logging_settings
 from ..utils.shared import DEBUG
@@ -114,7 +116,7 @@ class Bandcamp(Page):
        if object_type is BandcampTypes.SONG:
            return Song(
-                title=name.strip(),
+                title=clean_song_title(name, artist_name=data["band_name"]),
                source_list=source_list,
                main_artist_list=[
                    Artist(
@@ -252,11 +254,18 @@ class Bandcamp(Page):
        artist.source_collection.append(source)
        return artist
-    def _parse_track_element(self, track: dict) -> Optional[Song]:
+    def _parse_track_element(self, track: dict, artwork: Artwork) -> Optional[Song]:
        lyrics_list: List[Lyrics] = []
        _lyrics: Optional[str] = track.get("item", {}).get("recordingOf", {}).get("lyrics", {}).get("text")
        if _lyrics is not None:
            lyrics_list.append(Lyrics(text=FormattedText(plain=_lyrics)))
        return Song(
-            title=track["item"]["name"].strip(),
+            title=clean_song_title(track["item"]["name"]),
            source_list=[Source(self.SOURCE_TYPE, track["item"]["mainEntityOfPage"])],
-            tracksort=int(track["position"])
+            tracksort=int(track["position"]),
            artwork=artwork,
        )
    def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album:
@@ -289,12 +298,32 @@ class Bandcamp(Page):
            )]
        )
        artwork: Artwork = Artwork()
        def _get_artwork_url(_data: dict) -> Optional[str]:
            if "image" in _data:
                return _data["image"]
            for _property in _data.get("additionalProperty", []):
                if _property.get("name") == "art_id":
                    return f"https://f4.bcbits.com/img/a{_property.get('value')}_2.jpg"
        _artwork_url = _get_artwork_url(data)
        if _artwork_url is not None:
            artwork.append(url=_artwork_url, width=350, height=350)
        else:
            for album_release in data.get("albumRelease", []):
                _artwork_url = _get_artwork_url(album_release)
                if _artwork_url is not None:
                    artwork.append(url=_artwork_url, width=350, height=350)
                    break
        for i, track_json in enumerate(data.get("track", {}).get("itemListElement", [])):
            if DEBUG:
                dump_to_file(f"album_track_{i}.json", json.dumps(track_json), is_json=True, exit_after_dump=False)
            try:
-                album.song_collection.append(self._parse_track_element(track_json))
+                album.song_collection.append(self._parse_track_element(track_json, artwork=artwork))
            except KeyError:
                continue
@@ -337,7 +366,7 @@ class Bandcamp(Page):
            mp3_url = value
        song = Song(
-            title=data["name"].strip(),
+            title=clean_song_title(data["name"], artist_name=artist_data["name"]),
            source_list=[Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]), audio_url=mp3_url)],
            album_list=[Album(
                title=album_data["name"].strip(),
--- a/music_kraken/utils/string_processing.py
+++ b/music_kraken/utils/string_processing.py
@@ -1,6 +1,7 @@
-from typing import Tuple, Union
+from typing import Tuple, Union, Optional
 from pathlib import Path
 import string
 from functools import lru_cache
 from transliterate.exceptions import LanguageDetectionError
 from transliterate import translit
@@ -10,8 +11,11 @@ from pathvalidate import sanitize_filename
 COMMON_TITLE_APPENDIX_LIST: Tuple[str, ...] = (
    "(official video)",
 )
 OPEN_BRACKETS = "(["
 CLOSE_BRACKETS = ")]"
 DISALLOWED_SUBSTRING_IN_BRACKETS = ("official", "video", "audio", "lyrics", "prod", "remix", "ft", "feat", "ft.", "feat.")
-
+@lru_cache
 def unify(string: str) -> str:
    """
    returns a unified str, to make comparisons easy.
@@ -52,7 +56,8 @@ def fit_to_file_system(string: Union[str, Path]) -> Union[str, Path]:
        return fit_string(string)
-def clean_song_title(raw_song_title: str, artist_name: str) -> str:
+@lru_cache(maxsize=128)
 def clean_song_title(raw_song_title: str, artist_name: Optional[str] = None) -> str:
    """
    This function cleans common naming "conventions" for non clean song titles, like the title of youtube videos
@@ -64,19 +69,45 @@ def clean_song_title(raw_song_title: str, artist_name: str) -> str:
    - `song (prod. some producer)`
    """
    raw_song_title = raw_song_title.strip()
    artist_name = artist_name.strip()
    # Clean official Video appendix
    for dirty_appendix in COMMON_TITLE_APPENDIX_LIST:
        if raw_song_title.lower().endswith(dirty_appendix):
            raw_song_title = raw_song_title[:-len(dirty_appendix)].strip()
-    # Remove artist from the start of the title
+    # remove brackets and their content if they contain disallowed substrings
-    if raw_song_title.lower().startswith(artist_name.lower()):
+    for open_bracket, close_bracket in zip(OPEN_BRACKETS, CLOSE_BRACKETS):
-        raw_song_title = raw_song_title[len(artist_name):].strip()
+        if open_bracket not in raw_song_title or close_bracket not in raw_song_title:
            continue
        start = 0
-        if raw_song_title.startswith("-"):
+        while True:
-            raw_song_title = raw_song_title[1:].strip()
+            try:
                open_bracket_index = raw_song_title.index(open_bracket, start)
            except ValueError:
                break
            try:
                close_bracket_index = raw_song_title.index(close_bracket, open_bracket_index + 1)
            except ValueError:
                break
            substring = raw_song_title[open_bracket_index + 1:close_bracket_index]
            if any(disallowed_substring in substring for disallowed_substring in DISALLOWED_SUBSTRING_IN_BRACKETS):
                raw_song_title = raw_song_title[:open_bracket_index] + raw_song_title[close_bracket_index + 1:]
            else:
                start = close_bracket_index + 1
    # everything that requires the artist name
    if artist_name is not None:
        artist_name = artist_name.strip()
        # Remove artist from the start of the title
        if raw_song_title.lower().startswith(artist_name.lower()):
            raw_song_title = raw_song_title[len(artist_name):].strip()
            if raw_song_title.startswith("-"):
                raw_song_title = raw_song_title[1:].strip()
    return raw_song_title.strip()
Author	SHA1	Message	Date
Lars Noack	919a99885c	feat: disabled debugging pages All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 12:47:36 +02:00
Lars Noack	e20b14a9df	feat: added fetching artworks to bandcamp All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 12:37:14 +02:00
Lars Noack	b933c6ac14	feat: improved the lyrics support for bandcamp All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 12:17:08 +02:00
Lars Noack	3c5bbc19af	feat: some slight performance improvements All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 12:04:13 +02:00
Lars Noack	06acf22abb	feat: improved the cleaning song title function to remove redundant brackets All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 12:02:54 +02:00
Lars Noack	1e62d371cd	feat: cleaned bandcamp songs All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 11:46:56 +02:00
Lars Noack	24a90f1cdf	feat: artist name in clean song title is optional	2024-04-19 11:43:21 +02:00
Lars Noack	d9c711a2f8	feat: added lru cache to unify function to speed up indexing	2024-04-19 11:40:00 +02:00