feat: improved tracing

feat: removed redundant commands from song
feat: stripped whitespaces from lyrics
2024-04-19 14:43:09 +02:00 · 2024-04-19 14:16:31 +02:00 · 2024-04-19 14:05:05 +02:00 · 2024-04-19 13:54:08 +02:00 · 2024-04-19 13:51:08 +02:00 · 2024-04-19 13:37:12 +02:00
11 changed files with 110 additions and 46 deletions
--- a/development/actual_donwload.py
+++ b/development/actual_donwload.py
@@ -7,7 +7,8 @@ logging.getLogger().setLevel(logging.DEBUG)
 if __name__ == "__main__":
    commands = [
        "s: #a Ghost Bath",
-        "4",
+        "0",
+        "d: 1",
    ]

    
--- a/music_kraken/download/page_attributes.py
+++ b/music_kraken/download/page_attributes.py
@@ -14,7 +14,7 @@ from ..pages import Page, EncyclopaediaMetallum, Musify, YouTube, YoutubeMusic,


 ALL_PAGES: Set[Type[Page]] = {
-    EncyclopaediaMetallum,
+    # EncyclopaediaMetallum,
    Musify,
    YoutubeMusic,
    Bandcamp
--- a/music_kraken/objects/formatted_text.py
+++ b/music_kraken/objects/formatted_text.py
@@ -1,5 +1,10 @@
 import mistune
-import html2markdown
+from markdownify import markdownify as md
+
+
+def plain_to_markdown(plain: str) -> str:
+    return plain.replace("\n", "  \n")
+

 class FormattedText:    
    html = ""
@@ -7,12 +12,15 @@ class FormattedText:
    def __init__(
            self,
            markdown: str = None,
-            html: str = None
+            html: str = None,
+            plain: str = None,
    ) -> None:
        if html is not None:
            self.html = html
        elif markdown is not None:
            self.html = mistune.markdown(markdown)
+        elif plain is not None:
+            self.html = mistune.markdown(plain_to_markdown(plain))

    @property
    def is_empty(self) -> bool:
@@ -28,7 +36,7 @@ class FormattedText:

    @property
    def markdown(self) -> str:
-        return html2markdown.convert(self.html)
+        return md(self.html).strip()

    def __str__(self) -> str:
        return self.markdown
--- a/music_kraken/objects/lyrics.py
+++ b/music_kraken/objects/lyrics.py
@@ -34,6 +34,6 @@ class Lyrics(OuterProxy):
    @property
    def metadata(self) -> Metadata:
        return Metadata({
-            id3Mapping.UNSYNCED_LYRICS: [self.text.html]
+            id3Mapping.UNSYNCED_LYRICS: [self.text.markdown]
        })

--- a/music_kraken/objects/parents.py
+++ b/music_kraken/objects/parents.py
@@ -32,6 +32,7 @@ class InnerData:

    def __init__(self, object_type, **kwargs):
        self._refers_to_instances = set()
+        self._fetched_from: dict = {}

        # collection : collection that is a collection of self
        self._is_collection_child: Dict[Collection, Collection] = {}
@@ -52,6 +53,8 @@ class InnerData:
        :return:
        """

+        self._fetched_from.update(__other._fetched_from)
+
        for key, value in __other.__dict__.copy().items():
            # just set the other value if self doesn't already have it
            if key not in self.__dict__ or (key in self.__dict__ and self.__dict__[key] == self._default_values.get(key)):
@@ -109,7 +112,6 @@ class OuterProxy:

                del kwargs[name]

-        self._fetched_from: dict = {}
        self._inner: InnerData = InnerData(type(self), **kwargs)
        self._inner._refers_to_instances.add(self)

@@ -192,7 +194,7 @@ class OuterProxy:
        if len(b._inner._refers_to_instances) > len(a._inner._refers_to_instances):
            a, b = b, a

-        object_trace(f"merging {type(a).__name__} [{a.title_string} | {a.id}] with {type(b).__name__} [{b.title_string} | {b.id}] called by [{' | '.join(f'{s.function} {Path(s.filename).name}:{str(s.lineno)}' for s in inspect.stack()[1:5])}]")
+        object_trace(f"merging {type(a).__name__} [{a.title_string} | {a.id}] with {type(b).__name__} [{b.title_string} | {b.id}]")
        
        for collection, child_collection in b._inner._is_collection_child.items():
            try:
@@ -220,13 +222,13 @@ class OuterProxy:

    def mark_as_fetched(self, *url_hash_list: List[str]):
        for url_hash in url_hash_list:
-            self._fetched_from[url_hash] = {
+            self._inner._fetched_from[url_hash] = {
                "time": get_unix_time(),
                "url": url_hash,
            }

    def already_fetched_from(self, url_hash: str) -> bool:
-        res = self._fetched_from.get(url_hash, None)
+        res = self._inner._fetched_from.get(url_hash, None)

        if res is None:
            return False
--- a/music_kraken/objects/song.py
+++ b/music_kraken/objects/song.py
@@ -86,11 +86,6 @@ class Song(Base):
    TITEL = "title"

    def __init_collections__(self) -> None:
-        """
-        self.album_collection.contain_given_in_attribute = {
-            "artist_collection": self.main_artist_collection,
-        }
-        """
        self.album_collection.sync_on_append = {
            "artist_collection": self.main_artist_collection,
        }
@@ -126,7 +121,7 @@ class Song(Base):
    def indexing_values(self) -> List[Tuple[str, object]]:
        return [
            ('id', self.id),
-            ('title', unify(self.unified_title)),
+            ('title', unify(self.title)),
            ('isrc', self.isrc),
            *[('url', source.url) for source in self.source_collection]
        ]
@@ -347,7 +342,6 @@ class Album(Base):
                tracksort_map[i] = existing_list.pop(0)
                tracksort_map[i].tracksort = i

-
    def compile(self, merge_into: bool = False):
        """
        compiles the recursive structures,
--- a/music_kraken/pages/bandcamp.py
+++ b/music_kraken/pages/bandcamp.py
@@ -18,10 +18,12 @@ from ..objects import (
    Contact,
    ID3Timestamp,
    Lyrics,
-    FormattedText
+    FormattedText,
+    Artwork,
 )
 from ..connection import Connection
 from ..utils.support_classes.download_result import DownloadResult
+from ..utils.string_processing import clean_song_title
 from ..utils.config import main_settings, logging_settings
 from ..utils.shared import DEBUG

@@ -114,7 +116,7 @@ class Bandcamp(Page):

        if object_type is BandcampTypes.SONG:
            return Song(
-                title=name.strip(),
+                title=clean_song_title(name, artist_name=data["band_name"]),
                source_list=source_list,
                main_artist_list=[
                    Artist(
@@ -252,11 +254,18 @@ class Bandcamp(Page):
        artist.source_collection.append(source)
        return artist

-    def _parse_track_element(self, track: dict) -> Optional[Song]:
+    def _parse_track_element(self, track: dict, artwork: Artwork) -> Optional[Song]:
+        lyrics_list: List[Lyrics] = []
+
+        _lyrics: Optional[str] = track.get("item", {}).get("recordingOf", {}).get("lyrics", {}).get("text")
+        if _lyrics is not None:
+            lyrics_list.append(Lyrics(text=FormattedText(plain=_lyrics)))
+
        return Song(
-            title=track["item"]["name"].strip(),
+            title=clean_song_title(track["item"]["name"]),
            source_list=[Source(self.SOURCE_TYPE, track["item"]["mainEntityOfPage"])],
-            tracksort=int(track["position"])
+            tracksort=int(track["position"]),
+            artwork=artwork,
        )

    def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album:
@@ -289,12 +298,32 @@ class Bandcamp(Page):
            )]
        )

+        artwork: Artwork = Artwork()
+
+        def _get_artwork_url(_data: dict) -> Optional[str]:
+            if "image" in _data:
+                return _data["image"]
+            for _property in _data.get("additionalProperty", []):
+                if _property.get("name") == "art_id":
+                    return f"https://f4.bcbits.com/img/a{_property.get('value')}_2.jpg"
+
+        _artwork_url = _get_artwork_url(data)
+        if _artwork_url is not None:
+            artwork.append(url=_artwork_url, width=350, height=350)
+        else:
+            for album_release in data.get("albumRelease", []):
+                _artwork_url = _get_artwork_url(album_release)
+                if _artwork_url is not None:
+                    artwork.append(url=_artwork_url, width=350, height=350)
+                    break
+
+
        for i, track_json in enumerate(data.get("track", {}).get("itemListElement", [])):
            if DEBUG:
                dump_to_file(f"album_track_{i}.json", json.dumps(track_json), is_json=True, exit_after_dump=False)

            try:
-                album.song_collection.append(self._parse_track_element(track_json))
+                album.song_collection.append(self._parse_track_element(track_json, artwork=artwork))
            except KeyError:
                continue

@@ -304,7 +333,6 @@ class Bandcamp(Page):
    def _fetch_lyrics(self, soup: BeautifulSoup) -> List[Lyrics]:
        track_lyrics = soup.find("div", {"class": "lyricsText"})
        if track_lyrics:
-            self.LOGGER.debug(" Lyrics retrieved..")
            return [Lyrics(text=FormattedText(html=track_lyrics.prettify()))]

        return []
@@ -323,10 +351,9 @@ class Bandcamp(Page):
        if len(other_data_list) > 0:
            other_data = json.loads(other_data_list[0]["data-tralbum"])

-        if DEBUG:
-            dump_to_file("bandcamp_song_data.json", data_container.text, is_json=True, exit_after_dump=False)
-            dump_to_file("bandcamp_song_data_other.json", json.dumps(other_data), is_json=True, exit_after_dump=False)
-            dump_to_file("bandcamp_song_page.html", r.text, exit_after_dump=False)
+        dump_to_file("bandcamp_song_data.json", data_container.text, is_json=True, exit_after_dump=False)
+        dump_to_file("bandcamp_song_data_other.json", json.dumps(other_data), is_json=True, exit_after_dump=False)
+        dump_to_file("bandcamp_song_page.html", r.text, exit_after_dump=False)

        data = json.loads(data_container.text)
        album_data = data["inAlbum"]
@@ -337,8 +364,8 @@ class Bandcamp(Page):
            mp3_url = value

        song = Song(
-            title=data["name"].strip(),
-            source_list=[Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]), audio_url=mp3_url)],
+            title=clean_song_title(data["name"], artist_name=artist_data["name"]),
+            source_list=[source, Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]), audio_url=mp3_url)],
            album_list=[Album(
                title=album_data["name"].strip(),
                date=ID3Timestamp.strptime(data["datePublished"], "%d %b %Y %H:%M:%S %Z"),
@@ -351,8 +378,6 @@ class Bandcamp(Page):
            lyrics_list=self._fetch_lyrics(soup=soup)
        )

-        song.source_collection.append(source)
-
        return song

    def download_song_to_target(self, source: Source, target: Target, desc: str = None) -> DownloadResult:
--- a/music_kraken/utils/init.py
+++ b/music_kraken/utils/init.py
@@ -3,7 +3,7 @@ from pathlib import Path
 import json
 import logging

-from .shared import DEBUG, DEBUG_LOGGING, DEBUG_DUMP, DEBUG_TRACE, DEBUG_OBJECT_TRACE
+from .shared import DEBUG, DEBUG_LOGGING, DEBUG_DUMP, DEBUG_TRACE, DEBUG_OBJECT_TRACE, DEBUG_OBJECT_TRACE_CALLSTACK
 from .config import config, read_config, write_config
 from .enums.colors import BColors
 from .path_manager import LOCATIONS
@@ -56,7 +56,8 @@ def object_trace(obj):
    if not DEBUG_OBJECT_TRACE:
        return

-    output("object: " + str(obj), BColors.GREY)
+    appendix =  f" called by [{' | '.join(f'{s.function} {Path(s.filename).name}:{str(s.lineno)}' for s in inspect.stack()[1:5])}]" if DEBUG_OBJECT_TRACE_CALLSTACK else ""
+    output("object: " + str(obj) + appendix, BColors.GREY)


 """
--- a/music_kraken/utils/shared.py
+++ b/music_kraken/utils/shared.py
@@ -13,12 +13,13 @@ if not load_dotenv(Path(__file__).parent.parent.parent / ".env"):
 __stage__ = os.getenv("STAGE", "prod")

 DEBUG = (__stage__ == "dev") and True
-DEBUG_LOGGING = DEBUG and True
+DEBUG_LOGGING = DEBUG and False
 DEBUG_TRACE = DEBUG and True
-DEBUG_OBJECT_TRACE = DEBUG and False
+DEBUG_OBJECT_TRACE = DEBUG and True
+DEBUG_OBJECT_TRACE_CALLSTACK = DEBUG and False
 DEBUG_YOUTUBE_INITIALIZING = DEBUG and False
 DEBUG_PAGES = DEBUG and False
-DEBUG_DUMP = DEBUG and True
+DEBUG_DUMP = DEBUG and False

 if DEBUG:
    print("DEBUG ACTIVE")
--- a/music_kraken/utils/string_processing.py
+++ b/music_kraken/utils/string_processing.py
@@ -1,6 +1,7 @@
-from typing import Tuple, Union
+from typing import Tuple, Union, Optional
 from pathlib import Path
 import string
+from functools import lru_cache

 from transliterate.exceptions import LanguageDetectionError
 from transliterate import translit
@@ -10,8 +11,11 @@ from pathvalidate import sanitize_filename
 COMMON_TITLE_APPENDIX_LIST: Tuple[str, ...] = (
    "(official video)",
 )
+OPEN_BRACKETS = "(["
+CLOSE_BRACKETS = ")]"
+DISALLOWED_SUBSTRING_IN_BRACKETS = ("official", "video", "audio", "lyrics", "prod", "remix", "ft", "feat", "ft.", "feat.")

-
+@lru_cache
 def unify(string: str) -> str:
    """
    returns a unified str, to make comparisons easy.
@@ -52,7 +56,8 @@ def fit_to_file_system(string: Union[str, Path]) -> Union[str, Path]:
        return fit_string(string)


-def clean_song_title(raw_song_title: str, artist_name: str) -> str:
+@lru_cache(maxsize=128)
+def clean_song_title(raw_song_title: str, artist_name: Optional[str] = None) -> str:
    """
    This function cleans common naming "conventions" for non clean song titles, like the title of youtube videos
    
@@ -64,19 +69,45 @@ def clean_song_title(raw_song_title: str, artist_name: str) -> str:
    - `song (prod. some producer)`
    """
    raw_song_title = raw_song_title.strip()
-    artist_name = artist_name.strip()

    # Clean official Video appendix
    for dirty_appendix in COMMON_TITLE_APPENDIX_LIST:
        if raw_song_title.lower().endswith(dirty_appendix):
            raw_song_title = raw_song_title[:-len(dirty_appendix)].strip()

-    # Remove artist from the start of the title
-    if raw_song_title.lower().startswith(artist_name.lower()):
-        raw_song_title = raw_song_title[len(artist_name):].strip()
+    # remove brackets and their content if they contain disallowed substrings
+    for open_bracket, close_bracket in zip(OPEN_BRACKETS, CLOSE_BRACKETS):
+        if open_bracket not in raw_song_title or close_bracket not in raw_song_title:
+            continue
+        
+        start = 0

-        if raw_song_title.startswith("-"):
-            raw_song_title = raw_song_title[1:].strip()
+        while True:
+            try:
+                open_bracket_index = raw_song_title.index(open_bracket, start)
+            except ValueError:
+                break
+            try:
+                close_bracket_index = raw_song_title.index(close_bracket, open_bracket_index + 1)
+            except ValueError:
+                break
+
+            substring = raw_song_title[open_bracket_index + 1:close_bracket_index]
+            if any(disallowed_substring in substring for disallowed_substring in DISALLOWED_SUBSTRING_IN_BRACKETS):
+                raw_song_title = raw_song_title[:open_bracket_index] + raw_song_title[close_bracket_index + 1:]
+            else:
+                start = close_bracket_index + 1
+
+    # everything that requires the artist name
+    if artist_name is not None:
+        artist_name = artist_name.strip()
+
+        # Remove artist from the start of the title
+        if raw_song_title.lower().startswith(artist_name.lower()):
+            raw_song_title = raw_song_title[len(artist_name):].strip()
+
+            if raw_song_title.startswith("-"):
+                raw_song_title = raw_song_title[1:].strip()

    return raw_song_title.strip()

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,6 +56,7 @@ dependencies = [

    "rich~=13.7.1",
    "mistune~=3.0.2",
+    "markdownify~=0.12.1",
    "html2markdown~=0.1.7",
    "jellyfish~=0.9.0",
    "transliterate~=1.10.2",
Author	SHA1	Message	Date
Lars Noack	b4c73d56a7	feat: improved tracing All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 14:43:09 +02:00
Lars Noack	1735ff4e1d	feat: removed redundant commands from song	2024-04-19 14:16:31 +02:00
Lars Noack	be09562632	feat: stripped whitespaces from lyrics All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 14:05:05 +02:00
Lars Noack	29770825a4	fix: unified wrong attribute in song, causing many duplicates All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 13:54:08 +02:00
Lars Noack	81708ba100	feat: switched to a more readable markdown converter All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 13:51:08 +02:00
Lars Noack	301ff82bcf	feat: implemented the merging from where it has been fetched from All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 13:37:12 +02:00
Lars Noack	06ffae06a6	fix: lyrics should be embedded in the markdown format rather than html	2024-04-19 12:57:34 +02:00
Lars Noack	919a99885c	feat: disabled debugging pages All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 12:47:36 +02:00
Lars Noack	e20b14a9df	feat: added fetching artworks to bandcamp All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 12:37:14 +02:00
Lars Noack	b933c6ac14	feat: improved the lyrics support for bandcamp All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 12:17:08 +02:00
Lars Noack	3c5bbc19af	feat: some slight performance improvements All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 12:04:13 +02:00
Lars Noack	06acf22abb	feat: improved the cleaning song title function to remove redundant brackets All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 12:02:54 +02:00
Lars Noack	1e62d371cd	feat: cleaned bandcamp songs All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 11:46:56 +02:00
Lars Noack	24a90f1cdf	feat: artist name in clean song title is optional	2024-04-19 11:43:21 +02:00
Lars Noack	d9c711a2f8	feat: added lru cache to unify function to speed up indexing	2024-04-19 11:40:00 +02:00