feat: stripped whitespace from lyrics

fix: unified wrong attribute in song, causing many duplicates
feat: switched to a more readable markdown converter
2024-04-19 14:05:05 +02:00 · 2024-04-19 13:54:08 +02:00 · 2024-04-19 13:51:08 +02:00 · 2024-04-19 13:37:12 +02:00 · 2024-04-19 12:57:34 +02:00 · 2024-04-19 12:47:36 +02:00
10 changed files with 103 additions and 35 deletions
--- a/development/actual_donwload.py
+++ b/development/actual_donwload.py
@@ -7,7 +7,8 @@ logging.getLogger().setLevel(logging.DEBUG)
 if __name__ == "__main__":
    commands = [
        "s: #a Ghost Bath",
-        "4",
+        "0",
        "d: 1",
    ]
--- a/music_kraken/download/page_attributes.py
+++ b/music_kraken/download/page_attributes.py
@@ -14,7 +14,7 @@ from ..pages import Page, EncyclopaediaMetallum, Musify, YouTube, YoutubeMusic,
 ALL_PAGES: Set[Type[Page]] = {
-    EncyclopaediaMetallum,
+    # EncyclopaediaMetallum,
    Musify,
    YoutubeMusic,
    Bandcamp
--- a/music_kraken/objects/formatted_text.py
+++ b/music_kraken/objects/formatted_text.py
@@ -1,5 +1,10 @@
 import mistune
-import html2markdown
+from markdownify import markdownify as md
 def plain_to_markdown(plain: str) -> str:
    return plain.replace("\n", "  \n")
 class FormattedText:    
    html = ""
@@ -7,12 +12,15 @@ class FormattedText:
    def __init__(
            self,
            markdown: str = None,
-            html: str = None
+            html: str = None,
            plain: str = None,
    ) -> None:
        if html is not None:
            self.html = html
        elif markdown is not None:
            self.html = mistune.markdown(markdown)
        elif plain is not None:
            self.html = mistune.markdown(plain_to_markdown(plain))
    @property
    def is_empty(self) -> bool:
@@ -28,7 +36,7 @@ class FormattedText:
    @property
    def markdown(self) -> str:
-        return html2markdown.convert(self.html)
+        return md(self.html).strip()
    def __str__(self) -> str:
        return self.markdown
--- a/music_kraken/objects/lyrics.py
+++ b/music_kraken/objects/lyrics.py
@@ -34,6 +34,6 @@ class Lyrics(OuterProxy):
    @property
    def metadata(self) -> Metadata:
        return Metadata({
-            id3Mapping.UNSYNCED_LYRICS: [self.text.html]
+            id3Mapping.UNSYNCED_LYRICS: [self.text.markdown]
        })
--- a/music_kraken/objects/parents.py
+++ b/music_kraken/objects/parents.py
@@ -32,6 +32,7 @@ class InnerData:
    def __init__(self, object_type, **kwargs):
        self._refers_to_instances = set()
        self._fetched_from: dict = {}
        # collection : collection that is a collection of self
        self._is_collection_child: Dict[Collection, Collection] = {}
@@ -52,6 +53,8 @@ class InnerData:
        :return:
        """
        self._fetched_from.update(__other._fetched_from)
        for key, value in __other.__dict__.copy().items():
            # just set the other value if self doesn't already have it
            if key not in self.__dict__ or (key in self.__dict__ and self.__dict__[key] == self._default_values.get(key)):
@@ -109,7 +112,6 @@ class OuterProxy:
                del kwargs[name]
        self._fetched_from: dict = {}
        self._inner: InnerData = InnerData(type(self), **kwargs)
        self._inner._refers_to_instances.add(self)
@@ -220,13 +222,13 @@ class OuterProxy:
    def mark_as_fetched(self, *url_hash_list: List[str]):
        for url_hash in url_hash_list:
-            self._fetched_from[url_hash] = {
+            self._inner._fetched_from[url_hash] = {
                "time": get_unix_time(),
                "url": url_hash,
            }
    def already_fetched_from(self, url_hash: str) -> bool:
-        res = self._fetched_from.get(url_hash, None)
+        res = self._inner._fetched_from.get(url_hash, None)
        if res is None:
            return False
--- a/music_kraken/objects/song.py
+++ b/music_kraken/objects/song.py
@@ -126,7 +126,7 @@ class Song(Base):
    def indexing_values(self) -> List[Tuple[str, object]]:
        return [
            ('id', self.id),
-            ('title', unify(self.unified_title)),
+            ('title', unify(self.title)),
            ('isrc', self.isrc),
            *[('url', source.url) for source in self.source_collection]
        ]
--- a/music_kraken/pages/bandcamp.py
+++ b/music_kraken/pages/bandcamp.py
@@ -18,10 +18,12 @@ from ..objects import (
    Contact,
    ID3Timestamp,
    Lyrics,
-    FormattedText
+    FormattedText,
    Artwork,
 )
 from ..connection import Connection
 from ..utils.support_classes.download_result import DownloadResult
 from ..utils.string_processing import clean_song_title
 from ..utils.config import main_settings, logging_settings
 from ..utils.shared import DEBUG
@@ -114,7 +116,7 @@ class Bandcamp(Page):
        if object_type is BandcampTypes.SONG:
            return Song(
-                title=name.strip(),
+                title=clean_song_title(name, artist_name=data["band_name"]),
                source_list=source_list,
                main_artist_list=[
                    Artist(
@@ -252,11 +254,18 @@ class Bandcamp(Page):
        artist.source_collection.append(source)
        return artist
-    def _parse_track_element(self, track: dict) -> Optional[Song]:
+    def _parse_track_element(self, track: dict, artwork: Artwork) -> Optional[Song]:
        lyrics_list: List[Lyrics] = []
        _lyrics: Optional[str] = track.get("item", {}).get("recordingOf", {}).get("lyrics", {}).get("text")
        if _lyrics is not None:
            lyrics_list.append(Lyrics(text=FormattedText(plain=_lyrics)))
        return Song(
-            title=track["item"]["name"].strip(),
+            title=clean_song_title(track["item"]["name"]),
            source_list=[Source(self.SOURCE_TYPE, track["item"]["mainEntityOfPage"])],
-            tracksort=int(track["position"])
+            tracksort=int(track["position"]),
            artwork=artwork,
        )
    def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album:
@@ -289,12 +298,32 @@ class Bandcamp(Page):
            )]
        )
        artwork: Artwork = Artwork()
        def _get_artwork_url(_data: dict) -> Optional[str]:
            if "image" in _data:
                return _data["image"]
            for _property in _data.get("additionalProperty", []):
                if _property.get("name") == "art_id":
                    return f"https://f4.bcbits.com/img/a{_property.get('value')}_2.jpg"
        _artwork_url = _get_artwork_url(data)
        if _artwork_url is not None:
            artwork.append(url=_artwork_url, width=350, height=350)
        else:
            for album_release in data.get("albumRelease", []):
                _artwork_url = _get_artwork_url(album_release)
                if _artwork_url is not None:
                    artwork.append(url=_artwork_url, width=350, height=350)
                    break
        for i, track_json in enumerate(data.get("track", {}).get("itemListElement", [])):
            if DEBUG:
                dump_to_file(f"album_track_{i}.json", json.dumps(track_json), is_json=True, exit_after_dump=False)
            try:
-                album.song_collection.append(self._parse_track_element(track_json))
+                album.song_collection.append(self._parse_track_element(track_json, artwork=artwork))
            except KeyError:
                continue
@@ -304,7 +333,6 @@ class Bandcamp(Page):
    def _fetch_lyrics(self, soup: BeautifulSoup) -> List[Lyrics]:
        track_lyrics = soup.find("div", {"class": "lyricsText"})
        if track_lyrics:
            self.LOGGER.debug(" Lyrics retrieved..")
            return [Lyrics(text=FormattedText(html=track_lyrics.prettify()))]
        return []
@@ -323,10 +351,9 @@ class Bandcamp(Page):
        if len(other_data_list) > 0:
            other_data = json.loads(other_data_list[0]["data-tralbum"])
-        if DEBUG:
+        dump_to_file("bandcamp_song_data.json", data_container.text, is_json=True, exit_after_dump=False)
-            dump_to_file("bandcamp_song_data.json", data_container.text, is_json=True, exit_after_dump=False)
+        dump_to_file("bandcamp_song_data_other.json", json.dumps(other_data), is_json=True, exit_after_dump=False)
-            dump_to_file("bandcamp_song_data_other.json", json.dumps(other_data), is_json=True, exit_after_dump=False)
+        dump_to_file("bandcamp_song_page.html", r.text, exit_after_dump=False)
            dump_to_file("bandcamp_song_page.html", r.text, exit_after_dump=False)
        data = json.loads(data_container.text)
        album_data = data["inAlbum"]
@@ -337,8 +364,8 @@ class Bandcamp(Page):
            mp3_url = value
        song = Song(
-            title=data["name"].strip(),
+            title=clean_song_title(data["name"], artist_name=artist_data["name"]),
-            source_list=[Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]), audio_url=mp3_url)],
+            source_list=[source, Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]), audio_url=mp3_url)],
            album_list=[Album(
                title=album_data["name"].strip(),
                date=ID3Timestamp.strptime(data["datePublished"], "%d %b %Y %H:%M:%S %Z"),
@@ -351,8 +378,6 @@ class Bandcamp(Page):
            lyrics_list=self._fetch_lyrics(soup=soup)
        )
        song.source_collection.append(source)
        return song
    def download_song_to_target(self, source: Source, target: Target, desc: str = None) -> DownloadResult:
--- a/music_kraken/utils/shared.py
+++ b/music_kraken/utils/shared.py
@@ -13,7 +13,7 @@ if not load_dotenv(Path(__file__).parent.parent.parent / ".env"):
 __stage__ = os.getenv("STAGE", "prod")
 DEBUG = (__stage__ == "dev") and True
-DEBUG_LOGGING = DEBUG and True
+DEBUG_LOGGING = DEBUG and False
 DEBUG_TRACE = DEBUG and True
 DEBUG_OBJECT_TRACE = DEBUG and False
 DEBUG_YOUTUBE_INITIALIZING = DEBUG and False
--- a/music_kraken/utils/string_processing.py
+++ b/music_kraken/utils/string_processing.py
@@ -1,6 +1,7 @@
-from typing import Tuple, Union
+from typing import Tuple, Union, Optional
 from pathlib import Path
 import string
 from functools import lru_cache
 from transliterate.exceptions import LanguageDetectionError
 from transliterate import translit
@@ -10,8 +11,11 @@ from pathvalidate import sanitize_filename
 COMMON_TITLE_APPENDIX_LIST: Tuple[str, ...] = (
    "(official video)",
 )
 OPEN_BRACKETS = "(["
 CLOSE_BRACKETS = ")]"
 DISALLOWED_SUBSTRING_IN_BRACKETS = ("official", "video", "audio", "lyrics", "prod", "remix", "ft", "feat", "ft.", "feat.")
-
+@lru_cache
 def unify(string: str) -> str:
    """
    returns a unified str, to make comparisons easy.
@@ -52,7 +56,8 @@ def fit_to_file_system(string: Union[str, Path]) -> Union[str, Path]:
        return fit_string(string)
-def clean_song_title(raw_song_title: str, artist_name: str) -> str:
+@lru_cache(maxsize=128)
 def clean_song_title(raw_song_title: str, artist_name: Optional[str] = None) -> str:
    """
    This function cleans common naming "conventions" for non clean song titles, like the title of youtube videos
@@ -64,19 +69,45 @@ def clean_song_title(raw_song_title: str, artist_name: str) -> str:
    - `song (prod. some producer)`
    """
    raw_song_title = raw_song_title.strip()
    artist_name = artist_name.strip()
    # Clean official Video appendix
    for dirty_appendix in COMMON_TITLE_APPENDIX_LIST:
        if raw_song_title.lower().endswith(dirty_appendix):
            raw_song_title = raw_song_title[:-len(dirty_appendix)].strip()
-    # Remove artist from the start of the title
+    # remove brackets and their content if they contain disallowed substrings
-    if raw_song_title.lower().startswith(artist_name.lower()):
+    for open_bracket, close_bracket in zip(OPEN_BRACKETS, CLOSE_BRACKETS):
-        raw_song_title = raw_song_title[len(artist_name):].strip()
+        if open_bracket not in raw_song_title or close_bracket not in raw_song_title:
            continue
        start = 0
-        if raw_song_title.startswith("-"):
+        while True:
-            raw_song_title = raw_song_title[1:].strip()
+            try:
                open_bracket_index = raw_song_title.index(open_bracket, start)
            except ValueError:
                break
            try:
                close_bracket_index = raw_song_title.index(close_bracket, open_bracket_index + 1)
            except ValueError:
                break
            substring = raw_song_title[open_bracket_index + 1:close_bracket_index]
            if any(disallowed_substring in substring for disallowed_substring in DISALLOWED_SUBSTRING_IN_BRACKETS):
                raw_song_title = raw_song_title[:open_bracket_index] + raw_song_title[close_bracket_index + 1:]
            else:
                start = close_bracket_index + 1
    # everything that requires the artist name
    if artist_name is not None:
        artist_name = artist_name.strip()
        # Remove artist from the start of the title
        if raw_song_title.lower().startswith(artist_name.lower()):
            raw_song_title = raw_song_title[len(artist_name):].strip()
            if raw_song_title.startswith("-"):
                raw_song_title = raw_song_title[1:].strip()
    return raw_song_title.strip()
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,6 +56,7 @@ dependencies = [
    "rich~=13.7.1",
    "mistune~=3.0.2",
    "markdownify~=0.12.1",
    "html2markdown~=0.1.7",
    "jellyfish~=0.9.0",
    "transliterate~=1.10.2",
Author	SHA1	Message	Date
Lars Noack	be09562632	feat: stripped whitespace from lyrics All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 14:05:05 +02:00
Lars Noack	29770825a4	fix: unified wrong attribute in song, causing many duplicates All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 13:54:08 +02:00
Lars Noack	81708ba100	feat: switched to a more readable markdown converter All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 13:51:08 +02:00
Lars Noack	301ff82bcf	feat: implemented the merging from where it has been fetched from All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 13:37:12 +02:00
Lars Noack	06ffae06a6	fix: lyrics should be embedded in the markdown format rather than html	2024-04-19 12:57:34 +02:00
Lars Noack	919a99885c	feat: disabled debugging pages All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 12:47:36 +02:00
Lars Noack	e20b14a9df	feat: added fetching artworks to bandcamp All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 12:37:14 +02:00
Lars Noack	b933c6ac14	feat: improved the lyrics support for bandcamp All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 12:17:08 +02:00
Lars Noack	3c5bbc19af	feat: some slight performance improvements All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 12:04:13 +02:00
Lars Noack	06acf22abb	feat: improved the song title cleaning function to remove redundant brackets All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 12:02:54 +02:00
Lars Noack	1e62d371cd	feat: cleaned bandcamp songs All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2024-04-19 11:46:56 +02:00
Lars Noack	24a90f1cdf	feat: artist name in clean song title is optional	2024-04-19 11:43:21 +02:00
Lars Noack	d9c711a2f8	feat: added lru cache to unify function to speed up indexing	2024-04-19 11:40:00 +02:00