15 Commits

Author SHA1 Message Date
b4c73d56a7 feat: improved tracing
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 14:43:09 +02:00
1735ff4e1d feat: removed redundant commands from song 2024-04-19 14:16:31 +02:00
be09562632 feat: stripped whitespaces from lyrics
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 14:05:05 +02:00
29770825a4 fix: unified wrong attribute in song, causing many duplicates
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 13:54:08 +02:00
81708ba100 feat: switched to a more readable markdown converter
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 13:51:08 +02:00
301ff82bcf feat: implemented merging of the record of where an object has been fetched from
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 13:37:12 +02:00
06ffae06a6 fix: lyrics should be embedded in the markdown format rather than html 2024-04-19 12:57:34 +02:00
919a99885c feat: disabled debugging pages
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 12:47:36 +02:00
e20b14a9df feat: added fetching artworks to bandcamp
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 12:37:14 +02:00
b933c6ac14 feat: improved the lyrics support for bandcamp
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 12:17:08 +02:00
3c5bbc19af feat: some slight performance improvements
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 12:04:13 +02:00
06acf22abb feat: improved the cleaning song title function to remove redundant brackets
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 12:02:54 +02:00
1e62d371cd feat: cleaned bandcamp songs
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2024-04-19 11:46:56 +02:00
24a90f1cdf feat: artist name in clean song title is optional 2024-04-19 11:43:21 +02:00
d9c711a2f8 feat: added lru cache to unify function to speed up indexing 2024-04-19 11:40:00 +02:00
11 changed files with 110 additions and 46 deletions

View File

@@ -7,7 +7,8 @@ logging.getLogger().setLevel(logging.DEBUG)
if __name__ == "__main__":
commands = [
"s: #a Ghost Bath",
"4",
"0",
"d: 1",
]

View File

@@ -14,7 +14,7 @@ from ..pages import Page, EncyclopaediaMetallum, Musify, YouTube, YoutubeMusic,
ALL_PAGES: Set[Type[Page]] = {
EncyclopaediaMetallum,
# EncyclopaediaMetallum,
Musify,
YoutubeMusic,
Bandcamp

View File

@@ -1,5 +1,10 @@
import mistune
import html2markdown
from markdownify import markdownify as md
def plain_to_markdown(plain: str) -> str:
return plain.replace("\n", " \n")
class FormattedText:
html = ""
@@ -7,12 +12,15 @@ class FormattedText:
def __init__(
self,
markdown: str = None,
html: str = None
html: str = None,
plain: str = None,
) -> None:
if html is not None:
self.html = html
elif markdown is not None:
self.html = mistune.markdown(markdown)
elif plain is not None:
self.html = mistune.markdown(plain_to_markdown(plain))
@property
def is_empty(self) -> bool:
@@ -28,7 +36,7 @@ class FormattedText:
@property
def markdown(self) -> str:
return html2markdown.convert(self.html)
return md(self.html).strip()
def __str__(self) -> str:
return self.markdown

View File

@@ -34,6 +34,6 @@ class Lyrics(OuterProxy):
@property
def metadata(self) -> Metadata:
return Metadata({
id3Mapping.UNSYNCED_LYRICS: [self.text.html]
id3Mapping.UNSYNCED_LYRICS: [self.text.markdown]
})

View File

@@ -32,6 +32,7 @@ class InnerData:
def __init__(self, object_type, **kwargs):
self._refers_to_instances = set()
self._fetched_from: dict = {}
# collection : collection that is a collection of self
self._is_collection_child: Dict[Collection, Collection] = {}
@@ -52,6 +53,8 @@ class InnerData:
:return:
"""
self._fetched_from.update(__other._fetched_from)
for key, value in __other.__dict__.copy().items():
# just set the other value if self doesn't already have it
if key not in self.__dict__ or (key in self.__dict__ and self.__dict__[key] == self._default_values.get(key)):
@@ -109,7 +112,6 @@ class OuterProxy:
del kwargs[name]
self._fetched_from: dict = {}
self._inner: InnerData = InnerData(type(self), **kwargs)
self._inner._refers_to_instances.add(self)
@@ -192,7 +194,7 @@ class OuterProxy:
if len(b._inner._refers_to_instances) > len(a._inner._refers_to_instances):
a, b = b, a
object_trace(f"merging {type(a).__name__} [{a.title_string} | {a.id}] with {type(b).__name__} [{b.title_string} | {b.id}] called by [{' | '.join(f'{s.function} {Path(s.filename).name}:{str(s.lineno)}' for s in inspect.stack()[1:5])}]")
object_trace(f"merging {type(a).__name__} [{a.title_string} | {a.id}] with {type(b).__name__} [{b.title_string} | {b.id}]")
for collection, child_collection in b._inner._is_collection_child.items():
try:
@@ -220,13 +222,13 @@ class OuterProxy:
def mark_as_fetched(self, *url_hash_list: List[str]):
for url_hash in url_hash_list:
self._fetched_from[url_hash] = {
self._inner._fetched_from[url_hash] = {
"time": get_unix_time(),
"url": url_hash,
}
def already_fetched_from(self, url_hash: str) -> bool:
res = self._fetched_from.get(url_hash, None)
res = self._inner._fetched_from.get(url_hash, None)
if res is None:
return False

View File

@@ -86,11 +86,6 @@ class Song(Base):
TITEL = "title"
def __init_collections__(self) -> None:
"""
self.album_collection.contain_given_in_attribute = {
"artist_collection": self.main_artist_collection,
}
"""
self.album_collection.sync_on_append = {
"artist_collection": self.main_artist_collection,
}
@@ -126,7 +121,7 @@ class Song(Base):
def indexing_values(self) -> List[Tuple[str, object]]:
return [
('id', self.id),
('title', unify(self.unified_title)),
('title', unify(self.title)),
('isrc', self.isrc),
*[('url', source.url) for source in self.source_collection]
]
@@ -347,7 +342,6 @@ class Album(Base):
tracksort_map[i] = existing_list.pop(0)
tracksort_map[i].tracksort = i
def compile(self, merge_into: bool = False):
"""
compiles the recursive structures,

View File

@@ -18,10 +18,12 @@ from ..objects import (
Contact,
ID3Timestamp,
Lyrics,
FormattedText
FormattedText,
Artwork,
)
from ..connection import Connection
from ..utils.support_classes.download_result import DownloadResult
from ..utils.string_processing import clean_song_title
from ..utils.config import main_settings, logging_settings
from ..utils.shared import DEBUG
@@ -114,7 +116,7 @@ class Bandcamp(Page):
if object_type is BandcampTypes.SONG:
return Song(
title=name.strip(),
title=clean_song_title(name, artist_name=data["band_name"]),
source_list=source_list,
main_artist_list=[
Artist(
@@ -252,11 +254,18 @@ class Bandcamp(Page):
artist.source_collection.append(source)
return artist
def _parse_track_element(self, track: dict) -> Optional[Song]:
def _parse_track_element(self, track: dict, artwork: Artwork) -> Optional[Song]:
lyrics_list: List[Lyrics] = []
_lyrics: Optional[str] = track.get("item", {}).get("recordingOf", {}).get("lyrics", {}).get("text")
if _lyrics is not None:
lyrics_list.append(Lyrics(text=FormattedText(plain=_lyrics)))
return Song(
title=track["item"]["name"].strip(),
title=clean_song_title(track["item"]["name"]),
source_list=[Source(self.SOURCE_TYPE, track["item"]["mainEntityOfPage"])],
tracksort=int(track["position"])
tracksort=int(track["position"]),
artwork=artwork,
)
def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album:
@@ -289,12 +298,32 @@ class Bandcamp(Page):
)]
)
artwork: Artwork = Artwork()
def _get_artwork_url(_data: dict) -> Optional[str]:
if "image" in _data:
return _data["image"]
for _property in _data.get("additionalProperty", []):
if _property.get("name") == "art_id":
return f"https://f4.bcbits.com/img/a{_property.get('value')}_2.jpg"
_artwork_url = _get_artwork_url(data)
if _artwork_url is not None:
artwork.append(url=_artwork_url, width=350, height=350)
else:
for album_release in data.get("albumRelease", []):
_artwork_url = _get_artwork_url(album_release)
if _artwork_url is not None:
artwork.append(url=_artwork_url, width=350, height=350)
break
for i, track_json in enumerate(data.get("track", {}).get("itemListElement", [])):
if DEBUG:
dump_to_file(f"album_track_{i}.json", json.dumps(track_json), is_json=True, exit_after_dump=False)
try:
album.song_collection.append(self._parse_track_element(track_json))
album.song_collection.append(self._parse_track_element(track_json, artwork=artwork))
except KeyError:
continue
@@ -304,7 +333,6 @@ class Bandcamp(Page):
def _fetch_lyrics(self, soup: BeautifulSoup) -> List[Lyrics]:
track_lyrics = soup.find("div", {"class": "lyricsText"})
if track_lyrics:
self.LOGGER.debug(" Lyrics retrieved..")
return [Lyrics(text=FormattedText(html=track_lyrics.prettify()))]
return []
@@ -323,10 +351,9 @@ class Bandcamp(Page):
if len(other_data_list) > 0:
other_data = json.loads(other_data_list[0]["data-tralbum"])
if DEBUG:
dump_to_file("bandcamp_song_data.json", data_container.text, is_json=True, exit_after_dump=False)
dump_to_file("bandcamp_song_data_other.json", json.dumps(other_data), is_json=True, exit_after_dump=False)
dump_to_file("bandcamp_song_page.html", r.text, exit_after_dump=False)
dump_to_file("bandcamp_song_data.json", data_container.text, is_json=True, exit_after_dump=False)
dump_to_file("bandcamp_song_data_other.json", json.dumps(other_data), is_json=True, exit_after_dump=False)
dump_to_file("bandcamp_song_page.html", r.text, exit_after_dump=False)
data = json.loads(data_container.text)
album_data = data["inAlbum"]
@@ -337,8 +364,8 @@ class Bandcamp(Page):
mp3_url = value
song = Song(
title=data["name"].strip(),
source_list=[Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]), audio_url=mp3_url)],
title=clean_song_title(data["name"], artist_name=artist_data["name"]),
source_list=[source, Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]), audio_url=mp3_url)],
album_list=[Album(
title=album_data["name"].strip(),
date=ID3Timestamp.strptime(data["datePublished"], "%d %b %Y %H:%M:%S %Z"),
@@ -351,8 +378,6 @@ class Bandcamp(Page):
lyrics_list=self._fetch_lyrics(soup=soup)
)
song.source_collection.append(source)
return song
def download_song_to_target(self, source: Source, target: Target, desc: str = None) -> DownloadResult:

View File

@@ -3,7 +3,7 @@ from pathlib import Path
import json
import logging
from .shared import DEBUG, DEBUG_LOGGING, DEBUG_DUMP, DEBUG_TRACE, DEBUG_OBJECT_TRACE
from .shared import DEBUG, DEBUG_LOGGING, DEBUG_DUMP, DEBUG_TRACE, DEBUG_OBJECT_TRACE, DEBUG_OBJECT_TRACE_CALLSTACK
from .config import config, read_config, write_config
from .enums.colors import BColors
from .path_manager import LOCATIONS
@@ -56,7 +56,8 @@ def object_trace(obj):
if not DEBUG_OBJECT_TRACE:
return
output("object: " + str(obj), BColors.GREY)
appendix = f" called by [{' | '.join(f'{s.function} {Path(s.filename).name}:{str(s.lineno)}' for s in inspect.stack()[1:5])}]" if DEBUG_OBJECT_TRACE_CALLSTACK else ""
output("object: " + str(obj) + appendix, BColors.GREY)
"""

View File

@@ -13,12 +13,13 @@ if not load_dotenv(Path(__file__).parent.parent.parent / ".env"):
__stage__ = os.getenv("STAGE", "prod")
DEBUG = (__stage__ == "dev") and True
DEBUG_LOGGING = DEBUG and True
DEBUG_LOGGING = DEBUG and False
DEBUG_TRACE = DEBUG and True
DEBUG_OBJECT_TRACE = DEBUG and False
DEBUG_OBJECT_TRACE = DEBUG and True
DEBUG_OBJECT_TRACE_CALLSTACK = DEBUG and False
DEBUG_YOUTUBE_INITIALIZING = DEBUG and False
DEBUG_PAGES = DEBUG and False
DEBUG_DUMP = DEBUG and True
DEBUG_DUMP = DEBUG and False
if DEBUG:
print("DEBUG ACTIVE")

View File

@@ -1,6 +1,7 @@
from typing import Tuple, Union
from typing import Tuple, Union, Optional
from pathlib import Path
import string
from functools import lru_cache
from transliterate.exceptions import LanguageDetectionError
from transliterate import translit
@@ -10,8 +11,11 @@ from pathvalidate import sanitize_filename
COMMON_TITLE_APPENDIX_LIST: Tuple[str, ...] = (
"(official video)",
)
OPEN_BRACKETS = "(["
CLOSE_BRACKETS = ")]"
DISALLOWED_SUBSTRING_IN_BRACKETS = ("official", "video", "audio", "lyrics", "prod", "remix", "ft", "feat", "ft.", "feat.")
@lru_cache
def unify(string: str) -> str:
"""
returns a unified str, to make comparisons easy.
@@ -52,7 +56,8 @@ def fit_to_file_system(string: Union[str, Path]) -> Union[str, Path]:
return fit_string(string)
def clean_song_title(raw_song_title: str, artist_name: str) -> str:
@lru_cache(maxsize=128)
def clean_song_title(raw_song_title: str, artist_name: Optional[str] = None) -> str:
"""
This function cleans common naming "conventions" for non clean song titles, like the title of youtube videos
@@ -64,19 +69,45 @@ def clean_song_title(raw_song_title: str, artist_name: str) -> str:
- `song (prod. some producer)`
"""
raw_song_title = raw_song_title.strip()
artist_name = artist_name.strip()
# Clean official Video appendix
for dirty_appendix in COMMON_TITLE_APPENDIX_LIST:
if raw_song_title.lower().endswith(dirty_appendix):
raw_song_title = raw_song_title[:-len(dirty_appendix)].strip()
# Remove artist from the start of the title
if raw_song_title.lower().startswith(artist_name.lower()):
raw_song_title = raw_song_title[len(artist_name):].strip()
# remove brackets and their content if they contain disallowed substrings
for open_bracket, close_bracket in zip(OPEN_BRACKETS, CLOSE_BRACKETS):
if open_bracket not in raw_song_title or close_bracket not in raw_song_title:
continue
start = 0
if raw_song_title.startswith("-"):
raw_song_title = raw_song_title[1:].strip()
while True:
try:
open_bracket_index = raw_song_title.index(open_bracket, start)
except ValueError:
break
try:
close_bracket_index = raw_song_title.index(close_bracket, open_bracket_index + 1)
except ValueError:
break
substring = raw_song_title[open_bracket_index + 1:close_bracket_index]
if any(disallowed_substring in substring for disallowed_substring in DISALLOWED_SUBSTRING_IN_BRACKETS):
raw_song_title = raw_song_title[:open_bracket_index] + raw_song_title[close_bracket_index + 1:]
else:
start = close_bracket_index + 1
# everything that requires the artist name
if artist_name is not None:
artist_name = artist_name.strip()
# Remove artist from the start of the title
if raw_song_title.lower().startswith(artist_name.lower()):
raw_song_title = raw_song_title[len(artist_name):].strip()
if raw_song_title.startswith("-"):
raw_song_title = raw_song_title[1:].strip()
return raw_song_title.strip()

View File

@@ -56,6 +56,7 @@ dependencies = [
"rich~=13.7.1",
"mistune~=3.0.2",
"markdownify~=0.12.1",
"html2markdown~=0.1.7",
"jellyfish~=0.9.0",
"transliterate~=1.10.2",