Compare commits
1.15.1...b4c73d56a7 (15 commits)
| Author | SHA1 | Date |
|---|---|---|
| | b4c73d56a7 | |
| | 1735ff4e1d | |
| | be09562632 | |
| | 29770825a4 | |
| | 81708ba100 | |
| | 301ff82bcf | |
| | 06ffae06a6 | |
| | 919a99885c | |
| | e20b14a9df | |
| | b933c6ac14 | |
| | 3c5bbc19af | |
| | 06acf22abb | |
| | 1e62d371cd | |
| | 24a90f1cdf | |
| | d9c711a2f8 | |
@@ -7,7 +7,8 @@ logging.getLogger().setLevel(logging.DEBUG)
 if __name__ == "__main__":
     commands = [
         "s: #a Ghost Bath",
         "4",
         "0",
         "d: 1",
     ]
@@ -14,7 +14,7 @@ from ..pages import Page, EncyclopaediaMetallum, Musify, YouTube, YoutubeMusic,

 ALL_PAGES: Set[Type[Page]] = {
-    EncyclopaediaMetallum,
+    # EncyclopaediaMetallum,
     Musify,
     YoutubeMusic,
     Bandcamp
@@ -1,5 +1,10 @@
 import mistune
 import html2markdown
+from markdownify import markdownify as md
+
+
+def plain_to_markdown(plain: str) -> str:
+    return plain.replace("\n", " \n")
+

 class FormattedText:
     html = ""
@@ -7,12 +12,15 @@ class FormattedText:
     def __init__(
         self,
         markdown: str = None,
-        html: str = None
+        html: str = None,
+        plain: str = None,
     ) -> None:
         if html is not None:
             self.html = html
         elif markdown is not None:
             self.html = mistune.markdown(markdown)
+        elif plain is not None:
+            self.html = mistune.markdown(plain_to_markdown(plain))

     @property
     def is_empty(self) -> bool:
@@ -28,7 +36,7 @@ class FormattedText:

     @property
     def markdown(self) -> str:
-        return html2markdown.convert(self.html)
+        return md(self.html).strip()

     def __str__(self) -> str:
         return self.markdown
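The `markdown` property now converts the stored HTML back to markdown with markdownify instead of html2markdown. A minimal standalone sketch of that conversion path, using only the two libraries and no project imports; the printed strings are expectations, not output taken from the diff:

```python
# Sketch of the FormattedText conversion path: markdown -> HTML via mistune,
# HTML -> markdown via markdownify's markdownify() (imported as md above).
import mistune
from markdownify import markdownify as md

html = mistune.markdown("**bold** lyric line")  # mistune 3 still ships this module-level helper
round_tripped = md(html).strip()                # what the new `markdown` property returns
print(html)           # e.g. "<p><strong>bold</strong> lyric line</p>"
print(round_tripped)  # e.g. "**bold** lyric line"
```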
@@ -34,6 +34,6 @@ class Lyrics(OuterProxy):
     @property
     def metadata(self) -> Metadata:
         return Metadata({
-            id3Mapping.UNSYNCED_LYRICS: [self.text.html]
+            id3Mapping.UNSYNCED_LYRICS: [self.text.markdown]
         })

@@ -32,6 +32,7 @@ class InnerData:

     def __init__(self, object_type, **kwargs):
         self._refers_to_instances = set()
+        self._fetched_from: dict = {}

         # collection : collection that is a collection of self
         self._is_collection_child: Dict[Collection, Collection] = {}
@@ -52,6 +53,8 @@ class InnerData:
         :return:
         """

+        self._fetched_from.update(__other._fetched_from)
+
         for key, value in __other.__dict__.copy().items():
             # just set the other value if self doesn't already have it
             if key not in self.__dict__ or (key in self.__dict__ and self.__dict__[key] == self._default_values.get(key)):
@@ -109,7 +112,6 @@ class OuterProxy:

             del kwargs[name]

-        self._fetched_from: dict = {}
         self._inner: InnerData = InnerData(type(self), **kwargs)
         self._inner._refers_to_instances.add(self)

@@ -192,7 +194,7 @@ class OuterProxy:
         if len(b._inner._refers_to_instances) > len(a._inner._refers_to_instances):
             a, b = b, a

-        object_trace(f"merging {type(a).__name__} [{a.title_string} | {a.id}] with {type(b).__name__} [{b.title_string} | {b.id}] called by [{' | '.join(f'{s.function} {Path(s.filename).name}:{str(s.lineno)}' for s in inspect.stack()[1:5])}]")
+        object_trace(f"merging {type(a).__name__} [{a.title_string} | {a.id}] with {type(b).__name__} [{b.title_string} | {b.id}]")

         for collection, child_collection in b._inner._is_collection_child.items():
             try:
@@ -220,13 +222,13 @@ class OuterProxy:

     def mark_as_fetched(self, *url_hash_list: List[str]):
         for url_hash in url_hash_list:
-            self._fetched_from[url_hash] = {
+            self._inner._fetched_from[url_hash] = {
                 "time": get_unix_time(),
                 "url": url_hash,
             }

     def already_fetched_from(self, url_hash: str) -> bool:
-        res = self._fetched_from.get(url_hash, None)
+        res = self._inner._fetched_from.get(url_hash, None)

         if res is None:
             return False
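Together with the InnerData hunks above, this moves the fetch bookkeeping from the proxy onto the shared inner object, so the "already fetched" state survives merges. A toy illustration of the idea, with simplified stand-in classes rather than the project's InnerData/OuterProxy:

```python
# Toy stand-ins showing why _fetched_from lives on the shared inner object:
# after a merge both proxies point at the same inner data and therefore see
# the same fetch history.
class Inner:
    def __init__(self):
        self._fetched_from = {}

class Proxy:
    def __init__(self):
        self._inner = Inner()

    def mark_as_fetched(self, url_hash):
        self._inner._fetched_from[url_hash] = {"url": url_hash}

    def already_fetched_from(self, url_hash):
        return url_hash in self._inner._fetched_from

a, b = Proxy(), Proxy()
b._inner = a._inner                       # crude stand-in for what merging does
a.mark_as_fetched("hash-1")
print(b.already_fetched_from("hash-1"))   # True: the state is shared, not per proxy
```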
@@ -86,11 +86,6 @@ class Song(Base):
     TITEL = "title"

     def __init_collections__(self) -> None:
-        """
-        self.album_collection.contain_given_in_attribute = {
-            "artist_collection": self.main_artist_collection,
-        }
-        """
         self.album_collection.sync_on_append = {
             "artist_collection": self.main_artist_collection,
         }
@@ -126,7 +121,7 @@ class Song(Base):
     def indexing_values(self) -> List[Tuple[str, object]]:
         return [
             ('id', self.id),
-            ('title', unify(self.unified_title)),
+            ('title', unify(self.title)),
             ('isrc', self.isrc),
             *[('url', source.url) for source in self.source_collection]
         ]
@@ -347,7 +342,6 @@ class Album(Base):
             tracksort_map[i] = existing_list.pop(0)
             tracksort_map[i].tracksort = i

-
     def compile(self, merge_into: bool = False):
         """
         compiles the recursive structures,
@@ -18,10 +18,12 @@ from ..objects import (
     Contact,
     ID3Timestamp,
     Lyrics,
-    FormattedText
+    FormattedText,
+    Artwork,
 )
 from ..connection import Connection
 from ..utils.support_classes.download_result import DownloadResult
+from ..utils.string_processing import clean_song_title
 from ..utils.config import main_settings, logging_settings
 from ..utils.shared import DEBUG

@@ -114,7 +116,7 @@ class Bandcamp(Page):

         if object_type is BandcampTypes.SONG:
             return Song(
-                title=name.strip(),
+                title=clean_song_title(name, artist_name=data["band_name"]),
                 source_list=source_list,
                 main_artist_list=[
                     Artist(
@@ -252,11 +254,18 @@ class Bandcamp(Page):
         artist.source_collection.append(source)
         return artist

-    def _parse_track_element(self, track: dict) -> Optional[Song]:
+    def _parse_track_element(self, track: dict, artwork: Artwork) -> Optional[Song]:
+        lyrics_list: List[Lyrics] = []
+
+        _lyrics: Optional[str] = track.get("item", {}).get("recordingOf", {}).get("lyrics", {}).get("text")
+        if _lyrics is not None:
+            lyrics_list.append(Lyrics(text=FormattedText(plain=_lyrics)))
+
         return Song(
-            title=track["item"]["name"].strip(),
+            title=clean_song_title(track["item"]["name"]),
             source_list=[Source(self.SOURCE_TYPE, track["item"]["mainEntityOfPage"])],
-            tracksort=int(track["position"])
+            tracksort=int(track["position"]),
+            artwork=artwork,
         )

     def fetch_album(self, source: Source, stop_at_level: int = 1) -> Album:
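The chained `.get()` calls above pull the lyrics text out of Bandcamp's track JSON without raising when a key is missing. A small sketch against a hand-made dict; the key layout is taken from the hunk, the sample values are invented:

```python
# Hand-made dict shaped like one entry of data["track"]["itemListElement"].
track = {
    "item": {
        "recordingOf": {"lyrics": {"text": "first verse\nsecond verse"}},
    },
    "position": 1,
}

_lyrics = track.get("item", {}).get("recordingOf", {}).get("lyrics", {}).get("text")
print(_lyrics is not None)  # True; a track without lyrics yields None instead of a KeyError
```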
@@ -289,12 +298,32 @@ class Bandcamp(Page):
                 )]
             )

+        artwork: Artwork = Artwork()
+
+        def _get_artwork_url(_data: dict) -> Optional[str]:
+            if "image" in _data:
+                return _data["image"]
+            for _property in _data.get("additionalProperty", []):
+                if _property.get("name") == "art_id":
+                    return f"https://f4.bcbits.com/img/a{_property.get('value')}_2.jpg"
+
+        _artwork_url = _get_artwork_url(data)
+        if _artwork_url is not None:
+            artwork.append(url=_artwork_url, width=350, height=350)
+        else:
+            for album_release in data.get("albumRelease", []):
+                _artwork_url = _get_artwork_url(album_release)
+                if _artwork_url is not None:
+                    artwork.append(url=_artwork_url, width=350, height=350)
+                    break
+
+
         for i, track_json in enumerate(data.get("track", {}).get("itemListElement", [])):
             if DEBUG:
                 dump_to_file(f"album_track_{i}.json", json.dumps(track_json), is_json=True, exit_after_dump=False)

             try:
-                album.song_collection.append(self._parse_track_element(track_json))
+                album.song_collection.append(self._parse_track_element(track_json, artwork=artwork))
             except KeyError:
                 continue
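The new `_get_artwork_url` helper prefers a direct `image` field and otherwise builds a bcbits image URL from an `art_id` property. Copied here as a runnable standalone sketch with made-up input dicts:

```python
from typing import Optional

def _get_artwork_url(_data: dict) -> Optional[str]:
    # Same lookup as in the hunk above: direct image URL first, art_id fallback second.
    if "image" in _data:
        return _data["image"]
    for _property in _data.get("additionalProperty", []):
        if _property.get("name") == "art_id":
            return f"https://f4.bcbits.com/img/a{_property.get('value')}_2.jpg"

print(_get_artwork_url({"image": "https://example.com/cover.jpg"}))
print(_get_artwork_url({"additionalProperty": [{"name": "art_id", "value": 123456}]}))
# -> https://example.com/cover.jpg
# -> https://f4.bcbits.com/img/a123456_2.jpg
```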
@@ -304,7 +333,6 @@ class Bandcamp(Page):
     def _fetch_lyrics(self, soup: BeautifulSoup) -> List[Lyrics]:
         track_lyrics = soup.find("div", {"class": "lyricsText"})
         if track_lyrics:
             self.LOGGER.debug(" Lyrics retrieved..")
             return [Lyrics(text=FormattedText(html=track_lyrics.prettify()))]

         return []
@@ -323,10 +351,9 @@ class Bandcamp(Page):
         if len(other_data_list) > 0:
             other_data = json.loads(other_data_list[0]["data-tralbum"])

-        if DEBUG:
-            dump_to_file("bandcamp_song_data.json", data_container.text, is_json=True, exit_after_dump=False)
-            dump_to_file("bandcamp_song_data_other.json", json.dumps(other_data), is_json=True, exit_after_dump=False)
-            dump_to_file("bandcamp_song_page.html", r.text, exit_after_dump=False)
+        dump_to_file("bandcamp_song_data.json", data_container.text, is_json=True, exit_after_dump=False)
+        dump_to_file("bandcamp_song_data_other.json", json.dumps(other_data), is_json=True, exit_after_dump=False)
+        dump_to_file("bandcamp_song_page.html", r.text, exit_after_dump=False)

         data = json.loads(data_container.text)
         album_data = data["inAlbum"]
@@ -337,8 +364,8 @@ class Bandcamp(Page):
                 mp3_url = value

         song = Song(
-            title=data["name"].strip(),
-            source_list=[Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]), audio_url=mp3_url)],
+            title=clean_song_title(data["name"], artist_name=artist_data["name"]),
+            source_list=[source, Source(self.SOURCE_TYPE, data.get("mainEntityOfPage", data["@id"]), audio_url=mp3_url)],
             album_list=[Album(
                 title=album_data["name"].strip(),
                 date=ID3Timestamp.strptime(data["datePublished"], "%d %b %Y %H:%M:%S %Z"),
@@ -351,8 +378,6 @@ class Bandcamp(Page):
             lyrics_list=self._fetch_lyrics(soup=soup)
         )

-        song.source_collection.append(source)
-
         return song

     def download_song_to_target(self, source: Source, target: Target, desc: str = None) -> DownloadResult:
@@ -3,7 +3,7 @@ from pathlib import Path
 import json
 import logging

-from .shared import DEBUG, DEBUG_LOGGING, DEBUG_DUMP, DEBUG_TRACE, DEBUG_OBJECT_TRACE
+from .shared import DEBUG, DEBUG_LOGGING, DEBUG_DUMP, DEBUG_TRACE, DEBUG_OBJECT_TRACE, DEBUG_OBJECT_TRACE_CALLSTACK
 from .config import config, read_config, write_config
 from .enums.colors import BColors
 from .path_manager import LOCATIONS
@@ -56,7 +56,8 @@ def object_trace(obj):
     if not DEBUG_OBJECT_TRACE:
         return

-    output("object: " + str(obj), BColors.GREY)
+    appendix = f" called by [{' | '.join(f'{s.function} {Path(s.filename).name}:{str(s.lineno)}' for s in inspect.stack()[1:5])}]" if DEBUG_OBJECT_TRACE_CALLSTACK else ""
+    output("object: " + str(obj) + appendix, BColors.GREY)


 """
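The new appendix renders the calling frames via `inspect.stack()[1:5]` as "function file.py:lineno" pairs joined with " | ". A reduced sketch of just that formatting, without the BColors/output helpers or the DEBUG flag handling:

```python
import inspect
from pathlib import Path

def object_trace(obj):
    # Same format as the hunk: up to four calling frames, newest first.
    appendix = " called by [" + " | ".join(
        f"{s.function} {Path(s.filename).name}:{s.lineno}" for s in inspect.stack()[1:5]
    ) + "]"
    print("object: " + str(obj) + appendix)

def create_song():
    object_trace("Song demo")

create_song()  # e.g. object: Song demo called by [create_song demo.py:13 | <module> demo.py:15]
```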
@@ -13,12 +13,13 @@ if not load_dotenv(Path(__file__).parent.parent.parent / ".env"):
 __stage__ = os.getenv("STAGE", "prod")

 DEBUG = (__stage__ == "dev") and True
-DEBUG_LOGGING = DEBUG and True
+DEBUG_LOGGING = DEBUG and False
 DEBUG_TRACE = DEBUG and True
-DEBUG_OBJECT_TRACE = DEBUG and False
+DEBUG_OBJECT_TRACE = DEBUG and True
+DEBUG_OBJECT_TRACE_CALLSTACK = DEBUG and False
 DEBUG_YOUTUBE_INITIALIZING = DEBUG and False
 DEBUG_PAGES = DEBUG and False
-DEBUG_DUMP = DEBUG and True
+DEBUG_DUMP = DEBUG and False

 if DEBUG:
     print("DEBUG ACTIVE")
@@ -1,6 +1,7 @@
-from typing import Tuple, Union
+from typing import Tuple, Union, Optional
 from pathlib import Path
 import string
+from functools import lru_cache

 from transliterate.exceptions import LanguageDetectionError
 from transliterate import translit
@@ -10,8 +11,11 @@ from pathvalidate import sanitize_filename
 COMMON_TITLE_APPENDIX_LIST: Tuple[str, ...] = (
     "(official video)",
 )
+OPEN_BRACKETS = "(["
+CLOSE_BRACKETS = ")]"
+DISALLOWED_SUBSTRING_IN_BRACKETS = ("official", "video", "audio", "lyrics", "prod", "remix", "ft", "feat", "ft.", "feat.")
-


+@lru_cache
 def unify(string: str) -> str:
     """
     returns a unified str, to make comparisons easy.
@@ -52,7 +56,8 @@ def fit_to_file_system(string: Union[str, Path]) -> Union[str, Path]:
     return fit_string(string)


-def clean_song_title(raw_song_title: str, artist_name: str) -> str:
+@lru_cache(maxsize=128)
+def clean_song_title(raw_song_title: str, artist_name: Optional[str] = None) -> str:
     """
     This function cleans common naming "conventions" for non clean song titles, like the title of youtube videos

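`clean_song_title` is now wrapped in `functools.lru_cache(maxsize=128)`, which helps when the same raw title and artist pair is cleaned repeatedly during a fetch. A generic sketch of that caching behaviour; `normalise` is a hypothetical stand-in, not the project function:

```python
from functools import lru_cache

@lru_cache(maxsize=128)
def normalise(title: str, artist_name: str = None) -> str:
    # Hypothetical, trivial stand-in for clean_song_title.
    print("computed:", title)
    return title.strip()

normalise("Convince Me to Bleed", "Ghost Bath")
normalise("Convince Me to Bleed", "Ghost Bath")  # served from the cache, no "computed:" line
print(normalise.cache_info().hits)               # 1
```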
@@ -64,19 +69,45 @@ def clean_song_title(raw_song_title: str, artist_name: str) -> str:
     - `song (prod. some producer)`
     """
     raw_song_title = raw_song_title.strip()
-    artist_name = artist_name.strip()

     # Clean official Video appendix
     for dirty_appendix in COMMON_TITLE_APPENDIX_LIST:
         if raw_song_title.lower().endswith(dirty_appendix):
             raw_song_title = raw_song_title[:-len(dirty_appendix)].strip()

-    # Remove artist from the start of the title
-    if raw_song_title.lower().startswith(artist_name.lower()):
-        raw_song_title = raw_song_title[len(artist_name):].strip()
-
-    if raw_song_title.startswith("-"):
-        raw_song_title = raw_song_title[1:].strip()
+    # remove brackets and their content if they contain disallowed substrings
+    for open_bracket, close_bracket in zip(OPEN_BRACKETS, CLOSE_BRACKETS):
+        if open_bracket not in raw_song_title or close_bracket not in raw_song_title:
+            continue
+
+        start = 0
+
+        while True:
+            try:
+                open_bracket_index = raw_song_title.index(open_bracket, start)
+            except ValueError:
+                break
+            try:
+                close_bracket_index = raw_song_title.index(close_bracket, open_bracket_index + 1)
+            except ValueError:
+                break
+
+            substring = raw_song_title[open_bracket_index + 1:close_bracket_index]
+            if any(disallowed_substring in substring for disallowed_substring in DISALLOWED_SUBSTRING_IN_BRACKETS):
+                raw_song_title = raw_song_title[:open_bracket_index] + raw_song_title[close_bracket_index + 1:]
+            else:
+                start = close_bracket_index + 1
+
+    # everything that requires the artist name
+    if artist_name is not None:
+        artist_name = artist_name.strip()
+
+        # Remove artist from the start of the title
+        if raw_song_title.lower().startswith(artist_name.lower()):
+            raw_song_title = raw_song_title[len(artist_name):].strip()
+
+        if raw_song_title.startswith("-"):
+            raw_song_title = raw_song_title[1:].strip()

     return raw_song_title.strip()
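The core of the extended logic is the bracket scan: any `(...)` or `[...]` chunk that contains a disallowed word is dropped, everything else is kept, and the artist prefix is only stripped when an artist name is passed. A condensed standalone copy of the bracket loop with two made-up titles (the matching rules are taken from the hunk, the two index lookups are merged into one try block):

```python
OPEN_BRACKETS = "(["
CLOSE_BRACKETS = ")]"
DISALLOWED_SUBSTRING_IN_BRACKETS = ("official", "video", "audio", "lyrics", "prod", "remix", "ft", "feat", "ft.", "feat.")

def strip_disallowed_brackets(title: str) -> str:
    for open_bracket, close_bracket in zip(OPEN_BRACKETS, CLOSE_BRACKETS):
        if open_bracket not in title or close_bracket not in title:
            continue
        start = 0
        while True:
            try:
                open_index = title.index(open_bracket, start)
                close_index = title.index(close_bracket, open_index + 1)
            except ValueError:
                break
            substring = title[open_index + 1:close_index]
            if any(bad in substring for bad in DISALLOWED_SUBSTRING_IN_BRACKETS):
                # drop the bracketed chunk entirely
                title = title[:open_index] + title[close_index + 1:]
            else:
                # keep it and continue scanning after the closing bracket
                start = close_index + 1
    return title.strip()

print(strip_disallowed_brackets("Convince Me to Bleed (official video)"))  # bracket dropped
print(strip_disallowed_brackets("Burial (Bell Witch cover)"))              # kept, no disallowed word
```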
@@ -56,6 +56,7 @@ dependencies = [

     "rich~=13.7.1",
     "mistune~=3.0.2",
+    "markdownify~=0.12.1",
     "html2markdown~=0.1.7",
     "jellyfish~=0.9.0",
     "transliterate~=1.10.2",