feat: fix Genius lyrics JSON un-escaping and add artwork duplicate-URL detection
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
ci/woodpecker/pr/woodpecker Pipeline was successful
ci/woodpecker/pull_request_closed/woodpecker Pipeline was successful

This commit is contained in:
Luna 2024-07-02 17:20:25 +02:00
parent 93c9a367a2
commit 5ce76c758e
4 changed files with 77 additions and 49 deletions

View File

@ -3,6 +3,7 @@ from collections import defaultdict
from pathlib import Path from pathlib import Path
import re import re
import logging import logging
import subprocess
from PIL import Image from PIL import Image
@ -90,7 +91,8 @@ class Pages:
exclude_pages = exclude_pages.union(SHADY_PAGES) exclude_pages = exclude_pages.union(SHADY_PAGES)
if not exclude_pages.issubset(ALL_PAGES): if not exclude_pages.issubset(ALL_PAGES):
raise ValueError(f"The excluded pages have to be a subset of all pages: {exclude_pages} | {ALL_PAGES}") raise ValueError(
f"The excluded pages have to be a subset of all pages: {exclude_pages} | {ALL_PAGES}")
def _set_to_tuple(page_set: Set[Type[Page]]) -> Tuple[Type[Page], ...]: def _set_to_tuple(page_set: Set[Type[Page]]) -> Tuple[Type[Page], ...]:
return tuple(sorted(page_set, key=lambda page: page.__name__)) return tuple(sorted(page_set, key=lambda page: page.__name__))
@ -98,11 +100,14 @@ class Pages:
self._pages_set: Set[Type[Page]] = ALL_PAGES.difference(exclude_pages) self._pages_set: Set[Type[Page]] = ALL_PAGES.difference(exclude_pages)
self.pages: Tuple[Type[Page], ...] = _set_to_tuple(self._pages_set) self.pages: Tuple[Type[Page], ...] = _set_to_tuple(self._pages_set)
self._audio_pages_set: Set[Type[Page]] = self._pages_set.intersection(AUDIO_PAGES) self._audio_pages_set: Set[Type[Page]
self.audio_pages: Tuple[Type[Page], ...] = _set_to_tuple(self._audio_pages_set) ] = self._pages_set.intersection(AUDIO_PAGES)
self.audio_pages: Tuple[Type[Page], ...] = _set_to_tuple(
self._audio_pages_set)
for page_type in self.pages: for page_type in self.pages:
self._page_instances[page_type] = page_type(fetch_options=self.fetch_options, download_options=self.download_options) self._page_instances[page_type] = page_type(
fetch_options=self.fetch_options, download_options=self.download_options)
self._source_to_page[page_type.SOURCE_TYPE] = page_type self._source_to_page[page_type.SOURCE_TYPE] = page_type
def _get_page_from_enum(self, source_page: SourceType) -> Page: def _get_page_from_enum(self, source_page: SourceType) -> Page:
@ -116,7 +121,8 @@ class Pages:
for page_type in self.pages: for page_type in self.pages:
result.add( result.add(
page=page_type, page=page_type,
search_result=self._page_instances[page_type].search(query=query) search_result=self._page_instances[page_type].search(
query=query)
) )
return result return result
@ -129,7 +135,8 @@ class Pages:
for source in data_object.source_collection.get_sources(source_type_sorting={ for source in data_object.source_collection.get_sources(source_type_sorting={
"only_with_page": True, "only_with_page": True,
}): }):
new_data_object = self.fetch_from_source(source=source, stop_at_level=stop_at_level) new_data_object = self.fetch_from_source(
source=source, stop_at_level=stop_at_level)
if new_data_object is not None: if new_data_object is not None:
data_object.merge(new_data_object) data_object.merge(new_data_object)
@ -165,11 +172,11 @@ class Pages:
return False return False
def _fetch_artist_artwork(self, artist: Artist, naming: dict): def _fetch_artist_artwork(self, artist: Artist, naming: dict):
naming: Dict[str, List[str]] = defaultdict(list, naming) naming: Dict[str, List[str]] = defaultdict(list, naming)
naming["artist"].append(artist.name) naming["artist"].append(artist.name)
naming["label"].extend([l.title_value for l in artist.label_collection]) naming["label"].extend(
[l.title_value for l in artist.label_collection])
# removing duplicates from the naming, and process the strings # removing duplicates from the naming, and process the strings
for key, value in naming.items(): for key, value in naming.items():
# https://stackoverflow.com/a/17016257 # https://stackoverflow.com/a/17016257
@ -182,8 +189,12 @@ class Pages:
naming["image_number"] = [str(image_number)] naming["image_number"] = [str(image_number)]
target = Target( target = Target(
relative_to_music_dir=True, relative_to_music_dir=True,
file_path=Path(self._parse_path_template(main_settings["artist_artwork_path"], naming=naming)) file_path=Path(self._parse_path_template(
main_settings["artist_artwork_path"], naming=naming))
) )
if not target.file_path.parent.exists():
target.create_path()
subprocess.Popen(["gio", "set", target.file_path.parent, "metadata::custom-icon", "file://"+str(target.file_path)])
with Image.open(artwork_variant.target.file_path) as img: with Image.open(artwork_variant.target.file_path) as img:
img.save(target.file_path, main_settings["image_format"]) img.save(target.file_path, main_settings["image_format"])
artwork_variant.target = Target artwork_variant.target = Target
@ -191,7 +202,8 @@ class Pages:
def download(self, data_object: DataObject, genre: str, **kwargs) -> DownloadResult: def download(self, data_object: DataObject, genre: str, **kwargs) -> DownloadResult:
# fetch the given object # fetch the given object
self.fetch_details(data_object) self.fetch_details(data_object)
output(f"\nDownloading {data_object.option_string}...", color=BColors.BOLD) output(
f"\nDownloading {data_object.option_string}...", color=BColors.BOLD)
# fetching all parent objects (e.g. if you only download a song) # fetching all parent objects (e.g. if you only download a song)
if not kwargs.get("fetched_upwards", False): if not kwargs.get("fetched_upwards", False):
@ -247,13 +259,15 @@ class Pages:
return set(re.findall(r"{([^}]+)}", path_template)) return set(re.findall(r"{([^}]+)}", path_template))
def _parse_path_template(self, path_template: str, naming: Dict[str, List[str]]) -> str: def _parse_path_template(self, path_template: str, naming: Dict[str, List[str]]) -> str:
field_names: Set[str] = self._extract_fields_from_template(path_template) field_names: Set[str] = self._extract_fields_from_template(
path_template)
for field in field_names: for field in field_names:
if len(naming[field]) == 0: if len(naming[field]) == 0:
raise MKMissingNameException(f"Missing field for {field}.") raise MKMissingNameException(f"Missing field for {field}.")
path_template = path_template.replace(f"{{{field}}}", naming[field][0]) path_template = path_template.replace(
f"{{{field}}}", naming[field][0])
return path_template return path_template
@ -272,7 +286,8 @@ class Pages:
naming["song"].append(song.title_value) naming["song"].append(song.title_value)
naming["isrc"].append(song.isrc) naming["isrc"].append(song.isrc)
naming["album"].extend(a.title_value for a in song.album_collection) naming["album"].extend(a.title_value for a in song.album_collection)
naming["album_type"].extend(a.album_type.value for a in song.album_collection) naming["album_type"].extend(
a.album_type.value for a in song.album_collection)
naming["artist"].extend(a.name for a in song.artist_collection) naming["artist"].extend(a.name for a in song.artist_collection)
naming["artist"].extend(a.name for a in song.feature_artist_collection) naming["artist"].extend(a.name for a in song.feature_artist_collection)
for a in song.album_collection: for a in song.album_collection:
@ -289,13 +304,16 @@ class Pages:
song.target_collection.append(Target( song.target_collection.append(Target(
relative_to_music_dir=True, relative_to_music_dir=True,
file_path=Path( file_path=Path(
self._parse_path_template(main_settings["download_path"], naming=naming), self._parse_path_template(
self._parse_path_template(main_settings["download_file"], naming=naming), main_settings["download_path"], naming=naming),
self._parse_path_template(
main_settings["download_file"], naming=naming),
) )
)) ))
for target in song.target_collection: for target in song.target_collection:
if target.exists: if target.exists:
output(f'{target.file_path} {BColors.OKGREEN.value}[already exists]', color=BColors.GREY) output(
f'{target.file_path} {BColors.OKGREEN.value}[already exists]', color=BColors.GREY)
r.found_on_disk += 1 r.found_on_disk += 1
if not self.download_options.download_again_if_found: if not self.download_options.download_again_if_found:
@ -316,8 +334,10 @@ class Pages:
break break
used_source = source used_source = source
streaming_results = source.page.download_song_to_target(source=source, target=tmp, desc="download") streaming_results = source.page.download_song_to_target(
skip_intervals = source.page.get_skip_intervals(song=song, source=source) source=source, target=tmp, desc="download")
skip_intervals = source.page.get_skip_intervals(
song=song, source=source)
# if something has been downloaded but it somehow failed, delete the file # if something has been downloaded but it somehow failed, delete the file
if streaming_results.is_fatal_error and tmp.exists: if streaming_results.is_fatal_error and tmp.exists:
@ -341,7 +361,8 @@ class Pages:
used_source.page.post_process_hook(song=song, temp_target=tmp) used_source.page.post_process_hook(song=song, temp_target=tmp)
if not found_on_disk or self.download_options.process_metadata_if_found: if not found_on_disk or self.download_options.process_metadata_if_found:
write_metadata_to_target(metadata=song.metadata, target=tmp, song=song) write_metadata_to_target(
metadata=song.metadata, target=tmp, song=song)
# copy the tmp target to the final locations # copy the tmp target to the final locations
for target in song.target_collection: for target in song.target_collection:
@ -359,5 +380,3 @@ class Pages:
_actual_page = self._source_to_page[source.source_type] _actual_page = self._source_to_page[source.source_type]
return _actual_page, self._page_instances[_actual_page].fetch_object_from_source(source=source, stop_at_level=stop_at_level) return _actual_page, self._page_instances[_actual_page].fetch_object_from_source(source=source, stop_at_level=stop_at_level)

View File

@ -175,6 +175,7 @@ class ArtworkCollection:
This will make the artworks ready for download and delete duplicates. This will make the artworks ready for download and delete duplicates.
""" """
artwork_hashes: list = list() artwork_hashes: list = list()
artwork_urls: list = list()
for artwork in self._data: for artwork in self._data:
index = 0 index = 0
for artwork_variant in artwork.variants: for artwork_variant in artwork.variants:
@ -182,6 +183,12 @@ class ArtworkCollection:
url=artwork_variant.url, url=artwork_variant.url,
name=artwork_variant.url, name=artwork_variant.url,
) )
if artwork_variant.url in artwork_urls:
artwork.variants.pop(index)
continue
artwork_urls.append(artwork_variant.url)
target: Target = artwork_variant.target target: Target = artwork_variant.target
with target.open("wb") as f: with target.open("wb") as f:
f.write(r.content) f.write(r.content)

View File

@ -185,7 +185,7 @@ class Song(Base):
return return
def _compile(self): def _compile(self):
self.artwork.compile(self.target_collection.get(0)) self.artwork.compile()
INDEX_DEPENDS_ON = ("title", "isrc", "source_collection") INDEX_DEPENDS_ON = ("title", "isrc", "source_collection")

View File

@ -1,4 +1,5 @@
import json import simplejson as json
from json_unescape import escape_json, unescape_json
from enum import Enum from enum import Enum
from typing import List, Optional, Type from typing import List, Optional, Type
from urllib.parse import urlencode, urlparse, urlunparse from urllib.parse import urlencode, urlparse, urlunparse
@ -268,7 +269,8 @@ class Genius(Page):
# get the contents that are between `JSON.parse('` and `');` # get the contents that are between `JSON.parse('` and `');`
content = self.get_json_content_from_response(r, start="window.__PRELOADED_STATE__ = JSON.parse('", end="');\n window.__APP_CONFIG__ = ") content = self.get_json_content_from_response(r, start="window.__PRELOADED_STATE__ = JSON.parse('", end="');\n window.__APP_CONFIG__ = ")
if content is not None: if content is not None:
content = content.replace("\\\\", "\\").replace('\\"', '"').replace("\\'", "'") #IMPLEMENT FIX FROM HAZEL
content = escape_json(content)
data = json.loads(content) data = json.loads(content)
lyrics_html = traverse_json_path(data, "songPage.lyricsData.body.html", default=None) lyrics_html = traverse_json_path(data, "songPage.lyricsData.body.html", default=None)