From fe1751a7f331d2596d8609cfc195b4df9060fd8d Mon Sep 17 00:00:00 2001 From: Hellow <74311245+HeIIow2@users.noreply.github.com> Date: Thu, 27 Jul 2023 20:44:24 +0200 Subject: [PATCH] clean up dirty song titles --- src/music_kraken/pages/youtube.py | 5 +++ src/music_kraken/utils/string_processing.py | 37 ++++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/src/music_kraken/pages/youtube.py b/src/music_kraken/pages/youtube.py index 3d63d7c..576e22a 100644 --- a/src/music_kraken/pages/youtube.py +++ b/src/music_kraken/pages/youtube.py @@ -19,6 +19,7 @@ from ..objects import ( ID3Timestamp ) from ..connection import Connection +from ..utils.string_processing import clean_song_title from ..utils.support_classes import DownloadResult from ..utils.shared import YOUTUBE_LOGGER, INVIDIOUS_INSTANCE, BITRATE, ENABLE_SPONSOR_BLOCK, PIPED_INSTANCE, SLEEP_AFTER_YOUTUBE_403 @@ -226,6 +227,10 @@ class YouTube(Page): )) else: + # If the song is not a topic song in the beginning, it cleans the title. If it is from a topic channel, it is clean anyways + # If cleaned data is returned by the api, it will be overridden in the next step anyways + title = clean_song_title(title, _author) + for music_track in data.get("musicTracks", []): title = music_track["song"] license_str = music_track["license"] diff --git a/src/music_kraken/utils/string_processing.py b/src/music_kraken/utils/string_processing.py index a11c56d..d622626 100644 --- a/src/music_kraken/utils/string_processing.py +++ b/src/music_kraken/utils/string_processing.py @@ -1,9 +1,15 @@ +from typing import Tuple + from transliterate.exceptions import LanguageDetectionError from transliterate import translit - from pathvalidate import sanitize_filename +COMMON_TITLE_APPENDIX_LIST: Tuple[str, ...] = ( + "(official video)", +) + + def unify(string: str) -> str: """ returns a unified str, to make comparisons easy. 
def clean_song_title(raw_song_title: str, artist_name: str) -> str:
    """
    Clean common naming "conventions" out of a dirty song title, such as the
    title of a YouTube video.

    cleans:

    - `artist - song` -> `song`
    - `song (Official Video)` -> `song`
    - `  song` -> `song`

    not cleaned (yet):

    - `song (prod. some producer)`

    The artist prefix is only removed when it ends at a word boundary, and
    never when removing it would leave an empty title (self-titled songs).

    :param raw_song_title: the title as delivered by the source
    :param artist_name: the name of the artist the title belongs to
    :return: the cleaned title, without surrounding whitespace
    """
    # Hyphen, en dash and em dash are all used as `artist - song` separators.
    separators = ("-", "\u2013", "\u2014")

    title = raw_song_title.strip()
    artist = artist_name.strip()

    # Clean official video appendix
    for dirty_appendix in COMMON_TITLE_APPENDIX_LIST:
        if title.lower().endswith(dirty_appendix):
            title = title[:-len(dirty_appendix)].strip()

    # Remove artist from the start of the title.
    # Comparing an equally long slice (instead of lowering the whole title)
    # keeps the cut position correct even if lowering changes string length.
    if title[:len(artist)].lower() == artist.lower():
        remainder = title[len(artist):]

        # Only cut at a word boundary, so the artist `A` does not turn
        # `Alone` into `lone`.
        if not remainder or not remainder[0].isalnum():
            remainder = remainder.strip()

            if remainder.startswith(separators):
                remainder = remainder[1:].strip()

            # A title equal to the artist name is the real title, keep it.
            if remainder:
                title = remainder

    return title