diff --git a/music_kraken/utils/string_processing.py b/music_kraken/utils/string_processing.py
index d21b367..b591604 100644
--- a/music_kraken/utils/string_processing.py
+++ b/music_kraken/utils/string_processing.py
@@ -11,6 +11,10 @@ from pathvalidate import sanitize_filename
 COMMON_TITLE_APPENDIX_LIST: Tuple[str, ...] = (
     "(official video)",
 )
+OPEN_BRACKETS = "(["
+CLOSE_BRACKETS = ")]"
+# all lowercase: clean_song_title lowercases the bracket content before comparing against these
+DISALLOWED_SUBSTRING_IN_BRACKETS = ("official", "video", "audio", "lyrics", "prod", "remix", "ft", "feat", "ft.", "feat.")
 
 @lru_cache
 def unify(string: str) -> str:
@@ -71,6 +75,28 @@ def clean_song_title(raw_song_title: str, artist_name: Optional[str] = None) ->
         if raw_song_title.lower().endswith(dirty_appendix):
             raw_song_title = raw_song_title[:-len(dirty_appendix)].strip()
 
+    # remove brackets and their content if they contain disallowed substrings
+    for open_bracket, close_bracket in zip(OPEN_BRACKETS, CLOSE_BRACKETS):
+        start = 0
+
+        while True:
+            try:
+                open_bracket_index = raw_song_title.index(open_bracket, start)
+            except ValueError:
+                break
+            try:
+                close_bracket_index = raw_song_title.index(close_bracket, open_bracket_index + 1)
+            except ValueError:
+                break
+
+            # compare case-insensitively: the disallowed substrings are lowercase, real titles rarely are
+            substring = raw_song_title[open_bracket_index + 1:close_bracket_index].lower()
+            if any(disallowed_substring in substring for disallowed_substring in DISALLOWED_SUBSTRING_IN_BRACKETS):
+                # keep `start` unchanged so the shortened title is re-scanned from the same position
+                raw_song_title = raw_song_title[:open_bracket_index] + raw_song_title[close_bracket_index + 1:]
+            else:
+                start = close_bracket_index + 1
+
     # everything that requires the artist name
     if artist_name is not None:
         artist_name = artist_name.strip()