music-kraken-core/music_kraken/utils/string_processing.py

from typing import Tuple, Union, Optional
from pathlib import Path
import string
from functools import lru_cache

from transliterate.exceptions import LanguageDetectionError
from transliterate import translit
from pathvalidate import sanitize_filename


# Suffixes that clean_song_title strips from the end of a title.
# They are compared against the lower-cased title, so they must be lower-case.
COMMON_TITLE_APPENDIX_LIST: Tuple[str, ...] = (
    "(official video)",
)
# Bracket pairs examined by clean_song_title: the character at index i of
# OPEN_BRACKETS is paired with the character at index i of CLOSE_BRACKETS.
OPEN_BRACKETS = "(["
CLOSE_BRACKETS = ")]"
# clean_song_title removes a bracketed section from a title when the text
# between the brackets contains any of these substrings.
DISALLOWED_SUBSTRING_IN_BRACKETS = ("official", "video", "audio", "lyrics", "prod", "remix", "ft", "feat", "ft.", "feat.")

@lru_cache
def unify(string: str) -> str:
    """
    Bring a string into a unified form, so that two strings can be compared easily.

    The string is passed through `translit(..., reversed=True)`; if the language
    cannot be detected, the string is used as it is. The result is lowercased.

    `None` is passed through unchanged. Results are memoised by `lru_cache`.
    """

    # nothing to unify
    if string is None:
        return None

    try:
        transliterated = translit(string, reversed=True)
    except LanguageDetectionError:
        # language could not be detected, fall back to the raw input
        transliterated = string

    return transliterated.lower()


def fit_to_file_system(string: Union[str, Path]) -> Union[str, Path]:
    """
    Make a file name, or every part of a path, safe to use on the file system.

    For each name:
    - surrounding whitespace is stripped
    - leading dots are removed
    - `/` and `\\` are replaced by `_`
    - the result is passed through `sanitize_filename`

    The single part `"/"` (the root of an absolute path) is kept as it is.
    A name that is empty after stripping whitespace and leading dots is
    returned as an empty string instead of raising an `IndexError`.

    :param string: a single file name, or a `Path` whose parts are cleaned one by one
    :return: the cleaned name as `str`, or a new `Path` built from the cleaned parts
    """

    def fit_string(name: str) -> str:
        # the root part of an absolute path must stay untouched
        if name == "/":
            return "/"

        # lstrip never indexes into the string, so empty and
        # all-dot names are handled without raising
        name = name.strip().lstrip(".")
        if not name:
            return name

        name = name.replace("/", "_").replace("\\", "_")
        return sanitize_filename(name)

    if isinstance(string, Path):
        return Path(*(fit_string(part) for part in string.parts))

    return fit_string(string)


@lru_cache(maxsize=128)
def clean_song_title(raw_song_title: str, artist_name: Optional[str] = None) -> str:
    """
    This function cleans common naming "conventions" for non clean song titles, like the title of youtube videos

    cleans:

    - `artist - song` -> `song`
    - `song (Official Video)` -> `song`
    - ` song` -> `song`
    - `song (prod. some producer)` -> `song`

    Brackets are matched case insensitive against DISALLOWED_SUBSTRING_IN_BRACKETS,
    so `(Official Audio)` is removed just like `(official audio)`.

    The artist name is only removed from the start of the title if it is not
    directly followed by a letter or digit (artist `Hell` does not turn `Hello`
    into `o`), and only if something is left of the title afterwards.

    :param raw_song_title: the title as it was found, for example a video title
    :param artist_name: if given, this name is removed from the start of the title
    :return: the cleaned title
    """
    raw_song_title = raw_song_title.strip()

    # Clean official Video appendix
    for dirty_appendix in COMMON_TITLE_APPENDIX_LIST:
        if raw_song_title.lower().endswith(dirty_appendix):
            raw_song_title = raw_song_title[:-len(dirty_appendix)].strip()

    # remove brackets and their content if they contain disallowed substrings
    for open_bracket, close_bracket in zip(OPEN_BRACKETS, CLOSE_BRACKETS):
        start = 0

        while True:
            open_bracket_index = raw_song_title.find(open_bracket, start)
            if open_bracket_index < 0:
                break

            close_bracket_index = raw_song_title.find(close_bracket, open_bracket_index + 1)
            if close_bracket_index < 0:
                break

            # the disallowed substrings are lowercase, so the content has to be compared lowercase as well
            substring = raw_song_title[open_bracket_index + 1:close_bracket_index].lower()
            if any(disallowed_substring in substring for disallowed_substring in DISALLOWED_SUBSTRING_IN_BRACKETS):
                # start stays where it is, the text after the removed bracket moved to this position
                raw_song_title = raw_song_title[:open_bracket_index] + raw_song_title[close_bracket_index + 1:]
            else:
                start = close_bracket_index + 1

    # everything that requires the artist name
    if artist_name is not None:
        artist_name = artist_name.strip()

        # Remove artist from the start of the title
        # an empty artist name would match every title, so it is skipped
        if artist_name and raw_song_title.lower().startswith(artist_name.lower()):
            remainder = raw_song_title[len(artist_name):]

            # only cut at a word boundary, else the artist name is just the beginning of another word
            if not remainder[:1].isalnum():
                candidate = remainder.strip()
                if candidate.startswith("-"):
                    candidate = candidate[1:].strip()

                # keep the title if nothing would be left, for example with self titled songs
                if candidate:
                    raw_song_title = candidate

    return raw_song_title.strip()

    
def comment(uncommented_string: str) -> str:
    """
    Turn every line of the given string into a comment by prefixing it with `# `.

    The string is split at `\\n` only, and joined with `\\n` again.
    """
    return "\n".join(f"# {line}" for line in uncommented_string.split("\n"))


# comparisons
# match_titles compares the Levenshtein distance of two modified titles against
# this value; its first return value is True if the distance is greater.
TITLE_THRESHOLD_LEVENSHTEIN = 1
# unify_punctuation replaces every punctuation character with this string.
UNIFY_TO = " "

# Largest absolute difference of two lengths that match_length still accepts.
# NOTE(review): the unit is not visible in this file (presumably seconds) - confirm.
ALLOWED_LENGTH_DISTANCE = 20


def unify_punctuation(to_unify: str) -> str:
    """
    Replace every character of `string.punctuation` in the given string with UNIFY_TO.
    """
    replacements = {ord(punctuation_char): UNIFY_TO for punctuation_char in string.punctuation}
    return to_unify.translate(replacements)

def hash_url(url: str) -> str:
    """
    Normalise a url, so that it can be used as a key to compare urls.

    The url is stripped of surrounding whitespace, lowercased, and a leading
    `https://` or `http://` is removed.

    `str.lstrip` can't be used for this, because it removes every leading
    character out of the given set instead of a prefix: it turned
    `https://spotify.com` into `otify.com`.

    :param url: the url to normalise
    :return: the normalised url without the scheme
    """
    normalised = url.strip().lower()

    for scheme in ("https://", "http://"):
        if normalised.startswith(scheme):
            return normalised[len(scheme):]

    return normalised


def remove_feature_part_from_track(title: str) -> str:
    """
    Cut a trailing bracket, like `(feat. someone)`, off a title.

    If the title ends with `)` and contains a `(`, everything from the first
    `(` on is removed. Whitespace in front of the bracket is kept.
    Every other title, including the empty string, is returned unchanged.

    :param title: the title to cut the bracket off
    :return: the title without the trailing bracket
    """
    # endswith is safe on an empty title, indexing title[-1] is not
    if not title.endswith(")"):
        return title

    bracket_index = title.find("(")
    if bracket_index < 0:
        return title

    return title[:bracket_index]


def modify_title(to_modify: str) -> str:
    """
    Prepare a title for a comparison.

    The title is stripped and lowercased, then passed through
    remove_feature_part_from_track and unify_punctuation, in this order.
    """
    normalised = to_modify.strip().lower()
    without_feature = remove_feature_part_from_track(normalised)
    return unify_punctuation(without_feature)


def _levenshtein_distance(first: str, second: str) -> int:
    """
    Return the Levenshtein distance of two strings, that is the smallest number
    of single character insertions, deletions and substitutions that turn one
    string into the other.
    """
    if first == second:
        return 0

    # iterate over the longer string, so that the rows have the length of the shorter one
    if len(first) < len(second):
        first, second = second, first
    if not second:
        return len(first)

    previous_row = list(range(len(second) + 1))
    for row_index, char_first in enumerate(first, start=1):
        current_row = [row_index]
        for column_index, char_second in enumerate(second, start=1):
            current_row.append(min(
                previous_row[column_index] + 1,  # deletion
                current_row[column_index - 1] + 1,  # insertion
                previous_row[column_index - 1] + (char_first != char_second),  # substitution
            ))
        previous_row = current_row

    return previous_row[-1]


def match_titles(title_1: str, title_2: str):
    """
    Compare two titles by the Levenshtein distance of their modified forms.

    Both titles are passed through modify_title first.

    The distance is computed in this module, because `jellyfish` is not
    imported here and calling it raised a `NameError`.

    :return: a tuple `(exceeds_threshold, distance)`. The first value is True if
        the distance is GREATER than TITLE_THRESHOLD_LEVENSHTEIN, so True means
        the titles differ by more than the threshold.
    """
    title_1, title_2 = modify_title(title_1), modify_title(title_2)
    distance = _levenshtein_distance(title_1, title_2)
    return distance > TITLE_THRESHOLD_LEVENSHTEIN, distance


def match_artists(artist_1, artist_2: str):
    """
    Compare one artist name, or a list of artist names, with another artist name.

    The comparison itself is done by match_titles, so the first value of the
    returned tuple is True if the distance EXCEEDS the threshold used there.

    If `artist_1` is a list, the result of the first name that does not exceed
    the threshold is returned. If every name exceeds it, `(True, smallest distance)`
    is returned.

    :param artist_1: a single artist name, or a list of artist names
    :param artist_2: the artist name to compare with
    :return: a tuple `(exceeds_threshold, distance)`
    :raises ValueError: if `artist_1` is an empty list, because there is no smallest distance
    """
    if isinstance(artist_1, list):
        distances = []

        for candidate in artist_1:
            exceeds_threshold, distance = match_titles(candidate, artist_2)
            # one name within the threshold is enough
            if not exceeds_threshold:
                return exceeds_threshold, distance

            distances.append(distance)
        return True, min(distances)

    return match_titles(artist_1, artist_2)

def match_length(length_1: int | None, length_2: int | None) -> bool:
    # returning true if either one is Null, because if one value is not known,
    # then it shouldn't be an attribute which could reject an audio source
    if length_1 is None or length_2 is None:
        return True
    return abs(length_1 - length_2) <= ALLOWED_LENGTH_DISTANCE
feat: artist name in clean song title is optional 2024-04-19 09:43:21 +00:00			`from typing import Tuple, Union, Optional`
fix: sanatizing file names correctly 2023-09-13 16:55:04 +00:00			`from pathlib import Path`
feat: removed redundand classes 2024-04-09 12:00:51 +00:00			`import string`
feat: added lru cache to unify function to speed up indexing 2024-04-19 09:40:00 +00:00			`from functools import lru_cache`
clean up dirty song titles 2023-07-27 18:44:24 +00:00
translit 2023-04-18 20:39:19 +00:00			`from transliterate.exceptions import LanguageDetectionError`
			`from transliterate import translit`
fixed file paths 2023-05-10 14:39:44 +00:00			`from pathvalidate import sanitize_filename`

translit 2023-04-18 20:39:19 +00:00
clean up dirty song titles 2023-07-27 18:44:24 +00:00			`COMMON_TITLE_APPENDIX_LIST: Tuple[str, ...] = (`
			`"(official video)",`
			`)`
feat: improved the cleaning song title function to remove redundand brackets 2024-04-19 10:02:54 +00:00			`OPEN_BRACKETS = "(["`
			`CLOSE_BRACKETS = ")]"`
			`DISALLOWED_SUBSTRING_IN_BRACKETS = ("official", "video", "audio", "lyrics", "prod", "remix", "ft", "feat", "ft.", "feat.")`
clean up dirty song titles 2023-07-27 18:44:24 +00:00
feat: added lru cache to unify function to speed up indexing 2024-04-19 09:40:00 +00:00			`@lru_cache`
fixed mutable argument bug in the source dict of SourceAttribute 2023-01-31 12:18:52 +00:00			`def unify(string: str) -> str:`
			`"""`
translit 2023-04-18 20:39:19 +00:00			`returns a unified str, to make comparisons easy.`
			`a unified string has the following attributes:`
fix: dynamic unified title 2024-04-12 12:14:10 +00:00			`- is lowercase`
fixed mutable argument bug in the source dict of SourceAttribute 2023-01-31 12:18:52 +00:00			`"""`
translit 2023-04-18 20:39:19 +00:00
fix: dynamic unified title 2024-04-12 12:14:10 +00:00			`if string is None:`
			`return None`

translit 2023-04-18 20:39:19 +00:00			`try:`
			`string = translit(string, reversed=True)`
			`except LanguageDetectionError:`
			`pass`

fixed mutable argument bug in the source dict of SourceAttribute 2023-01-31 12:18:52 +00:00			`return string.lower()`
escaping strings 2023-04-03 09:17:55 +00:00
translit 2023-04-18 20:39:19 +00:00
fix: sanatizing file names correctly 2023-09-13 16:55:04 +00:00			`def fit_to_file_system(string: Union[str, Path]) -> Union[str, Path]:`
			`def fit_string(string: str) -> str:`
			`if string == "/":`
			`return "/"`
			`string = string.strip()`
translit 2023-04-18 20:39:19 +00:00
fix: sanatizing file names correctly 2023-09-13 16:55:04 +00:00			`while string[0] == ".":`
			`if len(string) == 0:`
			`return string`
translit 2023-04-18 20:39:19 +00:00
fix: sanatizing file names correctly 2023-09-13 16:55:04 +00:00			`string = string[1:]`
translit 2023-04-18 20:39:19 +00:00
fix: sanatizing file names correctly 2023-09-13 16:55:04 +00:00			`string = string.replace("/", "_").replace("\\", "_")`
			`string = sanitize_filename(string)`
			`return string`
translit 2023-04-18 20:39:19 +00:00
fix: sanatizing file names correctly 2023-09-13 16:55:04 +00:00			`if isinstance(string, Path):`
			`return Path(*(fit_string(part) for part in string.parts))`
			`else:`
			`return fit_string(string)`
clean up dirty song titles 2023-07-27 18:44:24 +00:00

feat: some slight performance improvements 2024-04-19 10:04:13 +00:00			`@lru_cache(maxsize=128)`
feat: artist name in clean song title is optional 2024-04-19 09:43:21 +00:00			`def clean_song_title(raw_song_title: str, artist_name: Optional[str] = None) -> str:`
clean up dirty song titles 2023-07-27 18:44:24 +00:00			`"""`
			`This function cleans common naming "conventions" for non clean song titles, like the title of youtube videos`

			`cleans:`

			- `artist - song` -> `song`
			- `song (Official Video)` -> `song`
			- ` song` -> `song`
			- `song (prod. some producer)`
			`"""`
			`raw_song_title = raw_song_title.strip()`

			`# Clean official Video appendix`
			`for dirty_appendix in COMMON_TITLE_APPENDIX_LIST:`
			`if raw_song_title.lower().endswith(dirty_appendix):`
			`raw_song_title = raw_song_title[:-len(dirty_appendix)].strip()`

feat: improved the cleaning song title function to remove redundand brackets 2024-04-19 10:02:54 +00:00			`# remove brackets and their content if they contain disallowed substrings`
			`for open_bracket, close_bracket in zip(OPEN_BRACKETS, CLOSE_BRACKETS):`
feat: some slight performance improvements 2024-04-19 10:04:13 +00:00			`if open_bracket not in raw_song_title or close_bracket not in raw_song_title:`
			`continue`

feat: improved the cleaning song title function to remove redundand brackets 2024-04-19 10:02:54 +00:00			`start = 0`

			`while True:`
			`try:`
			`open_bracket_index = raw_song_title.index(open_bracket, start)`
			`except ValueError:`
			`break`
			`try:`
			`close_bracket_index = raw_song_title.index(close_bracket, open_bracket_index + 1)`
			`except ValueError:`
			`break`

			`substring = raw_song_title[open_bracket_index + 1:close_bracket_index]`
			`if any(disallowed_substring in substring for disallowed_substring in DISALLOWED_SUBSTRING_IN_BRACKETS):`
			`raw_song_title = raw_song_title[:open_bracket_index] + raw_song_title[close_bracket_index + 1:]`
			`else:`
			`start = close_bracket_index + 1`

feat: artist name in clean song title is optional 2024-04-19 09:43:21 +00:00			`# everything that requires the artist name`
			`if artist_name is not None:`
			`artist_name = artist_name.strip()`
clean up dirty song titles 2023-07-27 18:44:24 +00:00
feat: artist name in clean song title is optional 2024-04-19 09:43:21 +00:00			`# Remove artist from the start of the title`
			`if raw_song_title.lower().startswith(artist_name.lower()):`
			`raw_song_title = raw_song_title[len(artist_name):].strip()`

			`if raw_song_title.startswith("-"):`
			`raw_song_title = raw_song_title[1:].strip()`
clean up dirty song titles 2023-07-27 18:44:24 +00:00
			`return raw_song_title.strip()`
Merge branch 'youtube_music' into experimental 2023-09-12 09:06:26 +00:00

feat: layed out commit 2023-08-10 21:01:16 +00:00			`def comment(uncommented_string: str) -> str:`
			`_fragments = uncommented_string.split("\n")`
			`_fragments = ["# " + frag for frag in _fragments]`
			`return "\n".join(_fragments)`

feat: removed redundand classes 2024-04-09 12:00:51 +00:00
			`# comparisons`
			`TITLE_THRESHOLD_LEVENSHTEIN = 1`
			`UNIFY_TO = " "`

			`ALLOWED_LENGTH_DISTANCE = 20`


			`def unify_punctuation(to_unify: str) -> str:`
			`for char in string.punctuation:`
			`to_unify = to_unify.replace(char, UNIFY_TO)`
			`return to_unify`

feat: added base functionality of artwork class 2024-04-10 14:39:46 +00:00			`def hash_url(url: str) -> int:`
			`return url.strip().lower().lstrip("https://").lstrip("http://")`

feat: removed redundand classes 2024-04-09 12:00:51 +00:00
			`def remove_feature_part_from_track(title: str) -> str:`
			`if ")" != title[-1]:`
			`return title`
			`if "(" not in title:`
			`return title`

			`return title[:title.index("(")]`


			`def modify_title(to_modify: str) -> str:`
			`to_modify = to_modify.strip()`
			`to_modify = to_modify.lower()`
			`to_modify = remove_feature_part_from_track(to_modify)`
			`to_modify = unify_punctuation(to_modify)`
			`return to_modify`


			`def match_titles(title_1: str, title_2: str):`
			`title_1, title_2 = modify_title(title_1), modify_title(title_2)`
			`distance = jellyfish.levenshtein_distance(title_1, title_2)`
			`return distance > TITLE_THRESHOLD_LEVENSHTEIN, distance`


			`def match_artists(artist_1, artist_2: str):`
			`if type(artist_1) == list:`
			`distances = []`

			`for artist_1_ in artist_1:`
			`match, distance = match_titles(artist_1_, artist_2)`
			`if not match:`
			`return match, distance`

			`distances.append(distance)`
			`return True, min(distances)`
			`return match_titles(artist_1, artist_2)`

			`def match_length(length_1: int \| None, length_2: int \| None) -> bool:`
			`# returning true if either one is Null, because if one value is not known,`
			`# then it shouldn't be an attribute which could reject an audio source`
			`if length_1 is None or length_2 is None:`
			`return True`
			`return abs(length_1 - length_2) <= ALLOWED_LENGTH_DISTANCE`