music-kraken-core/music_kraken/utils/string_processing.py

from typing import Tuple, Union, Optional
from pathlib import Path
import string
from functools import lru_cache

from transliterate.exceptions import LanguageDetectionError
from transliterate import translit
from pathvalidate import sanitize_filename
from urllib.parse import urlparse, ParseResult, parse_qs


# Lower-case suffixes that clean_song_title() cuts off the end of a title.
COMMON_TITLE_APPENDIX_LIST: Tuple[str, ...] = (
    "(official video)",
)
# Bracket characters, paired by position: OPEN_BRACKETS[i] is closed by CLOSE_BRACKETS[i].
OPEN_BRACKETS = "(["
CLOSE_BRACKETS = ")]"
# A bracketed section is removed by clean_song_title() when its lower-cased content
# contains any of these entries. The check is plain substring containment, so short
# entries such as "ft" also match inside longer words (e.g. "after").
DISALLOWED_SUBSTRING_IN_BRACKETS = ("official", "video", "audio", "lyrics", "prod", "remix", "ft", "feat", "ft.", "feat.")

@lru_cache
def unify(string: str) -> str:
    """
    Normalise a string so that two spellings of the same name compare equal.

    The result is:
    - transliterated with ``translit(..., reversed=True)`` (e.g. Cyrillic to Latin);
      the input is kept as-is when the language cannot be detected
    - stripped of punctuation, which ``unify_punctuation`` replaces with its default
    - lowercase, without leading or trailing whitespace

    ``None`` is passed through unchanged. Results are memoised with ``lru_cache``.
    """
    if string is None:
        return None

    try:
        latin = translit(string, reversed=True)
    except LanguageDetectionError:
        # no language detected, continue with the untouched input
        latin = string

    normalised = unify_punctuation(latin).lower()
    return normalised.strip()


def fit_to_file_system(string: Union[str, Path], hidden_ok: bool = False) -> Union[str, Path]:
    """
    Make a file name, or every part of a path, safe to use on the file system.

    For each name the following is done in order:
    - surrounding whitespace is stripped
    - leading dots are removed, unless ``hidden_ok`` is true
    - ``/`` and ``\\`` are replaced with ``_``
    - the name is transliterated with ``translit(..., reversed=True)``; it is kept
      as-is when the language cannot be detected
    - the name is passed through ``sanitize_filename``

    A part that is exactly ``"/"`` is returned untouched. A name that is empty after
    stripping (for example ``""`` or ``"..."``) is returned as ``""`` instead of
    raising ``IndexError``.

    :param string: a single file name, or a ``Path`` whose parts are each cleaned
    :param hidden_ok: keep leading dots, i.e. allow hidden files
    :return: the cleaned value, of the same kind (``str`` or ``Path``) as the input
    """
    def fit_string(string: str) -> str:
        if string == "/":
            return "/"
        string = string.strip()

        if not hidden_ok:
            string = string.lstrip(".")

        # nothing left to sanitise; also guards every step below against ""
        if not string:
            return string

        string = string.replace("/", "_").replace("\\", "_")

        try:
            string = translit(string, reversed=True)
        except LanguageDetectionError:
            pass

        return sanitize_filename(string)

    if isinstance(string, Path):
        return Path(*(fit_string(part) for part in string.parts))
    return fit_string(string)


@lru_cache(maxsize=128)
def clean_song_title(raw_song_title: str, artist_name: Optional[str] = None) -> str:
    """
    Strip the usual clutter from a song title, e.g. the title of a youtube video.

    What gets cleaned:

    - `song (Official Video)` -> `song` (suffixes from COMMON_TITLE_APPENDIX_LIST)
    - `song (prod. some producer)` -> `song` (brackets containing an entry of
      DISALLOWED_SUBSTRING_IN_BRACKETS)
    - `artist - song` -> `song` (only if `artist_name` is given and is followed by
      one of `-`, `–`, `:` or `|`)
    - ` song` -> `song`
    """
    title = raw_song_title.strip()

    # cut off known appendixes like "(official video)"
    for appendix in COMMON_TITLE_APPENDIX_LIST:
        if title.lower().endswith(appendix):
            title = title[:-len(appendix)].strip()

    # drop every bracket pair whose content contains a disallowed substring
    for opener, closer in zip(OPEN_BRACKETS, CLOSE_BRACKETS):
        if not (opener in title and closer in title):
            continue

        search_from = 0
        while True:
            open_at = title.find(opener, search_from)
            if open_at < 0:
                break
            close_at = title.find(closer, open_at + 1)
            if close_at < 0:
                break

            inner = title[open_at + 1:close_at].lower()
            if any(marker in inner for marker in DISALLOWED_SUBSTRING_IN_BRACKETS):
                # the title got shorter, so the same offset is searched again
                title = title[:open_at] + title[close_at + 1:]
            else:
                search_from = close_at + 1

    # the artist prefix can only be removed if the artist is known
    if artist_name is not None:
        artist = artist_name.strip()

        if title.lower().startswith(artist.lower()):
            remainder = title[len(artist):].strip()

            # only cut the artist if a separator follows; otherwise the title
            # might simply start with (or be equal to) the artist name
            if remainder.startswith(("-", "–", ":", "|")):
                title = remainder[1:].strip()

    return title.strip()

    
def comment(uncommented_string: str) -> str:
    """Return the text with ``# `` put in front of every line (split on ``\\n``)."""
    return "\n".join(f"# {line}" for line in uncommented_string.split("\n"))


# comparisons
# match_titles() flags two titles when their Levenshtein distance is greater than this.
TITLE_THRESHOLD_LEVENSHTEIN = 1
# Default replacement for punctuation characters in unify_punctuation().
UNIFY_TO = " "

# Largest absolute difference for which match_length() still returns True.
# NOTE(review): the unit is presumably seconds - confirm against the callers.
ALLOWED_LENGTH_DISTANCE = 20


def unify_punctuation(to_unify: str, unify_to: str = UNIFY_TO) -> str:
    """
    Replace every character of ``string.punctuation`` in ``to_unify`` with ``unify_to``.

    The characters are replaced one after another, in the order of
    ``string.punctuation``.
    """
    unified = to_unify
    for punctuation_mark in string.punctuation:
        unified = unified.replace(punctuation_mark, unify_to)
    return unified

@lru_cache(maxsize=128)
def hash_url(url: Union[str, ParseResult]) -> str:
    """
    Build a normalised key of the form ``netloc_path_query`` for a url.

    - netloc, path and query are lower-cased, their punctuation is replaced
      with ``-`` and leading/trailing ``-`` are removed
    - a leading ``www-`` is cut from the netloc
    - if the query can be parsed strictly, its parameters are sorted by key and
      joined as ``key-value``; otherwise the raw query string is used

    The scheme and the fragment are not part of the key.
    """
    parsed = urlparse(url) if isinstance(url, str) else url
    separator = "-"

    def normalise(part: str) -> str:
        return unify_punctuation(part.lower(), unify_to=separator).strip(separator)

    # netloc
    host = normalise(parsed.netloc)
    www_prefix = "www" + separator
    if host.startswith(www_prefix):
        host = host[len(www_prefix):]

    # query
    try:
        parameters = parse_qs(parsed.query, strict_parsing=True)
    except ValueError:
        # the query couldn't be parsed, fall back to the raw query string
        query = parsed.query
    else:
        # sorted by key, so that the parameter order doesn't change the result
        query = "".join(
            f"{key.strip()}-{''.join(item.strip() for item in parameters[key])}"
            for key in sorted(parameters)
        )

    return f"{host}_{normalise(parsed.path)}_{normalise(query)}".lower().strip()


def remove_feature_part_from_track(title: str) -> str:
    """
    Cut a trailing bracket section, such as ``(feat. someone)``, off a title.

    If the title ends with ``)`` and contains a ``(``, everything from the first
    ``(`` onwards is removed. Whitespace in front of the bracket is kept.
    Any other title, including the empty string, is returned unchanged.
    """
    # endswith() instead of title[-1], which raises IndexError on ""
    if not title.endswith(")"):
        return title

    bracket_start = title.find("(")
    if bracket_start < 0:
        return title

    return title[:bracket_start]


def modify_title(to_modify: str) -> str:
    """
    Prepare a title for comparison: strip it, lower-case it, remove a trailing
    bracket section and replace punctuation with the default of unify_punctuation.
    """
    lowered = to_modify.strip().lower()
    return unify_punctuation(remove_feature_part_from_track(lowered))


def _levenshtein_distance(first: str, second: str) -> int:
    """Return the Levenshtein (edit) distance between two strings."""
    if first == second:
        return 0
    # keep the shorter string on the inner loop, so the rows stay small
    if len(first) < len(second):
        first, second = second, first
    if not second:
        return len(first)

    previous_row = list(range(len(second) + 1))
    for row_index, first_char in enumerate(first, start=1):
        current_row = [row_index]
        for column_index, second_char in enumerate(second, start=1):
            current_row.append(min(
                previous_row[column_index] + 1,                                # deletion
                current_row[column_index - 1] + 1,                             # insertion
                previous_row[column_index - 1] + (first_char != second_char),  # substitution
            ))
        previous_row = current_row

    return previous_row[-1]


def match_titles(title_1: str, title_2: str):
    """
    Compare two titles by the Levenshtein distance of their modified forms.

    Both titles are passed through ``modify_title`` before the comparison.

    The distance is computed by a local helper, because this module never imports
    ``jellyfish`` and the former ``jellyfish.levenshtein_distance`` call raised a
    ``NameError``.

    :return: a tuple ``(exceeds_threshold, distance)``. The first element is
        ``True`` if the distance is GREATER than TITLE_THRESHOLD_LEVENSHTEIN, so
        despite the function name it signals that the titles differ too much.
    """
    title_1, title_2 = modify_title(title_1), modify_title(title_2)
    distance = _levenshtein_distance(title_1, title_2)
    return distance > TITLE_THRESHOLD_LEVENSHTEIN, distance


def match_artists(artist_1: Union[str, list], artist_2: str):
    """
    Compare one artist name, or a list of artist names, against ``artist_2``.

    Every comparison is done by ``match_titles``, whose first return value is
    ``True`` if the distance exceeds the threshold.

    :param artist_1: a single name or a list of names (list subclasses included)
    :param artist_2: the name to compare against
    :return: for a single name, the result of ``match_titles``. For a list,
        ``(False, distance)`` of the first name that stays within the threshold,
        or ``(True, smallest distance)`` if every name exceeds it.
    :raises ValueError: if ``artist_1`` is an empty list
    """
    if not isinstance(artist_1, list):
        return match_titles(artist_1, artist_2)

    distances = []
    for candidate in artist_1:
        exceeds_threshold, distance = match_titles(candidate, artist_2)
        if not exceeds_threshold:
            # one name close enough to artist_2 decides the result
            return exceeds_threshold, distance

        distances.append(distance)

    return True, min(distances)

def match_length(length_1: int | None, length_2: int | None) -> bool:
    # returning true if either one is Null, because if one value is not known,
    # then it shouldn't be an attribute which could reject an audio source
    if length_1 is None or length_2 is None:
        return True
    return abs(length_1 - length_2) <= ALLOWED_LENGTH_DISTANCE

def shorten_display_url(url: str, max_length: int = 150, chars_at_end: int = 4, shorten_string: str = "[...]") -> str:
    """
    Shorten a long url for display, keeping its start and its last characters.

    The url is returned unchanged if it is not longer than
    ``max_length + chars_at_end + len(shorten_string)``, because shortening
    would not make it any shorter.

    :param url: the url to display
    :param max_length: number of characters kept from the start of the url
    :param chars_at_end: number of characters kept from the end of the url
    :param shorten_string: placeholder inserted for the removed middle part
    :return: ``url[:max_length] + shorten_string +`` the last ``chars_at_end`` characters
    """
    if len(url) <= max_length + chars_at_end + len(shorten_string):
        return url

    # url[-0:] would be the whole url, so a tail of 0 characters needs its own case
    tail = url[-chars_at_end:] if chars_at_end > 0 else ""
    return url[:max_length] + shorten_string + tail
-												feat: artist name in clean song title is optional

											
										
										
											2024-04-19 09:43:21 +00:00
+								from typing import Tuple, Union, Optional
-												fix: sanatizing file names correctly

											
										
										
											2023-09-13 16:55:04 +00:00
+								from pathlib import Path
-												feat: removed redundand classes

											
										
										
											2024-04-09 12:00:51 +00:00
+								import string
-												feat: added lru cache to unify function to speed up indexing

											
										
										
											2024-04-19 09:40:00 +00:00
+								from functools import lru_cache
-												clean up dirty song titles

											
										
										
											2023-07-27 18:44:24 +00:00
-												translit

											
										
										
											2023-04-18 20:39:19 +00:00
+								from transliterate.exceptions import LanguageDetectionError
 								from transliterate import translit
-												fixed file paths

											
										
										
											2023-05-10 14:39:44 +00:00
+								from pathvalidate import sanitize_filename
-												draft: string processing

											
										
										
											2024-04-29 12:40:49 +00:00
+								from urllib.parse import urlparse, ParseResult, parse_qs
-												fixed file paths

											
										
										
											2023-05-10 14:39:44 +00:00
-												translit

											
										
										
											2023-04-18 20:39:19 +00:00
-												clean up dirty song titles

											
										
										
											2023-07-27 18:44:24 +00:00
+								COMMON_TITLE_APPENDIX_LIST: Tuple[str, ...] = (
 								    "(official video)",
 								)
-												feat: improved the cleaning song title function to remove redundand brackets

											
										
										
											2024-04-19 10:02:54 +00:00
+								OPEN_BRACKETS = "(["
 								CLOSE_BRACKETS = ")]"
 								DISALLOWED_SUBSTRING_IN_BRACKETS = ("official", "video", "audio", "lyrics", "prod", "remix", "ft", "feat", "ft.", "feat.")
-												clean up dirty song titles

											
										
										
											2023-07-27 18:44:24 +00:00
-												feat: added lru cache to unify function to speed up indexing

											
										
										
											2024-04-19 09:40:00 +00:00
+								@lru_cache
-												fixed mutable argument bug in the source dict of SourceAttribute

											
										
										
											2023-01-31 12:18:52 +00:00
+								def unify(string: str) -> str:
 								    """
-												translit

											
										
										
											2023-04-18 20:39:19 +00:00
+								    returns a unified str, to make comparisons easy.
 								    a unified string has the following attributes:
-												fix: dynamic unified title

											
										
										
											2024-04-12 12:14:10 +00:00
+								    - is lowercase
-												draft: string processing

											
										
										
											2024-04-29 12:40:49 +00:00
+								    - is transliterated to Latin characters from e.g. Cyrillic
-												fixed mutable argument bug in the source dict of SourceAttribute

											
										
										
											2023-01-31 12:18:52 +00:00
+								    """
-												translit

											
										
										
											2023-04-18 20:39:19 +00:00
-												fix: dynamic unified title

											
										
										
											2024-04-12 12:14:10 +00:00
+								    if string is None:
 								        return None
-												translit

											
										
										
											2023-04-18 20:39:19 +00:00
+								    try:
 								        string = translit(string, reversed=True)
 								    except LanguageDetectionError:
 								        pass
-												draft

											
										
										
											2024-04-30 10:32:55 +00:00
 								    string = unify_punctuation(string)
 								    return string.lower().strip()
-												escaping strings

											
										
										
											2023-04-03 09:17:55 +00:00
-												translit

											
										
										
											2023-04-18 20:39:19 +00:00
-												feat: fixed cache plus concisten cache throughout musify

											
										
										
											2024-04-25 23:05:22 +00:00
+								def fit_to_file_system(string: Union[str, Path], hidden_ok: bool = False) -> Union[str, Path]:
-												fix: sanatizing file names correctly

											
										
										
											2023-09-13 16:55:04 +00:00
+								    def fit_string(string: str) -> str:
-												feat: fixed cache plus concisten cache throughout musify

											
										
										
											2024-04-25 23:05:22 +00:00
+								        nonlocal hidden_ok
-												fix: sanatizing file names correctly

											
										
										
											2023-09-13 16:55:04 +00:00
+								        if string == "/":
 								            return "/"
 								        string = string.strip()
-												translit

											
										
										
											2023-04-18 20:39:19 +00:00
-												feat: fixed cache plus concisten cache throughout musify

											
										
										
											2024-04-25 23:05:22 +00:00
+								        while string[0] == "." and not hidden_ok:
-												fix: sanatizing file names correctly

											
										
										
											2023-09-13 16:55:04 +00:00
+								            if len(string) == 0:
 								                return string
-												translit

											
										
										
											2023-04-18 20:39:19 +00:00
-												fix: sanatizing file names correctly

											
										
										
											2023-09-13 16:55:04 +00:00
+								            string = string[1:]
-												translit

											
										
										
											2023-04-18 20:39:19 +00:00
-												fix: sanatizing file names correctly

											
										
										
											2023-09-13 16:55:04 +00:00
+								        string = string.replace("/", "_").replace("\\", "_")
-												fix: removing the possibility or file names containing /

											
										
										
											2024-05-06 16:48:13 +00:00
 								        try:
 								            string = translit(string, reversed=True)
 								        except LanguageDetectionError:
 								            pass
-												fix: sanatizing file names correctly

											
										
										
											2023-09-13 16:55:04 +00:00
+								        string = sanitize_filename(string)
-												fix: removing the possibility or file names containing /

											
										
										
											2024-05-06 16:48:13 +00:00
-												fix: sanatizing file names correctly

											
										
										
											2023-09-13 16:55:04 +00:00
+								        return string
-												translit

											
										
										
											2023-04-18 20:39:19 +00:00
-												fix: sanatizing file names correctly

											
										
										
											2023-09-13 16:55:04 +00:00
+								    if isinstance(string, Path):
 								        return Path(*(fit_string(part) for part in string.parts))
 								    else:
 								        return fit_string(string)
-												clean up dirty song titles

											
										
										
											2023-07-27 18:44:24 +00:00
-												feat: some slight performance improvements

											
										
										
											2024-04-19 10:04:13 +00:00
+								@lru_cache(maxsize=128)
-												feat: artist name in clean song title is optional

											
										
										
											2024-04-19 09:43:21 +00:00
+								def clean_song_title(raw_song_title: str, artist_name: Optional[str] = None) -> str:
-												clean up dirty song titles

											
										
										
											2023-07-27 18:44:24 +00:00
+								    """
 								    This function cleans common naming "conventions" for non clean song titles, like the title of youtube videos
 								    cleans:
 								    - `artist - song` -> `song`
 								    - `song (Official Video)` -> `song`
 								    - ` song` -> `song`
 								    - `song (prod. some producer)`
 								    """
 								    raw_song_title = raw_song_title.strip()
 								    # Clean official Video appendix
 								    for dirty_appendix in COMMON_TITLE_APPENDIX_LIST:
 								        if raw_song_title.lower().endswith(dirty_appendix):
 								            raw_song_title = raw_song_title[:-len(dirty_appendix)].strip()
-												feat: improved the cleaning song title function to remove redundand brackets

											
										
										
											2024-04-19 10:02:54 +00:00
+								    # remove brackets and their content if they contain disallowed substrings
 								    for open_bracket, close_bracket in zip(OPEN_BRACKETS, CLOSE_BRACKETS):
-												feat: some slight performance improvements

											
										
										
											2024-04-19 10:04:13 +00:00
+								        if open_bracket not in raw_song_title or close_bracket not in raw_song_title:
 								            continue
-												feat: improved the cleaning song title function to remove redundand brackets

											
										
										
											2024-04-19 10:02:54 +00:00
+								        start = 0
 								        while True:
 								            try:
 								                open_bracket_index = raw_song_title.index(open_bracket, start)
 								            except ValueError:
 								                break
 								            try:
 								                close_bracket_index = raw_song_title.index(close_bracket, open_bracket_index + 1)
 								            except ValueError:
 								                break
 								            substring = raw_song_title[open_bracket_index + 1:close_bracket_index]
-												feat: improved cleanup of song title

											
										
										
											2024-04-25 22:23:04 +00:00
+								            if any(disallowed_substring in substring.lower() for disallowed_substring in DISALLOWED_SUBSTRING_IN_BRACKETS):
-												feat: improved the cleaning song title function to remove redundand brackets

											
										
										
											2024-04-19 10:02:54 +00:00
+								                raw_song_title = raw_song_title[:open_bracket_index] + raw_song_title[close_bracket_index + 1:]
 								            else:
 								                start = close_bracket_index + 1
-												feat: artist name in clean song title is optional

											
										
										
											2024-04-19 09:43:21 +00:00
+								    # everything that requires the artist name
 								    if artist_name is not None:
 								        artist_name = artist_name.strip()
-												clean up dirty song titles

											
										
										
											2023-07-27 18:44:24 +00:00
-												feat: artist name in clean song title is optional

											
										
										
											2024-04-19 09:43:21 +00:00
+								        # Remove artist from the start of the title
 								        if raw_song_title.lower().startswith(artist_name.lower()):
-												fix: cleaning the song name deleted the song if the song name was the same as the artist name

											
										
										
											2024-05-10 13:25:11 +00:00
+								            possible_new_name = raw_song_title[len(artist_name):].strip()
 								            for char in ("-", "–", ":", "|"):
 								                if possible_new_name.startswith(char):
 								                    raw_song_title = possible_new_name[1:].strip()
 								                    break
-												clean up dirty song titles

											
										
										
											2023-07-27 18:44:24 +00:00
 								    return raw_song_title.strip()
-												Merge branch 'youtube_music' into experimental

											
										
										
											2023-09-12 09:06:26 +00:00
-												feat: layed out commit

											
										
										
											2023-08-10 21:01:16 +00:00
+								def comment(uncommented_string: str) -> str:
 								    _fragments = uncommented_string.split("\n")
 								    _fragments = ["# " + frag for frag in _fragments]
 								    return "\n".join(_fragments)
-												feat: removed redundand classes

											
										
										
											2024-04-09 12:00:51 +00:00
 								# comparisons
 								TITLE_THRESHOLD_LEVENSHTEIN = 1
 								UNIFY_TO = " "
 								ALLOWED_LENGTH_DISTANCE = 20
-												feat: added tests

											
										
										
											2024-04-29 13:31:32 +00:00
+								def unify_punctuation(to_unify: str, unify_to: str = UNIFY_TO) -> str:
-												feat: removed redundand classes

											
										
										
											2024-04-09 12:00:51 +00:00
+								    for char in string.punctuation:
-												feat: added tests

											
										
										
											2024-04-29 13:31:32 +00:00
+								        to_unify = to_unify.replace(char, unify_to)
-												feat: removed redundand classes

											
										
										
											2024-04-09 12:00:51 +00:00
+								    return to_unify
-												draft: no metadata function for source

											
										
										
											2024-04-29 15:06:31 +00:00
+								@lru_cache(maxsize=128)
-												draft: string processing

											
										
										
											2024-04-29 12:40:49 +00:00
+								def hash_url(url: Union[str, ParseResult]) -> str:
 								    if isinstance(url, str):
 								        url = urlparse(url)
-												feat: added tests

											
										
										
											2024-04-29 13:31:32 +00:00
+								    unify_to = "-"
 								    def unify_part(part: str) -> str:
 								        nonlocal unify_to
 								        return unify_punctuation(part.lower(), unify_to=unify_to).strip(unify_to)
 								    # netloc
 								    netloc = unify_part(url.netloc)
 								    if netloc.startswith("www" + unify_to):
 								        netloc = netloc[3 + len(unify_to):]
 								    # query
-												draft: string processing

											
										
										
											2024-04-29 12:40:49 +00:00
+								    query = url.query
 								    query_dict: Optional[dict] = None
 								    try:
 								        query_dict: dict = parse_qs(url.query, strict_parsing=True)
 								    except ValueError:
 								        # the query couldn't be parsed
 								        pass
 								    if isinstance(query_dict, dict):
 								        # sort keys alphabetically
 								        query = ""
 								        for key, value in sorted(query_dict.items(), key=lambda i: i[0]):
-												feat: added tests

											
										
										
											2024-04-29 13:31:32 +00:00
+								            query += f"{key.strip()}-{''.join(i.strip() for i in value)}"
-												draft: string processing

											
										
										
											2024-04-29 12:40:49 +00:00
-												feat: added tests

											
										
										
											2024-04-29 13:31:32 +00:00
+								    r = f"{netloc}_{unify_part(url.path)}_{unify_part(query)}"
-												draft: string processing

											
										
										
											2024-04-29 12:40:49 +00:00
+								    r = r.lower().strip()
 								    return r
-												feat: added base functionality of artwork class

											
										
										
											2024-04-10 14:39:46 +00:00
-												feat: removed redundand classes

											
										
										
											2024-04-09 12:00:51 +00:00
 								def remove_feature_part_from_track(title: str) -> str:
 								    if ")" != title[-1]:
 								        return title
 								    if "(" not in title:
 								        return title
 								    return title[:title.index("(")]
 								def modify_title(to_modify: str) -> str:
 								    to_modify = to_modify.strip()
 								    to_modify = to_modify.lower()
 								    to_modify = remove_feature_part_from_track(to_modify)
 								    to_modify = unify_punctuation(to_modify)
 								    return to_modify
 								def match_titles(title_1: str, title_2: str):
 								    title_1, title_2 = modify_title(title_1), modify_title(title_2)
 								    distance = jellyfish.levenshtein_distance(title_1, title_2)
 								    return distance > TITLE_THRESHOLD_LEVENSHTEIN, distance
 								def match_artists(artist_1, artist_2: str):
 								    if type(artist_1) == list:
 								        distances = []
 								        for artist_1_ in artist_1:
 								            match, distance = match_titles(artist_1_, artist_2)
 								            if not match:
 								                return match, distance
 								            distances.append(distance)
 								        return True, min(distances)
 								    return match_titles(artist_1, artist_2)
 								def match_length(length_1: int | None, length_2: int | None) -> bool:
 								    # returning true if either one is Null, because if one value is not known,
 								    # then it shouldn't be an attribute which could reject an audio source
 								    if length_1 is None or length_2 is None:
 								        return True
 								    return abs(length_1 - length_2) <= ALLOWED_LENGTH_DISTANCE
-												fix: trace was passed in the request method shortened url for trace

											
										
										
											2024-04-26 11:03:20 +00:00
 								def shorten_display_url(url: str, max_length: int = 150, chars_at_end: int = 4, shorten_string: str = "[...]") -> str:
 								    if len(url) <= max_length + chars_at_end + len(shorten_string):
 								        return url
 								    return url[:max_length] + shorten_string + url[-chars_at_end:]