draft: string processing
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful

This commit is contained in:
2024-04-29 14:40:49 +02:00
parent aa50d2cf20
commit c6bdf724e3
2 changed files with 51 additions and 25 deletions

View File

@@ -6,6 +6,7 @@ from functools import lru_cache
from transliterate.exceptions import LanguageDetectionError
from transliterate import translit
from pathvalidate import sanitize_filename
from urllib.parse import urlparse, ParseResult, parse_qs
COMMON_TITLE_APPENDIX_LIST: Tuple[str, ...] = (
@@ -21,6 +22,7 @@ def unify(string: str) -> str:
returns a unified str, to make comparisons easy.
a unified string has the following attributes:
- is lowercase
- is transliterated to Latin characters from e.g. Cyrillic
"""
if string is None:
@@ -132,8 +134,27 @@ def unify_punctuation(to_unify: str) -> str:
to_unify = to_unify.replace(char, UNIFY_TO)
return to_unify
def hash_url(url: str) -> str:
    """
    returns a normalized form of the url, to make comparisons easy.
    the normalized url has the following attributes:
    - surrounding whitespace is removed
    - is lowercase
    - a leading "https://" or "http://" is removed

    :param url: the url to normalize
    :return: the normalized url string
    """
    normalized = url.strip().lower()

    # str.lstrip() takes a *set of characters*, not a prefix, so
    # lstrip("https://") would also eat leading host characters such as
    # "t", "h", "p" or "s". Remove the scheme as an exact prefix instead.
    for scheme in ("https://", "http://"):
        if normalized.startswith(scheme):
            return normalized[len(scheme):]

    return normalized
def hash_url(url: Union[str, ParseResult]) -> str:
    """
    builds a normalized string key from a url, to make comparisons easy.

    the key is made of the netloc, the path (with every "/" replaced by "_")
    and the query; the scheme and the fragment are not part of it.
    - if the query can be parsed strictly, its parameters are ordered
      alphabetically by name, and each one is appended as "_<name>_<values>",
      where the stripped values are concatenated
    - if the query can't be parsed, it is appended unchanged
    - the result is lowercase and has no surrounding whitespace

    :param url: either the url as a string, or an already parsed url
    :return: the normalized key
    """
    parsed = urlparse(url) if isinstance(url, str) else url

    try:
        params = parse_qs(parsed.query, strict_parsing=True)
    except ValueError:
        # the query couldn't be parsed, so fall back to the raw query string
        query_part = parsed.query
    else:
        # ordering by parameter name makes the key independent
        # of the order in which the parameters appear in the url
        query_part = "".join(
            f"_{name.strip()}_{''.join(item.strip() for item in params[name])}"
            for name in sorted(params)
        )

    flat_path = parsed.path.replace('/', '_')
    return f"{parsed.netloc}_{flat_path}{query_part}".lower().strip()
def remove_feature_part_from_track(title: str) -> str: