diff --git a/music_kraken/utils/string_processing.py b/music_kraken/utils/string_processing.py index 0b45c6f..d9d4c70 100644 --- a/music_kraken/utils/string_processing.py +++ b/music_kraken/utils/string_processing.py @@ -129,15 +129,27 @@ UNIFY_TO = " " ALLOWED_LENGTH_DISTANCE = 20 -def unify_punctuation(to_unify: str) -> str: +def unify_punctuation(to_unify: str, unify_to: str = UNIFY_TO) -> str: for char in string.punctuation: - to_unify = to_unify.replace(char, UNIFY_TO) + to_unify = to_unify.replace(char, unify_to) return to_unify def hash_url(url: Union[str, ParseResult]) -> str: if isinstance(url, str): url = urlparse(url) + unify_to = "-" + + def unify_part(part: str) -> str: + nonlocal unify_to + return unify_punctuation(part.lower(), unify_to=unify_to).strip(unify_to) + + # netloc + netloc = unify_part(url.netloc) + if netloc.startswith("www" + unify_to): + netloc = netloc[3 + len(unify_to):] + + # query query = url.query query_dict: Optional[dict] = None try: @@ -150,9 +162,9 @@ def hash_url(url: Union[str, ParseResult]) -> str: # sort keys alphabetically query = "" for key, value in sorted(query_dict.items(), key=lambda i: i[0]): - query += f"_{key.strip()}_{''.join(i.strip() for i in value)}" + query += f"{key.strip()}-{''.join(i.strip() for i in value)}" - r = f"{url.netloc}_{url.path.replace('/', '_')}{query}" + r = f"{netloc}_{unify_part(url.path)}_{unify_part(query)}" r = r.lower().strip() return r diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_hash_url.py b/tests/test_hash_url.py new file mode 100644 index 0000000..f87b2ff --- /dev/null +++ b/tests/test_hash_url.py @@ -0,0 +1,35 @@ +import unittest + +from music_kraken.utils.string_processing import hash_url + + +class TestCollection(unittest.TestCase): + def test_remove_schema(self): + self.assertFalse(hash_url("https://www.youtube.com/watch?v=3jZ_D3ELwOQ").startswith("https")) + 
self.assertFalse(hash_url("ftp://www.youtube.com/watch?v=3jZ_D3ELwOQ").startswith("ftp")) + self.assertFalse(hash_url("sftp://www.youtube.com/watch?v=3jZ_D3ELwOQ").startswith("sftp")) + self.assertFalse(hash_url("http://www.youtube.com/watch?v=3jZ_D3ELwOQ").startswith("http")) + + def test_no_punctuation(self): + self.assertNotIn("you_tube", hash_url("https://www.you_tube.com/watch?v=3jZ_D3ELwOQ")) + self.assertNotIn(".", hash_url("https://docs.gitea.com/next/install.ation/comparison")) + + def test_three_parts(self): + """ + The url is parsed into three parts [netloc; path; query] + Which are then appended to each other with an underscore between. + """ + + self.assertTrue(hash_url("https://duckduckgo.com/?t=h_&q=dfasf&ia=web").count("_") == 2) + + def test_sort_query(self): + """ + The query is sorted alphabetically + """ + hashed = hash_url("https://duckduckgo.com/?t=h_&q=dfasf&ia=web") + sorted_keys = ["ia-", "q-", "t-"] + + self.assertTrue(hashed.index(sorted_keys[0]) < hashed.index(sorted_keys[1]) < hashed.index(sorted_keys[2])) + +if __name__ == "__main__": + unittest.main()