This commit is contained in:
parent
c6bdf724e3
commit
1971982d27
@ -129,15 +129,27 @@ UNIFY_TO = " "
|
|||||||
ALLOWED_LENGTH_DISTANCE = 20
|
ALLOWED_LENGTH_DISTANCE = 20
|
||||||
|
|
||||||
|
|
||||||
def unify_punctuation(to_unify: str, unify_to: str = UNIFY_TO) -> str:
    """Replace every ASCII punctuation character with ``unify_to``.

    The characters replaced are exactly those in ``string.punctuation``.

    The substitution happens in a single pass over the input, so text that
    was just inserted is never substituted again, even when ``unify_to``
    itself contains punctuation.

    :param to_unify: the text whose punctuation should be normalised.
    :param unify_to: the replacement for each punctuation character;
        defaults to the module-level ``UNIFY_TO``.
    :return: ``to_unify`` with each punctuation character replaced.
    """
    # str.maketrans turns the one-character keys into the ordinal -> string
    # mapping that str.translate expects.
    table = str.maketrans(dict.fromkeys(string.punctuation, unify_to))
    return to_unify.translate(table)
|
||||||
|
|
||||||
def hash_url(url: Union[str, ParseResult]) -> str:
|
def hash_url(url: Union[str, ParseResult]) -> str:
|
||||||
if isinstance(url, str):
|
if isinstance(url, str):
|
||||||
url = urlparse(url)
|
url = urlparse(url)
|
||||||
|
|
||||||
|
unify_to = "-"
|
||||||
|
|
||||||
|
def unify_part(part: str) -> str:
|
||||||
|
nonlocal unify_to
|
||||||
|
return unify_punctuation(part.lower(), unify_to=unify_to).strip(unify_to)
|
||||||
|
|
||||||
|
# netloc
|
||||||
|
netloc = unify_part(url.netloc)
|
||||||
|
if netloc.startswith("www" + unify_to):
|
||||||
|
netloc = netloc[3 + len(unify_to):]
|
||||||
|
|
||||||
|
# query
|
||||||
query = url.query
|
query = url.query
|
||||||
query_dict: Optional[dict] = None
|
query_dict: Optional[dict] = None
|
||||||
try:
|
try:
|
||||||
@ -150,9 +162,9 @@ def hash_url(url: Union[str, ParseResult]) -> str:
|
|||||||
# sort keys alphabetically
|
# sort keys alphabetically
|
||||||
query = ""
|
query = ""
|
||||||
for key, value in sorted(query_dict.items(), key=lambda i: i[0]):
|
for key, value in sorted(query_dict.items(), key=lambda i: i[0]):
|
||||||
query += f"_{key.strip()}_{''.join(i.strip() for i in value)}"
|
query += f"{key.strip()}-{''.join(i.strip() for i in value)}"
|
||||||
|
|
||||||
r = f"{url.netloc}_{url.path.replace('/', '_')}{query}"
|
r = f"{netloc}_{unify_part(url.path)}_{unify_part(query)}"
|
||||||
r = r.lower().strip()
|
r = r.lower().strip()
|
||||||
return r
|
return r
|
||||||
|
|
||||||
|
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
35
tests/test_hash_url.py
Normal file
35
tests/test_hash_url.py
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
import unittest
|
||||||
|
|
||||||
|
from music_kraken.utils.string_processing import hash_url
|
||||||
|
|
||||||
|
|
||||||
|
class TestCollection(unittest.TestCase):
    """Checks for the string produced by ``hash_url``."""

    def test_remove_schema(self):
        """The hash must not start with the scheme of the hashed URL."""
        for scheme in ("https", "ftp", "sftp", "http"):
            with self.subTest(scheme=scheme):
                hashed = hash_url(f"{scheme}://www.youtube.com/watch?v=3jZ_D3ELwOQ")
                # Check each URL against its own scheme, not only "https".
                self.assertFalse(hashed.startswith(scheme))

    def test_no_punctuation(self):
        """Punctuation from the URL must not survive in the hash."""
        # assertNotIn takes (member, container): the forbidden fragment comes
        # first and the hash second, otherwise the check passes vacuously.
        self.assertNotIn("you_tube", hash_url("https://www.you_tube.com/watch?v=3jZ_D3ELwOQ"))
        self.assertNotIn(".", hash_url("https://docs.gitea.com/next/install.ation/comparison"))

    def test_three_parts(self):
        """
        The url is parsed into three parts [netloc; path; query]
        Which are then appended to each other with an underscore between.
        """
        hashed = hash_url("https://duckduckgo.com/?t=h_&q=dfasf&ia=web")

        # Three parts joined by underscores means exactly two separators.
        self.assertEqual(hashed.count("_"), 2)

    def test_sort_query(self):
        """
        The query is sorted alphabetically
        """
        hashed = hash_url("https://duckduckgo.com/?t=h_&q=dfasf&ia=web")
        sorted_keys = ["ia-", "q-", "t-"]

        first, second, third = (hashed.index(key) for key in sorted_keys)
        self.assertLess(first, second)
        self.assertLess(second, third)


# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
Loading…
Reference in New Issue
Block a user