diff --git a/requirements.txt b/requirements.txt
index 823a91c..d525e14 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,5 @@ pandas~=1.5.0
 mutagen~=1.46.0
 musicbrainzngs~=0.7.1
 jellyfish~=0.9.0
-pydub~=0.25.1
\ No newline at end of file
+pydub~=0.25.1
+youtube_dl
\ No newline at end of file
diff --git a/src/__pycache__/phonetic_compares.cpython-310.pyc b/src/__pycache__/phonetic_compares.cpython-310.pyc
new file mode 100644
index 0000000..8286e70
Binary files /dev/null and b/src/__pycache__/phonetic_compares.cpython-310.pyc differ
diff --git a/src/phonetic_compares.py b/src/phonetic_compares.py
index a98f09c..556c563 100644
--- a/src/phonetic_compares.py
+++ b/src/phonetic_compares.py
@@ -1,2 +1,14 @@
 import jellyfish
 
+TITLE_THRESHOLD_LEVENSHTEIN = 1
+
+
+def match_titles(title_1: str, title_2: str) -> tuple[bool, int]:
+    """Compare two titles by their Levenshtein distance.
+
+    Returns ``(mismatch, distance)``. NOTE: despite the function name, the
+    boolean is True when the distance is greater than
+    TITLE_THRESHOLD_LEVENSHTEIN, i.e. when the titles do NOT match.
+    """
+    distance = jellyfish.levenshtein_distance(title_1, title_2)
+    return distance > TITLE_THRESHOLD_LEVENSHTEIN, distance
diff --git a/src/youtube_music.py b/src/youtube_music.py
index c469378..3f9ac99 100644
--- a/src/youtube_music.py
+++ b/src/youtube_music.py
@@ -1,9 +1,10 @@
 import youtube_dl
 import pandas as pd
-import jellyfish
 import logging
 import time
 
+import phonetic_compares
+
 YDL_OPTIONS = {'format': 'bestaudio', 'noplaylist': 'True'}
 YOUTUBE_URL_KEY = 'webpage_url'
 WAIT_BETWEEN_BLOCK = 10
@@ -31,12 +32,11 @@ def get_youtube_url(row):
     result = get_youtube_from_isrc(row['isrc'])
     video_title = result['title'].lower()
 
-    phonetic_distance = jellyfish.levenshtein_distance(real_title, video_title)
+    mismatch, distance = phonetic_compares.match_titles(video_title, real_title)
 
-    print(real_title, video_title, phonetic_distance)
-    if phonetic_distance > 1:
+    if mismatch:
         logging.warning(
-            f"dont downloading {result['url']} cuz the phonetic distance ({phonetic_distance}) between {real_title} and {video_title} is to high.")
+            f"dont downloading {result['url']} cuz the phonetic distance ({distance}) between {real_title} and {video_title} is to high.")
         return None
 
     return result['url']