From 137b18517f63b0bba3d39b4ced48ed4cfacb7984 Mon Sep 17 00:00:00 2001 From: lars Date: Mon, 24 Oct 2022 13:23:07 +0200 Subject: [PATCH] moved linguistics stuff to another file --- requirements.txt | 3 ++- src/__pycache__/phonetic_compares.cpython-310.pyc | Bin 0 -> 196 bytes src/phonetic_compares.py | 6 ++++++ src/youtube_music.py | 10 +++++----- 4 files changed, 13 insertions(+), 6 deletions(-) create mode 100644 src/__pycache__/phonetic_compares.cpython-310.pyc diff --git a/requirements.txt b/requirements.txt index 823a91c..d525e14 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ pandas~=1.5.0 mutagen~=1.46.0 musicbrainzngs~=0.7.1 jellyfish~=0.9.0 -pydub~=0.25.1 \ No newline at end of file +pydub~=0.25.1 +youtube_dl \ No newline at end of file diff --git a/src/__pycache__/phonetic_compares.cpython-310.pyc b/src/__pycache__/phonetic_compares.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8286e70db6bd2c436db3e93ccd84330eb0af37c2 GIT binary patch literal 196 zcmYk0F%H5o3`J8|0JRGzVC?|8079GqA&@$-ppqDxlGGt?g?b{ymAW!<1twf3zT{`U zzu1^06Tx}hGBxG>>g7KJmNgH>3@vnI#eumO>EO+c*rZW6F2UONY^ZpOFy!)w&rUkf zCYLr3Br~*)1Gx=Y!nG!OGMAa}MX|VsjGAk_R~B;(K~hlC7g#}U)KQ`Hc>r4Ywi_)S O>c+w<;xp7g7mF`qs55;4 literal 0 HcmV?d00001 diff --git a/src/phonetic_compares.py b/src/phonetic_compares.py index a98f09c..556c563 100644 --- a/src/phonetic_compares.py +++ b/src/phonetic_compares.py @@ -1,2 +1,8 @@ import jellyfish +TITLE_THRESHOLD_LEVENSHTEIN = 1 + + +def match_titles(title_1: str, title_2: str) -> (bool, int): + distance = jellyfish.levenshtein_distance(title_1, title_2) + return distance > 1, distance diff --git a/src/youtube_music.py b/src/youtube_music.py index c469378..3f9ac99 100644 --- a/src/youtube_music.py +++ b/src/youtube_music.py @@ -1,9 +1,10 @@ import youtube_dl import pandas as pd -import jellyfish import logging import time +import phonetic_compares + YDL_OPTIONS = {'format': 'bestaudio', 'noplaylist': 'True'} YOUTUBE_URL_KEY = 'webpage_url' WAIT_BETWEEN_BLOCK = 10 @@ -31,12 +32,11 @@ def get_youtube_url(row): result = get_youtube_from_isrc(row['isrc']) video_title = result['title'].lower() - phonetic_distance = jellyfish.levenshtein_distance(real_title, video_title) + match, distance = phonetic_compares.match_titles(video_title, real_title) - print(real_title, video_title, phonetic_distance) - if phonetic_distance > 1: + if match: logging.warning( - f"dont downloading {result['url']} cuz the phonetic distance ({phonetic_distance}) between {real_title} and {video_title} is to high.") + f"dont downloading {result['url']} cuz the phonetic distance ({distance}) between {real_title} and {video_title} is to high.") return None return result['url']