Moved the title-comparison (Levenshtein distance) logic into a separate module, phonetic_compares

This commit is contained in:
lars 2022-10-24 13:23:07 +02:00
parent 966f0c9602
commit 137b18517f
4 changed files with 13 additions and 6 deletions

View File

@ -3,4 +3,5 @@ pandas~=1.5.0
mutagen~=1.46.0 mutagen~=1.46.0
musicbrainzngs~=0.7.1 musicbrainzngs~=0.7.1
jellyfish~=0.9.0 jellyfish~=0.9.0
pydub~=0.25.1 pydub~=0.25.1
youtube_dl

Binary file not shown.

View File

@ -1,2 +1,8 @@
import jellyfish import jellyfish
# Largest Levenshtein distance at which two titles are still treated as
# referring to the same track.
TITLE_THRESHOLD_LEVENSHTEIN = 1


def match_titles(title_1: str, title_2: str) -> (bool, int):
    """Compare two titles by their Levenshtein distance.

    NOTE: despite the function name, the returned flag is True when the
    titles are *too different* (distance above the threshold), not when
    they match. Callers treat a True flag as "reject this candidate".

    Args:
        title_1: First title to compare.
        title_2: Second title to compare.

    Returns:
        A ``(exceeds_threshold, distance)`` tuple, where ``distance`` is the
        Levenshtein distance between the two titles and ``exceeds_threshold``
        is True when that distance is greater than
        ``TITLE_THRESHOLD_LEVENSHTEIN``.
    """
    distance = jellyfish.levenshtein_distance(title_1, title_2)
    # Use the module constant instead of a duplicated literal so the
    # threshold can be tuned in one place.
    return distance > TITLE_THRESHOLD_LEVENSHTEIN, distance

View File

@ -1,9 +1,10 @@
import youtube_dl import youtube_dl
import pandas as pd import pandas as pd
import jellyfish
import logging import logging
import time import time
import phonetic_compares
YDL_OPTIONS = {'format': 'bestaudio', 'noplaylist': 'True'} YDL_OPTIONS = {'format': 'bestaudio', 'noplaylist': 'True'}
YOUTUBE_URL_KEY = 'webpage_url' YOUTUBE_URL_KEY = 'webpage_url'
WAIT_BETWEEN_BLOCK = 10 WAIT_BETWEEN_BLOCK = 10
@ -31,12 +32,11 @@ def get_youtube_url(row):
result = get_youtube_from_isrc(row['isrc']) result = get_youtube_from_isrc(row['isrc'])
video_title = result['title'].lower() video_title = result['title'].lower()
phonetic_distance = jellyfish.levenshtein_distance(real_title, video_title) match, distance = phonetic_compares.match_titles(video_title, real_title)
print(real_title, video_title, phonetic_distance) if match:
if phonetic_distance > 1:
logging.warning( logging.warning(
f"dont downloading {result['url']} cuz the phonetic distance ({phonetic_distance}) between {real_title} and {video_title} is to high.") f"dont downloading {result['url']} cuz the phonetic distance ({distance}) between {real_title} and {video_title} is to high.")
return None return None
return result['url'] return result['url']