From 1be7f6e337399311fd6744bf9135886ad024d05d Mon Sep 17 00:00:00 2001 From: lars Date: Wed, 26 Oct 2022 10:56:33 +0200 Subject: [PATCH] made the fetching from musify more consistent --- src/musify.py | 25 +++++++++++++++---------- src/phonetic_compares.py | 2 +- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/musify.py b/src/musify.py index fa30b8f..16fcea4 100644 --- a/src/musify.py +++ b/src/musify.py @@ -12,10 +12,10 @@ session.headers = { def get_musify_url(row): - title = row['title'] - artists = row['artist'] + title = row.title + artists = row.artist - url = f"https://musify.club/search/suggestions?term={title}" + url = f"https://musify.club/search/suggestions?term={artists[0]} - {title}" r = session.get(url=url) if r.status_code == 200: @@ -59,15 +59,19 @@ def download(row): return download_from_musify(file_, url) +def get_soup_of_search(query: str): + url = f"https://musify.club/search?searchText={query}" + print(url) + r = session.get(url) + if r.status_code != 200: + raise ConnectionError(f"{r.url} returned {r.status_code}:\n{r.content}") + return bs4.BeautifulSoup(r.content, features="html.parser") + def search_for_track(row): track = row.title artist = row.artist - url = f"https://musify.club/search?searchText={track}" - r = session.get(url) - if r.status_code != 200: - raise ConnectionError(f"{r.url} returned {r.status_code}:\n{r.content}") - soup = bs4.BeautifulSoup(r.content, features="html.parser") + soup = get_soup_of_search(f"{artist[0]} - {track}") tracklist_container_soup = soup.find_all("div", {"class": "playlist"}) if len(tracklist_container_soup) != 1: raise Exception("Connfusion Error. HTML Layout of https://musify.club changed.") @@ -98,7 +102,6 @@ def search_for_track(row): def get_musify_url_slow(row): - print(row) result = search_for_track(row) if result is not None: return result @@ -109,9 +112,11 @@ if __name__ == "__main__": import json df = pd.read_csv("../temp/.cache1.csv") - print(df) for idx, row in df.iterrows(): row['artist'] = json.loads(row['artist'].replace("'", '"')) print("-" * 200) + print("fast") + print(get_musify_url(row)) + print("slow") print(get_musify_url_slow(row)) diff --git a/src/phonetic_compares.py b/src/phonetic_compares.py index 54f00e8..a5bd28f 100644 --- a/src/phonetic_compares.py +++ b/src/phonetic_compares.py @@ -5,7 +5,7 @@ TITLE_THRESHOLD_LEVENSHTEIN = 2 def match_titles(title_1: str, title_2: str) -> (bool, int): distance = jellyfish.levenshtein_distance(title_1, title_2) - return distance > 1, distance + return distance > TITLE_THRESHOLD_LEVENSHTEIN, distance def match_artists(artist_1, artist_2: str) -> (bool, int):