made the fetching from musify more consistent

This commit is contained in:
lars 2022-10-26 10:56:33 +02:00
parent a830e4eb61
commit 1be7f6e337
2 changed files with 16 additions and 11 deletions

View File

@ -12,10 +12,10 @@ session.headers = {
def get_musify_url(row): def get_musify_url(row):
title = row['title'] title = row.title
artists = row['artist'] artists = row.artist
url = f"https://musify.club/search/suggestions?term={title}" url = f"https://musify.club/search/suggestions?term={artists[0]} - {title}"
r = session.get(url=url) r = session.get(url=url)
if r.status_code == 200: if r.status_code == 200:
@ -59,15 +59,19 @@ def download(row):
return download_from_musify(file_, url) return download_from_musify(file_, url)
def get_soup_of_search(query: str):
    """Fetch the musify.club search page for ``query`` and parse it.

    Args:
        query: Free-text search term. It is percent-encoded before being
            placed in the ``searchText`` query parameter.

    Returns:
        A ``bs4.BeautifulSoup`` built from the response body using the
        ``html.parser`` backend.

    Raises:
        ConnectionError: If the response status code is not 200.
    """
    # Imported locally so this block does not depend on the file's
    # top-level import list.
    from urllib.parse import quote

    # Encode the whole term (safe="") so characters such as "&", "#", "+"
    # or "?" inside an artist or title stay part of the search text instead
    # of being read as URL syntax.
    url = f"https://musify.club/search?searchText={quote(query, safe='')}"
    print(url)
    r = session.get(url)
    if r.status_code != 200:
        raise ConnectionError(f"{r.url} returned {r.status_code}:\n{r.content}")
    return bs4.BeautifulSoup(r.content, features="html.parser")
def search_for_track(row): def search_for_track(row):
track = row.title track = row.title
artist = row.artist artist = row.artist
url = f"https://musify.club/search?searchText={track}" soup = get_soup_of_search(f"{artist[0]} - {track}")
r = session.get(url)
if r.status_code != 200:
raise ConnectionError(f"{r.url} returned {r.status_code}:\n{r.content}")
soup = bs4.BeautifulSoup(r.content, features="html.parser")
tracklist_container_soup = soup.find_all("div", {"class": "playlist"}) tracklist_container_soup = soup.find_all("div", {"class": "playlist"})
if len(tracklist_container_soup) != 1: if len(tracklist_container_soup) != 1:
raise Exception("Connfusion Error. HTML Layout of https://musify.club changed.") raise Exception("Connfusion Error. HTML Layout of https://musify.club changed.")
@ -98,7 +102,6 @@ def search_for_track(row):
def get_musify_url_slow(row): def get_musify_url_slow(row):
print(row)
result = search_for_track(row) result = search_for_track(row)
if result is not None: if result is not None:
return result return result
@ -109,9 +112,11 @@ if __name__ == "__main__":
import json import json
df = pd.read_csv("../temp/.cache1.csv") df = pd.read_csv("../temp/.cache1.csv")
print(df)
for idx, row in df.iterrows(): for idx, row in df.iterrows():
row['artist'] = json.loads(row['artist'].replace("'", '"')) row['artist'] = json.loads(row['artist'].replace("'", '"'))
print("-" * 200) print("-" * 200)
print("fast")
print(get_musify_url(row))
print("slow")
print(get_musify_url_slow(row)) print(get_musify_url_slow(row))

View File

@ -5,7 +5,7 @@ TITLE_THRESHOLD_LEVENSHTEIN = 2
def match_titles(title_1: str, title_2: str) -> (bool, int): def match_titles(title_1: str, title_2: str) -> (bool, int):
distance = jellyfish.levenshtein_distance(title_1, title_2) distance = jellyfish.levenshtein_distance(title_1, title_2)
return distance > 1, distance return distance > TITLE_THRESHOLD_LEVENSHTEIN, distance
def match_artists(artist_1, artist_2: str) -> (bool, int): def match_artists(artist_1, artist_2: str) -> (bool, int):