added a slower, search-based method to get tracks from musify, used as a fallback when the other lookups find nothing

Author: lars
Date: 2022-10-25 17:59:33 +02:00
Commit: e923dcdaf2 (parent 84b4f8b6c2)
9 changed files with 109 additions and 28 deletions


@@ -31,6 +31,12 @@ class Download:
                self.add_url(youtube_url, 'youtube', dict(row))
                continue

            # check musify again, but with a different method that takes longer
            musify_url = musify.get_musify_url_slow(row)
            if musify_url is not None:
                self.add_url(musify_url, 'musify', dict(row))
                continue

            logging.warning(f"Didn't find any sources for {row['title']}")

        self.dump_urls(file)
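The hunk above adds one more fallback to the per-row source lookup: if the YouTube lookup produced nothing, the slower musify search is tried before giving up on the row. A minimal standalone sketch of that fallback pattern, with placeholder resolver functions rather than the project's real ones:

import logging

def resolve_source(row, resolvers):
    # resolvers: list of (name, callable) pairs tried in order; each callable
    # returns a URL or None, and the first hit wins
    for name, resolver in resolvers:
        url = resolver(row)
        if url is not None:
            return name, url
    logging.warning(f"Didn't find any sources for {row['title']}")
    return None, None

# usage with dummy resolvers standing in for the real youtube/musify lookups
row = {'title': 'Some Track'}
print(resolve_source(row, [
    ('youtube', lambda r: None),                          # simulates a miss
    ('musify', lambda r: 'https://musify.club/track/x'),  # simulates a hit
]))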


@@ -1,5 +1,8 @@
import logging
import requests
import bs4
import phonetic_compares

session = requests.Session()
session.headers = {
@@ -54,3 +57,61 @@ def download(row):
    url = row['url']
    file_ = row['file']
    return download_from_musify(file_, url)
def search_for_track(row):
    track = row.title
    artist = row.artist

    # query the public search page for the track title
    url = f"https://musify.club/search?searchText={track}"
    r = session.get(url)
    if r.status_code != 200:
        raise ConnectionError(f"{r.url} returned {r.status_code}:\n{r.content}")
    soup = bs4.BeautifulSoup(r.content, features="html.parser")

    # the search results are rendered inside a single "playlist" container
    tracklist_container_soup = soup.find_all("div", {"class": "playlist"})
    if len(tracklist_container_soup) != 1:
        raise Exception("Confusion Error. HTML Layout of https://musify.club changed.")
    tracklist_container_soup = tracklist_container_soup[0]

    tracklist_soup = tracklist_container_soup.find_all("div", {"class": "playlist__details"})

    def parse_track_soup(_track_soup):
        # first anchor is the band, second anchor is the track title and its link
        anchor_soups = _track_soup.find_all("a")
        band_name = anchor_soups[0].text.strip()
        title = anchor_soups[1].text.strip()
        url_ = anchor_soups[1]['href']
        return band_name, title, url_

    for track_soup in tracklist_soup:
        band_option, title_option, track_url = parse_track_soup(track_soup)

        title_match, title_distance = phonetic_compares.match_titles(track, title_option)
        band_match, band_distance = phonetic_compares.match_artists(artist, band_option)

        print(track, title_option, title_match, title_distance)
        print(artist, band_option, band_match, band_distance)

        # the match flags are True when the strings differ, so "not ..." means
        # both the title and the band are close enough
        if not title_match and not band_match:
            return get_download_link(track_url)

    return None


def get_musify_url_slow(row):
    print(row)
    result = search_for_track(row)
    if result is not None:
        return result


if __name__ == "__main__":
    import pandas as pd
    import json

    df = pd.read_csv("../temp/.cache1.csv")
    print(df)

    for idx, row in df.iterrows():
        # the artist column is stored as a stringified list
        row['artist'] = json.loads(row['artist'].replace("'", '"'))

        print("-" * 200)
        print(get_musify_url_slow(row))
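A small usage sketch for the new entry point, assuming the file is importable as musify (as the first hunk suggests). The row only needs title and artist attributes, so a namedtuple works as well as the pandas rows used in the __main__ block above; note this performs a real request against musify.club:

from collections import namedtuple
import musify  # the module these functions live in

Row = namedtuple("Row", ["title", "artist"])
row = Row(title="Smells Like Teen Spirit", artist=["Nirvana"])

url = musify.get_musify_url_slow(row)  # None when no close match is found
print(url)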


@@ -1,8 +1,23 @@
import jellyfish

-TITLE_THRESHOLD_LEVENSHTEIN = 1
+TITLE_THRESHOLD_LEVENSHTEIN = 2


def match_titles(title_1: str, title_2: str) -> (bool, int):
    # returns (True, distance) when the titles differ by more than 1, i.e. the
    # boolean means "does NOT match"; note the comparison is hardcoded to 1, so
    # TITLE_THRESHOLD_LEVENSHTEIN is not actually applied here
    distance = jellyfish.levenshtein_distance(title_1, title_2)
    return distance > 1, distance


def match_artists(artist_1, artist_2: str) -> (bool, int):
    # artist_1 may be a single name or a list of names; with a list, return as
    # soon as one entry is close enough to artist_2
    if type(artist_1) == list:
        distances = []

        for artist_1_ in artist_1:
            print(artist_1_)
            match, distance = match_titles(artist_1_, artist_2)
            if not match:
                return match, distance

            distances.append(distance)

        return True, min(distances)

    return match_titles(artist_1, artist_2)
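For reference, a short behaviour sketch for these two helpers, assuming phonetic_compares.py is importable (e.g. run from the same directory). It shows the inverted boolean (True means the strings do not match), which is why the caller in musify checks `not title_match and not band_match`:

import phonetic_compares

# distance 1 ("Hello" -> "Hallo") is within the hardcoded limit -> (False, 1), i.e. a match
print(phonetic_compares.match_titles("Hello", "Hallo"))

# distance 7 is too large -> (True, 7), i.e. no match
print(phonetic_compares.match_titles("Hello", "Goodbye"))

# with a list, match_artists returns early on the first close entry -> (False, 1)
print(phonetic_compares.match_artists(["Nirvana", "Bush"], "Nirvanna"))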