"""Search and download helpers for https://musify.club."""

import logging

import requests
import bs4

import phonetic_compares

# one shared session, so keep-alive connections and the Referer header are
# reused across all requests to musify.club
session = requests.Session()
session.headers = {
    "Connection": "keep-alive",
    "Referer": "https://musify.club/"
}
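
# No User-Agent is set here; should musify.club start rejecting the default
# python-requests agent, one could be added along these lines (header value
# purely illustrative):
#   session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) ..."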


def set_proxy(proxies):
    # route all traffic of the shared session through the given proxies,
    # e.g. a Tor SOCKS proxy (see the __main__ block at the bottom)
    session.proxies = proxies


def get_musify_url(row):
    title = row['title']
    artists = row['artists']

    # let requests url-encode the search term instead of interpolating it
    # into the raw url (titles may contain '&', '#' etc.)
    r = session.get(
        url="https://musify.club/search/suggestions",
        params={"term": f"{artists[0]} - {title}"}
    )
    if r.status_code == 200:
        autocomplete = r.json()
        for suggestion in autocomplete:
            # take the first suggestion whose label mentions one of the
            # artists and whose url points at a track page
            if any(a in suggestion['label'] for a in artists) and "/track" in suggestion['url']:
                return get_download_link(suggestion['url'])

    return None
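
# Inferred from the loop above: the suggestions endpoint returns a JSON array
# of objects carrying at least "label" and "url" keys, e.g. (illustrative,
# not an actual response):
#   {"label": "Rauw Alejandro - Te Felicito",
#    "url": "/track/rauw-alejandro-te-felicito-feat-shakira-18567672"}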


def get_download_link(default_url):
    # a track page url like
    #   /track/sundenklang-wenn-mein-herz-schreit-3883217
    # maps to a direct download url like
    #   https://musify.club/track/dl/18567672/rauw-alejandro-te-felicito-feat-shakira.mp3
    # i.e. the trailing numeric id moves in front of the slug
    file_ = default_url.split("/")[-1]
    musify_id = file_.split("-")[-1]
    musify_name = "-".join(file_.split("-")[:-1])

    return f"https://musify.club/track/dl/{musify_id}/{musify_name}.mp3"


def download_from_musify(file, url):
    logging.info(f"downloading: '{url}'")
    r = session.get(url)
    if r.status_code != 200:
        if r.status_code == 404:
            logging.warning(f"{r.url} was not found")
            return -1
        if r.status_code == 503:
            # 503 means the service is (temporarily) unavailable
            logging.warning(f"{r.url} returned 503: service unavailable")
            return -1
        raise ConnectionError(f'"{url}" returned {r.status_code}: {r.text}')
    with open(file, "wb") as mp3_file:
        mp3_file.write(r.content)
    logging.info("finished")


def download(row):
    url = row['url']
    file_ = row['file']
    return download_from_musify(file_, url)
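
# Expected input mapping for download(); the keys mirror the lookups above,
# the values are illustrative:
#   download({"file": "te-felicito.mp3",
#             "url": "https://musify.club/track/dl/18567672/rauw-alejandro-te-felicito-feat-shakira.mp3"})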


def get_soup_of_search(query: str):
    logging.debug(f"Trying to get search soup for '{query}'")
    # let requests url-encode the search text, as in get_musify_url()
    r = session.get("https://musify.club/search", params={"searchText": query})
    if r.status_code != 200:
        raise ConnectionError(f"{r.url} returned {r.status_code}:\n{r.content}")
    return bs4.BeautifulSoup(r.content, features="html.parser")


def search_for_track(row):
    track = row['title']
    artist = row['artists']

    soup = get_soup_of_search(f"{artist[0]} - {track}")

    tracklist_container_soup = soup.find_all("div", {"class": "playlist"})
    if len(tracklist_container_soup) == 0:
        return None
    if len(tracklist_container_soup) != 1:
        raise Exception("Confusion error: the HTML layout of https://musify.club changed.")
    tracklist_container_soup = tracklist_container_soup[0]

    tracklist_soup = tracklist_container_soup.find_all("div", {"class": "playlist__details"})

    def parse_track_soup(_track_soup):
        # each result row carries two anchors: the band first, then the track
        anchor_soups = _track_soup.find_all("a")
        band_name = anchor_soups[0].text.strip()
        title = anchor_soups[1].text.strip()
        url_ = anchor_soups[1]['href']
        return band_name, title, url_

    for track_soup in tracklist_soup:
        band_option, title_option, track_url = parse_track_soup(track_soup)

        title_match, title_distance = phonetic_compares.match_titles(track, title_option)
        band_match, band_distance = phonetic_compares.match_artists(artist, band_option)

        logging.debug(f"{(track, title_option, title_match, title_distance)}")
        logging.debug(f"{(artist, band_option, band_match, band_distance)}")

        # the match flags signal a MISmatch here, so a candidate is accepted
        # when neither the title nor the band is flagged
        if not title_match and not band_match:
            return get_download_link(track_url)

    return None


def get_musify_url_slow(row):
    # scrape the full search page instead of the autocomplete endpoint
    result = search_for_track(row)
    if result is not None:
        return result
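
# A possible combination of both lookups (a sketch, not part of the original
# interface): try the fast autocomplete endpoint first and fall back to
# scraping the search page only when it yields nothing.
def get_musify_url_combined(row):
    return get_musify_url(row) or get_musify_url_slow(row)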


if __name__ == "__main__":
    import pandas as pd
    import json

    TOR = True
    if TOR:
        # 9150 is the SOCKS port of the Tor Browser bundle
        # (a standalone tor daemon usually listens on 9050)
        set_proxy({
            'http': 'socks5h://127.0.0.1:9150',
            'https': 'socks5h://127.0.0.1:9150'
        })

    df = pd.read_csv("../temp/.cache1.csv")

    for idx, row in df.iterrows():
        # the lookup functions expect the artist list under 'artists'; parse
        # the python-style list string stored in the csv (note: the naive
        # quote replacement breaks on names containing apostrophes)
        row['artists'] = json.loads(row['artist'].replace("'", '"'))
        print("-" * 200)
        print("slow")
        print(get_musify_url_slow(row))