2022-10-17 22:27:30 +00:00
|
|
|
import ast
import json
import logging
import os.path

import pandas as pd
import requests

import musify
import youtube_music
|
|
|
|
|
2022-10-17 17:28:33 +00:00
|
|
|
|
2022-10-17 12:56:32 +00:00
|
|
|
class Download:
    """Resolve a download URL for every track in a metadata CSV.

    Reads track metadata from ``<temp>/<metadata_csv>``, looks each row up
    on musify first and YouTube Music second, collects every hit in
    ``self.urls`` and finally dumps them to ``<temp>/<file>`` as CSV.
    """

    def __init__(self, metadata_csv: str = ".cache1.csv", session: "requests.Session | None" = None,
                 file: str = ".cache2.csv", temp: str = "temp") -> None:
        """Build the URL list for all rows of the metadata cache.

        :param metadata_csv: file name of the metadata CSV inside ``temp``.
        :param session: HTTP session to use (e.g. with proxies configured);
            a fresh ``requests.Session`` is created when omitted.
        :param file: file name the resolved URLs are written to.
        :param temp: directory holding both cache files.
        """
        # NOTE: the previous signature used a shared mutable default
        # (`session=requests.Session()`), evaluated once at definition time
        # and shared by every instance; create one per instance instead.
        self.session = requests.Session() if session is None else session
        self.temp = temp
        self.metadata = pd.read_csv(os.path.join(self.temp, metadata_csv), index_col=0)

        self.urls = []

        for idx, row in self.metadata.iterrows():
            # The 'artist' column stores a Python-repr list (single-quoted).
            # ast.literal_eval parses it directly; the old
            # replace("'", '"') + json.loads hack broke on artist names
            # containing apostrophes (e.g. "Guns N' Roses").
            row['artist'] = ast.literal_eval(row['artist'])

            # check musify
            musify_url = musify.get_musify_url(row)
            if musify_url is not None:
                self.add_url(musify_url, 'musify', dict(row))
                continue

            # check youtube
            youtube_url = youtube_music.get_youtube_url(row)
            if youtube_url is not None:
                self.add_url(youtube_url, 'youtube', dict(row))
                continue

            # Lazy %-formatting: the message is only built if actually emitted.
            logging.warning("Didn't find any sources for %s", row['title'])

        self.dump_urls(file)

    def add_url(self, url: str, src: str, row: dict):
        """Attach *url* and *src* to *row* and queue it in ``self.urls``.

        :param url: resolved download URL.
        :param src: source identifier (``'musify'`` or ``'youtube'``).
        :param row: track metadata; mutated in place.
        """
        row['url'] = url
        row['src'] = src
        self.urls.append(row)

    def dump_urls(self, file: str = ".cache2.csv"):
        """Write the collected URL rows to ``<temp>/<file>`` as CSV.

        :param file: target file name inside ``self.temp``.
        """
        df = pd.DataFrame(self.urls)
        df.to_csv(os.path.join(self.temp, file))
|
|
|
|
|
2022-10-17 12:56:32 +00:00
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Route all HTTP(S) traffic through the local Tor SOCKS proxy
    # (socks5h: hostname resolution also happens on the proxy side).
    tor_proxy = 'socks5h://127.0.0.1:9150'

    s = requests.Session()
    s.proxies = {
        'http': tor_proxy,
        'https': tor_proxy,
    }

    download = Download(session=s)