music-kraken-core/src/download_links.py

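"""Resolve download links for the tracks in the metadata cache.

For each row of the metadata CSV this tries musify first, then YouTube
Music, then a slower musify lookup, and writes the resolved (url, src)
pairs to a second CSV cache in the temp directory.
"""
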
import json
import logging
import os.path

import pandas as pd
import requests

import musify
import youtube_music


class Download:
    def __init__(self, metadata_csv: str = ".cache1.csv", session: requests.Session = requests.Session(),
                 file: str = ".cache2.csv", temp: str = "temp") -> None:
        self.temp = temp
        # keep the session so downstream request code can reuse its proxy settings
        self.session = session
        self.metadata = pd.read_csv(os.path.join(self.temp, metadata_csv), index_col=0)
        self.urls = []

        for idx, row in self.metadata.iterrows():
            # the artist column is stored as a stringified list; parse it back into a list
            row['artist'] = json.loads(row['artist'].replace("'", '"'))

            # check musify
            musify_url = musify.get_musify_url(row)
            if musify_url is not None:
                self.add_url(musify_url, 'musify', dict(row))
                continue

            # check youtube
            youtube_url = youtube_music.get_youtube_url(row)
            if youtube_url is not None:
                self.add_url(youtube_url, 'youtube', dict(row))
                continue

            # check musify again, but with a different method that takes longer
            musify_url = musify.get_musify_url_slow(row)
            if musify_url is not None:
                self.add_url(musify_url, 'musify', dict(row))
                continue

            logging.warning(f"Didn't find any sources for {row['title']}")

        self.dump_urls(file)

2022-10-17 22:27:30 +00:00
2022-10-20 13:13:19 +00:00
    def add_url(self, url: str, src: str, row: dict):
        row['url'] = url
        row['src'] = src

        self.urls.append(row)

    def dump_urls(self, file: str = ".cache2.csv"):
        df = pd.DataFrame(self.urls)
        df.to_csv(os.path.join(self.temp, file))


if __name__ == "__main__":
    # route all requests through a local SOCKS5 proxy; 127.0.0.1:9150 is the
    # port Tor Browser typically exposes
    proxies = {
        'http': 'socks5h://127.0.0.1:9150',
        'https': 'socks5h://127.0.0.1:9150'
    }
    s = requests.Session()
    s.proxies = proxies

    download = Download(session=s)