startet genius lyrics fetcher

This commit is contained in:
lars 2022-11-09 22:52:32 +01:00
parent 3bc133ef62
commit 9bb510e41a
7 changed files with 163 additions and 162 deletions

View File

@ -1,5 +1,4 @@
requests~=2.28.1
pandas~=1.5.0
mutagen~=1.46.0
musicbrainzngs~=0.7.1
jellyfish~=0.9.0

View File

@ -1,121 +0,0 @@
from mutagen.id3 import ID3, USLT
"""
https://cweiske.de/tagebuch/rhythmbox-lyrics.htm
Rhythmbox, my music player, doesn't support ID3 lyrics (USLT) yet, so I have to find something else.
Lyrics in MP3 ID3 tags (SYLT/USLT) are still missing, because GStreamer does not support that yet.
One possible solution would be to use ogg/vorbis files. Those lyrics are supported in Rhythmbox:
'So, the next Rhythmbox release (3.5.0 or 3.4.2) will read lyrics directly from ogg/vorbis files, using the LYRICS and SYNCLYRICS tags.'
Another possible solution (probably the better one, because I don't need to refactor the whole metadata AGAIN)
would be to write a Rhythmbox plugin that fetches lyrics from ID3 USLT.
"""
# https://www.programcreek.com/python/example/63462/mutagen.mp3.EasyMP3
# https://code.activestate.com/recipes/577138-embed-lyrics-into-mp3-files-using-mutagen-uslt-tag/
MP3_PATH = "/home/lars/Music/deathcore/Brand of Sacrifice/The Interstice/Eclipse.mp3"
LYRICS_BREAKING_DOWN = """
[Chorus: Brian Burkheiser]
I think, I think too much
I'm a little bit paranoid, I think Im breaking
Maybe it's in my blood
Got a pain that I can't avoid, I think Im breaking down
[Verse 1: Brian Burkheiser]
Hate every single second, minute, hour every day
Person in the mirror, they won't let me feel a thing
Keep me focused on my problems, I'm addicted to the pain
Everybody's out to get you
[Pre-Chorus: Eric Vanlerberghe]
I guess I never noticed how it came creeping in
My enemy emotion, but I can't sink or swim
I say I'm feeling hopeless, they give me medicine
They give me medicine, they give me medicine
[Chorus: Brian Burkheiser & Eric Vanlerberghe]
I think I think too much (Too much)
I'm a little bit paranoid, I think I'm breaking
Maybe its in my blood (My blood)
Got a pain that I cant avoid, I think I'm breaking
Down, I think Im breaking
Down, I think I'm breaking
I think I think too much (Too much)
I'm a little bit paranoid, I think I'm breaking down
[Verse 2: Brian Burkheiser]
Lies, every time they ask me, I just tell em that I'm fine
Try to hide my demons, but they only multiply
Keep me running from the voices on repeat inside my mind
Everybody fucking hates you
[Pre-Chorus: Eric Vanlerberghe]
I guess I never noticed how it came creeping in
My enemy emotion, but I can't sink or swim
I say I'm feeling hopeless, but no one's listening
But no one's listening, but no one's listening
You might also like
DOA
I Prevail
Rise Above It
I Prevail
Bow Down
I Prevail
[Chorus: Brian Burkheiser & Eric Vanlerberghe]
I think I think too much (Too much)
I'm a little bit paranoid, I think I'm breaking
Maybe it's in my blood (My blood)
Got a pain that I can't avoid, I think I'm breaking
Down, I think I'm breaking
Down, I think I'm breaking
I think I think too much (Too much)
I'm a little bit paranoid, I think I'm breaking down
[Outro: Brian Burkheiser]
I don't really like myself
I don't really like myself
I don't really like myself
I don't really like myself
I think I'm breaking down
"""
LYRICS_ECLIPSE = """
Your offerings have consecrated
They are marked by the brand
The sun has seen it's fifth death
For the red lake to flow again
He will
Feel their pain in order to
Complete the final transformation
A name new and old
Your offerings have been consecrated by the laws of Causality
Falcon of Darkness
Send us into an age of abyss
Blinded by beauty
With stacks of bodies as high as the eye can see
Feast, apostles, feast
The one chosen by the hand of God
The master of the sinful black sheep
And the king of the faithful blind
Welcome to the new age
Welcome to the new age
We are the branded ones"""
def add_lyrics(file_name, lyrics=""):
    """Embed ``lyrics`` into the ID3 tag of ``file_name`` as a USLT frame.

    The frame is written with language ``eng`` and descriptor ``desc`` and
    the tag is saved back to the same file.

    :param file_name: path of the audio file; it is opened with ``ID3`` and
        saved in place (presumably it must already carry an ID3 header --
        confirm against the mutagen documentation).
    :param lyrics: unsynchronised lyrics text to store.
    """
    tags = ID3(file_name)
    # encoding=3 is mutagen's UTF-8 text encoding.
    uslt_output = USLT(encoding=3, lang=u'eng', desc=u'desc', text=lyrics)
    # ``add`` files the frame under the frame's own hash key, so tagging a
    # file again replaces the existing lyrics frame. The previous hand-written
    # key "USLT::'eng'" never matched that key and could leave two USLT frames
    # with the same descriptor/language in the tag.
    tags.add(uslt_output)
    tags.save(file_name)
def get_lyrics(file_name):
    """Return every USLT (unsynchronised lyrics) frame stored in ``file_name``.

    :param file_name: path handed to ``ID3`` for parsing.
    :return: the result of ``getall("USLT")`` on the parsed tag.
    """
    return ID3(file_name).getall("USLT")
if __name__ == "__main__":
    # Manual check: write the sample lyrics into the sample file, then read
    # the USLT frames back and show them.
    add_lyrics(MP3_PATH, lyrics=LYRICS_ECLIPSE)
    stored_frames = get_lyrics(MP3_PATH)
    print(stored_frames)

View File

@ -3,10 +3,12 @@ import sys
import os
import logging
from typing import List
from bs4 import BeautifulSoup
current = os.path.dirname(os.path.realpath(__file__))
parent = os.path.dirname(current)
sys.path.append(parent)
# utils >:3
from tools import phonetic_compares
from tools.object_handeling import get_elem_from_obj
@ -49,7 +51,8 @@ class Song:
self.lyricist: str
if get_elem_from_obj(song_data, ['lyrics_state']) != "complete":
logger.warning(f"lyrics state of {self.title} by {self.artist} is not complete but {get_elem_from_obj(song_data, ['lyrics_state'])}")
logger.warning(
f"lyrics state of {self.title} by {self.artist} is not complete but {get_elem_from_obj(song_data, ['lyrics_state'])}")
self.valid = self.is_valid()
if self.valid:
@ -61,33 +64,44 @@ class Song:
title_match, title_distance = phonetic_compares.match_titles(self.title, self.desired_data['track'])
artist_match, artist_distance = phonetic_compares.match_artists(self.artist, self.desired_data['artist'])
return title_match and artist_match
return not title_match and not artist_match
def __repr__(self) -> str:
return f"{self.title} by {self.artist}"
return f"{self.title} by {self.artist} ({self.url})"
def fetch_lyrics(self) -> str | None:
    """Download the page at ``self.url`` and extract the lyrics text from it.

    The extracted text is stored on ``self.lyrics`` and returned.

    :return: the lyrics text, or ``None`` when the request does not answer
        with status 200 or the page contains no ``div`` carrying
        ``data-lyrics-container="true"``.
    """
    if not self.valid:
        logger.warning(f"{self.__repr__()} is invalid but the lyrics still get fetched. Something could be wrong.")

    r = session.get(self.url)
    if r.status_code != 200:
        # Use the module logger like every other message in this method.
        logger.warning(f"{r.url} returned {r.status_code}:\n{r.content}")
        return None

    soup = BeautifulSoup(r.content, "html.parser")
    # e.g. <div data-lyrics-container="true" class="Lyrics__Container-...">
    lyrics_soups = soup.find_all('div', {'data-lyrics-container': "true"})
    if len(lyrics_soups) == 0:
        logger.warning(f"didn't found lyrics on {self.url}")
        return None
    if len(lyrics_soups) != 1:
        logger.warning(f"number of lyrics_soups doesn't equals 1, but {len(lyrics_soups)} on {self.url}")

    # NOTE(review): several containers are assumed to be consecutive parts of
    # the same lyrics, so they are joined in document order instead of
    # silently dropping everything after the first one -- confirm on a page
    # that triggers the warning above.
    lyrics = "\n".join(
        lyrics_soup.get_text(separator="\n", strip=True)
        for lyrics_soup in lyrics_soups
    )

    self.lyrics = lyrics
    return lyrics
def build_search_query(artist: str, track: str) -> str:
    """Return the search term for a track in the form ``"<artist> - <track>"``."""
    return "{} - {}".format(artist, track)
def process_multiple_songs(song_datas: list, desired_data: dict) -> List[Song]:
    """Wrap every search hit in a ``Song`` and return the filtered list.

    :param song_datas: raw hits, each passed to ``Song`` as first argument.
    :param desired_data: passed unchanged to every ``Song`` as second argument.
    :return: the songs whose ``valid`` attribute is falsy, in input order.
    """
    # Build all songs first, exactly once each, then filter.
    all_songs = list(map(lambda song_data: Song(song_data, desired_data), song_datas))

    # NOTE(review): keeping the songs with a falsy ``valid`` looks inverted;
    # verify against Song.is_valid before relying on it.
    kept = []
    for song_ in all_songs:
        if not song_.valid:
            kept.append(song_)
    return kept
def search_song_list(artist: str, track: str) -> List[Song]:
endpoint = "https://genius.com/api/search/multi?q="
url = endpoint + build_search_query(artist, track)
url = f"{endpoint}{artist} - {track}"
logging.info(f"requesting {url}")
desired_data = {
@ -104,24 +118,56 @@ def search_song_list(artist: str, track: str) -> List[Song]:
logging.warning(f"{r.url} returned {get_elem_from_obj(content, ['meta', 'status'])}:\n{content}")
return []
# print(r.status_code)
# print(r.json())
sections = get_elem_from_obj(content, ['response', 'sections'])
for section in sections:
section_type = get_elem_from_obj(section, ['type'])
print(section_type)
if section_type == "song":
return process_multiple_songs(get_elem_from_obj(section, ['hits'], return_if_none=[]), desired_data)
return []
def search(artist: str, track: str):
    """Search for ``track`` by ``artist`` and return the lyrics that were found.

    A stale ``return search_song_list(artist, track)`` used to sit in front
    of the fetching code, which made it unreachable and handed ``Song``
    objects to callers that expect lyric texts.

    :param artist: artist name used for the search.
    :param track: track title used for the search.
    :return: list with the result of ``fetch_lyrics()`` for every song
        returned by ``search_song_list``, with ``None`` results dropped.
    """
    raw_songs = search_song_list(artist, track)
    all_lyrics = [raw_song.fetch_lyrics() for raw_song in raw_songs]
    return [text for text in all_lyrics if text is not None]
if __name__ == "__main__":
# Manual smoke test, only run when this module is executed as a script.
logging.basicConfig(level=logging.DEBUG)
songs = search_song_list("Psychonaut 4", "Sana Sana Sana, Cura Cura Cura")
print(songs)
# Build one Song by hand from a hard-coded hit dict (first argument) and the
# desired artist/track (second argument), then print the lyrics fetched for it.
# NOTE(review): the dict is presumably a captured search hit -- confirm.
song = Song(
{'highlights': [], 'index': 'song', 'type': 'song',
'result': {'_type': 'song', 'annotation_count': 0, 'api_path': '/songs/6142483',
'artist_names': 'Psychonaut 4',
'full_title': 'Sana Sana Sana, Cura Cura Cura by\xa0Psychonaut\xa04',
'header_image_thumbnail_url': 'https://images.genius.com/f9f67a3f9c801f697fbaf68c7efd3599.300x300x1.jpg',
'header_image_url': 'https://images.genius.com/f9f67a3f9c801f697fbaf68c7efd3599.651x651x1.jpg',
'id': 6142483, 'instrumental': False, 'language': 'en', 'lyrics_owner_id': 4443216,
'lyrics_state': 'complete', 'lyrics_updated_at': 1604698709,
'path': '/Psychonaut-4-sana-sana-sana-cura-cura-cura-lyrics', 'pyongs_count': None,
'relationships_index_url': 'https://genius.com/Psychonaut-4-sana-sana-sana-cura-cura-cura-sample',
'release_date_components': {'year': 2020, 'month': 7, 'day': 1},
'release_date_for_display': 'July 1, 2020',
'release_date_with_abbreviated_month_for_display': 'Jul. 1, 2020',
'song_art_image_thumbnail_url': 'https://images.genius.com/f9f67a3f9c801f697fbaf68c7efd3599.300x300x1.jpg',
'song_art_image_url': 'https://images.genius.com/f9f67a3f9c801f697fbaf68c7efd3599.651x651x1.jpg',
'stats': {'unreviewed_annotations': 0, 'hot': False}, 'title': 'Sana Sana Sana, Cura Cura Cura',
'title_with_featured': 'Sana Sana Sana, Cura Cura Cura', 'updated_by_human_at': 1647353214,
'url': 'https://genius.com/Psychonaut-4-sana-sana-sana-cura-cura-cura-lyrics',
'featured_artists': [], 'primary_artist': {'_type': 'artist', 'api_path': '/artists/1108956',
'header_image_url': 'https://images.genius.com/ff13efc74a043237cfca3fc0a6cb12dd.1000x563x1.jpg',
'id': 1108956,
'image_url': 'https://images.genius.com/25ff7cfdcb6d92a9f19ebe394a895736.640x640x1.jpg',
'index_character': 'p', 'is_meme_verified': False,
'is_verified': False, 'name': 'Psychonaut 4',
'slug': 'Psychonaut-4',
'url': 'https://genius.com/artists/Psychonaut-4'}}},
{'artist': 'Psychonaut 4', 'track': 'Sana Sana Sana, Cura Cura Cura'}
)
print(song.fetch_lyrics())
# exit() ends the script here, so the search() loop below is never reached.
exit()
songs = search("Psychonaut 4", "Sana Sana Sana, Cura Cura Cura")
for song in songs:
print(song)

85
src/lyrics_.py Normal file
View File

@ -0,0 +1,85 @@
from mutagen.id3 import ID3, USLT
from metadata import database as db
import logging
import os
from lyrics import genius
"""
This whole part is bodged together and I urgently need to rewrite this little file. genius.py is really clean though :3
I just wanted to get it to work.
- TODO: lyrics need to be stored in the database, and everything else should continue from there
"""
"""
https://cweiske.de/tagebuch/rhythmbox-lyrics.htm
Rhythmbox, my music player, doesn't support ID3 lyrics (USLT) yet, so I have to find something else.
Lyrics in MP3 ID3 tags (SYLT/USLT) are still missing, because GStreamer does not support that yet.
One possible solution would be to use ogg/vorbis files. Those lyrics are supported in Rhythmbox:
'So, the next Rhythmbox release (3.5.0 or 3.4.2) will read lyrics directly from ogg/vorbis files, using the LYRICS and SYNCLYRICS tags.'
Another possible solution (probably the better one, because I don't need to refactor the whole metadata AGAIN)
would be to write a Rhythmbox plugin that fetches lyrics from ID3 USLT.
I have written that Rhythmbox plugin: https://github.com/HeIIow2/rythmbox-id3-lyrics-support
"""
# https://www.programcreek.com/python/example/63462/mutagen.mp3.EasyMP3
# https://code.activestate.com/recipes/577138-embed-lyrics-into-mp3-files-using-mutagen-uslt-tag/
def add_lyrics(file_name, lyrics=""):
    """Embed ``lyrics`` into the ID3 tag of ``file_name`` as a USLT frame.

    The lyrics are echoed to stdout, written as a frame with language
    ``eng`` and descriptor ``desc``, and the tag is saved back to the file.

    :param file_name: path of the audio file; it is opened with ``ID3`` and
        saved in place (presumably it must already carry an ID3 header --
        confirm against the mutagen documentation).
    :param lyrics: unsynchronised lyrics text to store.
    """
    print(lyrics)
    tags = ID3(file_name)
    # encoding=3 is mutagen's UTF-8 text encoding.
    uslt_output = USLT(encoding=3, lang=u'eng', desc=u'desc', text=lyrics)
    # ``add`` files the frame under the frame's own hash key, so tagging a
    # file again replaces the existing lyrics frame. The previous hand-written
    # key "USLT::'eng'" never matched that key and could leave two USLT frames
    # with the same descriptor/language in the tag.
    tags.add(uslt_output)
    tags.save(file_name)
def get_lyrics(file_name):
    """Return every USLT (unsynchronised lyrics) frame stored in ``file_name``.

    :param file_name: path handed to ``ID3`` for parsing.
    :return: the result of ``getall("USLT")`` on the parsed tag.
    """
    return ID3(file_name).getall("USLT")
def fetch_single_lyrics(row: dict):
    """Look up lyrics for one track row and embed them into its audio file.

    Nothing happens when the file does not exist below ``~/Music`` or when
    the search yields no lyrics; otherwise the first result is written with
    ``add_lyrics``.

    :param row: track record; reads ``row['file']`` (path relative to
        ``~/Music``), ``row['artists'][0]['name']`` and ``row['title']``.
    """
    music_root = os.path.expanduser('~/Music')
    file_ = os.path.join(music_root, row['file'])
    first_artist = row['artists'][0]
    artist = first_artist['name']
    track = row['title']

    if os.path.exists(file_):
        print(file_, artist, track)
        found = genius.search(artist, track)
        if len(found) != 0:
            # Only the first hit is embedded.
            add_lyrics(file_, found[0])
def fetch_lyrics(database: db.Database):
    """Fetch and embed lyrics for every row of ``database.get_custom_track([])``.

    Each row is printed and then handed to ``fetch_single_lyrics``.

    :param database: object providing ``get_custom_track``.
    """
    rows = database.get_custom_track([])
    for track_row in rows:
        print(track_row)
        fetch_single_lyrics(track_row)
if __name__ == "__main__":
    import os
    import tempfile

    # Working directory below the system temp dir, created on first use.
    temp_folder = "music-downloader"
    temp_dir = os.path.join(tempfile.gettempdir(), temp_folder)
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)

    logging.basicConfig(level=logging.DEBUG)
    db_logger = logging.getLogger("database")
    db_logger.setLevel(logging.DEBUG)

    # Arguments of db.Database, named here for readability.
    database_file = os.path.join(temp_dir, "metadata.db")
    structure_file = os.path.join(temp_dir, "database_structure.sql")
    structure_url = "https://raw.githubusercontent.com/HeIIow2/music-downloader/new_metadata/assets/database_structure.sql"

    database = db.Database(database_file,
                           structure_file,
                           structure_url,
                           db_logger,
                           reset_anyways=False)

    fetch_lyrics(database)

View File

@ -6,11 +6,13 @@ import download_links
import url_to_path
import download
# NEEDS REFACTORING
from lyrics_ import fetch_lyrics
import logging
import os
import tempfile
TEMP_FOLDER = "music-downloader"
LOG_FILE = "download_logs.log"
DATABASE_FILE = "metadata.db"
@ -42,8 +44,6 @@ database = Database(os.path.join(temp_dir, DATABASE_FILE),
reset_anyways=True)
def get_existing_genre():
valid_directories = []
for elem in os.listdir(MUSIC_DIR):
@ -77,6 +77,7 @@ def search_for_metadata():
return search.current_option
def get_genre():
existing_genres = get_existing_genre()
print("printing available genres:")
@ -125,6 +126,10 @@ def cli(start_at: int = 0):
logging.info("starting to download the mp3's")
download.Download(database, DOWNLOAD_LOGGER, proxies=proxies, base_path=MUSIC_DIR)
if start_at <= 4:
logging.info("starting to fetch the lyrics")
fetch_lyrics(database)
if __name__ == "__main__":
cli(start_at=0)

View File

@ -205,7 +205,9 @@ GROUP BY track.id;
return self.get_custom_track(["(track.file IS NULL OR track.path IS NULL OR track.genre IS NULL)"])
def update_download_status(self, track_id: str):
    """Mark the track with id ``track_id`` as downloaded and commit the change.

    :param track_id: value compared against ``track.id``.
    """
    # The statement used to read "SET downloaded = 1, WHERE ...": the comma in
    # front of WHERE is a SQL syntax error, so the update could never run.
    # The id is now bound as a parameter instead of being pasted into the SQL
    # text (qmark style; assumes the sqlite3 driver -- confirm).
    query = "UPDATE track SET downloaded = 1 WHERE id = ?;"
    self.cursor.execute(query, (track_id,))
    self.connection.commit()
def set_download_data(self, track_id: str, url: str, src: str):
query = f"""

View File

@ -14,6 +14,7 @@ TIMEOUT = 10
session = requests.Session()
session.headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0",
"Connection": "keep-alive",
"Referer": "https://musify.club/"
}
@ -137,20 +138,4 @@ def get_musify_url_slow(row):
if __name__ == "__main__":
import pandas as pd
import json
TOR = True
if TOR:
set_proxy({
'http': 'socks5h://127.0.0.1:9150',
'https': 'socks5h://127.0.0.1:9150'
})
df = pd.read_csv("../temp/.cache1.csv")
for idx, row in df.iterrows():
row['artist'] = json.loads(row['artist'].replace("'", '"'))
print("-" * 200)
print("slow")
print(get_musify_url_slow(row))
pass