From 9bb510e41a8505f681121f7fceea6675704b1a2f Mon Sep 17 00:00:00 2001 From: lars Date: Wed, 9 Nov 2022 22:52:32 +0100 Subject: [PATCH] startet genius lyrics fetcher --- requirements.txt | 1 - src/lyrics.py | 121 --------------------------------------- src/lyrics/genius.py | 82 ++++++++++++++++++++------ src/lyrics_.py | 85 +++++++++++++++++++++++++++ src/main.py | 13 +++-- src/metadata/database.py | 4 +- src/scraping/musify.py | 19 +----- 7 files changed, 163 insertions(+), 162 deletions(-) delete mode 100644 src/lyrics.py create mode 100644 src/lyrics_.py diff --git a/requirements.txt b/requirements.txt index 0516df7..2017b36 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ requests~=2.28.1 -pandas~=1.5.0 mutagen~=1.46.0 musicbrainzngs~=0.7.1 jellyfish~=0.9.0 diff --git a/src/lyrics.py b/src/lyrics.py deleted file mode 100644 index 2beb1d5..0000000 --- a/src/lyrics.py +++ /dev/null @@ -1,121 +0,0 @@ -from mutagen.id3 import ID3, USLT - -""" -https://cweiske.de/tagebuch/rhythmbox-lyrics.htm -Rythmbox, my music player doesn't support ID3 lyrics (USLT) yet, so I have to find something else -Lyrics in MP3 ID3 tags (SYLT/USLT) is still missing, because GStreamer does not support that yet. - -One possible sollution would be to use ogg/vorbis files. Those lyrics are supported in rythmbox -'So, the next Rhythmbox release (3.5.0 or 3.4.2) will read lyrics directly from ogg/vorbis files, using the LYRICS and SYNCLYRICS tags.' 
-Another possible sollution (probaply the better one cuz I dont need to refactor whole metadata AGAIN) -would be to write a Rhythmbox plugin that fetches lyrics from ID3 USLT -""" - -# https://www.programcreek.com/python/example/63462/mutagen.mp3.EasyMP3 -# https://code.activestate.com/recipes/577138-embed-lyrics-into-mp3-files-using-mutagen-uslt-tag/ - -MP3_PATH = "/home/lars/Music/deathcore/Brand of Sacrifice/The Interstice/Eclipse.mp3" -LYRICS_BREAKING_DOWN = """ -[Chorus: Brian Burkheiser] -I think, I think too much -I'm a little bit paranoid, I think I’m breaking -Maybe it's in my blood -Got a pain that I can't avoid, I think I’m breaking down - -[Verse 1: Brian Burkheiser] -Hate every single second, minute, hour every day -Person in the mirror, they won't let me feel a thing -Keep me focused on my problems, I'm addicted to the pain -Everybody's out to get you -[Pre-Chorus: Eric Vanlerberghe] -I guess I never noticed how it came creeping in -My enemy emotion, but I can't sink or swim -I say I'm feeling hopeless, they give me medicine -They give me medicine, they give me medicine - -[Chorus: Brian Burkheiser & Eric Vanlerberghe] -I think I think too much (Too much) -I'm a little bit paranoid, I think I'm breaking -Maybe it’s in my blood (My blood) -Got a pain that I can’t avoid, I think I'm breaking -Down, I think I’m breaking -Down, I think I'm breaking -I think I think too much (Too much) -I'm a little bit paranoid, I think I'm breaking down - -[Verse 2: Brian Burkheiser] -Lies, every time they ask me, I just tell ’em that I'm fine -Try to hide my demons, but they only multiply -Keep me running from the voices on repeat inside my mind -Everybody fucking hates you - -[Pre-Chorus: Eric Vanlerberghe] -I guess I never noticed how it came creeping in -My enemy emotion, but I can't sink or swim -I say I'm feeling hopeless, but no one's listening -But no one's listening, but no one's listening -You might also like -DOA -I Prevail -Rise Above It -I Prevail -Bow Down 
-I Prevail -[Chorus: Brian Burkheiser & Eric Vanlerberghe] -I think I think too much (Too much) -I'm a little bit paranoid, I think I'm breaking -Maybe it's in my blood (My blood) -Got a pain that I can't avoid, I think I'm breaking -Down, I think I'm breaking -Down, I think I'm breaking -I think I think too much (Too much) -I'm a little bit paranoid, I think I'm breaking down - -[Outro: Brian Burkheiser] -I don't really like myself -I don't really like myself -I don't really like myself -I don't really like myself -I think I'm breaking down -""" -LYRICS_ECLIPSE = """ -Your offerings have consecrated -They are marked by the brand -The sun has seen it's fifth death -For the red lake to flow again - -He will -Feel their pain in order to -Complete the final transformation -A name new and old - -Your offerings have been consecrated by the laws of Causality -Falcon of Darkness -Send us into an age of abyss -Blinded by beauty -With stacks of bodies as high as the eye can see -Feast, apostles, feast - -The one chosen by the hand of God -The master of the sinful black sheep -And the king of the faithful blind - -Welcome to the new age -Welcome to the new age -We are the branded ones""" - - -def add_lyrics(file_name, lyrics=""): - tags = ID3(file_name) - uslt_output = USLT(encoding=3, lang=u'eng', desc=u'desc', text=lyrics) - tags["USLT::'eng'"] = uslt_output - - tags.save(file_name) - -def get_lyrics(file_name): - tags = ID3(file_name) - return tags.getall("USLT") - -if __name__ == "__main__": - add_lyrics(MP3_PATH, lyrics=LYRICS_ECLIPSE) - print(get_lyrics(MP3_PATH)) diff --git a/src/lyrics/genius.py b/src/lyrics/genius.py index e6e4a75..46900d1 100644 --- a/src/lyrics/genius.py +++ b/src/lyrics/genius.py @@ -3,10 +3,12 @@ import sys import os import logging from typing import List +from bs4 import BeautifulSoup current = os.path.dirname(os.path.realpath(__file__)) parent = os.path.dirname(current) sys.path.append(parent) +# utils >:3 from tools import phonetic_compares 
from tools.object_handeling import get_elem_from_obj @@ -49,7 +51,8 @@ class Song: self.lyricist: str if get_elem_from_obj(song_data, ['lyrics_state']) != "complete": - logger.warning(f"lyrics state of {self.title} by {self.artist} is not complete but {get_elem_from_obj(song_data, ['lyrics_state'])}") + logger.warning( + f"lyrics state of {self.title} by {self.artist} is not complete but {get_elem_from_obj(song_data, ['lyrics_state'])}") self.valid = self.is_valid() if self.valid: @@ -61,33 +64,44 @@ class Song: title_match, title_distance = phonetic_compares.match_titles(self.title, self.desired_data['track']) artist_match, artist_distance = phonetic_compares.match_artists(self.artist, self.desired_data['artist']) - return title_match and artist_match + return not title_match and not artist_match def __repr__(self) -> str: - return f"{self.title} by {self.artist}" + return f"{self.title} by {self.artist} ({self.url})" - def fetch_lyrics(self) -> str: + def fetch_lyrics(self) -> str | None: if not self.valid: logger.warning(f"{self.__repr__()} is invalid but the lyrics still get fetched. Something could be wrong.") - lyrics = "" + r = session.get(self.url) + if r.status_code != 200: + logging.warning(f"{r.url} returned {r.status_code}:\n{r.content}") + return None + soup = BeautifulSoup(r.content, "html.parser") + lyrics_soups = soup.find_all('div', {'data-lyrics-container': "true"}) + if len(lyrics_soups) == 0: + logger.warning(f"didn't found lyrics on {self.url}") + return None + if len(lyrics_soups) != 1: + logger.warning(f"number of lyrics_soups doesn't equals 1, but {len(lyrics_soups)} on {self.url}") + + lyrics_soup = lyrics_soups[0] + lyrics = lyrics_soup.getText(separator="\n", strip=True) + + #
With the soundle self.lyrics = lyrics return lyrics -def build_search_query(artist: str, track: str) -> str: - return f"{artist} - {track}" - - def process_multiple_songs(song_datas: list, desired_data: dict) -> List[Song]: all_songs = [Song(song_data, desired_data) for song_data in song_datas] - return [song for song in all_songs if not song.valid] + return [song_ for song_ in all_songs if not song_.valid] def search_song_list(artist: str, track: str) -> List[Song]: endpoint = "https://genius.com/api/search/multi?q=" - url = endpoint + build_search_query(artist, track) + url = f"{endpoint}{artist} - {track}" logging.info(f"requesting {url}") desired_data = { @@ -104,24 +118,56 @@ def search_song_list(artist: str, track: str) -> List[Song]: logging.warning(f"{r.url} returned {get_elem_from_obj(content, ['meta', 'status'])}:\n{content}") return [] - # print(r.status_code) - # print(r.json()) - sections = get_elem_from_obj(content, ['response', 'sections']) for section in sections: section_type = get_elem_from_obj(section, ['type']) - print(section_type) if section_type == "song": return process_multiple_songs(get_elem_from_obj(section, ['hits'], return_if_none=[]), desired_data) return [] + def search(artist: str, track: str): - return search_song_list(artist, track) + raw_songs = search_song_list(artist, track) + all_lyrics = [raw_song.fetch_lyrics() for raw_song in raw_songs] + return [i for i in all_lyrics if i is not None] if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) - songs = search_song_list("Psychonaut 4", "Sana Sana Sana, Cura Cura Cura") - print(songs) + song = Song( + {'highlights': [], 'index': 'song', 'type': 'song', + 'result': {'_type': 'song', 'annotation_count': 0, 'api_path': '/songs/6142483', + 'artist_names': 'Psychonaut 4', + 'full_title': 'Sana Sana Sana, Cura Cura Cura by\xa0Psychonaut\xa04', + 'header_image_thumbnail_url': 'https://images.genius.com/f9f67a3f9c801f697fbaf68c7efd3599.300x300x1.jpg', + 'header_image_url': 
'https://images.genius.com/f9f67a3f9c801f697fbaf68c7efd3599.651x651x1.jpg', + 'id': 6142483, 'instrumental': False, 'language': 'en', 'lyrics_owner_id': 4443216, + 'lyrics_state': 'complete', 'lyrics_updated_at': 1604698709, + 'path': '/Psychonaut-4-sana-sana-sana-cura-cura-cura-lyrics', 'pyongs_count': None, + 'relationships_index_url': 'https://genius.com/Psychonaut-4-sana-sana-sana-cura-cura-cura-sample', + 'release_date_components': {'year': 2020, 'month': 7, 'day': 1}, + 'release_date_for_display': 'July 1, 2020', + 'release_date_with_abbreviated_month_for_display': 'Jul. 1, 2020', + 'song_art_image_thumbnail_url': 'https://images.genius.com/f9f67a3f9c801f697fbaf68c7efd3599.300x300x1.jpg', + 'song_art_image_url': 'https://images.genius.com/f9f67a3f9c801f697fbaf68c7efd3599.651x651x1.jpg', + 'stats': {'unreviewed_annotations': 0, 'hot': False}, 'title': 'Sana Sana Sana, Cura Cura Cura', + 'title_with_featured': 'Sana Sana Sana, Cura Cura Cura', 'updated_by_human_at': 1647353214, + 'url': 'https://genius.com/Psychonaut-4-sana-sana-sana-cura-cura-cura-lyrics', + 'featured_artists': [], 'primary_artist': {'_type': 'artist', 'api_path': '/artists/1108956', + 'header_image_url': 'https://images.genius.com/ff13efc74a043237cfca3fc0a6cb12dd.1000x563x1.jpg', + 'id': 1108956, + 'image_url': 'https://images.genius.com/25ff7cfdcb6d92a9f19ebe394a895736.640x640x1.jpg', + 'index_character': 'p', 'is_meme_verified': False, + 'is_verified': False, 'name': 'Psychonaut 4', + 'slug': 'Psychonaut-4', + 'url': 'https://genius.com/artists/Psychonaut-4'}}}, + {'artist': 'Psychonaut 4', 'track': 'Sana Sana Sana, Cura Cura Cura'} + ) + print(song.fetch_lyrics()) + + exit() + songs = search("Psychonaut 4", "Sana Sana Sana, Cura Cura Cura") + for song in songs: + print(song) diff --git a/src/lyrics_.py b/src/lyrics_.py new file mode 100644 index 0000000..2a939ef --- /dev/null +++ b/src/lyrics_.py @@ -0,0 +1,85 @@ +from mutagen.id3 import ID3, USLT +from metadata import database as db 
+import logging +import os + +from lyrics import genius + +""" +This whole Part is bodgy as hell and I need to rewrite this little file urgently. genius.py is really clean though :3 +Just wanted to get it to work. + - lyrics need to be put in the database and everything should continue from there then +""" + +""" +https://cweiske.de/tagebuch/rhythmbox-lyrics.htm +Rythmbox, my music player doesn't support ID3 lyrics (USLT) yet, so I have to find something else +Lyrics in MP3 ID3 tags (SYLT/USLT) is still missing, because GStreamer does not support that yet. + +One possible sollution would be to use ogg/vorbis files. Those lyrics are supported in rythmbox +'So, the next Rhythmbox release (3.5.0 or 3.4.2) will read lyrics directly from ogg/vorbis files, using the LYRICS and SYNCLYRICS tags.' +Another possible sollution (probaply the better one cuz I dont need to refactor whole metadata AGAIN) +would be to write a Rhythmbox plugin that fetches lyrics from ID3 USLT + +I have written that Rhythmbox plugin: https://github.com/HeIIow2/rythmbox-id3-lyrics-support +""" + + +# https://www.programcreek.com/python/example/63462/mutagen.mp3.EasyMP3 +# https://code.activestate.com/recipes/577138-embed-lyrics-into-mp3-files-using-mutagen-uslt-tag/ + + +def add_lyrics(file_name, lyrics=""): + print(lyrics) + tags = ID3(file_name) + uslt_output = USLT(encoding=3, lang=u'eng', desc=u'desc', text=lyrics) + tags["USLT::'eng'"] = uslt_output + + tags.save(file_name) + + +def get_lyrics(file_name): + tags = ID3(file_name) + return tags.getall("USLT") + + +def fetch_single_lyrics(row: dict): + file_ = os.path.join(os.path.expanduser('~/Music'), row['file']) + artist = row['artists'][0]['name'] + track = row['title'] + + if not os.path.exists(file_): + return + print(file_, artist, track) + lyrics = genius.search(artist, track) + if len(lyrics) == 0: + return + add_lyrics(file_, lyrics[0]) + + +def fetch_lyrics(database: db.Database): + for row in database.get_custom_track([]): + 
print(row) + fetch_single_lyrics(row) + + +if __name__ == "__main__": + import tempfile + import os + + temp_folder = "music-downloader" + temp_dir = os.path.join(tempfile.gettempdir(), temp_folder) + if not os.path.exists(temp_dir): + os.mkdir(temp_dir) + + logging.basicConfig(level=logging.DEBUG) + db_logger = logging.getLogger("database") + db_logger.setLevel(logging.DEBUG) + + database = db.Database(os.path.join(temp_dir, "metadata.db"), + os.path.join(temp_dir, "database_structure.sql"), + "https://raw.githubusercontent.com/HeIIow2/music-downloader/new_metadata/assets/database_structure.sql", + db_logger, + reset_anyways=False) + + fetch_lyrics(database) diff --git a/src/main.py b/src/main.py index df97213..a56e0c4 100644 --- a/src/main.py +++ b/src/main.py @@ -6,11 +6,13 @@ import download_links import url_to_path import download +# NEEDS REFACTORING +from lyrics_ import fetch_lyrics + import logging import os import tempfile - TEMP_FOLDER = "music-downloader" LOG_FILE = "download_logs.log" DATABASE_FILE = "metadata.db" @@ -37,13 +39,11 @@ logging.basicConfig(level=logging.INFO, filename=os.path.join(temp_dir, LOG_FILE database = Database(os.path.join(temp_dir, DATABASE_FILE), os.path.join(temp_dir, DATABASE_STRUCTURE_FILE), - DATABASE_STRUCTURE_FALLBACK, + DATABASE_STRUCTURE_FALLBACK, DATABASE_LOGGER, reset_anyways=True) - - def get_existing_genre(): valid_directories = [] for elem in os.listdir(MUSIC_DIR): @@ -77,6 +77,7 @@ def search_for_metadata(): return search.current_option + def get_genre(): existing_genres = get_existing_genre() print("printing available genres:") @@ -125,6 +126,10 @@ def cli(start_at: int = 0): logging.info("starting to download the mp3's") download.Download(database, DOWNLOAD_LOGGER, proxies=proxies, base_path=MUSIC_DIR) + if start_at <= 4: + logging.info("starting to fetch the lyrics") + fetch_lyrics(database) + if __name__ == "__main__": cli(start_at=0) diff --git a/src/metadata/database.py b/src/metadata/database.py index 
f10ef22..da78c7a 100644 --- a/src/metadata/database.py +++ b/src/metadata/database.py @@ -205,7 +205,9 @@ GROUP BY track.id; return self.get_custom_track(["(track.file IS NULL OR track.path IS NULL OR track.genre IS NULL)"]) def update_download_status(self, track_id: str): - pass + query = "UPDATE track SET downloaded = 1 WHERE id = ?;" + self.cursor.execute(query, (track_id,))  # bind track_id instead of formatting it into the SQL text + self.connection.commit() def set_download_data(self, track_id: str, url: str, src: str): query = f""" diff --git a/src/scraping/musify.py b/src/scraping/musify.py index b4719bc..4afd889 100644 --- a/src/scraping/musify.py +++ b/src/scraping/musify.py @@ -14,6 +14,7 @@ TIMEOUT = 10 session = requests.Session() session.headers = { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0", "Connection": "keep-alive", "Referer": "https://musify.club/" } @@ -137,20 +138,4 @@ def get_musify_url_slow(row): if __name__ == "__main__": - import pandas as pd - import json - - TOR = True - if TOR: - set_proxy({ - 'http': 'socks5h://127.0.0.1:9150', - 'https': 'socks5h://127.0.0.1:9150' - }) - - df = pd.read_csv("../temp/.cache1.csv") - - for idx, row in df.iterrows(): - row['artist'] = json.loads(row['artist'].replace("'", '"')) - print("-" * 200) - print("slow") - print(get_musify_url_slow(row)) + pass