added support for genius

This commit is contained in:
matthias@arch 2022-04-04 05:53:35 +02:00
parent 6fc54858de
commit 9299140b87

View File

@ -1,17 +1,22 @@
#!/bin/python3
# Copyright © 2022 Matthias Quintern.
# This software comes with no warranty.
# This software is licensed under the GPL3
from mutagen import easyid3, id3, flac from mutagen import easyid3, id3, flac
import urllib.request as ur
import urllib
import re
from bs4 import BeautifulSoup
from difflib import SequenceMatcher
from json import loads
from os import path, getcwd, listdir, mkdir from os import path, getcwd, listdir, mkdir
from time import sleep from time import sleep
from sys import argv from sys import argv
import re # Der Name Nicole ist frei erfunden und hat keine Bedeutung.
# Jeglicher Zusammenhang mit einer Website der DHL wird hiermit ausdrücklich ausgeschlossen.
"""
Der Name Nicole ist frei erfunden und hat keine Bedeutung.
Jeglicher Zusammenhang mit einer Website der DHL wird hiermit ausdrücklich ausgeschlossen.
"""
class Nicole: class Nicole:
""" """
@ -27,7 +32,7 @@ class Nicole:
Nicole creates a azlyrics url from the title and artist mp3-tags of the file. Nicole creates a azlyrics url from the title and artist mp3-tags of the file.
The lyrics are extracted from the html document using regex. The lyrics are extracted from the html document using regex.
""" """
def __init__(self, test_run=False, silent=False, write_history=True, ignore_history=False, overwrite_tag=False, recursive=False, rm_explicit=False): def __init__(self, test_run=False, silent=False, write_history=True, ignore_history=False, overwrite_tag=False, recursive=False, rm_explicit=False, lyrics_site="all"):
self.test_run = test_run self.test_run = test_run
self.silent = silent self.silent = silent
@ -37,9 +42,17 @@ class Nicole:
self.overwrite_tag = overwrite_tag self.overwrite_tag = overwrite_tag
self.recursive = recursive self.recursive = recursive
self.lyrics_site = "azlyrics" self.lyrics_site = lyrics_site
self.delay = 5 # enough delay so that azlyrics doesnt block the ip self.delay = 5 # enough delay so that azlyrics doesnt block the ip
self.genius_search = "https://api.genius.com/search?q="
self.genius_song = "https://api.genius.com/songs/"
self.genius_access_token = "MzQaNvA53GOGvRTV8OXUbq2NCMahcnVre5EZmj-OcSjVleVO4kNwMVZicPsD5AL7"
self.sanity_checks = True
self.sanity_min_title_ratio = 0.6
self.sanity_min_artist_ratio = 0.7
self.history = [] self.history = []
self.failed = [] # All files that failed self.failed = [] # All files that failed
if not self.ignore_history: if not self.ignore_history:
@ -82,7 +95,7 @@ class Nicole:
def get_urls_azlyrics(self, artist:str, title:str): def get_urls_azlyrics(self, artist:str, title:str):
""" """
Create a azlyrics html from the artist and title Create a azlyrics html from the artist and title
If the title contains paranthesis or äüö, there will be multiple versions, one that contains the (...)öäü and one that doesn't. If the title contains parenthesis or äüö, there will be multiple versions, one that contains the (...)öäü and one that doesn't.
""" """
# convert to lower case # convert to lower case
artist = artist.casefold() artist = artist.casefold()
@ -94,7 +107,7 @@ class Nicole:
elif artist[0:4] == "the ": elif artist[0:4] == "the ":
artist = artist[4:] artist = artist[4:]
# remove anything in square bracketrs (eg [Explicit]) # remove anything in square brackets (eg [Explicit])
for match in re.finditer(r"\[.*\]", title): for match in re.finditer(r"\[.*\]", title):
title = title.replace(match.group(), "") title = title.replace(match.group(), "")
@ -153,7 +166,6 @@ class Nicole:
urls.append("https://azlyrics.com/lyrics/" + artist + '/' + title + ".html") urls.append("https://azlyrics.com/lyrics/" + artist + '/' + title + ".html")
return urls return urls
def get_lyrics_azlyrics(self, urls): def get_lyrics_azlyrics(self, urls):
""" """
Extract the lyrics from the html Extract the lyrics from the html
@ -164,7 +176,7 @@ class Nicole:
# visit the url # visit the url
html = None html = None
try: try:
html = str(ur.urlopen(url).read().decode("utf-8")) html = str(urllib.request.urlopen(url).read().decode("utf-8"))
sleep(self.delay) # azlyrics blocks requests if there is no delay sleep(self.delay) # azlyrics blocks requests if there is no delay
except Exception: except Exception:
sleep(self.delay) # azlyrics blocks requests if there is no delay sleep(self.delay) # azlyrics blocks requests if there is no delay
@ -191,10 +203,82 @@ class Nicole:
return (True, lyrics) return (True, lyrics)
message += f"Could not lyrics in html for {url}\n " message += f"Could not find lyrics in html for {url}\n "
message = message.strip(" \n") message = message.strip(" \n")
return (False, message) return (False, message)
def get_url_genius(self, artist:str, title:str):
    """
    Retrieve the url of a song page using the genius api:
    1) Search for "artist title" and walk through the hits in the order they were returned
    2) If sanity checks are enabled, skip hits whose title or artist differ too much from the given ones
    3) Request the song by its id and take the url from the answer

    Returns a tuple (success, value):
    value is the url if success is True, otherwise it is an error message.
    """
    # get search results
    query_search = self.genius_search + urllib.parse.quote(f"{artist} {title}")
    request_search = urllib.request.Request(query_search)
    request_search.add_header("Authorization", f"Bearer {self.genius_access_token}")
    try:
        # the with statement closes the connection once the answer has been read
        with urllib.request.urlopen(request_search) as response:
            results = loads(response.read())["response"]["hits"]
    except urllib.error.URLError:
        return (False, f"Could not access url: {query_search}")
    except (ValueError, KeyError, TypeError):
        # invalid json or an answer without response/hits
        return (False, f"Unexpected answer from url: {query_search}")
    if not isinstance(results, list):
        return (False, f"Unexpected answer from url: {query_search}")

    message = ""
    for hit in results:
        # collect everything that is needed from the hit, skip it if something is missing
        try:
            # only songs have lyrics
            if hit["type"] != "song":
                continue
            result = hit["result"]
            song_id = result["id"]
            if self.sanity_checks:
                genius_artist = result["primary_artist"]["name"]
                genius_artist_featured = result["artist_names"]
                genius_title = result["title"]
                genius_title_featured = result["title_with_featured"]
        except (KeyError, TypeError):
            message += "Genius result: unexpected format of search result\n "
            continue

        # check if result is garbage by checking how similar title and artist names are
        # a hit is accepted if either the plain or the "featured" variant is similar enough
        if self.sanity_checks:
            title_ok = any(
                SequenceMatcher(None, title, candidate).ratio() >= self.sanity_min_title_ratio
                for candidate in (genius_title, genius_title_featured)
            )
            if not title_ok:
                message += f"Genius result: titles do not match enough: '{title}' and '{genius_title}'/'{genius_title_featured}'\n "
                continue
            artist_ok = any(
                SequenceMatcher(None, artist, candidate).ratio() >= self.sanity_min_artist_ratio
                for candidate in (genius_artist, genius_artist_featured)
            )
            if not artist_ok:
                message += f"Genius result: artists do not match enough: '{artist}' and '{genius_artist}'/'{genius_artist_featured}'\n "
                continue

        # get the url of the song page from the song id
        query_song = f"{self.genius_song}{song_id}"
        request_song = urllib.request.Request(query_song)
        request_song.add_header("Authorization", f"Bearer {self.genius_access_token}")
        try:
            with urllib.request.urlopen(request_song) as response:
                url = loads(response.read())["response"]["song"]["url"]
        except urllib.error.URLError:
            message += f"Genius result: Could not access url: '{query_song}'\n "
            continue
        except (ValueError, KeyError, TypeError):
            message += f"Genius result: Unexpected answer from url: '{query_song}'\n "
            continue
        # the first usable hit wins
        if url:
            return (True, url)

    message += "Could not find song lyrics on genius"
    return (False, message)
def get_lyrics_genius(self, url):
    """
    Download the html document at url and extract the lyrics from it.
    The lyrics are expected in div elements that have the attribute data-lyrics-container="true".

    Returns a tuple (success, value):
    value is the lyrics if success is True, otherwise it is an error message.
    """
    request_lyrics = urllib.request.Request(url)
    # the page is requested without the api token, only a browser-like user agent is sent
    # NOTE(review): presumably the default urllib user agent gets rejected - confirm
    request_lyrics.add_header("User-Agent", "Mozilla/5.0")
    try:
        # the with statement closes the connection once the document has been read
        with urllib.request.urlopen(request_lyrics) as response:
            html = response.read()
    except urllib.error.URLError:
        return (False, f"Could not access url: {url}")

    soup = BeautifulSoup(html, "html.parser")
    # turn <br> tags into real line breaks, they carry no text and would vanish in get_text
    for br in soup.find_all("br"):
        br.replace_with("\n")
    # extract lyrics from html: lyrics are in divs with "data-lyrics-container=true"
    divs = soup.find_all("div", attrs={"data-lyrics-container": "true"})
    if not divs:
        return (False, f"Could not find lyrics in html: {url}")
    lyrics = "".join(div.get_text(separator="") for div in divs)
    return (True, lyrics)
def process_dir(self, directory): def process_dir(self, directory):
if not path.isabs(directory): if not path.isabs(directory):
directory = path.normpath(getcwd() + "/" + directory) directory = path.normpath(getcwd() + "/" + directory)
@ -204,7 +288,6 @@ class Nicole:
if not self.silent: if not self.silent:
print("\nProcessing directory: " + directory) print("\nProcessing directory: " + directory)
entries = listdir(directory) entries = listdir(directory)
entries.sort() entries.sort()
@ -301,14 +384,31 @@ class Nicole:
audio.save() audio.save()
print(f"Removed '{word}' from the title.") print(f"Removed '{word}' from the title.")
# currently the only supported site lyrics = "Sample Lyrics"
if self.lyrics_site == "azlyrics": success = False
site = "Sample Site"
message = ""
# try genius
if self.lyrics_site in ["all", "genius"]:
success, url = self.get_url_genius(artist, title)
if success:
success, lyrics = self.get_lyrics_genius(url)
if not success:
message += lyrics + "\n " # lyrics is error message
site = "genius"
else:
message += url + "\n " # url is error message
# try azlyrics
if not success and self.lyrics_site in ["all", "azlyrics"]:
urls = self.get_urls_azlyrics(artist, title) urls = self.get_urls_azlyrics(artist, title)
success, lyrics = self.get_lyrics_azlyrics(urls) success, lyrics = self.get_lyrics_azlyrics(urls)
site = "azlyrics"
if not success:
message += lyrics
# if found lyrics
if success: if success:
if self.test_run: if self.test_run:
print(f"{artist} - {title}:\n{lyrics}\n\n") print(f"\n\n{artist} - {title}:\n{lyrics}\n")
# write to tags # write to tags
else: else:
if type(audio) == id3.ID3: if type(audio) == id3.ID3:
@ -318,22 +418,20 @@ class Nicole:
audio["LYRICS"] = lyrics audio["LYRICS"] = lyrics
audio.save() audio.save()
else: else:
return (False, f"Could not write lyrics.") return (False, f"Could find lyrics but failed to write the tag.")
# add to history # add to history
if self.write_history and file not in self.history: if self.write_history and file not in self.history:
self.history.append(file) self.history.append(file)
return (True, f"Written lyrics to {artist} - {title}") message += f"Written lyrics from {site} to {artist} - {title}"
return (True, message)
else: else:
return (False, lyrics) # lyrics is error message here return (False, message.strip("\n "))
return (False, "Failed for unknown reason.")
def main(): def main():
print("Nicole version 1.1") print("Nicole version 2.0")
# print("Get updates here: https://github.com/MatthiasQuintern/nicole")
helpstring = """Command line options: helpstring = """Command line options:
-d [directory] process directory [directory] -d [directory] process directory [directory]
@ -345,24 +443,25 @@ def main():
-o overwrite if the file already has lyrics -o overwrite if the file already has lyrics
-t test, do not write lyrics to file, but print to console -t test, do not write lyrics to file, but print to console
-h show this -h show this
--rm_explicit remove the "[Explicit]" lyrics warning from the songs title tag""" --rm_explicit remove the "[Explicit]" lyrics warning from the songs title tag
--site [site] use only [site]: azlyrics or genius
Visit https://github.com/MatthiasQuintern/nicole for updates and further help."""
args = [] args = []
if len(argv) > 1: if len(argv) > 1:
# iterate over argv list and extract the args # iterate over argv list and extract the args
i = 1 i = 1
while i < len(argv): while i < len(argv):
arg = argv[i] arg = argv[i]
if "--" in arg: if arg[0] == "-":
args.append(arg.replace("--", ""))
elif "-" in arg:
# check if option with arg, if yes add tuple to args # check if option with arg, if yes add tuple to args
if len(argv) > i + 1 and argv[i+1][0] != "-": if len(argv) > i + 1 and argv[i+1][0] != "-":
args.append((arg.replace("-", ""), argv[i+1])) args.append((arg.replace("-", ""), argv[i+1]))
i += 1 i += 1
elif not "--" in arg:
for char in arg.replace("-", ""):
args.append(char)
else: else:
for c in arg.replace("-", ""): args.append(arg.replace("-", ""))
args.append(c)
else: else:
print(f"Invalid or missing argument: '{arg}'") print(f"Invalid or missing argument: '{arg}'")
print(helpstring) print(helpstring)
@ -383,11 +482,16 @@ def main():
directory = None directory = None
file = None file = None
site = "all"
for arg in args: for arg in args:
if type(arg) == tuple: if type(arg) == tuple:
if arg[0] == "d": directory = arg[1] if arg[0] == "d": directory = arg[1]
elif arg[0] == "f": file = arg[1] elif arg[0] == "f": file = arg[1]
elif arg[0] == "site":
if arg[1] in ["genius", "azlyrics", "all"]: site = arg[1]
else:
print(f"Invalid site: '{arg[1]}'")
elif arg in options.keys(): elif arg in options.keys():
# flip the bool associated with the char # flip the bool associated with the char
@ -405,7 +509,7 @@ def main():
return 0 return 0
# create nicole instance # create nicole instance
nicole = Nicole(test_run=options["t"], silent=options["s"], write_history=options["n"], ignore_history=options["i"], overwrite_tag=options["o"], recursive=options["r"], rm_explicit=options["rm_explicit"]) nicole = Nicole(test_run=options["t"], silent=options["s"], write_history=options["n"], ignore_history=options["i"], overwrite_tag=options["o"], recursive=options["r"], rm_explicit=options["rm_explicit"], lyrics_site=site)
# start with file or directory # start with file or directory
if file: if file: