489 lines
19 KiB
Python
489 lines
19 KiB
Python
#!/bin/python3
|
||
# Copyright © 2024 Matthias Quintern.
|
||
# This software comes with no warranty.
|
||
# This software is licensed under the GPL3
|
||
|
||
import argparse
import re
import urllib
import urllib.error
import urllib.parse
import urllib.request
from difflib import SequenceMatcher
from json import loads
from os import path, getcwd, listdir, mkdir, makedirs
from sys import argv
from time import sleep

from bs4 import BeautifulSoup
from mutagen import easyid3, id3, flac
|
||
|
||
version = "2.1.0"
|
||
|
||
# The name Nicole is entirely made up and has no meaning.
# Any connection to a website of DHL is hereby expressly ruled out.
|
||
|
||
class Nicole:
    """
    Search for song lyrics online and embed them in the tags of audio files.

    Overview:
        Nicole looks up lyrics using the artist and title tags of an mp3 or flac file.
        For mp3 files the lyrics are written to the id3 frame "USLT", for flac files to the "LYRICS" comment.
        There is a 5 second delay between each request to azlyrics.com because the site will block your ip if there are too many requests.
    History:
        Nicole can keep a history of all files that were processed in ~/.config/nicole/history.
        If a file is in the history, it will be skipped unless ignore_history=True.
        Every processed file is added to the history, whether lyrics were written or not.
        Files for which no lyrics were written during the current run are additionally listed
        in ~/.config/nicole/failed, so that you can see which lyrics were not downloaded.
        That file is rewritten on every run.
    azlyrics:
        Nicole creates azlyrics urls from the title and artist tags of the file.
        The lyrics are extracted from the html document.
    genius:
        Nicole searches the song from the title and artist tags via the genius api.
    """
    allowed_extensions = [".mp3", ".flac"]

    # characters that never appear in an azlyrics url
    _AZLYRICS_STRIP = str.maketrans("", "", " -,.…'\"°`´/!?#*()")
    # two spellings for special characters: transliterated and dropped
    _SPECIAL_CHARS = "äöüß&"
    _SPECIAL_ASCII = str.maketrans({"ä": "a", "ö": "o", "ü": "u", "ß": "ss", "&": "and"})
    _SPECIAL_DROP = str.maketrans({"ä": "", "ö": "", "ü": "", "ß": "", "&": "and"})
    # "feat"/"ft"/"featuring" as a whole word, plus everything that follows it
    _FEATURE_RE = re.compile(r"\b(?:featuring|feat|ft)\b\.?.*")
    # comment that directly precedes the lyrics in an azlyrics page
    _AZLYRICS_MARKER = "<!-- Usage of azlyrics.com content by any third-party lyrics provider is prohibited by our licensing agreement. Sorry about that. -->"

    def __init__(self, test_run=False, silent=False, write_history=True, ignore_history=False, overwrite_tag=False, recursive=False, rm_explicit=False, lyrics_site="all"):
        """
        @param test_run: print the lyrics instead of writing them to the file
        @param silent: do not print progress messages
        @param write_history: write the history and failed files when the object is destroyed
        @param ignore_history: process files even if they are listed in the history
        @param overwrite_tag: replace lyrics that are already stored in a file
        @param recursive: descend into subdirectories in process_dir
        @param rm_explicit: remove "[Explicit]" from the title tag
        @param lyrics_site: "all", "genius" or "azlyrics"
        """
        self.dry_run = test_run
        self.silent = silent

        self.write_history = write_history
        self.ignore_history = ignore_history

        self.overwrite_tag = overwrite_tag
        self.recursive = recursive
        self.rm_explicit = rm_explicit

        self.lyrics_site = lyrics_site
        self.delay = 5  # enough delay so that azlyrics doesnt block the ip

        self.genius_search = "https://api.genius.com/search?q="
        self.genius_song = "https://api.genius.com/songs/"
        # NOTE(review): api token committed to the source, consider loading it from the environment or a config file
        self.genius_access_token = "MzQaNvA53GOGvRTV8OXUbq2NCMahcnVre5EZmj-OcSjVleVO4kNwMVZicPsD5AL7"

        self.sanity_checks = True
        self.sanity_min_title_ratio = 0.6
        self.sanity_min_artist_ratio = 0.7

        self.history = []
        self.failed = []  # All files that failed
        # The history has to be loaded whenever it is written back later,
        # otherwise a run with ignore_history would replace the existing file
        # with only the files of that run.
        if self.write_history or not self.ignore_history:
            self._load_history()

    def __del__(self):
        # getattr: __init__ might have failed before the attribute was set
        if getattr(self, "write_history", False):
            self._write_history()

    @staticmethod
    def _config_path():
        """Return the directory that holds the history and failed files"""
        return path.join(path.expanduser("~"), ".config", "nicole")

    def _load_history(self):
        """
        Create the config directory if necessary and read the history file into self.history
        """
        config_path = self._config_path()
        # makedirs also creates ~/.config if it does not exist yet
        makedirs(config_path, exist_ok=True)

        history_file_path = path.join(config_path, "history")
        # if history file does not exist, dont open it
        if not path.isfile(history_file_path):
            return

        with open(history_file_path, "r") as history_file:
            # drop empty entries, the file ends with a newline
            self.history = [line for line in history_file.read().split("\n") if line]

    def _write_history(self):
        """
        Write self.history and self.failed to the config directory, one path per line
        """
        config_path = self._config_path()
        makedirs(config_path, exist_ok=True)

        with open(path.join(config_path, "history"), "w") as history_file:
            history_file.writelines(file + "\n" for file in self.history)

        with open(path.join(config_path, "failed"), "w") as failed_file:
            failed_file.writelines(file + "\n" for file in self.failed)

    def get_urls_azlyrics(self, artist:str, title:str):
        """
        Create azlyrics urls from the artist and title
        If the title contains brackets, äöüß&, or a "feat." part, there will be multiple versions of the title,
        since it is not known which spelling azlyrics uses.
        @returns list of candidate urls without duplicates, most literal version first
        """
        # convert to lower case
        artist = artist.casefold()
        title = title.casefold()

        # remove 'a' or 'the' from the artist
        if artist.startswith("a "):
            artist = artist[2:]
        elif artist.startswith("the "):
            artist = artist[4:]

        # remove anything in square brackets (eg [Explicit])
        # non-greedy, so that text between two bracket groups is kept
        title = re.sub(r"\[.*?\]", "", title)

        # if title has (), create one version with and one without them
        titles = [title]
        without_parentheses = re.sub(r"\(.*?\)", "", title)
        if without_parentheses != title:
            titles.append(without_parentheses)

        artist = artist.translate(self._AZLYRICS_STRIP).translate(self._SPECIAL_ASCII)

        # umlaute: the transliterated version replaces the title, the version without them is appended
        dropped = [t.translate(self._SPECIAL_DROP) for t in titles if any(c in self._SPECIAL_CHARS for c in t)]
        titles = [t.translate(self._SPECIAL_ASCII) for t in titles] + dropped

        # features: add a version without the "feat. ..." part
        for candidate in list(titles):
            without_feature = self._FEATURE_RE.sub("", candidate)
            if without_feature != candidate:
                titles.append(without_feature)

        # create urls, every request costs a delay so each url is listed only once
        urls = []
        for candidate in titles:
            url = "https://azlyrics.com/lyrics/" + artist + '/' + candidate.translate(self._AZLYRICS_STRIP) + ".html"
            if url not in urls:
                urls.append(url)
        return urls

    @staticmethod
    def _extract_lyrics_azlyrics(html):
        """
        Extract the lyrics from the html of an azlyrics song page
        @returns the lyrics or None if the page does not contain any
        """
        start = html.find(Nicole._AZLYRICS_MARKER)
        if start < 0:
            return None
        start += len(Nicole._AZLYRICS_MARKER)
        end = html.find("</div>", start)
        if end < 0:
            return None

        # line breaks in the source carry no meaning, the lines are separated by <br>
        lyrics = html[start:end].replace("\r", "").replace("\n", "")
        lyrics = re.sub(r"<br\s*/?>", "\n", lyrics)
        # remove all remaining html tags, but keep the text between them
        lyrics = re.sub(r"<[^>]+>", "", lyrics).strip()
        return lyrics or None

    def get_lyrics_azlyrics(self, urls):
        """
        Visit the urls one after another and extract the lyrics from the first page that has some
        @returns (True, lyrics) or (False, error message)
        """
        message = ""
        for url in urls:
            try:
                with urllib.request.urlopen(url) as response:
                    html = response.read().decode("utf-8")
            except Exception:
                # best effort: whatever goes wrong with one candidate, try the next one
                message += f"Could not access url: {url}\n "
                continue
            finally:
                sleep(self.delay)  # azlyrics blocks requests if there is no delay

            lyrics = self._extract_lyrics_azlyrics(html)
            if lyrics is not None:
                return (True, lyrics)
            message += f"Could not find lyrics in html for {url}\n "

        return (False, message.strip(" \n"))

    @staticmethod
    def _similarity(a, b):
        """Return the case insensitive similarity of two strings as ratio between 0 and 1"""
        return SequenceMatcher(None, a.lower(), b.lower()).ratio()

    def get_url_genius(self, artist:str, title:str):
        """
        Retrieve the url using the genius api:
        1) Get song id using search for song + artist
        2) Get url from song id
        @returns (True, url) or (False, error message)
        """
        # get search results
        query_search = self.genius_search + urllib.parse.quote(f"{artist} {title}")
        request_search = urllib.request.Request(query_search)
        request_search.add_header("Authorization", f"Bearer {self.genius_access_token}")
        try:
            results = loads(urllib.request.urlopen(request_search).read())["response"]["hits"]
        except urllib.error.URLError:
            return (False, f"Could not access url: {query_search}")

        message = ""
        for hit in results:
            if hit["type"] != "song":
                continue
            result = hit["result"]
            song_id = result["id"]

            # check if result is garbage by checking how similar title and artist names are
            if self.sanity_checks:
                genius_artist = result["primary_artist"]["name"]
                genius_artist_featured = result["artist_names"]
                genius_title = result["title"]
                genius_title_featured = result["title_with_featured"]
                # a result is accepted if either the plain or the featured name is close enough
                if all(self._similarity(title, t) < self.sanity_min_title_ratio for t in (genius_title, genius_title_featured)):
                    message += f"Genius result: titles do not match enough: '{title}' and '{genius_title}'/'{genius_title_featured}'\n "
                    continue
                if all(self._similarity(artist, a) < self.sanity_min_artist_ratio for a in (genius_artist, genius_artist_featured)):
                    message += f"Genius result: artists do not match enough: '{artist}' and '{genius_artist}'/'{genius_artist_featured}'\n "
                    continue

            request_song = urllib.request.Request(f"{self.genius_song}{song_id}")
            request_song.add_header("Authorization", f"Bearer {self.genius_access_token}")
            try:
                url = loads(urllib.request.urlopen(request_song).read())["response"]["song"]["url"]
            except urllib.error.URLError:
                message += f"Genius result: Could not access url: '{self.genius_song}{song_id}'\n "
                continue
            if url:
                return (True, url)

        message += f"Could not find song lyrics on genius"
        return (False, message)

    def get_lyrics_genius(self, url):
        """
        Download a genius song page and extract the lyrics from the html
        @returns (True, lyrics) or (False, error message)
        """
        request_lyrics = urllib.request.Request(url)
        request_lyrics.add_header("User-Agent", "Mozilla/5.0")
        try:
            html = urllib.request.urlopen(request_lyrics).read()
        except urllib.error.URLError:
            return (False, f"Could not access url: {url}")

        # extract lyrics from html: lyrics are in divs with "data-lyrics-container=true"
        soup = BeautifulSoup(html, "html.parser")
        # get_text drops <br> tags, so turn them into line breaks first
        for br in soup.find_all("br"):
            br.replace_with("\n")
        divs = soup.find_all("div", attrs={"data-lyrics-container": "true"})
        if not divs:
            return (False, f"Could not find lyrics in html: {url}")
        lyrics = "".join(div.get_text(separator="") for div in divs)
        return (True, lyrics)

    def process_dir(self, directory):
        """
        Process all files from <directory> having one of the allowed_extensions as file extension.
        If recursive, call process_dir for subdirectories.
        @returns 1 if <directory> is not a directory, otherwise None
        """
        if not path.isabs(directory):
            directory = path.normpath(path.join(getcwd(), directory))
        if not path.isdir(directory):
            print(f"\nInvalid directory: '{directory}'")
            return 1
        if not self.silent:
            print("\nProcessing directory: " + directory)

        for name in sorted(listdir(directory)):
            entry = path.normpath(path.join(directory, name))

            if path.isfile(entry):
                # if sound file with tags, the extension may be upper case
                if path.splitext(entry)[1].lower() in Nicole.allowed_extensions:
                    self.process_file(entry)

            elif path.isdir(entry) and self.recursive:
                self.process_dir(entry)

    def process_file(self, file):
        """
        process a file, append to history and print a message (depending on settings)
        """
        # _process_file looks up the absolute path in the history, so the absolute path has to be stored
        if not path.isabs(file):
            file = path.normpath(path.join(getcwd(), file))
        success, message = self._process_file(file)

        # add to history
        if self.write_history:
            if file not in self.history:
                self.history.append(file)
            if not success and file not in self.failed:
                self.failed.append(file)

        if not self.silent:
            if success:
                print(f"✓ {file}")
            else:
                print(f"✕ {file}")
            print(" " + message)

    def _process_file(self, file):
        """
        search for lyrics, write them to the file (if not dry run) and return wether successful or not and a message
        @returns (success, message)
        """
        if not path.isabs(file):
            file = path.normpath(path.join(getcwd(), file))
        if not path.isfile(file):
            return (False, f"Invalid filename: '{file}'")

        if not self.ignore_history and file in self.history:
            return (False, f"Already processed by nicole.")

        audio = None
        artist = None
        title = None
        has_lyrics = False

        # decide by the extension only, a directory name may contain ".mp3" as well
        extension = path.splitext(file)[1].lower()
        # mp3/id3
        if extension == ".mp3":
            try:
                audio = id3.ID3(file)
            except id3.ID3NoHeaderError:
                return (False, f"No id3 header found.")
            artist = audio.getall("TPE1")
            title = audio.getall("TIT2")
            has_lyrics = bool(audio.getall("USLT"))
        # flac
        elif extension == ".flac":
            try:
                audio = flac.FLAC(file)
            except flac.FLACNoHeaderError:
                return (False, f"No FLAC comment header found.")
            artist = audio.get("ARTIST")
            title = audio.get("TITLE")
            has_lyrics = audio.get("LYRICS") is not None

        if artist:
            artist = str(artist[0])
        if title:
            title = str(title[0])

        # dont proceed when not overwrite and audio has tags
        if not self.overwrite_tag and has_lyrics:
            return (False, f"Already has lyrics")

        # dont proceed when invalid audio/artist/title
        if audio is None or not artist or not title:
            return (False, f"Could not get tags.")

        if self.rm_explicit:
            cleaned, count = re.subn(r"\[explicit\]", "", title, flags=re.IGNORECASE)
            if count:
                title = cleaned.strip(" ")
                # a dry run must not modify the file
                if not self.dry_run:
                    if isinstance(audio, id3.ID3):
                        audio.setall("TIT2", [id3.TIT2(text=title)])
                    else:
                        audio["TITLE"] = title
                    audio.save()
                    if not self.silent:
                        print(f"Removed '[Explicit]' from the title.")

        lyrics = None
        success = False
        site = None
        message = ""
        if self.lyrics_site not in ("all", "genius", "azlyrics"):
            message += f"Unknown lyrics site: '{self.lyrics_site}'"
        # try genius
        if self.lyrics_site in ("all", "genius"):
            success, url = self.get_url_genius(artist, title)
            if success:
                success, lyrics = self.get_lyrics_genius(url)
                if not success:
                    message += lyrics + "\n "  # lyrics is error message
                site = "genius"
            else:
                message += url + "\n "  # url is error message
        # try azlyrics
        if not success and self.lyrics_site in ("all", "azlyrics"):
            urls = self.get_urls_azlyrics(artist, title)
            success, lyrics = self.get_lyrics_azlyrics(urls)
            site = "azlyrics"
            if not success:
                message += lyrics  # lyrics is error message

        if not success:
            return (False, message.strip("\n "))

        # found lyrics
        if self.dry_run:
            print(f"\n\n{artist} - {title}:\n{lyrics}\n")
        # write to tags
        elif isinstance(audio, id3.ID3):
            # setall removes all existing USLT frames, so overwriting does not leave old lyrics behind
            # "XXX" is the id3 language code for an unknown language
            audio.setall("USLT", [id3.USLT(encoding=id3.Encoding.UTF8, lang="XXX", text=lyrics)])
            audio.save(v2_version=4)
        elif isinstance(audio, flac.FLAC):
            audio["LYRICS"] = lyrics
            audio.save()
        else:
            return (False, f"Could find lyrics but failed to write the tag (unknown audio type: {type(audio)})")

        message += f"Written lyrics from {site} to {artist} - {title}"
        return (True, message)
|
||
|
||
|
||
def main():
    """
    Command line entry point: parse the arguments and process all given files and directories.
    The history is written when the Nicole instance is destroyed at the end of this function.
    """
    print(f"Nicole version {version}")

    parser = argparse.ArgumentParser(prog="nicole", description="lyrics scraper and embedder", epilog="https://github.com/MatthiasQuinter/nicole")
    parser.add_argument("--directory", "-d", action="append", help="process directory [directory]")
    parser.add_argument("--file", "-f", action="append", help="process file [file]")
    parser.add_argument("--recursive", "-r", action="store_true", help="go through directories recursively")
    parser.add_argument("--silent", action="store_true", help="silent, no command-line output")
    parser.add_argument("--ignore-history", "-i", action="store_true", help="ignore history")
    parser.add_argument("--no-history", "-n", action="store_true", help="do not write to history")
    parser.add_argument("--overwrite", "-o", action="store_true", help="overwrite if the file already has lyrics")
    parser.add_argument("--dry-run", "-t", action="store_true", help="test, do not write lyrics to file, but print to console")
    parser.add_argument("--rm-explicit", action="store_true", help="remove the \"[Explicit]\" lyrics warning from the songs title tag")
    # choices: reject a misspelled site right away instead of failing for every single file
    parser.add_argument("--site", "-s", action="store", choices=["all", "azlyrics", "genius"], help="use only [site]: azlyrics or genius", default="all")
    args = parser.parse_args()

    if args.file is None and args.directory is None:
        parser.error("Either --directory or --file is required")

    # create nicole instance
    nicole = Nicole(test_run=args.dry_run, silent=args.silent, write_history=not args.no_history, ignore_history=args.ignore_history, overwrite_tag=args.overwrite,
                    recursive=args.recursive, rm_explicit=args.rm_explicit, lyrics_site=args.site)

    # One handler around everything: Ctrl-C stops the whole run instead of
    # only skipping the current file or directory.
    try:
        for file in args.file or []:
            nicole.process_file(file)
        for directory in args.directory or []:
            nicole.process_dir(directory)
    except KeyboardInterrupt:
        if not args.silent:
            print("\nInterrupted.")


if __name__ == "__main__":
    main()
|