nicole/nicole/nicole.py
2024-05-02 23:34:27 +02:00

489 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/python3
# Copyright © 2024 Matthias Quintern.
# This software comes with no warranty.
# This software is licensed under the GPL3
import argparse
import re
import urllib
import urllib.error
import urllib.parse
import urllib.request
from difflib import SequenceMatcher
from html import unescape
from json import loads
from os import getcwd, listdir, makedirs, mkdir, path
from sys import argv
from time import sleep

from bs4 import BeautifulSoup
from mutagen import easyid3, flac, id3
# Program version string; main() prints it on startup.
version = "2.1.0"
# The name Nicole is entirely made up and has no meaning.
# Any connection to a website of DHL is hereby expressly ruled out.
class Nicole:
    """
    Search for song lyrics online and embed them into audio files.

    Overview:
        Nicole reads the artist and title tags of an mp3 (ID3) or flac file,
        searches for the lyrics and writes them into the mp3-tag "USLT"
        (or the "LYRICS" comment for flac files).
        There is a 5 second delay after each request to azlyrics.com because
        the site will block your ip if there are too many requests.
    History:
        Nicole can keep a history of all files that were processed in
        ~/.config/nicole/history.
        If a file is in the history, it will be skipped unless
        ignore_history=True.
        Every file for which processing did not report success during a run is
        listed in ~/.config/nicole/failed (rewritten on each run).
        NOTE(review): such files are still added to the history, so they are
        skipped on later runs too; files skipped because they are already in
        the history or already have lyrics also count as "not successful".
    azlyrics:
        Nicole creates azlyrics urls from the title and artist tags of the file.
        The lyrics are extracted from the html document.
    genius:
        Nicole searches the song from the title and artist tags via the
        genius api.
    """
    allowed_extensions = [".mp3", ".flac"]

    # html comment that precedes the lyrics in an azlyrics page
    _AZLYRICS_MARKER = "<!-- Usage of azlyrics.com content by any third-party lyrics provider is prohibited by our licensing agreement. Sorry about that. -->"
    # characters that are removed from artist and title when building an azlyrics url
    # NOTE(review): the original list also contained an empty string, which is a
    # no-op for str.replace (possibly an invisible character that got lost).
    _URL_STRIP_CHARS = (' ', '-', ',', '.', '\'', '"', '°', '`', '´', '/', '!', '?', '#', '*', '(', ')')
    # (character, transliteration, alternative replacement) used for url variants
    _UMLAUTS = (('ä', 'a', ''), ('ö', 'o', ''), ('ü', 'u', ''), ('ß', 'ss', ''), ('&', "and", "and"))
    # "feat", "featuring" or "ft" as a whole word, followed by the rest of the line
    _FEATURE_PATTERN = re.compile(r"\b(?:feat(?:uring)?|ft)\b\.?.*")
    _EXPLICIT_PATTERN = re.compile(r"\[explicit\]", re.IGNORECASE)

    def __init__(self, test_run=False, silent=False, write_history=True, ignore_history=False, overwrite_tag=False, recursive=False, rm_explicit=False, lyrics_site="all"):
        """
        test_run:       print the lyrics instead of writing them to the file
        silent:         do not print progress messages
        write_history:  write the history and failed files when the object is destroyed
        ignore_history: process files even if they are in the history
        overwrite_tag:  replace lyrics that are already stored in the file
        recursive:      process_dir descends into subdirectories
        rm_explicit:    remove "[Explicit]" from the title tag
        lyrics_site:    "all", "genius" or "azlyrics"
        """
        self.dry_run = test_run
        self.silent = silent
        self.write_history = write_history
        self.ignore_history = ignore_history
        self.overwrite_tag = overwrite_tag
        self.recursive = recursive
        self.rm_explicit = rm_explicit
        self.lyrics_site = lyrics_site

        self.delay = 5  # enough delay so that azlyrics doesnt block the ip

        self.genius_search = "https://api.genius.com/search?q="
        self.genius_song = "https://api.genius.com/songs/"
        # NOTE(review): the access token is hard-coded in the source
        self.genius_access_token = "MzQaNvA53GOGvRTV8OXUbq2NCMahcnVre5EZmj-OcSjVleVO4kNwMVZicPsD5AL7"

        self.sanity_checks = True
        self.sanity_min_title_ratio = 0.6
        self.sanity_min_artist_ratio = 0.7

        self.history = []
        self.failed = []  # All files that failed

        # The history must also be loaded when it is only written: otherwise
        # ignore_history=True would replace the stored history with the files
        # of this run only.
        if self.write_history or not self.ignore_history:
            self._load_history()
        # only a fully constructed object may rewrite the history file
        self._initialized = True

    def __del__(self):
        # getattr: __del__ also runs when __init__ raised half way through
        if getattr(self, "_initialized", False) and self.write_history:
            self._write_history()

    @staticmethod
    def _config_dir():
        """Return the directory that holds the history and failed files."""
        return path.join(path.expanduser("~"), ".config", "nicole")

    def _load_history(self):
        """Create the config directory if needed and read the history file into self.history."""
        config_path = self._config_dir()
        # makedirs also creates ~/.config if it is missing
        makedirs(config_path, exist_ok=True)

        history_file_path = path.join(config_path, "history")
        # if history file does not exist, dont open it
        if not path.isfile(history_file_path):
            return
        with open(history_file_path, "r") as history_file:
            # drop empty entries, otherwise the trailing newline adds one
            # more blank line to the file on every run
            self.history = [line for line in history_file.read().split("\n") if line]

    def _write_history(self):
        """Write self.history and self.failed to the config directory, one path per line."""
        config_path = self._config_dir()
        makedirs(config_path, exist_ok=True)
        with open(path.join(config_path, "history"), "w") as history_file:
            history_file.writelines(file + "\n" for file in self.history)
        with open(path.join(config_path, "failed"), "w") as failed_file:
            failed_file.writelines(file + "\n" for file in self.failed)

    def get_urls_azlyrics(self, artist:str, title:str):
        """
        Create azlyrics urls from the artist and title.
        If the title contains parenthesis, äüöß&, or a "feat."/"ft." part, there will be
        multiple versions: with and without the (...), with the special characters
        transliterated or dropped, and with the feature part removed.
        Returns a list of unique urls in the order in which they should be tried.
        """
        # convert to lower case
        artist = artist.casefold()
        title = title.casefold()

        # remove a leading 'a' or 'the' from the artist
        for article in ("a ", "the "):
            if artist.startswith(article):
                artist = artist[len(article):]
                break

        # remove anything in square brackets (eg [Explicit]);
        # non-greedy so that text between two bracket pairs is kept
        title = re.sub(r"\[.*?\]", "", title)

        titles = [title]
        # if title has (), create one version with and one without them
        without_parens = re.sub(r"\(.*?\)", "", title)
        if without_parens != title:
            titles.append(without_parens)

        # artist: remove special chars, then transliterate umlaute
        for char in Nicole._URL_STRIP_CHARS:
            artist = artist.replace(char, "")
        for char, transliterated, _ in Nicole._UMLAUTS:
            artist = artist.replace(char, transliterated)

        # titles with umlaute: keep a transliterated version in place and append
        # a second version that uses the alternative replacement
        for n, current in enumerate(list(titles)):
            if not any(char in current for char, _, _ in Nicole._UMLAUTS):
                continue
            alternative = current
            for char, transliterated, other in Nicole._UMLAUTS:
                current = current.replace(char, transliterated)
                alternative = alternative.replace(char, other)
            titles[n] = current
            titles.append(alternative)

        # features: add a version without the "feat. ..." part
        # (iterate over a copy, the list grows)
        for current in list(titles):
            match = Nicole._FEATURE_PATTERN.search(current)
            if match:
                titles.append(current[:match.start()] + current[match.end():])

        #
        # create urls
        #
        urls = []
        for current in titles:
            # spaces, etc
            for char in Nicole._URL_STRIP_CHARS:
                current = current.replace(char, "")
            if not current:
                continue
            url = "https://azlyrics.com/lyrics/" + artist + '/' + current + ".html"
            # every url costs a request and a delay, so do not try one twice
            if url not in urls:
                urls.append(url)
        return urls

    @staticmethod
    def _extract_lyrics_azlyrics(page):
        """
        Extract the lyrics from the html text of an azlyrics page.
        Returns the lyrics as plain text or None if they can not be found.
        """
        start = page.find(Nicole._AZLYRICS_MARKER)
        if start < 0:
            return None
        start += len(Nicole._AZLYRICS_MARKER)
        end = page.find("</div>", start)
        if end < 0:
            return None

        lyrics = page[start:end]
        # line breaks in the html source are not part of the lyrics, <br> tags are
        lyrics = lyrics.replace("\r", "").replace("\n", "")
        lyrics = re.sub(r"<br\s*/?>", "\n", lyrics)
        # remove all remaining html tags, but keep the text between them
        lyrics = re.sub(r"<[^>]+>", "", lyrics)
        lyrics = unescape(lyrics).strip()
        return lyrics or None

    def get_lyrics_azlyrics(self, urls):
        """
        Try the urls one after another and extract the lyrics from the html.
        Returns (True, lyrics) for the first url that works or (False, message)
        with one line per url that did not work.
        """
        message = ""
        for url in urls:
            # visit the url
            try:
                with urllib.request.urlopen(url) as response:
                    page = response.read().decode("utf-8")
            except Exception:
                # deliberately broad: any failure just means "try the next url"
                message += f"Could not access url: {url}\n "
                continue
            finally:
                sleep(self.delay)  # azlyrics blocks requests if there is no delay

            lyrics = self._extract_lyrics_azlyrics(page)
            if lyrics is not None:
                return (True, lyrics)
            message += f"Could not find lyrics in html for {url}\n "
        message = message.strip(" \n")
        return (False, message)

    def _genius_api_get(self, url):
        """Send an authorized request to the genius api and return the "response" part of the json answer."""
        request = urllib.request.Request(url)
        request.add_header("Authorization", f"Bearer {self.genius_access_token}")
        with urllib.request.urlopen(request) as response:
            return loads(response.read())["response"]

    @staticmethod
    def _is_similar(wanted, candidates, min_ratio):
        """Return True if wanted matches at least one of the candidates with a SequenceMatcher ratio >= min_ratio (case insensitive)."""
        wanted = wanted.lower()
        return any(SequenceMatcher(None, wanted, candidate.lower()).ratio() >= min_ratio for candidate in candidates)

    def get_url_genius(self, artist:str, title:str):
        """
        Retrieve the url using the genius api:
        1) Get song id using search for song + artist
        2) Get url from song id
        Returns (True, url) or (False, message).
        """
        # get search results
        query_search = self.genius_search + urllib.parse.quote(f"{artist} {title}")
        try:
            results = self._genius_api_get(query_search)["hits"]
        except (urllib.error.URLError, ValueError, KeyError):
            # ValueError/KeyError: the answer was not the expected json
            return (False, f"Could not access url: {query_search}")

        message = ""
        for hit in results:
            # check if result is song and then get url
            if hit["type"] != "song":
                continue
            result = hit["result"]
            song_id = result["id"]

            # check if result is garbage by checking how similar title and artist names are
            if self.sanity_checks:
                genius_artist = result["primary_artist"]["name"]
                genius_artist_featured = result["artist_names"]
                genius_title = result["title"]
                genius_title_featured = result["title_with_featured"]
                if not self._is_similar(title, (genius_title, genius_title_featured), self.sanity_min_title_ratio):
                    message += f"Genius result: titles do not match enough: '{title}' and '{genius_title}'/'{genius_title_featured}'\n "
                    continue
                if not self._is_similar(artist, (genius_artist, genius_artist_featured), self.sanity_min_artist_ratio):
                    message += f"Genius result: artists do not match enough: '{artist}' and '{genius_artist}'/'{genius_artist_featured}'\n "
                    continue

            try:
                url = self._genius_api_get(f"{self.genius_song}{song_id}")["song"]["url"]
            except (urllib.error.URLError, ValueError, KeyError):
                message += f"Genius result: Could not access url: '{self.genius_song}{song_id}'\n "
                continue
            if url:
                return (True, url)

        message += "Could not find song lyrics on genius"
        return (False, message)

    def get_lyrics_genius(self, url):
        """
        Download the genius page at url and extract the lyrics.
        Returns (True, lyrics) or (False, message).
        """
        request_lyrics = urllib.request.Request(url)
        request_lyrics.add_header("User-Agent", "Mozilla/5.0")
        try:
            with urllib.request.urlopen(request_lyrics) as response:
                page = response.read()
        except urllib.error.URLError:
            return (False, f"Could not access url: {url}")

        # extract lyrics from html: lyrics are in divs with "data-lyrics-container=true"
        soup = BeautifulSoup(page, "html.parser")
        for br in soup.find_all("br"):
            br.replace_with("\n")
        divs = soup.find_all("div", attrs={"data-lyrics-container": "true"})
        if not divs:
            return (False, f"Could not find lyrics in html: {url}")
        lyrics = "".join(div.get_text(separator="") for div in divs)
        return (True, lyrics)

    def process_dir(self, directory):
        """
        Process all files from <directory> having one of the Nicole.allowed_extensions
        as file extension (case insensitive), in sorted order.
        If recursive, call process_dir for subdirectories.
        Returns 1 if <directory> is not a directory, otherwise None.
        """
        directory = path.abspath(directory)
        if not path.isdir(directory):
            print(f"\nInvalid directory: '{directory}'")
            return 1

        if not self.silent:
            print("\nProcessing directory: " + directory)

        for name in sorted(listdir(directory)):
            entry = path.normpath(path.join(directory, name))
            if path.isfile(entry):
                # if sound file with tags
                if path.splitext(entry)[1].lower() in Nicole.allowed_extensions:
                    self.process_file(entry)
            elif self.recursive and path.isdir(entry):
                self.process_dir(entry)

    def process_file(self, file):
        """
        process a file, append to history and print a message (depending on settings)
        """
        # the history stores absolute paths, which is also what _process_file looks up
        file = path.abspath(file)
        success, message = self._process_file(file)

        # add to history
        if self.write_history:
            if file not in self.history:
                self.history.append(file)
            if not success and file not in self.failed:
                self.failed.append(file)

        if not self.silent:
            print(f"{file}")
            if not success:
                print(" " + message)

    def _process_file(self, file):
        """
        search for lyrics, write them to the file (if not dry run) and return
        (success, message): wether successful or not and a message
        """
        file = path.abspath(file)
        if not path.isfile(file):
            return (False, f"Invalid filename: '{file}'")

        if not self.ignore_history and file in self.history:
            return (False, "Already processed by nicole.")

        audio = None
        artist = None
        title = None
        has_lyrics = False

        # decide by the real extension, not by a substring of the path
        extension = path.splitext(file)[1].lower()
        # mp3/id3
        if extension == ".mp3":
            try:
                audio = id3.ID3(file)
            except id3.ID3NoHeaderError:
                return (False, "No id3 header found.")
            artist = audio.getall("TPE1")
            title = audio.getall("TIT2")
            has_lyrics = bool(audio.getall("USLT"))
        # flac
        elif extension == ".flac":
            try:
                audio = flac.FLAC(file)
            except flac.FLACNoHeaderError:
                return (False, "No FLAC comment header found.")
            artist = audio.get("ARTIST")
            title = audio.get("TITLE")
            has_lyrics = audio.get("LYRICS") is not None

        if artist:
            artist = str(artist[0])
        if title:
            title = str(title[0])

        # dont proceed when not overwrite and audio has lyrics
        if not self.overwrite_tag and has_lyrics:
            return (False, "Already has lyrics")

        # dont proceed when invalid audio/artist/title
        if audio is None or not artist or not title:
            return (False, "Could not get tags.")

        if self.rm_explicit:
            match = Nicole._EXPLICIT_PATTERN.search(title)
            if match:
                title = Nicole._EXPLICIT_PATTERN.sub("", title).strip(" ")
                # a dry run must not modify the file; the cleaned title is
                # still used for the search
                if not self.dry_run:
                    if isinstance(audio, id3.ID3):
                        audio.setall("TIT2", [id3.TIT2(text=title)])
                    else:
                        audio["TITLE"] = title
                    audio.save()
                    if not self.silent:
                        print(f"Removed '{match.group()}' from the title.")

        lyrics = None
        success = False
        site = None
        message = ""

        # try genius
        if self.lyrics_site in ("all", "genius"):
            success, url = self.get_url_genius(artist, title)
            if success:
                success, lyrics = self.get_lyrics_genius(url)
                if not success:
                    message += lyrics + "\n "  # lyrics is error message
                site = "genius"
            else:
                message += url + "\n "  # url is error message

        # try azlyrics
        if not success and self.lyrics_site in ("all", "azlyrics"):
            urls = self.get_urls_azlyrics(artist, title)
            success, lyrics = self.get_lyrics_azlyrics(urls)
            site = "azlyrics"
            if not success:
                message += lyrics  # lyrics is error message

        if not success:
            return (False, message.strip("\n "))

        # found lyrics
        if self.dry_run:
            print(f"\n\n{artist} - {title}:\n{lyrics}\n")
        # write to tags
        elif isinstance(audio, id3.ID3):
            # setall removes existing USLT frames first, so overwriting never
            # leaves a second lyrics frame behind; lang must be a 3 character
            # code, "XXX" stands for an unknown language
            audio.setall("USLT", [id3.USLT(encoding=id3.Encoding.UTF8, lang="XXX", text=lyrics)])
            audio.save(v2_version=4)
        elif isinstance(audio, flac.FLAC):
            audio["LYRICS"] = lyrics
            audio.save()
        else:
            return (False, f"Could find lyrics but failed to write the tag (unknown audio type: {type(audio)})")

        message += f"Written lyrics from {site} to {artist} - {title}"
        return (True, message)
def main():
    """
    Command line entry point: parse the arguments, create a Nicole instance
    and process all given files and directories.
    """
    print(f"Nicole version {version}")

    parser = argparse.ArgumentParser(prog="nicole", description="lyrics scraper and embedder", epilog="https://github.com/MatthiasQuintern/nicole")
    parser.add_argument("--directory", "-d", action="append", help="process directory [directory]")
    parser.add_argument("--file", "-f", action="append", help="process file [file]")
    parser.add_argument("--recursive", "-r", action="store_true", help="go through directories recursively")
    parser.add_argument("--silent", action="store_true", help="silent, no command-line output")
    parser.add_argument("--ignore-history", "-i", action="store_true", help="ignore history")
    parser.add_argument("--no-history", "-n", action="store_true", help="do not write to history")
    parser.add_argument("--overwrite", "-o", action="store_true", help="overwrite if the file already has lyrics")
    parser.add_argument("--dry-run", "-t", action="store_true", help="test, do not write lyrics to file, but print to console")
    parser.add_argument("--rm-explicit", action="store_true", help="remove the \"[Explicit]\" lyrics warning from the songs title tag")
    # choices: any other value would make nicole silently search nowhere
    parser.add_argument("--site", "-s", action="store", choices=["all", "azlyrics", "genius"], help="use only [site]: azlyrics or genius", default="all")
    args = parser.parse_args()

    if args.file is None and args.directory is None:
        parser.error("Either --directory or --file is required")

    # create nicole instance
    nicole = Nicole(test_run=args.dry_run, silent=args.silent, write_history=not args.no_history, ignore_history=args.ignore_history, overwrite_tag=args.overwrite,
                    recursive=args.recursive, rm_explicit=args.rm_explicit, lyrics_site=args.site)

    try:
        for file in args.file or []:
            nicole.process_file(file)
        for directory in args.directory or []:
            nicole.process_dir(directory)
    except KeyboardInterrupt:
        # Ctrl-C stops the whole run instead of only skipping the current
        # file; swallowing it here lets main return normally, so the Nicole
        # object is destroyed and can still write its history.
        pass


if __name__ == "__main__":
    main()