diff --git a/nicole/nicole.py b/nicole/nicole.py index d9427ff..125c9ee 100644 --- a/nicole/nicole.py +++ b/nicole/nicole.py @@ -1,17 +1,22 @@ +#!/bin/python3 +# Copyright © 2022 Matthias Quintern. +# This software comes with no warranty. +# This software is licensed under the GPL3 + from mutagen import easyid3, id3, flac -import urllib.request as ur + +import urllib +import re +from bs4 import BeautifulSoup +from difflib import SequenceMatcher +from json import loads + from os import path, getcwd, listdir, mkdir from time import sleep - from sys import argv -import re - -""" -Der Name Nicole ist frei erfunden und hat keine Bedeutung. -Jeglicher Zusammenhang mit einer Website der DHL wird hiermit ausdrücklich ausgeschlossen. -""" - +# Der Name Nicole ist frei erfunden und hat keine Bedeutung. +# Jeglicher Zusammenhang mit einer Website der DHL wird hiermit ausdrücklich ausgeschlossen. class Nicole: """ @@ -27,7 +32,7 @@ class Nicole: Nicole creates a azlyrics url from the title and artist mp3-tags of the file. The lyrics are extracted from the html document using regex. """ - def __init__(self, test_run=False, silent=False, write_history=True, ignore_history=False, overwrite_tag=False, recursive=False, rm_explicit=False): + def __init__(self, test_run=False, silent=False, write_history=True, ignore_history=False, overwrite_tag=False, recursive=False, rm_explicit=False, lyrics_site="all"): self.test_run = test_run self.silent = silent @@ -37,9 +42,17 @@ class Nicole: self.overwrite_tag = overwrite_tag self.recursive = recursive - self.lyrics_site = "azlyrics" + self.lyrics_site = lyrics_site self.delay = 5 # enough delay so that azlyrics doesnt block the ip + self.genius_search = "https://api.genius.com/search?q=" + self.genius_song = "https://api.genius.com/songs/" + self.genius_access_token = "MzQaNvA53GOGvRTV8OXUbq2NCMahcnVre5EZmj-OcSjVleVO4kNwMVZicPsD5AL7" + + self.sanity_checks = True + self.sanity_min_title_ratio = 0.6 + self.sanity_min_artist_ratio = 0.7 + self.history = [] self.failed = [] # All files that failed if not self.ignore_history: @@ -82,7 +95,7 @@ class Nicole: def get_urls_azlyrics(self, artist:str, title:str): """ Create a azlyrics html from the artist and title - If the title contains paranthesis or äüö, there will be multiple versions, one that contains the (...)öäü and one that doesn't. + If the title contains parenthesis or äüö, there will be multiple versions, one that contains the (...)öäü and one that doesn't. """ # convert to lower case artist = artist.casefold() @@ -94,7 +107,7 @@ class Nicole: elif artist[0:4] == "the ": artist = artist[4:] - # remove anything in square bracketrs (eg [Explicit]) + # remove anything in square brackets (eg [Explicit]) for match in re.finditer(r"\[.*\]", title): title = title.replace(match.group(), "") @@ -153,7 +166,6 @@ class Nicole: urls.append("https://azlyrics.com/lyrics/" + artist + '/' + title + ".html") return urls - def get_lyrics_azlyrics(self, urls): """ Extract the lyrics from the html @@ -164,7 +176,7 @@ class Nicole: # visit the url html = None try: - html = str(ur.urlopen(url).read().decode("utf-8")) + html = str(urllib.request.urlopen(url).read().decode("utf-8")) sleep(self.delay) # azlyrics blocks requests if there is no delay except Exception: sleep(self.delay) # azlyrics blocks requests if there is no delay @@ -191,10 +203,82 @@ class Nicole: return (True, lyrics) - message += f"Could not lyrics in html for {url}\n " + message += f"Could not find lyrics in html for {url}\n " message = message.strip(" \n") return (False, message) + def get_url_genius(self, artist:str, title:str): + """ + Retrieve the url using the genius api: + 1) Get song id using search for song + artist + 2) Get url from song id + """ + # get search results + query_search = self.genius_search + urllib.parse.quote(f"{artist} {title}") + request_search = urllib.request.Request(query_search) + request_search.add_header("Authorization", f"Bearer {self.genius_access_token}") + try: + results = loads(urllib.request.urlopen(request_search).read())["response"]["hits"] + except urllib.error.URLError: + return (False, f"Could not access url: {query_search}") + + message = "" + url = None + i = 0 + while url is None and i < len(results): + # check if result is song and then get url + if results[i]["type"] == "song": + song_id = results[i]["result"]["id"] + # check if result is garbage by checking how similar title and artist names are + if self.sanity_checks: + genius_artist = results[i]["result"]["primary_artist"]["name"] + genius_artist_featured = results[i]["result"]["artist_names"] + genius_title = results[i]["result"]["title"] + genius_title_featured = results[i]["result"]["title_with_featured"] + if SequenceMatcher(None, title, genius_title).ratio() < self.sanity_min_title_ratio: + if SequenceMatcher(None, title, genius_title_featured).ratio() < self.sanity_min_title_ratio: + message += f"Genius result: titles do not match enough: '{title}' and '{genius_title}'/'{genius_title_featured}'\n " + i += 1 + continue + + if SequenceMatcher(None, artist, genius_artist).ratio() < self.sanity_min_artist_ratio: + if SequenceMatcher(None, artist, genius_artist_featured).ratio() < self.sanity_min_artist_ratio: + message += f"Genius result: artists do not match enough: '{artist}' and '{genius_artist}'/'{genius_artist_featured}'\n " + i += 1 + continue + request_song = urllib.request.Request(f"{self.genius_song}{song_id}") + request_song.add_header("Authorization", f"Bearer {self.genius_access_token}") + try: + url = loads(urllib.request.urlopen(request_song).read())["response"]["song"]["url"] + except urllib.error.URLError: + message += f"Genius result: Could not access url: '{self.genius_song}{song_id}'\n " + i += 1 + if not url: + message += f"Could not find song lyrics on genius" + return (False, message) + return (True, url) + + def get_lyrics_genius(self, url): + request_lyrics = urllib.request.Request(url) + # request_lyrics.add_header("Authorization", f"Bearer {self.genius_access_token}") + request_lyrics.add_header("User-Agent", "Mozilla/5.0") + try: + html = urllib.request.urlopen(request_lyrics).read() + except urllib.error.URLError: + return (False, f"Could not access url: {url}") + + # extract lyrics from html: lyrics are in divs with "data-lyrics-container=true" + lyrics = "" + soup = BeautifulSoup(html, "html.parser") + for br in soup.find_all("br"): + br.replaceWith("\n") + divs = soup.find_all("div", attrs={"data-lyrics-container": "true"}) + if not divs: + return (False, f"Could not find lyrics in html: {url}") + for div in divs: + lyrics += div.get_text(separator="") + return (True, lyrics) + def process_dir(self, directory): if not path.isabs(directory): directory = path.normpath(getcwd() + "/" + directory) @@ -204,7 +288,6 @@ class Nicole: if not self.silent: print("\nProcessing directory: " + directory) - entries = listdir(directory) entries.sort() @@ -230,7 +313,7 @@ class Nicole: print(f"✓ {entry}") else: print(f"✕ {entry}") - print(" " + message) + print(" " + message) elif path.isdir(entry) and self.recursive: @@ -301,39 +384,54 @@ class Nicole: audio.save() print(f"Removed '{word}' from the title.") - # currently the only supported site - if self.lyrics_site == "azlyrics": - urls = self.get_urls_azlyrics(artist, title) - - success, lyrics = self.get_lyrics_azlyrics(urls) + lyrics = "Sample Lyrics" + success = False + site = "Sample Site" + message = "" + # try genius + if self.lyrics_site in ["all", "genius"]: + success, url = self.get_url_genius(artist, title) if success: - if self.test_run: - print(f"{artist} - {title}:\n{lyrics}\n\n") - # write to tags - else: - if type(audio) == id3.ID3: - audio.add(id3.USLT(encoding=id3.Encoding.UTF8, lang=" ", text=lyrics)) - audio.save(v2_version=4) - elif type(audio) == flac.FLAC: - audio["LYRICS"] = lyrics - audio.save() - else: - return (False, f"Could not write lyrics.") - - # add to history - if self.write_history and file not in self.history: - self.history.append(file) - - return (True, f"Written lyrics to {artist} - {title}") + success, lyrics = self.get_lyrics_genius(url) + if not success: + message += lyrics + "\n " # lyrics is error message + site = "genius" else: - return (False, lyrics) # lyrics is error message here + message += url + "\n " # url is error message + # try azlyrics + if not success and self.lyrics_site in ["all", "azlyrics"]: + urls = self.get_urls_azlyrics(artist, title) + success, lyrics = self.get_lyrics_azlyrics(urls) + site = "azlyrics" + if not success: + message += lyrics + # if found lyrics + if success: + if self.test_run: + print(f"\n\n{artist} - {title}:\n{lyrics}\n") + # write to tags + else: + if type(audio) == id3.ID3: + audio.add(id3.USLT(encoding=id3.Encoding.UTF8, lang=" ", text=lyrics)) + audio.save(v2_version=4) + elif type(audio) == flac.FLAC: + audio["LYRICS"] = lyrics + audio.save() + else: + return (False, f"Could find lyrics but failed to write the tag.") - return (False, "Failed for unknown reason.") + # add to history + if self.write_history and file not in self.history: + self.history.append(file) + + message += f"Written lyrics from {site} to {artist} - {title}" + return (True, message) + else: + return (False, message.strip("\n ")) def main(): - print("Nicole version 1.1") - # print("Get updates here: https://github.com/MatthiasQuintern/nicole") + print("Nicole version 2.0") helpstring = """Command line options: -d [directory] process directory [directory] @@ -345,24 +443,25 @@ def main(): -o overwrite if the file already has lyrics -t test, do not write lyrics to file, but print to console -h show this - --rm_explicit remove the "[Explicit]" lyrics warning from the songs title tag""" + --rm_explicit remove the "[Explicit]" lyrics warning from the songs title tag + --site [site] use only [site]: azlyrics or genius + Visit https://github.com/MatthiasQuintern/nicole for updates and further help.""" args = [] if len(argv) > 1: # iterate over argv list and extract the args i = 1 while i < len(argv): arg = argv[i] - if "--" in arg: - args.append(arg.replace("--", "")) - - elif "-" in arg: + if arg[0] == "-": # check if option with arg, if yes add tuple to args if len(argv) > i + 1 and argv[i+1][0] != "-": args.append((arg.replace("-", ""), argv[i+1])) i += 1 + elif not "--" in arg: + for char in arg.replace("-", ""): + args.append(char) else: - for c in arg.replace("-", ""): - args.append(c) + args.append(arg.replace("-", "")) else: print(f"Invalid or missing argument: '{arg}'") print(helpstring) @@ -383,11 +482,16 @@ def main(): directory = None file = None + site = "all" for arg in args: if type(arg) == tuple: if arg[0] == "d": directory = arg[1] elif arg[0] == "f": file = arg[1] + elif arg[0] == "site": + if arg[1] in ["genius", "azlyrics", "all"]: site = arg[1] + else: + print(f"Invalid site: '{arg[1]}'") elif arg in options.keys(): # flip the bool associated with the char @@ -403,9 +507,9 @@ def main(): if options["h"]: print(helpstring) return 0 - + # create nicole instance - nicole = Nicole(test_run=options["t"], silent=options["s"], write_history=options["n"], ignore_history=options["i"], overwrite_tag=options["o"], recursive=options["r"], rm_explicit=options["rm_explicit"]) + nicole = Nicole(test_run=options["t"], silent=options["s"], write_history=options["n"], ignore_history=options["i"], overwrite_tag=options["o"], recursive=options["r"], rm_explicit=options["rm_explicit"], lyrics_site=site) # start with file or directory if file: @@ -415,7 +519,7 @@ def main(): print(f"✓ {file}") else: print(f"✕ {file}") - print(" " + message) + print(" " + message) elif directory: try: nicole.process_dir(directory)