#!/bin/python3
import os
from os import path
import re
from sys import argv
from collections.abc import Callable
import argparse
import pickle
"""
TODO:
- more testing
- reintroduce the nav_selected class on nav feature
"""
"""
************************************************************ SETTINGS ************************************************************
"""
# minimal default templates; #sidenav-content, #link, #name and #links are placeholders filled in by Sidenav
sidenav_format = """\
<ul class="sidenav">
    #sidenav-content
</ul>
"""
sidenav_content_link = '<li><a href="#link">#name</a></li>'
sidenav_content_section = """\
<li class="sidenav-section">#name</li>
#links
"""
exit_on_include_failure = False
sitemap_begin = """\
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
"""
sitemap_end = "</urlset>"
"""
************************************************************ REGULAR EXPRESSIONS ************************************************************
"""
# SIDENAV
# heading with an id attribute, e.g. <h2 id="about">About</h2>
re_sidenav_heading = r"<h\d[^>]* id=(?:\"|')([a-zA-Z0-9_\-]+)(?:\"|')[^>]*>(.+)</h\d>"
# custom entry
re_sidenav_custom = r"href=(?:\"|\')([^\"\' ]+)(?:\"|\') +name=(?:\"|\')(.+)(?:\"|\')"
# map assignment, comma-separated values
re_set_map = r"([a-zA-Z0-9_]+) *\? *\{( *(?:[a-zA-Z0-9_*]+ *: *[^,]*, *)+[a-zA-Z0-9_*]+ *: *[^,]*) *,? *\}"
# map assignment, semicolon-separated values
re_set_map_alt = r"([a-zA-Z0-9_]+) *\? *\{( *(?:[a-zA-Z0-9_*]+ *: *[^;]* *; *)+[a-zA-Z0-9_*]+ *: *[^;]*) *;? *\}"
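# e.g. "lang ? {*: en, de: de, en: en}" matches re_set_map; the semicolon variant exists
# so that the mapped values may themselves contain commas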
""" #$(myvar) """
re_variable_use = r"#\$\(([a-zA-Z0-9_]+)\)"
""" only in comments """
re_preprocessor_command = r"[\t ]*#([a-zA-Z]+) *(.*)[\t ]*"
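# e.g. "#include header.html" -> command "include", args "header.html"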
# https://www.w3.org/TR/NOTE-datetime
re_w3cdate = r"\d{4}-(?:0[1-9]|1[0-2])-(?:[0-2]\d|3[01])(T(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d([\+\-](?:0\d|1[0-2]):[0-5]\d)?)?"
COMMENT_BEGIN = "<!--"
COMMENT_END = "-->"
"""
************************************************************ GLOBALS ************************************************************
"""
glob_dependcies: list[str] = []
exit_codes = {
"FileNotFound": 2,
"MarkdownConversionError": 3,
}
error_levels = {
"light": 0,
"serious": 1,
"critical": 2,
}
exit_on_error_level = error_levels["serious"]
# url of the file that is currently being processed
current_file_url = ""
"""
************************************************************ UTILITY ************************************************************
"""
RED = '\033[91m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
BLUE = '\033[94m'
MAGENTA = '\033[95m'
CYAN = '\033[96m'
GRAY = '\033[97m'
RESET = '\033[0m'
BOLD = '\033[1m'
WHITE = '\033[37m'
DEBUG = False
def pdebug(*args, **keys):
fname, *_args = args
if DEBUG: print(f"{CYAN}{fname}{GRAY}", *_args, RESET, **keys)
TRACE = False
def ptrace(*args, **keys):
fname, *_args = args
if TRACE: print(f"{BLUE}{fname}{GRAY}", *_args, RESET, **keys)
def error(*args, level:int=exit_on_error_level, exit_code:int=1, **keys):
fname, *_args = args
if level >= exit_on_error_level:
print(f"{RED}ERROR: {fname}{RESET}", *_args, RESET, **keys)
exit(exit_code)
else:
print(f"{YELLOW}WARNING: {fname}{RESET}", *_args, RESET, **keys)
def line_is_link_to_path(line, path):
    # check if the line contains a link to the html file that is currently being processed
    # the anchor-tag pattern below is an assumption: the first group must capture the href
    match = re.search(r"<a [^>]*?href=(?:\"|')([^\"' ]+)(?:\"|')", line)
    if match:
        # get the filename from the href
        fname_match = re.match(r"[a-zA-Z0-9_\-]+\.html", match.groups()[0])
        if fname_match and fname_match.group() in path:
            return True
    return False
def pos2line(s: str, pos:int):
return s[:pos].count('\n') + 1
def generate_dependecy_file(filename:str, deps:list[str]):
line1 = f"{filename}:"
s = ""
for dep in deps:
line1 += f" {dep}"
s += f"{dep}:\n"
return line1 #+ "\n" + s
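# e.g. generate_dependecy_file("index.html", ["header.html", "nav.html"])
# returns the make-style rule "index.html: header.html nav.html"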
def evaluate_condition(input_string) -> bool:
words = re.split(r"(==|!=|&&|\|\|)", input_string.replace(" ", ""))
for i in range(len(words)):
if words[i] not in ["==", "!=", "&&", "||"]:
words[i] = '"' + words[i].replace('"', r'\"') + '"'
condition = "".join(words).replace("&&", " and ").replace("||", " or ")
ptrace("evaluate_conditon", f"Evaluating condition {condition}")
try:
return eval(condition)
except SyntaxError:
error("evaluate_conditon", f"Pythonized condition is invalid: {condition}", level=error_levels["light"])
return False
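# e.g. evaluate_condition("de == de || en == fr") -> True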
"""
************************************************************ SITEMAP ************************************************************
"""
class Sitemap:
urls:dict = {}
def __init__(self, url=None):
self.url = url
self.priority = None
self.changefreq = None
self.lastmod = None
def set_url(self, url):
self.url = url
def set_priority(self, priority):
try:
priority = float(priority)
except ValueError:
error("Sitemap.set_priority", f"invalid priority: '{priority}'", level=error_levels["serious"])
if not (type(priority) == float and 0.0 <= priority and priority <= 1.0):
error("Sitemap.set_priority", f"invalid priority: '{priority}'", level=error_levels["serious"])
self.priority = priority
def set_changefreq(self, changefreq):
if not (type(changefreq) == str and changefreq in ["always", "hourly", "daily", "weekly", "monthly", "yearly", "never"]):
error("Sitemap.set_changefreq", f"invalid changefreq: '{changefreq}'", level=error_levels["serious"])
self.changefreq = changefreq
def set_lastmod(self, lastmod):
if not (type(lastmod) == str and re.fullmatch(re_w3cdate, lastmod)):
error("Sitemap.set_lastmod", f"invalid lastmod: '{lastmod}'", level=error_levels["serious"])
self.lastmod = lastmod
def get_entry(self):
        # build a sitemaps.org <url> entry
        s = f"<url>\n\t<loc>{self.url}</loc>"
        if self.priority is not None: s += f"\n\t<priority>{self.priority}</priority>"
        if self.changefreq is not None: s += f"\n\t<changefreq>{self.changefreq}</changefreq>"
        if self.lastmod is not None: s += f"\n\t<lastmod>{self.lastmod}</lastmod>"
        s += "\n</url>"
return s
def __repr__(self) -> str:
return f"Sitemap(url={self.url}, priority={self.priority}, changefreq={self.changefreq}, lastmod={self.lastmod})"
@staticmethod
def gen_sidemap():
s = sitemap_begin
for url in Sitemap.urls.values():
s += "\t" + url.get_entry().replace("\n", "\n\t").strip("\t") + "\n"
s += sitemap_end
return s
@staticmethod
def cmd_sitemap(args:str, variables:dict[str,str]) -> str:
space = args.find(" ")
if space < 0:
space = len(args)
cmd = args[:space]
cmd_args = ""
if 0 < space and space < len(args) - 1:
cmd_args = args[space+1:].strip(" ")
pdebug("cmd_sitemap", f"cmd='{cmd}' cmd_args='{cmd_args}'")
if not current_file_url in Sitemap.urls:
Sitemap.urls[current_file_url] = Sitemap()
if cmd == "include":
if cmd_args:
Sitemap.urls[current_file_url].set_url(cmd_args)
else:
Sitemap.urls[current_file_url].set_url(current_file_url)
elif cmd == "priority":
Sitemap.urls[current_file_url].set_priority(cmd_args)
elif cmd == "changefreq":
Sitemap.urls[current_file_url].set_changefreq(cmd_args)
elif cmd == "lastmod":
Sitemap.urls[current_file_url].set_lastmod(cmd_args)
else:
error("cmd_sitemap", f"Invalid command '{cmd}'", error_levels["serious"])
ptrace("cmd_sitemap", f"Sitemap[{current_file_url}] is now: {Sitemap.urls[current_file_url]}")
return ""
"""
************************************************************ SIDENAV ************************************************************
"""
def replace_and_respect_indent(string, replace, replacement):
"""
    replace all occurrences of 'replace' with 'replacement', adding the whitespace in front of 'replace' to every line of 'replacement'
"""
i = string.find(replace)
while i >= 0:
line_begin = string.rfind("\n", 0, i) + 1
indent = string[line_begin:i]
        string = string[:i] + replacement.replace("\n", "\n" + indent) + string[i+len(replace):]
i = string.find(replace)
return string
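# e.g. replace_and_respect_indent("  #links", "#links", "a\nb") -> "  a\n  b"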
class Sidenav:
class Link:
def __init__(self, name: str, link: str):
self.link = link
self.name = name
def __repr__(self):
return f"Link: name={self.name}, link={self.link}"
def get(self):
return sidenav_content_link.replace("#name", self.name).replace("#link", self.link)
class Section:
def __init__(self, name: str):
self.name = name
self.links = []
def add_link(self, link):
self.links.append(link)
def __repr__(self):
return f"Section: name={self.name}"
def get(self):
links = "".join([ link.get() + "\n" for link in self.links ])
return replace_and_respect_indent(sidenav_content_section.replace("#name", self.name), "#links", links)
entries: list[Link|Section] = []
skip_next = False
custom_name = None
@staticmethod
def addEntry(name: str, link: str):
if Sidenav.skip_next:
            Sidenav.skip_next = False
return
if Sidenav.custom_name:
name = Sidenav.custom_name
Sidenav.custom_name = None
if len(Sidenav.entries) > 0 and type(Sidenav.entries[-1]) == Sidenav.Section:
Sidenav.entries[-1].add_link(Sidenav.Link(name, link))
else:
Sidenav.entries.append(Sidenav.Link(name, link))
@staticmethod
def addSection(name):
Sidenav.entries.append(Sidenav.Section(name))
@staticmethod
def setCustomName(name: str):
Sidenav.custom_name = name
@staticmethod
def skipNext():
Sidenav.skip_next = True
@staticmethod
def generate() -> str:
pdebug("Sidenav.generate", f"found the following entries: {Sidenav.entries}")
entries = "".join([entry.get() + "\n" for entry in Sidenav.entries])
return replace_and_respect_indent(sidenav_format, "#sidenav-content", entries)
@staticmethod
def cmd_sidenav(args:str, variables:dict[str,str]) -> str:
space = args.find(" ")
if space < 0:
space = len(args)
cmd = args[:space]
cmd_args = ""
if 0 < space and space < len(args) - 1:
cmd_args = args[space+1:].strip(" ")
pdebug("cmd_sidenav", f"cmd='{cmd}' cmd_args='{cmd_args}'")
if cmd == "skip":
Sidenav.skipNext()
elif cmd == "section":
Sidenav.addSection(cmd_args)
elif cmd == "name":
Sidenav.setCustomName(cmd_args)
elif cmd == "custom":
match = re.fullmatch(re_sidenav_custom, cmd_args)
if match:
Sidenav.addEntry(match.groups()[1], match.groups()[0])
else:
error("cmd_sidenav", f"Invalid argument for command 'custom': '{cmd_args}'", level=error_levels["light"])
elif cmd == "include":
return Sidenav.generate()
else:
error("cmd_sidenav", f"Invalid command: '{cmd}'", level=error_levels["light"])
return ""
"""
************************************************************ COMMANDS ************************************************************
All these commands take one argument with trimmed whitespace.
The argument may be anything.
They all need to return a string, which will be placed
into the source file at the position where the command was.
"""
def cmd_include(args: str, variables:dict[str, str]={}) -> str:
args = args.split(' ')
pdebug("cmd_include", f"args='{args}', variables='{variables}'")
filename = args[0]
content = ""
try:
with open(filename) as file:
content = file.read()
if len(args) > 1: # if section was specified
target_section = args[1]
p = HTMLParser(content, {})
p.pos["start"] = p.pos["end"] = -1
while p.i < len(p): # at start of new line or end of comment
p.find_line_end()
ptrace("cmd_include", f"Processing at i={p.i} in line {pos2line(p.file, p.i)}: '{p[p.i:p.pos['line_end']]}'")
if not p.find_comment_begin(): continue
if not p.find_comment_end(): continue
p.replace_multiline_comments()
match = p.find_command()
if match:
command = match.groups()[0]
cmd_args = match.groups()[1].replace('\t', ' ').strip(' ')
pdebug("cmd_include", f"Found command '{command}' with args '{cmd_args}'")
if command == "section":
if cmd_args.startswith(target_section):
p.pos["start"] = max(p.pos["cmt_end"] + len(COMMENT_END), p.pos["line_end"] + 1)
elif p.pos["start"] >= 0: #end
p.pos["end"] = max(p.pos["cmt_end"] + len(COMMENT_END), p.pos["line_end"] + 1)
# p.pos["end"] = p.pos["cmt_beg"]
p.replace_command_with_output("")
p.command_end() # remove the command (+comment)
if p.pos["start"] >= 0 and p.pos["end"] > 0: break
continue
# section cmd in multiline comment is not supported, so simply jump to end of comment
p.i = p.pos["cmt_end"] + len(COMMENT_END)
p.pos["cmt_beg"] = -1
p.pos["cmd_beg"] = -1
p.pos["cmt_end"] = -1
p.pos["cmd_end"] = -1
if p.pos["start"] >= 0:
if p.pos["end"] < 0:
p.pos["end"] = len(p)
content = p[p.pos["start"]:p.pos["end"]]
else:
error("cmd_include", f"Could not find section {target_section} in file {filename}")
except FileNotFoundError:
error("cmd_include", f"Could not open file '{filename}'", level=error_levels["serious"], exit_code=exit_codes["FileNotFound"])
content = f""
if filename.endswith(".md"):
try:
import mdtex2html as m2h # this package also converts tex to MathML
content = m2h.convert(content, extensions=["extra"])
        except Exception:
error("cmd_include", f"mdtex2html could not be imported, falling back to python-markdown for md to html conversion", level=error_levels["light"], exit_code=exit_codes["MarkdownConversionError"])
try:
from markdown import markdown
content = markdown(content, output_format="xhtml", extensions=["extra"])
            except Exception:
                error("cmd_include", f"Could not convert markdown to html for file '{filename}'. Is python-markdown installed?", level=error_levels["critical"], exit_code=exit_codes["MarkdownConversionError"])
                content = f"<!-- ERROR: could not convert markdown file '{filename}' -->"
glob_dependcies.append(filename)
return content
def cmd_section(args: str, variables:dict[str, str]={}) -> str:
return ""
def cmd_return(args: str, variables:dict[str, str]={}) -> str:
# re_set_map = r"([a-zA-Z0-9_]+)\?\{(([a-zA-Z0-9_]+:.+,)*([a-zA-Z0-9_]+:.+))\}"
#
space = args.find(' ')
pdebug("cmd_set", f"varname='{args[:space]}, 'arg='{args[space+1:]}', variables='{variables}'")
if not (space > 0 and space < len(args)-1):
variables[args] = ""
pdebug("cmd_set", f"Setting to empty string: {args}")
else:
varname = args[:space]
variables[varname] = ""
# check if map assignment with either , or ;
separator = ','
match = re.fullmatch(re_set_map, args[space+1:].strip(' '))
if not match:
match = re.fullmatch(re_set_map_alt, args[space+1:].strip(' '))
separator = ';'
if match:
pdebug("cmd_set", f"Map {match.group()}")
depends = match.groups()[0]
if not depends in variables:
pdebug("cmd_set", f"Setting from map, but depends='{depends}' is not in variables")
return ""
depends_val = variables[depends]
for option in match.groups()[1].split(separator):
option = option.strip(" ")
pdebug("cmd_set", f"Found option {option}")
colon = option.find(':') # we will find one, regex guarantees
if option[:colon].strip(" ") == depends_val or option[:colon].strip(" ") == "*":
variables[varname] = option[colon+1:].strip(" ")
        else: # simple assignment
value = args[space+1:].strip(" ")
variables[varname] = value
pdebug("cmd_set", f"Assignment {varname} -> {value}")
return variables[varname]
return ""
def cmd_set(args: str, variables:dict[str, str]={}) -> str:
cmd_return(args, variables)
return ""
def cmd_unset(args: str, variables:dict[str, str]={}) -> str:
variable = args.strip(' ')
if variable not in variables:
pdebug("cmd_unset", f"variable '{variable}' is not set", level=error_levels["light"])
else:
variables.pop(variable)
return ""
def cmd_default(args: str, variables:dict[str, str]={}) -> str:
separator = args.find(' ')
if args[:separator] not in variables:
cmd_return(args, variables)
return ""
def cmd_comment(args: str, variables:dict[str, str]={}) -> str:
return f""
def cmd_uncomment(args: str, variables:dict[str, str]={}) -> str:
return args
def cmd_error(args: str, variables:dict[str, str]={}) -> str:
error("cmd_error", f"Encounted 'error' command: {args}", level=error_levels["critical"])
return ""
def cmd_warning(args: str, variables:dict[str, str]={}) -> str:
error("cmd_warning", f"Encounted 'warning' command: {args}", level=error_levels["light"])
return ""
command2function:dict[str, Callable[[str, dict[str,str]], str]] = {
"include": cmd_include,
"section": cmd_section,
"return": cmd_return,
"set": cmd_set,
"unset": cmd_unset,
"default": cmd_default,
"comment": cmd_comment,
"uncomment": cmd_uncomment,
"sidenav": Sidenav.cmd_sidenav,
"sitemap": Sitemap.cmd_sitemap,
"warning": cmd_warning,
"error": cmd_error,
}
"""
************************************************************ PARSING ************************************************************
"""
class Parser():
"""
General purpose parser class
It has states and positions in a text, which are updated when portions of the text are replaced or removed
"""
def __init__(self, file):
self.file = file
self.pos: dict[str, int] = {}
self.state: dict[str, bool] = {}
def remove(self, start, stop, ignore_bounds=[]):
"""remove range [start, stop) of text and update positions"""
delete_length = stop - start
nl, esl = "\n", "\\n"
ptrace("Parser.remove", f"Deleting range [{start}, {stop}) of length {delete_length}: '{self.file[start:stop].replace(nl, esl)}'")
assert(stop >= start)
assert(stop <= len(self.file))
self.file = self.file[:start] + self.file[stop:]
for k,pos in self.pos.items():
if pos >= stop: self.pos[k] -= delete_length
elif pos > start and not k in ignore_bounds: error("Parser.remove", f"Position {k}={pos} within deleted range [{start},{stop})", level=error_levels["light"])
def replace(self, start, stop, replacement, ignore_bounds=[]):
assert(stop >= start)
assert(stop <= len(self.file))
ptrace("Parser.replace", f"Replacing range [{start}, {stop}): '{self.file[start:stop]}' with '{replacement}'")
self.file = self.file[:start] + replacement + self.file[stop:]
length_difference = stop - start - len(replacement)
for k,pos in self.pos.items():
if pos >= stop: self.pos[k] -= length_difference
elif pos > start and k not in ignore_bounds: error("Parser.replace", f"Position {k}={pos} within replaced range [{start},{stop})", level=error_levels["light"])
def __getitem__(self, key):
return self.file[key]
def __len__(self):
return len(self.file)
class HTMLParser(Parser):
"""
Parse a html file
    Each function operates on the position indicated by i, up to the position "line_end"
"""
def __init__(self, file, variables:dict[str, str], remove_comments=False):
super().__init__(file)
self.i = 0
self.variables = variables
self.pos["cmt_beg"] = -1
self.pos["cmt_end"] = -1
self.pos["cmd_beg"] = -1
self.pos["cmd_end"] = -1
self.pos["line_end"] = -1
self.pos["conditional_block_beg"] = -1 # char pos of the first char of the last block, if waiting for elif, else or endif
self.state["cmd_in_cmt"] = False
self.state["last_condition"] = False # if the last if condition was true
self.remove_comments = remove_comments
def use_variables(self):
"""replace variable usages in the current line"""
self.replace(self.i, self.pos["line_end"], substitute_variables(self[self.i:self.pos["line_end"]], self.variables))
ptrace("HTMLParser.use_variables", f"Line after variable substitution:", self.file[self.i:self.pos["line_end"]])
def add_sidenav_headings(self):
"""check if heading for sidenav in line"""
match = re.search(re_sidenav_heading, self[self.i:self.pos["line_end"]])
if match:
Sidenav.addEntry(match.groups()[1], f"#{match.groups()[0]}")
ptrace("HTMLParser.add_sidenav_headings:", f"Found heading with id:", match.groups())
def get_leading_whitespaces(self):
"""returns the whitespaces at the start of the line"""
# find last newline
line_beg = self.file.rfind("\n", 0, self.i)
if line_beg < 0: line_beg = 0
else: line_beg += 1 # start after newline
match = re.match(r"^([ \t]*)", self.file[line_beg:self.pos['line_end']])
if not match: return ""
else: return match.groups()[0]
# Parsing functions
def find_line_end(self):
"""
line_end -> position of next newline char or EOF
"""
self.pos["line_end"] = self.file.find('\n', self.i+1)
if self.pos["line_end"] < 0: self.pos["line_end"] = len(self)
def find_comment_begin(self) -> bool:
"""
find the beginning of a comment in the current line
if comment begin was found, jump into the comment, return True
cmt_beg -> beginning of COMMENT_BEGIN
i -> first character after COMMENT_BEGIN / line_end + 1
"""
# look for comment begin
if self.pos["cmt_beg"] < 0: # if not in comment, find next comment
self.pos["cmt_beg"] = self.file.find(COMMENT_BEGIN, self.i, self.pos["line_end"])
if self.pos["cmt_beg"] < 0:
self.i = self.pos["line_end"] + 1
return False
else:
# jump to comment_begin
old_i = self.i
self.i = self.pos["cmt_beg"] + len(COMMENT_BEGIN) # after comment begin
ptrace(f"HTMLParser.find_comment_begin", f"Found comment begin, jumping from pos {old_i} to {self.i}")
return True
return True # still in previous comment
def find_comment_end(self):
"""
        call this after find_comment_begin returns True to update cmt_end
        the caller should skip (continue) the current iteration when this returns False
cmt_end -> beginning of COMMENT_END / ---
cmt_beg -> --- / -1 when invalid comment
"""
# in comment, i at the character after COMMENT_BEGIN
self.pos["cmt_end"] = self.file.find(COMMENT_END, self.i) #, self.pos["line_end"])
# sanity checks
if self.pos["cmt_end"] < 0:
error("HTMLParser.find_comment_end", f"Comment starting in line {pos2line(self.file, self.pos['cmt_beg'])} is never ended.", level=error_levels["serious"])
return False
else:
tmp_next_begin = self.file.find(COMMENT_BEGIN, self.i)
if 0 < tmp_next_begin and tmp_next_begin < self.pos["cmt_end"]:
error("HTMLParser.find_comment_end", f"Found next comment begin before the comment starting in line {pos2line(self.file, self.pos['cmt_beg'])} is ended! Skipping comment. Comment without proper closing tags: '{self.file[self.i:self.pos['line_end']]}'", level=error_levels["light"])
self.pos["cmt_beg"] = -1
return False
return True
def replace_multiline_comments(self):
"""
if in a multiline comment, turn every line into a separate comment
"""
# not a multiline comment
if self.pos["line_end"] > self.pos["cmt_end"]: return
indent = self.get_leading_whitespaces()
self.replace(self.pos["cmt_beg"], self.pos["cmt_end"], self.file[self.pos["cmt_beg"]:self.pos["cmt_end"]].replace("\n", "-->\n" + indent + "