#!/bin/python3
import os
from os import path
import re
from sys import argv
from collections.abc import Callable
import argparse
import pickle
"""
TODO:
- more testing
- reintroduce the nav_selected class on nav feature
"""
"""
************************************************************ SETTINGS ************************************************************
"""
# minimal default templates; #sidenav-content, #link, #name and #links are placeholders filled in by Sidenav
sidenav_format = """\
<ul class="sidenav">
    #sidenav-content
</ul>
"""
sidenav_content_link = '<li><a href="#link">#name</a></li>'
sidenav_content_section = """\
<li class="sidenav-section">#name</li>
#links
"""
exit_on_include_failure = False
sitemap_begin = """\
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
"""
sitemap_end = "</urlset>"
"""
************************************************************ REGULAR EXPRESSIONS ************************************************************
"""
# SIDENAV
# heading with an id attribute, e.g. <h2 id="about">About</h2>
re_sidenav_heading = r"<h\d[^>]* id=(?:\"|')([a-zA-Z0-9_\-]+)(?:\"|')[^>]*>(.+)</h\d>"
# custom entry
re_sidenav_custom = r"href=(?:\"|\')([^\"\' ]+)(?:\"|\') +name=(?:\"|\')(.+)(?:\"|\')"
# map assignment, comma-separated values
re_set_map = r"([a-zA-Z0-9_]+) *\? *\{( *(?:[a-zA-Z0-9_*]+ *: *[^,]*, *)+[a-zA-Z0-9_*]+ *: *[^,]*) *,? *\}"
# map assignment, semicolon-separated values
re_set_map_alt = r"([a-zA-Z0-9_]+) *\? *\{( *(?:[a-zA-Z0-9_*]+ *: *[^;]* *; *)+[a-zA-Z0-9_*]+ *: *[^;]*) *;? *\}"
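# e.g. "lang ? {*: en, de: de, en: en}" matches re_set_map; the semicolon variant exists
# so that the mapped values may themselves contain commas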
""" #$(myvar) """
re_variable_use = r"#\$\(([a-zA-Z0-9_]+)\)"
""" only in comments """
re_preprocessor_command = r"[\t ]*#([a-zA-Z]+) *(.*)[\t ]*"
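# e.g. "#include header.html" -> command "include", args "header.html"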
# https://www.w3.org/TR/NOTE-datetime
re_w3cdate = r"\d{4}-(?:0[1-9]|1[0-2])-(?:[0-2]\d|3[01])(T(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d([\+\-](?:0\d|1[0-2]):[0-5]\d)?)?"
COMMENT_BEGIN = "<!--"
COMMENT_END = "-->"
"""
************************************************************ GLOBALS ************************************************************
"""
glob_dependcies: list[str] = []
exit_codes = {
"FileNotFound": 2,
"MarkdownConversionError": 3,
}
error_levels = {
"light": 0,
"serious": 1,
"critical": 2,
}
exit_on_error_level = error_levels["serious"]
# url of the file that is currently being processed
current_file_url = ""
"""
************************************************************ UTILITY ************************************************************
"""
RED = '\033[91m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
BLUE = '\033[94m'
MAGENTA = '\033[95m'
CYAN = '\033[96m'
GRAY = '\033[97m'
RESET = '\033[0m'
BOLD = '\033[1m'
WHITE = '\033[37m'
DEBUG = False
def pdebug(*args, **keys):
fname, *_args = args
if DEBUG: print(f"{CYAN}{fname}{GRAY}", *_args, RESET, **keys)
TRACE = False
def ptrace(*args, **keys):
fname, *_args = args
if TRACE: print(f"{BLUE}{fname}{GRAY}", *_args, RESET, **keys)
def error(*args, level:int=exit_on_error_level, exit_code:int=1, **keys):
fname, *_args = args
if level >= exit_on_error_level:
print(f"{RED}ERROR: {fname}{RESET}", *_args, RESET, **keys)
exit(exit_code)
else:
print(f"{YELLOW}WARNING: {fname}{RESET}", *_args, RESET, **keys)
def line_is_link_to_path(line, path):
    # check if the line contains a link to the html file that is currently being processed
    # the anchor-tag pattern below is an assumption: the first group must capture the href
    match = re.search(r"<a [^>]*?href=(?:\"|')([^\"' ]+)(?:\"|')", line)
    if match:
        # get the filename from the href
        fname_match = re.match(r"[a-zA-Z0-9_\-]+\.html", match.groups()[0])
        if fname_match and fname_match.group() in path:
            return True
    return False
def pos2line(s: str, pos:int):
return s[:pos].count('\n') + 1
def generate_dependecy_file(filename:str, deps:list[str]):
line1 = f"{filename}:"
s = ""
for dep in deps:
line1 += f" {dep}"
s += f"{dep}:\n"
return line1 #+ "\n" + s
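# e.g. generate_dependecy_file("index.html", ["header.html", "nav.html"])
# returns the make-style rule "index.html: header.html nav.html"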
def evaluate_condition(input_string) -> bool:
words = re.split(r"(==|!=|&&|\|\|)", input_string.replace(" ", ""))
for i in range(len(words)):
if words[i] not in ["==", "!=", "&&", "||"]:
words[i] = '"' + words[i].replace('"', r'\"') + '"'
condition = "".join(words).replace("&&", " and ").replace("||", " or ")
ptrace("evaluate_conditon", f"Evaluating condition {condition}")
try:
return eval(condition)
except SyntaxError:
error("evaluate_conditon", f"Pythonized condition is invalid: {condition}", level=error_levels["light"])
return False
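# e.g. evaluate_condition("de == de || en == fr") -> True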
"""
************************************************************ SITEMAP ************************************************************
"""
class Sitemap:
urls:dict = {}
def __init__(self, url=None):
self.url = url
self.priority = None
self.changefreq = None
self.lastmod = None
def set_url(self, url):
self.url = url
def set_priority(self, priority):
try:
priority = float(priority)
except ValueError:
error("Sitemap.set_priority", f"invalid priority: '{priority}'", level=error_levels["serious"])
if not (type(priority) == float and 0.0 <= priority and priority <= 1.0):
error("Sitemap.set_priority", f"invalid priority: '{priority}'", level=error_levels["serious"])
self.priority = priority
def set_changefreq(self, changefreq):
if not (type(changefreq) == str and changefreq in ["always", "hourly", "daily", "weekly", "monthly", "yearly", "never"]):
error("Sitemap.set_changefreq", f"invalid changefreq: '{changefreq}'", level=error_levels["serious"])
self.changefreq = changefreq
def set_lastmod(self, lastmod):
if not (type(lastmod) == str and re.fullmatch(re_w3cdate, lastmod)):
error("Sitemap.set_lastmod", f"invalid lastmod: '{lastmod}'", level=error_levels["serious"])
self.lastmod = lastmod
def get_entry(self):
        # build a sitemaps.org <url> entry
        s = f"<url>\n\t<loc>{self.url}</loc>"
        if self.priority is not None: s += f"\n\t<priority>{self.priority}</priority>"
        if self.changefreq is not None: s += f"\n\t<changefreq>{self.changefreq}</changefreq>"
        if self.lastmod is not None: s += f"\n\t<lastmod>{self.lastmod}</lastmod>"
        s += "\n</url>"
return s
def __repr__(self) -> str:
return f"Sitemap(url={self.url}, priority={self.priority}, changefreq={self.changefreq}, lastmod={self.lastmod})"
@staticmethod
def gen_sidemap():
s = sitemap_begin
for url in Sitemap.urls.values():
s += "\t" + url.get_entry().replace("\n", "\n\t").strip("\t") + "\n"
s += sitemap_end
return s
@staticmethod
def cmd_sitemap(args:str, variables:dict[str,str]) -> str:
space = args.find(" ")
if space < 0:
space = len(args)
cmd = args[:space]
cmd_args = ""
if 0 < space and space < len(args) - 1:
cmd_args = args[space+1:].strip(" ")
pdebug("cmd_sitemap", f"cmd='{cmd}' cmd_args='{cmd_args}'")
if not current_file_url in Sitemap.urls:
Sitemap.urls[current_file_url] = Sitemap()
if cmd == "include":
if cmd_args:
Sitemap.urls[current_file_url].set_url(cmd_args)
else:
Sitemap.urls[current_file_url].set_url(current_file_url)
elif cmd == "priority":
Sitemap.urls[current_file_url].set_priority(cmd_args)
elif cmd == "changefreq":
Sitemap.urls[current_file_url].set_changefreq(cmd_args)
elif cmd == "lastmod":
Sitemap.urls[current_file_url].set_lastmod(cmd_args)
else:
error("cmd_sitemap", f"Invalid command '{cmd}'", error_levels["serious"])
ptrace("cmd_sitemap", f"Sitemap[{current_file_url}] is now: {Sitemap.urls[current_file_url]}")
return ""
"""
************************************************************ SIDENAV ************************************************************
"""
def replace_and_respect_indent(string, replace, replacement):
"""
    replace all occurrences of 'replace' with 'replacement', adding the whitespace in front of 'replace' to every line of 'replacement'
"""
i = string.find(replace)
while i >= 0:
line_begin = string.rfind("\n", 0, i) + 1
indent = string[line_begin:i]
        string = string[:i] + replacement.replace("\n", "\n" + indent) + string[i+len(replace):]
i = string.find(replace)
return string
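# e.g. replace_and_respect_indent("  #links", "#links", "a\nb") -> "  a\n  b"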
class Sidenav:
class Link:
def __init__(self, name: str, link: str):
self.link = link
self.name = name
def __repr__(self):
return f"Link: name={self.name}, link={self.link}"
def get(self):
return sidenav_content_link.replace("#name", self.name).replace("#link", self.link)
class Section:
def __init__(self, name: str):
self.name = name
self.links = []
def add_link(self, link):
self.links.append(link)
def __repr__(self):
return f"Section: name={self.name}"
def get(self):
links = "".join([ link.get() + "\n" for link in self.links ])
return replace_and_respect_indent(sidenav_content_section.replace("#name", self.name), "#links", links)
entries: list[Link|Section] = []
skip_next = False
custom_name = None
@staticmethod
def addEntry(name: str, link: str):
if Sidenav.skip_next:
            Sidenav.skip_next = False
return
if Sidenav.custom_name:
name = Sidenav.custom_name
Sidenav.custom_name = None
if len(Sidenav.entries) > 0 and type(Sidenav.entries[-1]) == Sidenav.Section:
Sidenav.entries[-1].add_link(Sidenav.Link(name, link))
else:
Sidenav.entries.append(Sidenav.Link(name, link))
@staticmethod
def addSection(name):
Sidenav.entries.append(Sidenav.Section(name))
@staticmethod
def setCustomName(name: str):
Sidenav.custom_name = name
@staticmethod
def skipNext():
Sidenav.skip_next = True
@staticmethod
def generate() -> str:
pdebug("Sidenav.generate", f"found the following entries: {Sidenav.entries}")
entries = "".join([entry.get() + "\n" for entry in Sidenav.entries])
return replace_and_respect_indent(sidenav_format, "#sidenav-content", entries)
@staticmethod
def cmd_sidenav(args:str, variables:dict[str,str]) -> str:
space = args.find(" ")
if space < 0:
space = len(args)
cmd = args[:space]
cmd_args = ""
if 0 < space and space < len(args) - 1:
cmd_args = args[space+1:].strip(" ")
pdebug("cmd_sidenav", f"cmd='{cmd}' cmd_args='{cmd_args}'")
if cmd == "skip":
Sidenav.skipNext()
elif cmd == "section":
Sidenav.addSection(cmd_args)
elif cmd == "name":
Sidenav.setCustomName(cmd_args)
elif cmd == "custom":
match = re.fullmatch(re_sidenav_custom, cmd_args)
if match:
Sidenav.addEntry(match.groups()[1], match.groups()[0])
else:
error("cmd_sidenav", f"Invalid argument for command 'custom': '{cmd_args}'", level=error_levels["light"])
elif cmd == "include":
return Sidenav.generate()
else:
error("cmd_sidenav", f"Invalid command: '{cmd}'", level=error_levels["light"])
return ""
"""
************************************************************ COMMANDS ************************************************************
All these commands take one argument with trimmed whitespace.
The argument may be anything.
They all need to return a string, which will be placed
into the source file at the position where the command was.
"""
def cmd_include(args: str, variables:dict[str, str]={}) -> str:
args = args.split(' ')
pdebug("cmd_include", f"args='{args}', variables='{variables}'")
filename = args[0]
content = ""
try:
with open(filename) as file:
content = file.read()
if len(args) > 1: # if section was specified
target_section = args[1]
p = HTMLParser(content, {})
p.pos["start"] = p.pos["end"] = -1
while p.i < len(p): # at start of new line or end of comment
p.find_line_end()
ptrace("cmd_include", f"Processing at i={p.i} in line {pos2line(p.file, p.i)}: '{p[p.i:p.pos['line_end']]}'")
if not p.find_comment_begin(): continue
if not p.find_comment_end(): continue
p.replace_multiline_comments()
match = p.find_command()
if match:
command = match.groups()[0]
cmd_args = match.groups()[1].replace('\t', ' ').strip(' ')
pdebug("cmd_include", f"Found command '{command}' with args '{cmd_args}'")
if command == "section":
if cmd_args.startswith(target_section):
p.pos["start"] = max(p.pos["cmt_end"] + len(COMMENT_END), p.pos["line_end"] + 1)
elif p.pos["start"] >= 0: #end
p.pos["end"] = max(p.pos["cmt_end"] + len(COMMENT_END), p.pos["line_end"] + 1)
# p.pos["end"] = p.pos["cmt_beg"]
p.replace_command_with_output("")
p.command_end() # remove the command (+comment)
if p.pos["start"] >= 0 and p.pos["end"] > 0: break
continue
# section cmd in multiline comment is not supported, so simply jump to end of comment
p.i = p.pos["cmt_end"] + len(COMMENT_END)
p.pos["cmt_beg"] = -1
p.pos["cmd_beg"] = -1
p.pos["cmt_end"] = -1
p.pos["cmd_end"] = -1
if p.pos["start"] >= 0:
if p.pos["end"] < 0:
p.pos["end"] = len(p)
content = p[p.pos["start"]:p.pos["end"]]
else:
error("cmd_include", f"Could not find section {target_section} in file {filename}")
except FileNotFoundError:
error("cmd_include", f"Could not open file '{filename}'", level=error_levels["serious"], exit_code=exit_codes["FileNotFound"])
content = f""
if filename.endswith(".md"):
try:
import mdtex2html as m2h # this package also converts tex to MathML
content = m2h.convert(content, extensions=["extra"])
        except Exception:
error("cmd_include", f"mdtex2html could not be imported, falling back to python-markdown for md to html conversion", level=error_levels["light"], exit_code=exit_codes["MarkdownConversionError"])
try:
from markdown import markdown
content = markdown(content, output_format="xhtml", extensions=["extra"])
            except Exception:
                error("cmd_include", f"Could not convert markdown to html for file '{filename}'. Is python-markdown installed?", level=error_levels["critical"], exit_code=exit_codes["MarkdownConversionError"])
                content = f"<!-- ERROR: could not convert markdown file '{filename}' -->"
glob_dependcies.append(filename)
return content
def cmd_section(args: str, variables:dict[str, str]={}) -> str:
return ""
def cmd_return(args: str, variables:dict[str, str]={}) -> str:
# re_set_map = r"([a-zA-Z0-9_]+)\?\{(([a-zA-Z0-9_]+:.+,)*([a-zA-Z0-9_]+:.+))\}"
#
space = args.find(' ')
pdebug("cmd_set", f"varname='{args[:space]}, 'arg='{args[space+1:]}', variables='{variables}'")
if not (space > 0 and space < len(args)-1):
variables[args] = ""
pdebug("cmd_set", f"Setting to empty string: {args}")
else:
varname = args[:space]
variables[varname] = ""
# check if map assignment with either , or ;
separator = ','
match = re.fullmatch(re_set_map, args[space+1:].strip(' '))
if not match:
match = re.fullmatch(re_set_map_alt, args[space+1:].strip(' '))
separator = ';'
if match:
pdebug("cmd_set", f"Map {match.group()}")
depends = match.groups()[0]
if not depends in variables:
pdebug("cmd_set", f"Setting from map, but depends='{depends}' is not in variables")
return ""
depends_val = variables[depends]
for option in match.groups()[1].split(separator):
option = option.strip(" ")
pdebug("cmd_set", f"Found option {option}")
colon = option.find(':') # we will find one, regex guarantees
if option[:colon].strip(" ") == depends_val or option[:colon].strip(" ") == "*":
variables[varname] = option[colon+1:].strip(" ")
        else: # simple assignment
value = args[space+1:].strip(" ")
variables[varname] = value
pdebug("cmd_set", f"Assignment {varname} -> {value}")
return variables[varname]
return ""
def cmd_set(args: str, variables:dict[str, str]={}) -> str:
cmd_return(args, variables)
return ""
def cmd_unset(args: str, variables:dict[str, str]={}) -> str:
variable = args.strip(' ')
if variable not in variables:
pdebug("cmd_unset", f"variable '{variable}' is not set", level=error_levels["light"])
else:
variables.pop(variable)
return ""
def cmd_default(args: str, variables:dict[str, str]={}) -> str:
separator = args.find(' ')
if args[:separator] not in variables:
cmd_return(args, variables)
return ""
def cmd_comment(args: str, variables:dict[str, str]={}) -> str:
return f""
def cmd_uncomment(args: str, variables:dict[str, str]={}) -> str:
return args
def cmd_error(args: str, variables:dict[str, str]={}) -> str:
error("cmd_error", f"Encounted 'error' command: {args}", level=error_levels["critical"])
return ""
def cmd_warning(args: str, variables:dict[str, str]={}) -> str:
error("cmd_warning", f"Encounted 'warning' command: {args}", level=error_levels["light"])
return ""
command2function:dict[str, Callable[[str, dict[str,str]], str]] = {
"include": cmd_include,
"section": cmd_section,
"return": cmd_return,
"set": cmd_set,
"unset": cmd_unset,
"default": cmd_default,
"comment": cmd_comment,
"uncomment": cmd_uncomment,
"sidenav": Sidenav.cmd_sidenav,
"sitemap": Sitemap.cmd_sitemap,
"warning": cmd_warning,
"error": cmd_error,
}
"""
************************************************************ PARSING ************************************************************
"""
class Parser():
"""
General purpose parser class
It has states and positions in a text, which are updated when portions of the text are replaced or removed
"""
def __init__(self, file):
self.file = file
self.pos: dict[str, int] = {}
self.state: dict[str, bool] = {}
def remove(self, start, stop, ignore_bounds=[]):
"""remove range [start, stop) of text and update positions"""
delete_length = stop - start
nl, esl = "\n", "\\n"
ptrace("Parser.remove", f"Deleting range [{start}, {stop}) of length {delete_length}: '{self.file[start:stop].replace(nl, esl)}'")
assert(stop >= start)
assert(stop <= len(self.file))
self.file = self.file[:start] + self.file[stop:]
for k,pos in self.pos.items():
if pos >= stop: self.pos[k] -= delete_length
elif pos > start and not k in ignore_bounds: error("Parser.remove", f"Position {k}={pos} within deleted range [{start},{stop})", level=error_levels["light"])
def replace(self, start, stop, replacement, ignore_bounds=[]):
assert(stop >= start)
assert(stop <= len(self.file))
ptrace("Parser.replace", f"Replacing range [{start}, {stop}): '{self.file[start:stop]}' with '{replacement}'")
self.file = self.file[:start] + replacement + self.file[stop:]
length_difference = stop - start - len(replacement)
for k,pos in self.pos.items():
if pos >= stop: self.pos[k] -= length_difference
elif pos > start and k not in ignore_bounds: error("Parser.replace", f"Position {k}={pos} within replaced range [{start},{stop})", level=error_levels["light"])
def __getitem__(self, key):
return self.file[key]
def __len__(self):
return len(self.file)
class HTMLParser(Parser):
"""
Parse a html file
    Each function operates on the position indicated by i, up to the position "line_end"
"""
def __init__(self, file, variables:dict[str, str], remove_comments=False):
super().__init__(file)
self.i = 0
self.variables = variables
self.pos["cmt_beg"] = -1
self.pos["cmt_end"] = -1
self.pos["cmd_beg"] = -1
self.pos["cmd_end"] = -1
self.pos["line_end"] = -1
self.pos["conditional_block_beg"] = -1 # char pos of the first char of the last block, if waiting for elif, else or endif
self.state["cmd_in_cmt"] = False
self.state["last_condition"] = False # if the last if condition was true
self.remove_comments = remove_comments
def use_variables(self):
"""replace variable usages in the current line"""
self.replace(self.i, self.pos["line_end"], substitute_variables(self[self.i:self.pos["line_end"]], self.variables))
ptrace("HTMLParser.use_variables", f"Line after variable substitution:", self.file[self.i:self.pos["line_end"]])
def add_sidenav_headings(self):
"""check if heading for sidenav in line"""
match = re.search(re_sidenav_heading, self[self.i:self.pos["line_end"]])
if match:
Sidenav.addEntry(match.groups()[1], f"#{match.groups()[0]}")
ptrace("HTMLParser.add_sidenav_headings:", f"Found heading with id:", match.groups())
def get_leading_whitespaces(self):
"""returns the whitespaces at the start of the line"""
# find last newline
line_beg = self.file.rfind("\n", 0, self.i)
if line_beg < 0: line_beg = 0
else: line_beg += 1 # start after newline
match = re.match(r"^([ \t]*)", self.file[line_beg:self.pos['line_end']])
if not match: return ""
else: return match.groups()[0]
# Parsing functions
def find_line_end(self):
"""
line_end -> position of next newline char or EOF
"""
self.pos["line_end"] = self.file.find('\n', self.i+1)
if self.pos["line_end"] < 0: self.pos["line_end"] = len(self)
def find_comment_begin(self) -> bool:
"""
find the beginning of a comment in the current line
if comment begin was found, jump into the comment, return True
cmt_beg -> beginning of COMMENT_BEGIN
i -> first character after COMMENT_BEGIN / line_end + 1
"""
# look for comment begin
if self.pos["cmt_beg"] < 0: # if not in comment, find next comment
self.pos["cmt_beg"] = self.file.find(COMMENT_BEGIN, self.i, self.pos["line_end"])
if self.pos["cmt_beg"] < 0:
self.i = self.pos["line_end"] + 1
return False
else:
# jump to comment_begin
old_i = self.i
self.i = self.pos["cmt_beg"] + len(COMMENT_BEGIN) # after comment begin
ptrace(f"HTMLParser.find_comment_begin", f"Found comment begin, jumping from pos {old_i} to {self.i}")
return True
return True # still in previous comment
def find_comment_end(self):
"""
        call this after find_comment_begin returns True to update cmt_end
        the caller should skip (continue) the current iteration when this returns False
cmt_end -> beginning of COMMENT_END / ---
cmt_beg -> --- / -1 when invalid comment
"""
# in comment, i at the character after COMMENT_BEGIN
self.pos["cmt_end"] = self.file.find(COMMENT_END, self.i) #, self.pos["line_end"])
# sanity checks
if self.pos["cmt_end"] < 0:
error("HTMLParser.find_comment_end", f"Comment starting in line {pos2line(self.file, self.pos['cmt_beg'])} is never ended.", level=error_levels["serious"])
return False
else:
tmp_next_begin = self.file.find(COMMENT_BEGIN, self.i)
if 0 < tmp_next_begin and tmp_next_begin < self.pos["cmt_end"]:
error("HTMLParser.find_comment_end", f"Found next comment begin before the comment starting in line {pos2line(self.file, self.pos['cmt_beg'])} is ended! Skipping comment. Comment without proper closing tags: '{self.file[self.i:self.pos['line_end']]}'", level=error_levels["light"])
self.pos["cmt_beg"] = -1
return False
return True
def replace_multiline_comments(self):
"""
if in a multiline comment, turn every line into a separate comment
"""
# not a multiline comment
if self.pos["line_end"] > self.pos["cmt_end"]: return
indent = self.get_leading_whitespaces()
self.replace(self.pos["cmt_beg"], self.pos["cmt_end"], self.file[self.pos["cmt_beg"]:self.pos["cmt_end"]].replace("\n", "-->\n" + indent + "