bUwUma/html-preprocessor

#!/bin/python3
import os
import re
from sys import argv
from collections.abc import Callable

"""
TODO:
- more testing
- reintroduce the nav_selected class on nav feature
"""
"""
************************************************************ SETTINGS ************************************************************
"""
# html skeleton of the side navigation. Sidenav.generate() removes the line
# containing "#sidenav-content" and inserts one line per collected entry there.
sidenav_format = """\
    <div class="sidenav">
    <ul>
        <li class="menudrop">&#9776;</li>
        #sidenav-content
    </ul>
    </div>
    """
# template for a link entry: Sidenav.generate() substitutes "#name" and "#link"
sidenav_content_link = "<li class=\"sidenav_link\"><a href=\"#link\">#name</a></li>"
# template for a section entry: Sidenav.generate() substitutes "#name"
sidenav_content_section = "<li class=\"sidenav_section\">#name</li>"

# NOTE(review): this flag is not read anywhere in the visible part of the file
exit_on_include_failure = False

"""
************************************************************ REGULAR EXPRESSIONS ************************************************************
"""
# SIDENAV
# heading with id, e.g. <h2 id="intro">Introduction</h2>
# group 1: the id, group 2: the text between the opening and the closing tag
re_sidenav_heading = r"<h\d.*id=(?:\"|\')([a-zA-Z0-9_\-]+)(?:\"|\').*>(.+)</h\d>"
# custom entry, the arguments of '#sidenav custom': href="..." name="..."
# group 1: the href (no quotes or spaces allowed inside), group 2: the name
re_sidenav_custom = r"href=(?:\"|\')([^\"\' ]+)(?:\"|\') +name=(?:\"|\')(.+)(?:\"|\')"

# map assignment with commas: var ? { key: value, key: value }
# group 1: the variable the map depends on, group 2: the list of options
# the pattern needs at least two options, a key may contain '*', a trailing comma is allowed
re_set_map = r"([a-zA-Z0-9_]+) *\? *\{( *(?:[a-zA-Z0-9_*]+ *: *[^,]*, *)+[a-zA-Z0-9_*]+ *: *[^,]*) *,? *\}"
# same as re_set_map, but the options are separated by semicolons
re_set_map_alt = r"([a-zA-Z0-9_]+) *\? *\{( *(?:[a-zA-Z0-9_*]+ *: *[^;]* *; *)+[a-zA-Z0-9_*]+ *: *[^;]*) *;? *\}"

""" #$(myvar) """
# usage of a variable, group 1: the variable name
re_variable_use = r"#\$\(([a-zA-Z0-9_]+)\)"

""" only in comments """
# a command: '#' followed by letters (group 1) and the rest of the text as arguments (group 2)
re_preprocessor_command = r"#([a-zA-Z]+) *(.*) *"

# the delimiters of the comments in which commands are searched
COMMENT_BEGIN = "<!--"
COMMENT_END = "-->"


"""
************************************************************ GLOBALS ************************************************************
"""
# paths of all files the output depends on: cmd_include appends every included
# file, the main script appends the target and writes the list to the dependency file
glob_dependcies: list[str] = []

# process exit codes, passed as exit_code to error()
exit_codes = {
    "FileNotFound": 2,
    "MarkdownConversionError": 3,
}
# severity names mapped to their numeric level, a higher number is more severe
error_levels = {
    "light": 0,
    "serious": 1,
    "critical": 2,
}
# error() exits the program if the level of the error is at least this value,
# the main script overwrites it when --exit-on is given
exit_on_error_level = error_levels["serious"]


"""
************************************************************ UTILITY ************************************************************
"""
DEBUG = True
def pdebug(*args, **keys):
    """Forward all arguments to print(), but only while DEBUG is enabled."""
    if not DEBUG:
        return
    print(*args, **keys)

TRACE = True
def ptrace(*args, **keys):
    """Forward all arguments to print(), but only while TRACE is enabled."""
    if not TRACE:
        return
    print(*args, **keys)

def error(*args, level:int=exit_on_error_level, exit_code:int=1, **keys):
    """
    Report a problem and exit the program if it is severe enough.

    args, keys: forwarded to print() after the "ERROR:" / "WARNING:" prefix
    level:      severity of the problem (see error_levels). The default is the
                value exit_on_error_level had when this function was defined.
    exit_code:  exit code of the process if the program is terminated

    The level is compared with the current value of exit_on_error_level:
    below it only a warning is printed and the function returns.
    """
    if level < exit_on_error_level:
        print("WARNING:", *args, **keys)
        return
    print("ERROR:", *args, **keys)
    exit(exit_code)

def line_is_link_to_path(line, path):
    """
    Check whether line contains an <a href="...">...</a> whose target starts
    with an html filename that occurs in path.

    Returns True only if the href begins with '<name>.html' (letters, digits,
    '_' and '-' in the name) and that filename is contained in path.
    """
    anchor = re.search(r"<a href=(\"|\')(.+)(\"|\')>(.+)</a>", line)
    if anchor is None:
        return False
    # the filename has to be at the very start of the href, so links into subdirectories do not count
    filename = re.match(r"[a-zA-Z0-9_\-]+\.html", anchor.group(2))
    return filename is not None and filename.group() in path

def pos2line(s: str, pos:int):
    """Return the line number (starting at 1) of the character position pos in s."""
    newlines_before = s.count('\n', 0, pos)
    return newlines_before + 1


def generate_dependecy_file(filename:str, deps:list[str]):
    """
    Build the content of a Makefile style dependency file.

    filename:   the target of the rule
    deps:       the prerequisites of the target
    returns:    a single rule line '<filename>: <dep> <dep> ...' without a
                trailing newline, or '<filename>:' if deps is empty
    """
    # the unused accumulator for empty '<dep>:' rules was removed, it never reached the output
    return f"{filename}:" + "".join(f" {dep}" for dep in deps)

def evaluate_condition(input_string) -> bool:
    """
    Evaluate the condition of an if/elif command.

    All spaces are removed, the text is split at the operators ==, !=, && and ||
    and every operand is wrapped in double quotes, so that operands are compared
    as strings. && and || become the python operators 'and' and 'or'.

    returns:    the truth value of the condition, False if the generated
                expression is not valid python
    """
    operators = ("==", "!=", "&&", "||")
    tokens = re.split(r"(==|!=|&&|\|\|)", input_string.replace(" ", ""))
    quoted = [
        token if token in operators else '"' + token.replace('"', r'\"') + '"'
        for token in tokens
    ]

    condition = "".join(quoted).replace("&&", " and ").replace("||", " or ")
    ptrace(f"> Evaluating condition {condition}")
    try:
        # NOTE(security): eval runs text taken from the processed file. Only double quotes
        # are escaped, so an operand containing a backslash can end the string literal
        # early. Do not use this on files or variables from untrusted sources.
        # 'and'/'or' return one of their operands (a string), bool() keeps the promised return type
        return bool(eval(condition))
    except SyntaxError:
        error(f"Pythonized condition is invalid: {condition}", level=error_levels["light"])
        return False


"""
************************************************************ SIDENAV ************************************************************
"""
class Sidenav:
    """
    Collects the entries of the side navigation and renders them as html.

    The class is only used as a namespace: the state is stored in class
    attributes and all methods are static.
    """
    LINK = 0
    SECTION = 1
    # every entry is (kind, name, link): kind is LINK or SECTION, link is "" for sections
    entries: list[tuple[int, str, str]] = []
    skip_next = False   # if True, the next call of addEntry is ignored
    custom_name = None  # if set, it replaces the name of the next added link

    @staticmethod
    def addEntry(name: str, link: str):
        """Append a link entry, honoring a pending 'skip' or 'name' command."""
        if Sidenav.skip_next:
            # reset to False (was None) so that the flag always stays a bool
            Sidenav.skip_next = False
            return
        if Sidenav.custom_name:
            name = Sidenav.custom_name
            Sidenav.custom_name = None
        Sidenav.entries.append((Sidenav.LINK, name, link))

    @staticmethod
    def addSection(name):
        """Append a section entry, which has a name but no link."""
        Sidenav.entries.append((Sidenav.SECTION, name, ""))

    @staticmethod
    def setCustomName(name: str):
        """Use name instead of the given name for the next added link."""
        Sidenav.custom_name = name

    @staticmethod
    def skipNext():
        """Ignore the next link that would be added."""
        Sidenav.skip_next = True

    @staticmethod
    def generate() -> str:
        """
        Render sidenav_format with all collected entries.

        The line containing '#sidenav-content' is replaced by one line per
        entry. If several links have the same target, only the last one is kept.
        If no line contains the placeholder, the format is returned without entries.
        Every line of the result ends with a newline.
        """
        pdebug(f"Sidenav.generate(): found the following entries: {Sidenav.entries}")
        sidenav:list[str] = sidenav_format.split('\n')
        content_i = -1
        for i, line in enumerate(sidenav):  # find in which line the entries need to be placed
            if "#sidenav-content" in line:
                content_i = i
                break
        if content_i >= 0:
            sidenav.pop(content_i)
            added_links = set()
            # inserting in reverse order at the same index yields the original order
            for kind, name, link in reversed(Sidenav.entries):
                if kind == Sidenav.LINK:
                    if link in added_links: continue  # no duplicates
                    added_links.add(link)
                    # substitute both placeholders in one pass, so that a name which
                    # contains the text '#link' is not modified by the second substitution
                    values = {"#name": name, "#link": link}
                    sidenav.insert(content_i, re.sub(r"#name|#link", lambda m: values[m.group()], sidenav_content_link))
                else:
                    sidenav.insert(content_i, sidenav_content_section.replace("#name", name))
        # not "\n".join, because the last line needs a newline as well
        return "".join(line + "\n" for line in sidenav)

    @staticmethod
    def cmd_sidenav(args:str, variables:dict[str,str]) -> str:
        """
        Handle the '#sidenav <subcommand> [arguments]' command.

        skip:       ignore the next link
        section:    add a section with the arguments as name
        name:       use the arguments as name for the next link
        custom:     add a link from 'href="..." name="..."'
        include:    return the generated sidenav

        returns:    the generated html for 'include', otherwise an empty string
        """
        cmd, _, rest = args.partition(" ")
        cmd_args = rest.strip(" ")
        if cmd == "skip":
            Sidenav.skipNext()
        elif cmd == "section":
            Sidenav.addSection(cmd_args)
        elif cmd == "name":
            Sidenav.setCustomName(cmd_args)
        elif cmd == "custom":
            match = re.fullmatch(re_sidenav_custom, cmd_args)
            if match:
                href, name = match.groups()
                Sidenav.addEntry(name, href)
            else:
                error(f"cmd_sidenav: Invalid argument for command 'custom': '{cmd_args}'", level=error_levels["light"])
        elif cmd == "include":
            return Sidenav.generate()
        else:
            error(f"cmd_sidenav: Invalid command: '{cmd}'", level=error_levels["light"])

        return ""


"""
************************************************************ COMMANDS ************************************************************
All these commands take one arg with trimmed whitespaces.
The arg may be anything

They all need to return a string, which will be placed
into the source file at the place where the command was.
"""
def cmd_include(args: str, variables:dict[str, str]={}) -> str:
    """
    Return the content of the file args. Files ending in '.md' are converted
    to html with python-markdown.

    args:       path of the file to include
    variables:  not used by this command (and therefore never modified)
    returns:    the (converted) content, or an html comment describing the
                problem if the file could not be read or converted and the
                error did not terminate the program

    The path is appended to glob_dependcies, also if the file could not be read.
    """
    pdebug(f"cmd_include: args='{args}', variables='{variables}'")
    content = ""
    readable = True
    try:
        with open(args) as file:
            content = file.read()
    except (OSError, ValueError):  # ValueError covers undecodable content and invalid paths
        readable = False
        error(f"cmd_include: Could not open file '{args}'", level=error_levels["serious"], exit_code=exit_codes["FileNotFound"])
        content = f"<!-- Could not include '{args}' -->"
    # do not run the placeholder comment of a failed read through the markdown conversion
    if readable and args.endswith(".md"):
        try:
            from markdown import markdown
            content = markdown(content, output_format="xhtml")
        except Exception:  # a missing module as well as any failure of the conversion
            error(f"cmd_include: Could not convert markdown to html for file '{args}'. Is python-markdown installed?", level=error_levels["critical"], exit_code=exit_codes["MarkdownConversionError"])
            content = f"<!-- Could not convert to html: '{args}' -->"
    glob_dependcies.append(args)
    return content

def cmd_return(args: str, variables:"dict[str, str] | None"=None) -> str:
    """
    Assign a variable and return the assigned value.

    args is '<varname> <value>'. The value is either used literally or it is a
    map that selects the value depending on another variable:
        <!-- #set section lang?{*:Fallback,de:Abschnitt,en:Section} -->
    The options of a map are separated by ',' or by ';'. The option whose key
    equals the value of the variable in front of the '?' is used, the key '*'
    is the fallback if no key matches. If no option applies the variable is set
    to an empty string.

    args:       '<varname>' (sets the variable to an empty string) or '<varname> <value>'
    variables:  variable name -> value, the assignment is stored here.
                A new dict is used if it is omitted.
    returns:    the assigned value, or an empty string if there was no value or
                the variable the map depends on does not exist
    """
    # no shared mutable default: this function writes into the dict
    if variables is None:
        variables = {}
    space = args.find(' ')
    if not (space > 0 and space < len(args)-1):
        variables[args] = ""
        pdebug(f"cmd_set: Setting to emptry string: {args}")
        return ""

    varname = args[:space]
    expression = args[space+1:].strip(' ')
    variables[varname] = ""
    # check if map assignment with either , or ;
    separator = ','
    match = re.fullmatch(re_set_map, expression)
    if not match:
        match = re.fullmatch(re_set_map_alt, expression)
        separator = ';'
    if match:
        pdebug(f"cmd_set: Map {match.group()}")
        depends, options = match.groups()
        if depends not in variables:
            pdebug(f"cmd_set: Setting from map, but depends='{depends}' is not in variables")
            return ""
        depends_val = variables[depends]
        exact = None     # value of the last option whose key equals depends_val
        fallback = None  # value of the last '*' option
        for option in options.split(separator):
            option = option.strip(" ")
            pdebug(f"cmd_set: Found option {option}")
            colon = option.find(':')  # we will find one, regex guarantees
            key = option[:colon].strip(" ")
            value = option[colon+1:].strip(" ")
            if key == depends_val:
                exact = value
            elif key == "*":
                fallback = value
        # an exact match wins over the fallback regardless of the order of the options
        if exact is not None:
            variables[varname] = exact
        elif fallback is not None:
            variables[varname] = fallback
    else:  # simple assignment
        variables[varname] = expression
        pdebug(f"cmd_set: Assignment {varname} -> {expression}")
    return variables[varname]

def cmd_set(args: str, variables:"dict[str, str] | None"=None) -> str:
    """
    Assign a variable like cmd_return, but do not output the value.

    args:       '<varname>' or '<varname> <value>', see cmd_return
    variables:  variable name -> value, the assignment is stored here.
                A new dict is used if it is omitted.
    returns:    always an empty string
    """
    # no shared mutable default: cmd_return writes into the dict
    if variables is None:
        variables = {}
    cmd_return(args, variables)
    return ""

def cmd_default(args: str, variables:"dict[str, str] | None"=None) -> str:
    """
    Assign a variable like cmd_set, but only if it does not exist yet.

    args:       '<varname>' or '<varname> <value>', see cmd_return
    variables:  variable name -> value, the assignment is stored here.
                A new dict is used if it is omitted.
    returns:    always an empty string
    """
    if variables is None:
        variables = {}
    # partition also works without a value: slicing with the result of find() cut off
    # the last character of the name in that case and an existing variable was overwritten
    varname = args.partition(' ')[0]
    if varname not in variables:
        cmd_return(args, variables)
    return ""


def cmd_comment(args: str, variables:dict[str, str]={}) -> str:
    """Return args wrapped into an html comment. variables is not used."""
    opening, closing = "<!--", "-->"
    return f"{opening} {args} {closing}"
def cmd_uncomment(args: str, variables:dict[str, str]={}) -> str:
    """Return args unchanged, so the arguments replace the command. variables is not used."""
    return args


# maps the name of a command to the function that implements it.
# Every function is called with (args, variables) and returns the text that replaces the command.
# The commands if, elif, else and endif are not listed here, parse_file handles them itself.
command2function:dict[str, Callable[[str, dict[str,str]], str]] = {
    "include":      cmd_include,
    "set":          cmd_set,
    "return":       cmd_return,
    "default":      cmd_default,
    "comment":      cmd_comment,
    "uncomment":    cmd_uncomment,
    "sidenav":      Sidenav.cmd_sidenav
}

"""
************************************************************ PARSING ************************************************************
"""

class Parser():
    """
    The text that is being processed, together with named character positions
    that are kept valid when text is removed or replaced.

    file:   the current text
    pos:    name -> character position in file
    state:  name -> flag, not touched by the methods of this class
    """
    def __init__(self, file):
        self.file = file
        self.pos: dict[str, int] = {}
        self.state: dict[str, bool] = {}

    def remove(self, start, stop, ignore_bounds=[]):
        """
        Remove the range [start, stop) from the text and update the positions.

        Positions at or behind stop are moved forward by the removed length.
        A position strictly inside the range is reported with error(), unless
        its name is listed in ignore_bounds.
        """
        delete_length = stop - start
        removed_text = self.file[start:stop].replace("\n", "\\n")

        ptrace(f"- Deleting range [{start}, {stop}) of length {delete_length}: '{removed_text}'")
        assert stop >= start
        assert stop <= len(self.file)
        head, tail = self.file[:start], self.file[stop:]
        self.file = head + tail
        for name, position in self.pos.items():
            if position >= stop:
                self.pos[name] = position - delete_length
            elif position > start and name not in ignore_bounds:
                error(f"Position {name}={position} within deleted range [{start},{stop})", level=1)

    def replace(self, start, stop, replacement):
        """
        Replace the range [start, stop) of the text with replacement and update the positions.

        Positions at or behind stop are moved by the change of the length.
        A position strictly inside the range is reported with error().
        """
        assert stop >= start
        assert stop <= len(self.file)
        ptrace(f"- Replacing range [{start}, {stop}): '{self.file[start:stop]}' with '{replacement}'")
        head, tail = self.file[:start], self.file[stop:]
        self.file = head + replacement + tail
        # positive if the text got shorter
        shrink = (stop - start) - len(replacement)
        for name, position in self.pos.items():
            if position >= stop:
                self.pos[name] = position - shrink
            elif position > start:
                error(f"Position {name}={position} within replaced range [{start},{stop})", level=1)

    def __getitem__(self, key):
        """Index or slice the text."""
        return self.file[key]

    def __len__(self):
        """Length of the text in characters."""
        return len(self.file)


def parse_file(_file:str, variables:dict[str,str]):
    """
    Process the text of one file and return the resulting text.

    The text is processed line by line. In every visited line the variable
    usages are replaced first, then a heading with an id is added to the
    Sidenav. After that the comments are searched for commands: a command is
    replaced by its output and the tags of a comment that contained a command
    are removed. The commands if, elif, else and endif delete the blocks whose
    condition is not fulfilled.

    _file:      the complete text to process
    variables:  variable name -> value, read and modified by the commands
    returns:    the processed text. If there was a '#sidenav include' command,
                the generated sidenav is inserted at the position of its comment.
    """
    p = Parser(_file)
    sidenav_include_pos = -1
    p.pos["cmt_beg"] = -1
    p.pos["cmt_end"] = -1
    p.pos["cmd_beg"] = -1
    p.pos["cmd_end"] = -1  # the key was misspelled as "cmdend", which only created an unused position
    p.pos["conditional_block_beg"] = -1  # char pos of the first char of the last block, if waiting for elif, else or endif
    p.state["cmd_in_cmt"] = False
    p.state["last_condition"] = False  # if the last if condition was true
    i = 0

    while i < len(p):  # at start of new line or end of comment
        ptrace(f"Processing at i={i} in line {pos2line(p.file, i)}")

        # replace variable usages in the current line
        p.pos["line_end"] = p.file.find('\n', i)
        if p.pos["line_end"] < 0: p.pos["line_end"] = len(p)
        p.replace(i, p.pos["line_end"], replace_variables(p[i:p.pos["line_end"]], variables))
        ptrace("> Line after replacing variables:", p.file[i:p.pos["line_end"]])

        # check if heading for sidenav in line
        match = re.search(re_sidenav_heading, p[i:p.pos["line_end"]])
        if match:
            Sidenav.addEntry(match.groups()[1], f"#{match.groups()[0]}")
            ptrace("> Found heading with id:", match.groups())

        # look for comment
        if p.pos["cmt_beg"] < 0:  # if not in comment, find next comment
            p.pos["cmt_beg"] = p.file.find(COMMENT_BEGIN, i, p.pos["line_end"])
            if p.pos["cmt_beg"] < 0:
                i = p.pos["line_end"] + 1
                continue
            else:
                # jump to comment_begin
                old_i = i
                i = p.pos["cmt_beg"] + len(COMMENT_BEGIN)  # after comment begin
                ptrace(f"> Found comment begin, jumping from pos {old_i} to {i}")

        # in comment, i at the character after COMMENT_BEGIN
        p.pos["cmt_end"] = p.file.find(COMMENT_END, i)
        # sanity checks
        if p.pos["cmt_end"] < 0:
            error(f"Comment starting in line {pos2line(p.file, p.pos['cmt_beg'])} is never ended.", level=error_levels["serious"])
        else:
            tmp_next_begin = p.file.find(COMMENT_BEGIN, i)
            if 0 < tmp_next_begin and tmp_next_begin < p.pos["cmt_end"]:
                error(f"Found next comment begin before the comment starting in line {pos2line(p.file, p.pos['cmt_beg'])} is ended! Skipping comment. Comment without proper closing tags: '{p.file[i:p.pos['line_end']]}'", level=error_levels["light"])
                p.pos["cmt_beg"] = -1
                continue

        # either at newline (if in multiline comment) or at comment end
        p.pos["cmd_beg"] = i
        p.pos["cmd_end"] = min(p.pos["line_end"], p.pos["cmt_end"])
        assert p.pos["cmd_end"] >= i, f"cmd_end={p.pos['cmd_end']}, i={i}, line_end={p.pos['line_end']}, cmt_end={p.pos['cmt_end']}"
        ptrace(f"> Possible command end: {p.pos['cmd_end']}, possible command: '{p[i:p.pos['cmd_end']]}'")

        # find commands
        match = re.fullmatch(re_preprocessor_command, p[i:p.pos["cmd_end"]].strip(" "))
        if match:  # command comment
            p.state["cmd_in_cmt"] = True
            command = match.groups()[0]
            args = match.groups()[1].replace('\t', ' ').strip(' ')
            pdebug(f"> Found command '{command}' with args '{args}'")
            # delete from previous block if
            if command in ["elif", "else", "endif"]:
                if p.pos["conditional_block_beg"] < 0: error(f"Misplaced '{command}' in line {pos2line(p.file, i)}")
                if p.state["last_condition"]:
                    # delete block from here at next endif
                    p.state["last_condition"] = False
                else:
                    # delete block from last condition statement
                    ptrace(f"> Deleting block from last condition")
                    p.remove(p.pos["conditional_block_beg"], p.pos["cmt_beg"])
                    i = p.pos["cmd_beg"]
                p.pos["conditional_block_beg"] = i
                if command == "endif":
                    p.pos["conditional_block_beg"] = -1
                    p.state["last_condition"] = False
                    p.state["any_condition"] = False
            # evaluate ifs
            if command == "if":
                p.pos["conditional_block_beg"] = i
                p.state["last_condition"] = evaluate_condition(args)
                p.state["any_condition"] = p.state["last_condition"]
                pdebug(f"> Command {command} condition evaluated to {p.state['last_condition']}")
                cmd_output = ""
            elif command =="elif":
                p.pos["conditional_block_beg"] = i
                p.state["last_condition"] = evaluate_condition(args) if not p.state["any_condition"] else False
                if p.state["last_condition"]:
                    p.state["any_condition"] = True
                pdebug(f"> Command {command} condition evaluated to {p.state['last_condition']}")
                cmd_output = ""
            elif command == "else":
                p.pos["conditional_block_beg"] = i
                p.state["last_condition"] = True if not p.state["any_condition"] else False
                cmd_output = ""
            elif p.pos["conditional_block_beg"] < 0 or p.state["last_condition"]:
                if command == "sidenav" and args == "include":  # if args contains anything else this wont work
                    sidenav_include_pos = p.pos["cmt_beg"]  # remove the comment
                    cmd_output = ""
                elif command == "endif":
                    cmd_output = ""
                elif command not in command2function:
                    error(f"Invalid command in line {pos2line(p.file, i)}: {command}", level=error_levels["light"])
                    cmd_output = ""
                else:
                    cmd_output = command2function[command](args, variables)
            else:
                cmd_output = ""
            p.replace(i, p.pos["cmd_end"], cmd_output)
            ptrace(f"> After command, the line is now '{p.file[i:p.pos['line_end']]}'")


        if p.pos["cmd_end"] == p.pos["cmt_end"]:  # reached end of comment
            if p.state["cmd_in_cmt"]:
                # remove comment tags if a command was found
                remove_newline = 0
                # if the comment consumes the whole line, remove the entire line.
                # The start of the text counts as start of a line (index -1 would wrap around to
                # the last character), and the end of the text must not be indexed (IndexError).
                cmt_close_end = p.pos["cmt_end"] + len(COMMENT_END)
                at_line_start = p.pos["cmt_beg"] == 0 or p[p.pos["cmt_beg"]-1] == '\n'
                if at_line_start and cmt_close_end < len(p) and p[cmt_close_end] == '\n':
                    remove_newline = 1
                # remove comment if done
                ptrace(f"Deleting opening comment tags")
                p.remove(p.pos["cmt_beg"], p.pos["cmt_beg"] + len(COMMENT_BEGIN))
                p.remove(p.pos["cmt_end"], p.pos["cmt_end"] + len(COMMENT_END) + remove_newline, ignore_bounds=["cmt_end", "cmd_end", "line_end"])
                # process the line again, because a command might have inserted new comments
                i -= len(COMMENT_BEGIN)
            p.state["cmd_in_cmt"] = False
            p.pos["cmt_beg"] = -1
            p.pos["cmt_end"] = -1
            p.pos["cmd_end"] = -1
        else:  # multiline comment
            p.pos["cmt_end"] = -1
            p.pos["cmd_end"] = -1
            i = p.pos["line_end"] + 1
            ptrace(f"> Multiline comment, jumping to next line.")
        # i = possible_command_end commented, because if something containing new commands is inserted we need to parse that as well

    if sidenav_include_pos >= 0:
        return p.file[:sidenav_include_pos] + Sidenav.generate() + p.file[sidenav_include_pos:]
    else:
        return p.file


def replace_variables(html:str, variables:dict[str, str]):
    """
    find usage of variables and replace them with their value

    html:       the text to search for variable usages (see re_variable_use)
    variables:  variable name -> value
    returns:    the text with every usage replaced by the value of the variable
                without leading and trailing spaces. Unknown variables are
                replaced with an empty string.
    """
    matches = list(re.finditer(re_variable_use, html))
    # replace from back to front, so that the positions of the earlier matches stay valid
    for match in reversed(matches):
        name = match.groups()[0]
        pdebug(f"> Found variable usage {name}, match from {match.start()} to {match.end()}")
        value = variables.get(name, "").strip(" ")
        html = html[:match.start()] + value + html[match.end():]
    return html

"""
************************************************************ COMMAND LINE ************************************************************
"""
def missing_arg_val(arg):
    """Report that the command line option arg has no value and exit with code 1."""
    print(f"Missing argument for {arg}")
    exit(1)

def missing_arg(arg):
    """Report that the required command line option arg was not given and exit with code 1."""
    print(f"Missing  {arg}")
    exit(1)

def help():
    """Print the synopsis and the command line options to stdout."""
    # show the name the script was started with, the hard-coded name was misspelled
    script = (os.path.basename(argv[0]) if argv else "") or "<script>"
    helpstring = f"""Synopsis:
    Preprocess <target-file> and write the result to <output-file>:
        python3 {script} --target <target-file> --output <output-file> [OPTIONS]
    \nCommand line options:
    --target <file>             path to the target file
    --output <file>             output to this file instead of overwriting target
    --inplace                   edit target file in place
    --var <varname>=<value>     set the value of a variable. Can be used multiple times
    --output-deps <file>        output a Makefile listing all dependencies
    --help                      show this
    --exit-on <errorlevel>      where errorlevel is 'light', 'serious' or 'critical'
    """
    print(helpstring)

if __name__ == "__main__":
    # Command line entry point: parse the options, process the target file and
    # write the result (and optionally the dependency file).
    variables:dict[str, str] = {}
    # parse args
    target_path = ""
    output_path = ""
    dep_output_path = ""
    gen_sidenav = False
    inplace = False
    i = 1
    while i < len(argv):
        has_value = len(argv) > i + 1  # is there an argument behind argv[i]?
        if argv[i] == "--target":
            if has_value: target_path = argv[i+1].strip(" ")
            else: missing_arg_val(argv[i])
            i += 1
        elif argv[i] == "--output":
            if has_value: output_path = argv[i+1].strip(" ")
            else: missing_arg_val(argv[i])
            i += 1
        elif argv[i] == "--output-deps":
            if has_value: dep_output_path = argv[i+1].strip(" ")
            else: missing_arg_val(argv[i])
            i += 1
        elif argv[i] == "--exit-on":
            # check for the value first: reading argv[i+1] raised an IndexError if --exit-on was the last argument
            if has_value:
                level_name = argv[i+1].strip(" ")
                if level_name in error_levels:
                    exit_on_error_level = error_levels[level_name]
                else:
                    error(f"Invalid argument for --exit-on: {argv[i+1]}. Valid are {error_levels.keys()}")
            else: missing_arg_val(argv[i])
            i += 1
        elif argv[i] == "--var":
            if has_value:
                sep = argv[i+1].find('=')
                if sep > 0:
                    variables[argv[i+1][:sep].strip(" ")] = argv[i+1][sep+1:].strip(" ")
                else:
                    # no '=' or no name in front of it: tell the user instead of silently dropping the option
                    error(f"Invalid argument for --var: {argv[i+1]}. Expected <varname>=<value>", level=error_levels["light"])
            else: missing_arg_val(argv[i])
            i += 1
        elif argv[i] == "--inplace":
            inplace = True
        elif argv[i] == "--help":
            help()
            exit(0)
        else:
            error(f"Invalid argument: {argv[i]}")
        i += 1
    # sanity checks
    if not target_path: missing_arg("--target")
    if not os.path.isfile(target_path): error(f"Invalid target: {target_path} (does not exist)")
    if inplace: output_path = target_path
    if not output_path:
        print("Missing output path, just printing to stdout. Use --output or --inplace to save the result.")

    # get html
    with open(target_path, "r") as file:
        target_html = file.read()


    output_html = parse_file(target_html, variables)

    # save
    if output_path:
        with open(output_path, "w") as file:
            file.write(output_html)
    else:
        print(output_html)

    if dep_output_path:
        # the output also depends on the target itself, unless the target is edited in place
        if output_path != target_path:
            glob_dependcies.append(target_path)
        depfile = generate_dependecy_file(output_path, glob_dependcies)
        pdebug(f"Writing dependency file to {os.path.abspath(dep_output_path)}: {depfile}")
        with open(dep_output_path, "w") as file:
            file.write(depfile)