6502-OS/doxy-asm65.py

import sys
import re
from typing import Callable

## 
# @defgroup documentation Documentation
# @brief
#

##
# @file
# @brief Doxygen filter for ca65 assembler files
# @details
#  This filter converts ca65 to C++ statements that doxygen can parse.
#
#  Doxygen comments are double semicolons `;;`
#  - turns procedures `.proc` into function statements with `proc` as return type
#    Parameters documented with the @param command are put into the parentheses (function arguments) with `Param` as type (@ref handle_procedure)
#  - turns macros `.macro` into function statements with `macro` as return type and the macro parameters
#    as function arguments with `Param` type (@ref handle_procedure)
#  - enums become ... enums, documentation of enum members after their name is also handled when using ;;< (@ref handle_procedure)
#  - labeled storage allocations with `.byte`, `.res`, `.ascii` etc. are turned into variable declarations with the label as variable name (@ref handle_storage_label)
#    - if allocations are strings, they are concatenated together to `char * LABEL_NAME = "<string(s)>";`
#    - if there are multiple non-string allocations: `bytes LABEL_NAME[] = {alloc1, alloc2, ...};`
#    - if there is one non-string allocation: `alloc_type * LABEL_NAME = alloc;`
#    - if the allocation is not initialized: `alloc_type * LABEL_NAME;`
#    - if the allocation type is `res SIZE`: `bytes LABEL_NAME[SIZE];`
#    - the sizes of the arrays may be wrong!
#  - include statements are kept
#  - all other preprocessor macros are removed
# 
# @todo Handle structs
# @todo for storage allocators, check in which segment they are in and apply `const` where necessary
# @ingroup documentation

filename = "unknown"

def pdebug(*args, **k):
    print(f"DEBUG ({filename}):", *args, file=sys.stderr, **k)


def parse_custom_language(file_content: str):
    # procedure_name: scope_name
    exported_names = {}
    def handle_export(m):
        export_type = m.groups()[0]
        scope = m.groups()[1]
        functions = m.groups()[2].replace(" ", "").strip(",")
        for f in functions.split(","):
            # pdebug(f"Add Exported function: '{f}' in '{scope}'")
            exported_names[f] = scope
        return ""

    def handle_procedure(m):
        # print("handle procedure:", m.groups())
        p_docs = m.groups()[0].strip("\n")
        p_type = m.groups()[1]
        p_name = m.groups()[2]
        p_args = m.groups()[3].strip(" ")
        p_code = m.groups()[4]
        s = ""
        in_namespace = False
        if p_name in exported_names:
            # print(f"{p_name} is exported")
            in_namespace = True
            # wrap function in namespace {}, which consumes the first line of the docstring, which must be ;;***...
            namespace = exported_names[p_name]
            # assert p_docs.startswith(";;*"), f"Documentation of an exported procedure must start with ';;***...' ({p_name})"
            # assert p_docs[p_docs.rfind('\n'):].startswith("\n;;*"), f"Documentation of an exported procedure must end with ';;***...' ({p_name})"
            s += f"namespace {namespace}" + " {" + p_docs
            # s += p_docs[p_docs.find('\n'):p_docs.rfind('\n')]
            s += "\n"
            # s += f"@ingroup {namespace}\n"
        else:
            s += p_docs + "\n" #re.sub(r";;\*+", ";;", p_docs, 0, re.MULTILINE) + "\n"

        if p_type == "proc":
            s += f"proc {p_name}("
            for match in re.finditer(r"[\@\\]param +(.+?) *:", p_docs):
                s += f"Param {match.groups()[0]},"
            if s[-1] == ",": s = s[:-1]
            s += ");\n"
        elif p_type == "macro":
            # pdebug(f"Processing macro '{p_name}' with args '{'TXT'.join(p_args.replace(' ', '').split(','))}'")
            s += f"macro {p_name}("
            p_args = "".join("Param " + param + "," for param in p_args.replace(" ", "").split(',')).strip(",")
            s += p_args
            s += ");\n"
        elif p_type == "enum":
            p_code = re.sub(r"(.*=.*?)( *(?:;;.*)?\n)", r"\1,\2", p_code)
            s += f"enum {p_name}" + "{\n" + p_code + "};"
        else:
            raise NotImplementedError(f"handle_procedure not implemented for procedure type {p_type}")
        s += re.sub(".*", "", p_code)
        if in_namespace:
            s += "} // namespace"
        else:
            s += "\n"
        return s

    def handle_storage_label(m):
        l_docs = m.groups()[0]
        l_name = m.groups()[1]
        l_allocs = m.groups()[2]
        l_docs2 = m.groups()[3]  # if doc was in the same line as the label

        storage_alloc = r"\.(byte|res|dbyte|word|addr|faraddr|dword|asciiz?)(([, ]+(?:0x[a-fA-F0-9]+|0b[01]+|\d+|\w+|\"[^\n]*?[^\\\n]\")[ \n]*)*)"
        storage_alloc_arg = r"(0x[a-fA-F0-9]+|0b[01]+|\d+|\w+|\"[^\n]*[^\\\n]\")"

        args = []
        allocs = []
        for alloc_match in re.finditer(storage_alloc, l_allocs):
            allocs.append(alloc_match)
            alloc_args = alloc_match.groups()[1]
            if alloc_args:
                args += re.findall(storage_alloc_arg, alloc_args)

        # pdebug(f"Storage label {l_name} with allocs '{[ma.group() for ma in allocs]}' and args '{args}'\n\t{m.groups()}")
        s = ""
        in_namespace = False
        # if the label is exported, put it in a namespace
        if l_name in exported_names:
            in_namespace = True
            namespace = exported_names[l_name]
            s += f"namespace {namespace}" + " {"
        # docs after the namespace, otherwise they document the namespace
        if l_docs:
            s += l_docs
        # put the single line comment into a /** */ comment in front of the declaration
        if l_docs2:
            s += "/** "
            if not "brief" in l_docs2: s += "@brief "
            s += f"{l_docs2.strip(';')} */ "
        # completely ignoring the type of the storage allocation here
        if len(args) > 1:
            if all(arg.startswith("\"") for arg in args):
                s += f'char* {l_name} = "' + "".join(map(lambda x: x.strip('"'), args)) + '"'
            else:
                s += f"bytes {l_name}[{len(args)}] = " + "{"
                for arg in args:
                    s += arg + ","
                s = s.strip(",") + "}"
        else:
            l_type = allocs[0].groups()[0]
            if len(args) == 0:
                l_arg = None
            else:
                l_arg = args[0]
            # if res: use bytes[length] as type
            if l_type == "res":
                l_type = f"bytes[{l_arg}]"
                l_arg = None
            # else use type* as type
            else: l_type += "*"
            s += f"{l_type} {l_name}"
            if l_arg:
                s += f" = {l_arg}"
        s += ";"
        if in_namespace: s += "} // namespace"
        s += m.group().count('\n') * '\n'  # make sure the #lines is the same
        # pdebug(args, "\n---\n", s)
        return s

    patterns: dict[str, str|Callable[[re.Match], str]] = {
        r"\@(?:macro|function)": "@brief",
        r"^\.scope ([a-zA-Z0-9_]+)": r"namespace \1 {",
        # r"^\.macro ([a-zA-Z0-9_]+)(.*)?": r"macro \1(\2 \2); ",
        # r"^\.end(?:macro)": "",
        r"^\.end(?:scope)": "}",
        r"^\.(include)": r"#\1",
        r"^(Export(?:Zp)?) (\w+)((?: *, *\w+)+)": handle_export,
        r"^(Import(?:Zp)?) (\w+)((?: *, *\w+)+)": "",
        r"(?<!^;)\$([A-Fa-f0-9_]+)": r"0x\1",   # $HEX -> 0xHEX except in comments
        r"(?<!^;)%([01_]+)": r"0b\1",           # %BIN -> 0bBIN except in comments
        r"^((?:;;.*\n)*) *\.(proc|enum|macro) (\w+)(.*?)\n((?:.|\n)*?)\.end(proc|enum|macro).*": handle_procedure,
        r"^((?:;;.*\n)*) *(\w+):((?:\s*\.(?:byte|res|dbyte|word|addr|faraddr|dword|asciiz?)(?:[, ]+(?:0x[a-fA-F0-9]+|0b[01]+|\d+|\w+|\"[^\n]*[^\\\n]\")[ \n]*)*)+)(;;.*)?": handle_storage_label,
        r"^INCLUDE_[A-Z0-9_]+ *= *1$": r"",     # Include guard variables
        r";;": "//!",          # C++ comments
        # TODO this is currently case sensitive
        r"(?<!^;)( *\w+ *= *[^;,\n]+?) *(//.*)?$": r"\1;\2",     # semicolons after assignments, except in comments and when they already end with a comma or semicolon. Also preserve comments after the assignment
        r"^([^\n;]*)(?<!\w)\.(\w+)": r"\1// #\2",   # all .preprocessor commands

    }

    compiled_patterns = []
    for k,v in patterns.items():
        compiled_patterns.append((re.compile(k), v))

    resub_patterns: dict[str, str|Callable[[re.Match], str]] = {
        r"(?<!^;;)[ \t\r\f\v]+": " ",   # turn all spaces into single whitespace except if in doxygen comment
        r"^((?:[^\"\n;]||[^\"\n;]*\"(?:[^\"\n]|\\\")+\")+);(?!;).*": r"\1",          # remove normal comments, detect strings
        r"^;;\*+": ";;",                # remove ;;*** comments
        r"[ \t\r\f\v]+$": "",           # remove trailing spaces print(file_content)

    }
    for pat, subst in resub_patterns.items():
        file_content = re.sub(pat, subst, file_content, 0, re.MULTILINE)
    for pat,subst in patterns.items():
        (file_content, n_subst) = re.subn(pat, subst, file_content, 0, re.MULTILINE)
    return file_content

def main():
    global filename
    if len(sys.argv) != 2:
        print("Usage: python doxy-asm65.py <input_file>")
        sys.exit(1)

    filename = sys.argv[1]

    with open(filename, 'r') as file:
        file_content = file.read()

    transformed_content = parse_custom_language(file_content)

    print(transformed_content)

if __name__ == "__main__":
    main()
add docs with doxygen using custom filter 2024-08-08 00:11:15 +02:00			`import sys`
			`import re`
			`from typing import Callable`

improve documentation 2024-08-08 20:15:50 +02:00			`##`
			`# @defgroup documentation Documentation`
			`# @brief`
			`#`

			`##`
			`# @file`
			`# @brief Doxygen filter for ca65 assembler files`
			`# @details`
			`# This filter converts ca65 to C++ statements that doxygen can parse.`
			`#`
			# Doxygen comments are double semicolons `;;`
			# - turns procedures `.proc` into function statements with `proc` as return type
			# Parameters documented with the @param command are put into the paranthesis (function arguments) with `Param` as type (@ref handle_procedure)
			# - turns macros `.macro` into function statements with `macro` as return type with the parameter macros
			# as function arguments with `Param` type (@ref handle_procedure)
doc improvements 2024-08-08 21:11:25 +02:00			`# - enums become ... enums, documentation of enum members after their name is also handled when using ;;< (@ref handle_procedure)`
improve documentation 2024-08-08 20:15:50 +02:00			# - labeled storage allocations with `.byte`, `.res`, `.ascii` etc. are turned into variable declarations with the label as variable name (@ref handle_procedure)
			# - if allocations are strings, they are concatenated together to `char * LABEL_NAME = "<string(s)>";`
			# - if there are multiple non-string allocations: `bytes LABEL_NAME[] = {alloc1, alloc2, ...};`
			# - if there is one non-string allocation: `alloc_type * LABEL_NAME = alloc;`
			# - if the allocation is not initilized: `alloc_type * LABEL_NAME;`
			# - if the allocation type is `res SIZE`: `bytes LABEL_NAME[SIZE];`
			`# - the sizes of the arrays may be wrong!`
			`# - include statements are kept`
			`# - all other preprocessor macros are removed`
			`#`
			`# @todo Handle structs`
			# @todo for storage allocators, check in which segment they are in and apply `const` where necessary
			`# @ingroup documentation`

add docs with doxygen using custom filter 2024-08-08 00:11:15 +02:00			`filename = "unknown"`

			`def pdebug(args, *k):`
			`print(f"DEBUG ({filename}):", args, file=sys.stderr, *k)`


			`def parse_custom_language(file_content: str):`
			`# procedure_name: scope_name`
			`exported_names = {}`
			`def handle_export(m):`
			`export_type = m.groups()[0]`
			`scope = m.groups()[1]`
			`functions = m.groups()[2].replace(" ", "").strip(",")`
			`for f in functions.split(","):`
improve documentation 2024-08-08 20:15:50 +02:00			`# pdebug(f"Add Exported function: '{f}' in '{scope}'")`
add docs with doxygen using custom filter 2024-08-08 00:11:15 +02:00			`exported_names[f] = scope`
			`return ""`

			`def handle_procedure(m):`
			`# print("handle procedure:", m.groups())`
			`p_docs = m.groups()[0].strip("\n")`
			`p_type = m.groups()[1]`
			`p_name = m.groups()[2]`
			`p_args = m.groups()[3].strip(" ")`
			`p_code = m.groups()[4]`
			`s = ""`
			`in_namespace = False`
			`if p_name in exported_names:`
			`# print(f"{p_name} is exported")`
			`in_namespace = True`
			`# wrap function in namespace {}, which consumes the first line of the docstring, which must be ;;***...`
			`namespace = exported_names[p_name]`
			`# assert p_docs.startswith(";;"), f"Documentation of an exported procedure must start with ';;**...' ({p_name})"`
			`# assert p_docs[p_docs.rfind('\n'):].startswith("\n;;"), f"Documentation of an exported procedure must end with ';;**...' ({p_name})"`
			`s += f"namespace {namespace}" + " {" + p_docs`
			`# s += p_docs[p_docs.find('\n'):p_docs.rfind('\n')]`
			`s += "\n"`
			`# s += f"@ingroup {namespace}\n"`
			`else:`
			`s += p_docs + "\n" #re.sub(r";;\*+", ";;", p_docs, 0, re.MULTILINE) + "\n"`

			`if p_type == "proc":`
			`s += f"proc {p_name}("`
			`for match in re.finditer(r"[\@\\]param +(.+?) *:", p_docs):`
			`s += f"Param {match.groups()[0]},"`
			`if s[-1] == ",": s = s[:-1]`
			`s += ");\n"`
			`elif p_type == "macro":`
improve documentation 2024-08-08 20:15:50 +02:00			`# pdebug(f"Processing macro '{p_name}' with args '{'TXT'.join(p_args.replace(' ', '').split(','))}'")`
add docs with doxygen using custom filter 2024-08-08 00:11:15 +02:00			`s += f"macro {p_name}("`
			`p_args = "".join("Param " + param + "," for param in p_args.replace(" ", "").split(',')).strip(",")`
			`s += p_args`
			`s += ");\n"`
			`elif p_type == "enum":`
doc improvements 2024-08-08 21:11:25 +02:00			`p_code = re.sub(r"(.=.?)( (?:;;.)?\n)", r"\1,\2", p_code)`
add docs with doxygen using custom filter 2024-08-08 00:11:15 +02:00			`s += f"enum {p_name}" + "{\n" + p_code + "};"`
			`else:`
			`raise NotImplementedError(f"handle_procedure not implemented for procedure type {p_type}")`
			`s += re.sub(".*", "", p_code)`
			`if in_namespace:`
			`s += "} // namespace"`
			`else:`
			`s += "\n"`
			`return s`

			`def handle_storage_label(m):`
improve documentation 2024-08-08 20:15:50 +02:00			`l_docs = m.groups()[0]`
add docs with doxygen using custom filter 2024-08-08 00:11:15 +02:00			`l_name = m.groups()[1]`
			`l_allocs = m.groups()[2]`
improve documentation 2024-08-08 20:15:50 +02:00			`l_docs2 = m.groups()[3] # if doc was in the same line as the label`

			`storage_alloc = r"\.(byte\|res\|dbyte\|word\|addr\|faraddr\|dword\|asciiz?)(([, ]+(?:0x[a-fA-F0-9]+\|0b[01]+\|\d+\|\w+\|\"[^\n]?[^\\\n]\")[ \n])*)"`
add docs with doxygen using custom filter 2024-08-08 00:11:15 +02:00			`storage_alloc_arg = r"(0x[a-fA-F0-9]+\|0b[01]+\|\d+\|\w+\|\"[^\n]*[^\\\n]\")"`

			`args = []`
			`allocs = []`
			`for alloc_match in re.finditer(storage_alloc, l_allocs):`
			`allocs.append(alloc_match)`
			`alloc_args = alloc_match.groups()[1]`
			`if alloc_args:`
			`args += re.findall(storage_alloc_arg, alloc_args)`

improve documentation 2024-08-08 20:15:50 +02:00			`# pdebug(f"Storage label {l_name} with allocs '{[ma.group() for ma in allocs]}' and args '{args}'\n\t{m.groups()}")`
add docs with doxygen using custom filter 2024-08-08 00:11:15 +02:00			`s = ""`
			`in_namespace = False`
improve documentation 2024-08-08 20:15:50 +02:00			`# if the label is exported, put it in a namespace`
add docs with doxygen using custom filter 2024-08-08 00:11:15 +02:00			`if l_name in exported_names:`
			`in_namespace = True`
			`namespace = exported_names[l_name]`
improve documentation 2024-08-08 20:15:50 +02:00			`s += f"namespace {namespace}" + " {"`
			`# docs after the namespace, otherwise they document the namespace`
			`if l_docs:`
			`s += l_docs`
			`# put the single line comment into a /** */ comment in front of the declaration`
			`if l_docs2:`
			`s += "/** "`
			`if not "brief" in l_docs2: s += "@brief "`
			`s += f"{l_docs2.strip(';')} */ "`
			`# completely ignoring the type of the storage allocation here`
add docs with doxygen using custom filter 2024-08-08 00:11:15 +02:00			`if len(args) > 1:`
			`if all(arg.startswith("\"") for arg in args):`
			`s += f'char* {l_name} = "' + "".join(map(lambda x: x.strip('"'), args)) + '"'`
			`else:`
improve documentation 2024-08-08 20:15:50 +02:00			`s += f"bytes {l_name}[{len(args)}] = " + "{"`
add docs with doxygen using custom filter 2024-08-08 00:11:15 +02:00			`for arg in args:`
			`s += arg + ","`
improve documentation 2024-08-08 20:15:50 +02:00			`s = s.strip(",") + "}"`
add docs with doxygen using custom filter 2024-08-08 00:11:15 +02:00			`else:`
			`l_type = allocs[0].groups()[0]`
			`if len(args) == 0:`
			`l_arg = None`
			`else:`
			`l_arg = args[0]`
improve documentation 2024-08-08 20:15:50 +02:00			`# if res: use bytes[length] as type`
add docs with doxygen using custom filter 2024-08-08 00:11:15 +02:00			`if l_type == "res":`
			`l_type = f"bytes[{l_arg}]"`
			`l_arg = None`
improve documentation 2024-08-08 20:15:50 +02:00			`# else use type* as type`
add docs with doxygen using custom filter 2024-08-08 00:11:15 +02:00			`else: l_type += "*"`
			`s += f"{l_type} {l_name}"`
			`if l_arg:`
			`s += f" = {l_arg}"`
			`s += ";"`
			`if in_namespace: s += "} // namespace"`
			`s += m.group().count('\n') * '\n' # make sure the #lines is the same`
			`# pdebug(args, "\n---\n", s)`
			`return s`

			`patterns: dict[str, str\|Callable[[re.Match], str]] = {`
			`r"\@(?:macro\|function)": "@brief",`
			`r"^\.scope ([a-zA-Z0-9_]+)": r"namespace \1 {",`
			`# r"^\.macro ([a-zA-Z0-9_]+)(.*)?": r"macro \1(\2 \2); ",`
			`# r"^\.end(?:macro)": "",`
			`r"^\.end(?:scope)": "}",`
			`r"^\.(include)": r"#\1",`
			`r"^(Export(?:Zp)?) (\w+)((?: , \w+)+)": handle_export,`
			`r"^(Import(?:Zp)?) (\w+)((?: , \w+)+)": "",`
			`r"(?<!^;)\$([A-Fa-f0-9_]+)": r"0x\1", # $HEX -> 0xHEX except in comments`
			`r"(?<!^;)%([01_]+)": r"0b\1", # %BIN -> 0bBIN except in comments`
improve documentation 2024-08-08 20:15:50 +02:00			`r"^((?:;;.\n)) \.(proc\|enum\|macro) (\w+)(.?)\n((?:.\|\n)?)\.end(proc\|enum\|macro).": handle_procedure,`
			`r"^((?:;;.\n)) (\w+):((?:\s\.(?:byte\|res\|dbyte\|word\|addr\|faraddr\|dword\|asciiz?)(?:[, ]+(?:0x[a-fA-F0-9]+\|0b[01]+\|\d+\|\w+\|\"[^\n][^\\\n]\")[ \n]))+)(;;.)?": handle_storage_label,`
			`r"^INCLUDE_[A-Z0-9_]+ = 1$": r"", # Include guard variables`
add docs with doxygen using custom filter 2024-08-08 00:11:15 +02:00			`r";;": "//!", # C++ comments`
			`# TODO this is currently case sensitive`
			`r"(?<!^;)( \w+ = [^;,\n]+?) (//.*)?$": r"\1;\2", # semicolons after assignments, except in comments and when they already end with a comma or semicolon. Also preserve comments after the assignment`
			`r"^([^\n;]*)(?<!\w)\.(\w+)": r"\1// #\2", # all .preprocessor commands`

			`}`

			`compiled_patterns = []`
			`for k,v in patterns.items():`
			`compiled_patterns.append((re.compile(k), v))`

			`resub_patterns: dict[str, str\|Callable[[re.Match], str]] = {`
			`r"(?<!^;;)[ \t\r\f\v]+": " ", # turn all spaces into single whitespace except if in doxygen comment`
			`r"^((?:[^\"\n;]\|\|[^\"\n;]\"(?:[^\"\n]\|\\\")+\")+);(?!;).": r"\1", # remove normal comments, detect strings`
			`r"^;;\+": ";;", # remove ;;** comments`
			`r"[ \t\r\f\v]+$": "", # remove trailing spaces print(file_content)`

			`}`
			`for pat, subst in resub_patterns.items():`
			`file_content = re.sub(pat, subst, file_content, 0, re.MULTILINE)`
			`for pat,subst in patterns.items():`
			`(file_content, n_subst) = re.subn(pat, subst, file_content, 0, re.MULTILINE)`
			`return file_content`

			`def main():`
			`global filename`
			`if len(sys.argv) != 2:`
			`print("Usage: python doxy-asm65.py <input_file>")`
			`sys.exit(1)`

			`filename = sys.argv[1]`

			`with open(filename, 'r') as file:`
			`file_content = file.read()`

			`transformed_content = parse_custom_language(file_content)`

			`print(transformed_content)`

			`if __name__ == "__main__":`
			`main()`