From 3180892cc7db2f89c0af20e8ec4919f88212c6d6 Mon Sep 17 00:00:00 2001 From: "matthias@arch" Date: Thu, 23 Nov 2023 15:44:47 +0100 Subject: [PATCH] add sitemap.xml support --- Makefile | 80 ++++++++++++++---- README.md | 62 ++++++++++++++ html-preprocessor | 209 +++++++++++++++++++++++++++++++++++++--------- 3 files changed, 297 insertions(+), 54 deletions(-) diff --git a/Makefile b/Makefile index db2a6f7..b419ef8 100644 --- a/Makefile +++ b/Makefile @@ -17,46 +17,77 @@ # change these to fir your project # -# root dir for the project, all other paths relative to PROJECT_DIR (except for OUT_DIR and DEP_DIR) +# root dir for the project, most other paths are relative to PROJECT_DIR +# [absolute or relative to current working directory] PROJECT_DIR = src -# path where final website will be in, this one is not relative to PROJECT_DIR +# path where final website will be in +# [absolute or relative to current working directory] OUT_DIR = build # SOURCE FILES: # all SRC_FLS and all files (recursively) in the SRC_DIRS will be built # all files in PROJECT_DIR (not recursively) are source files +# [relative to PROJECT_DIR] SRC_DIRS = de en script SRC_FLS = # CSS FILES: # directories which may contain sass and scss to compile sass to a correspondig css in OUT_DIR/CSS_DIR (also css, it will simply be copied) +# [relative to PROJECT_DIR] CSS_DIRS = style CSS_FILES = # RESOURCE FILES: # all RESOURCE_FLS and all files in the RESOURCE_DIRS will be copied to OUT_DIR +# [relative to PROJECT_DIR] RESOURCE_DIRS = resources RESOURCE_FLS = # THUMBNAILS: -# if set, thumbnails for all resource files will be generated and placed in THUMB_OUT_DIR (relative to OUT_DIR) +# if set, thumbnails for all resource files having an extension in THUMB_FOR_TYPES will be generated and placed relative to THUMB_OUT_DIR +# [relative to OUT_DIR] THUMB_OUT_DIR = thumbs +# build thumbnails for these types: supported: mp3, flac, wav, pdf and all image formats that magick can handle +THUMB_FOR_TYPES = png gif 
jpg jpeg webp pdf mp4 mp3 flac wav +# filetype for the thumbnails. (pdfs will always have .jpg) +THUMB_TYPE = jpg +# size for the thumbnails (not respected by pdf) +THUMB_SIZE = 300 # MULTI-LANG SOURCE FILES: # the files in COMMON_DIR will be built for all LANGS: +# for example: +# LANGS = de en +# PROJECT_DIR/COMMON_DIR/home.html +# -> OUT_DIR/de/home.html +# -> OUT_DIR/en/home.html # foreach html-file in COMMON_DIR: # foreach lang in LANGS: # run HTML_PP_CMD with --var lang=lang on file and output to OUT_DIR without the COMMON_DIR prefix, so COMMON_DIR/subdir/file.html -> OUT_DIR/lang/subdir/file.html -# all non-html files will handled the same way, but without the preprocessor being run on them. They are simply copied +# For all .html files, the preprocessor will make the variable `lang` available, for example lang=de +# All non-html files will be handled the same way, but without the preprocessor being run on them. They are simply copied. # leave COMMON_DIR empty to disable multi-lang feature -COMMON_DIR = +# [relative to PROJECT_DIR] +COMMON_DIR = common LANGS = de en +# SITEMAP +# sitemap relative to OUT_DIR, leave blank to not generate a sitemap [relative to OUT_DIR] +SITEMAP = sitemap.xml +# base url of the website, without trailing / +WEBSITE_URL = https://quintern.xyz +# file required during build process for sitemap generation [absolute or relative to current working directory] +SITEMAP_TEMP_FILE = .sitemap.pkl +# comment out to keep the file extension on sitemap entries +SITEMAP_REMOVE_EXT = 1 + # PREPROCESSOR # path to of the files that should be included +# [relative to PROJECT_DIR] INCLUDE_DIR = include # additional search paths passed to sass compiler +# [relative to PROJECT_DIR] SASS_INCLUDE_DIRS = include/style @@ -68,10 +99,10 @@ HTML_PP_CMD = python3 html-preprocessor --exit-on light # --source-maps-urls=absolute is appended for generating dependency files SASS_CMD = sass --color +# [absolute or relative to current working directory] DEP_DIR = 
.dependencies - # # NOT SETTINGS ANYMORE # DO NOT CHANGE ANYTHING HERE UNLESS YOU KNOW WHAT YOU ARE DOING! @@ -121,10 +152,8 @@ ML_OUT_FLS = $(foreach lang, $(LANGS), $(patsubst $(_COMMON_DIR)/%, $(ML_OUT_D endif ifdef THUMB_OUT_DIR -_THUMB_FOR_TYPES = png gif jpg jpeg webp pdf -_THUMB_TYPE = jpg # files for which to generate thumbnails -_THUMB_FLS = $(filter $(foreach type, $(_THUMB_FOR_TYPES), %.$(type)), $(_RES_FLS)) +_THUMB_FLS = $(filter $(foreach type, $(THUMB_FOR_TYPES), %.$(type)), $(_RES_FLS)) THUMB_OUT_FLS = $(addsuffix .jpg, $(basename $(patsubst $(PROJECT_DIR)/%, $(OUT_DIR)/$(THUMB_OUT_DIR)/%, $(_THUMB_FLS)))) THUMB_OUT_DIRS = $(sort $(dir $(THUMB_OUT_FLS))) # sort for removing duplicates endif @@ -134,6 +163,14 @@ _DEP_DIRS = $(sort $(patsubst $(OUT_DIR)/%, $(DEP_DIR)/%, $(OUT_DIRS) $(ML_OUT # needed for reading _DEP_FLS = $(shell find $(DEP_DIR) -type f -name '*.d' 2>/dev/null) +ifdef SITEMAP + _SITEMAP = $(addprefix $(OUT_DIR)/, $(SITEMAP)) + HTML_PP_CMD += --sitemap-temp-file "$(SITEMAP_TEMP_FILE)" --sitemap-base-url $(WEBSITE_URL) --sitemap-webroot-dir "$(OUT_DIR)" +endif +ifdef SITEMAP_REMOVE_EXT + HTML_PP_CMD += --sitemap-remove-ext +endif + # SASS, add load-paths _SASS_CMD = $(SASS_CMD) $(foreach includedir, $(_SASS_INCLUDE_DIRS), --load-path=$(includedir)) --source-map-urls=absolute @@ -145,6 +182,7 @@ FMT_OUT_HTML ="\e[1;34mBuilding html\e[0m: \e[1;33m%s\e[0m at \e[1;35m%s\e[0m\n" FMT_OUT_CSS ="\e[1;34mBuilding css\e[0m: \e[1;33m%s\e[0m at \e[1;35m%s\e[0m\n" FMT_OUT_THUMB ="\e[1;34mBuilding thumbnail\e[0m: \e[1;33m%s\e[0m at \e[1;35m%s\e[0m\n" FMT_OUT_OTHER ="\e[1;34mBuilding\e[0m: \e[1;33m%s\e[0m at \e[1;35m%s\e[0m\n" +FMT_OUT_SITEMAP ="\e[1;34mBuilding sitemap\e[0m: \e[1;35m%s\e[0m\n" FMT_OUT_ML_HTML="\e[1;34mBuilding html\e[0m in lang \e[1;34m%s\e[0m: \e[1;33m%s\e[0m at \e[1;35m%s\e[0m\n" FMT_OUT_ML_OTHER="\e[1;34mBuilding\e[0m in lang \e[1;34m%s\e[0m: \e[1;33m%s\e[0m at \e[1;35m%s\e[0m\n" @@ -159,7 +197,7 @@ 
FMT_OUT_ML_OTHER="\e[1;34mBuilding\e[0m in lang \e[1;34m%s\e[0m: \e[1;33m%s\e[0m include $(_DEP_FLS) all: normal multilang resources thumbnails -normal: $(OUT_FLS) +normal: $(_SITEMAP) $(OUT_FLS) multilang: $(ML_OUT_FLS) resources: $(RES_OUT_FLS) thumbnails: $(THUMB_OUT_FLS) @@ -217,11 +255,19 @@ $(OUT_DIR)/$(THUMB_OUT_DIR)/%.jpg: | $(THUMB_OUT_DIRS) sources=($(_THUMB_FLS)); \ source=$$(printf "%s\n" $${sources[@]} | grep "$$target"'\.'); \ printf $(FMT_OUT_THUMB) "$$source" "$$fulltarget"; \ - if [ "$${source##*.}" = "pdf" ]; then \ - pdftoppm -f 1 -singlefile -jpeg -r 50 "$$source" "$${fulltarget%.*}"; \ - else \ - magick "$$source" -thumbnail '100x100>' "$@"; \ - fi; \ + case "$${source##*.}" in \ + "mp4-") ffmpegthumbnailer -i "$$source" -o "$$fulltarget" -s 300 -q 5;; \ + "pdf") pdftoppm -f 1 -singlefile -jpeg -r 50 "$$source" "$${fulltarget%.*}";; \ + "mp3"|"flac"|"wav") ffmpeg -hide_banner -i "$$source" "$$fulltarget" -y >/dev/null;; \ + *) magick "$$source[0]" -thumbnail '$(THUMB_SIZE)x$(THUMB_SIZE)>' "$@";; \ + esac + +# SITEMAP +ifdef _SITEMAP +$(_SITEMAP): $(OUT_FLS) $(ML_OUT_FLS) # build sitemap after all other files + @printf $(FMT_OUT_SITEMAP) "$@" + @$(HTML_PP_CMD) --sitemap-generate "$@" +endif # @@ -249,11 +295,13 @@ $(OUT_DIR)/%.css: $(PROJECT_DIR)/%.scss | $(OUT_DIRS) $(_DEP_DIRS) jq -r '.sources | @sh' $@.map | tr -d \' | sed 's|file://||g' >> "$$depfile"; \ rm $@.map +# this rule must be last! $(OUT_DIR)/%: $(PROJECT_DIR)/% | $(OUT_DIRS) $(RES_OUT_DIRS) @printf $(FMT_OUT_OTHER) "$<" "$@" @cp -r $< $@ + # .DEFAULT: # @echo "MISSING RULE: $@" @@ -264,7 +312,7 @@ stop: killall nginx clean: - -@rm $(OUT_FLS) $(ML_OUT_FLS) 2>/dev/null + -@rm $(OUT_FLS) $(ML_OUT_FLS) $(SITEMAP_TEMP_FILE) $(_SITEMAP) 2>/dev/null -@rm -r $(DEP_DIR) 2>/dev/null cleaner: diff --git a/README.md b/README.md index d6f3eea..eb11c98 100644 --- a/README.md +++ b/README.md @@ -174,6 +174,7 @@ An entry is a html heading with a id: `

This heading will be lin `` sidenav-command must be one of the following: + #### `include` Include the generated sidenav at this position. This command will always be executed last, after the whole file has been parsed. @@ -183,6 +184,7 @@ Ignored **Return Value**: The generated sidenav + #### `section` Group all following entries in named section. `section` may not appear in conditional blocks and multiline comments. @@ -192,6 +194,7 @@ The name of the section **Return Value** Empty string + #### `name` Use a custom name instead of the heading itself for the link to the next heading. @@ -201,6 +204,7 @@ The link-name of the next heading **Return Value**: Empty string + #### `custom` Include a custom link in the sidenav. @@ -215,6 +219,64 @@ Empty string --- +### sitemap +Used for automatically generating a `sitemap.xml` for the website. + +#### `include` +Include the current page in the sitemap + +**Synopsis**: +`<!-- #sitemap include -->` +`<!-- #sitemap include link -->` + +**Argument**: +Optional: Use a different link for this page + +**Return Value**: +Empty string + + +#### `priority` +Set the `priority` field + +**Synopsis**: +`<!-- #sitemap priority priority -->` + +**Argument**: +Float between 0.0 and 1.0 + +**Return Value**: +Empty string + + +#### `changefreq` +Set the `changefreq` field + +**Synopsis**: +`<!-- #sitemap changefreq changefreq -->` + +**Argument**: +One of *always, hourly, daily, weekly, monthly, yearly, never* + +**Return Value**: +Empty string + + +#### `lastmod` +Set the `lastmod` field + +**Synopsis**: +`<!-- #sitemap lastmod lastmod -->` + +**Argument**: +The lastmod date in w3c date format + +**Return Value**: +Empty string + +--- + + ## Pitfalls - The `#include` command must not be in the last line of the file - The `#include` command can not be in multi-line comment if the included file also contains comments diff --git a/html-preprocessor b/html-preprocessor index c9c8d63..fd80b83 100755 --- a/html-preprocessor +++ b/html-preprocessor @@ -5,6 +5,7 @@ import re from sys import argv from collections.abc import Callable import argparse +import pickle """ TODO: @@ -27,6 +28,11 @@ 
sidenav_content_section = "
  • #name
  • " exit_on_include_failure = False +sitemap_begin = """\ +<?xml version="1.0" encoding="UTF-8"?> +<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n""" +sitemap_end = "</urlset>" + """ ************************************************************ REGULAR EXPRESSIONS ************************************************************ """ @@ -47,6 +53,10 @@ re_variable_use = r"#\$\(([a-zA-Z0-9_]+)\)" """ only in comments """ re_preprocessor_command = r"#([a-zA-Z]+) *(.*) *" +# https://www.w3.org/TR/NOTE-datetime +# accepts YYYY-MM-DD, optionally followed by Thh:mm:ss and a +hh:mm / -hh:mm offset +re_w3cdate = r"\d{4}-(?:0[1-9]|1[0-2])-(?:[0-2]\d|3[01])(T(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d([\+\-](?:0\d|1[0-2]):[0-5]\d)?)?" + COMMENT_BEGIN = "" @@ -67,6 +77,9 @@ error_levels = { } exit_on_error_level = error_levels["serious"] +# url of the file that is currently being processed +current_file_url = "" + """ ************************************************************ UTILITY ************************************************************ @@ -122,6 +135,86 @@ def evaluate_condition(input_string) -> bool: error(f"Pythonized condition is invalid: {condition}", level=error_levels["light"]) return False +""" +************************************************************ SITEMAP ************************************************************ +""" +class Sitemap: # one <url> entry of the sitemap + urls:dict = {} # registry shared by all instances: url of the processed file -> Sitemap entry + def __init__(self, url=None): + self.url = url + self.priority = None + self.changefreq = None + self.lastmod = None + + def set_url(self, url): + self.url = url + + def set_priority(self, priority): + try: + value = float(priority) + except (TypeError, ValueError): + value = None + if value is None or not 0.0 <= value <= 1.0: + return error(f"Sitemap: invalid priority: '{priority}'", level=error_levels["serious"]) # report once, do not store the invalid value + self.priority = value + + def set_changefreq(self, changefreq): + if not (isinstance(changefreq, str) and changefreq in ["always", "hourly", "daily", "weekly", "monthly", "yearly", "never"]): + return error(f"Sitemap: invalid changefreq: '{changefreq}'", level=error_levels["serious"]) + 
self.changefreq = changefreq + + def set_lastmod(self, lastmod): + if not (isinstance(lastmod, str) and re.fullmatch(re_w3cdate, lastmod)): + return error(f"Sitemap: invalid lastmod: '{lastmod}'", level=error_levels["serious"]) + self.lastmod = lastmod + + def get_entry(self): + s = f"<url>\n\t<loc>{self.url}</loc>" + if self.priority is not None: s += f"\n\t<priority>{self.priority}</priority>" + if self.changefreq is not None: s += f"\n\t<changefreq>{self.changefreq}</changefreq>" + if self.lastmod is not None: s += f"\n\t<lastmod>{self.lastmod}</lastmod>" + s += "\n</url>" + return s + + def __repr__(self) -> str: + return f"Sitemap(url={self.url}, priority={self.priority}, changefreq={self.changefreq}, lastmod={self.lastmod})" + + @staticmethod + def gen_sidemap(): + s = sitemap_begin + for url in filter(lambda entry: entry.url is not None, Sitemap.urls.values()): # pages without "#sitemap include" have no url and are skipped + s += "\t" + url.get_entry().replace("\n", "\n\t").strip("\t") + "\n" + s += sitemap_end + return s + + @staticmethod + def cmd_sitemap(args:str, variables:dict[str,str]) -> str: + space = args.find(" ") + if space < 0: + space = len(args) + cmd = args[:space] + cmd_args = "" + + if 0 < space and space < len(args) - 1: + cmd_args = args[space+1:].strip(" ") + pdebug(f"cmd_sitemap: cmd='{cmd}' cmd_args='{cmd_args}'") + if not current_file_url in Sitemap.urls: + Sitemap.urls[current_file_url] = Sitemap() + if cmd == "include": + if cmd_args: + Sitemap.urls[current_file_url].set_url(cmd_args) + else: + Sitemap.urls[current_file_url].set_url(current_file_url) + elif cmd == "priority": + Sitemap.urls[current_file_url].set_priority(cmd_args) + elif cmd == "changefreq": + Sitemap.urls[current_file_url].set_changefreq(cmd_args) + elif cmd == "lastmod": + Sitemap.urls[current_file_url].set_lastmod(cmd_args) + else: + error(f"cmd_sitemap: Invalid command '{cmd}'", error_levels["serious"]) + ptrace(f"Sitemap[{current_file_url}] is now: {Sitemap.urls[current_file_url]}") + return "" """ @@ -182,9 +275,9 @@ class Sidenav: space = len(args) cmd = args[:space] cmd_args = "" - pdebug(f"cmd_sidenav: cmd='{cmd}' cmd_args='{cmd_args}'") if 0 < space and space 
< len(args) - 1: cmd_args = args[space+1:].strip(" ") + pdebug(f"cmd_sidenav: cmd='{cmd}' cmd_args='{cmd_args}'") if cmd == "skip": Sidenav.skipNext() elif cmd == "section": @@ -354,6 +447,7 @@ command2function:dict[str, Callable[[str, dict[str,str]], str]] = { "comment": cmd_comment, "uncomment": cmd_uncomment, "sidenav": Sidenav.cmd_sidenav, + "sitemap": Sitemap.cmd_sitemap, "warning": cmd_warning, "error": cmd_error, } @@ -620,15 +714,20 @@ def substitute_variables(html:str, variables:dict[str, str]): """ if __name__ == "__main__": parser = argparse.ArgumentParser(prog="bUwUma html preprocessor") - parser.add_argument("--input", action="store", help="path to the input file", required=True) + parser.add_argument("--input", action="store", help="path to the input file", default="") parser.add_argument("--output", action="store", help="output to this file", default="") parser.add_argument("--inplace", action="store_true", help="overwrite input file") + parser.add_argument("--preserve-comments", action="store_true", help="do not remove normal html comments", default=False) parser.add_argument("--var", action="append", help="set a variable --var varname=value", default=[]) parser.add_argument("--output-deps", action="store", help="output a Makefile listing all dependencies", default="") + parser.add_argument("--sitemap-generate", action="store", help="generate the sitemap from the sitemap-temp-file", default="") + parser.add_argument("--sitemap-temp-file", action="store", help="file for storing sitemap data during build process", default="/tmp/sitemap.pkl") + parser.add_argument("--sitemap-webroot-dir", action="store", help="directory of the webroot, without trailing /. 
This will be removed from the output path for generating the sitemap url entry", default="") + parser.add_argument("--sitemap-base-url", action="store", help="base url of the website, without trailing /", default="https://www.example.com") + parser.add_argument("--sitemap-remove-ext", action="store_true", help="remove the file extenstion in the sitemap entry") parser.add_argument("--exit-on", action="store", help="exit when an error of the given severity occures", choices=["light", "serious", "critical"], default="serious") parser.add_argument("--debug", action="store_true", help="be more verbose", default=False) parser.add_argument("--trace", action="store_true", help="be extremly verbose", default=False) - parser.add_argument("--preserve-comments", action="store_true", help="do not remove normal html comments", default=False) variables:dict[str, str] = {} args = parser.parse_args() @@ -643,47 +742,81 @@ if __name__ == "__main__": args.input = args.input.strip(" ") args.output = args.output.strip(" ") args.output_deps = args.output_deps.strip(" ") + args.sitemap_temp_file = args.sitemap_temp_file.strip(" ") + args.sitemap_generate = args.sitemap_generate.strip(" ") TRACE = args.trace if args.trace: args.debug = True DEBUG = args.debug - # sanity checks - if not path.isfile(args.input): - parser.error(f"Invalid input file:: {args.input}") - if args.output: - if not path.isdir(path.dirname(args.output)): - parser.error(f"Invalid path to output file - directory does not exist: '{path.dirname(args.output)}'") - elif args.inplace: - args.output = args.input - if args.inplace and args.output: - parser.error(f"Only one of --output or --inplace mut be given") - if args.output_deps: - if not path.isdir(path.dirname(args.output_deps)): - parser.error(f"Invalid path to dependency file - directory does not exist: '{path.dirname(args.output_deps)}'") - if not args.output: - parser.error(f"--output-deps requires either --output our --inplace") + # either input file or 
sitemap_generate is required + if not (bool(args.input) ^ bool(args.sitemap_generate)): + parser.error(f"Exactly one of --input or --sitemap-generate must be given") - # get html - with open(args.input, "r") as file: - target_html = file.read() + if args.input: + if args.sitemap_webroot_dir: + current_file_url = args.sitemap_base_url + args.output.replace(args.sitemap_webroot_dir, "", 1) # strip the webroot only once, not every occurrence in the path + else: + current_file_url = args.sitemap_base_url + args.output - output_html = parse_file(target_html, variables, not args.preserve_comments) - # remove empty lines - output_html = re.sub(r"[\t\r ]*\n(?:[\t\r ]*\n[\t\r ]*)+", r"\n", output_html) + if args.sitemap_remove_ext: + current_file_url = os.path.splitext(current_file_url)[0] - # pdebug(f"Output: {output_html}") + pdebug(f"current_file={current_file_url}") - # save - if args.output: - with open(args.output, "w") as file: - file.write(output_html) - else: - print(output_html) + # sanity checks + if not path.isfile(args.input): + parser.error(f"Invalid input file:: {args.input}") + if args.inplace and args.output: # must be checked before --inplace fills in args.output + parser.error(f"Only one of --output or --inplace must be given") + if args.output: + if not path.isdir(path.dirname(args.output)): + parser.error(f"Invalid path to output file - directory does not exist: '{path.dirname(args.output)}'") + elif args.inplace: + args.output = args.input + if args.output_deps: + if not path.isdir(path.dirname(args.output_deps)): + parser.error(f"Invalid path to dependency file - directory does not exist: '{path.dirname(args.output_deps)}'") + if not args.output: + parser.error(f"--output-deps requires either --output or --inplace") - if args.output_deps: - if args.output != args.input: - glob_dependcies.append(args.input) - depfile = generate_dependecy_file(args.output, glob_dependcies) - pdebug(f"Writing dependency file to {os.path.abspath(args.output_deps)}: {depfile}") - with open(args.output_deps, "w") as file: - file.write(depfile) + if args.sitemap_temp_file: + if 
path.isfile(args.sitemap_temp_file): + with open(args.sitemap_temp_file, "rb") as file: + Sitemap.urls = pickle.load(file) + + # get html + with open(args.input, "r") as file: + target_html = file.read() + + output_html = parse_file(target_html, variables, not args.preserve_comments) + # remove empty lines + output_html = re.sub(r"[\t\r ]*\n(?:[\t\r ]*\n[\t\r ]*)+", r"\n", output_html) + + # pdebug(f"Output: {output_html}") + + # save + if args.output: + with open(args.output, "w") as file: + file.write(output_html) + else: + print(output_html) + + if args.output_deps: + if args.output != args.input: + glob_dependcies.append(args.input) + depfile = generate_dependecy_file(args.output, glob_dependcies) + pdebug(f"Writing dependency file to {os.path.abspath(args.output_deps)}: {depfile}") + with open(args.output_deps, "w") as file: + file.write(depfile) + if args.sitemap_temp_file: + with open(args.sitemap_temp_file, "wb") as file: + pickle.dump(Sitemap.urls, file) + else: # sitemap_generate + if not path.isfile(args.sitemap_temp_file): + parser.error(f"Invalid sitemap-temp-file: '{args.sitemap_temp_file}'") + with open(args.sitemap_temp_file, "rb") as file: + Sitemap.urls = pickle.load(file) + sitemap = Sitemap.gen_sidemap() + pdebug(f"Writing sitemap to {os.path.abspath(args.sitemap_generate)}") + with open(args.sitemap_generate, "w") as file: + file.write(sitemap)