add sitemap.xml support

This commit is contained in:
matthias@arch 2023-11-23 15:44:47 +01:00
parent bdc3522f7f
commit 3180892cc7
3 changed files with 297 additions and 54 deletions

View File

@ -17,46 +17,77 @@
# change these to fit your project # change these to fit your project
# #
# root dir for the project, all other paths relative to PROJECT_DIR (except for OUT_DIR and DEP_DIR) # root dir for the project, most other paths are relative to PROJECT_DIR
# [absolute or relative to current working directory]
PROJECT_DIR = src PROJECT_DIR = src
# path where final website will be in, this one is not relative to PROJECT_DIR # path where final website will be in
# [absolute or relative to current working directory]
OUT_DIR = build OUT_DIR = build
# SOURCE FILES: # SOURCE FILES:
# all SRC_FLS and all files (recursively) in the SRC_DIRS will be built # all SRC_FLS and all files (recursively) in the SRC_DIRS will be built
# all files in PROJECT_DIR (not recursively) are source files # all files in PROJECT_DIR (not recursively) are source files
# [relative to PROJECT_DIR]
SRC_DIRS = de en script SRC_DIRS = de en script
SRC_FLS = SRC_FLS =
# CSS FILES: # CSS FILES:
# directories which may contain sass and scss to compile sass to a correspondig css in OUT_DIR/CSS_DIR (also css, it will simply be copied) # directories which may contain sass and scss to compile sass to a correspondig css in OUT_DIR/CSS_DIR (also css, it will simply be copied)
# [relative to PROJECT_DIR]
CSS_DIRS = style CSS_DIRS = style
CSS_FILES = CSS_FILES =
# RESOURCE FILES: # RESOURCE FILES:
# all RESOURCE_FLS and all files in the RESOURCE_DIRS will be copied to OUT_DIR # all RESOURCE_FLS and all files in the RESOURCE_DIRS will be copied to OUT_DIR
# [relative to PROJECT_DIR]
RESOURCE_DIRS = resources RESOURCE_DIRS = resources
RESOURCE_FLS = RESOURCE_FLS =
# THUMBNAILS: # THUMBNAILS:
# if set, thumbnails for all resource files will be generated and placed in THUMB_OUT_DIR (relative to OUT_DIR) # if set, thumbnails for all resource files having an extension in THUMB_FOR_TYPES will be generated and placed relative to THUMB_OUT_DIR
# [relative to OUT_DIR]
THUMB_OUT_DIR = thumbs THUMB_OUT_DIR = thumbs
# build thumbnails only for resource files with one of these extensions
# supported: mp3, flac, wav, pdf and all image formats that magick can handle
# NOTE(review): mp4 is listed here, but the case label in the thumbnail recipe is "mp4-" - confirm whether mp4 is meant to be enabled
THUMB_FOR_TYPES = png gif jpg jpeg webp pdf mp4 mp3 flac wav
# filetype for the thumbnails. (pdfs will always have .jpg)
# NOTE(review): THUMB_OUT_FLS and the thumbnail rule target use a fixed .jpg suffix - confirm that this setting is honored
THUMB_TYPE = jpg
# edge length for the thumbnails, passed to magick as THUMB_SIZExTHUMB_SIZE (not respected by pdf)
THUMB_SIZE = 300
# MULTI-LANG SOURCE FILES: # MULTI-LANG SOURCE FILES:
# the files in COMMON_DIR will be built for all LANGS: # the files in COMMON_DIR will be built for all LANGS:
# for example:
# LANGS = de en
# PROJECT_DIR/COMMON_DIR/home.html
# -> OUT_DIR/de/home.html
# -> OUT_DIR/en/home.html
# foreach html-file in COMMON_DIR: # foreach html-file in COMMON_DIR:
# foreach lang in LANGS: # foreach lang in LANGS:
# run HTML_PP_CMD with --var lang=lang on file and output to OUT_DIR without the COMMON_DIR prefix, so COMMON_DIR/subdir/file.html -> OUT_DIR/lang/subdir/file.html # run HTML_PP_CMD with --var lang=lang on file and output to OUT_DIR without the COMMON_DIR prefix, so COMMON_DIR/subdir/file.html -> OUT_DIR/lang/subdir/file.html
# all non-html files will handled the same way, but without the preprocessor being run on them. They are simply copied # For all .html files, the proprocessor will make the variable `lang` available, for example lang=de
# All non-html files will be handled the same way, but without the preprocessor being run on them. They are simply copied.
# leave COMMON_DIR empty to disable multi-lang feature # leave COMMON_DIR empty to disable multi-lang feature
COMMON_DIR = # [relative to PROJECT_DIR]
COMMON_DIR = common
LANGS = de en LANGS = de en
# SITEMAP
# output path of the generated sitemap, leave blank to not generate a sitemap
# [relative to OUT_DIR]
SITEMAP = sitemap.xml
# base url of the website, without trailing /
# (passed to the preprocessor as --sitemap-base-url)
WEBSITE_URL = https://quintern.xyz
# file required during build process for sitemap generation
# (passed to the preprocessor as --sitemap-temp-file)
# [absolute or relative to current working directory]
SITEMAP_TEMP_FILE = .sitemap.pkl
# if set, --sitemap-remove-ext is passed to the preprocessor
# comment this line out to keep the file extension on sitemap entries
SITEMAP_REMOVE_EXT = 1
# PREPROCESSOR # PREPROCESSOR
# path to of the files that should be included # path to of the files that should be included
# [relative to PROJECT_DIR]
INCLUDE_DIR = include INCLUDE_DIR = include
# additional search paths passed to sass compiler # additional search paths passed to sass compiler
# [relative to PROJECT_DIR]
SASS_INCLUDE_DIRS = include/style SASS_INCLUDE_DIRS = include/style
@ -68,10 +99,10 @@ HTML_PP_CMD = python3 html-preprocessor --exit-on light
# --source-maps-urls=absolute is appended for generating dependency files # --source-maps-urls=absolute is appended for generating dependency files
SASS_CMD = sass --color SASS_CMD = sass --color
# [absolute or relative to current working directory]
DEP_DIR = .dependencies DEP_DIR = .dependencies
# #
# NOT SETTINGS ANYMORE # NOT SETTINGS ANYMORE
# DO NOT CHANGE ANYTHING HERE UNLESS YOU KNOW WHAT YOU ARE DOING! # DO NOT CHANGE ANYTHING HERE UNLESS YOU KNOW WHAT YOU ARE DOING!
@ -121,10 +152,8 @@ ML_OUT_FLS = $(foreach lang, $(LANGS), $(patsubst $(_COMMON_DIR)/%, $(ML_OUT_D
endif endif
ifdef THUMB_OUT_DIR ifdef THUMB_OUT_DIR
_THUMB_FOR_TYPES = png gif jpg jpeg webp pdf
_THUMB_TYPE = jpg
# files for which to generate thumbnails # files for which to generate thumbnails
_THUMB_FLS = $(filter $(foreach type, $(_THUMB_FOR_TYPES), %.$(type)), $(_RES_FLS)) _THUMB_FLS = $(filter $(foreach type, $(THUMB_FOR_TYPES), %.$(type)), $(_RES_FLS))
THUMB_OUT_FLS = $(addsuffix .jpg, $(basename $(patsubst $(PROJECT_DIR)/%, $(OUT_DIR)/$(THUMB_OUT_DIR)/%, $(_THUMB_FLS)))) THUMB_OUT_FLS = $(addsuffix .jpg, $(basename $(patsubst $(PROJECT_DIR)/%, $(OUT_DIR)/$(THUMB_OUT_DIR)/%, $(_THUMB_FLS))))
THUMB_OUT_DIRS = $(sort $(dir $(THUMB_OUT_FLS))) # sort for removing duplicates THUMB_OUT_DIRS = $(sort $(dir $(THUMB_OUT_FLS))) # sort for removing duplicates
endif endif
@ -134,6 +163,14 @@ _DEP_DIRS = $(sort $(patsubst $(OUT_DIR)/%, $(DEP_DIR)/%, $(OUT_DIRS) $(ML_OUT
# needed for reading # needed for reading
_DEP_FLS = $(shell find $(DEP_DIR) -type f -name '*.d' 2>/dev/null) _DEP_FLS = $(shell find $(DEP_DIR) -type f -name '*.d' 2>/dev/null)
# SITEMAP: extend the preprocessor command with the sitemap options.
# _SITEMAP is the full path of the sitemap inside OUT_DIR; it stays undefined
# when SITEMAP is empty, which disables the sitemap rule further below.
ifdef SITEMAP
_SITEMAP = $(addprefix $(OUT_DIR)/, $(SITEMAP))
# every value is quoted, because HTML_PP_CMD is expanded inside shell recipes
# and a url may contain characters that are special to the shell (&, ?, ...)
HTML_PP_CMD += --sitemap-temp-file "$(SITEMAP_TEMP_FILE)" --sitemap-base-url "$(WEBSITE_URL)" --sitemap-webroot-dir "$(OUT_DIR)"
endif
# strip the file extension from the sitemap entries
ifdef SITEMAP_REMOVE_EXT
HTML_PP_CMD += --sitemap-remove-ext
endif
# SASS, add load-paths # SASS, add load-paths
_SASS_CMD = $(SASS_CMD) $(foreach includedir, $(_SASS_INCLUDE_DIRS), --load-path=$(includedir)) --source-map-urls=absolute _SASS_CMD = $(SASS_CMD) $(foreach includedir, $(_SASS_INCLUDE_DIRS), --load-path=$(includedir)) --source-map-urls=absolute
@ -145,6 +182,7 @@ FMT_OUT_HTML ="\e[1;34mBuilding html\e[0m: \e[1;33m%s\e[0m at \e[1;35m%s\e[0m\n"
FMT_OUT_CSS ="\e[1;34mBuilding css\e[0m: \e[1;33m%s\e[0m at \e[1;35m%s\e[0m\n" FMT_OUT_CSS ="\e[1;34mBuilding css\e[0m: \e[1;33m%s\e[0m at \e[1;35m%s\e[0m\n"
FMT_OUT_THUMB ="\e[1;34mBuilding thumbnail\e[0m: \e[1;33m%s\e[0m at \e[1;35m%s\e[0m\n" FMT_OUT_THUMB ="\e[1;34mBuilding thumbnail\e[0m: \e[1;33m%s\e[0m at \e[1;35m%s\e[0m\n"
FMT_OUT_OTHER ="\e[1;34mBuilding\e[0m: \e[1;33m%s\e[0m at \e[1;35m%s\e[0m\n" FMT_OUT_OTHER ="\e[1;34mBuilding\e[0m: \e[1;33m%s\e[0m at \e[1;35m%s\e[0m\n"
FMT_OUT_SITEMAP ="\e[1;34mBuilding sitemap\e[0m: \e[1;35m%s\e[0m\n"
FMT_OUT_ML_HTML="\e[1;34mBuilding html\e[0m in lang \e[1;34m%s\e[0m: \e[1;33m%s\e[0m at \e[1;35m%s\e[0m\n" FMT_OUT_ML_HTML="\e[1;34mBuilding html\e[0m in lang \e[1;34m%s\e[0m: \e[1;33m%s\e[0m at \e[1;35m%s\e[0m\n"
FMT_OUT_ML_OTHER="\e[1;34mBuilding\e[0m in lang \e[1;34m%s\e[0m: \e[1;33m%s\e[0m at \e[1;35m%s\e[0m\n" FMT_OUT_ML_OTHER="\e[1;34mBuilding\e[0m in lang \e[1;34m%s\e[0m: \e[1;33m%s\e[0m at \e[1;35m%s\e[0m\n"
@ -159,7 +197,7 @@ FMT_OUT_ML_OTHER="\e[1;34mBuilding\e[0m in lang \e[1;34m%s\e[0m: \e[1;33m%s\e[0m
include $(_DEP_FLS) include $(_DEP_FLS)
all: normal multilang resources thumbnails all: normal multilang resources thumbnails
normal: $(OUT_FLS) normal: $(_SITEMAP) $(OUT_FLS)
multilang: $(ML_OUT_FLS) multilang: $(ML_OUT_FLS)
resources: $(RES_OUT_FLS) resources: $(RES_OUT_FLS)
thumbnails: $(THUMB_OUT_FLS) thumbnails: $(THUMB_OUT_FLS)
@ -217,11 +255,19 @@ $(OUT_DIR)/$(THUMB_OUT_DIR)/%.jpg: | $(THUMB_OUT_DIRS)
sources=($(_THUMB_FLS)); \ sources=($(_THUMB_FLS)); \
source=$$(printf "%s\n" $${sources[@]} | grep "$$target"'\.'); \ source=$$(printf "%s\n" $${sources[@]} | grep "$$target"'\.'); \
printf $(FMT_OUT_THUMB) "$$source" "$$fulltarget"; \ printf $(FMT_OUT_THUMB) "$$source" "$$fulltarget"; \
if [ "$${source##*.}" = "pdf" ]; then \ case "$${source##*.}" in \
pdftoppm -f 1 -singlefile -jpeg -r 50 "$$source" "$${fulltarget%.*}"; \ "mp4-") ffmpegthumbnailer -i "$$source" -o "$$fulltarget" -s 300 -q 5;; \
else \ "pdf") pdftoppm -f 1 -singlefile -jpeg -r 50 "$$source" "$${fulltarget%.*}";; \
magick "$$source" -thumbnail '100x100>' "$@"; \ "mp3"|"flac"|"wav") ffmpeg -hide_banner -i "$$source" "$$fulltarget" -y >/dev/null;; \
fi; \ "*") magick "$$source[0]" -thumbnail '$(THUMB_SIZE)x$(THUMB_SIZE)>' "$@";; \
esac
# SITEMAP
# _SITEMAP is only set when SITEMAP is non-empty, so this rule does not exist
# when sitemap generation is disabled.
ifdef _SITEMAP
# The sitemap depends on every generated page, so it is rebuilt whenever one of them changes.
# The preprocessor is called with --sitemap-generate and the sitemap path as output;
# HTML_PP_CMD already carries the --sitemap-temp-file option when SITEMAP is set.
$(_SITEMAP): $(OUT_FLS) $(ML_OUT_FLS) # build sitemap after all other files
@printf $(FMT_OUT_SITEMAP) "$@"
@$(HTML_PP_CMD) --sitemap-generate "$@"
endif
# #
@ -249,11 +295,13 @@ $(OUT_DIR)/%.css: $(PROJECT_DIR)/%.scss | $(OUT_DIRS) $(_DEP_DIRS)
jq -r '.sources | @sh' $@.map | tr -d \' | sed 's|file://||g' >> "$$depfile"; \ jq -r '.sources | @sh' $@.map | tr -d \' | sed 's|file://||g' >> "$$depfile"; \
rm $@.map rm $@.map
# this rule must be last!
$(OUT_DIR)/%: $(PROJECT_DIR)/% | $(OUT_DIRS) $(RES_OUT_DIRS) $(OUT_DIR)/%: $(PROJECT_DIR)/% | $(OUT_DIRS) $(RES_OUT_DIRS)
@printf $(FMT_OUT_OTHER) "$<" "$@" @printf $(FMT_OUT_OTHER) "$<" "$@"
@cp -r $< $@ @cp -r $< $@
# .DEFAULT: # .DEFAULT:
# @echo "MISSING RULE: $@" # @echo "MISSING RULE: $@"
@ -264,7 +312,7 @@ stop:
killall nginx killall nginx
clean: clean:
-@rm $(OUT_FLS) $(ML_OUT_FLS) 2>/dev/null -@rm $(OUT_FLS) $(ML_OUT_FLS) $(SITEMAP_TEMP_FILE) $(SITEMAP) 2>/dev/null
-@rm -r $(DEP_DIR) 2>/dev/null -@rm -r $(DEP_DIR) 2>/dev/null
cleaner: cleaner:

View File

@ -174,6 +174,7 @@ An entry is a html heading with a id: `<h1 id=myheading>This heading will be lin
`<!-- #sidenav sidenav-command arguments -->` `<!-- #sidenav sidenav-command arguments -->`
sidenav-command must be one of the following: sidenav-command must be one of the following:
#### `include` #### `include`
Include the generated sidenav at this position. This command will always be executed last, after the whole file has been parsed. Include the generated sidenav at this position. This command will always be executed last, after the whole file has been parsed.
@ -183,6 +184,7 @@ Ignored
**Return Value**: **Return Value**:
The generated sidenav The generated sidenav
#### `section` #### `section`
Group all following entries in named section. `section` may not appear in conditional blocks and multiline comments. Group all following entries in named section. `section` may not appear in conditional blocks and multiline comments.
@ -192,6 +194,7 @@ The name of the section
**Return Value** **Return Value**
Empty string Empty string
#### `name` #### `name`
Use a custom name instead of the heading itself for the link to the next heading. Use a custom name instead of the heading itself for the link to the next heading.
@ -201,6 +204,7 @@ The link-name of the next heading
**Return Value**: **Return Value**:
Empty string Empty string
#### `custom` #### `custom`
Include a custom link in the sidenav. Include a custom link in the sidenav.
@ -215,6 +219,64 @@ Empty string
--- ---
### sitemap
Used for automatically generating a `sitemap.xml` for the website.
#### `include`
Include the current page in the sitemap
**Synopsis**:
`<!-- #sitemap include -->`
`<!-- #sitemap include https://use.custom.link/for-this/site -->`
**Argument**:
Optional: Use a different link for this page
**Return Value**:
Empty string
#### `priority`
Set the `priority` field
**Synopsis**:
`<!-- #sitemap priority 0.8 -->`
**Argument**:
Float between 0.0 and 1.0
**Return Value**:
Empty string
#### `changefreq`
Set the `changefreq` field
**Synopsis**:
`<!-- #sitemap changefreq never -->`
**Argument**:
One of *always, hourly, daily, weekly, monthly, yearly, never*
**Return Value**:
Empty string
#### `lastmod`
Set the `lastmod` field
**Synopsis**:
`<!-- #sitemap lastmod 2023-12-29T14:00:05+01:00 -->`
**Argument**:
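The lastmod date in W3C date format (see https://www.w3.org/TR/NOTE-datetime), either a date (`YYYY-MM-DD`) or a full date-time with timezone offset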
**Return Value**:
Empty string
---
## Pitfalls ## Pitfalls
- The `#include` command must not be in the last line of the file - The `#include` command must not be in the last line of the file
- The `#include` command can not be in multi-line comment if the included file also contains comments - The `#include` command can not be in multi-line comment if the included file also contains comments

View File

@ -5,6 +5,7 @@ import re
from sys import argv from sys import argv
from collections.abc import Callable from collections.abc import Callable
import argparse import argparse
import pickle
""" """
TODO: TODO:
@ -27,6 +28,11 @@ sidenav_content_section = "<li class=\"sidenav_section\">#name</li>"
exit_on_include_failure = False exit_on_include_failure = False
sitemap_begin = """\
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n"""
sitemap_end = "</urlset>"
""" """
************************************************************ REGULAR EXPRESSIONS ************************************************************ ************************************************************ REGULAR EXPRESSIONS ************************************************************
""" """
@ -47,6 +53,10 @@ re_variable_use = r"#\$\(([a-zA-Z0-9_]+)\)"
""" only in comments """ """ only in comments """
re_preprocessor_command = r"#([a-zA-Z]+) *(.*) *" re_preprocessor_command = r"#([a-zA-Z]+) *(.*) *"
# https://www.w3.org/TR/NOTE-datetime
# Matches a complete date (YYYY-MM-DD), optionally followed by a time (Thh:mm:ss)
# and an optional time zone designator (Z or +hh:mm / -hh:mm).
# The pattern is meant to be used with re.fullmatch.
re_w3cdate = (
    r"\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])"  # year-month-day
    r"(T(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d"              # optional time of day
    r"(Z|[\+\-](?:0\d|1[0-4]):[0-5]\d)?)?"              # optional time zone designator
)
COMMENT_BEGIN = "<!--" COMMENT_BEGIN = "<!--"
COMMENT_END = "-->" COMMENT_END = "-->"
@ -67,6 +77,9 @@ error_levels = {
} }
exit_on_error_level = error_levels["serious"] exit_on_error_level = error_levels["serious"]
# url that the currently processed file will have on the website (used as key for its sitemap entry)
current_file_url = ""
""" """
************************************************************ UTILITY ************************************************************ ************************************************************ UTILITY ************************************************************
@ -122,6 +135,86 @@ def evaluate_condition(input_string) -> bool:
error(f"Pythonized condition is invalid: {condition}", level=error_levels["light"]) error(f"Pythonized condition is invalid: {condition}", level=error_levels["light"])
return False return False
"""
************************************************************ SITEMAP ************************************************************
"""
class Sitemap:
    """
    One entry (a `<url>` element) of the sitemap.

    All entries are collected in the class-level dict `Sitemap.urls`, keyed by
    the url of the file that is currently being processed (`current_file_url`).
    An entry only shows up in the generated sitemap once its url has been set,
    which is what the `include` command does.
    """
    # registry of all entries, shared by all instances
    urls:dict = {}
    # the values the sitemap protocol allows for <changefreq>
    _changefreqs = ("always", "hourly", "daily", "weekly", "monthly", "yearly", "never")

    def __init__(self, url=None):
        """Create an entry; every field except `url` starts out unset (None)."""
        self.url = url
        self.priority = None
        self.changefreq = None
        self.lastmod = None

    def set_url(self, url):
        """Set the link that is written to the `<loc>` field."""
        self.url = url

    def set_priority(self, priority):
        """
        Set the `<priority>` field.
        @param priority: anything float() accepts, must be between 0.0 and 1.0
        An invalid value is reported through error() and is NOT stored.
        """
        try:
            value = float(priority)
        except (TypeError, ValueError):
            error(f"Sitemap: invalid priority: '{priority}'", level=error_levels["serious"])
            return
        # also rejects nan, since every comparison with nan is False
        if not (0.0 <= value <= 1.0):
            error(f"Sitemap: invalid priority: '{value}'", level=error_levels["serious"])
            return
        self.priority = value

    def set_changefreq(self, changefreq):
        """
        Set the `<changefreq>` field.
        @param changefreq: one of always, hourly, daily, weekly, monthly, yearly, never
        An invalid value is reported through error() and is NOT stored.
        """
        if not (type(changefreq) == str and changefreq in Sitemap._changefreqs):
            error(f"Sitemap: invalid changefreq: '{changefreq}'", level=error_levels["serious"])
            return
        self.changefreq = changefreq

    def set_lastmod(self, lastmod):
        """
        Set the `<lastmod>` field.
        @param lastmod: date string that has to match `re_w3cdate` completely
        An invalid value is reported through error() and is NOT stored.
        """
        if not (type(lastmod) == str and re.fullmatch(re_w3cdate, lastmod)):
            error(f"Sitemap: invalid lastmod: '{lastmod}'", level=error_levels["serious"])
            return
        self.lastmod = lastmod

    def get_entry(self):
        """
        @returns the `<url>...</url>` element for this entry, without trailing newline.
        Fields that were never set are left out.
        """
        # function-scope import, so the block does not depend on a file-level import
        from xml.sax.saxutils import escape
        # the url may contain characters like & which must be entity-escaped in xml
        lines = ["<url>", f"\t<loc>{escape(str(self.url))}</loc>"]
        for tag in ("priority", "changefreq", "lastmod"):
            value = getattr(self, tag)
            if value is not None:
                lines.append(f"\t<{tag}>{value}</{tag}>")
        lines.append("</url>")
        return "\n".join(lines)

    def __repr__(self) -> str:
        return f"Sitemap(url={self.url}, priority={self.priority}, changefreq={self.changefreq}, lastmod={self.lastmod})"

    @staticmethod
    def gen_sidemap():
        """
        @returns the complete sitemap document built from all entries in `Sitemap.urls`.
        Entries without url (the page never used `#sitemap include`) are skipped.
        """
        entries = []
        for entry in Sitemap.urls.values():
            if entry.url is None:
                # priority/changefreq/lastmod were given, but the page was never included
                continue
            # indent the whole entry by one tab
            entries.append("\t" + entry.get_entry().replace("\n", "\n\t").strip("\t") + "\n")
        return sitemap_begin + "".join(entries) + sitemap_end

    @staticmethod
    def cmd_sitemap(args:str, variables:dict[str,str]) -> str:
        """
        Handler for the `#sitemap` preprocessor command.
        @param args: "<sitemap-command> [argument]", sitemap-command is one of include, priority, changefreq, lastmod
        @param variables: not used by this command
        @returns empty string
        """
        # the first word is the sitemap-command, everything after it is its argument
        cmd, _, cmd_args = args.partition(" ")
        cmd_args = cmd_args.strip(" ")
        pdebug(f"cmd_sitemap: cmd='{cmd}' cmd_args='{cmd_args}'")
        # all commands work on the entry of the file that is currently processed
        if not current_file_url in Sitemap.urls:
            Sitemap.urls[current_file_url] = Sitemap()
        entry = Sitemap.urls[current_file_url]
        if cmd == "include":
            # an explicit link overrides the url derived from the output path
            entry.set_url(cmd_args if cmd_args else current_file_url)
        elif cmd == "priority":
            entry.set_priority(cmd_args)
        elif cmd == "changefreq":
            entry.set_changefreq(cmd_args)
        elif cmd == "lastmod":
            entry.set_lastmod(cmd_args)
        else:
            error(f"cmd_sitemap: Invalid command '{cmd}'", error_levels["serious"])
        ptrace(f"Sitemap[{current_file_url}] is now: {entry}")
        return ""
""" """
@ -182,9 +275,9 @@ class Sidenav:
space = len(args) space = len(args)
cmd = args[:space] cmd = args[:space]
cmd_args = "" cmd_args = ""
pdebug(f"cmd_sidenav: cmd='{cmd}' cmd_args='{cmd_args}'")
if 0 < space and space < len(args) - 1: if 0 < space and space < len(args) - 1:
cmd_args = args[space+1:].strip(" ") cmd_args = args[space+1:].strip(" ")
pdebug(f"cmd_sidenav: cmd='{cmd}' cmd_args='{cmd_args}'")
if cmd == "skip": if cmd == "skip":
Sidenav.skipNext() Sidenav.skipNext()
elif cmd == "section": elif cmd == "section":
@ -354,6 +447,7 @@ command2function:dict[str, Callable[[str, dict[str,str]], str]] = {
"comment": cmd_comment, "comment": cmd_comment,
"uncomment": cmd_uncomment, "uncomment": cmd_uncomment,
"sidenav": Sidenav.cmd_sidenav, "sidenav": Sidenav.cmd_sidenav,
"sitemap": Sitemap.cmd_sitemap,
"warning": cmd_warning, "warning": cmd_warning,
"error": cmd_error, "error": cmd_error,
} }
@ -620,15 +714,20 @@ def substitute_variables(html:str, variables:dict[str, str]):
""" """
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(prog="bUwUma html preprocessor") parser = argparse.ArgumentParser(prog="bUwUma html preprocessor")
parser.add_argument("--input", action="store", help="path to the input file", required=True) parser.add_argument("--input", action="store", help="path to the input file", default="")
parser.add_argument("--output", action="store", help="output to this file", default="") parser.add_argument("--output", action="store", help="output to this file", default="")
parser.add_argument("--inplace", action="store_true", help="overwrite input file") parser.add_argument("--inplace", action="store_true", help="overwrite input file")
parser.add_argument("--preserve-comments", action="store_true", help="do not remove normal html comments", default=False)
parser.add_argument("--var", action="append", help="set a variable --var varname=value", default=[]) parser.add_argument("--var", action="append", help="set a variable --var varname=value", default=[])
parser.add_argument("--output-deps", action="store", help="output a Makefile listing all dependencies", default="") parser.add_argument("--output-deps", action="store", help="output a Makefile listing all dependencies", default="")
parser.add_argument("--sitemap-generate", action="store", help="generate the sitemap from the sitemap-temp-file", default="")
parser.add_argument("--sitemap-temp-file", action="store", help="file for storing sitemap data during build process", default="/tmp/sitemap.pkl")
parser.add_argument("--sitemap-webroot-dir", action="store", help="directory of the webroot, without trailing /. This will be removed from the output path for generating the sitemap url entry", default="")
parser.add_argument("--sitemap-base-url", action="store", help="base url of the website, without trailing /", default="https://www.example.com")
parser.add_argument("--sitemap-remove-ext", action="store_true", help="remove the file extenstion in the sitemap entry")
parser.add_argument("--exit-on", action="store", help="exit when an error of the given severity occures", choices=["light", "serious", "critical"], default="serious") parser.add_argument("--exit-on", action="store", help="exit when an error of the given severity occures", choices=["light", "serious", "critical"], default="serious")
parser.add_argument("--debug", action="store_true", help="be more verbose", default=False) parser.add_argument("--debug", action="store_true", help="be more verbose", default=False)
parser.add_argument("--trace", action="store_true", help="be extremly verbose", default=False) parser.add_argument("--trace", action="store_true", help="be extremly verbose", default=False)
parser.add_argument("--preserve-comments", action="store_true", help="do not remove normal html comments", default=False)
variables:dict[str, str] = {} variables:dict[str, str] = {}
args = parser.parse_args() args = parser.parse_args()
@ -643,47 +742,81 @@ if __name__ == "__main__":
args.input = args.input.strip(" ") args.input = args.input.strip(" ")
args.output = args.output.strip(" ") args.output = args.output.strip(" ")
args.output_deps = args.output_deps.strip(" ") args.output_deps = args.output_deps.strip(" ")
args.sitemap_temp_file = args.sitemap_temp_file.strip(" ")
args.sitemap_generate = args.sitemap_generate.strip(" ")
TRACE = args.trace TRACE = args.trace
if args.trace: args.debug = True if args.trace: args.debug = True
DEBUG = args.debug DEBUG = args.debug
# sanity checks # either input file or sitemap_generate is required
if not path.isfile(args.input): if not (bool(args.input) ^ bool(args.sitemap_generate)):
parser.error(f"Invalid input file:: {args.input}") parser.error(f"Exactly one if --input or --sitemap-generate must be given")
if args.output:
if not path.isdir(path.dirname(args.output)):
parser.error(f"Invalid path to output file - directory does not exist: '{path.dirname(args.output)}'")
elif args.inplace:
args.output = args.input
if args.inplace and args.output:
parser.error(f"Only one of --output or --inplace mut be given")
if args.output_deps:
if not path.isdir(path.dirname(args.output_deps)):
parser.error(f"Invalid path to dependency file - directory does not exist: '{path.dirname(args.output_deps)}'")
if not args.output:
parser.error(f"--output-deps requires either --output <file> our --inplace")
# get html if args.input:
with open(args.input, "r") as file: if args.sitemap_webroot_dir:
target_html = file.read() current_file_url = args.sitemap_base_url + args.output.replace(args.sitemap_webroot_dir, "")
else:
current_file_url = args.sitemap_base_url + args.output
output_html = parse_file(target_html, variables, not args.preserve_comments) if args.sitemap_remove_ext:
# remove empty lines current_file_url = os.path.splitext(current_file_url)[0]
output_html = re.sub(r"[\t\r ]*\n(?:[\t\r ]*\n[\t\r ]*)+", r"\n", output_html)
# pdebug(f"Output: {output_html}") pdebug(f"current_file={current_file_url}")
# save # sanity checks
if args.output: if not path.isfile(args.input):
with open(args.output, "w") as file: parser.error(f"Invalid input file:: {args.input}")
file.write(output_html) if args.output:
else: if not path.isdir(path.dirname(args.output)):
print(output_html) parser.error(f"Invalid path to output file - directory does not exist: '{path.dirname(args.output)}'")
elif args.inplace:
args.output = args.input
if args.inplace and args.output:
parser.error(f"Only one of --output or --inplace mut be given")
if args.output_deps:
if not path.isdir(path.dirname(args.output_deps)):
parser.error(f"Invalid path to dependency file - directory does not exist: '{path.dirname(args.output_deps)}'")
if not args.output:
parser.error(f"--output-deps requires either --output <file> our --inplace")
if args.output_deps: if args.sitemap_temp_file:
if args.output != args.input: if path.isfile(args.sitemap_temp_file):
glob_dependcies.append(args.input) with open(args.sitemap_temp_file, "rb") as file:
depfile = generate_dependecy_file(args.output, glob_dependcies) Sitemap.urls = pickle.load(file)
pdebug(f"Writing dependency file to {os.path.abspath(args.output_deps)}: {depfile}")
with open(args.output_deps, "w") as file: # get html
file.write(depfile) with open(args.input, "r") as file:
target_html = file.read()
output_html = parse_file(target_html, variables, not args.preserve_comments)
# remove empty lines
output_html = re.sub(r"[\t\r ]*\n(?:[\t\r ]*\n[\t\r ]*)+", r"\n", output_html)
# pdebug(f"Output: {output_html}")
# save
if args.output:
with open(args.output, "w") as file:
file.write(output_html)
else:
print(output_html)
if args.output_deps:
if args.output != args.input:
glob_dependcies.append(args.input)
depfile = generate_dependecy_file(args.output, glob_dependcies)
pdebug(f"Writing dependency file to {os.path.abspath(args.output_deps)}: {depfile}")
with open(args.output_deps, "w") as file:
file.write(depfile)
if args.sitemap_temp_file:
with open(args.sitemap_temp_file, "wb") as file:
pickle.dump(Sitemap.urls, file)
else: # sitemap_generate
if not path.isfile(args.sitemap_temp_file):
parser.error(f"Invalid sitemap-temp-file: '{args.sitemap_temp_file}'")
with open(args.sitemap_temp_file, "rb") as file:
Sitemap.urls = pickle.load(file)
sitemap = Sitemap.gen_sidemap()
pdebug(f"Writing sitemap to {os.path.abspath(args.sitemap_generate)}")
with open(args.sitemap_generate, "w") as file:
file.write(sitemap)