changed structure, added cfg

2023-05-13 00:28:07 +02:00 · 2023-05-13 00:28:07 +02:00 · cf1294882b
commit cf1294882b
parent 25a06cde64
21 changed files with 1494 additions and 817 deletions
--- a/README.md
+++ b/README.md
@ -43,6 +43,13 @@ sudo chmod +x /usr/share/zsh/site-functions/_regina
 ```
 # Changelog
 ## 2.0
 - Refactored database code
 - New database format: 
    - Removed filegroups table
    - Put referrer, browser and platform in their own tables to reduce the size of the database
 - 
 ## 1.0
 - Initial release
--- a/regina/init.py
+++ b/regina/init.py
@ -1,4 +1,5 @@
 """Gather analytics from nginx access logs and visualize them through generated images and a generated html"""
 # __package__ = 'regina'
-from regina.db_operation import database, visualize, collect
+from regina.data_collection import parse_log
 from regina import database
--- a/regina/data_collection/parse_log.py
+++ b/regina/data_collection/parse_log.py
@ -1,7 +1,5 @@
 import sqlite3 as sql
 from re import fullmatch, match
-from regina.db_operation.database import t_request, t_visitor, t_file, t_filegroup, t_ip_range, database_tables, get_filegroup, ip_range_id
+from regina.data_collection.request import Request
 from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max
 from regina.utility.utility import pdebug, warning, pmessage
 """
@ -18,12 +16,12 @@ re_http_referer = r'"([^"]*)"'
 re_http_visitor_agent = r'"([^"]*)"'
 re_log_format: str = f'({re_remote_addr}) - ({re_remote_visitor}) ({re_time_local}) ({re_request}) ({re_status}) ({re_body_bytes_sent}) {re_http_referer} {re_http_visitor_agent}'
-def parse_log(logfile:str) -> list[Request]:
+def parse_log(logfile_path:str) -> list[Request]:
    """
    create Request objects from each line in the logfile
    """
    requests = []
-    with open(logfile, "r") as file:
+    with open(logfile_path, "r") as file:
        lines = file.readlines()
    for line in lines:
        m = match(re_log_format, line)
@ -37,7 +35,7 @@ def parse_log(logfile:str) -> list[Request]:
            warning(f"parse_log: len('{m.groups()[3]}'.split(' ')) is {len(request_)} and not 3")
            continue
        requests.append(Request(ip_address=g[0], time_local=g[2],
-                                request_type=request_[0], request_file=request_[1], request_protocol=request_[2],
+                                request_type=request_[0], request_route=request_[1], request_protocol=request_[2],
-                                status=g[4], bytes_sent=g[5], referer=g[6], visitor_agent=g[7]))
+                                status=g[4], bytes_sent=g[5], referer=g[6], user_agent=g[7]))
    return requests
--- a/regina/data_collection/request.py
+++ b/regina/data_collection/request.py
@ -3,14 +3,14 @@ from time import mktime
 from re import fullmatch, match
 from datetime import datetime as dt
-from .utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max
+from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max
-from .utility.utility import pdebug, warning, pmessage
+from regina.utility.utility import pdebug, warning, pmessage
-from .utility.globals import visitor_agent_operating_systems, visitor_agent_browsers, settings
+from regina.utility.globals import visitor_agent_operating_systems, visitor_agent_browsers, settings
 # three-letter month names as written in nginx timestamps like [20/Nov/2022:00:47:36 +0100]
 # NOTE(review): presumably the list position is used to get the month number - confirm against the time parsing code
 months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
 class Request:
-    def __init__(self, ip_address="", time_local="", request_type="", request_file="", request_protocol="", status="", bytes_sent="", referer="", visitor_agent=""):
+    def __init__(self, ip_address="", time_local="", request_type="", request_route="", request_protocol="", status="", bytes_sent="", referer="", user_agent=""):
        self.ip_address = int(IPv4Address(sanitize(ip_address)))
        self.time_local = 0
        # turn [20/Nov/2022:00:47:36 +0100] to unix time
@ -29,21 +29,21 @@ class Request:
        else:
            warning(f"Request:__init__: Could not match time: '{time_local}'")
        self.request_type = sanitize(request_type)
-        self.request_route = sanitize(request_file)
+        self.request_route = sanitize(request_route)
        self.request_protocol = sanitize(request_protocol)
        self.status = sanitize(status)
        self.bytes_sent = sanitize(bytes_sent)
        self.referer = sanitize(referer)
-        self.visitor_agent = sanitize(visitor_agent)
+        self.user_agent = sanitize(user_agent)
    def __repr__(self):
-        return f"{self.ip_address} - {self.time_local} - {self.request_route} - {self.visitor_agent} - {self.status}"
+        return f"{self.ip_address} - {self.time_local} - {self.request_route} - {self.user_agent} - {self.status}"
    def get_platform(self):
        # for groups in findall(re_visitor_agent, visitor_agent):
        operating_system = ""
        for os in visitor_agent_operating_systems:
-            if os in self.visitor_agent:
+            if os in self.user_agent:
                operating_system = os
                break
        return operating_system
@ -51,12 +51,12 @@ class Request:
    def get_browser(self):
        browser = ""
        for br in visitor_agent_browsers:
-            if br in self.visitor_agent:
+            if br in self.user_agent:
                browser = br
                break
        return browser
    def get_mobile(self):
-        return "Mobi" in self.visitor_agent
+        return "Mobi" in self.user_agent
--- a/regina/data_visualization/init.py
+++ b/regina/data_visualization/init.py
@ -0,0 +1 @@
 """Visualization utility for regina"""
--- a/regina/data_visualization/ranking.py
+++ b/regina/data_visualization/ranking.py
@ -0,0 +1,151 @@
 from re import fullmatch
 from regina.database import Database
 from regina.utility.globals import settings
 from regina.utility.utility import pdebug, warning, missing_arg, is_blacklisted, is_whitelisted
 from regina.data_visualization.utility import is_valid_status, cleanup_referer
 def get_route_ranking(db: Database, date_condition:str) -> list[tuple[int, str]]:
    """
    Rank the routes by the number of requests matching date_condition.

    Routes are skipped if their name is blacklisted or not whitelisted
    (settings "route_ranking_blacklist" / "route_ranking_whitelist").
    If the setting "route_ranking_ignore_404" is set, routes are also skipped when none of
    their requests has a status accepted by is_valid_status. This check looks at all
    requests of the route, not only at the ones matching date_condition.

    :param db: database to query
    :param date_condition: sql condition on the request time, inserted verbatim into the WHERE clause
    :returns [(request_count, route name)], sorted in ascending order
    """
    ranking = []
    for (route_id, name) in db(f"SELECT route_id, name FROM route"):
        if     is_blacklisted(name, settings["route_ranking_blacklist"]): continue
        if not is_whitelisted(name, settings["route_ranking_whitelist"]): continue
        if settings["route_ranking_ignore_404"]:  # use only successful routes
            success = False
            # every row is a tuple, even for a single column: unpack it to get the plain status value
            for (status,) in db(f"SELECT status FROM request WHERE route_id = {route_id}"):
                if is_valid_status(status):
                    pdebug(f"get_route_ranking: success code {status} for route with route_id {route_id} and name {name}")
                    success = True
                    break
            if not success:
                pdebug(f"get_route_ranking: route with route_id {route_id} and name {name} has only requests resulting in error")
                continue
        db.execute(f"SELECT COUNT(*) FROM request WHERE route_id = {route_id} AND {date_condition}")
        ranking.append((db.fetchone()[0], name))
    ranking.sort()
    return ranking
 def get_ranking(db: Database, table: str, field_name: str, date_condition:str, whitelist_regex: str|list[str]|None=None, blacklist_regex: str|list[str]|None=None) -> list[tuple[int, str]]:
    """
    Rank the distinct values of a column by how often they occur.

    1) get all the distinct entries for field_name in table that match date_condition
    2) skip the entry if is_blacklisted(entry, blacklist_regex)
    3) skip the entry if not is_whitelisted(entry, whitelist_regex)
    4) for every remaining entry, count the rows in table with that entry that match date_condition
    5) sort by count in ascending order

    table, field_name and date_condition are inserted verbatim into the sql statements.
    @returns [(count, name)]
    """
    ranking = []
    # every row is a tuple, even for a single column: unpack it so that the plain value
    # (and not the tuple) is passed to the filters and inserted into the COUNT query
    for (name,) in db(f"SELECT DISTINCT {field_name} FROM {table} WHERE {date_condition}"):
        if     is_blacklisted(name, blacklist_regex): continue
        if not is_whitelisted(name, whitelist_regex): continue
        db.execute(f"SELECT COUNT(*) FROM {table} WHERE {field_name} = '{name}' AND {date_condition}")
        ranking.append((db.fetchone()[0], name))
    ranking.sort()
    return ranking
 def cleanup_referer_ranking(referer_ranking: list[tuple[int, str]]):
    """
    Merge ranking entries whose referers are equal after cleanup_referer.

    The list is modified in place: afterwards it holds one (summed count, cleaned referer)
    tuple per cleaned referer, sorted in ascending order. Nothing is returned.
    """
    merged_counts = {}
    for hits, raw_referer in referer_ranking:
        cleaned = cleanup_referer(raw_referer)
        merged_counts[cleaned] = merged_counts.get(cleaned, 0) + hits
    # slice assignment keeps the list object of the caller
    referer_ranking[:] = [(hits, cleaned) for cleaned, hits in merged_counts.items()]
    referer_ranking.sort()
 def get_city_and_country_ranking(db: Database, require_humans=True):
    """
    Count the visitors per city and per country.

    Every row of the joined visitor / ip_range / city / country tables counts once.
    Cities are filtered with the settings "city_ranking_blacklist" / "city_ranking_whitelist",
    countries with "country_ranking_blacklist" / "country_ranking_whitelist".
    Both filters are independent of each other: a visitor from a filtered city
    still counts for the country.

    :param require_humans: if True, only visitors with is_human = 1 are counted
    @returns [(count, "city (CO)")], [(count, country)], both sorted in ascending order
    """
    cities_dict = {}   # city name -> [count, country code]
    country_dict = {}  # country name -> count
    sql_cmd = f"SELECT ci.name, co.code, co.name FROM country AS co, city as ci, visitor as v, ip_range as i WHERE v.ip_range_id = i.ip_range_id AND i.city_id = ci.city_id AND ci.country_id = co.country_id"
    if require_humans: sql_cmd += " AND v.is_human = 1"
    for (city, country_code, country) in db(sql_cmd):
        # no `continue` here: skipping a filtered city must not skip the country tally below
        if city in cities_dict:
            cities_dict[city][0] += 1
        elif not is_blacklisted(city, settings["city_ranking_blacklist"]) and is_whitelisted(city, settings["city_ranking_whitelist"]):
            cities_dict[city] = [1, country_code]
        if country in country_dict:
            country_dict[country] += 1
        elif not is_blacklisted(country, settings["country_ranking_blacklist"]) and is_whitelisted(country, settings["country_ranking_whitelist"]):
            country_dict[country] = 1
    city_ranking = [(v[0], f"{city} ({v[1]})") for city,v in cities_dict.items()]
    city_ranking.sort()
    country_ranking = [(count, country) for country,count in country_dict.items()]
    country_ranking.sort()
    return city_ranking, country_ranking
 def get_platform_browser_mobile_rankings(db: Database, visitor_ids: list[int]) -> tuple[list[tuple[int, str]], list[tuple[int, str]], float]:
    """
    Get the platform and browser shares and the mobile share of the given visitors.

    Visitors without platform_id are ignored for the platform ranking, visitors without
    browser_id for the browser ranking and visitors with neither for the mobile percentage.
    The names for the ids are looked up with db.get_name.

    returns [(percentage, operating_system)], [(percentage, browser)], mobile_visitor_percentage
            both lists are sorted in ascending order, all percentages are in the range 0-100
    """
    platform_hits = {}
    platform_total = 0.0
    browser_hits = {}
    browser_total = 0.0
    mobile_hits = { True: 0.0, False: 0.0 }
    for visitor_id in visitor_ids:
        rows = db(f"SELECT platform_id, browser_id, is_mobile FROM visitor WHERE visitor_id = {visitor_id}")
        platform_id, browser_id, mobile_flag = rows[0]
        if platform_id:
            platform_hits[platform_id] = platform_hits.get(platform_id, 0) + 1
            platform_total += 1
        if browser_id:
            browser_hits[browser_id] = browser_hits.get(browser_id, 0) + 1
            browser_total += 1
        if platform_id or browser_id:
            mobile_hits[bool(mobile_flag)] += 1

    classified_total = mobile_hits[True] + mobile_hits[False]
    # no visitor with a known platform or browser: report 0 instead of dividing by zero
    mobile_share = mobile_hits[True] / classified_total if classified_total else 0.0

    platform_shares = []
    for p_id, hits in platform_hits.items():
        platform_shares.append((hits * 100/platform_total, db.get_name("platform", p_id)))
    platform_shares.sort()
    browser_shares = []
    for b_id, hits in browser_hits.items():
        browser_shares.append((hits * 100/browser_total, db.get_name("browser", b_id)))
    browser_shares.sort()
    return platform_shares, browser_shares, mobile_share*100
 # Store ranking in results class and dump with pickle
 # class Results:
 #     def __init__(self, timespan_name,
 #                  r_routes:	    list[tuple[int, str]],
 #                  r_referrers:	list[tuple[int, str]],
 #                  r_platforms:	list[tuple[int, str]],
 #                  r_browsers:	list[tuple[int, str]],
 #                  r_cities:	    list[tuple[int, str]],
 #                  r_countries:	list[tuple[int, str]],
 #                  ):
 #         self.r_routes   = r_routes
 #         self.r_referrers= r_referrers
 #         self.r_platforms= r_platforms
 #         self.r_browsers = r_browsers
 #         self.r_cities   = r_cities
 #         self.r_countries= r_countries
--- a/regina/data_visualization/utility.py
+++ b/regina/data_visualization/utility.py
@ -0,0 +1,110 @@
 from re import fullmatch
 from regina.database import Database
 from regina.utility.globals import settings
 from regina.utility.utility import pdebug, warning, missing_arg
 # optional scheme, captured together with the "://"
 # re_uri_protocol = f"(https?)://"
 re_uri_protocol = f"(https?://)?"
 # four runs of 1-3 digits, each optionally followed by a dot, and an optional ":port" (nothing is captured)
 re_uri_ipv4 = r"(?:(?:(?:\d{1,3}\.?){4})(?::\d+)?)"
 # re_uri_ipv6 = ""
 # "name.tld" with a tld of at least two letters; the repeated capture group matches
 # dot-terminated text in front of it (cleanup_referer uses it as the subdomain part)
 re_uri_domain = r"(?:([^/]+\.)*[^/]+\.[a-zA-Z]{2,})"
 # optional "/..." part, the text after the slash is captured
 re_uri_route = r"(?:/(.*))?"
 # capture groups of the full pattern:
 #   1: protocol, 2: host (domain or ipv4), 3: subdomain part, 4: route with leading slash, 5: route without it
 re_uri_full = f"{re_uri_protocol}({re_uri_domain}|{re_uri_ipv4})({re_uri_route})"
 # (https?://)?((?:([^/]+\.)*[^/]+\.[a-zA-Z]{2,})|(?:(?:(?:\d{1,3}\.?){4})(?::\d+)?))((?:/(.*))?)
 def cleanup_referer(referer: str) -> str:
    """
    split the referer uri into its parts and reassemble them depending on settings

    The parts are put together from these settings:
    - "referer_ranking_ignore_tld": drop the tld if the domain has the form "name.tld"
    - "referer_ranking_ignore_subdomain": drop the subdomains
    - "referer_ranking_ignore_protocol": drop the protocol
    - "referer_ranking_ignore_route": drop the route

    :returns the reassembled referer, or the unchanged referer (after a warning) if it does not match re_uri_full
    """
    m = fullmatch(re_uri_full, referer)
    if not m:
        warning(f"cleanup_referer: Could not match referer '{referer}'")
        return referer
    # pdebug(f"cleanup_referer: {referer} - {m.groups()}")
    protocol, host, subdomains, route = m.groups()[:4]
    # protocol and subdomains are optional in the pattern and None if they are missing,
    # which would make the string concatenations below fail
    if not protocol: protocol = ""
    if not subdomains: subdomains = ""
    # only strip the subdomains at the front, not equal text further back in the host
    domain = host.removeprefix(subdomains)

    referer = domain
    if settings["referer_ranking_ignore_tld"]:
        if len(domain.split(".")) == 2:  # if domain.tld
            referer = domain.split(".")[0]
    if not settings["referer_ranking_ignore_subdomain"]: referer = subdomains + referer
    if not settings["referer_ranking_ignore_protocol"]: referer = protocol + referer
    if not settings["referer_ranking_ignore_route"]: referer += route
    # pdebug(f"cleanup_referer: cleaned up: {referer}")
    return referer
 def get_where_date_str(at_date=None, min_date=None, max_date=None):
    """
    get a condition string that sets a condition on the time
    """
    # dates in unix time
    s = ""
    if at_date is not None:
        if isinstance(at_date, str):
            s += f"DATE(time, 'unixepoch') = '{sanitize(at_date)}' AND "
        elif isinstance(at_date, int|float):
            s += f"time = {int(at_date)} AND "
        else:
            print(f"WARNING: get_where_date_str: Invalid type of argument at_date: {type(at_date)}")
    if min_date is not None:
        if isinstance(min_date, str):
            s += f"DATE(time, 'unixepoch') >= '{sanitize(min_date)}' AND "
        elif isinstance(min_date, int|float):
            s += f"time >= {int(min_date)} AND "
        else:
            print(f"WARNING: get_where_date_str: Invalid type of argument min_date: {type(min_date)}")
    if max_date is not None:
        if isinstance(max_date, str):
            s += f"DATE(time, 'unixepoch') <= '{sanitize(max_date)}' AND "
        elif isinstance(max_date, int|float):
            s += f"time <= {int(max_date)} AND "
        else:
            print(f"WARNING: get_where_date_str: Invalid type of argument max_date: {type(max_date)}")
    if s == "":
        print(f"WARNING: get_where_date_str: no date_str generated. Returning 'time > 0'. at_date={at_date}, min_date={min_date}, max_date={max_date}")
        return "time > 0"
    return s.removesuffix(" AND ")
 def is_valid_status(status: int):
    """
    Check whether a http status code counts as success.

    Codes from 400 upwards never do and codes below 300 always do.
    Codes from 300 to 399 only do if the setting "status_300_is_success" is set.
    """
    if status >= 400:
        return False
    redirect_is_success = settings["status_300_is_success"]
    if redirect_is_success and status >= 300:
        return True
    return status < 300
 #
 # GETTERS
 #
 def get_unique_visitor_ids_for_date(db: Database, date:str) -> list[int]:
    """
    get the distinct visitor_ids of all requests matching the sql condition `date`
    (the condition is inserted verbatim into the WHERE clause)
    """
    visitor_ids = []
    for row in db(f"SELECT DISTINCT visitor_id FROM request WHERE {date}"):
        visitor_ids.append(row[0])
    return visitor_ids
 def append_human_visitors(db: Database, unique_visitor_ids, unique_visitor_ids_human: list):
    """
    Append every visitor_id from unique_visitor_ids whose is_human column in the
    visitor table is 1 to unique_visitor_ids_human (one query per visitor_id).
    """
    for candidate_id in unique_visitor_ids:
        db.execute(f"SELECT is_human FROM visitor WHERE visitor_id = {candidate_id}")
        is_human = db.fetchone()[0]
        if is_human != 1:
            continue
        unique_visitor_ids_human.append(candidate_id)
 def get_unique_request_ids_for_date(db: Database, date_constraint:str):
    """
    get the distinct request_ids of all requests matching date_constraint
    (the constraint is inserted verbatim into the WHERE clause)
    """
    request_ids = []
    for row in db(f"SELECT DISTINCT request_id FROM request WHERE {date_constraint}"):
        request_ids.append(row[0])
    return request_ids
 def append_unique_request_ids_for_date_and_visitor(db: Database, date_constraint:str, visitor_id: int, unique_request_ids_human: list):
    """
    Extend unique_request_ids_human with the distinct request_ids of all requests
    that match date_constraint and belong to visitor_id.
    """
    matching_rows = db(f"SELECT DISTINCT request_id FROM request WHERE {date_constraint} AND visitor_id = {visitor_id}")
    unique_request_ids_human.extend(row[0] for row in matching_rows)
 # get number of requests per day
 def get_request_count_for_date(db: Database, date_constraint:str) -> int:
    """
    count the rows in the request table that match date_constraint
    (the constraint is inserted verbatim into the WHERE clause)
    """
    count_query = f"SELECT COUNT(*) FROM request WHERE {date_constraint}"
    db.execute(count_query)
    count_row = db.fetchone()
    return count_row[0]
 def get_unique_visitor_count(db: Database) -> int:
    """
    :returns the result of sql_tablesize for the "visitor" table (presumably its number of rows)
    """
    # imported locally since sql_tablesize is not among the imports at the top of this module
    from regina.utility.sql_util import sql_tablesize
    return sql_tablesize(db.cur, "visitor")
--- a/regina/data_visualization/visualize.py
+++ b/regina/data_visualization/visualize.py
@ -0,0 +1,365 @@
 # from sys import path
 # print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}")
 import sqlite3 as sql
 from sys import exit
 from re import fullmatch
 import matplotlib.pyplot as plt
 from os.path import isdir
 from datetime import datetime as dt
 from numpy import empty
 # local
 from regina.database import Database
 from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where
 from regina.utility.utility import pdebug, warning, missing_arg
 from regina.utility.globals import settings
 from regina.data_visualization.utility import cleanup_referer, get_where_date_str, get_unique_visitor_ids_for_date, get_unique_request_ids_for_date, append_human_visitors, append_unique_request_ids_for_date_and_visitor
 from regina.data_visualization.ranking import get_city_and_country_ranking, get_platform_browser_mobile_rankings, get_ranking, cleanup_referer_ranking, get_route_ranking
 """
 visualize information from the database
 """
 # named hex colors used for the plots
 palette = {
    "red": "#ee4035",
    "orange": "#f37736",
    "yellow": "#fdf458",
    "green": "#7bc043",
    "blue": "#0392cf",
    "purple": "#b044a0",
 }
 # color -> file extensions; plot_ranking looks up the text after the last "." of a name in these lists
 color_settings_filetypes = {
    palette["red"]: ["html", "php"],
    palette["green"]: ["jpg", "png", "jpeg", "gif", "svg", "webp"],
    palette["yellow"]: ["css"],
    "grey": ["txt"]
 }
 # plain list of all palette colors; plot_ranking cycles through a list of colors bar by bar
 color_settings_alternate = list(palette.values())
 # color -> browser names
 # NOTE(review): a name is only found by plot_ranking if it equals the text after the last "." of the bar name - confirm at the call site
 color_settings_browsers = {
    palette["red"]: ["Safari"],
    palette["orange"]: ["Firefox"],
    palette["yellow"]: ["Chrome"],
    "grey": ["Edge"],
    palette["green"]: ["Chromium"],
    palette["purple"]: ["Brave"]
 }
 # color -> platform names (same lookup as for color_settings_browsers)
 color_settings_platforms = {
    palette["red"]: ["Mac"],
    palette["green"]: ["Android"],
    "grey": ["iPhone", "iPad"],
    palette["yellow"]: ["Linux"],
    palette["purple"]: ["BSD"],
    palette["blue"]: ["Windows"],
 }
 def len_list_list(l: list[list]):
    """
    get the total number of elements in all the inner lists
    """
    return sum(len(inner) for inner in l)
 #
 # PLOTTING
 #
 def add_vertikal_labels_in_bar_plot(labels, max_y_val, ax, bar_plot):
    """
    Write the label of each bar vertically at the horizontal center of the bar.
    The text starts slightly above the top of the bar, or near the bottom
    (inside the bar) if the bar is higher than 60% of max_y_val.
    """
    # pdebug("add_vertikal_labels_in_bar_plot:", labels)
    large_bar_limit = 0.6 * max_y_val
    padding = 0.025 * max_y_val
    for idx,rect in enumerate(bar_plot):
        text_base = rect.get_height()
        if text_base > large_bar_limit:  # if the bar is large, put label in the bar
            text_base = 0.05 * max_y_val
        x_center = rect.get_x() + rect.get_width()/2.
        ax.text(x_center, text_base + padding, labels[idx], ha='center', va='bottom', rotation=90)
 def add_labels_at_top_of_bar(xdata, ydata, max_y_val, ax, bar_plot):
    """
    Write the y value of each bar (rounded to one decimal) in a white box
    at the horizontal center of the bar, 5% of max_y_val below the y value.
    xdata is not used.
    """
    # pdebug("add_labels_at_top_of_bar:", xdata, ydata)
    drop = 0.05 * max_y_val
    box_style = dict(facecolor='white', alpha=0.8)
    for idx,rect in enumerate(bar_plot):
        value = ydata[idx]
        x_center = rect.get_x() + rect.get_width()/2
        # a fresh dict per call, like a dict(...) expression in the call itself
        ax.text(x_center, value - drop, round(value, 1), ha='center', bbox=dict(box_style))
 def plot_ranking(ranking: list[tuple[int, str]], fig=None, xlabel="", ylabel="", color_settings:dict|list=[], figsize=None):
    """
    make a bar plot of the ranking

    Only the last settings["file_ranking_plot_max_files"] entries of the ranking are plotted.
    The last entry is used as the maximum y value for placing the labels, so the
    ranking is expected to be sorted in ascending order.

    :param ranking: [(count, name)]
    :param fig: figure to add the plot to, a new one is created if not given
    :param color_settings: either a dict {color: [names]}, where a bar gets the color in whose list
                           the text after the last "." of its name is found,
                           or a list of colors that is cycled through bar by bar.
                           Bars without a match (or with an empty list) are blue.
    :param figsize: size of the figure if a new one is created
    :returns the figure
    """
    # pdebug(f"plot_ranking: ranking={ranking}")
    if not fig:
        fig = plt.figure(figsize=figsize, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
    ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel)
    # fill x y data
    start_index = max(0, len(ranking) - settings["file_ranking_plot_max_files"])
    x_names = []
    y_counts = []
    colors = []
    for i, entry in enumerate(ranking[start_index:]):
        x_names.append(entry[1])
        y_counts.append(entry[0])
        ft = entry[1].split(".")[-1]
        color = palette["blue"]
        if isinstance(color_settings, dict):
            for key, val in color_settings.items():
                if ft in val: color = key
        # an empty list (the default) must not be indexed: modulo by its length would divide by zero
        elif isinstance(color_settings, list) and color_settings:
            color = color_settings[i % len(color_settings)]
        colors.append(color)
    bar = ax.bar(x_names, y_counts, tick_label="", color=colors)

    if len(y_counts) > 0:
        add_vertikal_labels_in_bar_plot(x_names, y_counts[-1], ax, bar)
        if settings["plot_add_count_label"]: add_labels_at_top_of_bar(x_names, y_counts, y_counts[-1], ax, bar)
    # ax.ylabel(y_counts)
    return fig
 # def plot(xdata, ydata, fig=None, ax=None, xlabel="", ylabel="", label="", linestyle='-', marker="", color="blue", rotate_xlabel=0):
 #     if not fig:
 #         fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
 #     if not ax:
 #         ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel)
 #     else:
 #         ax = ax.twinx()
 #         ax.set_ylabel(ylabel)
 #         # ax.tick_params(axis="y", labelcolor="r")
 #     ax.plot(xdata, ydata, marker=marker, label=label, linestyle=linestyle, color=color)
 #     plt.xticks(rotation=rotate_xlabel)
 #     if label: ax.legend()
 #     return fig, ax
 def plot2y(xdata, ydata1, ydata2, fig=None, ax1=None, ax2=None, plots=None, xlabel="", ylabel1="", ylabel2="", label1="", label2="", linestyle='-', marker="", color1="blue", color2="orange", grid="major", rotate_xlabel=0, figsize=None):
    """
    Plot two data sets over the same x data, each with its own y axis.

    To add more lines to an existing plot, pass the fig, ax1, ax2 and plots
    returned by a previous call. If ax1 or ax2 is missing, a new pair of axes
    is created and labeled with xlabel, ylabel1 and ylabel2.

    :param plots: lines from a previous call, they are included in the legend
                  (a non-empty list is extended in place)
    :param grid: "major", "minor" or "both" to draw that grid on ax1, anything else for no grid
    :param rotate_xlabel: currently not used, the x tick labels are always rotated by 90 degrees
    :param figsize: size of the figure if a new one is created
    :returns fig, ax1, ax2, plots
    """
    if not fig:
        fig = plt.figure(figsize=figsize, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
    if not (ax1 and ax2):
        ax1 = fig.add_subplot(xlabel=xlabel, ylabel=ylabel1)
        ax2 = ax1.twinx()
        ax2.set_ylabel(ylabel2)
    ax1.tick_params(axis="x", rotation=90)
    plot1 = ax1.plot(xdata, ydata1, marker=marker, label=label1, linestyle=linestyle, color=color1)
    plot2 = ax2.plot(xdata, ydata2, marker=marker, label=label2, linestyle=linestyle, color=color2)
    # ax1.set_xticks(ax1.get_xticks())
    # ax1.set_xticklabels(xdata, rotation=rotate_xlabel, rotation_mode="anchor")
    # if label1 or label2: ax1.legend()
    if plots: plots += plot1 + plot2
    else: plots = plot1 + plot2
    plt.legend(plots, [ l.get_label() for l in plots])

    if grid in ("major", "minor", "both"):
        # minor ticks are only needed if the minor grid is drawn
        # (`grid == "minor" or "both"` was always true)
        if grid in ("minor", "both"):
            ax1.minorticks_on()
        ax1.grid(visible=True, which=grid, linestyle="-", color="#888")
    return fig, ax1, ax2, plots
 #
 # MAIN
 #
 def visualize(db: Database):
    """
    This assumes sanity checks have been done
    """
    pdebug("visualizing...")
    if not settings["db"]:          missing_arg("db")
    if not settings["server_name"]: missing_arg("server_name")
    img_dir = settings["img_dir"]
    pdebug("img_dir:", img_dir)
    img_filetype = settings["img_filetype"]
    if isdir(img_dir) and img_filetype:
        gen_img = True
    else:
        print(f"Warning: Not generating images since at least one required variable is invalid: img_dir='{img_dir}', img_filetype='{img_filetype}'")
        gen_img = False
    img_location = settings["img_location"]
    names = {
        # paths
        "img_route_ranking_last_x_days": f"ranking_routes_last_x_days.{img_filetype}",
        "img_referer_ranking_last_x_days": f"ranking_referers_last_x_days.{img_filetype}",
        "img_countries_last_x_days": f"ranking_countries_last_x_days.{img_filetype}",
        "img_cities_last_x_days": f"ranking_cities_last_x_days.{img_filetype}",
        "img_browser_ranking_last_x_days": f"ranking_browsers_last_x_days.{img_filetype}",
        "img_platform_ranking_last_x_days": f"ranking_platforms_last_x_days.{img_filetype}",
        "img_visitors_and_requests_last_x_days": f"visitor_request_count_daily_last_x_days.{img_filetype}",
        "img_route_ranking_total": f"ranking_routes_total.{img_filetype}",
        "img_referer_ranking_total": f"ranking_referers_total.{img_filetype}",
        "img_countries_total": f"ranking_countries_total.{img_filetype}",
        "img_cities_total": f"ranking_cities_total.{img_filetype}",
        "img_browser_ranking_total": f"ranking_browsers_total.{img_filetype}",
        "img_platform_ranking_total": f"ranking_platforms_total.{img_filetype}",
        "img_visitors_and_requests_total": f"visitor_request_count_daily_total.{img_filetype}",
        # values
        "mobile_visitor_percentage_total": 0.0,
        "mobile_visitor_percentage_last_x_days": 0.0,
        "visitor_count_last_x_days": 0,
        "visitor_count_total": 0,
        "request_count_last_x_days": 0,
        "request_count_total": 0,
        "human_visitor_percentage_last_x_days": 0.0,
        "human_visitor_percentage_total": 0.0,
        "human_request_percentage_last_x_days": 0.0,
        "human_request_percentage_total": 0.0,
        # general
        "regina_version": settings["version"],
        "server_name": settings["server_name"],
        "last_x_days": settings["last_x_days"],  # must be after all the things with last_x_days!
        "earliest_date": "1990-1-1",
        "generation_date": "1990-1-1 0:0:0",
    }
    db = Database(database_path=settings["db"])
    get_humans = settings["get_human_percentage"]
    # pdebug(f"visualize: settings {settings}")
    # DATE STRINGS
    earliest_date = db.get_earliest_date()
    names["earliest_date"] = dt.fromtimestamp(earliest_date).strftime("%Y-%m-%d")
    names["generation_date"] = dt.now().strftime("%Y-%m-%d %H:%M:%S")
    # LAST_X_DAYS
    # last_x_days_min_date: latest_date - last_x_days
    secs_per_day = 86400
    last_x_days_min_date = db.get_latest_date() - settings["last_x_days"] * secs_per_day
    last_x_days_constraint = get_where_date_str(min_date=last_x_days_min_date)
    last_x_days = db.get_days_where(last_x_days_constraint)
    last_x_days_contraints = [get_where_date_str(at_date=day) for day in last_x_days]
    # ALL DATES
    all_time_constraint = get_where_date_str(min_date=0)
    # all months in yyyy-mm format
    months_all_time = db.get_months_where(all_time_constraint)
    # sqlite constrict to month string
    months_strs = []
    for year_month in months_all_time:
        year, month = year_month.split("-")
        # first day of the month
        min_date  = dt(int(year), int(month), 1).timestamp()
        month = (int(month) % 12) + 1  # + 1 month
        year = int(year)
        if month == 1: year += 1
        # first day of the next month - 1 sec
        max_date = dt(year, month, 1).timestamp() - 1
        months_strs.append(get_where_date_str(min_date=min_date, max_date=max_date))
    for i in range(2):
        suffix = ["_total", "_last_x_days"][i]
        date_constraint = [all_time_constraint, last_x_days_constraint][i]
        date_names = [months_all_time, last_x_days][i]
        date_constraints = [months_strs, last_x_days_contraints][i]
        assert(len(date_names) == len(date_constraints))
        # FILES
        # TODO handle groups
        file_ranking = get_route_ranking(db, date_constraint)
        if gen_img:
            fig_file_ranking = plot_ranking(file_ranking, xlabel="Route Name", ylabel="Number of requests", color_settings=color_settings_filetypes, figsize=settings["plot_size_broad"])
            fig_file_ranking.savefig(f"{img_dir}/{names[f'img_route_ranking{suffix}']}", bbox_inches="tight")
        # REFERER
        referer_ranking = get_ranking(db, "request", "referer", date_constraint, settings["referer_ranking_whitelist"], settings["referer_ranking_whitelist"])
        pdebug("Referer ranking", referer_ranking)
        cleanup_referer_ranking(referer_ranking)
        if gen_img:
            fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate, figsize=settings["plot_size_broad"])
            fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_referer_ranking{suffix}']}", bbox_inches="tight")
        # GEOIP
        if settings["do_geoip_rankings"]:
            city_ranking, country_ranking = get_city_and_country_ranking(db, require_humans=settings["geoip_only_humans"])
            pdebug("Country ranking:", country_ranking)
            pdebug("City ranking:", city_ranking)
            if gen_img:
                fig_referer_ranking = plot_ranking(country_ranking, xlabel="Country", ylabel="Number of visitors", color_settings=color_settings_alternate, figsize=settings["plot_size_broad"])
                fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_countries{suffix}']}", bbox_inches="tight")
                fig_referer_ranking = plot_ranking(city_ranking, xlabel="City", ylabel="Number of visitors", color_settings=color_settings_alternate, figsize=settings["plot_size_broad"])
                fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_cities{suffix}']}", bbox_inches="tight")
        # USER
        # visitor_agent_ranking = get_visitor_agent_ranking(cur, date_str)
        # for the time span
        unique_visitor_ids = get_unique_visitor_ids_for_date(db, date_constraint)
        unique_visitor_ids_human = []
        append_human_visitors(db, unique_visitor_ids, unique_visitor_ids_human)
        # for each date
        date_count = len(date_constraints)
        unique_visitor_ids_dates: list[list[int]] = []
        unique_request_ids_dates: list[list[int]] = []
        unique_visitor_ids_human_dates: list[list[int]] = [[] for _ in range(date_count)]
        unique_request_ids_human_dates: list[list[int]] = [[] for _ in range(date_count)]
        for i in range(date_count):
            date_constraint_ = date_constraints[i]
            unique_visitor_ids_dates.append(get_unique_visitor_ids_for_date(db, date_constraint_))
            unique_request_ids_dates.append(get_unique_request_ids_for_date(db, date_constraint_))
            if get_humans:
                # empty_list = []
                # unique_visitor_ids_human_dates.append(empty_list)
                append_human_visitors(db, unique_visitor_ids_dates[i], unique_visitor_ids_human_dates[i])
                # unique_request_ids_human_dates.append(list())
                for human in unique_visitor_ids_human_dates[i]:
                    append_unique_request_ids_for_date_and_visitor(db, date_constraint_, human, unique_request_ids_human_dates[i])
        # print("\n\tuu", unique_visitor_ids_dates, "\n\tur",unique_request_ids_dates, "\n\tuuh", unique_visitor_ids_human_dates, "\n\turh", unique_request_ids_human_dates)
        # pdebug("uui",   unique_visitor_ids)
        # pdebug("uuih",  unique_visitor_ids_human)
        # pdebug("uuid",  unique_visitor_ids_dates)
        # pdebug("uuidh", unique_visitor_ids_human_dates)
        # pdebug("urid",  unique_request_ids_dates)
        # pdebug("uridh", unique_visitor_ids_human_dates)
        # pdebug(f"human_visitor_precentage: len_list_list(visitor_ids)={len_list_list(unique_visitor_ids_dates)}, len_list_list(visitor_ids_human)={len_list_list(unique_visitor_ids_human_dates)}")
        if get_humans:
            try:
                names[f"human_visitor_percentage{suffix}"] = round(100 * len_list_list(unique_visitor_ids_human_dates) / len_list_list(unique_visitor_ids_dates), 2)
            except:
                names[f"human_visitor_percentage{suffix}"] = -1.0
            try:
                names[f"human_request_percentage{suffix}"] = round(100 * len_list_list(unique_request_ids_human_dates) / len_list_list(unique_request_ids_dates), 2)
            except:
                names[f"human_request_percentage{suffix}"] = -1.0
        names[f"visitor_count{suffix}"] = len_list_list(unique_visitor_ids_dates)
        names[f"request_count{suffix}"] = len_list_list(unique_request_ids_dates)
        if gen_img:
            fig_daily, ax1, ax2, plots = plot2y(date_names, [len(visitor_ids) for visitor_ids in unique_visitor_ids_dates], [len(request_ids) for request_ids in unique_request_ids_dates], xlabel="Date", ylabel1="Visitor count", label1="Unique visitors", ylabel2="Request count", label2="Unique requests", color1=palette["red"], color2=palette["blue"], rotate_xlabel=-45, figsize=settings["plot_size_broad"])
            if get_humans:
                fig_daily, ax1, ax2, plots = plot2y(date_names, [len(visitor_ids) for visitor_ids in unique_visitor_ids_human_dates], [len(request_ids) for request_ids in unique_request_ids_human_dates], label1="Unique visitors (human)", label2="Unique requests (human)", color1=palette["orange"], color2=palette["green"], fig=fig_daily, ax1=ax1, ax2=ax2, plots=plots, rotate_xlabel=-45, figsize=settings["plot_size_broad"])
            fig_daily.savefig(f"{img_dir}/{names[f'img_visitors_and_requests{suffix}']}", bbox_inches="tight")
        # os & browser
        platform_ranking, browser_ranking, names[f"mobile_visitor_percentage{suffix}"] = get_platform_browser_mobile_rankings(db, unique_visitor_ids_human)
        if gen_img:
            fig_os_rating = plot_ranking(platform_ranking, xlabel="Platform", ylabel="Share [%]", color_settings=color_settings_platforms, figsize=settings["plot_size_narrow"])
            fig_os_rating.savefig(f"{img_dir}/{names[f'img_platform_ranking{suffix}']}", bbox_inches="tight")
            fig_browser_rating = plot_ranking(browser_ranking, xlabel="Browser", ylabel="Share [%]", color_settings=color_settings_browsers, figsize=settings["plot_size_narrow"])
            fig_browser_rating.savefig(f"{img_dir}/{names[f'img_browser_ranking{suffix}']}", bbox_inches="tight")
    # print("OS ranking", os_ranking)
    # print("Browser ranking", browser_ranking)
    # print("Mobile percentage", names["mobile_visitor_percentage"])
    if settings["template_html"] and settings["html_out_path"]:
        pdebug(f"visualize: writing to html: {settings['html_out_path']}")
        with open(settings["template_html"], "r") as file:
            html = file.read()
        for name, value in names.items():
            if "img" in name:
                value = f"{img_location}/{value}"
            if type(value) == float:
                value = f"{value:.2f}"
            html = html.replace(f"%{name}", str(value))
        with open(settings["html_out_path"], "w") as file:
            file.write(html)
    else:
        warning(f"Skipping html generation because either template_html or html_out_path is invalid: template_html='{settings['template_html']}', html_out_path='{settings['html_out_path']}'")
--- a/regina/db_operation/database.py
+++ b/regina/db_operation/database.py
@ -12,15 +12,14 @@ if __name__ == "__main__":  # make relative imports work as described here: http
        import sys
        from os import path
        filepath = path.realpath(path.abspath(__file__))
-        print(path.dirname(path.dirname(path.dirname(filepath))))
+        sys.path.insert(0, path.dirname(path.dirname(filepath)))
        sys.path.insert(0, path.dirname(path.dirname(path.dirname(filepath))))
 # local
-from .utility.sql_util import replace_null, sanitize, sql_select, sql_exists
+from regina.utility.sql_util import replace_null, sanitize, sql_select, sql_exists
-from .utility.utility import pdebug, get_filepath, warning, pmessage
+from regina.utility.utility import pdebug, get_filepath, warning, pmessage, is_blacklisted, is_whitelisted
-from .utility.globals import settings
+from regina.utility.globals import settings
-from .db_operation.request import Request
+from regina.data_collection.request import Request
-from .utility.globals import visitor_agent_operating_systems, visitor_agent_browsers, settings
+from regina.utility.globals import visitor_agent_operating_systems, visitor_agent_browsers, settings
 """
 create reginas database as shown in the uml diagram database.uxf
@ -36,13 +35,17 @@ class Database:
            pdebug(f"Database.__init__: Creating database at {database_path}")
            with open(pkg_resources.resource_filename("regina", "sql/create_db.sql"), "r") as file:
                create_db = file.read()
-            self.cur.execute(create_db)
+            self.cur.executescript(create_db)
            self.conn.commit()
    def __call__(self, s):
        """
        Execute a sql command and return all result rows.
        @param s sql command string, passed to the cursor as is (no parameter binding)
        @returns the result of fetchall() on the cursor after executing s
        """
        self.cur.execute(s)
        return self.cur.fetchall()
    def execute(self, s):
        """
        Execute a sql command on the cursor without fetching any rows.
        @param s sql command string, passed to the cursor as is (no parameter binding)
        """
        self.cur.execute(s)
    def fetchone(self):
        """Return the result of fetchone() on the cursor, i.e. the next row of the last executed command"""
        return self.cur.fetchone()
    #
    # VISITOR
@ -160,9 +163,10 @@ class Database:
    def add_requests(self, requests: list[Request]):
        added_requests = 0
        # check the new visitors later
        request_blacklist = settings["request_location_regex_blacklist"]
        new_visitors = []
        for i in range(len(requests)):
            if     is_blacklisted(requests[i].request_route, settings["request_route_blacklist"]): continue
            if not is_whitelisted(requests[i].request_route, settings["request_route_whitelist"]): continue
            visitor = self.add_request(requests[i])
            if visitor:
                new_visitors.append(visitor)
@ -267,12 +271,15 @@ class Database:
        assert(type(city_id_val) == int)
        return city_id_val
    def update_geoip_tables(self, geoip_city_csv_path: str):
        """
        update the geoip data with the contents of the geoip_city_csv file
        Make sure to update the visitor.ip_range_id column for all visitors.
-        In case something changed, they might point to a different city. (won't fix)
+        In case something changed, they might point to a different city.
        TODO: update teh visitor.ip_range_id column to match (potentially) new city ip range
        """
        # indices for the csv
        FROM = 0; TO = 1; CODE = 2; COUNTRY = 3; REGION = 4; CITY = 5
@ -331,5 +338,43 @@ class Database:
            if combine_range_country_id >= 0:  # last range , append
                add_range(combine_range_low, combine_range_high, f"City in {combine_range_country_name}", f"Region in {combine_range_country_name}", combine_range_country_id)
    #
    # REQUEST
    #
    # TIME/DATE
    def get_earliest_date(self) -> int:
        """
        Return the smallest value of request.time as unixepoch.
        If the query result is not an int (e.g. None), 0 is returned instead.
        """
        earliest = self(f"SELECT MIN(time) FROM request")[0][0]
        return earliest if isinstance(earliest, int) else 0
    def get_latest_date(self) -> int:
        """
        Return the largest value of request.time as unixepoch.
        If the query result is not an int (e.g. None), 0 is returned instead.
        """
        latest = self(f"SELECT MAX(time) FROM request")[0][0]
        return latest if isinstance(latest, int) else 0
    def get_months_where(self, date_constraint:str) -> list[str]:
        """
        Get all months that have at least one request, in yyyy-mm format.
        The months are derived from get_days_where and keep the order in which they first appear there.
        @param date_constraint sqlite constraint selecting the requests to consider
        """
        # dict keys drop duplicates while keeping the insertion order
        months = dict.fromkeys(day[0:day.rfind('-')] for day in self.get_days_where(date_constraint))
        return list(months)
    def get_days_where(self, date_constraint:str) -> list[str]:
        """
        Get all days that have at least one request, in yyyy-mm-dd format and sorted in ascending order.
        @param date_constraint sqlite constraint selecting the requests to consider
        """
        rows = self(f"SELECT DISTINCT DATE(time, 'unixepoch') FROM request WHERE {date_constraint}")
        # every row is a tuple (date, )
        return sorted(row[0] for row in rows)
 if __name__ == '__main__':
    db = Database("test.db")
--- a/regina/db_operation/init.py
+++ b/regina/db_operation/init.py
@ -1,6 +0,0 @@
 """Gather analytics from nginx access logs and visualize them through generated images and a generated html"""
 # __package__ = 'regina'
 import regina.utility
 from importlib import resources
 # ip2nation_db_path = resources.path("regina", "ip2nation.db")
--- a/regina/db_operation/visualize.py
+++ b/regina/db_operation/visualize.py
@ -1,666 +0,0 @@
 # from sys import path
 # print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}")
 import sqlite3 as sql
 from sys import exit
 from re import fullmatch
 import matplotlib.pyplot as plt
 from os.path import isdir
 from datetime import datetime as dt
 from numpy import empty
 # local
 from regina.db_operation.database import Database, t_request, t_visitor, t_file, t_filegroup, t_ip_range, t_city, t_country
 from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where
 from regina.utility.utility import pdebug, warning, missing_arg
 from regina.utility.globals import settings
 """
 visualize information from the databse
 """
 # named base colors (hex rgb) used by all plots
 palette = {
    "red": "#ee4035",
    "orange": "#f37736",
    "yellow": "#fdf458",
    "green": "#7bc043",
    "blue": "#0392cf",
    "purple": "#b044a0",
 }
 # The dict color settings below map a color to the list of names that are drawn in that color.
 # plot_ranking compares the part of an entry's name after the last "." with these lists,
 # entries that are in none of the lists are drawn in palette["blue"].
 # file extension -> bar color
 color_settings_filetypes = {
    palette["red"]: ["html"],
    palette["green"]: ["jpg", "png", "jpeg", "gif", "svg", "webp"],
    palette["yellow"]: ["css"],
    "grey": ["txt"]
 }
 # list color setting: plot_ranking cycles through these colors bar by bar
 color_settings_alternate = list(palette.values())
 # browser name -> bar color
 color_settings_browsers = {
    palette["red"]: ["Safari"],
    palette["orange"]: ["Firefox"],
    palette["yellow"]: ["Chrome"],
    "grey": ["Edge"],
    palette["green"]: ["Chromium"],
    palette["purple"]: ["Brave"]
 }
 # operating system name -> bar color (presumably for the platform ranking plot, its use is not visible here)
 color_settings_operating_systems = {
    palette["red"]: ["Mac"],
    palette["green"]: ["Android"],
    "grey": ["iPhone", "iPad"],
    palette["yellow"]: ["Linux"],
    palette["purple"]: ["BSD"],
    palette["blue"]: ["Windows"],
 }
 def len_list_list(l: list[list]):
    """
    Return the total number of elements in all inner lists of l.

    :param l: list of lists (any iterable of sized containers works)
    :returns sum of the lengths of the inner lists, 0 if l is empty
    """
    return sum(len(inner) for inner in l)
 def valid_status(status: int):
    """
    Check whether a http status code counts as a successful request.

    Codes >= 400 never count, codes < 300 always count.
    Codes from 300 to 399 only count if settings["status_300_is_success"] is set.
    """
    if status >= 400:
        return False
    redirect_is_success = bool(settings["status_300_is_success"])
    return status < 300 or redirect_is_success
 #
 # FILTERS
 #
 def get_os_browser_mobile_rankings(db: Database, visitor_ids: list[int]):
    """
    Calculate the platform and browser shares and the share of mobile visitors.

    Visitors without platform are left out of the platform ranking and visitors without browser
    are left out of the browser ranking. The mobile percentage only considers visitors that have
    a platform or a browser.
    :returns [(share in percent, operating_system)], [(share in percent, browser)], mobile_visitor_percentage
             both rankings are sorted in ascending order
    """
    platform_counts = {}
    browser_counts = {}
    identified_visitors = 0.0
    mobile_visitors = 0.0
    for visitor_id in visitor_ids:
        platform, browser, mobile = db(f"SELECT platform,browser,mobile FROM {t_visitor} WHERE visitor_id = {visitor_id}")[0]
        if platform:
            platform_counts[platform] = platform_counts.get(platform, 0) + 1
        if browser:
            browser_counts[browser] = browser_counts.get(browser, 0) + 1
        if platform or browser:
            identified_visitors += 1
            if mobile:
                mobile_visitors += 1
    try:
        mobile_visitor_percentage = mobile_visitors / identified_visitors
    except ZeroDivisionError:  # no visitor has a platform or browser
        mobile_visitor_percentage = 0.0
    # the totals are only used as divisor if the dict is not empty, so they are never 0 then
    platform_total = float(sum(platform_counts.values()))
    browser_total = float(sum(browser_counts.values()))
    platform_ranking = sorted((count * 100/platform_total, name) for name, count in platform_counts.items())
    browser_ranking = sorted((count * 100/browser_total, name) for name, count in browser_counts.items())
    return platform_ranking, browser_ranking, mobile_visitor_percentage*100
 #
 # GETTERS
 #
 def get_where_date_str(at_date=None, min_date=None, max_date=None):
    """
    Build a sqlite constraint on the date column, usable after WHERE.

    Every argument may be a str (compared with DATE(date, 'unixepoch'), so a yyyy-mm-dd string is expected),
    or an int/float in unix time (compared with the date column directly, floats are truncated to int).
    Arguments that are None are ignored, arguments of any other type are ignored with a warning.
    :param at_date: date must be equal to this
    :param min_date: date must be greater or equal to this
    :param max_date: date must be less or equal to this
    :returns all constraints joined with ' AND ', or 'date > 0' if no constraint could be generated
    """
    conditions = []
    for arg_name, operator, value in (("at_date", "=", at_date), ("min_date", ">=", min_date), ("max_date", "<=", max_date)):
        if value is None:
            continue
        if isinstance(value, str):
            conditions.append(f"DATE(date, 'unixepoch') {operator} '{sanitize(value)}'")
        elif isinstance(value, (int, float)):
            conditions.append(f"date {operator} {int(value)}")
        else:
            print(f"WARNING: get_where_date_str: Invalid type of argument {arg_name}: {type(value)}")
    if not conditions:
        print(f"WARNING: get_where_date_str: no date_str generated. Returning 'date > 0'. at_date={at_date}, min_date={min_date}, max_date={max_date}")
        return "date > 0"
    return " AND ".join(conditions)
 # get the earliest date
 def get_earliest_date(db: Database) -> int:
    """
    Return the smallest date in the request table as unixepoch.
    If the query result is not an int (e.g. None), 0 is returned instead.
    """
    earliest = db(f"SELECT MIN(date) FROM {t_request}")[0][0]
    return earliest if isinstance(earliest, int) else 0
 # get the latest date
 def get_latest_date(db: Database) -> int:
    """
    Return the largest date in the request table as unixepoch.
    If the query result is not an int (e.g. None), 0 is returned instead.
    """
    latest = db(f"SELECT MAX(date) FROM {t_request}")[0][0]
    return latest if isinstance(latest, int) else 0
 # get all dates
 # the date:str parameter in all these function must be a sqlite constraint
 def get_days(db: Database, date:str) -> list[str]:
    """
    Get all days that have at least one request, in yyyy-mm-dd format and sorted in ascending order.
    :param date: sqlite constraint selecting the requests to consider
    """
    rows = db(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}")
    # every row is a tuple (date, )
    return sorted(row[0] for row in rows)
 def get_months(db: Database, date:str) -> list[str]:
    """
    Get all months that have at least one request, in yyyy-mm format.
    The months are derived from get_days and keep the order in which they first appear there.
    :param date: sqlite constraint selecting the requests to consider
    """
    # dict keys drop duplicates while keeping the insertion order
    months = dict.fromkeys(day[0:day.rfind('-')] for day in get_days(db, date))
    return list(months)
 def get_visitor_agent(db: Database, visitor_id: int):
    """
    Return the third column of the first visitor row with the given visitor_id.
    NOTE(review): presumably the third column holds the visitor agent string - verify against the table schema.
    Raises an IndexError if there is no such visitor.
    """
    matching_rows = sql_select(db.cur, t_visitor, [("visitor_id", visitor_id)])
    return matching_rows[0][2]
 def get_unique_visitor_ids_for_date(db: Database, date:str) -> list[int]:
    """
    Get the ids of all visitors that made at least one request matching the constraint.
    :param date: sqlite constraint selecting the requests to consider
    """
    rows = db(f"SELECT DISTINCT visitor_id FROM {t_request} WHERE {date}")
    # every row is a tuple (visitor_id, )
    return [row[0] for row in rows]
 def get_human_visitors(db: Database, unique_visitor_ids, unique_visitor_ids_human: list):
    """
    Append the ids of all human visitors to unique_visitor_ids_human.

    A visitor counts as human if its is_human column in the visitor table is not 0.
    Ids without a row in the visitor table are skipped.
    :param db: callable that executes a sql command and returns all rows
    :param unique_visitor_ids: visitor ids to check
    :param unique_visitor_ids_human: output list, the human visitor ids are appended in place
    """
    for visitor_id in unique_visitor_ids:
        rows = db(f"SELECT is_human FROM {t_visitor} WHERE visitor_id = {visitor_id}")
        if not rows:  # unknown visitor id
            continue
        if rows[0][0] == 0:  # not a human
            continue
        unique_visitor_ids_human.append(visitor_id)
 def get_unique_request_ids_for_date(db: Database, date:str):
    """
    Get the ids of all requests matching the constraint.
    :param db: callable that executes a sql command and returns all rows
    :param date: sqlite constraint selecting the requests
    :returns list of request ids
    """
    rows = db(f"SELECT DISTINCT request_id FROM {t_request} WHERE {date}")
    # every row is a tuple (request_id, )
    return [row[0] for row in rows]
 def get_unique_request_ids_for_date_and_visitor(db: Database, date:str, visitor_id: int, unique_request_ids_human: list):
    """
    Append the ids of all requests of one visitor that match the constraint to unique_request_ids_human.
    :param db: callable that executes a sql command and returns all rows
    :param date: sqlite constraint selecting the requests
    :param visitor_id: only requests of this visitor are considered
    :param unique_request_ids_human: output list, the request ids are appended in place
    """
    rows = db(f"SELECT DISTINCT request_id FROM {t_request} WHERE {date} AND visitor_id = {visitor_id}")
    # every row is a tuple (request_id, )
    unique_request_ids_human.extend(row[0] for row in rows)
 # get number of requests per day
 def get_request_count_for_date(db: Database, date:str) -> int:
    """
    Return the number of requests matching the constraint.
    :param db: callable that executes a sql command and returns all rows
    :param date: sqlite constraint selecting the requests
    """
    # COUNT(*) yields exactly one row with one column
    return db(f"SELECT COUNT(*) FROM {t_request} WHERE {date}")[0][0]
 def get_unique_visitor_count(db: Database) -> int:
    """
    Return the size of the visitor table as reported by sql_tablesize.
    :param db: database whose cursor (db.cur) is used for the query
    """
    return sql_tablesize(db.cur, t_visitor)
 #
 # RANKINGS
 #
 def get_file_ranking(db: Database, date:str) -> list[tuple[int, str]]:
    """
    Rank all filegroups by the number of requests matching the date constraint.

    If settings["file_ranking_regex_whitelist"] is set, groups whose name does not fully match it are skipped.
    If settings["file_ranking_ignore_error_files"] is set, groups without any request with a valid status
    (see valid_status) are skipped. This check looks at all requests of the group, not only at the ones matching date.
    :param db: callable that executes a sql command and returns all rows
    :param date: sqlite constraint selecting the requests that are counted
    :returns [(request_count, groupname)] sorted in ascending order
    """
    whitelist = settings["file_ranking_regex_whitelist"]
    ignore_error_files = settings["file_ranking_ignore_error_files"]
    ranking = []
    for group_id, groupname in db(f"SELECT group_id, groupname FROM {t_filegroup}"):
        if whitelist and not fullmatch(whitelist, groupname):
            pdebug(f"get_file_ranking: file with group_id {group_id} is not in whitelist")
            continue
        if ignore_error_files:
            success = False
            for (status,) in db(f"SELECT status FROM {t_request} WHERE group_id = {group_id}"):
                if valid_status(status):
                    pdebug(f"get_file_ranking: success code {status} for file with group_id {group_id} and groupname {groupname}")
                    success = True
                    break
            if not success:
                pdebug(f"get_file_ranking: file with group_id {group_id} and groupname {groupname} has only requests resulting in error")
                continue
        request_count = db(f"SELECT COUNT(*) FROM {t_request} WHERE group_id = {group_id} AND {date}")[0][0]
        ranking.append((request_count, groupname))
    ranking.sort()
    return ranking
 def get_visitor_agent_ranking(db: Database, date:str) -> list[tuple[int, str]]:
    """
    Rank the visitor agents by the number of requests matching the date constraint.

    There is one entry per visitor, so the same visitor agent may appear more than once.
    If settings["visitor_agent_ranking_regex_whitelist"] is set, agents that do not fully match it are skipped.
    :param db: database, used as callable for sql commands and via db.cur for sql_select
    :param date: sqlite constraint selecting the requests that are counted
    :returns [(request_count, visitor_agent)] sorted in ascending order
    """
    whitelist = settings["visitor_agent_ranking_regex_whitelist"]
    ranking = []
    for (visitor_id,) in db(f"SELECT DISTINCT visitor_id FROM {t_request} WHERE {date}"):
        visitor_rows = sql_select(db.cur, t_visitor, [("visitor_id", visitor_id)])
        if len(visitor_rows) == 0: continue  # request of an unknown visitor
        # NOTE(review): presumably the third column holds the visitor agent string - verify against the table schema
        visitor_agent = visitor_rows[0][2]
        if whitelist and not fullmatch(whitelist, visitor_agent):
            continue
        request_count = db(f"SELECT COUNT(*) FROM {t_request} WHERE visitor_id = {visitor_id} AND {date}")[0][0]
        ranking.append((request_count, visitor_agent))
    ranking.sort()
    return ranking
 def get_request_ranking(field_name: str, table: str, whitelist_regex: str, db: Database, date_condition:str) -> list[tuple[int, str]]:
    """
    Rank the distinct values of a column by the number of rows they appear in.

    The counts are gathered with a single GROUP BY query. The values are never inserted into
    a sql command, so values containing quotes can not break the query.
    Rows where the column is NULL are not ranked.
    :param field_name: column whose values are ranked
    :param table: table that contains the column
    :param whitelist_regex: if not empty, only values that fully match this regex are ranked
    :param db: callable that executes a sql command and returns all rows
    :param date_condition: sqlite constraint selecting the rows that are counted
    :returns [(request_count, name)] sorted in ascending order
    """
    ranking = []
    for name, count in db(f"SELECT {field_name}, COUNT(*) FROM {table} WHERE {date_condition} GROUP BY {field_name}"):
        if name is None: continue
        if whitelist_regex and not fullmatch(whitelist_regex, name):
            continue
        ranking.append((count, name))
    ranking.sort()
    return ranking
 # Regular expressions to split a referer uri into its parts, see cleanup_referer
 # optional protocol including the "://"
 # re_uri_protocol = f"(https?)://"
 re_uri_protocol = f"(https?://)?"
 # four blocks of up to three digits, optionally followed by a port
 re_uri_ipv4 = r"(?:(?:(?:\d{1,3}\.?){4})(?::\d+)?)"
 # re_uri_ipv6 = ""
 # optional subdomains (captured), followed by domain.tld where the tld has at least two letters
 re_uri_domain = r"(?:([^/]+\.)*[^/]+\.[a-zA-Z]{2,})"
 # optional location: everything from the first "/" on
 re_uri_location = r"(?:/(.*))?"
 # groups of the full expression:
 #   1: protocol (None if missing), 2: domain including subdomains or ipv4 address,
 #   3: subdomains (None if missing), 4: location with leading "/", 5: location without leading "/"
 re_uri_full = f"{re_uri_protocol}({re_uri_domain}|{re_uri_ipv4})({re_uri_location})"
 # (https?://)?((?:([^/]+\.)*[^/]+\.[a-zA-Z]{2,})|(?:(?:(?:\d{1,3}\.?){4})(?::\d+)?))((?:/(.*))?)
 def cleanup_referer(referer: str) -> str:
    """
    Split the referer uri into its parts and reassemble them depending on the settings.

    The parts protocol, subdomains, tld and location are dropped if the matching
    settings["referer_ranking_ignore_..."] option is set. The tld is only dropped if the domain
    consists of exactly two parts (domain.tld).
    :param referer: referer uri
    :returns the cleaned up referer, or the unchanged referer (plus a warning) if it does not match re_uri_full
    """
    m = fullmatch(re_uri_full, referer)
    if not m:
        warning(f"cleanup_referer: Could not match referer '{referer}'")
        return referer
    # optional groups are None if they are missing in the referer, treat them as empty
    # so that a referer without protocol does not raise a TypeError below
    protocol = m.group(1) or ""
    subdomains = m.group(3) or ""
    # NOTE(review): replace removes every occurrence of the subdomain string, not only the leading one
    domain = m.group(2).replace(subdomains, "")
    location = m.group(4) or ""
    referer = domain
    if settings["referer_ranking_ignore_tld"]:
        domain_parts = domain.split(".")
        if len(domain_parts) == 2:  # if domain.tld
            referer = domain_parts[0]
    if not settings["referer_ranking_ignore_subdomain"]: referer = subdomains + referer
    if not settings["referer_ranking_ignore_protocol"]: referer = protocol + referer
    if not settings["referer_ranking_ignore_location"]: referer += location
    return referer
 def cleanup_referer_ranking(referer_ranking: list[tuple[int, str]]):
    """
    Clean up all referers of a ranking with cleanup_referer and merge entries that become equal.

    The ranking is modified in place, afterwards it is sorted in ascending order.
    :param referer_ranking: [(count, referer)]
    """
    merged_counts = {}
    for count, referer in referer_ranking:
        cleaned = cleanup_referer(referer)
        merged_counts[cleaned] = merged_counts[cleaned] + count if cleaned in merged_counts else count
    # replace the content of the list, the caller keeps using the same list object
    referer_ranking[:] = sorted((count, referer) for referer, count in merged_counts.items())
 def get_city_and_country_ranking(cur:sql.Cursor, require_humans=True, regex_city_blacklist="", regex_country_blacklist=""):
    """
    Rank cities and countries by the number of visitors whose ip range belongs to them.

    :param cur: cursor of the database
    :param require_humans: only count visitors with is_human = 1
    :param regex_city_blacklist: if not empty, cities whose name fully matches this regex are not ranked
    :param regex_country_blacklist: if not empty, countries whose name fully matches this regex are not ranked
    :returns [(count, "city name (country code)")], [(count, country name)], both sorted in ascending order
    """
    sql_cmd = f"SELECT ci.name, c.code, c.name FROM {t_country} AS c, {t_city} as ci, {t_visitor} as u, {t_ip_range} as i WHERE u.ip_range_id = i.ip_range_id AND i.city_id = ci.city_id AND ci.country_id = c.country_id"
    if require_humans: sql_cmd += " AND u.is_human = 1"
    cur.execute(sql_cmd)
    pdebug(f"get_city_and_country_ranking: require_humans={require_humans}, regex_city_blacklist='{regex_city_blacklist}', regex_country_blacklist='{regex_country_blacklist}'")
    rows = cur.fetchall()
    pdebug(f"get_city_and_country_ranking: found {len(rows)} ip_ranges")
    filter_cities = len(regex_city_blacklist) > 0
    filter_countries = len(regex_country_blacklist) > 0
    city_entries = {}  # city name -> [count, country code, country name]
    country_counts = {}  # country name -> count
    # one row (city name, country code, country name) per visitor
    for city_name, country_code, country_name in rows:
        if city_name in city_entries:
            city_entries[city_name][0] += 1
        elif not filter_cities or fullmatch(regex_city_blacklist, city_name) is None:
            city_entries[city_name] = [1, country_code, country_name]
        if country_name in country_counts:
            country_counts[country_name] += 1
        elif not filter_countries or fullmatch(regex_country_blacklist, country_name) is None:
            country_counts[country_name] = 1
    city_ranking = sorted((entry[0], f"{name} ({entry[1]})") for name, entry in city_entries.items())
    country_ranking = sorted((count, name) for name, count in country_counts.items())
    return city_ranking, country_ranking
 #
 # PLOTTING
 #
 # add value labels
 def add_vertikal_labels_in_bar_plot(labels, max_y_val, ax, bar_plot):
    """
    Write a vertical label at every bar of a bar plot.

    The label is placed above the bar. If the bar is higher than 60% of max_y_val,
    the label is placed near the bottom of the bar instead.
    :param labels: label texts, labels[i] belongs to the i-th bar
    :param max_y_val: reference height used to scale all offsets
    :param ax: axes to write the labels on
    :param bar_plot: iterable of the bars
    """
    large_bar_limit = 0.6 * max_y_val
    for position, bar in enumerate(bar_plot):
        bar_height = bar.get_height()
        # large bars get their label at the bottom, all others directly above the bar
        label_base = 0.05 * max_y_val if bar_height > large_bar_limit else bar_height
        center_x = bar.get_x() + bar.get_width()/2.
        ax.text(center_x, label_base + 0.025 * max_y_val, labels[position], ha='center', va='bottom', rotation=90)
 # add count labels
 def add_labels_at_top_of_bar(xdata, ydata, max_y_val, ax, bar_plot):
    """
    Write the value of every bar, rounded to one decimal, in a white box just below the top of the bar.

    :param xdata: not used
    :param ydata: values of the bars, ydata[i] belongs to the i-th bar
    :param max_y_val: reference height, the labels are placed 5% of it below the top of the bar
    :param ax: axes to write the labels on
    :param bar_plot: iterable of the bars
    """
    y_offset = 0.05 * max_y_val
    for position, bar in enumerate(bar_plot):
        center_x = bar.get_x() + bar.get_width()/2
        value = ydata[position]
        ax.text(center_x, value - y_offset, round(value, 1), ha='center', bbox=dict(facecolor='white', alpha=0.8))
 def plot_ranking(ranking: list[tuple[int, str]], fig=None, xlabel="", ylabel="", color_settings:dict|list=[], figsize=None):
    """
    Make a bar plot of the last settings["file_ranking_plot_max_files"] entries of a ranking.

    :param ranking: [(count, name)], expected to be sorted in ascending order so that the last entries are the highest
    :param fig: figure to add the plot to, a new one is created if not given
    :param xlabel: label of the x axis
    :param ylabel: label of the y axis
    :param color_settings: dict { color: [names] }: a bar gets the color in whose list the part of its name
                           after the last "." is found (last match wins).
                           list [colors]: the bars cycle through the colors.
                           Bars without a color are drawn in palette["blue"], this includes all bars if color_settings is empty.
    :param figsize: size of a newly created figure
    :returns the figure
    """
    if not fig:
        fig = plt.figure(figsize=figsize, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
    ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel)
    # only the last entries are plotted
    start_index = max(0, len(ranking) - settings["file_ranking_plot_max_files"])
    x_names = []
    y_counts = []
    colors = []
    for bar_index, (count, name) in enumerate(ranking[start_index:]):
        x_names.append(name)
        y_counts.append(count)
        color = palette["blue"]
        if isinstance(color_settings, dict):
            suffix = name.split(".")[-1]
            for candidate_color, names in color_settings.items():
                if suffix in names: color = candidate_color
        elif isinstance(color_settings, list) and color_settings:  # an empty list would lead to a modulo by zero
            color = color_settings[bar_index % len(color_settings)]
        colors.append(color)
    bar = ax.bar(x_names, y_counts, tick_label="", color=colors)
    if len(y_counts) > 0:
        # scale the label offsets with the highest bar, which is the last one for sorted rankings
        max_y_val = max(y_counts)
        add_vertikal_labels_in_bar_plot(x_names, max_y_val, ax, bar)
        if settings["plot_add_count_label"]: add_labels_at_top_of_bar(x_names, y_counts, max_y_val, ax, bar)
    return fig
 # def plot(xdata, ydata, fig=None, ax=None, xlabel="", ylabel="", label="", linestyle='-', marker="", color="blue", rotate_xlabel=0):
 #     if not fig:
 #         fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
 #     if not ax:
 #         ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel)
 #     else:
 #         ax = ax.twinx()
 #         ax.set_ylabel(ylabel)
 #         # ax.tick_params(axis="y", labelcolor="r")
 #     ax.plot(xdata, ydata, marker=marker, label=label, linestyle=linestyle, color=color)
 #     plt.xticks(rotation=rotate_xlabel)
 #     if label: ax.legend()
 #     return fig, ax
 def plot2y(xdata, ydata1, ydata2, fig=None, ax1=None, ax2=None, plots=None, xlabel="", ylabel1="", ylabel2="", label1="", label2="", linestyle='-', marker="", color1="blue", color2="orange", grid="major", rotate_xlabel=0, figsize=None):
    """
    Plot two data series over the same x values, each one on its own y axis.

    To add more lines to an existing plot, pass the fig, ax1, ax2 and plots returned by an earlier call.
    :param xdata: x values for both series
    :param ydata1: values plotted on the left y axis (ax1)
    :param ydata2: values plotted on the right y axis (ax2)
    :param fig: figure to plot on, a new one is created if not given
    :param ax1, ax2: axes to plot on, new ones are created (and labeled with xlabel, ylabel1, ylabel2) unless both are given
    :param plots: lines of earlier calls, they are included in the legend. A given non-empty list is extended in place
    :param label1, label2: legend labels of the two lines
    :param linestyle, marker, color1, color2: passed to the plot functions
    :param grid: "major", "minor" or "both": which grid lines to draw on ax1, any other value disables the grid
    :param rotate_xlabel: not used, the x tick labels are always rotated by 90 degrees
    :param figsize: size of a newly created figure
    :returns fig, ax1, ax2, plots
    """
    if not fig:
        fig = plt.figure(figsize=figsize, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
    if not (ax1 and ax2):
        ax1 = fig.add_subplot(xlabel=xlabel, ylabel=ylabel1)
        ax2 = ax1.twinx()
        ax2.set_ylabel(ylabel2)
    ax1.tick_params(axis="x", rotation=90)
    plot1 = ax1.plot(xdata, ydata1, marker=marker, label=label1, linestyle=linestyle, color=color1)
    plot2 = ax2.plot(xdata, ydata2, marker=marker, label=label2, linestyle=linestyle, color=color2)
    if plots: plots += plot1 + plot2
    else: plots = plot1 + plot2
    # one common legend for the lines of both axes
    plt.legend(plots, [ l.get_label() for l in plots])
    if grid in ("major", "minor", "both"):
        # minor grid lines need minor ticks, the major grid does not
        if grid in ("minor", "both"):
            ax1.minorticks_on()
        ax1.grid(visible=True, which=grid, linestyle="-", color="#888")
    return fig, ax1, ax2, plots
 #
 # MAIN
 #
 def visualize(loaded_settings: dict):
    pdebug("visualizing...")
    global settings
    settings = loaded_settings
    if not settings["db"]: missing_arg("db")
    if not settings["server_name"]: missing_arg("server_name")
    img_dir = settings["img_dir"]
    pdebug("img_dir:", img_dir)
    img_filetype = settings["img_filetype"]
    img_location = settings["img_location"]
    names = {
        # paths
        "img_file_ranking_last_x_days": f"ranking_files_last_x_days.{img_filetype}",
        "img_referer_ranking_last_x_days": f"ranking_referers_last_x_days.{img_filetype}",
        "img_countries_last_x_days": f"ranking_countries_last_x_days.{img_filetype}",
        "img_cities_last_x_days": f"ranking_cities_last_x_days.{img_filetype}",
        "img_browser_ranking_last_x_days": f"ranking_browsers_last_x_days.{img_filetype}",
        "img_operating_system_ranking_last_x_days": f"ranking_operating_systems_last_x_days.{img_filetype}",
        "img_visitors_and_requests_last_x_days": f"visitor_request_count_daily_last_x_days.{img_filetype}",
        "img_file_ranking_total": f"ranking_files_total.{img_filetype}",
        "img_referer_ranking_total": f"ranking_referers_total.{img_filetype}",
        "img_countries_total": f"ranking_countries_total.{img_filetype}",
        "img_cities_total": f"ranking_cities_total.{img_filetype}",
        "img_browser_ranking_total": f"ranking_browsers_total.{img_filetype}",
        "img_operating_system_ranking_total": f"ranking_operating_systems_total.{img_filetype}",
        "img_visitors_and_requests_total": f"visitor_request_count_daily_total.{img_filetype}",
        # values
        "mobile_visitor_percentage_total": 0.0,
        "mobile_visitor_percentage_last_x_days": 0.0,
        "visitor_count_last_x_days": 0,
        "visitor_count_total": 0,
        "request_count_last_x_days": 0,
        "request_count_total": 0,
        "human_visitor_percentage_last_x_days": 0.0,
        "human_visitor_percentage_total": 0.0,
        "human_request_percentage_last_x_days": 0.0,
        "human_request_percentage_total": 0.0,
        # general
        "regina_version": settings["version"],
        "server_name": settings["server_name"],
        "last_x_days": settings["last_x_days"],  # must be after all the things with last_x_days!
        "earliest_date": "1990-1-1",
        "generation_date": "1990-1-1 0:0:0",
    }
    conn = sql.connect(settings["db"])
    if isdir(img_dir) and img_filetype:
        gen_img = True
    else:
        print(f"Warning: Not generating images since at least one required variable is invalid: img_dir='{img_dir}', img_filetype='{img_filetype}'")
        gen_img = False
    cur = conn.cursor()
    get_humans = settings["get_human_percentage"]
    # pdebug(f"visualize: settings {settings}")
    # DATE STRINGS
    earliest_date = get_earliest_date(cur)
    names["earliest_date"] = dt.fromtimestamp(earliest_date).strftime("%Y-%m-%d")
    names["generation_date"] = dt.now().strftime("%Y-%m-%d %H:%M:%S")
    # LAST_X_DAYS
    # last_x_days_min_date: latest_date - last_x_days
    secs_per_day = 86400
    last_x_days_min_date = get_latest_date(cur) - settings["last_x_days"] * secs_per_day
    last_x_days_str = get_where_date_str(min_date=last_x_days_min_date)
    days = get_days(cur, last_x_days_str)
    days_strs = [get_where_date_str(at_date=day) for day in days]
    # ALL DATES
    all_time_str = get_where_date_str(min_date=0)
    # all months in yyyy-mm format
    months_all_time = get_months(cur, all_time_str)
    # sqlite constrict to month string
    months_strs = []
    for year_month in months_all_time:
        year, month = year_month.split("-")
        # first day of the month
        min_date  = dt(int(year), int(month), 1).timestamp()
        month = (int(month) % 12) + 1  # + 1 month
        year = int(year)
        if month == 1: year += 1
        # first day of the next month - 1 sec
        max_date = dt(year, month, 1).timestamp() - 1
        months_strs.append(get_where_date_str(min_date=min_date, max_date=max_date))
    for i in range(2):
        suffix = ["_total", "_last_x_days"][i]
        date_str = [all_time_str, last_x_days_str][i]
        date_names = [months_all_time, days][i]
        date_strs = [months_strs, days_strs][i]
        assert(len(date_names) == len(date_strs))
        # FILES
        file_ranking = get_file_ranking(cur, date_str)
        if gen_img:
            fig_file_ranking = plot_ranking(file_ranking, xlabel="Filename/Filegroup", ylabel="Number of requests", color_settings=color_settings_filetypes, figsize=settings["plot_size_broad"])
            fig_file_ranking.savefig(f"{img_dir}/{names[f'img_file_ranking{suffix}']}", bbox_inches="tight")
        # REFERER
        referer_ranking = get_request_ranking("referer", t_request, settings["referer_ranking_regex_whitelist"], cur, date_str)
        pdebug("Referer ranking", referer_ranking)
        cleanup_referer_ranking(referer_ranking)
        if gen_img:
            fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate, figsize=settings["plot_size_broad"])
            fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_referer_ranking{suffix}']}", bbox_inches="tight")
        # GEOIP
        if settings["do_geoip_rankings"]:
            city_ranking, country_ranking = get_city_and_country_ranking(cur, require_humans=settings["geoip_only_humans"], regex_city_blacklist=settings["city_ranking_regex_blacklist"], regex_country_blacklist=settings["country_ranking_regex_blacklist"])
            pdebug("Country ranking:", country_ranking)
            pdebug("City ranking:", city_ranking)
            if gen_img:
                fig_referer_ranking = plot_ranking(country_ranking, xlabel="Country", ylabel="Number of visitors", color_settings=color_settings_alternate, figsize=settings["plot_size_broad"])
                fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_countries{suffix}']}", bbox_inches="tight")
                fig_referer_ranking = plot_ranking(city_ranking, xlabel="City", ylabel="Number of visitors", color_settings=color_settings_alternate, figsize=settings["plot_size_broad"])
                fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_cities{suffix}']}", bbox_inches="tight")
        # USER
        # visitor_agent_ranking = get_visitor_agent_ranking(cur, date_str)
        # for the time span
        unique_visitor_ids = get_unique_visitor_ids_for_date(cur, date_str)
        unique_visitor_ids_human = []
        get_human_visitors(cur, unique_visitor_ids, unique_visitor_ids_human)
        # for each date
        date_count = len(date_strs)
        unique_visitor_ids_dates: list[list[int]] = []
        unique_request_ids_dates: list[list[int]] = []
        unique_visitor_ids_human_dates: list[list[int]] = [[] for _ in range(date_count)]
        unique_request_ids_human_dates: list[list[int]] = [[] for _ in range(date_count)]
        for i in range(date_count):
            date_str_ = date_strs[i]
            unique_visitor_ids_dates.append(get_unique_visitor_ids_for_date(cur, date_str_))
            unique_request_ids_dates.append(get_unique_request_ids_for_date(cur, date_str_))
            if get_humans:
                # empty_list = []
                # unique_visitor_ids_human_dates.append(empty_list)
                get_human_visitors(cur, unique_visitor_ids_dates[i], unique_visitor_ids_human_dates[i])
                # unique_request_ids_human_dates.append(list())
                for human in unique_visitor_ids_human_dates[i]:
                    get_unique_request_ids_for_date_and_visitor(cur, date_str_, human, unique_request_ids_human_dates[i])
        # print("\n\tuu", unique_visitor_ids_dates, "\n\tur",unique_request_ids_dates, "\n\tuuh", unique_visitor_ids_human_dates, "\n\turh", unique_request_ids_human_dates)
        # pdebug("uui",   unique_visitor_ids)
        # pdebug("uuih",  unique_visitor_ids_human)
        # pdebug("uuid",  unique_visitor_ids_dates)
        # pdebug("uuidh", unique_visitor_ids_human_dates)
        # pdebug("urid",  unique_request_ids_dates)
        # pdebug("uridh", unique_visitor_ids_human_dates)
        # pdebug(f"human_visitor_precentage: len_list_list(visitor_ids)={len_list_list(unique_visitor_ids_dates)}, len_list_list(visitor_ids_human)={len_list_list(unique_visitor_ids_human_dates)}")
        if get_humans:
            try:
                names[f"human_visitor_percentage{suffix}"] = round(100 * len_list_list(unique_visitor_ids_human_dates) / len_list_list(unique_visitor_ids_dates), 2)
            except:
                names[f"human_visitor_percentage{suffix}"] = -1.0
            try:
                names[f"human_request_percentage{suffix}"] = round(100 * len_list_list(unique_request_ids_human_dates) / len_list_list(unique_request_ids_dates), 2)
            except:
                names[f"human_request_percentage{suffix}"] = -1.0
        names[f"visitor_count{suffix}"] = len_list_list(unique_visitor_ids_dates)
        names[f"request_count{suffix}"] = len_list_list(unique_request_ids_dates)
        if gen_img:
            fig_daily, ax1, ax2, plots = plot2y(date_names, [len(visitor_ids) for visitor_ids in unique_visitor_ids_dates], [len(request_ids) for request_ids in unique_request_ids_dates], xlabel="Date", ylabel1="Visitor count", label1="Unique visitors", ylabel2="Request count", label2="Unique requests", color1=palette["red"], color2=palette["blue"], rotate_xlabel=-45, figsize=settings["plot_size_broad"])
            if get_humans:
                fig_daily, ax1, ax2, plots = plot2y(date_names, [len(visitor_ids) for visitor_ids in unique_visitor_ids_human_dates], [len(request_ids) for request_ids in unique_request_ids_human_dates], label1="Unique visitors (human)", label2="Unique requests (human)", color1=palette["orange"], color2=palette["green"], fig=fig_daily, ax1=ax1, ax2=ax2, plots=plots, rotate_xlabel=-45, figsize=settings["plot_size_broad"])
            fig_daily.savefig(f"{img_dir}/{names[f'img_visitors_and_requests{suffix}']}", bbox_inches="tight")
        # os & browser
        os_ranking, browser_ranking, names[f"mobile_visitor_percentage{suffix}"] = get_os_browser_mobile_rankings(cur, unique_visitor_ids_human)
        if gen_img:
            fig_os_rating = plot_ranking(os_ranking, xlabel="Platform", ylabel="Share [%]", color_settings=color_settings_operating_systems, figsize=settings["plot_size_narrow"])
            fig_os_rating.savefig(f"{img_dir}/{names[f'img_operating_system_ranking{suffix}']}", bbox_inches="tight")
            fig_browser_rating = plot_ranking(browser_ranking, xlabel="Browsers", ylabel="Share [%]", color_settings=color_settings_browsers, figsize=settings["plot_size_narrow"])
            fig_browser_rating.savefig(f"{img_dir}/{names[f'img_browser_ranking{suffix}']}", bbox_inches="tight")
    # print("OS ranking", os_ranking)
    # print("Browser ranking", browser_ranking)
    # print("Mobile percentage", names["mobile_visitor_percentage"])
    if settings["template_html"] and settings["html_out_path"]:
        pdebug(f"visualize: writing to html: {settings['html_out_path']}")
        with open(settings["template_html"], "r") as file:
            html = file.read()
        for name, value in names.items():
            if "img" in name:
                value = f"{img_location}/{value}"
            html = html.replace(f"%{name}", str(value))
        with open(settings["html_out_path"], "w") as file:
            file.write(html)
    else:
        warning(f"Skipping html generation because either template_html or html_out_path is invalid: template_html='{settings['template_html']}', html_out_path='{settings['html_out_path']}'")
--- a/regina/default.cfg
+++ b/regina/default.cfg
@ -0,0 +1,155 @@
 # ************************************* REGINA CONFIGURATION **************************************
 #                       .__
 # _______   ____   ____ |__| ____ _____
 # \_  __ \_/ __ \ / ___\|  |/    \\__  \
 # |  | \/\  ___// /_/  >  |   |  \/ __ \_
 # |__|    \___  >___  /|__|___|  (____  /
 #             \/_____/         \/     \/
 # ************************************************************************************************* 
 [ regina ]
 # name of the server or website
 # will be available as variable for the generated website as %server_name
 # string
 server_name = 
 # database path. if not specified, use xdg-data-home/regina/<server-name> 
 # eg: /home/my_user/regina/my_website.db
 # 
 # path or empty
 database =
 [ data-collection ]
 # path to the nginx access log to parse
 # eg: /var/log/nginx/access.log
 # path (read permissions)
 access_log =
 # FILE GROUPING
 # nginx locations and their root directory: location:directory,location:directory,...
 # eg: /:/www/my_website,/error:/www/error
 locs_and_dirs = 
 # filetypes that should be grouped (comma separated)
 # eg: png,jpg,jpeg,gif,svg,css,ico,pdf,txt
 auto_group_filetypes = 
 # group certain files
 # eg: home:index.html,home.html;images:image1.png,image2.png
 # PATHS
 [ data-visualization ]
 # template html input
 # eg: /home/my_visitor/.regina/template.html
 # path (read permissions)
 template_html = 
 # output for the generated html
 # eg: /www/analytics/statistics.html
 # path (write permissions)
 html_out_path = 
 # output directory for the generated plots
 # WARNING: you have to create the directory yourself, regina will not create it
 # eg: /www/analytics/images
 # path (directory with write permissions)
 img_out_dir = 
 # nginx location for the generated images, its root must be img_out_dir
 # eg: images
 img_location = 
 #
 # if the root for your server is /www/analytics and html_out_path is /www/analytics/analytics.html,
 # use img_out_dir = /www/analytics/images and img_location = /images
 [ route_groups ]
 images = 
    *.gif
    *.jpeg
    *.jpg
    *.png
    *.svg
 # HUMAN DETECTION
 # whether a request with 30x http status counts as success
 status_300_is_success = False
 # if False, a unique visitor is an (ip address - visitor agent) pair, if True only the ip address
 unique_visitor_is_ip_address = False
 # whether a visitor needs to make at least 1 successful request to be a human
 human_needs_success = True
 # don't collect requests to locations that fully match this regex
 # eg: /analytics.*
 request_location_regex_blacklist =
 [ geoip ]
 # GEOIP
 get_visitor_location = False
 # this option is only relevant when --update-geoip is used
 # list of capitalized ISO 3166-1 alpha-2 country codes for which the location needs to be resolved at city level, not country level
 # for EU, use: get_cities_for_countries = AT, BE, BG, HR, CY, CZ, DK, EE, FI, FR, DE, GR, HU, IE, IT, LV, LT, LU, MT, NL, PL, PT, RO, SK, SI, ES, SE
 get_cities_for_countries =  
 # hash_ip_address = False
 # ***************************************** VISUALIZATION ***************************************** 
 # these settings can be changed at any point in time as they only affect the visualization of the data
 # ************************************************************************************************* 
 [ visualization ]
 # separate visitors into all and humans 
 # True/False
 get_human_percentage = True
 # GEOIP
 # generate a country and city ranking
 # True/False
 do_geoip_rankings = False
 # only use humans for geoip rankings
 # True/False
 geoip_only_humans = True
 # eg exclude unknown cities: City in .*
 # regex
 city_ranking_regex_blacklist = City in .*
 # regex
 country_ranking_regex_blacklist =
 # ignore the protocol in referers, so https://url.com = http://url.com -> url.com
 referer_ranking_ignore_protocol = True
 # ignore the subdomains in referers, so foo.url.com = bar.url.com -> url.com
 referer_ranking_ignore_subdomain = False
 # ignore the location in referers, so url.com/foo = url.com/bar -> url.com
 referer_ranking_ignore_location = True
 # regex expression as whitelist for referer ranking, minus means empty
 # eg exclude empty referers: ^[^\-].*
 referer_ranking_regex_whitelist = ^[^\-].*
 # regex expression as whitelist for file ranking
 # eg .*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif)) to only show these files
 # regex
 route_ranking_regex_whitelist =
 # maximum number of route (group)s on the file ranking
 # int
 route_ranking_plot_max_routes = 20
 # whether to ignore non existing files in the ranking
 # True/False
 route_ranking_ignore_error_files = True
 # int
 plot_dpi = 300
 # affects visitor/request count plot, geoip rankings, file ranking and referer ranking
 plot_size_broad = 14, 5
 # affects platform and browser ranking
 plot_size_narrow = 7, 5
 # ******************************************** REGINA ********************************************* 
 # these settings affect the behavior of regina
 # ************************************************************************************************* 
 # print lots! of debug messages to help you find problems
 debug = False
--- a/regina/generated-default.cfg
+++ b/regina/generated-default.cfg
@ -0,0 +1,166 @@
 # ************************************* REGINA CONFIGURATION **************************************
 #                           .__
 #     _______   ____   ____ |__| ____ _____
 #     \_  __ \_/ __ \ / ___\|  |/    \\__  \
 #     |  | \/\  ___// /_/  >  |   |  \/ __ \_
 #     |__|    \___  >___  /|__|___|  (____  /
 #                 \/_____/         \/     \/
 #     *************************************************************************************************
 # Common Settings
 [ regina ]
 # name (not url) of the server or website
 # will be available as variable for the generated html as %server_name
 # type: string
 # server_name = my_website
 server_name = 
 # database path
 # type: file (read, write permissions)
 # database = /home/my_user/regina/my_website.db
 database = 
 # path to the nginx access log to parse
 # type: file (read permissions)
 # access_log = /var/log/nginx/access.log
 access_log = 
 # The template and the generated file do not actually have to be HTML files, you can change them to whatever you want
 [ html-generation ]
 # type: True/False
 generate_html = True
 # template html input
 # type: file (read permissions)
 # template_html = /home/my_visitor/.regina/template.html
 template_html = 
 # output for the generated html
 # type: file (write permissions)
 # html_out_path = /www/analytics/statistics.html
 html_out_path = 
 # output directory for the generated plots
 # type: directory (write permissions)
 # img_out_dir = /www/analytics/images
 img_out_dir = 
 # nginx location for the generated images (this has to map to img_out_dir)
 # type: eg: images
 # img_location = /images
 img_location = 
 # These settings affect the data collection. If changed, they will affect how the database is being filled in the future.
 [ data-collection ]
 # whether a unique visitor is only identified by IP address
 # type: True/False
 unique_visitor_is_ip_address = 
 # whether a visitor needs at least one successful request to be a human
 # type: True/False
 human_needs_success = True
 # whether a request with 30x HTTP status counts as successful request
 # type: True/False
 status_300_is_success = True
 # delete all ip addresses after the collection is done
 # type: True/False
 delete_ip_addresses = True
 # don't collect requests to locations that match this regex
 # type: regexp, None, int or string
 # request_location_blacklist = /analytics.*
 request_location_blacklist = 
 # whether to get visitor location information
 # type: True/False
 get_visitor_location = 
 # whether to generate country and city rankings using GeoIP (requires GeoIP Database)
 # type: True/False
 do_geoip_rankings = 
 # countries for which the GeoIP needs to be resolved at city level
 # type: list of capitalized ISO 3166-1 alpha-2 country codes
 # get_cities_for_countries = AT, BE, BG, HR, CY, CZ, DK, EE, FI, FR, DE, GR, HU, IE, IT, LV, LT, LU, MT, NL, PL, PT, RO, SK, SI, ES, SE
 get_cities_for_countries = 
 # whether to use only humans for GeoIP rankings (requires GeoIP Database)
 # type: True/False
 geoip_only_humans = True
 [ rankings ]
 # Explanation for blacklists and whitelists:
 #     If a blacklist is given: values that fully match the blacklist are excluded
 #     If a whitelist is given: values that do not fully match the whitelist are excluded
 #     Both are optional: you can provide one, none or both
 # type: regexp or None
 # city_ranking_blacklist = City in .*
 city_ranking_blacklist = 
 # type: regexp or None
 city_ranking_whitelist = 
 # type: regexp or None
 country_ranking_blacklist = 
 # type: regexp or None
 country_ranking_whitelist = 
 # type: regexp or None
 # route_ranking_blacklist = .*\.((css)|(txt))
 route_ranking_blacklist = 
 # type: regexp or None
 # route_ranking_whitelist = .*\.((php)|(html)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))
 route_ranking_whitelist = 
 # maximum number of entries in route ranking
 # type: int
 route_ranking_plot_max_routes = 20
 # whether to ignore non-existing routes in ranking
 # type: True/False
 route_ranking_ignore_404 = True
 # type: regexp or None
 # referer_ranking_blacklist = Example: exclude '-' (nginx sets this when there is no referer)
 referer_ranking_blacklist = -
 # type: regexp or None
 referer_ranking_whitelist = 
 # whether to ignore protocol in referer ranking (if True: https://domain.com == http://domain.com -> domain.com)
 # type: True/False
 referer_ranking_ignore_protocol = True
 # whether to ignore subdomains in referer ranking (if True: sub.domain.com == another.sub2.domain.com -> domain.com)
 # type: True/False
 referer_ranking_ignore_subdomain = 
 # whether to ignore route in referer ranking (if True: domain.com/route1 == domain.com/route2 -> domain.com)
 # type: True/False
 referer_ranking_ignore_route = True
 [ plots ]
 # DPI for plots
 # type: int
 plot_dpi = 300
 # plot size for broad plots: width, height
 # type: int, int
 plot_size_broad = 14, 5
 # plot size for narrow plots: width, height
 # type: int, int
 plot_size_narrow = 7, 5
 # *************************************************************************************************
 #     https://git.quintern.xyz/MatthiasQuintern/regina
 #     *************************************************************************************************
--- a/regina/main.py
+++ b/regina/main.py
@ -5,18 +5,19 @@ from sys import argv, exit
 from os.path import isfile
 import sqlite3 as sql
-if __name__ == "__main__":
+import argparse
 if __name__ == "__main__":  # make relative imports work as described here: https://peps.python.org/pep-0366/#proposed-change
    if __package__ is None:
        # make relative imports work as described here: https://peps.python.org/pep-0366/#proposed-change
        __package__ = "regina"
        import sys
        from os import path
        filepath = path.realpath(path.abspath(__file__))
        sys.path.insert(0, path.dirname(path.dirname(filepath)))
-from .db_operation.collect import parse_log, add_requests_to_db, update_ip_range_id
+from .data_collection.parse_log import parse_log
-from .db_operation.database import create_db, update_geoip_tables, t_visitor
+from .database import Database
-from .db_operation.visualize import visualize
+from .data_visualization import visualize
 from .utility.settings_manager import read_settings_file
 from .utility.globals import settings, version
 from .utility.utility import pmessage
@ -74,81 +75,56 @@ def error(arg):
    print("Error:", arg)
    exit(1)
 def main():
    """
    Hand-rolled command line parsing.

    Walks over argv (skipping the program name) and records the values of
    --config/-c, --log-file and --update-geoip as well as the --collect and
    --visualize flags. --help/-h prints the help and exits with status 0.
    The value following an option is not skipped: it is visited as an argument
    of its own in the next iteration and ignored unless it matches an option name.
    missing_arg_val is called for an option without a value, missing_arg if
    none of --visualize, --collect or --update-geoip was given.
    """
    config_file = ""
    log_file = ""
    geoip_city_csv = ""
    collect = False
    visualize_ = False
    # parse args
    arg_count = len(argv)
    for pos in range(1, arg_count):
        arg = argv[pos]
        has_value = arg_count > pos + 1
        # options that take a value from the config/logfile group
        if arg in ("--config", "-c"):
            if has_value:
                config_file = argv[pos + 1]
            else:
                missing_arg_val(arg)
        elif arg == "--log-file":
            if has_value:
                log_file = argv[pos + 1]
            else:
                missing_arg_val(arg)
        # actions; this is a separate chain, every argument passes through both
        if arg == "--update-geoip":
            if has_value:
                geoip_city_csv = argv[pos + 1]
            else:
                missing_arg_val(arg)
        elif arg in ("--help", "-h"):
            help()
            exit(0)
        elif arg == "--collect":
            collect = True
        elif arg == "--visualize":
            visualize_ = True
    if not (collect or visualize_ or geoip_city_csv):
        missing_arg("--visualize or --collect or --update-geoip")
-    if not config_file:
+def main2():
-        missing_arg("--config")
+    parser = argparse.ArgumentParser(prog="regina")
-    if not isfile(config_file):
+    parser.add_argument("--config", "-c",   action="store",         help="path to a config file that specifies all the other parameters", metavar="config-file", required=True)
-        error(f"Not a file: '{config_file}'")
+    parser.add_argument("--update-geoip",   action="store",         help="path to IP-COUNTRY-REGION-CITY database in csv format", metavar="geoip-csv")
-    read_settings_file(config_file, settings)
+    parser.add_argument("--visualize",      action="store_true",    help="generate the visualization website")
    parser.add_argument("--collect",        action="store_true",    help="fill the database from the nginx access log")
    parser.add_argument("--log-file",       action="store",         help="use alternate logfile than what is set in the config file", metavar="log-file")
    args = parser.parse_args()
    if not (args.collect or args.visualize or args.update_geoip):
        parser.error("at least one of --visualize, --collect, or --update-geoip is required.")
    if not path.isfile(args.config):
        parser.error(f"invalid path to configuration file: '{args.config}'")
    read_settings_file(args.config, settings)
    settings["version"] = version
    if log_file: settings["access_log"] = log_file
    if args.log_file:
        settings["access_log"] = args.log_file
-    if not settings["server_name"]: missing_arg("server-name")
+    if not settings["server_name"]:
-    if not settings["access_log"]: missing_arg("log")
+        error("'server-name' is missing in the configuration file.")
    if not settings["db"]: missing_arg("db")
    if isinstance(settings["auto_group_filetypes"], str):
        settings["auto_group_filetypes"] = settings["auto_group_filetypes"].split(",")
    if isinstance(settings["locs_and_dirs"], str):
        settings["locs_and_dirs"] = [ loc_and_dir.split(":") for loc_and_dir in settings["locs_and_dirs"].split(",") ]
-    if not isfile(config_file):
+    if not settings["access_log"]:
-        error(f"Not a file: '{config_file}'")
+        error("'log' is missing in the configuration file.")
    if not settings["db"]:
        error("'db' is missing in the configuration file.")
-    if not isfile(settings["db"]):
+    db = Database(settings["db"])
-        create_db(settings["db"], settings["filegroups"], settings["locs_and_dirs"], settings["auto_group_filetypes"])
+    # if not isfile(settings["db"]):
    #     create_db(settings["db"], settings["filegroups"], settings["locs_and_dirs"], settings["auto_group_filetypes"])
-    if geoip_city_csv:
+    if args.update_geoip:
-        if not isfile(geoip_city_csv):
+        if not isfile(args.update_geoip):
-            error(f"Not a file: '{geoip_city_csv}'")
+            error(f"Not a file: '{args.update_geoip}'")
-        conn = sql.connect(settings['db'], isolation_level=None)  # required vor vacuum
+        db.update_geoip_tables(args.update_geoip)
        cur = conn.cursor()
        update_geoip_tables(cur, geoip_city_csv)
        # update visitors
-        for visitor_id in range(sql_tablesize(cur, t_visitor)):
+        for (visitor_id) in db(f"SELECT visitor_id FROM visitor"):
-            update_ip_range_id(cur, visitor_id)
+            db.update_ip_range_id(visitor_id)
-        cur.close()
+    if args.collect:
        conn.commit()
        conn.close()
    if collect:
        pmessage(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}' and logfile '{settings['access_log']}'")
        requests = parse_log(settings["access_log"])
-        add_requests_to_db(requests, settings["db"])
+        db.add_requests(requests)
-    if visualize_:
+    if args.visualize:
        pmessage(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}'")
        if not isfile(settings["db"]): error(f"Invalid database path: '{settings['db']}'")
        visualize(settings)
 if __name__ == '__main__':
-    main()
+    main2()
--- a/regina/sql/create_db.sql
+++ b/regina/sql/create_db.sql
@ -67,5 +67,5 @@ CREATE TABLE IF NOT EXISTS city(
 CREATE TABLE IF NOT EXISTS country(
    country_id  INTEGER PRIMARY KEY,
    name        TEXT UNIQUE,
-    code        TEXT UNIQUE,
+    code        TEXT UNIQUE
 ) STRICT;
--- a/regina/test.db
+++ b/regina/test.db
--- a/regina/todo.py
+++ b/regina/todo.py
@ -0,0 +1,34 @@
 def get_files_from_dir_rec(p: str, files: list[str]):
    """
    Recursively collect files below p.

    @param p: path of a file or a directory
    @param files: output list; the path of every file found is appended to it
    """
    pdebug("get_files_from_dir_rec:",p)
    if path.isfile(p):
        files.append(p)
        return
    if not path.isdir(p):
        # neither a file nor a directory: nothing to collect
        return
    for entry in listdir(p):
        get_files_from_dir_rec("/".join((p, entry)), files)
 def create_filegroups(cursor: sql.Cursor, filegroup_str: str):
    """
    Create the groups described by filegroup_str and assign the listed files to them.

    TODO: make re-usable (alter groups when config changes)
    NOTE(review): the table names passed to the sql_* helpers are empty strings, presumably
    placeholders left over from removing the filegroups table - confirm before using this code.
    NOTE(review): names and filenames are not stripped, so 'name1: file1, file2' yields
    ' file1' with a leading space - verify whether the sql_* helpers account for that.

    @param cursor: cursor used for all lookups, inserts and updates
    @param filegroup_str: 'name1: file1, file2, file3; name2: file33'
    """
    groups = filegroup_str.strip(";").split(";")
    pdebug("create_filegroups:", groups)
    for group in groups:
        if not group.strip():
            # empty filegroup_str or a stray ';': nothing to create, and unpacking below would fail
            continue
        name, vals = group.split(":")
        # create/get group
        if sql_exists(cursor, "", [("groupname", name)]):
            group_id = sql_select(cursor, "", [("groupname", name)])[0][0]
        else:
            group_id = sql_max(cursor, "", "group_id") + 1
            sql_insert(cursor, "", [(group_id, name)])
        # pdebug("create_filegroups: group_id", group_id)
        # create/edit file
        for filename in vals.split(","):
            if sql_exists(cursor, "", [("filename", filename)]):  # if exist, update
                # update the row of *this* file (was the literal 'fil'); values are bound, not formatted in
                cursor.execute("UPDATE file SET group_id = ? WHERE filename = ?", (group_id, filename))
            else:
                sql_insert(cursor, "", [[filename, group_id]])
--- a/regina/utility/globals.py
+++ b/regina/utility/globals.py
@ -2,57 +2,9 @@
 import os
-version = "1.0"
+version = "2.0"
 # default settings, these can be overridden through a config file
 settings = {
    # GENERAL
    "server_name": "default_sever",
    # DATA COLLECTION
    "access_log": "",
    "db": "",
    "locs_and_dirs": [],
    "auto_group_filetypes": [],
    "filegroups": "",
    "request_location_regex_blacklist": "",
    "request_is_same_on_same_day": True,  # multiple requests from the same visitor to the same file on the same day are counted as 1
    "unique_visitor_is_ip_address": False,
    "get_visitor_location": False,
    "get_cities_for_countries": [""],  # list of country codes for which the ip address ranges need to be collected at city level, not country level
    "hash_ip_address": True,
    # VISUALIZATION
    "get_human_percentage": False,
    "human_needs_success": True,  # a human must have at least 1 successful request (status < 300)
    "status_300_is_success": False,  # whether 30x status codes count as success
    "do_geoip_rankings": False,
    "geoip_only_humans": True,
    "city_ranking_regex_blacklist": "",
    "country_ranking_regex_blacklist": "",
    # "file_ranking_regex_whitelist": r".*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))",
    "file_ranking_regex_whitelist": r".*\.(html)",
    "file_ranking_ignore_error_files": False,  # skip files that only had unsuccessful requests (status < 300)
    "referer_ranking_ignore_protocol": True,
    "referer_ranking_ignore_subdomain": False,
    "referer_ranking_ignore_location": True,
    "referer_ranking_ignore_tld": False,
    "referer_ranking_regex_whitelist": r"^[^\-].*",  # minus means empty
    "visitor_agent_ranking_regex_whitelist": r"",
    "file_ranking_plot_max_files": 15,
    # "plot_figsize": (60, 40),
    "plot_dpi": 300,
    "plot_add_count_label": True,
    "plot_size_broad": (10, 5),
    "plot_size_narrow": (6.5, 5),
    "img_dir": "",
    "img_location": "",
    "img_filetype": "svg",
    "template_html": "",
    "html_out_path": "",
    "last_x_days": 30,
    # regina
    "debug": False
 }
 # these oses and browser can be detected:
 # lower element takes precedence
--- a/regina/utility/settings_manager.py
+++ b/regina/utility/settings_manager.py
@ -1,3 +1,298 @@
 from configparser import ConfigParser
 """
 Classes and methods for managing regina configuration
 Using CFG_File and CFG_Entry, you set defaults and type restrictions for
 a dictionary like ReginaSettings object and also export the defaults as a .cfg file
 """
 def comment(s):
    """
    turn s into cfg comment text: every line gets a "# " prefix
    '#' and ' ' characters at the very start and end of the text are stripped before the first prefix is added
    """
    prefixed = "\n# ".join(s.split("\n"))
    return "# " + prefixed.strip("# ")
 # for eventual later type checking
 class regexp:
    """
    marker type: the value of a setting is a regular expression
    carries no data or behavior of its own
    """
 class Path:
    """
    represents a path
    stores whether the path is a file or a directory and which permissions are listed for it
    """
    # permission letter -> word used in the description, in output order
    _PERMISSION_NAMES = (("r", "read"), ("w", "write"), ("x", "execute"))
    def __init__(self, permissions="r", is_dir=False):
        """
        @param permissions: string made of the letters r, w and x
        @param is_dir: True for a directory, False for a file
        """
        self.is_dir = is_dir
        self.permissions = permissions
    def __repr__(self):
        """
        human readable description, eg: "file (read, write permissions)"
        letters other than r, w, x are ignored
        if no known letter is given, only "file" or "directory" is returned
        """
        kind = "directory" if self.is_dir else "file"
        requested = self.permissions or ""
        names = [name for letter, name in self._PERMISSION_NAMES if letter in requested]
        # without this guard an unknown letter produced a broken string like "file permissions)"
        if not names:
            return kind
        return f"{kind} ({', '.join(names)} permissions)"
 class CFG_Entry:
    """
    key - value pair in a cfg file
    extra parameters for comments on top of the key - value pair
    """
    types = str|Path|None|type[regexp]|type[str]|type[bool]|type[int]
    def __init__(self, key, dflt=None, typ_: types|list[types]|tuple[types] =str, desc="", exam=""):   # all 4 letters -> nice indent
        """
        @param key: name of the setting
        @param dflt: default value, written after "key = " (None: nothing is written)
        @param typ_: type for the value:
            use list of types if multiple types are allowed
            use tuple of types for tuple of types
            use a string for a free-text description of the type
        @param desc: description, exported as comment above the entry
        @param exam: example value, exported as commented "key = example" line
        """
        self.key = key
        self.default = dflt
        self.type_ = typ_
        self.descripton= desc  # (sic) attribute name left unchanged in case other code reads it
        self.example = exam
    def type_str(self):
        """
        @returns human readable description of the allowed type(s):
            list  -> "a, b or c"
            tuple -> "a, b, c"
            else  -> description of the single type
        """
        def _type_str(t):
            if isinstance(t, str):      return t  # free-text description, used as is
            if t is None:               return "None"
            if t == str:    return "string"
            if t == bool:   return "True/False"
            if t == int:    return "int"
            if t == float:  return "float"
            if t == regexp: return "regexp"
            if isinstance(t, Path):     return str(t)
            try:
                return t.__name__
            except AttributeError:
                return str(t)
        if isinstance(self.type_, list):
            names = [_type_str(t) for t in self.type_]
            if len(names) < 2:
                return "".join(names)
            return ", ".join(names[:-1]) + " or " + names[-1]
        if isinstance(self.type_, tuple):
            return ", ".join(_type_str(t) for t in self.type_)
        return _type_str(self.type_)
    def __repr__(self):
        """
        @returns the entry as cfg text: optional comment lines (description, type, example)
            followed by the "key = default" line
        """
        s = ""
        if self.descripton: s += f"{comment(self.descripton)}\n"
        if self.type_:      s += f"{comment('type: ' + self.type_str())}\n"
        if self.example:    s += comment(f"{self.key} = {self.example}\n")
        s += f"{self.key} = "
        # compare against None: False and 0 are valid defaults and must show up in the file
        if self.default is not None:    s += f"{self.default}"
        s += "\n"
        return s
 class CFG_File:
    """
    in-memory model of a cfg file: header, sections with their entries, footer
    converting the object to a string (__repr__) gives the text to write to the file
    """
    def __init__(self, header="", footer=""):
        self.header = header
        self.footer = footer
        self.sections = []  # (name, desc, entries)
    def add_section(self, name:str, entries: list[CFG_Entry|str], desc=""):
        """
        append a section; sections are exported in the order they were added
        """
        section = (name, desc, entries)
        self.sections.append(section)
    def __repr__(self):
        """
        commented header, then per section: optional commented description,
        "[ name ]" line and one block per entry, then the commented footer
        """
        parts = [comment(self.header), "\n"]
        for name, desc, entries in self.sections:
            if desc:
                parts.append(f"\n{comment(desc)}")
            parts.append(f"\n[ {name} ]\n")
            parts.extend(f"{entry}\n" for entry in entries)
        parts.append(comment(self.footer))
        return "".join(parts)
 if __name__ == "__main__":
    # When run as a script: describe all settings with their defaults, types and examples
    # and export them as "generated-default.cfg" in the current working directory.
    cfg = CFG_File(header=r"""
    ************************************* REGINA CONFIGURATION **************************************
                          .__
    _______   ____   ____ |__| ____ _____
    \_  __ \_/ __ \ / ___\|  |/    \\__  \
    |  | \/\  ___// /_/  >  |   |  \/ __ \_
    |__|    \___  >___  /|__|___|  (____  /
                \/_____/         \/     \/
    ************************************************************************************************* """.strip(" \n"), footer=r"""
    *************************************************************************************************
    https://git.quintern.xyz/MatthiasQuintern/regina
    *************************************************************************************************
    """.strip(" \n"))
    cfg.add_section("regina", desc="Common Settings", entries=[
        CFG_Entry("server_name",
                desc="name (not url) of the server or website\nwill be available as variable for the generated html as %server_name",
                typ_=str,
                exam="my_website"),
        CFG_Entry("database",
                desc="database path",
                typ_=Path(permissions="rw"),
                exam="/home/my_user/regina/my_website.db"),
        CFG_Entry("access_log",
                desc="path to the nginx access log to parse",
                typ_=Path(permissions="r"),
                exam="/var/log/nginx/access.log"),
        ])
    cfg.add_section("html-generation", desc="The template and generated file do not actually have to be htmls, you can change it to whatever you want", entries=[
        CFG_Entry("generate_html",
                typ_=bool,
                dflt=True),
        CFG_Entry("template_html",
                desc="template html input",
                typ_=Path(permissions="r"),
                exam="/home/my_visitor/.regina/template.html"),
        CFG_Entry("html_out_path",
                desc="output for the generated html",
                typ_=Path(permissions="w"),
                exam="/www/analytics/statistics.html"),
        CFG_Entry("img_out_dir",
                desc="output directory for the generated plots",
                typ_=Path(permissions="w", is_dir=True),
                exam="/www/analytics/images"),
        CFG_Entry("img_location",
                desc="nginx location for the generated images (this has to map to img_out_dir)",
                typ_="eg: images",
                exam="/images"),
        ])
    cfg.add_section("data-collection", desc="These settings affect the data collection. If changed, they will affect how the database is being filled in the future.", entries=[
        CFG_Entry("unique_visitor_is_ip_address",
                dflt=False,
                desc="whether a unique visitor is only identified by IP address",
                typ_=bool),
        CFG_Entry("human_needs_success",
                dflt=True,
                desc="whether a visitor needs at least one successful request to be a human",
                typ_=bool),
        CFG_Entry("status_300_is_success",
                dflt=True,
                desc="whether a request with 30x HTTP status counts as successful request",
                typ_=bool),
        CFG_Entry("delete_ip_addresses",  # TODO: Implement
                dflt=True,
                desc="delete all ip addresses after the collection is done",
                typ_=bool),
        CFG_Entry("request_location_blacklist",
                desc="don't collect requests to locations that match this regex",
                typ_=[regexp, None],
                exam="/analytics.*"),
        CFG_Entry("get_visitor_location",
                dflt=False,
                desc="whether to get visitor location information",
                typ_=bool),
        CFG_Entry("do_geoip_rankings",  # TODO: is used?
                dflt=False,
                desc="whether to generate country and city rankings using GeoIP (requires GeoIP Database)",
                typ_=bool),
        # example: the EU member states ("GR" = Greece; "GZ" is not an ISO 3166-1 code)
        CFG_Entry("get_cities_for_countries",
                desc="countries for which the GeoIP needs to be resolved at city level",
                typ_="list of capitalized ISO 3166-1 alpha-2 country codes",
                exam="AT, BE, BG, HR, CY, CZ, DK, EE, FI, FR, DE, GR, HU, IE, IT, LV, LT, LU, MT, NL, PL, PT, RO, SK, SI, ES, SE"),
        CFG_Entry("geoip_only_humans", # TODO: is used?
                dflt=True,
                desc="whether to use only humans for GeoIP rankings (requires GeoIP Database)",
                typ_=bool),
        ])
    cfg.add_section("rankings", desc="", entries=[
        # plain string entry: exported as a comment block at the top of the section
        comment("""
    Explanation for blacklists and whitelists:
    If a blacklist is given: values that fully match the blacklist are excluded
    If a whitelist is given: values that do not fully match the whitelist are excluded
    Both are optional: you can provide, none or both
        """.strip("\n")),
        CFG_Entry("city_ranking_blacklist",
                typ_=[regexp, None],
                exam="City in .*"),
        CFG_Entry("city_ranking_whitelist",
                typ_=[regexp, None]),
        CFG_Entry("country_ranking_blacklist",
                typ_=[regexp, None]),
        CFG_Entry("country_ranking_whitelist",
                typ_=[regexp, None]),
        CFG_Entry("route_ranking_blacklist",
                typ_=[regexp, None],
                exam=r".*\.((css)|(txt))"),
        CFG_Entry("route_ranking_whitelist",
                typ_=[regexp, None],
                exam=r".*\.((php)|(html)|(png)|(jpeg)|(jpg)|(svg)|(gif))"),
        CFG_Entry("route_ranking_plot_max_routes",
                dflt=20,
                desc="maximum number of entries in route ranking",
                typ_=int),
        CFG_Entry("route_ranking_ignore_404",
                dflt=True,
                desc="whether to ignore non-existing routes in ranking",
                typ_=bool),
        # TODO add groups
        # Entry("route_groups",
                # desc="route groups for images",
                # typ_=[regexp, None],
                # exam="*.gif, *.jpeg, *.jpg, *.png, *.svg".replace(", ", "\n")),
        # the explanation is a description, not an example value:
        # as exam it was exported as "# referer_ranking_blacklist = Example: ..."
        CFG_Entry("referer_ranking_blacklist",
                dflt="-",
                desc="the default excludes '-' (nginx sets this when there is no referer)",
                typ_=[regexp, None]),
        CFG_Entry("referer_ranking_whitelist",
                typ_=[regexp, None]),
        CFG_Entry("referer_ranking_ignore_protocol",
                dflt=True,
                desc="whether to ignore protocol in referer ranking (if True: https://domain.com == http://domain.com -> domain.com)",
                typ_=bool),
        CFG_Entry("referer_ranking_ignore_subdomain",
                dflt=False,
                desc="whether to ignore subdomains in referer ranking (if True: sub.domain.com == another.sub2.domain.com -> domain.com)",
                typ_=bool),
        CFG_Entry("referer_ranking_ignore_route",
                dflt=True,
                desc="whether to ignore route in referer ranking (if True: domain.com/route1 == domain.com/route2 -> domain.com)",
                typ_=bool),
        ])
    cfg.add_section("plots", desc="", entries=[
        CFG_Entry("plot_dpi",
                dflt=300,
                desc="DPI for plots",
                typ_=int),
        CFG_Entry("plot_size_broad",
                dflt="14, 5",
                desc="plot size for broad plots: width, height",
                typ_=(int, int)),
        CFG_Entry("plot_size_narrow",
                dflt="7, 5",
                desc="plot size for narrow plots: width, height",
                typ_=(int, int)),
        ])
    # f"{cfg}" falls back to CFG_File.__repr__, which builds the file content
    with open("generated-default.cfg", "w") as file:
        file.write(f"{cfg}")
 def get_bool(bool_str: str, fallback=False):
    if bool_str in ["true", "True"]: return True
@ -53,3 +348,72 @@ def read_settings_file(filepath: str, settings:dict, ignore_invalid_lines=True,
                    else: continue
        else:
            settings[vals[0]] = vals[1].strip(" ")
 class ReginaSettings:
    """
    dictionary like container for the regina settings
    settings are read with settings[key] and written with settings[key] = value
    the type of an existing setting can not be changed by assigning to it
    """
    def __init__(self, config_file):
        """
        fill the settings with the built-in defaults
        @param config_file: path of the config file
            TODO: the file is not read yet, only the defaults are set
        """
        # with open(config_file, "r") as file
        # default settings, these are overwriteable through a config file
        self._settings = {
            # GENERAL
            "server_name": "default_sever",
            # DATA COLLECTION
            "access_log": "",
            "db": "",
            "locs_and_dirs": [],
            "auto_group_filetypes": [],
            "filegroups": "",
            "request_location_blacklist": "",
            "request_is_same_on_same_day": True,  # multiple requests from same visitor to same file at same day are counted as 1
            "unique_visitor_is_ip_address": False,
            "get_visitor_location": False,
            "get_cities_for_countries": [""],  # list of country codes for which the ip address ranges need to be collected at city level, not country level
            "hash_ip_address": True,
            # VISUALIZATION
            "get_human_percentage": False,
            "human_needs_success": True,  # a human must have at least 1 successful request (status < 300)
            "status_300_is_success": False,  # 300 codes are success
            "do_geoip_rankings": False,
            "geoip_only_humans": True,
            "city_ranking_blacklist": "",
            "country_ranking_blacklist": "",
            # "file_ranking_whitelist": r".*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))",
            "file_ranking_whitelist": r".*\.(html)",
            "file_ranking_ignore_error_files": False,  # skip files that only had unsuccessful requests (status < 300)
            "referer_ranking_ignore_protocol": True,
            "referer_ranking_ignore_subdomain": False,
            "referer_ranking_ignore_location": True,
            "referer_ranking_ignore_tld": False,
            "referer_ranking_whitelist": r"^[^\-].*",  # minus means empty
            "visitor_agent_ranking_whitelist": r"",
            "file_ranking_plot_max_files": 15,
            # "plot_figsize": (60, 40),
            "plot_dpi": 300,
            "plot_add_count_label": True,
            "plot_size_broad": (10, 5),
            "plot_size_narrow": (6.5, 5),
            "img_dir": "",
            "img_location": "",
            "img_filetype": "svg",
            "template_html": "",
            "html_out_path": "",
            "last_x_days": 30,
            # regina
            "debug": False
        }
    # __getitem__/__setitem__ have to be defined on the class (not inside __init__),
    # otherwise settings[key] raises TypeError because the object is not subscriptable
    def __getitem__(self, key):
        """
        @returns the value stored for key
        KeyError is raised if key does not exist
        """
        return self._settings[key]
    def __setitem__(self, key, value):
        """
        set key to value.
        if key already exists, TypeError is raised if value is not of the same type as the current value
        """
        if key in self._settings:
            if type(value) is not type(self._settings[key]):
                raise TypeError(f"ReginaSettings: Trying to set value of '{key}' to '{value}' of type '{type(value)}', but the current type is '{type(self._settings[key])}'.")
        self._settings[key] = value
--- a/regina/utility/utility.py
+++ b/regina/utility/utility.py
@ -2,6 +2,7 @@
 # print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}")
 from sys import exit
 from os import path
 from re import fullmatch
 from regina.utility.globals import settings
@ -9,6 +10,29 @@ from regina.utility.globals import settings
 Various utilities
 """
 def is_whitelisted(val: str, whitelist: str|list[str]|None):
    """
    Check if val is in a regex whitelist
    whitelist: regexp, list of regexp or None
    if whitelist is None, always return True
    """
    if not whitelist: return True
    if type(whitelist) == str:
        return fullmatch(whitelist, val)
    if type(whitelist) == list:
        for w in whitelist:
            if not fullmatch(w, val): return False
    return True
 def is_blacklisted(val: str, blacklist: str|list[str]|None):
    """
    Check if val is in a regex blacklist
    blacklist: regexp, list of regexp or None
    if blacklist is None, always return False
    """
    return not is_whitelisted(val, blacklist)
 def pdebug(*args, **keys):
    """
    print like the builtin print, but only if the "debug" setting is enabled
    """
    if not settings["debug"]:
        return
    print(*args, **keys)
--- a/test/test.db
+++ b/test/test.db