diff --git a/database.uxf b/database.uxf
index d30cf51..5e99d88 100644
--- a/database.uxf
+++ b/database.uxf
@@ -7,7 +7,7 @@
364
273
299
- 208
+ 234
User
--
@@ -19,6 +19,7 @@
- platform: TEXT
- browser: TEXT
- mobile: INTEGER
+- is_human: INTEGER
style=autoresize
diff --git a/default.conf b/default.conf
index e2002f1..2be8d2a 100644
--- a/default.conf
+++ b/default.conf
@@ -1,24 +1,48 @@
-# nginx analytics config for quintern.xy
+# default configuration for regina
# GENERAL
-server-name = default-sever
+server_name = default_server
+# path to the database
+db = /home/my_user/analytics/my_website.db
+
# DATA COLLECTION
-db = /home/my-user/analytics/my-website.db
-access-log = /home/my-user/analytics/access.log
-locs-and-dirs = /:/www/my-website,/error:/www/error
-auto-group-filetypes = png,jpg,jpeg,gif,svg,css
+# these settings only apply to newly collected data and the creation of a new database
+# path to the nginx access log to parse.
+access_log = /home/my_user/analytics/access.log
+# nginx locations and their root directory: location:directory,location:directory,...
+locs_and_dirs = /:/www/my_website,/error:/www/error
+# filetypes that should be grouped (comma separated)
+auto_group_filetypes = png,jpg,jpeg,gif,svg,css,ico,pdf,txt
+# whether a request with a 30x http status counts as a success
+status_300_is_success = False
+# whether a user needs to make at least 1 successful request to count as human
+human_needs_success = True
+# filegroups, e.g. group index.html and home.html
+filegroups = home:index.html,home.html;images:image1.png,image2.png
+# filegroups =
# VISUALIZATION
-get-human-percentage = True
-# "file_ranking_regex_whitelist": r".*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))",
-file_ranking_regex_whitelist = .*\.(html)
-# minus means empty
+# separate users into all and humans
+get_human_percentage = True
+# regex whitelist for the file ranking
+# file_ranking_regex_whitelist = .*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))
+file_ranking_regex_whitelist =
+# regex whitelist for the referer ranking; nginx logs '-' for an empty referer
+# e.g. exclude empty referers:
referer_ranking_regex_whitelist = ^[^\-].*
+# regex whitelist for the user agent ranking
user_agent_ranking_regex_whitelist =
-file_ranking_plot_max_files = 15
+# maximum number of file(group)s on the file ranking
+file_ranking_plot_max_files = 20
+# whether to exclude files whose requests all resulted in errors from the ranking
+file_ranking_ignore_error_files = True
+
# "plot_figsize" = (60 40),
plot_dpi = 300
+# output directory for the generated plots
img_dir = /www/analytics/images
-img_dir = /analytics/images
-template_html = /home/my-user/analytics/template.html
+# nginx location for the generated images; its root directory must be img_dir
+img_location = images
+# template html input
+template_html = /home/my_user/analytics/template.html
+# output for the generated html
html_out_path = /www/analytics/statistics.html
-# filegroups = start:/index.html,/about.html,/img_on_index.png;music:/music.html,song.mp3
diff --git a/regina/__init__.py b/regina/__init__.py
new file mode 100644
index 0000000..9d718fe
--- /dev/null
+++ b/regina/__init__.py
@@ -0,0 +1,5 @@
+"""Gather analytics from nginx access logs and visualize them through generated images and a generated html"""
+# __package__ = 'regina'
+
+from db_operation import database, visualize, collect
+print("running __init__.py")
diff --git a/regina/db_operation/__init__.py b/regina/db_operation/__init__.py
new file mode 100644
index 0000000..a694da7
--- /dev/null
+++ b/regina/db_operation/__init__.py
@@ -0,0 +1,2 @@
+"""Gather analytics from nginx access logs and visualize them through generated images and a generated html"""
+# __package__ = 'regina'
diff --git a/regina/db_operation/collect.py b/regina/db_operation/collect.py
new file mode 100644
index 0000000..5e16885
--- /dev/null
+++ b/regina/db_operation/collect.py
@@ -0,0 +1,163 @@
+import sqlite3 as sql
+from re import match
+from time import mktime
+from datetime import datetime as dt
+from db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup
+from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
+from utility.utility import pdebug, warning
+from utility.globals import user_agent_operating_systems, user_agent_browsers, settings
+
+"""
+collect information from the access log and put it into the database
+"""
+months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
+
+
+
+class Request:
+ def __init__(self, ip_address="", time_local="", request_type="", request_file="", request_protocol="", status="", bytes_sent="", referer="", user_agent=""):
+ self.ip_address = sanitize(ip_address)
+ self.time_local = 0
+ #[20/Nov/2022:00:47:36 +0100]
+ m = match(r"\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\]", time_local)
+ if m:
+ g = m.groups()
+ try:
+ datetime_ = dt(int(g[2]), months.index(g[1])+1, int(g[0]), int(g[3]), int(g[4]), int(g[5]))
+ self.time_local = int(mktime(datetime_.timetuple()))
+ except Exception as e:
+ warning(f"Request:__init__: {e}")
+ else:
+ warning(f"Request:__init__: Could not match time: '{time_local}'")
+
+ self.request_type = sanitize(request_type)
+ self.request_file = sanitize(request_file)
+ self.request_protocol = sanitize(request_protocol)
+ self.status = sanitize(status)
+ self.bytes_sent = sanitize(bytes_sent)
+ self.referer = sanitize(referer)
+ self.user_agent = sanitize(user_agent)
+
+ def __repr__(self):
+ return f"{self.ip_address} - {self.time_local} - {self.request_file} - {self.user_agent} - {self.status}"
+
+re_remote_addr = r"[0-9a-fA-F.:]+"
+re_remote_user = ".*"
+re_time_local = r"\[.+\]"
+re_request = r'"[^"]+"'
+re_status = r'\d+'
+re_body_bytes_sent = r'\d+'
+re_http_referer = r'"([^"]*)"'
+re_http_user_agent = r'"([^"]*)"'
+re_log_format: str = f'({re_remote_addr}) - ({re_remote_user}) ({re_time_local}) ({re_request}) ({re_status}) ({re_body_bytes_sent}) {re_http_referer} {re_http_user_agent}'
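+# example of a line this regex matches, assuming the default nginx "combined" log format:
+# 123.123.123.123 - - [20/Nov/2022:00:47:36 +0100] "GET /index.html HTTP/1.1" 200 3112 "https://example.com" "Mozilla/5.0 (X11; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0"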
+def parse_log(logfile:str) -> list[Request]:
+ """
+ create Request objects from each line in the logfile
+ """
+ requests = []
+ with open(logfile, "r") as file:
+ lines = file.readlines()
+ for line in lines:
+ m = match(re_log_format, line)
+ if m is None:
+ warning(f"parse_log: Unmatched line: '{line}'")
+ continue
+ # print(m.groups())
+ g = m.groups()
+ request_ = m.groups()[3].split(" ")
+ if len(request_) != 3:
+ warning(f"parse_log: len('{m.groups()[3]}'.split(' ')) is {len(request_)} and not 3")
+ continue
+ requests.append(Request(ip_address=g[0], time_local=g[2],
+ request_type=request_[0], request_file=request_[1], request_protocol=request_[2],
+ status=g[4], bytes_sent=g[5], referer=g[6], user_agent=g[7]))
+ return requests
+
+def get_user_id(request: Request, cursor: sql.Cursor) -> int:
+ """
+ get the user_id. Adds the user if not already existing
+ """
+ # if user exists
+ if sql_exists(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)]):
+ user_id = sql_select(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)])[0][0]
+ else: # new user
+ # new user_id is number of elements
+ user_id: int = sql_tablesize(cursor, t_user)
+ pdebug("new user:", user_id, request.ip_address)
+ platform, browser, mobile = get_os_browser_pairs_from_agent(request.user_agent)
+ is_human = 0 # is_user_human cannot be called before the user is added to the db
+ cursor.execute(f"INSERT INTO {t_user} (user_id, ip_address, user_agent, platform, browser, mobile, is_human) VALUES ({user_id}, '{request.ip_address}', '{request.user_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}');")
+ return user_id
+
+def is_user_human(cur: sql.Cursor, user_id: int):
+ """
+ check if the user has a known platform AND browser
+ check if at least one request did not result in an error (http status >= 400)
+ """
+ global settings
+ max_success_status = 300
+ if settings["status_300_is_success"]: max_success_status = 400
+ # check if the user has a known platform and browser (stored as empty strings when unknown)
+ cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_user} WHERE user_id = {user_id} AND platform != '' AND browser != '')")
+ # if neither was found
+ if cur.fetchone()[0] == 0: return False
+ # if human needs successful request
+ if settings["human_needs_success"]:
+ # check if at least one request was successful (status < max_success_status)
+ cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_request} WHERE user_id = {user_id} AND status < {max_success_status})")
+ if cur.fetchone()[0] == 1:
+ # pdebug(f"is_user_human: User {user_id} is human")
+ pass
+ else:
+ # pdebug(f"is_user_human: User {user_id} only had unsuccessful requests")
+ return False
+ # user is human
+ return True
+
+
+# re_user_agent = r"(?: ?([\w\- ]+)(?:\/([\w.]+))?(?: \(([^()]*)\))?)"
+# 1: platform, 2: version, 3: details
+def get_os_browser_pairs_from_agent(user_agent):
+ # for groups in findall(re_user_agent, user_agent):
+ operating_system = ""
+ browser = ""
+ mobile = "Mobi" in user_agent
+ for os in user_agent_operating_systems:
+ if os in user_agent:
+ operating_system = os
+ break
+ for br in user_agent_browsers:
+ if br in user_agent:
+ browser = br
+ break
+ # if not operating_system or not browser: print(f"Warning: get_os_browser_pairs_from_agent: Could not find all information for agent '{user_agent}', found os: '{operating_system}' and browser: '{browser}'")
+ return operating_system, browser, mobile
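+# usage sketch with a made-up user agent:
+# get_os_browser_pairs_from_agent("Mozilla/5.0 (Windows NT 10.0; rv:106.0) Gecko/20100101 Firefox/106.0")
+# -> ("Windows", "Firefox", False)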
+
+
+def add_requests_to_db(requests: list[Request], db_name: str):
+ conn = sql.connect(db_name)
+ cursor = conn.cursor()
+ # check the new users later
+ max_user_id = sql_tablesize(cursor, t_user)
+ for i in range(len(requests)):
+ request = requests[i]
+ # pdebug("add_requests_to_db:", i, "request:", request)
+ user_id = get_user_id(request, cursor)
+ conn.commit()
+ group_id: int = get_filegroup(request.request_file, cursor)
+ # check if request is unique
+ group_id_name = database_tables[t_filegroup].key.name
+ user_id_name = database_tables[t_user].key.name
+ if sql_exists(cursor, t_request, [(group_id_name, group_id), (user_id_name, user_id), ("date", request.time_local)]):
+ # pdebug("request exists:", request)
+ pass
+ else:
+ # pdebug("new request:", request)
+ request_id = sql_tablesize(cursor, t_request)
+ sql_insert(cursor, t_request, [[request_id, user_id, group_id, request.time_local, request.referer, request.status]])
+ for user_id in range(max_user_id, sql_tablesize(cursor, t_user)):
+ is_human = is_user_human(cursor, user_id)
+ if is_human:
+ cursor.execute(f"UPDATE {t_user} SET is_human = 1 WHERE user_id = {user_id}")
+ cursor.close()
+ conn.commit()
diff --git a/regina/db_operation/database.py b/regina/db_operation/database.py
new file mode 100644
index 0000000..3cdd830
--- /dev/null
+++ b/regina/db_operation/database.py
@@ -0,0 +1,157 @@
+# from sys import path
+# print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}")
+import sqlite3 as sql
+from os import path, listdir
+# local
+from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
+from utility.utility import pdebug
+
+
+"""
+create reginas database as shown in the uml diagram database.uxf
+"""
+
+class Entry:
+ """
+ represents an sql entry
+ type_ is INTEGER, TEXT, REAL...
+ """
+ def __init__(self, name, type_) -> None:
+ self.name = name
+ self.type_ = type_
+ def __repr__(self):
+ return f"[{self.name}] {self.type_}"
+
+class Table:
+ def __init__(self, name, key: Entry, entries: list[Entry]=[], constraints: list[str]=[]):
+ self.name = name
+ self.key = key
+ self.entries = entries
+ self.constraints = constraints
+ def create_sql_str(self):
+ return f"CREATE TABLE IF NOT EXISTS {self.name}\n({self})\n"
+ def __repr__(self):
+ s = f"{self.key} PRIMARY KEY"
+ for entry in self.entries:
+ s += f", {entry}"
+ for c in self.constraints:
+ s += f", {c}"
+ return s
+t_request = "request"
+t_file = "file"
+t_filegroup = "filegroup"
+t_user = "user"
+
+user_id = Entry("user_id", "INTEGER")
+request_id = Entry("request_id", "INTEGER")
+filegroup_id = Entry("group_id", "INTEGER")
+ip_address_entry = Entry("ip_address", "TEXT")
+filename_entry = Entry("filename", "TEXT")
+database_tables = {
+ t_user: Table(t_user, user_id, [Entry("ip_address", "TEXT"), Entry("user_agent", "TEXT"), Entry("platform", "TEXT"), Entry("browser", "TEXT"), Entry("mobile", "INTEGER"), Entry("is_human", "INTEGER")], [f"UNIQUE({user_id.name})"]),
+ t_file: Table(t_file, filename_entry, [filegroup_id], [f"UNIQUE({filename_entry.name})"]),
+ t_filegroup: Table(t_filegroup, filegroup_id, [Entry("groupname", "TEXT")], [f"UNIQUE({filegroup_id.name})"]),
+ t_request: Table(t_request, request_id, [
+ user_id, filegroup_id, Entry("date", "INTEGER"), Entry("referer", "TEXT"), Entry("status", "INTEGER")
+ ], ["UNIQUE(request_id)"]),
+}
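+# for reference, database_tables[t_user].create_sql_str() yields:
+# CREATE TABLE IF NOT EXISTS user
+# ([user_id] INTEGER PRIMARY KEY, [ip_address] TEXT, [user_agent] TEXT, [platform] TEXT, [browser] TEXT, [mobile] INTEGER, [is_human] INTEGER, UNIQUE(user_id))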
+
+
+
+def get_filegroup(filename: str, cursor: sql.Cursor) -> int:
+ """
+ get the filegroup for filename
+ returns the group_id of the group where
+ 1) filename is the groupname
+ 2) the filetype of filename is the groupname
+ 3) otherwise, a new group with filename as groupname is created
+ """
+ # pdebug(f"get_filegroup: {filename}")
+ if sql_exists(cursor, t_file, [("filename", filename)]):
+ return sql_select(cursor, t_file, [("filename", filename)])[0][1]
+ else:
+ suffix = filename.split('.')[-1]
+ cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname = '{suffix}'")
+ # cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname LIKE '%.{suffix}'")
+ group_id_candidates = cursor.fetchall()
+ pdebug(f"get_filegroup: file={filename} candidates={group_id_candidates}")
+ if group_id_candidates:
+ return group_id_candidates[0][0]
+ else: # add a new group with filename as groupname
+ group_id = sql_tablesize(cursor, t_filegroup)
+ # pdebug("new file(group):", group_id, filename)
+ # add group
+ sql_insert(cursor, t_filegroup, [[group_id, filename]])
+ # add file
+ sql_insert(cursor, t_file, [[filename, group_id]])
+ return group_id
+
+def create_filegroups(cursor: sql.Cursor, filegroup_str: str):
+ # filegroup_str: 'name1: file1, file2, file3; name2: file33'
+ groups = filegroup_str.strip(";").split(";")
+ pdebug("create_filegroups:", groups)
+ for group in groups:
+ if not group: continue # skip empty segments from a leading/trailing ';'
+ name, vals = group.split(":")
+ # create/get group
+ if sql_exists(cursor, t_filegroup, [("groupname", name)]):
+ group_id = sql_select(cursor, t_filegroup, [("groupname", name)])[0][0]
+ else:
+ group_id = sql_tablesize(cursor, t_filegroup)
+ sql_insert(cursor, t_filegroup, [(group_id, name)])
+ # pdebug("create_filegroups: group_id", group_id)
+ # create/edit file
+ for filename in vals.split(","):
+ if sql_exists(cursor, t_file, [("filename", filename)]): # if exist, update
+ cursor.execute(f"UPDATE {t_file} SET group_id = {group_id} WHERE filename = '{filename}'")
+ else:
+ sql_insert(cursor, t_file, [[filename, group_id]])
+
+def get_files_from_dir_rec(p: str, files: list[str]):
+ """recursivly append all files to files"""
+ pdebug("get_files_from_dir_rec:",p)
+ if path.isfile(p):
+ files.append(p)
+ elif path.isdir(p):
+ for p_ in listdir(p):
+ get_files_from_dir_rec(p + "/" + p_, files)
+
+def get_auto_filegroup_str(location_and_dirs:list[tuple[str, str]], auto_group_filetypes:list[str]) -> str:
+ """
+ :param location_and_dirs: list of nginx locations and their corresponding directories
+ :param auto_group_filetypes: list of filetypes for auto grouping
+ """
+ files: list[str] = []
+ start_i = 0
+ for location, dir_ in location_and_dirs:
+ get_files_from_dir_rec(dir_, files)
+ # replace dir_ with location, eg /www/website with /
+ for i in range(start_i, len(files)):
+ files[i] = files[i].replace(dir_, location).replace("//", "/")
+ start_i = len(files) # only process the files of the next location in the next iteration
+ filegroups = ""
+ # create groups for each filetype
+ for ft in auto_group_filetypes:
+ filegroups += f"{ft}:"
+ for file in files:
+ if file.endswith(f".{ft}"):
+ filegroups += f"{file},"
+ filegroups = filegroups.strip(",") + ";"
+ pdebug("get_auto_filegroup_str: found files:", files, "filegroups_str:", filegroups)
+ return filegroups
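+# a sketch, assuming location_and_dirs=[("/", "/www/my_website")] where the directory
+# contains style.css and favicon.ico, and auto_group_filetypes=["css", "ico"]:
+# the returned string is "css:/style.css;ico:/favicon.ico;"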
+
+def create_db(name, filegroup_str="", location_and_dirs:list[tuple[str, str]]=[], auto_group_filetypes=[]):
+ """
+ create the database 'name' with the tables defined in database_tables
+ """
+ print(f"creating database: '{name}'")
+ conn = sql.connect(f"{name}")
+ cursor = conn.cursor()
+ for table in database_tables.values():
+ cursor.execute(table.create_sql_str())
+ filegroup_str = filegroup_str.strip("; ") + ";" + get_auto_filegroup_str(location_and_dirs, auto_group_filetypes)
+ create_filegroups(cursor, filegroup_str)
+ conn.commit()
+ conn.close()
+
+
+if __name__ == '__main__':
+ create_db("test.db")
diff --git a/regina/db_operation/visualize.py b/regina/db_operation/visualize.py
new file mode 100644
index 0000000..262486a
--- /dev/null
+++ b/regina/db_operation/visualize.py
@@ -0,0 +1,591 @@
+# from sys import path
+# print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}")
+import sqlite3 as sql
+from sys import exit
+from re import fullmatch
+import matplotlib.pyplot as plt
+from os.path import isdir
+from datetime import datetime as dt
+# local
+from db_operation.database import t_request, t_user, t_file, t_filegroup
+from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where
+from utility.utility import pdebug, warning, missing_arg
+from utility.globals import settings
+
+
+"""
+visualize information from the database
+TODO:
+- merge similar referrers, e.g. www.google.de and https://google.com
+- ignore 404
+"""
+
+palette = {
+ "red": "#ee4035",
+ "orange": "#f37736",
+ "yellow": "#fdf458",
+ "green": "#7bc043",
+ "blue": "#0392cf",
+ "purple": "#b044a0",
+}
+color_settings_filetypes = {
+ palette["red"]: ["html"],
+ palette["green"]: ["jpg", "png", "jpeg", "gif", "svg", "webp"],
+ palette["yellow"]: ["css"],
+ "grey": ["txt"]
+}
+color_settings_alternate = list(palette.values())
+
+color_settings_browsers = {
+ palette["red"]: ["Safari"],
+ palette["orange"]: ["Firefox"],
+ palette["yellow"]: ["Chrome"],
+ "grey": ["Edge"],
+ palette["green"]: ["Chromium"],
+ palette["purple"]: ["Brave"]
+}
+color_settings_operating_systems = {
+ palette["red"]: ["Mac"],
+ palette["green"]: ["Android"],
+ "grey": ["iPhone", "iPad"],
+ palette["yellow"]: ["Linux"],
+ palette["purple"]: ["BSD"],
+ palette["blue"]: ["Windows"],
+}
+
+
+def len_list_list(l: list[list]):
+ size = 0
+ for i in range(len(l)):
+ size += len(l[i])
+ return size
+
+def valid_status(status: int):
+ if status >= 400: return False
+ if settings["status_300_is_success"] and status >= 300: return True
+ return status < 300
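+# e.g. valid_status(200) -> True, valid_status(404) -> False,
+# valid_status(304) -> True only if settings["status_300_is_success"] is set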
+
+#
+# FILTERS
+#
+def get_os_browser_mobile_rankings(cur: sql.Cursor, user_ids: list[int]):
+ """
+ returns [(count, operating_system)], [(count, browser)], mobile_user_percentage
+ """
+ os_ranking = {}
+ os_count = 0.0
+ browser_ranking = {}
+ browser_count = 0.0
+ mobile_ranking = { True: 0.0, False: 0.0 }
+ for user_id in user_ids:
+ cur.execute(f"SELECT platform,browser,mobile FROM {t_user} WHERE user_id = {user_id}")
+ os, browser, mobile = cur.fetchone()
+ mobile = bool(mobile)
+ if os:
+ if os in os_ranking: os_ranking[os] += 1
+ else: os_ranking[os] = 1
+ os_count += 1
+ if browser:
+ if browser in browser_ranking: browser_ranking[browser] += 1
+ else: browser_ranking[browser] = 1
+ browser_count += 1
+ if (os or browser):
+ mobile_ranking[mobile] += 1
+ try:
+ mobile_user_percentage = mobile_ranking[True] / (mobile_ranking[True] + mobile_ranking[False])
+ except ZeroDivisionError:
+ mobile_user_percentage = 0.0
+
+ os_ranking = [(c * 100/os_count, n) for n, c in os_ranking.items()]
+ os_ranking.sort()
+ browser_ranking = [(c * 100/browser_count, n) for n, c in browser_ranking.items()]
+ browser_ranking.sort()
+ return os_ranking, browser_ranking, mobile_user_percentage*100
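+# example return value: ([(25.0, "Linux"), (75.0, "Windows")], [(100.0, "Firefox")], 25.0)
+# both rankings hold (percentage, name) tuples sorted in ascending order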
+
+#
+# GETTERS
+#
+def get_where_date_str(at_date=None, min_date=None, max_date=None):
+ # dates in unix time
+ s = ""
+ if at_date is not None:
+ if isinstance(at_date, str):
+ s += f"DATE(date, 'unixepoch') = '{sanitize(at_date)}' AND "
+ elif isinstance(at_date, int|float):
+ s += f"date = {int(at_date)} AND "
+ else:
+ print(f"WARNING: get_where_date_str: Invalid type of argument at_date: {type(at_date)}")
+ if min_date is not None:
+ if isinstance(min_date, str):
+ s += f"DATE(date, 'unixepoch') >= '{sanitize(min_date)}' AND "
+ elif isinstance(min_date, int|float):
+ s += f"date >= {int(min_date)} AND "
+ else:
+ print(f"WARNING: get_where_date_str: Invalid type of argument min_date: {type(min_date)}")
+ if max_date is not None:
+ if isinstance(max_date, str):
+ s += f"DATE(date, 'unixepoch') <= '{sanitize(max_date)}' AND "
+ elif isinstance(max_date, int|float):
+ s += f"date <= {int(max_date)} AND "
+ else:
+ print(f"WARNING: get_where_date_str: Invalid type of argument max_date: {type(max_date)}")
+ if s == "":
+ print(f"WARNING: get_where_date_str: no date_str generated. Returing 'date > 0'. at_date={at_date}, min_date={min_date}, max_date={max_date}")
+ return "date > 0"
+ return s.removesuffix(" AND ")
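+# e.g. get_where_date_str(at_date="2022-11-20") -> "DATE(date, 'unixepoch') = '2022-11-20'"
+# and get_where_date_str(min_date=0) -> "date >= 0"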
+
+
+# get the earliest date
+def get_earliest_date(cur: sql.Cursor) -> int:
+ """return the earliest time as unixepoch"""
+ cur.execute(f"SELECT MIN(date) FROM {t_request}")
+ return cur.fetchone()[0]
+# get the latest date
+def get_latest_date(cur: sql.Cursor) -> int:
+ """return the latest time as unixepoch"""
+ cur.execute(f"SELECT MAX(date) FROM {t_request}")
+ return cur.fetchone()[0]
+# get all dates
+# the date:str parameter in all these function must be a sqlite constraint
+def get_days(cur: sql.Cursor, date:str) -> list[str]:
+ """get a list of all dates in yyyy-mm-dd format"""
+ cur.execute(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}")
+ days = [ date[0] for date in cur.fetchall() ] # fetchall returns tuples (date, )
+ days.sort()
+ return days
+
+def get_months(cur: sql.Cursor, date:str) -> list[str]:
+ """get a list of all dates in yyyy-mm format"""
+ cur.execute(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}")
+ dates = get_days(cur, date)
+ date_dict = {}
+ for date in dates:
+ date_without_day = date[0:date.rfind('-')]
+ date_dict[date_without_day] = 0
+ return list(date_dict.keys())
+
+
+def get_user_agent(cur: sql.Cursor, user_id: int):
+ return sql_select(cur, t_user, [("user_id", user_id)])[0][2]
+
+def get_unique_user_ids_for_date(cur: sql.Cursor, date:str) -> list[int]:
+ cur.execute(f"SELECT DISTINCT user_id FROM {t_request} WHERE {date}")
+ return [ user_id[0] for user_id in cur.fetchall() ]
+
+def get_human_users(cur: sql.Cursor, unique_user_ids, unique_user_ids_human: list):
+ """
+ append all user_ids from unique_user_ids that are marked as human (is_human != 0) to unique_user_ids_human
+ """
+ for user_id in unique_user_ids:
+ cur.execute(f"SELECT is_human FROM {t_user} WHERE user_id = {user_id}")
+ # if the user is not human
+ if cur.fetchone()[0] == 0:
+ # pdebug(f"get_human_users: {user_id}, is_human is 0")
+ continue
+ else:
+ # pdebug(f"get_human_users: {user_id}, is_human is non-zero")
+ pass
+
+ # user is human
+ unique_user_ids_human.append(user_id)
+ # pdebug("get_human_users: (2)", unique_user_ids_human)
+
+def get_unique_request_ids_for_date(cur: sql.Cursor, date:str):
+ cur.execute(f"SELECT DISTINCT request_id FROM {t_request} WHERE {date}")
+ return [ request_id[0] for request_id in cur.fetchall()]
+
+def get_unique_request_ids_for_date_and_user(cur: sql.Cursor, date:str, user_id: int, unique_request_ids_human: list):
+ cur.execute(f"SELECT DISTINCT request_id FROM {t_request} WHERE {date} AND user_id = {user_id}")
+ # all unique requests for user_id
+ for request_id in cur.fetchall():
+ unique_request_ids_human.append(request_id[0])
+
+# get number of requests per day
+def get_request_count_for_date(cur: sql.Cursor, date:str) -> int:
+ cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE {date}")
+ return cur.fetchone()[0]
+
+def get_unique_user_count(cur: sql.Cursor) -> int:
+ return sql_tablesize(cur, t_user)
+
+
+
+#
+# RANKINGS
+#
+def get_file_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]:
+ """
+ :returns [(request_count, groupname)]
+ """
+ global settings
+ ranking = []
+ cur.execute(f"SELECT group_id, groupname FROM {t_filegroup}")
+ for group in cur.fetchall():
+ group_id = group[0]
+ # filename = sql_select(cur, t_file, [("group_id", group)])
+ # if len(filename) == 0: continue
+ # filename = filename[0][0]
+ filename = group[1]
+ if settings["file_ranking_regex_whitelist"]: # if file in whitelist
+ if not fullmatch(settings["file_ranking_regex_whitelist"], filename):
+ pdebug(f"get_file_ranking: file with group_id {group_id} is not in whitelist")
+ continue
+ if settings["file_ranking_ignore_error_files"]: # if request to file was successful
+ success = False
+ cur.execute(f"SELECT status FROM {t_request} WHERE group_id = {group_id}")
+ for status in cur.fetchall():
+ if valid_status(status[0]):
+ pdebug(f"get_file_ranking: success code {status[0]} for file with group_id {group_id} and groupname {filename}")
+ success = True
+ break
+ if not success:
+ pdebug(f"get_file_ranking: file with group_id {group_id} and groupname {filename} has only requests resulting in error")
+ continue
+
+
+ # ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename))
+ cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE group_id = {group_id} AND {date}")
+ ranking.append((cur.fetchone()[0], filename))
+ ranking.sort()
+ # print(ranking)
+ return ranking
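+# example return value: [(1, "error.html"), (12, "css"), (30, "home")], sorted ascending by request count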
+
+def get_user_agent_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]:
+ """
+ :returns [(request_count, user_agent)]
+ """
+ ranking = []
+ cur.execute(f"SELECT DISTINCT user_id FROM {t_request} WHERE {date}")
+ for user_id in cur.fetchall():
+ user_id = user_id[0]
+ user_agent = sql_select(cur, t_user, [("user_id", user_id)])
+ if len(user_agent) == 0: continue
+ user_agent = user_agent[0][2]
+ if settings["user_agent_ranking_regex_whitelist"]:
+ if not fullmatch(settings["user_agent_ranking_regex_whitelist"], user_agent):
+ continue
+ # ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename))
+ cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE user_id = {user_id} AND {date}")
+ ranking.append((cur.fetchone()[0], user_agent))
+ ranking.sort()
+ # print(ranking)
+ return ranking
+
+def get_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Cursor, date:str) -> list[tuple[int, str]]:
+ """
+ 1) get all the distinct entries for field_name matching the date constraint
+ 2) for every entry, get the count in table matching the date constraint
+ 3) sort by count in ascending order
+ :returns [(request_count, name)]
+ """
+ ranking = []
+ cur.execute(f"SELECT DISTINCT {field_name} FROM {table} WHERE {date}")
+ for name in cur.fetchall():
+ name = name[0]
+ if whitelist_regex:
+ if not fullmatch(whitelist_regex, name):
+ continue
+ # ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename))
+ cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {field_name} = '{name}' AND {date}")
+ ranking.append((cur.fetchone()[0], name))
+ ranking.sort()
+ # print(ranking)
+ return ranking
+
+
+#
+# PLOTTING
+#
+# add value labels
+def add_vertical_labels_in_bar_plot(labels, max_y_val, ax, bar_plot):
+ # pdebug("add_vertical_labels_in_bar_plot:", labels)
+ for idx,rect in enumerate(bar_plot):
+ height = rect.get_height()
+ if height > 0.6 * max_y_val: # if the bar is large, put label in the bar
+ height = 0.05 * max_y_val
+ ax.text(rect.get_x() + rect.get_width()/2., height + 0.025 * max_y_val,
+ labels[idx],
+ ha='center', va='bottom', rotation=90)
+# add count labels
+def add_labels_at_top_of_bar(xdata, ydata, max_y_val, ax, bar_plot):
+ # pdebug("add_labels_at_top_of_bar:", xdata, ydata)
+ y_offset = 0.05 * max_y_val
+ for idx,rect in enumerate(bar_plot):
+ ax.text(rect.get_x() + rect.get_width()/2, ydata[idx] - y_offset, round(ydata[idx], 1), ha='center', bbox=dict(facecolor='white', alpha=0.8))
+
+def plot_ranking(ranking: list[tuple[int, str]], fig=None, xlabel="", ylabel="", color_settings:dict|list=[]):
+ """
+ make a bar plot of the most requested files
+ """
+ # pdebug(f"plot_ranking: ranking={ranking}")
+ if not fig:
+ fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
+ # create new axis if none is given
+ ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel)
+ # fill x y data
+ if len(ranking) > settings["file_ranking_plot_max_files"]:
+ start_index = len(ranking) - settings["file_ranking_plot_max_files"]
+ else: start_index = 0
+ x_names = []
+ y_counts = []
+ colors = []
+ for i in range(start_index, len(ranking)):
+ x_names.append(ranking[i][1])
+ y_counts.append(ranking[i][0])
+ ft = ranking[i][1].split(".")[-1]
+ color = palette["blue"]
+ # if not color_settings: color = palette["blue"]
+ if isinstance(color_settings, dict):
+ for key, val in color_settings.items():
+ if ft in val: color = key
+ if not color: color = palette["blue"]
+ elif isinstance(color_settings, list):
+ # print(color_settings, (i - start_index) % len(color_settings))
+ color = color_settings[(i - start_index) % len(color_settings)]
+ colors.append(color)
+ bar = ax.bar(x_names, y_counts, tick_label="", color=colors)
+
+ if len(y_counts) > 0:
+ add_vertical_labels_in_bar_plot(x_names, y_counts[-1], ax, bar)
+ if settings["plot_add_count_label"]: add_labels_at_top_of_bar(x_names, y_counts, y_counts[-1], ax, bar)
+ # ax.ylabel(y_counts)
+ return fig
+
+
+def plot(xdata, ydata, fig=None, ax=None, xlabel="", ylabel="", label="", linestyle='-', marker="", color="blue"):
+ if not fig:
+ fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
+ if not ax:
+ ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel)
+ else:
+ ax = ax.twinx()
+ ax.set_ylabel(ylabel)
+ # ax.tick_params(axis="y", labelcolor="r")
+ ax.plot(xdata, ydata, marker=marker, label=label, linestyle=linestyle, color=color)
+ if label: ax.legend()
+ # if xlim:
+ # if xlim[0] != xlim[1]:
+ # ax.set_xlim(*xlim)
+
+ # if ylim:
+ # if ylim[0] != ylim[1]:
+ # ax.set_ylim(*ylim)
+ return fig, ax
+
+def plot2y(xdata, ydata1, ydata2, fig=None, ax1=None, ax2=None, plots=None, xlabel="", ylabel1="", ylabel2="", label1="", label2="", linestyle='-', marker="", color1="blue", color2="orange", grid="major"):
+ if not fig:
+ fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
+ if not (ax1 and ax2):
+ ax1 = fig.add_subplot(xlabel=xlabel, ylabel=ylabel1)
+ ax2 = ax1.twinx()
+ ax2.set_ylabel(ylabel2)
+ # ax.tick_params(axis="y", labelcolor="r")
+ plot1 = ax1.plot(xdata, ydata1, marker=marker, label=label1, linestyle=linestyle, color=color1)
+ plot2 = ax2.plot(xdata, ydata2, marker=marker, label=label2, linestyle=linestyle, color=color2)
+ # if label1 or label2: ax1.legend()
+ if plots: plots += plot1 + plot2
+ else: plots = plot1 + plot2
+ plt.legend(plots, [ l.get_label() for l in plots])
+
+ if grid == "major" or grid == "minor" or grid == "both":
+ if grid == "minor" or "both":
+ ax1.minorticks_on()
+ ax1.grid(visible=True, which=grid, linestyle="-", color="#888")
+
+ # if xlim:
+ # if xlim[0] != xlim[1]:
+ # ax.set_xlim(*xlim)
+
+ # if ylim:
+ # if ylim[0] != ylim[1]:
+ # ax.set_ylim(*ylim)
+ return fig, ax1, ax2, plots
+
+
+#
+# MAIN
+#
+
+def visualize(loaded_settings: dict):
+ pdebug("visualizing...")
+ global settings
+ settings = loaded_settings
+ if not settings["db"]: missing_arg("db")
+ if not settings["server_name"]: missing_arg("server_name")
+
+ img_dir = settings["img_dir"]
+ img_filetype = settings["img_filetype"]
+ img_location = settings["img_location"]
+ names = {
+ # paths
+ "img_file_ranking_last_x_days": f"ranking_all_time_files_last_x_days.{img_filetype}",
+ "img_referer_ranking_last_x_days": f"ranking_all_time_referers_last_x_days.{img_filetype}",
+ "img_browser_ranking_last_x_days": f"ranking_all_time_browsers_last_x_days.{img_filetype}",
+ "img_operating_system_ranking_last_x_days": f"ranking_all_time_operating_systems_last_x_days.{img_filetype}",
+ "img_users_and_requests_last_x_days": f"user_request_count_daily_last_x_days.{img_filetype}",
+
+ "img_file_ranking_total": f"ranking_all_time_files_total.{img_filetype}",
+ "img_referer_ranking_total": f"ranking_all_time_referers_total.{img_filetype}",
+ "img_browser_ranking_total": f"ranking_all_time_browsers_total.{img_filetype}",
+ "img_operating_system_ranking_total": f"ranking_all_time_operating_systems_total.{img_filetype}",
+ "img_users_and_requests_total": f"user_request_count_daily_total.{img_filetype}",
+ # values
+ "mobile_user_percentage_total": 0.0,
+ "mobile_user_percentage_last_x_days": 0.0,
+ "user_count_last_x_days": 0,
+ "user_count_total": 0,
+ "request_count_last_x_days": 0,
+ "request_count_total": 0,
+ "human_user_percentage_last_x_days": 0.0,
+ "human_user_percentage_total": 0.0,
+ "human_request_percentage_last_x_days": 0.0,
+ "human_request_percentage_total": 0.0,
+ # general
+ "regina_version": settings["version"],
+ "server_name": settings["server_name"],
+ "last_x_days": settings["last_x_days"], # must be after all the things with last_x_days!
+ "earliest_date": "1990-1-1",
+ "generation_date": "1990-1-1 0:0:0",
+ }
+
+ conn = sql.connect(settings["db"])
+ if isdir(img_dir) and img_filetype:
+ gen_img = True
+ else:
+ print(f"Warning: Not generating images since at least one required variable is invalid: img_dir='{img_dir}', img_filetype='{img_filetype}'")
+ gen_img = False
+ cur = conn.cursor()
+
+ get_humans = settings["get_human_percentage"]
+ # pdebug(f"visualize: settings {settings}")
+ # DATE STRINGS
+ names["earliest_date"] = dt.fromtimestamp(get_earliest_date(cur)).strftime("%Y-%m-%d")
+ names["generation_date"] = dt.now().strftime("%Y-%m-%d %H:%M:%S")
+ # LAST_X_DAYS
+ # last_x_days_min_date: latest_date - last_x_days
+ secs_per_day = 86400
+ last_x_days_min_date = get_latest_date(cur) - settings["last_x_days"] * secs_per_day
+ last_x_days_str = get_where_date_str(min_date=last_x_days_min_date)
+ days = get_days(cur, last_x_days_str)
+ days_strs = [get_where_date_str(at_date=day) for day in days]
+
+
+ # ALL DATES
+ all_time_str = get_where_date_str(min_date=0)
+ # all months in yyyy-mm format
+ months_all_time = get_months(cur, all_time_str)
+ # sqlite date constraint for each month
+ months_strs = []
+ for year_month in months_all_time:
+ year, month = year_month.split("-")
+ # first day of the month
+ min_date = dt(int(year), int(month), 1).timestamp()
+ month = (int(month) % 12) + 1 # + 1 month
+ year = int(year)
+ if month == 1: year += 1
+ # first day of the next month - 1 sec
+ max_date = dt(year, month, 1).timestamp() - 1
+ months_strs.append(get_where_date_str(min_date=min_date, max_date=max_date))
+
+ for i in range(2):
+ suffix = ["_total", "_last_x_days"][i]
+ date_str = [all_time_str, last_x_days_str][i]
+ date_names = [months_all_time, days][i]
+ date_strs = [months_strs, days_strs][i]
+ assert(len(date_names) == len(date_strs))
+
+ # FILES
+ file_ranking = get_file_ranking(cur, date_str)
+ if gen_img:
+ fig_file_ranking = plot_ranking(file_ranking, xlabel="Filename/Filegroup", ylabel="Number of requests", color_settings=color_settings_filetypes)
+ fig_file_ranking.savefig(f"{img_dir}/{names[f'img_file_ranking{suffix}']}")
+
+ # REFERER
+ referer_ranking = get_ranking("referer", t_request, settings["referer_ranking_regex_whitelist"], cur, date_str)
+ if gen_img:
+ fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate)
+ fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_referer_ranking{suffix}']}")
+
+ # USER
+ # user_agent_ranking = get_user_agent_ranking(cur, date_str)
+ # for the time span
+ unique_user_ids = get_unique_user_ids_for_date(cur, date_str)
+ unique_user_ids_human = []
+ get_human_users(cur, unique_user_ids, unique_user_ids_human)
+ # for each date
+ date_count = len(date_strs)
+ unique_user_ids_dates: list[list[int]] = []
+ unique_request_ids_dates: list[list[int]] = []
+ unique_user_ids_human_dates: list[list[int]] = [[] for i in range(date_count)]
+ unique_request_ids_human_dates: list[list[int]] = [[] for i in range(date_count)]
+ for i in range(date_count):
+ date_str_ = date_strs[i]
+ unique_user_ids_dates.append(get_unique_user_ids_for_date(cur, date_str_))
+ unique_request_ids_dates.append(get_unique_request_ids_for_date(cur, date_str_))
+ if get_humans:
+ # empty_list = []
+ # unique_user_ids_human_dates.append(empty_list)
+ get_human_users(cur, unique_user_ids_dates[i], unique_user_ids_human_dates[i])
+ # unique_request_ids_human_dates.append(list())
+ for human in unique_user_ids_human_dates[i]:
+ get_unique_request_ids_for_date_and_user(cur, date_str_, human, unique_request_ids_human_dates[i])
+ # print("\n\tuu", unique_user_ids_dates, "\n\tur",unique_request_ids_dates, "\n\tuuh", unique_user_ids_human_dates, "\n\turh", unique_request_ids_human_dates)
+ # pdebug("uui", unique_user_ids)
+ # pdebug("uuih", unique_user_ids_human)
+ # pdebug("uuid", unique_user_ids_dates)
+ # pdebug("uuidh", unique_user_ids_human_dates)
+ # pdebug("urid", unique_request_ids_dates)
+ # pdebug("uridh", unique_user_ids_human_dates)
+ # pdebug(f"human_user_precentage: len_list_list(user_ids)={len_list_list(unique_user_ids_dates)}, len_list_list(user_ids_human)={len_list_list(unique_user_ids_human_dates)}")
+ if get_humans:
+ try:
+ names[f"human_user_percentage{suffix}"] = round(100 * len_list_list(unique_user_ids_human_dates) / len_list_list(unique_user_ids_dates), 2)
+ except ZeroDivisionError:
+ names[f"human_user_percentage{suffix}"] = -1.0
+ try:
+ names[f"human_request_percentage{suffix}"] = round(100 * len_list_list(unique_request_ids_human_dates) / len_list_list(unique_request_ids_dates), 2)
+ except ZeroDivisionError:
+ names[f"human_request_percentage{suffix}"] = -1.0
+ names[f"user_count{suffix}"] = len_list_list(unique_user_ids_dates)
+ names[f"request_count{suffix}"] = len_list_list(unique_request_ids_dates)
+ if gen_img:
+ fig_daily, ax1, ax2, plots = plot2y(date_names, [len(user_ids) for user_ids in unique_user_ids_dates], [len(request_ids) for request_ids in unique_request_ids_dates], xlabel="Date", ylabel1="User count", label1="Unique users", ylabel2="Request count", label2="Unique requests", color1=palette["red"], color2=palette["blue"])
+ if get_humans:
+ fig_daily, ax1, ax2, plots = plot2y(date_names, [len(user_ids) for user_ids in unique_user_ids_human_dates], [len(request_ids) for request_ids in unique_request_ids_human_dates], label1="Unique users (human)", label2="Unique requests (human)", color1=palette["orange"], color2=palette["green"], fig=fig_daily, ax1=ax1, ax2=ax2, plots=plots)
+ fig_daily.savefig(f"{img_dir}/{names[f'img_users_and_requests{suffix}']}")
+
+ # os & browser
+ os_ranking, browser_ranking, names[f"mobile_user_percentage{suffix}"] = get_os_browser_mobile_rankings(cur, unique_user_ids_human)
+ if gen_img:
+ fig_os_rating = plot_ranking(os_ranking, xlabel="Platform", ylabel="Share [%]", color_settings=color_settings_operating_systems)
+ fig_os_rating.savefig(f"{img_dir}/{names[f'img_operating_system_ranking{suffix}']}")
+ fig_browser_rating = plot_ranking(browser_ranking, xlabel="Browsers", ylabel="Share [%]", color_settings=color_settings_browsers)
+ fig_browser_rating.savefig(f"{img_dir}/{names[f'img_browser_ranking{suffix}']}")
+
+ # print("File Ranking", file_ranking)
+ # print("referer Ranking", referer_ranking)
+ # print("user agent ranking", user_agent_ranking)
+ # print("Unique Users:", get_unique_user_count(cur))
+ # fig_daily, ax_daily_users = plot(dates, [len(user_ids) for user_ids in unique_user_ids_for_dates], xlabel="Datum", ylabel="Einzigartige Nutzer", label="Einzigartige Nutzer", color="blue")
+ # fig_daily, ax_daily_requests = plot(dates, [len(request_ids) for request_ids in unique_request_ids_for_dates], fig=fig_daily, ax=ax_daily_users, xlabel="Datum", ylabel="Einzigartige Anfragen", label="Einzigartige Anfragen", color="orange")
+ # fig_daily.savefig(f"{img_dir}/daily.{img_filetype}")
+ # print("OS ranking", os_ranking)
+ # print("Browser ranking", browser_ranking)
+ # print("Mobile percentage", names["mobile_user_percentage"])
+ if settings["template_html"] and settings["html_out_path"]:
+ pdebug(f"visualize: writing to html: {settings['html_out_path']}")
+
+ with open(settings["template_html"], "r") as file:
+ html = file.read()
+ for name, value in names.items():
+ if "img" in name:
+ value = f"{img_location}/{value}"
+ html = html.replace(f"%{name}", str(value))
+ with open(settings["html_out_path"], "w") as file:
+ file.write(html)
+ else:
+ warning(f"Skipping html generation because either template_html or html_out_path is invalid: template_html='{settings['template_html']}', html_out_path='{settings['html_out_path']}'")
diff --git a/regina/main.py b/regina/main.py
new file mode 100644
index 0000000..810e5cd
--- /dev/null
+++ b/regina/main.py
@@ -0,0 +1,95 @@
+# from sys import path
+# print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}")
+# __package__="."
+from sys import argv, exit
+from os.path import isfile
+from db_operation.visualize import visualize
+from utility.settings_manager import read_settings_file
+from db_operation.collect import parse_log, add_requests_to_db
+from db_operation.database import create_db
+from utility.globals import settings, version
+
+"""
+start regina, launch either collect or visualize
+"""
+
+
+def help():
+ helpstring = """Command line options:
+ --server-name string
+ --log-file path to the access.log
+ --db name of the database
+ --settings["filegroups"] string describing settings["filegroups"], eg 'name1: file1, file2; name2: file3, file4, file5;'
+ --auto-group-filetypes comma separated list of filetypes, eg 'css,png,gif'
+ --locs-and-dirs comma separated list of nginx_location:directory pairs, eg '/:/www/website'
+ --config-file path to a config file that specifies all the other parameters: param = value, where value has the same formatting as on the command line
+ """
+ print(helpstring)
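+# example invocation (assuming the config file is called regina.conf):
+# python3 regina/main.py --config regina.conf --collect --visualize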
+
+def missing_arg_val(arg):
+ print("Missing argument for", arg)
+ exit(1)
+
+def missing_arg(arg):
+ print("Missing ", arg)
+ exit(1)
+
+def error(arg):
+ print("Error:", arg)
+ exit(1)
+
+def main():
+ config_file = ""
+ collect = False
+ visualize_ = False
+ log_file = ""
+ # parse args
+ i = 1
+ while i in range(1, len(argv)):
+ if argv[i] == "--config":
+ if len(argv) > i + 1: config_file = argv[i+1]
+ else: missing_arg_val(argv[i])
+ if argv[i] == "--log-file":
+ if len(argv) > i + 1: log_file = argv[i+1]
+ else: missing_arg_val(argv[i])
+ elif argv[i] == "--help":
+ help()
+ exit(0)
+ elif argv[i] == "--collect":
+ collect = True
+ elif argv[i] == "--visualize":
+ visualize_ = True
+ else:
+ pass
+ i += 1
+ if not collect and not visualize_:
+ missing_arg("--visualize or --collect")
+
+ if not config_file:
+ missing_arg("--config_file")
+ if not isfile(config_file):
+ error(f"Not a file: '{config_file}'")
+ read_settings_file(config_file, settings)
+ settings["version"] = version
+ if log_file: settings["access_log"] = log_file
+
+ print(f"regina version {version} with server-name '{settings['server_name']}' and database '{settings['db']}'")
+
+ if not settings["server_name"]: missing_arg("server-name")
+ if not settings["access_log"]: missing_arg("log")
+ if not settings["db"]: missing_arg("db")
+ if isinstance(settings["auto_group_filetypes"], str):
+ settings["auto_group_filetypes"] = settings["auto_group_filetypes"].split(",")
+ if isinstance(settings["locs_and_dirs"], str):
+ settings["locs_and_dirs"] = [ loc_and_dir.split(":") for loc_and_dir in settings["locs_and_dirs"].split(",") ]
+ if collect:
+ if not isfile(settings["db"]):
+ create_db(settings["db"], settings["filegroups"], settings["locs_and_dirs"], settings["auto_group_filetypes"])
+ requests = parse_log(settings["access_log"])
+ add_requests_to_db(requests, settings["db"])
+ if visualize_:
+ if not isfile(settings["db"]): error(f"Invalid database path: '{settings['db']}'")
+ visualize(settings)
+
+if __name__ == '__main__':
+ main()
diff --git a/regina/utility/__init__.py b/regina/utility/__init__.py
new file mode 100644
index 0000000..f36e6b3
--- /dev/null
+++ b/regina/utility/__init__.py
@@ -0,0 +1 @@
+"""Utility for regina"""
diff --git a/regina/utility/globals.py b/regina/utility/globals.py
new file mode 100644
index 0000000..14a2165
--- /dev/null
+++ b/regina/utility/globals.py
@@ -0,0 +1,40 @@
+"""global variables for regina"""
+
+version = "1.0"
+
+# default settings, these are overwriteable through a config file
+settings = {
+ # GENERAL
+ "server_name": "",
+ # DATA COLLECTION
+ "access_log": "",
+ "db": "",
+ "locs_and_dirs": [],
+ "auto_group_filetypes": [],
+ "filegroups": "",
+
+ # VISUALIZATION
+ "get_human_percentage": False,
+ "human_needs_success": True, # a human must have at least 1 successful request (status < 300)
+ "status_300_is_success": False, # 300 codes are success
+ # "file_ranking_regex_whitelist": r".*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))",
+ "file_ranking_regex_whitelist": r".*\.(html)",
+ "file_ranking_ignore_error_files": False, # skip files that only had unsuccessful requests (status < 300)
+ "referer_ranking_regex_whitelist": r"^[^\-].*", # minus means empty
+ "user_agent_ranking_regex_whitelist": r"",
+ "file_ranking_plot_max_files": 15,
+ # "plot_figsize": (60, 40),
+ "plot_dpi": 300,
+ "plot_add_count_label": True,
+ "img_dir": "",
+ "img_location": "",
+ "img_filetype": "svg",
+ "template_html": "",
+ "html_out_path": "",
+ "last_x_days": 30,
+}
+
+# these OSes and browsers can be detected:
+# elements earlier in the list take precedence when multiple match
+user_agent_operating_systems = ["Windows", "Android", "Linux", "iPhone", "iPad", "Mac", "BSD"]
+user_agent_browsers = ["Firefox", "DuckDuckGo", "SeaMonkey", "Vivaldi", "Yandex", "Brave", "SamsungBrowser", "Lynx", "Epiphany", "Chromium", "Chrome", "Safari", "Opera", "Edge"]
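+# e.g. Chrome user agents also contain "Safari", which is why "Chrome" has to come before "Safari"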
diff --git a/regina/utility/settings_manager.py b/regina/utility/settings_manager.py
new file mode 100644
index 0000000..cb821d6
--- /dev/null
+++ b/regina/utility/settings_manager.py
@@ -0,0 +1,33 @@
+
+def get_bool(bool_str: str, fallback=False):
+ if bool_str in ["true", "True"]: return True
+ elif bool_str in ["false", "False"]: return False
+ return fallback
+
+def read_settings_file(filepath: str, settings:dict, ignore_invalid_lines=True, allow_new_keys=False, convert_to_type=True):
+ lines = []
+ with open(filepath, "r") as file:
+ lines = file.readlines()
+
+ for i in range(len(lines)):
+ line = lines[i].strip("\n ")
+ if line.startswith("#") or not line: continue
+ vals = line.split("=", 1) # split only on the first '=' so values may contain '='
+ if not len(vals) == 2:
+ if ignore_invalid_lines: continue
+ else: raise KeyError(f"Invalid line: '{line}'")
+ vals[0] = vals[0].strip(" ")
+ if not allow_new_keys and vals[0] not in settings.keys():
+ if ignore_invalid_lines: continue
+ else: raise KeyError(f"Invalid key: '{vals[0]}'")
+ if convert_to_type and not isinstance(settings[vals[0]], str|list|None):
+ if isinstance(settings[vals[0]], bool):
+ settings[vals[0]] = get_bool(vals[1].strip(" "), fallback=settings[vals[0]])
+ continue
+ try:
+ settings[vals[0]] = type(settings[vals[0]])(vals[1].strip(" "))
+ except Exception as e:
+ if not ignore_invalid_lines: raise e
+ else: continue
+ else:
+ settings[vals[0]] = vals[1].strip(" ")
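+# usage sketch: with the defaults from utility.globals, a config line "plot_dpi = 100"
+# is converted with type(settings["plot_dpi"]), so settings["plot_dpi"] becomes the int 100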
diff --git a/regina/utility/sql_util.py b/regina/utility/sql_util.py
new file mode 100644
index 0000000..2e3f9a8
--- /dev/null
+++ b/regina/utility/sql_util.py
@@ -0,0 +1,40 @@
+"""Various sql utilities"""
+import sqlite3 as sql
+def sanitize(s):
+ if type(s) != str: return s
+ return s.replace("''", "'").replace("'", r"''").strip(" ")
+ # .replace('"', r'\"')
+
+def sql_get_constraint_str(constraints: list[tuple[str, str|int]], logic="AND") -> str:
+ c_str = ""
+ for name, val in constraints:
+ c_str += f"{name} = '{sanitize(val)}' {logic} "
+ return c_str.strip(logic + " ")
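+# e.g. sql_get_constraint_str([("user_id", 1), ("browser", "Firefox")])
+# -> "user_id = '1' AND browser = 'Firefox'"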
+
+def sql_get_value_str(values: list[list]) -> str:
+ c_str = ""
+ for params in values:
+ c_str += "("
+ for p in params: c_str += f"'{sanitize(p)}', "
+ c_str = c_str.strip(", ") + "), "
+ return c_str.strip(", ")
+
+def sql_exists(cur: sql.Cursor, table: str, constraints: list[tuple[str, str|int]], logic="AND") -> bool:
+ cur.execute(f"SELECT EXISTS (SELECT 1 FROM {table} WHERE {sql_get_constaint_str(constraints, logic)})")
+ return cur.fetchone()[0] == 1
+
+def sql_select(cur: sql.Cursor, table: str, constraints: list[tuple[str, str|int]], logic="AND"):
+ cur.execute(f"SELECT * FROM {table} WHERE {sql_get_constaint_str(constraints, logic)}")
+ return cur.fetchall()
+
+def sql_insert(cur: sql.Cursor, table: str, values: list[list]):
+ cur.execute(f"INSERT INTO {table} VALUES {sql_get_value_str(values)}")
+
+def sql_tablesize(cur: sql.Cursor, table: str) -> int:
+ cur.execute(f"SELECT Count(*) FROM {table}")
+ return cur.fetchone()[0]
+
+def sql_get_count_where(cur: sql.Cursor, table, constraints) -> int:
+ cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {sql_get_constaint_str(constraints)}")
+ return cur.fetchone()[0]
diff --git a/regina/utility/utility.py b/regina/utility/utility.py
new file mode 100644
index 0000000..42a4299
--- /dev/null
+++ b/regina/utility/utility.py
@@ -0,0 +1,27 @@
+# from sys import path
+# print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}")
+"""
+Various utility functions
+"""
+from sys import exit
+
+DEBUG = False
+def pdebug(*args):
+ if DEBUG: print(*args)
+
+def warning(*w):
+ print("Warning:", *w)
+
+def error(*arg):
+ print("Error:", *arg)
+ exit(1)
+
+def missing_arg_val(arg):
+ print("Missing argument for", arg)
+ exit(1)
+
+def missing_arg(arg):
+ print("Missing ", arg)
+ exit(1)
+