diff --git a/regina/collect.py b/regina/collect.py deleted file mode 100644 index 26706db..0000000 --- a/regina/collect.py +++ /dev/null @@ -1,138 +0,0 @@ -import sqlite3 as sql -from re import match -from time import mktime -from datetime import datetime as dt -from .database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup -from .sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize - -""" -collect information from the access log and put it into the database -""" - -DEBUG = True -def pdebug(*args): - if DEBUG: print(*args) - -def warning(w): - print(w) - - -months = ["Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aut", "Sep", "Oct", "Nov", "Dez"] - -# these oses and browser can be detected: -# lower element takes precedence -user_agent_operating_systems = ["Windows", "Android", "Linux", "iPhone", "iPad", "Mac", "BSD"] -user_agent_browsers = ["Firefox", "DuckDuckGo", "SeaMonkey", "Vivaldi", "Yandex", "Brave", "SamsungBrowser", "Lynx", "Epiphany", "Chromium", "Chrome", "Safari", "Opera", "Edge"] - - -class Request: - def __init__(self, ip_address="", time_local="", request_type="", request_file="", request_protocol="", status="", bytes_sent="", referer="", user_agent=""): - self.ip_address = sanitize(ip_address) - self.time_local = 0 - #[20/Nov/2022:00:47:36 +0100] - m = match(r"\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\]", time_local) - if m: - g = m.groups() - try: - datetime_ = dt(int(g[2]), months.index(g[1])+1, int(g[0]), int(g[3]), int(g[4]), int(g[5])) - self.time_local = int(mktime(datetime_.timetuple())) - except Exception as e: - warning(f"Request:__init__: {e}") - else: - warning(f"Request:__init__: Could not match time: '{time_local}'") - - self.request_type = sanitize(request_type) - self.request_file = sanitize(request_file) - self.request_protocol = sanitize(request_protocol) - self.status = sanitize(status) - self.bytes_sent = sanitize(bytes_sent) - self.referer = sanitize(referer) - self.user_agent = sanitize(user_agent) - - def __repr__(self): - return f"{self.ip_address} - {self.time_local} - {self.request_file} - {self.user_agent} - {self.status}" - -re_remote_addr = r"[0-9a-fA-F.:]+" -re_remote_user = ".*" -re_time_local = r"\[.+\]" -re_request = r'"[^"]+"' -re_status = r'\d+' -re_body_bytes_sent = r'\d+' -re_http_referer = r'"([^"]*)"' -re_http_user_agent = r'"([^"]*)"' -re_log_format: str = f'({re_remote_addr}) - ({re_remote_user}) ({re_time_local}) ({re_request}) ({re_status}) ({re_body_bytes_sent}) {re_http_referer} {re_http_user_agent}' -def parse_log(logfile:str) -> list[Request]: - """ - create Request objects from each line in the logfile - """ - requests = [] - with open(logfile, "r") as file: - lines = file.readlines() - for line in lines: - m = match(re_log_format, line) - if m is None: - warning(f"parse_log: Unmatched line: '{line}'") - continue - # print(m.groups()) - g = m.groups() - request_ = m.groups()[3].split(" ") - if len(request_) != 3: - warning(f"parse_log: len('{m.groups()[3]}'.split(' ')) is {len(request_)} and not 3") - continue - requests.append(Request(ip_address=g[0], time_local=g[2], - request_type=request_[0], request_file=request_[1], request_protocol=request_[2], - status=g[4], bytes_sent=g[5], referer=g[6], user_agent=g[7])) - return requests - -def get_user_id(request: Request, cursor: sql.Cursor) -> int: - """ - get the user_id. Adds the user if not already existing - """ - # if user exists - if sql_exists(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)]): - user_id = sql_select(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)])[0][0] - else: # new user - # new user_id is number of elements - user_id: int = sql_tablesize(cursor, t_user) - pdebug("new user:", user_id, request.ip_address) - platform, browser, mobile = get_os_browser_pairs_from_agent(request.user_agent) - cursor.execute(f"INSERT INTO {t_user} (user_id, ip_address, user_agent, platform, browser, mobile) VALUES ({user_id}, '{request.ip_address}', '{request.user_agent}', '{platform}', '{browser}', '{int(mobile)}');") - return user_id - -# re_user_agent = r"(?: ?([\w\- ]+)(?:\/([\w.]+))?(?: \(([^()]*)\))?)" -# 1: platform, 2: version, 3: details -def get_os_browser_pairs_from_agent(user_agent): - # for groups in findall(re_user_agent, user_agent): - operating_system = "" - browser = "" - mobile = "Mobi" in user_agent - for os in user_agent_operating_systems: - if os in user_agent: - operating_system = os - break - for br in user_agent_browsers: - if br in user_agent: - browser = br - break - # if not operating_system or not browser: print(f"Warning: get_os_browser_pairs_from_agent: Could not find all information for agent '{user_agent}', found os: '{operating_system}' and browser: '{browser}'") - return operating_system, browser, mobile - - -def add_requests_to_db(requests: list[Request], db_name: str): - conn = sql.connect(db_name) - cursor = conn.cursor() - for i in range(len(requests)): - request = requests[i] - pdebug("add_requests_to_db:", i, "request:", request) - user_id = get_user_id(request, cursor) - conn.commit() - group_id: int = get_filegroup(request.request_file, cursor) - # check if request is unique - group_id_name = database_tables[t_filegroup].key.name - user_id_name = database_tables[t_user].key.name - if sql_exists(cursor, t_request, [(group_id_name, group_id), (user_id_name, user_id), ("date", request.time_local)]): - pdebug("request exists:", request) - else: - pdebug("new request:", request) - request_id = sql_tablesize(cursor, t_request) - sql_insert(cursor, t_request, [[request_id, user_id, group_id, request.time_local, request.referer, request.status]]) diff --git a/regina/database.py b/regina/database.py deleted file mode 100644 index 43d1481..0000000 --- a/regina/database.py +++ /dev/null @@ -1,144 +0,0 @@ -import sqlite3 as sql -from os import path, listdir -from .sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize - - -""" -create reginas database as shown in the uml diagram database.uxf -""" - -DEBUG = True -def pdebug(*args): - if DEBUG: print(*args) - -class Entry: - """ - represents an sql entry - type_ is INTEGER, TEXT, REAL... - """ - def __init__(self, name, type_) -> None: - self.name = name - self.type_ = type_ - def __repr__(self): - return f"[{self.name}] {self.type_}" - -class Table: - def __init__(self, name, key: Entry, entries: list[Entry]=[], constaints: list[str]=[]): - self.name = name - self.key = key - self.entries = entries - self.constaints = constaints - def create_sql_str(self): - return f"CREATE TABLE IF NOT EXISTS {self.name}\n({self})\n" - def __repr__(self): - s = f"{self.key} PRIMARY KEY" - for entry in self.entries: - s += f", {entry}" - for c in self.constaints: - s += f", {c}" - return s -t_request = "request" -t_file = "file" -t_filegroup = "filegroup" -t_user = "user" - -user_id = Entry("user_id", "INTEGER") -request_id = Entry("request_id", "INTEGER") -filegroup_id = Entry("group_id", "INTEGER") -ip_address_entry = Entry("ip_address", "TEXT") -filename_entry = Entry("filename", "TEXT") -database_tables = { - t_user: Table(t_user, user_id, [Entry("ip_address", "TEXT"), Entry("user_agent", "TEXT"), Entry("platform", "TEXT"), Entry("browser", "TEXT"), Entry("mobile", "INTEGER")], [f"UNIQUE({user_id.name})"]), - t_file: Table(t_file, filename_entry, [filegroup_id], [f"UNIQUE({filename_entry.name})"]), - t_filegroup: Table(t_filegroup, filegroup_id, [Entry("groupname", "TEXT")], [f"UNIQUE({filegroup_id.name})"]), - t_request: Table(t_request, request_id, [ - user_id, filegroup_id, Entry("date", "INTEGER"), Entry("referer", "TEXT"), Entry("status", "INTEGER") - ], ["UNIQUE(request_id)"]), -} - - - -def get_filegroup(filename: str, cursor: sql.Cursor) -> int: - """ - get the user_id. Adds the user if not already existing - """ - if sql_exists(cursor, t_file, [("filename", filename)]): - return sql_select(cursor, t_file, [("filename", filename)])[0][1] - else: - group_id = sql_tablesize(cursor, t_filegroup) - # pdebug("new file(group):", group_id, filename) - # add group - sql_insert(cursor, t_filegroup, [[group_id, filename]]) - # add file - sql_insert(cursor, t_file, [[filename, group_id]]) - return group_id - -def create_filegroups(cursor: sql.Cursor, filegroup_str: str): - # filegroup_str: 'name1: file1, file2, file3; name2: file33' - groups = filegroup_str.strip(";").split(";") - pdebug("create_filegroups", groups) - for group in groups: - name, vals = group.split(":") - # create/get group - if sql_exists(cursor, t_filegroup, [("groupname", name)]): - group_id = sql_select(cursor, t_filegroup, [("groupname", name)])[0][0] - else: - group_id = sql_tablesize(cursor, t_filegroup) - sql_insert(cursor, t_filegroup, [(group_id, name)]) - # pdebug("create_filegroups: group_id", group_id) - # create/edit file - for filename in vals.split(","): - if sql_exists(cursor, t_file, [("filename", filename)]): # if exist, update - cursor.execute(f"UPDATE {t_file} SET group_id = {group_id} WHERE filename = '{filename}'") - else: - sql_insert(cursor, t_file, [[filename, group_id]]) - -def get_files_from_dir_rec(p: str, files: list[str]): - """recursivly append all files to files""" - pdebug("get_files_from_dir_rec:",p) - if path.isfile(p): - files.append(p) - elif path.isdir(p): - for p_ in listdir(p): - get_files_from_dir_rec(p + "/" + p_, files) - -def get_auto_filegroup_str(location_and_dirs:list[tuple[str, str]], auto_group_filetypes:list[str]) -> str: - """ - :param list of nginx locations and the corresponding directories - :param auto_filetype_groups list of filetypes for auto grouping - """ - files: list[str] = [] - start_i = 0 - for location, dir_ in location_and_dirs: - get_files_from_dir_rec(dir_, files) - # replace dir_ with location, eg /www/website with / - for i in range(start_i, len(files)): - files[i] = files[i].replace(dir_, location).replace("//", "/") - filegroups = "" - # create groups for each filetype - for ft in auto_group_filetypes: - filegroups += f"{ft}:" - for file in files: - if file.endswith(f".{ft}"): - filegroups += f"{file}," - filegroups = filegroups.strip(",") + ";" - pdebug("get_auto_filegroup_str: found files:", files, "filegroups_str:", filegroups) - return filegroups - -def create_db(name, filegroup_str="", location_and_dirs:list[tuple[str, str]]=[], auto_group_filetypes=[]): - """ - create the name with database_tables - """ - print(f"creating database: '{name}'") - conn = sql.connect(f"{name}") - cursor = conn.cursor() - for table in database_tables.values(): - cursor.execute(table.create_sql_str()) - filegroup_str = filegroup_str.strip("; ") + ";" + get_auto_filegroup_str(location_and_dirs, auto_group_filetypes) - create_filegroups(cursor, filegroup_str) - conn.commit() - conn.close() - - -if __name__ == '__main__': - create_db("test.db") diff --git a/regina/regina.py b/regina/regina.py deleted file mode 100644 index 8179357..0000000 --- a/regina/regina.py +++ /dev/null @@ -1,121 +0,0 @@ -from sys import argv, exit -from os.path import isfile -from .visualize import visualize -from .settings_manager import read_settings_file -from .collect import parse_log, add_requests_to_db -from .database import create_db - -""" -start regina, launch either collect or visualize -""" - -version = "1.0" - -# default settings, these are overwriteable through a config file -settings = { - # GENERAL - "server-name": "", - # DATA COLLECTION - "access-log": "", - "db": "", - "locs-and-dirs": [], - "auto-group-filetypes": [], - "filegroups": "", - - # VISUALIZATION - "get-human-percentage": False, - # "file_ranking_regex_whitelist": r".*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))", - "file_ranking_regex_whitelist": r".*\.(html)", - "referer_ranking_regex_whitelist": r"^[^\-].*", # minus means empty - "user_agent_ranking_regex_whitelist": r"", - "file_ranking_plot_max_files": 15, - # "plot_figsize": (60, 40), - "plot_dpi": 300, - "img_dir": "", - "img_location": "", - "img_filetype": "svg", - "template_html": "", - "html_out_path": "", - "last_x_days": 30, -} - - -def help(): - helpstring = """Command line options: - --server-name string - --log path to the access.log - --db name of the database - --settings["filegroups"] string describing settings["filegroups"], eg 'name1: file1, file2; name2: file3, file4, file5;' - --auto-group-filetypes comma separated list of filetypes, eg 'css,png,gif' - --locs-and_dirs comma separated list of nginx_location:directory pairs, eg '/:/www/website' - --config-file path to a config file that specifies all the other parameters: param = value, where value has the same formatting as on the command line - """ - print(helpstring) - -def missing_arg_val(arg): - print("Missing argument for", arg) - exit(1) - -def missing_arg(arg): - print("Missing ", arg) - exit(1) - -def error(arg): - print("Error:", arg) - exit(1) - -def main(): - config_file = "" - collect = False - visualize_ = False - log_file = "" - # parse args - i = 1 - while i in range(1, len(argv)): - if argv[i] == "--config": - if len(argv) > i + 1: config_file = argv[i+1] - else: missing_arg_val(argv[i]) - if argv[i] == "--log-file": - if len(argv) > i + 1: log_file = argv[i+1] - else: missing_arg_val(argv[i]) - elif argv[i] == "--help": - help() - exit(0) - elif argv[i] == "--collect": - collect = True - elif argv[i] == "--visualize": - visualize_ = True - else: - pass - i += 1 - if not collect and not visualize_: - missing_arg("--visualize or --collect") - - if not config_file: - missing_arg("--config_file") - if not isfile(config_file): - error(f"Not a file: '{config_file}'") - read_settings_file(config_file, settings) - settings["version"] = version - if log_file: settings["access-log"] = log_file - - print(f"regina version {version} with server-name '{settings['server-name']}' and database '{settings['db']}'") - - if not settings["server-name"]: missing_arg("server-name") - if not settings["access-log"]: missing_arg("log") - if not settings["db"]: missing_arg("db") - if isinstance(settings["auto-group-filetypes"], str): - settings["auto-group-filetypes"] = settings["auto-group-filetypes"].split(",") - if isinstance(settings["locs-and-dirs"], str): - settings["locs-and-dirs"] = [ loc_and_dir.split(":") for loc_and_dir in settings["locs-and-dirs"].split(",") ] - if collect: - if not isfile(settings["db"]): - create_db(settings["db"], settings["filegroups"], settings["locs-and-dirs"], settings["auto-group-filetypes"]) - requests = parse_log(settings["access-log"]) - add_requests_to_db(requests, settings["db"]) - if visualize_: - if not isfile(settings["db"]): error(f"Invalid database path: '{settings['db']}'") - visualize(settings) - -if __name__ == '__main__': - main() diff --git a/regina/settings_manager.py b/regina/settings_manager.py deleted file mode 100644 index cb821d6..0000000 --- a/regina/settings_manager.py +++ /dev/null @@ -1,33 +0,0 @@ - -def get_bool(bool_str: str, fallback=False): - if bool_str in ["true", "True"]: return True - elif bool_str in ["false", "False"]: return False - return fallback - -def read_settings_file(filepath: str, settings:dict, ignore_invalid_lines=True, allow_new_keys=False, convert_to_type=True): - lines = [] - with open(filepath, "r") as file: - lines = file.readlines() - - for i in range(len(lines)): - line = lines[i].strip("\n ") - if line.startswith("#"): continue - vals = line.split("=") - if not len(vals) == 2: - if ignore_invalid_lines: continue - else: raise KeyError(f"Invalid line: '{line}'") - vals[0] = vals[0].strip(" ") - if not allow_new_keys and vals[0] not in settings.keys(): - if ignore_invalid_lines: continue - else: raise KeyError(f"Invalid key: '{vals[0]}'") - if convert_to_type and not isinstance(settings[vals[0]], str|list|None): - if isinstance(settings[vals[0]], bool): - settings[vals[0]] = get_bool(vals[1].strip(" "), fallback=settings[vals[0]]) - continue - try: - settings[vals[0]] = type(settings[vals[0]])(vals[1].strip(" ")) - except Exception as e: - if not ignore_invalid_lines: raise e - else: continue - else: - settings[vals[0]] = vals[1].strip(" ") diff --git a/regina/sql_util.py b/regina/sql_util.py deleted file mode 100644 index 2e3f9a8..0000000 --- a/regina/sql_util.py +++ /dev/null @@ -1,40 +0,0 @@ -import sqlite3 as sql -"""Various utilities""" -def sanitize(s): - if type(s) != str: return s - return s\ - .replace("''", "'").replace("'", r"''").strip(" ") - # .replace('"', r'\"')\ - -def sql_get_constaint_str(constraints: list[tuple[str, str|int]], logic="AND") -> str: - c_str = "" - for name, val in constraints: - c_str += f"{name} = '{sanitize(val)}' {logic} " - return c_str.strip(logic + " ") - -def sql_get_value_str(values: list[list]) -> str: - c_str = "" - for params in values: - c_str += "(" - for p in params: c_str += f"'{sanitize(p)}', " - c_str = c_str.strip(", ") + "), " - return c_str.strip(", ") - -def sql_exists(cur: sql.Cursor, table: str, constraints: list[tuple[str, str|int]], logic="AND") -> bool: - cur.execute(f"SELECT EXISTS (SELECT 1 FROM {table} WHERE {sql_get_constaint_str(constraints, logic)})") - return cur.fetchone()[0] == 1 - -def sql_select(cur: sql.Cursor, table: str, constraints: list[tuple[str, str|int]], logic="AND"): - cur.execute(f"SELECT * FROM {table} WHERE {sql_get_constaint_str(constraints, logic)}") - return cur.fetchall() - -def sql_insert(cur: sql.Cursor, table: str, values: list[list]): - cur.execute(f"INSERT INTO {table} VALUES {sql_get_value_str(values)}") - -def sql_tablesize(cur: sql.Cursor, table: str) -> int: - cur.execute(f"SELECT Count(*) FROM {table}") - return cur.fetchone()[0] - -def sql_get_count_where(cur: sql.Cursor, table, constraints) -> int: - cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {sql_get_constaint_str(constraints)}") - return cur.fetchone()[0] diff --git a/regina/visualize.py b/regina/visualize.py deleted file mode 100644 index 08586b3..0000000 --- a/regina/visualize.py +++ /dev/null @@ -1,523 +0,0 @@ -import sqlite3 as sql -from sys import exit -from re import fullmatch -import matplotlib.pyplot as plt -from os.path import isdir -from datetime import datetime as dt -from .database import t_request, t_user, t_file, t_filegroup -from .sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where -""" -visualize information from the databse -TODO: -- bei referrers ähnliche zusammenlegen, z.b. www.google.de und https://google.com -""" - -settings = {} - -palette = { - "red": "#ee4035", - "orange": "#f37736", - "yellow": "#fdf458", - "green": "#7bc043", - "blue": "#0392cf", - "purple": "#b044a0", -} -color_settings_filetypes = { - palette["red"]: ["html"], - palette["green"]: ["jpg", "png", "jpeg", "gif", "svg", "webp"], - palette["yellow"]: ["css"], - "grey": ["txt"] -} -color_settings_alternate = list(palette.values()) - -color_settings_browsers = { - palette["red"]: ["Safari"], - palette["orange"]: ["Firefox"], - palette["yellow"]: ["Chrome"], - "grey": ["Edge"], - palette["green"]: ["Chromium"], - palette["purple"]: ["Brave"] -} -color_settings_operating_systems = { - palette["red"]: ["Macintosh"], - palette["green"]: ["Android"], - "grey": ["iPhone", "iPad"], - palette["yellow"]: ["Linux"], - palette["purple"]: ["BSD"], - palette["blue"]: ["Windows"], -} - - -def len_list_list(l: list[list]): - size = 0 - for i in range(len(l)): - size += len(l[i]) - return size - - -# -# FILTERS -# -def get_os_browser_mobile_rankings(cur: sql.Cursor, user_ids: list[int]): - """ - returns [(count, operating_system)], [(count, browser)], mobile_user_percentage - """ - os_ranking = {} - os_count = 0.0 - browser_ranking = {} - browser_count = 0.0 - mobile_ranking = { True: 0.0, False: 0.0 } - for user_id in user_ids: - cur.execute(f"SELECT platform,browser,mobile FROM {t_user} WHERE user_id = {user_id}") - os, browser, mobile = cur.fetchone() - mobile = bool(mobile) - if os: - if os in os_ranking: os_ranking[os] += 1 - else: os_ranking[os] = 1 - os_count += 1 - if browser: - if browser in browser_ranking: browser_ranking[browser] += 1 - else: browser_ranking[browser] = 1 - browser_count += 1 - if (os or browser): - mobile_ranking[mobile] += 1 - try: - mobile_user_percentage = mobile_ranking[True] / (mobile_ranking[True] + mobile_ranking[False]) - except ZeroDivisionError: - mobile_user_percentage = 0.0 - - os_ranking = [(c * 100/os_count, n) for n, c in os_ranking.items()] - os_ranking.sort() - browser_ranking = [(c * 100/browser_count, n) for n, c in browser_ranking.items()] - browser_ranking.sort() - return os_ranking, browser_ranking, mobile_user_percentage*100 - -# -# GETTERS -# -def get_where_date_str(at_date=None, min_date=None, max_date=None): - # dates in unix time - s = "" - if at_date is not None: - if isinstance(at_date, str): - s += f"DATE(date, 'unixepoch') = '{sanitize(at_date)}' AND " - elif isinstance(at_date, int|float): - s += f"date = {int(at_date)} AND " - else: - print(f"WARNING: get_where_date_str: Invalid type of argument at_date: {type(at_date)}") - if min_date is not None: - if isinstance(min_date, str): - s += f"DATE(date, 'unixepoch') >= '{sanitize(min_date)}' AND " - elif isinstance(min_date, int|float): - s += f"date >= {int(min_date)} AND " - else: - print(f"WARNING: get_where_date_str: Invalid type of argument min_date: {type(min_date)}") - if max_date is not None: - if isinstance(max_date, str): - s += f"DATE(date, 'unixepoch') <= '{sanitize(max_date)}' AND " - elif isinstance(max_date, int|float): - s += f"date <= {int(max_date)} AND " - else: - print(f"WARNING: get_where_date_str: Invalid type of argument max_date: {type(max_date)}") - if s == "": - print(f"WARNING: get_where_date_str: no date_str generated. Returing 'date > 0'. at_date={at_date}, min_date={min_date}, max_date={max_date}") - return "date > 0" - return s.removesuffix(" AND ") - - -# get the earliest date -def get_earliest_date(cur: sql.Cursor) -> int: - """return the earliest time as unixepoch""" - cur.execute(f"SELECT MIN(date) FROM {t_request}") - return cur.fetchone()[0] -# get the latest date -def get_latest_date(cur: sql.Cursor) -> int: - """return the latest time as unixepoch""" - cur.execute(f"SELECT MAX(date) FROM {t_request}") - return cur.fetchone()[0] -# get all dates -# the date:str parameter in all these function must be a sqlite constraint -def get_days(cur: sql.Cursor, date:str) -> list[str]: - """get a list of all dates in yyyy-mm-dd format""" - cur.execute(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}") - return [ date[0] for date in cur.fetchall() ] # fetchall returns tuples (date, ) - -def get_months(cur: sql.Cursor, date:str) -> list[str]: - """get a list of all dates in yyyy-mm format""" - cur.execute(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}") - dates = get_days(cur, date) - date_dict = {} - for date in dates: - date_without_day = date[0:date.rfind('-')] - date_dict[date_without_day] = 0 - return list(date_dict.keys()) - - -def get_user_agent(cur: sql.Cursor, user_id: int): - return sql_select(cur, t_user, [("user_id", user_id)])[0][2] - -def get_unique_user_ids_for_date(cur: sql.Cursor, date:str) -> list[int]: - cur.execute(f"SELECT DISTINCT user_id FROM {t_request} WHERE {date}") - return [ user_id[0] for user_id in cur.fetchall() ] - -def get_human_users(cur: sql.Cursor, unique_user_ids): - human_user_ids = [] - for user_id in unique_user_ids: - cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_user} WHERE user_id = {user_id} AND platform IS NOT NULL AND browser IS NOT NULL)") - if cur.fetchone()[0] == 1: - human_user_ids.append(user_id) - return human_user_ids - -def get_unique_request_ids_for_date(cur: sql.Cursor, date:str) -> list[int]: - cur.execute(f"SELECT DISTINCT request_id FROM {t_request} WHERE {date}") - return [ request_id[0] for request_id in cur.fetchall() ] - -def get_unique_request_ids_for_date_and_user(cur: sql.Cursor, date:str, user_id: int) -> list[int]: - cur.execute(f"SELECT DISTINCT request_id FROM {t_request} WHERE {date} AND user_id = {user_id}") - return [ request_id[0] for request_id in cur.fetchall() ] - -# get number of requests per day -def get_request_count_for_date(cur: sql.Cursor, date:str) -> int: - cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE {date}") - return cur.fetchone()[0] - -def get_unique_user_count(cur: sql.Cursor) -> int: - return sql_tablesize(cur, t_user) - - - -# -# RANKINGS -# -def get_file_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]: - global settings - """ - :returns [(request_count, filename)] - """ - ranking = [] - cur.execute(f"SELECT DISTINCT group_id FROM {t_filegroup}") - for group in cur.fetchall(): - group = group[0] - filename = sql_select(cur, t_file, [("group_id", group)]) - if len(filename) == 0: continue - filename = filename[0][0] - if settings["file_ranking_regex_whitelist"]: - if not fullmatch(settings["file_ranking_regex_whitelist"], filename): - continue - # ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename)) - cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE group_id = {group} AND {date}") - ranking.append((cur.fetchone()[0], filename)) - ranking.sort() - # print(ranking) - return ranking - -def get_user_agent_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]: - """ - :returns [(request_count, user_agent)] - """ - ranking = [] - cur.execute(f"SELECT DISTINCT user_id FROM {t_request} WHERE {date}") - for user_id in cur.fetchall(): - user_id = user_id[0] - user_agent = sql_select(cur, t_user, [("user_id", user_id)]) - if len(user_agent) == 0: continue - user_agent = user_agent[0][2] - if settings["user_agent_ranking_regex_whitelist"]: - if not fullmatch(settings["user_agent_ranking_regex_whitelist"], user_agent): - continue - # ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename)) - cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE user_id = {user_id} AND {date}") - ranking.append((cur.fetchone()[0], user_agent)) - ranking.sort() - # print(ranking) - return ranking - -def get_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Cursor, date:str) -> list[tuple[int, str]]: - """ - 1) get all the distinct entries for field_name after min_date_unix_time - 2) call get_name_function with the distinct entry - 3) for every entry, get the count in table after min_date_unix_time - 3) sort by count in ascending order - :returns [(request_count, name)] - """ - ranking = [] - cur.execute(f"SELECT DISTINCT {field_name} FROM {table} WHERE {date}") - for name in cur.fetchall(): - name = name[0] - if whitelist_regex: - if not fullmatch(whitelist_regex, name): - continue - # ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename)) - cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {field_name} = '{name}' AND {date}") - ranking.append((cur.fetchone()[0], name)) - ranking.sort() - # print(ranking) - return ranking - - -# -# PLOTTING -# -def add_vertikal_labels_in_bar_plot(labels, max_y_val, ax, bar_plot): - for idx,rect in enumerate(bar_plot): - height = rect.get_height() - if height > 0.8 * max_y_val: - height = 0.05 * max_y_val - ax.text(rect.get_x() + rect.get_width()/2., height + 0.025 * max_y_val, - labels[idx], - ha='center', va='bottom', rotation=90) - -def plot_ranking(ranking: list[tuple[int, str]], fig=None, xlabel="", ylabel="", color_settings:dict|list=[]): - """ - make a bar plot of the most requested files - """ - if not fig: - fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None) - # create new axis if none is given - ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel) - # fill x y data - if len(ranking) > settings["file_ranking_plot_max_files"]: - start_index = len(ranking) - settings["file_ranking_plot_max_files"] - else: start_index = 0 - x_names = [] - y_counts = [] - colors = [] - for i in range(start_index, len(ranking)): - x_names.append(ranking[i][1]) - y_counts.append(ranking[i][0]) - ft = ranking[i][1].split(".")[-1] - color = "blue" - if not color_settings: color = "blue" - elif isinstance(color_settings, dict): - for key, val in color_settings.items(): - if ft in val: color = key - if not color: color = "blue" - elif isinstance(color_settings, list): - # print(color_settings, (i - start_index) % len(color_settings)) - color = color_settings[(i - start_index) % len(color_settings)] - colors.append(color) - bar = ax.bar(x_names, y_counts, tick_label="", color=colors) - add_vertikal_labels_in_bar_plot(x_names, y_counts[-1], ax, bar) - # ax.ylabel(y_counts) - return fig - - -def plot(xdata, ydata, fig=None, ax=None, xlabel="", ylabel="", label="", linestyle='-', marker="", color="blue"): - if not fig: - fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None) - if not ax: - ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel) - else: - ax = ax.twinx() - ax.set_ylabel(ylabel) - # ax.tick_params(axis="y", labelcolor="r") - ax.plot(xdata, ydata, marker=marker, label=label, linestyle=linestyle, color=color) - if label: ax.legend() - # if xlim: - # if xlim[0] != xlim[1]: - # ax.set_xlim(*xlim) - - # if ylim: - # if ylim[0] != ylim[1]: - # ax.set_ylim(*ylim) - return fig, ax - -def plot2y(xdata, ydata1, ydata2, fig=None, ax1=None, ax2=None, plots=None, xlabel="", ylabel1="", ylabel2="", label1="", label2="", linestyle='-', marker="", color1="blue", color2="orange", grid="major"): - if not fig: - fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None) - if not (ax1 and ax2): - ax1 = fig.add_subplot(xlabel=xlabel, ylabel=ylabel1) - ax2 = ax1.twinx() - ax2.set_ylabel(ylabel2) - # ax.tick_params(axis="y", labelcolor="r") - plot1 = ax1.plot(xdata, ydata1, marker=marker, label=label1, linestyle=linestyle, color=color1) - plot2 = ax2.plot(xdata, ydata2, marker=marker, label=label2, linestyle=linestyle, color=color2) - # if label1 or label2: ax1.legend() - if plots: plots += plot1 + plot2 - else: plots = plot1 + plot2 - plt.legend(plots, [ l.get_label() for l in plots]) - - if grid == "major" or grid == "minor" or grid == "both": - if grid == "minor" or "both": - ax1.minorticks_on() - ax1.grid(visible=True, which=grid, linestyle="-", color="#888") - - # if xlim: - # if xlim[0] != xlim[1]: - # ax.set_xlim(*xlim) - - # if ylim: - # if ylim[0] != ylim[1]: - # ax.set_ylim(*ylim) - return fig, ax1, ax2, plots - - -# -# MAIN -# -def missing_arg_val(arg): - print("Missing argument for", arg) - exit(1) - -def missing_arg(arg): - print("Missing ", arg) - exit(1) - -def visualize(loaded_settings: dict): - global settings - settings = loaded_settings - if not settings["db"]: missing_arg("db") - if not settings["server-name"]: missing_arg("server-name") - - img_dir = settings["img_dir"] - img_filetype = settings["img_filetype"] - img_location = settings["img_location"] - names = { - # paths - "img_file_ranking_last_x_days": f"ranking_all_time_files_last_x_days.{img_filetype}", - "img_referer_ranking_last_x_days": f"ranking_all_time_referers_last_x_days.{img_filetype}", - "img_browser_ranking_last_x_days": f"ranking_all_time_browsers_last_x_days.{img_filetype}", - "img_operating_system_ranking_last_x_days": f"ranking_all_time_operating_systems_last_x_days.{img_filetype}", - "img_users_and_requests_last_x_days": f"user_request_count_daily_last_x_days.{img_filetype}", - - "img_file_ranking_total": f"ranking_all_time_files_total.{img_filetype}", - "img_referer_ranking_total": f"ranking_all_time_referers_total.{img_filetype}", - "img_browser_ranking_total": f"ranking_all_time_browsers_total.{img_filetype}", - "img_operating_system_ranking_total": f"ranking_all_time_operating_systems_total.{img_filetype}", - "img_users_and_requests_total": f"user_request_count_daily_total.{img_filetype}", - # values - "mobile_user_percentage_total": 0.0, - "mobile_user_percentage_last_x_days": 0.0, - "user_count_last_x_days": 0, - "user_count_total": 0, - "request_count_last_x_days": 0, - "request_count_total": 0, - "human_user_percentage_last_x_days": 0, - "human_user_percentage_total": 0, - "human_request_percentage_last_x_days": 0, - "human_request_percentage_total": 0, - # general - "regina_version": settings["version"], - "server-name": settings["server-name"], - "last_x_days": settings["last_x_days"], # must be after all the things with last_x_days! - "earliest_date": "1990-1-1", - "generation_date": "1990-1-1 0:0:0", - } - - conn = sql.connect(settings["db"]) - if isdir(img_dir) and img_filetype: - gen_img = True - else: - print(f"Warning: Not generating images since at least one required variable is invalid: img_dir='{img_dir}', img_filetype='{img_filetype}'") - gen_img = False - cur = conn.cursor() - - get_humans = settings["get-human-percentage"] - # DATE STRINGS - names["earliest_date"] = dt.fromtimestamp(get_earliest_date(cur)).strftime("%Y-%m-%d") - names["generation_date"] = dt.now().strftime("%Y-%m-%d %H:%M:%S") - # LAST_X_DAYS - # last_x_days_min_date: latest_date - last_x_days - secs_per_day = 86400 - last_x_days_min_date = get_latest_date(cur) - settings["last_x_days"] * secs_per_day - last_x_days_str = get_where_date_str(min_date=last_x_days_min_date) - days = get_days(cur, last_x_days_str) - days_strs = [get_where_date_str(at_date=day) for day in days] - - - # ALL DATES - all_time_str = get_where_date_str(min_date=0) - # all months in yyyy-mm format - months_all_time = get_months(cur, all_time_str) - # sqlite constrict to month string - months_strs = [] - for year_month in months_all_time: - year, month = year_month.split("-") - # first day of the month - min_date = dt(int(year), int(month), 1).timestamp() - month = (int(month) % 12) + 1 # + 1 month - year = int(year) - if month == 1: year += 1 - # first day of the next month - 1 sec - max_date = dt(year, month, 1).timestamp() - 1 - months_strs.append(get_where_date_str(min_date=min_date, max_date=max_date)) - - for i in range(2): - suffix = ["_total", "_last_x_days"][i] - date_str = [all_time_str, last_x_days_str][i] - date_names = [months_all_time, days][i] - date_strs = [months_strs, days_strs][i] - assert(len(date_names) == len(date_strs)) - - # FILES - file_ranking = get_file_ranking(cur, date_str) - if gen_img: - fig_file_ranking = plot_ranking(file_ranking, xlabel="Filename/Filegroup", ylabel="Number of requests", color_settings=color_settings_filetypes) - fig_file_ranking.savefig(f"{img_dir}/{names[f'img_file_ranking{suffix}']}") - - # REFERER - referer_ranking = get_ranking("referer", t_request, settings["referer_ranking_regex_whitelist"], cur, date_str) - if gen_img: - fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate) - fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_referer_ranking{suffix}']}") - - # USER - # user_agent_ranking = get_user_agent_ranking(cur, date_str) - # for the time span - unique_user_ids = get_unique_user_ids_for_date(cur, date_str) - unique_user_ids_human = get_human_users(cur, unique_user_ids) - # for each date - unique_user_ids_dates: list[list[int]] = [] - unique_request_ids_dates: list[list[int]] = [] - unique_user_ids_human_dates: list[list[int]] = [] - unique_request_ids_human_dates: list[list[int]] = [] - for i in range(len(date_strs)): - date_str_ = date_strs[i] - unique_user_ids_dates.append(get_unique_user_ids_for_date(cur, date_str_)) - unique_request_ids_dates.append(get_unique_request_ids_for_date(cur, date_str_)) - if get_humans: - unique_user_ids_human_dates.append(get_human_users(cur, unique_user_ids_dates[-1])) - unique_request_ids_human_dates.append([]) - for human in unique_user_ids_human_dates[-1]: - unique_request_ids_human_dates[-1] += get_unique_request_ids_for_date_and_user(cur, date_str_, human) - # print("\n\tuu", unique_user_ids_dates, "\n\tur",unique_request_ids_dates, "\n\tuuh", unique_user_ids_human_dates, "\n\turh", unique_request_ids_human_dates) - if get_humans: - try: - names[f"human_user_percentage{suffix}"] = round(100 * len_list_list(unique_user_ids_human_dates) / len_list_list(unique_user_ids_dates), 2) - names[f"human_request_percentage{suffix}"] = round(100 * len_list_list(unique_request_ids_human_dates) / len_list_list(unique_request_ids_dates), 2) - except: pass - names[f"user_count{suffix}"] = len_list_list(unique_user_ids_dates) - names[f"request_count{suffix}"] = len_list_list(unique_request_ids_dates) - if gen_img: - fig_daily, ax1, ax2, plots = plot2y(date_names, [len(user_ids) for user_ids in unique_user_ids_dates], [len(request_ids) for request_ids in unique_request_ids_dates], xlabel="Date", ylabel1="User count", label1="Unique users", ylabel2="Request count", label2="Unique requests", color1=palette["red"], color2=palette["blue"]) - if get_humans: - fig_daily, ax1, ax2, plots = plot2y(date_names, [len(user_ids) for user_ids in unique_user_ids_human_dates], [len(request_ids) for request_ids in unique_request_ids_human_dates], label1="Unique users (human)", label2="Unique requests (human)", color1=palette["orange"], color2=palette["green"], fig=fig_daily, ax1=ax1, ax2=ax2, plots=plots) - fig_daily.savefig(f"{img_dir}/{names[f'img_users_and_requests{suffix}']}") - - # os & browser - os_ranking, browser_ranking, names[f"mobile_user_percentage{suffix}"] = get_os_browser_mobile_rankings(cur, unique_user_ids_human) - if gen_img: - fig_os_rating = plot_ranking(os_ranking, xlabel="Platform", ylabel="Share [%]", color_settings=color_settings_operating_systems) - fig_os_rating.savefig(f"{img_dir}/{names[f'img_operating_system_ranking{suffix}']}") - fig_browser_rating = plot_ranking(browser_ranking, xlabel="Browsers", ylabel="Share [%]", color_settings=color_settings_browsers) - fig_browser_rating.savefig(f"{img_dir}/{names[f'img_browser_ranking{suffix}']}") - - # print("File Ranking", file_ranking) - # print("referer Ranking", referer_ranking) - # print("user agent ranking", user_agent_ranking) - # print("Unique Users:", get_unique_user_count(cur)) - # fig_daily, ax_daily_users = plot(dates, [len(user_ids) for user_ids in unique_user_ids_for_dates], xlabel="Datum", ylabel="Einzigartige Nutzer", label="Einzigartige Nutzer", color="blue") - # fig_daily, ax_daily_requests = plot(dates, [len(request_ids) for request_ids in unique_request_ids_for_dates], fig=fig_daily, ax=ax_daily_users, xlabel="Datum", ylabel="Einzigartige Anfragen", label="Einzigartige Anfragen", color="orange") - # fig_daily.savefig(f"{img_dir}/daily.{img_filetype}") - # print("OS ranking", os_ranking) - # print("Browser ranking", browser_ranking) - # print("Mobile percentage", names["mobile_user_percentage"]) - if settings["template_html"] and settings["html_out_path"]: - with open(settings["template_html"], "r") as file: - html = file.read() - for name, value in names.items(): - if "img" in name: - value = f"{img_location}/{value}" - html = html.replace(f"%{name}", str(value)) - with open(settings["html_out_path"], "w") as file: - file.write(html)