From 90f1c6f8a7129c01bb1c0b26ea67c3a721a12e8e Mon Sep 17 00:00:00 2001
From: "matthias@arch"
Date: Sun, 27 Nov 2022 22:27:57 +0100
Subject: [PATCH] lots of fixes

---
 database.uxf                       |   3 +-
 default.conf                       |  52 ++-
 regina/__init__.py                 |   4 +
 regina/db_operation/__init__.py    |   2 +
 regina/db_operation/collect.py     | 163 ++++++++
 regina/db_operation/database.py    | 157 ++++++++
 regina/db_operation/visualize.py   | 591 +++++++++++++++++++++++++++++
 regina/main.py                     |  95 +++++
 regina/utility/__init__.py         |   1 +
 regina/utility/globals.py          |  40 ++
 regina/utility/settings_manager.py |  33 ++
 regina/utility/sql_util.py         |  40 ++
 regina/utility/utility.py          |  27 ++
 13 files changed, 1194 insertions(+), 15 deletions(-)
 create mode 100644 regina/__init__.py
 create mode 100644 regina/db_operation/__init__.py
 create mode 100644 regina/db_operation/collect.py
 create mode 100644 regina/db_operation/database.py
 create mode 100644 regina/db_operation/visualize.py
 create mode 100644 regina/main.py
 create mode 100644 regina/utility/__init__.py
 create mode 100644 regina/utility/globals.py
 create mode 100644 regina/utility/settings_manager.py
 create mode 100644 regina/utility/sql_util.py
 create mode 100644 regina/utility/utility.py

diff --git a/database.uxf b/database.uxf
index d30cf51..5e99d88 100644
--- a/database.uxf
+++ b/database.uxf
@@ -7,7 +7,7 @@
     364
     273
     299
-    208
+    234
   User
   --
@@ -19,6 +19,7 @@
 - platform: TEXT
 - browser: TEXT
 - mobile: INTEGER
+- is_human: INTEGER
 style=autoresize
diff --git a/default.conf b/default.conf
index e2002f1..2be8d2a 100644
--- a/default.conf
+++ b/default.conf
@@ -1,24 +1,48 @@
-# nginx analytics config for quintern.xy
+# default configuration for regina
 # GENERAL
-server-name = default-sever
+server_name = default_server
+# path to the database
+db = /home/my_user/analytics/my_website.db
+
 # DATA COLLECTION
-db = /home/my-user/analytics/my-website.db
-access-log = /home/my-user/analytics/access.log
-locs-and-dirs = /:/www/my-website,/error:/www/error
-auto-group-filetypes = png,jpg,jpeg,gif,svg,css
+# these settings only apply to newly collected data and the creation of a new database
+# path to the nginx access log to parse
+access_log = /home/my_user/analytics/access.log
+# nginx locations and their root directory: location:directory,location:directory,...
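+# example (illustrative): for the pair /:/www/my_website, a file found at
+# /www/my_website/index.html is registered under its request path /index.html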
+locs_and_dirs = /:/www/my_website,/error:/www/error
+# filetypes that should be grouped (comma separated)
+auto_group_filetypes = png,jpg,jpeg,gif,svg,css,ico,pdf,txt
+# whether a request with a 30x http status counts as success
+status_300_is_success = False
+# whether a user needs to make at least 1 successful request to count as human
+human_needs_success = True
+# filegroups, eg group index.html and home.html
+filegroups = home:index.html,home.html;images:image1.png,image2.png
+# filegroups =
 
 # VISUALIZATION
-get-human-percentage = True
-# "file_ranking_regex_whitelist": r".*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))",
-file_ranking_regex_whitelist = .*\.(html)
-# minus means empty
+# separate users into all and humans
+get_human_percentage = True
+# regular expression whitelist for the file ranking
+# file_ranking_regex_whitelist = .*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))
+file_ranking_regex_whitelist =
+# regular expression whitelist for the referer ranking, minus means empty
+# eg: exclude empty referers
 referer_ranking_regex_whitelist = ^[^\-].*
+# regular expression whitelist for the user agent ranking
 user_agent_ranking_regex_whitelist =
-file_ranking_plot_max_files = 15
+# maximum number of file(group)s on the file ranking
+file_ranking_plot_max_files = 20
+# whether to ignore non-existing files in the ranking
+file_ranking_ignore_error_files = True
+
 # "plot_figsize" = (60 40), plot_dpi = 300
+# output directory for the generated plots
 img_dir = /www/analytics/images
-img_dir = /analytics/images
-template_html = /home/my-user/analytics/template.html
+# nginx location for the generated images, its root must be img_dir
+img_location = images
+# template html input
+template_html = /home/my_user/analytics/template.html
+# output for the generated html
 html_out_path = /www/analytics/statistics.html
-# filegroups = start:/index.html,/about.html,/img_on_index.png;music:/music.html,song.mp3
diff --git a/regina/__init__.py b/regina/__init__.py
new file mode 100644
index 0000000..9d718fe
--- /dev/null
+++ b/regina/__init__.py
@@ -0,0 +1,4 @@
+"""Gather analytics from nginx access logs and visualize them through generated images and a generated html"""
+# __package__ = 'regina'
+
+from db_operation import database, visualize, collect
diff --git a/regina/db_operation/__init__.py b/regina/db_operation/__init__.py
new file mode 100644
index 0000000..a694da7
--- /dev/null
+++ b/regina/db_operation/__init__.py
@@ -0,0 +1,2 @@
+"""Gather analytics from nginx access logs and visualize them through generated images and a generated html"""
+# __package__ = 'regina'
diff --git a/regina/db_operation/collect.py b/regina/db_operation/collect.py
new file mode 100644
index 0000000..5e16885
--- /dev/null
+++ b/regina/db_operation/collect.py
@@ -0,0 +1,163 @@
+import sqlite3 as sql
+from re import match
+from time import mktime
+from datetime import datetime as dt
+from db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup
+from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
+from utility.utility import pdebug, warning
+from utility.globals import user_agent_operating_systems, user_agent_browsers, settings
+
+"""
+collect information from the access log and put it into the database
+"""
+months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
+
+
+class Request:
request_file="", request_protocol="", status="", bytes_sent="", referer="", user_agent=""): + self.ip_address = sanitize(ip_address) + self.time_local = 0 + #[20/Nov/2022:00:47:36 +0100] + m = match(r"\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\]", time_local) + if m: + g = m.groups() + try: + datetime_ = dt(int(g[2]), months.index(g[1])+1, int(g[0]), int(g[3]), int(g[4]), int(g[5])) + self.time_local = int(mktime(datetime_.timetuple())) + except Exception as e: + warning(f"Request:__init__: {e}") + else: + warning(f"Request:__init__: Could not match time: '{time_local}'") + + self.request_type = sanitize(request_type) + self.request_file = sanitize(request_file) + self.request_protocol = sanitize(request_protocol) + self.status = sanitize(status) + self.bytes_sent = sanitize(bytes_sent) + self.referer = sanitize(referer) + self.user_agent = sanitize(user_agent) + + def __repr__(self): + return f"{self.ip_address} - {self.time_local} - {self.request_file} - {self.user_agent} - {self.status}" + +re_remote_addr = r"[0-9a-fA-F.:]+" +re_remote_user = ".*" +re_time_local = r"\[.+\]" +re_request = r'"[^"]+"' +re_status = r'\d+' +re_body_bytes_sent = r'\d+' +re_http_referer = r'"([^"]*)"' +re_http_user_agent = r'"([^"]*)"' +re_log_format: str = f'({re_remote_addr}) - ({re_remote_user}) ({re_time_local}) ({re_request}) ({re_status}) ({re_body_bytes_sent}) {re_http_referer} {re_http_user_agent}' +def parse_log(logfile:str) -> list[Request]: + """ + create Request objects from each line in the logfile + """ + requests = [] + with open(logfile, "r") as file: + lines = file.readlines() + for line in lines: + m = match(re_log_format, line) + if m is None: + warning(f"parse_log: Unmatched line: '{line}'") + continue + # print(m.groups()) + g = m.groups() + request_ = m.groups()[3].split(" ") + if len(request_) != 3: + warning(f"parse_log: len('{m.groups()[3]}'.split(' ')) is {len(request_)} and not 3") + continue + requests.append(Request(ip_address=g[0], time_local=g[2], + request_type=request_[0], request_file=request_[1], request_protocol=request_[2], + status=g[4], bytes_sent=g[5], referer=g[6], user_agent=g[7])) + return requests + +def get_user_id(request: Request, cursor: sql.Cursor) -> int: + """ + get the user_id. 
+
+def get_user_id(request: Request, cursor: sql.Cursor) -> int:
+    """
+    get the user_id. Adds the user if not already existing
+    """
+    # if user exists
+    if sql_exists(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)]):
+        user_id = sql_select(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)])[0][0]
+    else:  # new user
+        # new user_id is number of elements
+        user_id: int = sql_tablesize(cursor, t_user)
+        pdebug("new user:", user_id, request.ip_address)
+        platform, browser, mobile = get_os_browser_pairs_from_agent(request.user_agent)
+        is_human = 0  # is_user_human cannot be called until the user is in the database
+        cursor.execute(f"INSERT INTO {t_user} (user_id, ip_address, user_agent, platform, browser, mobile, is_human) VALUES ({user_id}, '{request.ip_address}', '{request.user_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}');")
+    return user_id
+
+def is_user_human(cur: sql.Cursor, user_id: int):
+    """
+    check if the user has a known platform AND browser
+    check if at least one request did not result in an error
+    """
+    global settings
+    # if 30x is considered a success, anything below 400 counts; otherwise only statuses below 300
+    max_success_status = 300
+    if settings["status_300_is_success"]: max_success_status = 400
+    # check if has browser
+    cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_user} WHERE user_id = {user_id} AND platform IS NOT NULL AND browser IS NOT NULL)")
+    # if no browser and platform
+    if cur.fetchone()[0] == 0: return False
+    # if human needs successful request
+    if settings["human_needs_success"]:
+        # check if at least one request was successful (status < max_success_status)
+        cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_request} WHERE user_id = {user_id} AND status < {max_success_status})")
+        if cur.fetchone()[0] == 1:
+            # pdebug(f"is_user_human: User {user_id} is human")
+            pass
+        else:
+            # pdebug(f"is_user_human: User {user_id} only had unsuccessful requests")
+            return False
+    # user is human
+    return True
+
+
+# re_user_agent = r"(?: ?([\w\- ]+)(?:\/([\w.]+))?(?: \(([^()]*)\))?)"
+# 1: platform, 2: version, 3: details
+def get_os_browser_pairs_from_agent(user_agent):
+    operating_system = ""
+    browser = ""
+    mobile = "Mobi" in user_agent
+    for os in user_agent_operating_systems:
+        if os in user_agent:
+            operating_system = os
+            break
+    for br in user_agent_browsers:
+        if br in user_agent:
+            browser = br
+            break
+    # if not operating_system or not browser: warning(f"get_os_browser_pairs_from_agent: Could not find all information for agent '{user_agent}', found os: '{operating_system}' and browser: '{browser}'")
+    return operating_system, browser, mobile
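+
+# e.g. (illustrative): for the agent "Mozilla/5.0 (X11; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0"
+# get_os_browser_pairs_from_agent returns ("Linux", "Firefox", False)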
+
+
+def add_requests_to_db(requests: list[Request], db_name: str):
+    conn = sql.connect(db_name)
+    cursor = conn.cursor()
+    # users added by this run are re-checked for is_human at the end
+    max_user_id = sql_tablesize(cursor, t_user)
+    for i in range(len(requests)):
+        request = requests[i]
+        # pdebug("add_requests_to_db:", i, "request:", request)
+        user_id = get_user_id(request, cursor)
+        conn.commit()
+        group_id: int = get_filegroup(request.request_file, cursor)
+        # check if request is unique
+        group_id_name = database_tables[t_filegroup].key.name
+        user_id_name = database_tables[t_user].key.name
+        if sql_exists(cursor, t_request, [(group_id_name, group_id), (user_id_name, user_id), ("date", request.time_local)]):
+            # pdebug("request exists:", request)
+            pass
+        else:
+            # pdebug("new request:", request)
+            request_id = sql_tablesize(cursor, t_request)
+            sql_insert(cursor, t_request, [[request_id, user_id, group_id, request.time_local, request.referer, request.status]])
+    for user_id in range(max_user_id, sql_tablesize(cursor, t_user)):
+        is_human = is_user_human(cursor, user_id)
+        if is_human:
+            cursor.execute(f"UPDATE {t_user} SET is_human = 1 WHERE user_id = {user_id}")
+    cursor.close()
+    conn.commit()
diff --git a/regina/db_operation/database.py b/regina/db_operation/database.py
new file mode 100644
index 0000000..3cdd830
--- /dev/null
+++ b/regina/db_operation/database.py
@@ -0,0 +1,157 @@
+# from sys import path
+# print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}")
+import sqlite3 as sql
+from os import path, listdir
+# local
+from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
+from utility.utility import pdebug
+
+
+"""
+create regina's database as shown in the UML diagram database.uxf
+"""
+
+class Entry:
+    """
+    represents an sql entry
+    type_ is INTEGER, TEXT, REAL...
+    """
+    def __init__(self, name, type_) -> None:
+        self.name = name
+        self.type_ = type_
+    def __repr__(self):
+        return f"[{self.name}] {self.type_}"
+
+class Table:
+    def __init__(self, name, key: Entry, entries: list[Entry]=[], constraints: list[str]=[]):
+        self.name = name
+        self.key = key
+        self.entries = entries
+        self.constraints = constraints
+    def create_sql_str(self):
+        return f"CREATE TABLE IF NOT EXISTS {self.name}\n({self})\n"
+    def __repr__(self):
+        s = f"{self.key} PRIMARY KEY"
+        for entry in self.entries:
+            s += f", {entry}"
+        for c in self.constraints:
+            s += f", {c}"
+        return s
+t_request = "request"
+t_file = "file"
+t_filegroup = "filegroup"
+t_user = "user"
+
+user_id = Entry("user_id", "INTEGER")
+request_id = Entry("request_id", "INTEGER")
+filegroup_id = Entry("group_id", "INTEGER")
+ip_address_entry = Entry("ip_address", "TEXT")
+filename_entry = Entry("filename", "TEXT")
+database_tables = {
+    t_user: Table(t_user, user_id, [Entry("ip_address", "TEXT"), Entry("user_agent", "TEXT"), Entry("platform", "TEXT"), Entry("browser", "TEXT"), Entry("mobile", "INTEGER"), Entry("is_human", "INTEGER")], [f"UNIQUE({user_id.name})"]),
+    t_file: Table(t_file, filename_entry, [filegroup_id], [f"UNIQUE({filename_entry.name})"]),
+    t_filegroup: Table(t_filegroup, filegroup_id, [Entry("groupname", "TEXT")], [f"UNIQUE({filegroup_id.name})"]),
+    t_request: Table(t_request, request_id, [
+        user_id, filegroup_id, Entry("date", "INTEGER"), Entry("referer", "TEXT"), Entry("status", "INTEGER")
+    ], ["UNIQUE(request_id)"]),
+}
+
+
+def get_filegroup(filename: str, cursor: sql.Cursor) -> int:
+    """
+    get the filegroup
+    returns the group where
+    1) filename is the groupname
+    2) the filetype of filename is the groupname
+    3) new group with filename as groupname
+    """
+    # pdebug(f"get_filegroup: {filename}")
+    if sql_exists(cursor, t_file, [("filename", filename)]):
+        return sql_select(cursor, t_file, [("filename", filename)])[0][1]
+    else:
+        suffix = filename.split('.')[-1]
+        cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname = '{suffix}'")
+        # cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname LIKE '%.{suffix}'")
+        group_id_candidates = cursor.fetchall()
+        pdebug(f"get_filegroup: file={filename} candidates={group_id_candidates}")
+        if group_id_candidates:
+            return group_id_candidates[0][0]
+        else:  # add new group for filename
+            group_id = sql_tablesize(cursor, t_filegroup)
+            # pdebug("new file(group):", group_id, filename)
+            # add group
+            sql_insert(cursor, t_filegroup, [[group_id, filename]])
+            # add file
+            sql_insert(cursor, t_file, [[filename, group_id]])
+            return group_id
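+
+# e.g. (illustrative): if a group 'css' exists, get_filegroup('/style.css') returns its group_id;
+# an unknown file like '/new.html' gets a fresh group named '/new.html'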
+
+def create_filegroups(cursor: sql.Cursor, filegroup_str: str):
+    # filegroup_str: 'name1: file1, file2, file3; name2: file33'
+    groups = filegroup_str.strip(";").split(";")
+    pdebug("create_filegroups:", groups)
+    for group in groups:
+        name, vals = group.split(":")
+        # create/get group
+        if sql_exists(cursor, t_filegroup, [("groupname", name)]):
+            group_id = sql_select(cursor, t_filegroup, [("groupname", name)])[0][0]
+        else:
+            group_id = sql_tablesize(cursor, t_filegroup)
+            sql_insert(cursor, t_filegroup, [(group_id, name)])
+        # pdebug("create_filegroups: group_id", group_id)
+        # create/edit file
+        for filename in vals.split(","):
+            if sql_exists(cursor, t_file, [("filename", filename)]):  # if it exists, update
+                cursor.execute(f"UPDATE {t_file} SET group_id = {group_id} WHERE filename = '{filename}'")
+            else:
+                sql_insert(cursor, t_file, [[filename, group_id]])
+
+def get_files_from_dir_rec(p: str, files: list[str]):
+    """recursively append all files to files"""
+    pdebug("get_files_from_dir_rec:", p)
+    if path.isfile(p):
+        files.append(p)
+    elif path.isdir(p):
+        for p_ in listdir(p):
+            get_files_from_dir_rec(p + "/" + p_, files)
+
+def get_auto_filegroup_str(location_and_dirs:list[tuple[str, str]], auto_group_filetypes:list[str]) -> str:
+    """
+    :param location_and_dirs: list of nginx locations and their corresponding directories
+    :param auto_group_filetypes: list of filetypes for auto grouping
+    """
+    files: list[str] = []
+    start_i = 0
+    for location, dir_ in location_and_dirs:
+        get_files_from_dir_rec(dir_, files)
+        # replace dir_ with location, eg /www/website with /, but only in the newly found files
+        for i in range(start_i, len(files)):
+            files[i] = files[i].replace(dir_, location).replace("//", "/")
+        start_i = len(files)
+    filegroups = ""
+    # create groups for each filetype
+    for ft in auto_group_filetypes:
+        filegroups += f"{ft}:"
+        for file in files:
+            if file.endswith(f".{ft}"):
+                filegroups += f"{file},"
+        filegroups = filegroups.strip(",") + ";"
+    pdebug("get_auto_filegroup_str: found files:", files, "filegroups_str:", filegroups)
+    return filegroups
+
+def create_db(name, filegroup_str="", location_and_dirs:list[tuple[str, str]]=[], auto_group_filetypes=[]):
+    """
+    create the database at 'name' with all tables from database_tables
+    """
+    print(f"creating database: '{name}'")
+    conn = sql.connect(f"{name}")
+    cursor = conn.cursor()
+    for table in database_tables.values():
+        cursor.execute(table.create_sql_str())
+    filegroup_str = filegroup_str.strip("; ") + ";" + get_auto_filegroup_str(location_and_dirs, auto_group_filetypes)
+    create_filegroups(cursor, filegroup_str)
+    conn.commit()
+    conn.close()
+
+
+if __name__ == '__main__':
+    create_db("test.db")
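+    # illustrative call with explicit arguments (paths are placeholders):
+    # create_db("test.db", filegroup_str="home:/index.html,/home.html", location_and_dirs=[("/", "/www/my_website")], auto_group_filetypes=["png", "css"])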
diff --git a/regina/db_operation/visualize.py b/regina/db_operation/visualize.py
new file mode 100644
index 0000000..262486a
--- /dev/null
+++ b/regina/db_operation/visualize.py
@@ -0,0 +1,591 @@
+# from sys import path
+# print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}")
+import sqlite3 as sql
+from sys import exit
+from re import fullmatch
+import matplotlib.pyplot as plt
+from os.path import isdir
+from datetime import datetime as dt
+# local
+from db_operation.database import t_request, t_user, t_file, t_filegroup
+from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where
+from utility.utility import pdebug, warning, missing_arg
+from utility.globals import settings
+
+
+"""
+visualize information from the database
+TODO:
+- merge similar referrers, e.g. www.google.de and https://google.com
+- ignore 404
+"""
+
+palette = {
+    "red": "#ee4035",
+    "orange": "#f37736",
+    "yellow": "#fdf458",
+    "green": "#7bc043",
+    "blue": "#0392cf",
+    "purple": "#b044a0",
+}
+color_settings_filetypes = {
+    palette["red"]: ["html"],
+    palette["green"]: ["jpg", "png", "jpeg", "gif", "svg", "webp"],
+    palette["yellow"]: ["css"],
+    "grey": ["txt"]
+}
+color_settings_alternate = list(palette.values())
+
+color_settings_browsers = {
+    palette["red"]: ["Safari"],
+    palette["orange"]: ["Firefox"],
+    palette["yellow"]: ["Chrome"],
+    "grey": ["Edge"],
+    palette["green"]: ["Chromium"],
+    palette["purple"]: ["Brave"]
+}
+color_settings_operating_systems = {
+    palette["red"]: ["Mac"],
+    palette["green"]: ["Android"],
+    "grey": ["iPhone", "iPad"],
+    palette["yellow"]: ["Linux"],
+    palette["purple"]: ["BSD"],
+    palette["blue"]: ["Windows"],
+}
+
+
+def len_list_list(l: list[list]):
+    size = 0
+    for i in range(len(l)):
+        size += len(l[i])
+    return size
+
+def valid_status(status: int):
+    if status >= 400: return False
+    if settings["status_300_is_success"] and status >= 300: return True
+    return status < 300
+
+#
+# FILTERS
+#
+def get_os_browser_mobile_rankings(cur: sql.Cursor, user_ids: list[int]):
+    """
+    returns [(count, operating_system)], [(count, browser)], mobile_user_percentage
+    """
+    os_ranking = {}
+    os_count = 0.0
+    browser_ranking = {}
+    browser_count = 0.0
+    mobile_ranking = { True: 0.0, False: 0.0 }
+    for user_id in user_ids:
+        cur.execute(f"SELECT platform,browser,mobile FROM {t_user} WHERE user_id = {user_id}")
+        os, browser, mobile = cur.fetchone()
+        mobile = bool(mobile)
+        if os:
+            if os in os_ranking: os_ranking[os] += 1
+            else: os_ranking[os] = 1
+            os_count += 1
+        if browser:
+            if browser in browser_ranking: browser_ranking[browser] += 1
+            else: browser_ranking[browser] = 1
+            browser_count += 1
+        if (os or browser):
+            mobile_ranking[mobile] += 1
+    try:
+        mobile_user_percentage = mobile_ranking[True] / (mobile_ranking[True] + mobile_ranking[False])
+    except ZeroDivisionError:
+        mobile_user_percentage = 0.0
+
+    os_ranking = [(c * 100/os_count, n) for n, c in os_ranking.items()]
+    os_ranking.sort()
+    browser_ranking = [(c * 100/browser_count, n) for n, c in browser_ranking.items()]
+    browser_ranking.sort()
+    return os_ranking, browser_ranking, mobile_user_percentage*100
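+
+# e.g. (illustrative): for two Linux/Firefox users and one Android/Chrome mobile user this returns
+# roughly ([(33.3, 'Android'), (66.7, 'Linux')], [(33.3, 'Chrome'), (66.7, 'Firefox')], 33.3)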
+
+#
+# GETTERS
+#
+def get_where_date_str(at_date=None, min_date=None, max_date=None):
+    # dates in unix time
+    s = ""
+    if at_date is not None:
+        if isinstance(at_date, str):
+            s += f"DATE(date, 'unixepoch') = '{sanitize(at_date)}' AND "
+        elif isinstance(at_date, int|float):
+            s += f"date = {int(at_date)} AND "
+        else:
+            warning(f"get_where_date_str: Invalid type of argument at_date: {type(at_date)}")
+    if min_date is not None:
+        if isinstance(min_date, str):
+            s += f"DATE(date, 'unixepoch') >= '{sanitize(min_date)}' AND "
+        elif isinstance(min_date, int|float):
+            s += f"date >= {int(min_date)} AND "
+        else:
+            warning(f"get_where_date_str: Invalid type of argument min_date: {type(min_date)}")
+    if max_date is not None:
+        if isinstance(max_date, str):
+            s += f"DATE(date, 'unixepoch') <= '{sanitize(max_date)}' AND "
+        elif isinstance(max_date, int|float):
+            s += f"date <= {int(max_date)} AND "
+        else:
+            warning(f"get_where_date_str: Invalid type of argument max_date: {type(max_date)}")
+    if s == "":
+        warning(f"get_where_date_str: no date_str generated. Returning 'date > 0'. at_date={at_date}, min_date={min_date}, max_date={max_date}")
+        return "date > 0"
+    return s.removesuffix(" AND ")
+
+
+# get the earliest date
+def get_earliest_date(cur: sql.Cursor) -> int:
+    """return the earliest time as unixepoch"""
+    cur.execute(f"SELECT MIN(date) FROM {t_request}")
+    return cur.fetchone()[0]
+# get the latest date
+def get_latest_date(cur: sql.Cursor) -> int:
+    """return the latest time as unixepoch"""
+    cur.execute(f"SELECT MAX(date) FROM {t_request}")
+    return cur.fetchone()[0]
+# get all dates
+# the date:str parameter in all these functions must be a sqlite constraint
+def get_days(cur: sql.Cursor, date:str) -> list[str]:
+    """get a list of all dates in yyyy-mm-dd format"""
+    cur.execute(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}")
+    days = [ date_[0] for date_ in cur.fetchall() ]  # fetchall returns tuples (date, )
+    days.sort()
+    return days
+
+def get_months(cur: sql.Cursor, date:str) -> list[str]:
+    """get a list of all dates in yyyy-mm format"""
+    dates = get_days(cur, date)
+    date_dict = {}
+    for date_ in dates:
+        date_without_day = date_[0:date_.rfind('-')]
+        date_dict[date_without_day] = 0
+    return list(date_dict.keys())
+
+
+def get_user_agent(cur: sql.Cursor, user_id: int):
+    return sql_select(cur, t_user, [("user_id", user_id)])[0][2]
+
+def get_unique_user_ids_for_date(cur: sql.Cursor, date:str) -> list[int]:
+    cur.execute(f"SELECT DISTINCT user_id FROM {t_request} WHERE {date}")
+    return [ user_id[0] for user_id in cur.fetchall() ]
+
+def get_human_users(cur: sql.Cursor, unique_user_ids, unique_user_ids_human: list):
+    """
+    append all users from unique_user_ids whose is_human flag is set to unique_user_ids_human
+    """
+    for user_id in unique_user_ids:
+        cur.execute(f"SELECT is_human FROM {t_user} WHERE user_id = {user_id}")
+        # skip users that are not human
+        if cur.fetchone()[0] == 0:
+            continue
+        unique_user_ids_human.append(user_id)
+    # pdebug("get_human_users:", unique_user_ids_human)
+
+def get_unique_request_ids_for_date(cur: sql.Cursor, date:str):
+    cur.execute(f"SELECT DISTINCT request_id FROM {t_request} WHERE {date}")
+    return [ request_id[0] for request_id in cur.fetchall() ]
+
+def get_unique_request_ids_for_date_and_user(cur: sql.Cursor, date:str, user_id: int, unique_request_ids_human: list):
+    cur.execute(f"SELECT DISTINCT request_id FROM {t_request} WHERE {date} AND user_id = {user_id}")
+    # all unique requests for user_id
+    for request_id in cur.fetchall():
+        unique_request_ids_human.append(request_id[0])
+
+# get number of requests per day
+def get_request_count_for_date(cur: sql.Cursor, date:str) -> int:
+    cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE {date}")
+    return cur.fetchone()[0]
+
+def get_unique_user_count(cur: sql.Cursor) -> int:
+    return sql_tablesize(cur, t_user)
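+
+# the date:str parameters above are sqlite constraints built by get_where_date_str, e.g. (illustrative):
+#   get_where_date_str(min_date=0) -> "date >= 0"
+#   get_where_date_str(at_date='2022-11-20') -> "DATE(date, 'unixepoch') = '2022-11-20'"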
settings["file_ranking_regex_whitelist"]: # if file in whitelist + if not fullmatch(settings["file_ranking_regex_whitelist"], filename): + pdebug(f"get_file_ranking: file with group_id {group_id} is not in whitelist") + continue + if settings["file_ranking_ignore_error_files"]: # if request to file was successful + success = False + cur.execute(f"SELECT status FROM {t_request} WHERE group_id = {group_id}") + for status in cur.fetchall(): + if valid_status(status[0]): + pdebug(f"get_file_ranking: success code {status[0]} for file with group_id {group_id} and groupname {filename}") + success = True + break + if not success: + pdebug(f"get_file_ranking: file with group_id {group_id} and groupname {filename} has only requests resulting in error") + continue + + + # ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename)) + cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE group_id = {group_id} AND {date}") + ranking.append((cur.fetchone()[0], filename)) + ranking.sort() + # print(ranking) + return ranking + +def get_user_agent_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]: + """ + :returns [(request_count, user_agent)] + """ + ranking = [] + cur.execute(f"SELECT DISTINCT user_id FROM {t_request} WHERE {date}") + for user_id in cur.fetchall(): + user_id = user_id[0] + user_agent = sql_select(cur, t_user, [("user_id", user_id)]) + if len(user_agent) == 0: continue + user_agent = user_agent[0][2] + if settings["user_agent_ranking_regex_whitelist"]: + if not fullmatch(settings["user_agent_ranking_regex_whitelist"], user_agent): + continue + # ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename)) + cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE user_id = {user_id} AND {date}") + ranking.append((cur.fetchone()[0], user_agent)) + ranking.sort() + # print(ranking) + return ranking + +def get_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Cursor, date:str) -> list[tuple[int, str]]: + """ + 1) get all the distinct entries for field_name after min_date_unix_time + 2) call get_name_function with the distinct entry + 3) for every entry, get the count in table after min_date_unix_time + 3) sort by count in ascending order + :returns [(request_count, name)] + """ + ranking = [] + cur.execute(f"SELECT DISTINCT {field_name} FROM {table} WHERE {date}") + for name in cur.fetchall(): + name = name[0] + if whitelist_regex: + if not fullmatch(whitelist_regex, name): + continue + # ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename)) + cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {field_name} = '{name}' AND {date}") + ranking.append((cur.fetchone()[0], name)) + ranking.sort() + # print(ranking) + return ranking + + +# +# PLOTTING +# +# add value labels +def add_vertikal_labels_in_bar_plot(labels, max_y_val, ax, bar_plot): + # pdebug("add_vertikal_labels_in_bar_plot:", labels) + for idx,rect in enumerate(bar_plot): + height = rect.get_height() + if height > 0.6 * max_y_val: # if the bar is large, put label in the bar + height = 0.05 * max_y_val + ax.text(rect.get_x() + rect.get_width()/2., height + 0.025 * max_y_val, + labels[idx], + ha='center', va='bottom', rotation=90) +# add count labels +def add_labels_at_top_of_bar(xdata, ydata, max_y_val, ax, bar_plot): + # pdebug("add_labels_at_top_of_bar:", xdata, ydata) + y_offset = 0.05 * max_y_val + for idx,rect in enumerate(bar_plot): + ax.text(rect.get_x() + rect.get_width()/2, ydata[idx] - y_offset, round(ydata[idx], 1), 
+
+
+#
+# PLOTTING
+#
+# add value labels
+def add_vertical_labels_in_bar_plot(labels, max_y_val, ax, bar_plot):
+    for idx, rect in enumerate(bar_plot):
+        height = rect.get_height()
+        if height > 0.6 * max_y_val:  # if the bar is large, put the label inside the bar
+            height = 0.05 * max_y_val
+        ax.text(rect.get_x() + rect.get_width()/2., height + 0.025 * max_y_val,
+                labels[idx],
+                ha='center', va='bottom', rotation=90)
+# add count labels
+def add_labels_at_top_of_bar(xdata, ydata, max_y_val, ax, bar_plot):
+    y_offset = 0.05 * max_y_val
+    for idx, rect in enumerate(bar_plot):
+        ax.text(rect.get_x() + rect.get_width()/2, ydata[idx] - y_offset, round(ydata[idx], 1), ha='center', bbox=dict(facecolor='white', alpha=0.8))
+
+def plot_ranking(ranking: list[tuple[int, str]], fig=None, xlabel="", ylabel="", color_settings:dict|list=[]):
+    """
+    make a bar plot of the most requested files
+    """
+    if not fig:
+        fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
+    # create new axis if none is given
+    ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel)
+    # fill x y data
+    if len(ranking) > settings["file_ranking_plot_max_files"]:
+        start_index = len(ranking) - settings["file_ranking_plot_max_files"]
+    else: start_index = 0
+    x_names = []
+    y_counts = []
+    colors = []
+    for i in range(start_index, len(ranking)):
+        x_names.append(ranking[i][1])
+        y_counts.append(ranking[i][0])
+        ft = ranking[i][1].split(".")[-1]
+        color = palette["blue"]
+        if isinstance(color_settings, dict):
+            for key, val in color_settings.items():
+                if ft in val: color = key
+        elif isinstance(color_settings, list):
+            color = color_settings[(i - start_index) % len(color_settings)]
+        colors.append(color)
+    bar = ax.bar(x_names, y_counts, tick_label="", color=colors)
+
+    if len(y_counts) > 0:
+        add_vertical_labels_in_bar_plot(x_names, y_counts[-1], ax, bar)
+        if settings["plot_add_count_label"]: add_labels_at_top_of_bar(x_names, y_counts, y_counts[-1], ax, bar)
+    return fig
+
+
+def plot(xdata, ydata, fig=None, ax=None, xlabel="", ylabel="", label="", linestyle='-', marker="", color="blue"):
+    if not fig:
+        fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
+    if not ax:
+        ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel)
+    else:
+        ax = ax.twinx()
+        ax.set_ylabel(ylabel)
+    ax.plot(xdata, ydata, marker=marker, label=label, linestyle=linestyle, color=color)
+    if label: ax.legend()
+    return fig, ax
+
+def plot2y(xdata, ydata1, ydata2, fig=None, ax1=None, ax2=None, plots=None, xlabel="", ylabel1="", ylabel2="", label1="", label2="", linestyle='-', marker="", color1="blue", color2="orange", grid="major"):
+    if not fig:
+        fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
+    if not (ax1 and ax2):
+        ax1 = fig.add_subplot(xlabel=xlabel, ylabel=ylabel1)
+        ax2 = ax1.twinx()
+        ax2.set_ylabel(ylabel2)
+    plot1 = ax1.plot(xdata, ydata1, marker=marker, label=label1, linestyle=linestyle, color=color1)
+    plot2 = ax2.plot(xdata, ydata2, marker=marker, label=label2, linestyle=linestyle, color=color2)
+    if plots: plots += plot1 + plot2
+    else: plots = plot1 + plot2
+    plt.legend(plots, [ l.get_label() for l in plots ])
+
+    if grid in ("major", "minor", "both"):
+        if grid in ("minor", "both"):
+            ax1.minorticks_on()
+        ax1.grid(visible=True, which=grid, linestyle="-", color="#888")
+    return fig, ax1, ax2, plots
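+
+# usage sketch (illustrative data): two y-axes on one figure,
+#   fig, ax1, ax2, plots = plot2y(["2022-11-19", "2022-11-20"], [2, 3], [10, 12], xlabel="Date", ylabel1="Users", ylabel2="Requests")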
+
+#
+# MAIN
+#
+
+def visualize(loaded_settings: dict):
+    pdebug("visualizing...")
+    global settings
+    settings = loaded_settings
+    if not settings["db"]: missing_arg("db")
+    if not settings["server_name"]: missing_arg("server_name")
+
+    img_dir = settings["img_dir"]
+    img_filetype = settings["img_filetype"]
+    img_location = settings["img_location"]
+    names = {
+        # paths
+        "img_file_ranking_last_x_days": f"ranking_all_time_files_last_x_days.{img_filetype}",
+        "img_referer_ranking_last_x_days": f"ranking_all_time_referers_last_x_days.{img_filetype}",
+        "img_browser_ranking_last_x_days": f"ranking_all_time_browsers_last_x_days.{img_filetype}",
+        "img_operating_system_ranking_last_x_days": f"ranking_all_time_operating_systems_last_x_days.{img_filetype}",
+        "img_users_and_requests_last_x_days": f"user_request_count_daily_last_x_days.{img_filetype}",
+
+        "img_file_ranking_total": f"ranking_all_time_files_total.{img_filetype}",
+        "img_referer_ranking_total": f"ranking_all_time_referers_total.{img_filetype}",
+        "img_browser_ranking_total": f"ranking_all_time_browsers_total.{img_filetype}",
+        "img_operating_system_ranking_total": f"ranking_all_time_operating_systems_total.{img_filetype}",
+        "img_users_and_requests_total": f"user_request_count_daily_total.{img_filetype}",
+        # values
+        "mobile_user_percentage_total": 0.0,
+        "mobile_user_percentage_last_x_days": 0.0,
+        "user_count_last_x_days": 0,
+        "user_count_total": 0,
+        "request_count_last_x_days": 0,
+        "request_count_total": 0,
+        "human_user_percentage_last_x_days": 0.0,
+        "human_user_percentage_total": 0.0,
+        "human_request_percentage_last_x_days": 0.0,
+        "human_request_percentage_total": 0.0,
+        # general
+        "regina_version": settings["version"],
+        "server_name": settings["server_name"],
+        "last_x_days": settings["last_x_days"],  # must come after all other entries containing last_x_days!
+        "earliest_date": "1990-1-1",
+        "generation_date": "1990-1-1 0:0:0",
+    }
+
+    conn = sql.connect(settings["db"])
+    if isdir(img_dir) and img_filetype:
+        gen_img = True
+    else:
+        warning(f"Not generating images since at least one required variable is invalid: img_dir='{img_dir}', img_filetype='{img_filetype}'")
+        gen_img = False
+    cur = conn.cursor()
+
+    get_humans = settings["get_human_percentage"]
+    # DATE STRINGS
+    names["earliest_date"] = dt.fromtimestamp(get_earliest_date(cur)).strftime("%Y-%m-%d")
+    names["generation_date"] = dt.now().strftime("%Y-%m-%d %H:%M:%S")
+    # LAST_X_DAYS
+    # last_x_days_min_date: latest_date - last_x_days
+    secs_per_day = 86400
+    last_x_days_min_date = get_latest_date(cur) - settings["last_x_days"] * secs_per_day
+    last_x_days_str = get_where_date_str(min_date=last_x_days_min_date)
+    days = get_days(cur, last_x_days_str)
+    days_strs = [get_where_date_str(at_date=day) for day in days]
+
+    # ALL DATES
+    all_time_str = get_where_date_str(min_date=0)
+    # all months in yyyy-mm format
+    months_all_time = get_months(cur, all_time_str)
+    # sqlite constraint for each month
+    months_strs = []
+    for year_month in months_all_time:
+        year, month = year_month.split("-")
+        # first day of the month
+        min_date = dt(int(year), int(month), 1).timestamp()
+        month = (int(month) % 12) + 1  # + 1 month
+        year = int(year)
+        if month == 1: year += 1
+        # first day of the next month - 1 sec
+        max_date = dt(year, month, 1).timestamp() - 1
+        months_strs.append(get_where_date_str(min_date=min_date, max_date=max_date))
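+
+    # the loop below runs twice: pass 0 generates the all-time statistics (one data point per month),
+    # pass 1 the last_x_days statistics (one data point per day)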
+    for i in range(2):
+        suffix = ["_total", "_last_x_days"][i]
+        date_str = [all_time_str, last_x_days_str][i]
+        date_names = [months_all_time, days][i]
+        date_strs = [months_strs, days_strs][i]
+        assert len(date_names) == len(date_strs)
+
+        # FILES
+        file_ranking = get_file_ranking(cur, date_str)
+        if gen_img:
+            fig_file_ranking = plot_ranking(file_ranking, xlabel="Filename/Filegroup", ylabel="Number of requests", color_settings=color_settings_filetypes)
+            fig_file_ranking.savefig(f"{img_dir}/{names[f'img_file_ranking{suffix}']}")
+
+        # REFERER
+        referer_ranking = get_ranking("referer", t_request, settings["referer_ranking_regex_whitelist"], cur, date_str)
+        if gen_img:
+            fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate)
+            fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_referer_ranking{suffix}']}")
+
+        # USER
+        # user_agent_ranking = get_user_agent_ranking(cur, date_str)
+        # for the whole time span
+        unique_user_ids = get_unique_user_ids_for_date(cur, date_str)
+        unique_user_ids_human = []
+        get_human_users(cur, unique_user_ids, unique_user_ids_human)
+        # for each date
+        date_count = len(date_strs)
+        unique_user_ids_dates: list[list[int]] = []
+        unique_request_ids_dates: list[list[int]] = []
+        unique_user_ids_human_dates: list[list[int]] = [[] for _ in range(date_count)]
+        unique_request_ids_human_dates: list[list[int]] = [[] for _ in range(date_count)]
+        for j in range(date_count):
+            date_str_ = date_strs[j]
+            unique_user_ids_dates.append(get_unique_user_ids_for_date(cur, date_str_))
+            unique_request_ids_dates.append(get_unique_request_ids_for_date(cur, date_str_))
+            if get_humans:
+                get_human_users(cur, unique_user_ids_dates[j], unique_user_ids_human_dates[j])
+                for human in unique_user_ids_human_dates[j]:
+                    get_unique_request_ids_for_date_and_user(cur, date_str_, human, unique_request_ids_human_dates[j])
+        # pdebug(f"human_user_percentage: len_list_list(user_ids)={len_list_list(unique_user_ids_dates)}, len_list_list(user_ids_human)={len_list_list(unique_user_ids_human_dates)}")
+        if get_humans:
+            try:
+                names[f"human_user_percentage{suffix}"] = round(100 * len_list_list(unique_user_ids_human_dates) / len_list_list(unique_user_ids_dates), 2)
+            except ZeroDivisionError:
+                names[f"human_user_percentage{suffix}"] = -1.0
+            try:
+                names[f"human_request_percentage{suffix}"] = round(100 * len_list_list(unique_request_ids_human_dates) / len_list_list(unique_request_ids_dates), 2)
+            except ZeroDivisionError:
+                names[f"human_request_percentage{suffix}"] = -1.0
+        names[f"user_count{suffix}"] = len_list_list(unique_user_ids_dates)
+        names[f"request_count{suffix}"] = len_list_list(unique_request_ids_dates)
+        if gen_img:
+            fig_daily, ax1, ax2, plots = plot2y(date_names, [len(user_ids) for user_ids in unique_user_ids_dates], [len(request_ids) for request_ids in unique_request_ids_dates], xlabel="Date", ylabel1="User count", label1="Unique users", ylabel2="Request count", label2="Unique requests", color1=palette["red"], color2=palette["blue"])
+            if get_humans:
+                fig_daily, ax1, ax2, plots = plot2y(date_names, [len(user_ids) for user_ids in unique_user_ids_human_dates], [len(request_ids) for request_ids in unique_request_ids_human_dates], label1="Unique users (human)", label2="Unique requests (human)", color1=palette["orange"], color2=palette["green"], fig=fig_daily, ax1=ax1, ax2=ax2, plots=plots)
+            fig_daily.savefig(f"{img_dir}/{names[f'img_users_and_requests{suffix}']}")
+
+        # OS & BROWSER
+        os_ranking, browser_ranking, names[f"mobile_user_percentage{suffix}"] = get_os_browser_mobile_rankings(cur, unique_user_ids_human)
+        if gen_img:
+            fig_os_rating = plot_ranking(os_ranking, xlabel="Platform", ylabel="Share [%]", color_settings=color_settings_operating_systems)
+            fig_os_rating.savefig(f"{img_dir}/{names[f'img_operating_system_ranking{suffix}']}")
+            fig_browser_rating = plot_ranking(browser_ranking, xlabel="Browsers", ylabel="Share [%]", color_settings=color_settings_browsers)
+            fig_browser_rating.savefig(f"{img_dir}/{names[f'img_browser_ranking{suffix}']}")
+
+    if settings["template_html"] and settings["html_out_path"]:
+        pdebug(f"visualize: writing to html: {settings['html_out_path']}")
+        with open(settings["template_html"], "r") as file:
+            html = file.read()
+        for name, value in names.items():
+            if "img" in name:
+                value = f"{img_location}/{value}"
+            html = html.replace(f"%{name}", str(value))
+        with open(settings["html_out_path"], "w") as file:
+            file.write(html)
+    else:
+        warning(f"Skipping html generation because either template_html or html_out_path is invalid: template_html='{settings['template_html']}', html_out_path='{settings['html_out_path']}'")
diff --git a/regina/main.py b/regina/main.py
new file mode 100644
index 0000000..810e5cd
--- /dev/null
+++ b/regina/main.py
@@ -0,0 +1,95 @@
+# from sys import path
+# print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}")
+# __package__="."
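+# usage sketch (paths are placeholders; the config file supplies db, access_log, server_name, ...):
+#   python3 regina/main.py --config /home/my_user/analytics/regina.conf --collect
+#   python3 regina/main.py --config /home/my_user/analytics/regina.conf --visualize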
+from sys import argv, exit
+from os.path import isfile
+from db_operation.visualize import visualize
+from utility.settings_manager import read_settings_file
+from db_operation.collect import parse_log, add_requests_to_db
+from db_operation.database import create_db
+from utility.globals import settings, version
+
+"""
+start regina, launch either collect or visualize
+"""
+
+
+def help():
+    helpstring = """Command line options:
+    --server-name string
+    --log-file path to the access.log
+    --db name of the database
+    --filegroups string describing the filegroups, eg 'name1: file1, file2; name2: file3, file4, file5;'
+    --auto-group-filetypes comma separated list of filetypes, eg 'css,png,gif'
+    --locs-and-dirs comma separated list of nginx_location:directory pairs, eg '/:/www/website'
+    --config path to a config file that specifies all the other parameters: param = value, where value has the same formatting as on the command line
+    """
+    print(helpstring)
+
+def missing_arg_val(arg):
+    print("Missing argument for", arg)
+    exit(1)
+
+def missing_arg(arg):
+    print("Missing", arg)
+    exit(1)
+
+def error(arg):
+    print("Error:", arg)
+    exit(1)
+
+def main():
+    config_file = ""
+    collect = False
+    visualize_ = False
+    log_file = ""
+    # parse args
+    i = 1
+    while i in range(1, len(argv)):
+        if argv[i] == "--config":
+            if len(argv) > i + 1: config_file = argv[i+1]
+            else: missing_arg_val(argv[i])
+        elif argv[i] == "--log-file":
+            if len(argv) > i + 1: log_file = argv[i+1]
+            else: missing_arg_val(argv[i])
+        elif argv[i] == "--help":
+            help()
+            exit(0)
+        elif argv[i] == "--collect":
+            collect = True
+        elif argv[i] == "--visualize":
+            visualize_ = True
+        else:
+            pass
+        i += 1
+    if not collect and not visualize_:
+        missing_arg("--visualize or --collect")
+
+    if not config_file:
+        missing_arg("--config")
+    if not isfile(config_file):
+        error(f"Not a file: '{config_file}'")
+    read_settings_file(config_file, settings)
+    settings["version"] = version
+    if log_file: settings["access_log"] = log_file
+
+    print(f"regina version {version} with server_name '{settings['server_name']}' and database '{settings['db']}'")
+
+    if not settings["server_name"]: missing_arg("server_name")
+    if not settings["access_log"]: missing_arg("access_log")
+    if not settings["db"]: missing_arg("db")
+    if isinstance(settings["auto_group_filetypes"], str):
+        settings["auto_group_filetypes"] = settings["auto_group_filetypes"].split(",")
+    if isinstance(settings["locs_and_dirs"], str):
+        settings["locs_and_dirs"] = [ loc_and_dir.split(":") for loc_and_dir in settings["locs_and_dirs"].split(",") ]
+    if collect:
+        if not isfile(settings["db"]):
+            create_db(settings["db"], settings["filegroups"], settings["locs_and_dirs"], settings["auto_group_filetypes"])
+        requests = parse_log(settings["access_log"])
+        add_requests_to_db(requests, settings["db"])
+    if visualize_:
+        if not isfile(settings["db"]): error(f"Invalid database path: '{settings['db']}'")
+        visualize(settings)
+
+if __name__ == '__main__':
+    main()
diff --git a/regina/utility/__init__.py b/regina/utility/__init__.py
new file mode 100644
index 0000000..f36e6b3
--- /dev/null
+++ b/regina/utility/__init__.py
@@ -0,0 +1 @@
+"""Utility for regina"""
diff --git a/regina/utility/globals.py b/regina/utility/globals.py
new file mode 100644
index 0000000..14a2165
--- /dev/null
+++ b/regina/utility/globals.py
@@ -0,0 +1,40 @@
+"""global variables for regina"""
+
+version = "1.0"
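+
+# a config file line like `last_x_days = 60` overrides the matching default below
+# (parsed by utility/settings_manager.py:read_settings_file)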
+ "server_name": "", + # DATA COLLECTION + "access_log": "", + "db": "", + "locs_and_dirs": [], + "auto_group_filetypes": [], + "filegroups": "", + + # VISUALIZATION + "get_human_percentage": False, + "human_needs_success": True, # a human must have at least 1 successful request (status < 300) + "status_300_is_success": False, # 300 codes are success + # "file_ranking_regex_whitelist": r".*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))", + "file_ranking_regex_whitelist": r".*\.(html)", + "file_ranking_ignore_error_files": False, # skip files that only had unsuccessful requests (status < 300) + "referer_ranking_regex_whitelist": r"^[^\-].*", # minus means empty + "user_agent_ranking_regex_whitelist": r"", + "file_ranking_plot_max_files": 15, + # "plot_figsize": (60, 40), + "plot_dpi": 300, + "plot_add_count_label": True, + "img_dir": "", + "img_location": "", + "img_filetype": "svg", + "template_html": "", + "html_out_path": "", + "last_x_days": 30, +} + +# these oses and browser can be detected: +# lower element takes precedence +user_agent_operating_systems = ["Windows", "Android", "Linux", "iPhone", "iPad", "Mac", "BSD"] +user_agent_browsers = ["Firefox", "DuckDuckGo", "SeaMonkey", "Vivaldi", "Yandex", "Brave", "SamsungBrowser", "Lynx", "Epiphany", "Chromium", "Chrome", "Safari", "Opera", "Edge"] diff --git a/regina/utility/settings_manager.py b/regina/utility/settings_manager.py new file mode 100644 index 0000000..cb821d6 --- /dev/null +++ b/regina/utility/settings_manager.py @@ -0,0 +1,33 @@ + +def get_bool(bool_str: str, fallback=False): + if bool_str in ["true", "True"]: return True + elif bool_str in ["false", "False"]: return False + return fallback + +def read_settings_file(filepath: str, settings:dict, ignore_invalid_lines=True, allow_new_keys=False, convert_to_type=True): + lines = [] + with open(filepath, "r") as file: + lines = file.readlines() + + for i in range(len(lines)): + line = lines[i].strip("\n ") + if line.startswith("#"): continue + vals = line.split("=") + if not len(vals) == 2: + if ignore_invalid_lines: continue + else: raise KeyError(f"Invalid line: '{line}'") + vals[0] = vals[0].strip(" ") + if not allow_new_keys and vals[0] not in settings.keys(): + if ignore_invalid_lines: continue + else: raise KeyError(f"Invalid key: '{vals[0]}'") + if convert_to_type and not isinstance(settings[vals[0]], str|list|None): + if isinstance(settings[vals[0]], bool): + settings[vals[0]] = get_bool(vals[1].strip(" "), fallback=settings[vals[0]]) + continue + try: + settings[vals[0]] = type(settings[vals[0]])(vals[1].strip(" ")) + except Exception as e: + if not ignore_invalid_lines: raise e + else: continue + else: + settings[vals[0]] = vals[1].strip(" ") diff --git a/regina/utility/sql_util.py b/regina/utility/sql_util.py new file mode 100644 index 0000000..2e3f9a8 --- /dev/null +++ b/regina/utility/sql_util.py @@ -0,0 +1,40 @@ +import sqlite3 as sql +"""Various utilities""" +def sanitize(s): + if type(s) != str: return s + return s\ + .replace("''", "'").replace("'", r"''").strip(" ") + # .replace('"', r'\"')\ + +def sql_get_constaint_str(constraints: list[tuple[str, str|int]], logic="AND") -> str: + c_str = "" + for name, val in constraints: + c_str += f"{name} = '{sanitize(val)}' {logic} " + return c_str.strip(logic + " ") + +def sql_get_value_str(values: list[list]) -> str: + c_str = "" + for params in values: + c_str += "(" + for p in params: c_str += f"'{sanitize(p)}', " + c_str = c_str.strip(", ") + "), " + return c_str.strip(", ") + +def sql_exists(cur: 
+
+def sql_exists(cur: sql.Cursor, table: str, constraints: list[tuple[str, str|int]], logic="AND") -> bool:
+    cur.execute(f"SELECT EXISTS (SELECT 1 FROM {table} WHERE {sql_get_constraint_str(constraints, logic)})")
+    return cur.fetchone()[0] == 1
+
+def sql_select(cur: sql.Cursor, table: str, constraints: list[tuple[str, str|int]], logic="AND"):
+    cur.execute(f"SELECT * FROM {table} WHERE {sql_get_constraint_str(constraints, logic)}")
+    return cur.fetchall()
+
+def sql_insert(cur: sql.Cursor, table: str, values: list[list]):
+    cur.execute(f"INSERT INTO {table} VALUES {sql_get_value_str(values)}")
+
+def sql_tablesize(cur: sql.Cursor, table: str) -> int:
+    cur.execute(f"SELECT Count(*) FROM {table}")
+    return cur.fetchone()[0]
+
+def sql_get_count_where(cur: sql.Cursor, table, constraints) -> int:
+    cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {sql_get_constraint_str(constraints)}")
+    return cur.fetchone()[0]
diff --git a/regina/utility/utility.py b/regina/utility/utility.py
new file mode 100644
index 0000000..42a4299
--- /dev/null
+++ b/regina/utility/utility.py
@@ -0,0 +1,27 @@
+# from sys import path
+# print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}")
+from sys import exit
+
+"""
+Various utilities
+"""
+
+DEBUG = False
+def pdebug(*args):
+    if DEBUG: print(*args)
+
+def warning(*w):
+    print("Warning:", *w)
+
+def error(*arg):
+    print("Error:", *arg)
+    exit(1)
+
+def missing_arg_val(arg):
+    print("Missing argument for", arg)
+    exit(1)
+
+def missing_arg(arg):
+    print("Missing", arg)
+    exit(1)
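+
+# illustrative smoke test (run as: python3 regina/utility/utility.py);
+# DEBUG is toggled here only for the demonstration
+if __name__ == '__main__':
+    DEBUG = True
+    pdebug("pdebug: visible because DEBUG is True")
+    warning("warning: always visible")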