From 5064df1f0ecb8cd62a830e4ab7204ee36c4d7bf1 Mon Sep 17 00:00:00 2001 From: "matthias@arch" Date: Wed, 23 Nov 2022 20:02:19 +0100 Subject: [PATCH] wip --- server_datenbank.uxf => database.uxf | 41 ++-- default.conf | 2 +- regina/collect.py | 36 ++- regina/database.py | 9 +- regina/{main.py => regina.py} | 12 +- regina/settings_manager.py | 4 +- regina/sql_util.py | 3 +- regina/template.html | 39 ---- regina/visualize.py | 332 ++++++++++++++++----------- template.html | 69 ++++++ 10 files changed, 343 insertions(+), 204 deletions(-) rename server_datenbank.uxf => database.uxf (88%) rename regina/{main.py => regina.py} (92%) delete mode 100644 regina/template.html create mode 100644 template.html diff --git a/server_datenbank.uxf b/database.uxf similarity index 88% rename from server_datenbank.uxf rename to database.uxf index 6895493..20b6024 100644 --- a/server_datenbank.uxf +++ b/database.uxf @@ -4,10 +4,10 @@ UMLClass - 38 - 323 + 133 + 247 342 - 190 + 266 User -- @@ -15,14 +15,17 @@ -- - ip address - user agent string +- platform +- browser +- mobile style=autoresize UMLClass - 1064 - 323 + 1159 + 247 247 152 @@ -37,8 +40,8 @@ style=autoresize Relation - 874 - 323 + 969 + 247 228 95 @@ -51,8 +54,8 @@ m2=1 UMLClass - 608 - 304 + 703 + 228 285 285 @@ -72,8 +75,8 @@ style=autoresize Relation - 361 - 323 + 456 + 247 285 95 @@ -86,8 +89,8 @@ m2=n UMLClass - 1064 - 722 + 1159 + 646 190 152 @@ -103,8 +106,8 @@ style=autoresize Relation - 1121 - 456 + 1216 + 380 76 304 @@ -117,8 +120,8 @@ m2=1 UMLNote - 779 - 95 + 874 + 19 570 133 @@ -131,8 +134,8 @@ style=autoresize Relation - 1083 - 209 + 1178 + 133 57 152 diff --git a/default.conf b/default.conf index 080b903..e118eb7 100644 --- a/default.conf +++ b/default.conf @@ -20,4 +20,4 @@ plot_dpi = 300 img_dir = /www/analytics/images template_html = /home/my-user/analytics/template.html html_out_path = /www/analytics/statistics.html -# filegroups = +# filegroups = 
start:/index.html,/about.html,/img_on_index.png;music:/music.html,song.mp3 diff --git a/regina/collect.py b/regina/collect.py index 6b35441..774e4aa 100644 --- a/regina/collect.py +++ b/regina/collect.py @@ -1,9 +1,13 @@ import sqlite3 as sql from re import match from time import mktime -from datetime import datetime as dt from database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup from sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize +from datetime import datetime as dt + +""" +collect information from the access log and put it into the database +""" DEBUG = True def pdebug(*args): @@ -15,6 +19,12 @@ def warning(w): months = ["Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aut", "Sep", "Oct", "Nov", "Dez"] +# these oses and browser can be detected: +# lower element takes precedence +user_agent_operating_systems = ["Windows", "Android", "Linux", "iPhone", "iPad", "Mac", "BSD"] +user_agent_browsers = ["Firefox", "DuckDuckGo", "SeaMonkey", "Vivaldi", "Yandex", "Brave", "SamsungBrowser", "Lynx", "Epiphany", "Chromium", "Chrome", "Safari", "Opera", "Edge"] + + class Request: def __init__(self, ip_address="", time_local="", request_type="", request_file="", request_protocol="", status="", bytes_sent="", referer="", user_agent=""): self.ip_address = sanitize(ip_address) @@ -39,9 +49,6 @@ class Request: self.referer = sanitize(referer) self.user_agent = sanitize(user_agent) - def insert_user_sql_str(self, user_id, user_table="user"): - return f"INSERT INTO {user_table} (user_id, ip_address, user_agent) VALUES ({user_id}, '{self.ip_address}', '{self.user_agent}');" - def __repr__(self): return f"{self.ip_address} - {self.time_local} - {self.request_file} - {self.user_agent} - {self.status}" @@ -88,9 +95,28 @@ def get_user_id(request: Request, cursor: sql.Cursor) -> int: # new user_id is number of elements user_id: int = sql_tablesize(cursor, t_user) pdebug("new user:", user_id, request.ip_address) - 
cursor.execute(request.insert_user_sql_str(user_id)) + platform, browser, mobile = get_os_browser_pairs_from_agent(request.user_agent) + cursor.execute(f"INSERT INTO {t_user} (user_id, ip_address, user_agent, platform, browser, mobile) VALUES ({user_id}, '{request.ip_address}', '{request.user_agent}', '{platform}', '{browser}', '{int(mobile)}');") return user_id +# re_user_agent = r"(?: ?([\w\- ]+)(?:\/([\w.]+))?(?: \(([^()]*)\))?)" +# 1: platform, 2: version, 3: details +def get_os_browser_pairs_from_agent(user_agent): + # for groups in findall(re_user_agent, user_agent): + operating_system = "" + browser = "" + mobile = "Mobi" in user_agent + for os in user_agent_operating_systems: + if os in user_agent: + operating_system = os + break + for br in user_agent_browsers: + if br in user_agent: + browser = br + break + # if not operating_system or not browser: print(f"Warning: get_os_browser_pairs_from_agent: Could not find all information for agent '{user_agent}', found os: '{operating_system}' and browser: '{browser}'") + return operating_system, browser, mobile + def add_requests_to_db(requests: list[Request], db_name: str): conn = sql.connect(db_name) diff --git a/regina/database.py b/regina/database.py index 08509e0..d93e779 100644 --- a/regina/database.py +++ b/regina/database.py @@ -2,6 +2,11 @@ import sqlite3 as sql from sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize from os import path, listdir + +""" +create reginas database as shown in the uml diagram database.uxf +""" + DEBUG = True def pdebug(*args): if DEBUG: print(*args) @@ -43,7 +48,7 @@ filegroup_id = Entry("group_id", "INTEGER") ip_address_entry = Entry("ip_address", "TEXT") filename_entry = Entry("filename", "TEXT") database_tables = { - t_user: Table(t_user, user_id, [Entry("ip_address", "TEXT"), Entry("user_agent", "TEXT")], [f"UNIQUE({user_id.name})"]), + t_user: Table(t_user, user_id, [Entry("ip_address", "TEXT"), Entry("user_agent", "TEXT"), Entry("platform", 
"TEXT"), Entry("browser", "TEXT"), Entry("mobile", "INTEGER")], [f"UNIQUE({user_id.name})"]), t_file: Table(t_file, filename_entry, [filegroup_id], [f"UNIQUE({filename_entry.name})"]), t_filegroup: Table(t_filegroup, filegroup_id, [Entry("groupname", "TEXT")], [f"UNIQUE({filegroup_id.name})"]), t_request: Table(t_request, request_id, [ @@ -124,12 +129,12 @@ def create_db(name, filegroup_str="", location_and_dirs:list[tuple[str, str]]=[] """ create the name with database_tables """ + print(f"creating database: '{name}'") conn = sql.connect(f"{name}") cursor = conn.cursor() for table in database_tables.values(): cursor.execute(table.create_sql_str()) filegroup_str = filegroup_str.strip("; ") + ";" + get_auto_filegroup_str(location_and_dirs, auto_group_filetypes) - print(filegroup_str) create_filegroups(cursor, filegroup_str) conn.commit() conn.close() diff --git a/regina/main.py b/regina/regina.py similarity index 92% rename from regina/main.py rename to regina/regina.py index 554b281..b690f06 100644 --- a/regina/main.py +++ b/regina/regina.py @@ -6,6 +6,10 @@ from os.path import isfile, isdir from visualize import visualize from settings_manager import read_settings_file +""" +start regina, launch either collect or visualize +""" + version = "1.0" # default settings, these are overwriteable through a config file @@ -75,7 +79,6 @@ if __name__ == '__main__': exit(0) elif argv[i] == "--collect": collect = True - exit(0) elif argv[i] == "--visualize": visualize_ = True else: @@ -90,19 +93,20 @@ if __name__ == '__main__': error(f"Not a file: '{config_file}'") read_settings_file(config_file, settings) settings["version"] = version + print(f"regina version {version} with server-name '{settings['server-name']}' and database '{settings['db']}'") if not settings["server-name"]: missing_arg("server-name") if not settings["access-log"]: missing_arg("log") if not settings["db"]: missing_arg("db") - if type(settings["auto-group-filetypes"]) == str: + if 
isinstance(settings["auto-group-filetypes"], str): settings["auto-group-filetypes"] = settings["auto-group-filetypes"].split(",") - if type(settings["locs-and-dirs"]) == str: + if isinstance(settings["locs-and-dirs"], str): settings["locs-and-dirs"] = [ loc_and_dir.split(":") for loc_and_dir in settings["locs-and-dirs"].split(",") ] if collect: if not isfile(settings["db"]): create_db(settings["db"], settings["filegroups"], settings["locs-and-dirs"], settings["auto-group-filetypes"]) requests = parse_log(settings["access-log"]) add_requests_to_db(requests, settings["db"]) - if visualize: + if visualize_: if not isfile(settings["db"]): error(f"Invalid database path: '{settings['db']}'") visualize(settings) diff --git a/regina/settings_manager.py b/regina/settings_manager.py index 993b76a..cb821d6 100644 --- a/regina/settings_manager.py +++ b/regina/settings_manager.py @@ -20,8 +20,8 @@ def read_settings_file(filepath: str, settings:dict, ignore_invalid_lines=True, if not allow_new_keys and vals[0] not in settings.keys(): if ignore_invalid_lines: continue else: raise KeyError(f"Invalid key: '{vals[0]}'") - if convert_to_type and type(settings[vals[0]]) not in [str, None]: - if type(settings[vals[0]]) == bool: + if convert_to_type and not isinstance(settings[vals[0]], str|list|None): + if isinstance(settings[vals[0]], bool): settings[vals[0]] = get_bool(vals[1].strip(" "), fallback=settings[vals[0]]) continue try: diff --git a/regina/sql_util.py b/regina/sql_util.py index 34b0dc5..2e3f9a8 100644 --- a/regina/sql_util.py +++ b/regina/sql_util.py @@ -1,6 +1,5 @@ import sqlite3 as sql -"""Various sql utilities""" - +"""Various utilities""" def sanitize(s): if type(s) != str: return s return s\ diff --git a/regina/template.html b/regina/template.html deleted file mode 100644 index 60f2a30..0000000 --- a/regina/template.html +++ /dev/null @@ -1,39 +0,0 @@ - - - - - - - - Analytics for %server_name - - - -

Analytics for %server-name

-
-
-

Last %last_x_days days

- Daily Statistics -
    -
  • Unique user count: %total_user_count_x_days, from which %human_user_percentage_x_days% are human
  • -
  • Unique request count: %total_request_count_x_days, from which %human_request_percentage_x_days% came from human users
  • -
-
-
-
-
-

All times

- Operating system ranking - Browser ranking -
    -
  • Mobile user percentage: %mobile_user_percentage
  • -
  • Total user count: %total_user_count, from which %human_user_percentage% are human
  • -
  • Total request count: %total_request_count, from which %human_request_percentage% came from human users
  • -
- File ranking - Referer ranking -
-
-

These analytics were generated by regina %regina_version

- - diff --git a/regina/visualize.py b/regina/visualize.py index cb1b751..ca795c6 100644 --- a/regina/visualize.py +++ b/regina/visualize.py @@ -6,7 +6,9 @@ from re import fullmatch, findall import matplotlib.pyplot as plt import matplotlib as mpl from os.path import isdir +from datetime import datetime as dt """ +visualize information from the databse TODO: - bei referrers ähnliche zusammenlegen, z.b. www.google.de und https://google.com """ @@ -29,10 +31,6 @@ color_settings_filetypes = { } color_settings_alternate = list(palette.values()) -# these oses and browser can be detected: -# lower element takes precedence -user_agent_operating_systems = ["Windows", "Android", "Linux", "iPhone", "iPad", "Mac", "BSD"] -user_agent_browsers = ["Firefox", "DuckDuckGo", "SeaMonkey", "Vivaldi", "Yandex", "Brave", "SamsungBrowser", "Lynx", "Epiphany", "Chromium", "Chrome", "Safari", "Opera", "Edge"] color_settings_browsers = { palette["red"]: ["Safari"], palette["orange"]: ["Firefox"], @@ -61,25 +59,7 @@ def len_list_list(l: list[list]): # # FILTERS # -# re_user_agent = r"(?: ?([\w\- ]+)(?:\/([\w.]+))?(?: \(([^()]*)\))?)" -# 1: platform, 2: version, 3: details -def get_os_browser_pairs_from_agent(user_agent): - # for groups in findall(re_user_agent, user_agent): - operating_system = "" - browser = "" - mobile = "Mobi" in user_agent - for os in user_agent_operating_systems: - if os in user_agent: - operating_system = os - break - for br in user_agent_browsers: - if br in user_agent: - browser = br - break - # if not operating_system or not browser: print(f"Warning: get_os_browser_pairs_from_agent: Could not find all information for agent '{user_agent}', found os: '{operating_system}' and browser: '{browser}'") - return operating_system, browser, mobile - -def get_os_browser_mobile_rankings(user_agent_ranking): +def get_os_browser_mobile_rankings(cur: sql.Cursor, user_ids: list[int]): """ returns [(count, operating_system)], [(count, browser)], mobile_user_percentage """ @@ -88,8 
+68,10 @@ def get_os_browser_mobile_rankings(user_agent_ranking): browser_ranking = {} browser_count = 0.0 mobile_ranking = { True: 0.0, False: 0.0 } - for count, agent in user_agent_ranking: - os, browser, mobile = get_os_browser_pairs_from_agent(agent) + for user_id in user_ids: + cur.execute(f"SELECT platform,browser,mobile FROM {t_user} WHERE user_id = {user_id}") + os, browser, mobile = cur.fetchone() + mobile = bool(mobile) if os: if os in os_ranking: os_ranking[os] += 1 else: os_ranking[os] = 1 @@ -114,40 +96,91 @@ def get_os_browser_mobile_rankings(user_agent_ranking): # # GETTERS # +def get_where_date_str(at_date=None, min_date=None, max_date=None): + # dates in unix time + s = "" + if at_date is not None: + if isinstance(at_date, str): + s += f"DATE(date, 'unixepoch') = '{sanitize(at_date)}' AND " + elif isinstance(at_date, int|float): + s += f"date = {int(at_date)} AND " + else: + print(f"WARNING: get_where_date_str: Invalid type of argument at_date: {type(at_date)}") + if min_date is not None: + if isinstance(min_date, str): + s += f"DATE(date, 'unixepoch') >= '{sanitize(min_date)}' AND " + elif isinstance(min_date, int|float): + s += f"date >= {int(min_date)} AND " + else: + print(f"WARNING: get_where_date_str: Invalid type of argument min_date: {type(min_date)}") + if max_date is not None: + if isinstance(max_date, str): + s += f"DATE(date, 'unixepoch') <= '{sanitize(max_date)}' AND " + elif isinstance(max_date, int|float): + s += f"date <= {int(max_date)} AND " + else: + print(f"WARNING: get_where_date_str: Invalid type of argument max_date: {type(max_date)}") + if s == "": + print(f"WARNING: get_where_date_str: no date_str generated. Returing 'date > 0'. 
at_date={at_date}, min_date={min_date}, max_date={max_date}") + return "date > 0" + return s.removesuffix(" AND ") + + +# get the earliest date +def get_earliest_date(cur: sql.Cursor) -> int: + """return the earliest time as unixepoch""" + cur.execute(f"SELECT MIN(date) FROM {t_request}") + return cur.fetchone()[0] +# get the latest date +def get_latest_date(cur: sql.Cursor) -> int: + """return the latest time as unixepoch""" + cur.execute(f"SELECT MAX(date) FROM {t_request}") + return cur.fetchone()[0] # get all dates -def get_dates(cur: sql.Cursor) -> list[str]: - cur.execute(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request}") +# the date:str parameter in all these function must be a sqlite constraint +def get_days(cur: sql.Cursor, date:str) -> list[str]: + """get a list of all dates in yyyy-mm-dd format""" + cur.execute(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}") return [ date[0] for date in cur.fetchall() ] # fetchall returns tuples (date, ) -def get_unique_user_ids_for_date(cur: sql.Cursor, date:str) -> list[int]: - cur.execute(f"SELECT DISTINCT user_id FROM {t_request} WHERE DATE(date, 'unixepoch') = '{sanitize(date)}'") - return [ user_id[0] for user_id in cur.fetchall() ] +def get_months(cur: sql.Cursor, date:str) -> list[str]: + """get a list of all dates in yyyy-mm format""" + cur.execute(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}") + dates = get_days(cur, date) + date_dict = {} + for date in dates: + date_without_day = date[0:date.rfind('-')] + date_dict[date_without_day] = 0 + return list(date_dict.keys()) + def get_user_agent(cur: sql.Cursor, user_id: int): return sql_select(cur, t_user, [("user_id", user_id)])[0][2] -def get_unique_user_ids_for_date_human(cur: sql.Cursor, date: str): - cur.execute(f"SELECT DISTINCT user_id FROM {t_request} WHERE DATE(date, 'unixepoch') = '{sanitize(date)}'") +def get_unique_user_ids_for_date(cur: sql.Cursor, date:str) -> list[int]: + 
cur.execute(f"SELECT DISTINCT user_id FROM {t_request} WHERE {date}") + return [ user_id[0] for user_id in cur.fetchall() ] + +def get_human_users(cur: sql.Cursor, unique_user_ids): human_user_ids = [] - for user_id in cur.fetchall(): - user_agent = get_user_agent(cur, user_id[0]) - os, browser, mobile = get_os_browser_pairs_from_agent(user_agent) - # print("get_unique_user_ids_for_date", user_id[0], os, browser, user_agent) - if os and browser: - human_user_ids.append(user_id[0]) + for user_id in unique_user_ids: + cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_user} WHERE user_id = {user_id} AND platform IS NOT NULL AND browser IS NOT NULL)") + if cur.fetchone()[0] == 1: + human_user_ids.append(user_id) return human_user_ids def get_unique_request_ids_for_date(cur: sql.Cursor, date:str) -> list[int]: - cur.execute(f"SELECT DISTINCT request_id FROM {t_request} WHERE DATE(date, 'unixepoch') = '{sanitize(date)}'") + cur.execute(f"SELECT DISTINCT request_id FROM {t_request} WHERE {date}") return [ request_id[0] for request_id in cur.fetchall() ] def get_unique_request_ids_for_date_and_user(cur: sql.Cursor, date:str, user_id: int) -> list[int]: - cur.execute(f"SELECT DISTINCT request_id FROM {t_request} WHERE DATE(date, 'unixepoch') = '{sanitize(date)}' AND user_id = {user_id}") + cur.execute(f"SELECT DISTINCT request_id FROM {t_request} WHERE {date} AND user_id = {user_id}") return [ request_id[0] for request_id in cur.fetchall() ] # get number of requests per day def get_request_count_for_date(cur: sql.Cursor, date:str) -> int: - return sql_get_count_where(cur, t_request, [("DATE(date, 'unixepoch')", date)]) + cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE {date}") + return cur.fetchone()[0] def get_unique_user_count(cur: sql.Cursor) -> int: return sql_tablesize(cur, t_user) @@ -157,7 +190,7 @@ def get_unique_user_count(cur: sql.Cursor) -> int: # # RANKINGS # -def get_file_ranking(cur: sql.Cursor, min_date_unix_time = 0) -> list[tuple[int, str]]: +def 
get_file_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]: global settings """ :returns [(request_count, filename)] @@ -173,18 +206,18 @@ def get_file_ranking(cur: sql.Cursor, min_date_unix_time = 0) -> list[tuple[int, if not fullmatch(settings["file_ranking_regex_whitelist"], filename): continue # ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename)) - cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE group_id = {group} AND date >= {min_date_unix_time}") + cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE group_id = {group} AND {date}") ranking.append((cur.fetchone()[0], filename)) ranking.sort() # print(ranking) return ranking -def get_user_agent_ranking(cur: sql.Cursor, min_date_unix_time = 0) -> list[tuple[int, str]]: +def get_user_agent_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]: """ :returns [(request_count, user_agent)] """ ranking = [] - cur.execute(f"SELECT DISTINCT user_id FROM {t_request} WHERE date >= {min_date_unix_time}") + cur.execute(f"SELECT DISTINCT user_id FROM {t_request} WHERE {date}") for user_id in cur.fetchall(): user_id = user_id[0] user_agent = sql_select(cur, t_user, [("user_id", user_id)]) @@ -194,13 +227,13 @@ def get_user_agent_ranking(cur: sql.Cursor, min_date_unix_time = 0) -> list[tupl if not fullmatch(settings["user_agent_ranking_regex_whitelist"], user_agent): continue # ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename)) - cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE user_id = {user_id} AND date >= {min_date_unix_time}") + cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE user_id = {user_id} AND {date}") ranking.append((cur.fetchone()[0], user_agent)) ranking.sort() # print(ranking) return ranking -def get_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Cursor, min_date_unix_time = 0) -> list[tuple[int, str]]: +def get_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Cursor, 
date:str) -> list[tuple[int, str]]: """ 1) get all the distinct entries for field_name after min_date_unix_time 2) call get_name_function with the distinct entry @@ -209,14 +242,14 @@ def get_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Curs :returns [(request_count, name)] """ ranking = [] - cur.execute(f"SELECT DISTINCT {field_name} FROM {table} WHERE date >= {min_date_unix_time}") + cur.execute(f"SELECT DISTINCT {field_name} FROM {table} WHERE {date}") for name in cur.fetchall(): name = name[0] if whitelist_regex: if not fullmatch(whitelist_regex, name): continue # ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename)) - cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {field_name} = '{name}' AND date >= {min_date_unix_time}") + cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {field_name} = '{name}' AND {date}") ranking.append((cur.fetchone()[0], name)) ranking.sort() # print(ranking) @@ -256,11 +289,11 @@ def plot_ranking(ranking: list[tuple[int, str]], fig=None, xlabel="", ylabel="", ft = ranking[i][1].split(".")[-1] color = "blue" if not color_settings: color = "blue" - elif type(color_settings) == dict: + elif isinstance(color_settings, dict): for key, val in color_settings.items(): if ft in val: color = key if not color: color = "blue" - elif type(color_settings) == list: + elif isinstance(color_settings, list): # print(color_settings, (i - start_index) % len(color_settings)) color = color_settings[(i - start_index) % len(color_settings)] colors.append(color) @@ -340,27 +373,35 @@ def visualize(loaded_settings: dict): img_dir = settings["img_dir"] img_filetype = settings["img_filetype"] names = { - # general - "regina_version": settings["version"] # paths - "img_file_ranking": f"{img_dir}/ranking_all_time_files.{img_filetype}", - "img_referer_ranking": f"{img_dir}/ranking_all_time_referers.{img_filetype}", - "img_browser_ranking": f"{img_dir}/ranking_all_time_browsers.{img_filetype}", - 
"img_operating_system_ranking": f"{img_dir}/ranking_all_time_operating_systems.{img_filetype}", - "img_daily": f"{img_dir}/user_request_count_daily.{img_filetype}", + "img_file_ranking_last_x_days": f"{img_dir}/ranking_all_time_files_last_x_days.{img_filetype}", + "img_referer_ranking_last_x_days": f"{img_dir}/ranking_all_time_referers_last_x_days.{img_filetype}", + "img_browser_ranking_last_x_days": f"{img_dir}/ranking_all_time_browsers_last_x_days.{img_filetype}", + "img_operating_system_ranking_last_x_days": f"{img_dir}/ranking_all_time_operating_systems_last_x_days.{img_filetype}", + "img_users_and_requests_last_x_days": f"{img_dir}/user_request_count_daily_last_x_days.{img_filetype}", + + "img_file_ranking_total": f"{img_dir}/ranking_all_time_files_total.{img_filetype}", + "img_referer_ranking_total": f"{img_dir}/ranking_all_time_referers_total.{img_filetype}", + "img_browser_ranking_total": f"{img_dir}/ranking_all_time_browsers_total.{img_filetype}", + "img_operating_system_ranking_total": f"{img_dir}/ranking_all_time_operating_systems_total.{img_filetype}", + "img_users_and_requests_total": f"{img_dir}/user_request_count_daily_total.{img_filetype}", # values - "mobile_user_percentage": 0.0, - "server-name": settings["server-name"], - "last_x_days": settings["last_x_days"], - # order matters! 
- "total_user_count_x_days": 0, - "total_request_count_x_days": 0, - "total_user_count": 0, - "total_request_count": 0, + "mobile_user_percentage_total": 0.0, + "mobile_user_percentage_last_x_days": 0.0, + "user_count_x_days": 0, + "user_count_total": 0, + "request_count_x_days": 0, + "request_count_total": 0, "human_user_percentage_x_days": 0, "human_request_percentage_x_days": 0, - "human_user_percentage": 0, - "human_request_percentage": 0, + "human_user_percentage_total": 0, + "human_request_percentage_total": 0, + # general + "regina_version": settings["version"], + "server-name": settings["server-name"], + "last_x_days": settings["last_x_days"], # must be after all the things with last_x_days! + "earliest_date": "1990-1-1", + "generation_date": "1990-1-1 0:0:0", } conn = sql.connect(settings["db"]) @@ -372,72 +413,105 @@ def visualize(loaded_settings: dict): cur = conn.cursor() get_humans = settings["get-human-percentage"] - print("\t>>>>>>", get_humans) + # DATE STRINGS + names["earliest_date"] = dt.fromtimestamp(get_earliest_date(cur)).strftime("%Y-%m-%d") + names["generation_date"] = dt.now().strftime("%Y-%m-%d %H:%M:%S") + # LAST_X_DAYS + # last_x_days_min_date: latest_date - last_x_days + secs_per_day = 86400 + last_x_days_min_date = get_latest_date(cur) - settings["last_x_days"] * secs_per_day + last_x_days_str = get_where_date_str(min_date=last_x_days_min_date) + days = get_days(cur, last_x_days_str) + days_strs = [get_where_date_str(at_date=day) for day in days] - # files - file_ranking = get_file_ranking(cur) - if gen_img: - fig_file_ranking = plot_ranking(file_ranking, xlabel="Filename/Filegroup", ylabel="Number of requests", color_settings=color_settings_filetypes) - fig_file_ranking.savefig(names["img_file_ranking"]) - # referer - referer_ranking = get_ranking("referer", t_request, settings["referer_ranking_regex_whitelist"], cur) - print("Referer ranking", referer_ranking) - if gen_img: - fig_referer_ranking = plot_ranking(referer_ranking, 
xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate) - fig_referer_ranking.savefig(names["img_referer_ranking"]) + # ALL DATES + all_time_str = get_where_date_str(min_date=0) + # all months in yyyy-mm format + months_all_time = get_months(cur, all_time_str) + # sqlite constrict to month string + months_strs = [] + for year_month in months_all_time: + year, month = year_month.split("-") + # first day of the month + min_date = dt(int(year), int(month), 1).timestamp() + month = (int(month) % 12) + 1 # + 1 month + year = int(year) + if month == 1: year += 1 + # first day of the next month - 1 sec + max_date = dt(year, month, 1).timestamp() - 1 + months_strs.append(get_where_date_str(min_date=min_date, max_date=max_date)) - # dates - dates = get_dates(cur) - # user - user_agent_ranking = get_user_agent_ranking(cur) - unique_user_ids_for_dates = [] - unique_request_ids_for_dates = [] - unique_user_ids_for_dates_human = [] - unique_request_ids_for_dates_human = [] - for date in dates: - unique_user_ids_for_dates.append(get_unique_user_ids_for_date(cur, date)) - unique_request_ids_for_dates.append(get_unique_request_ids_for_date(cur, date)) + for i in range(2): + suffix = ["_total", "_last_x_days"][i] + date_str = [all_time_str, last_x_days_str][i] + date_names = [months_all_time, days][i] + date_strs = [months_strs, days_strs][i] + assert(len(date_names) == len(date_strs)) + + # FILES + file_ranking = get_file_ranking(cur, date_str) + if gen_img: + fig_file_ranking = plot_ranking(file_ranking, xlabel="Filename/Filegroup", ylabel="Number of requests", color_settings=color_settings_filetypes) + fig_file_ranking.savefig(names[f'img_file_ranking{suffix}']) + + # REFERER + referer_ranking = get_ranking("referer", t_request, settings["referer_ranking_regex_whitelist"], cur, date_str) + if gen_img: + fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", 
color_settings=color_settings_alternate) + fig_referer_ranking.savefig(names[f'img_referer_ranking{suffix}']) + + # USER + # user_agent_ranking = get_user_agent_ranking(cur, date_str) + # for the time span + unique_user_ids = get_unique_user_ids_for_date(cur, date_str) + unique_user_ids_human = get_human_users(cur, unique_user_ids) + # for each date + unique_user_ids_dates: list[list[int]] = [] + unique_request_ids_dates: list[list[int]] = [] + unique_user_ids_human_dates: list[list[int]] = [] + unique_request_ids_human_dates: list[list[int]] = [] + for i in range(len(date_strs)): + date_str_ = date_strs[i] + unique_user_ids_dates.append(get_unique_user_ids_for_date(cur, date_str_)) + unique_request_ids_dates.append(get_unique_request_ids_for_date(cur, date_str_)) + if get_humans: + unique_user_ids_human_dates.append(get_human_users(cur, unique_user_ids_dates[-1])) + unique_request_ids_human_dates.append([]) + for human in unique_user_ids_human_dates[-1]: + unique_request_ids_human_dates[-1] += get_unique_request_ids_for_date_and_user(cur, date_str_, human) + # print("\n\tuu", unique_user_ids_dates, "\n\tur",unique_request_ids_dates, "\n\tuuh", unique_user_ids_human_dates, "\n\turh", unique_request_ids_human_dates) if get_humans: - unique_user_ids_for_dates_human.append(get_unique_user_ids_for_date_human(cur, date)) - unique_request_ids_for_dates_human.append([]) - for human in unique_user_ids_for_dates_human[-1]: - unique_request_ids_for_dates_human[-1] += get_unique_request_ids_for_date_and_user(cur, date, human) - if get_humans: - try: - names["human_user_percentage_x_days"] = round(100 * len_list_list(unique_user_ids_for_dates_human) / len_list_list(unique_user_ids_for_dates), 2) - names["human_request_percentage_x_days"] = round(100 * len_list_list(unique_request_ids_for_dates_human) / len_list_list(unique_request_ids_for_dates), 2) - except: pass - print(">>>", len_list_list(unique_request_ids_for_dates), len_list_list(unique_request_ids_for_dates_human)) - 
names["total_user_count"] = sql_tablesize(cur, t_user) - names["total_request_count"] = sql_tablesize(cur, t_request) - names["total_user_count_x_days"] = len_list_list(unique_user_ids_for_dates) - names["total_request_count_x_days"] = len_list_list(unique_request_ids_for_dates) + try: + names[f"human_user_percentage{suffix}"] = round(100 * len_list_list(unique_user_ids_human_dates) / len_list_list(unique_user_ids_dates), 2) + names[f"human_request_percentage{suffix}"] = round(100 * len_list_list(unique_request_ids_human_dates) / len_list_list(unique_request_ids_dates), 2) + except: pass + names[f"user_count{suffix}"] = len_list_list(unique_user_ids_dates) + names[f"request_count{suffix}"] = len_list_list(unique_request_ids_dates) + if gen_img: + fig_daily, ax1, ax2, plots = plot2y(date_names, [len(user_ids) for user_ids in unique_user_ids_dates], [len(request_ids) for request_ids in unique_request_ids_dates], xlabel="Date", ylabel1="User count", label1="Unique users", ylabel2="Request count", label2="Unique requests", color1=palette["red"], color2=palette["blue"]) + if get_humans: + fig_daily, ax1, ax2, plots = plot2y(date_names, [len(user_ids) for user_ids in unique_user_ids_human_dates], [len(request_ids) for request_ids in unique_request_ids_human_dates], label1="Unique users (human)", ylabel2="Einzigartige Anfragen", label2="Unique requests (human)", color1=palette["orange"], color2=palette["green"], fig=fig_daily, ax1=ax1, ax2=ax2, plots=plots) + fig_daily.savefig(names[f"img_users_and_requests{suffix}"]) - # os & browser - os_ranking, browser_ranking, names["mobile_user_percentage"] = get_os_browser_mobile_rankings(user_agent_ranking) - if gen_img: - fig_os_rating = plot_ranking(os_ranking, xlabel="Platform", ylabel="Share [%]", color_settings=color_settings_operating_systems) - fig_os_rating.savefig(names["img_operating_system_ranking"]) - fig_browser_rating = plot_ranking(browser_ranking, xlabel="Browsers", ylabel="Share [%]", 
color_settings=color_settings_browsers) - fig_browser_rating.savefig(names["img_browser_ranking"]) + # os & browser + os_ranking, browser_ranking, names[f"mobile_user_percentage{suffix}"] = get_os_browser_mobile_rankings(cur, unique_user_ids_human) + if gen_img: + fig_os_rating = plot_ranking(os_ranking, xlabel="Platform", ylabel="Share [%]", color_settings=color_settings_operating_systems) + fig_os_rating.savefig(names[f"img_operating_system_ranking{suffix}"]) + fig_browser_rating = plot_ranking(browser_ranking, xlabel="Browsers", ylabel="Share [%]", color_settings=color_settings_browsers) + fig_browser_rating.savefig(names[f"img_browser_ranking{suffix}"]) - # print("File Ranking", file_ranking) - # print("referer Ranking", referer_ranking) - # print("user agent ranking", user_agent_ranking) - # print("Unique Users:", get_unique_user_count(cur)) - # fig_daily, ax_daily_users = plot(dates, [len(user_ids) for user_ids in unique_user_ids_for_dates], xlabel="Datum", ylabel="Einzigartige Nutzer", label="Einzigartige Nutzer", color="blue") - # fig_daily, ax_daily_requests = plot(dates, [len(request_ids) for request_ids in unique_request_ids_for_dates], fig=fig_daily, ax=ax_daily_users, xlabel="Datum", ylabel="Einzigartige Anfragen", label="Einzigartige Anfragen", color="orange") - # fig_daily.savefig(f"{img_dir}/daily.{img_filetype}") - if gen_img: - fig_daily, ax1, ax2, plots = plot2y(dates, [len(user_ids) for user_ids in unique_user_ids_for_dates], [len(request_ids) for request_ids in unique_request_ids_for_dates], xlabel="Date", ylabel1="User count", label1="Unique users", ylabel2="Request count", label2="Unique requests", color1=palette["red"], color2=palette["blue"]) - if get_humans: - fig_daily, ax1, ax2, plots = plot2y(dates, [len(user_ids) for user_ids in unique_user_ids_for_dates_human], [len(request_ids) for request_ids in unique_request_ids_for_dates_human], label1="Unique users (human)", ylabel2="Einzigartige Anfragen", label2="Unique requests (human)", 
color1=palette["orange"], color2=palette["green"], fig=fig_daily, ax1=ax1, ax2=ax2, plots=plots) - fig_daily.savefig(names["img_daily"]) - print("OS ranking", os_ranking) - print("Browser ranking", browser_ranking) - print("Mobile percentage", names["mobile_user_percentage"]) - print(dates, "\n\tuu", unique_user_ids_for_dates, "\n\tur",unique_request_ids_for_dates, "\n\tuuh", unique_user_ids_for_dates_human, "\n\turh", unique_request_ids_for_dates_human) + # print("File Ranking", file_ranking) + # print("referer Ranking", referer_ranking) + # print("user agent ranking", user_agent_ranking) + # print("Unique Users:", get_unique_user_count(cur)) + # fig_daily, ax_daily_users = plot(dates, [len(user_ids) for user_ids in unique_user_ids_for_dates], xlabel="Datum", ylabel="Einzigartige Nutzer", label="Einzigartige Nutzer", color="blue") + # fig_daily, ax_daily_requests = plot(dates, [len(request_ids) for request_ids in unique_request_ids_for_dates], fig=fig_daily, ax=ax_daily_users, xlabel="Datum", ylabel="Einzigartige Anfragen", label="Einzigartige Anfragen", color="orange") + # fig_daily.savefig(f"{img_dir}/daily.{img_filetype}") + # print("OS ranking", os_ranking) + # print("Browser ranking", browser_ranking) + # print("Mobile percentage", names["mobile_user_percentage"]) if settings["template_html"] and settings["html_out_path"]: with open(settings["template_html"], "r") as file: html = file.read() @@ -445,5 +519,3 @@ def visualize(loaded_settings: dict): html = html.replace(f"%{name}", str(value)) with open(settings["html_out_path"], "w") as file: file.write(html) - - diff --git a/template.html b/template.html new file mode 100644 index 0000000..d7f0e55 --- /dev/null +++ b/template.html @@ -0,0 +1,69 @@ + + + + + + + + Analytics for %server_name + + + +

Analytics for %server_name

+
+
+

Last %last_x_days days

+
+

User and request count (per day)

+ Daily Statistics +
    +
  • User count: %user_count_last_x_days, of which %human_user_percentage_last_x_days% are human
  • +
  • Request count: %request_count_last_x_days, of which %human_request_percentage_last_x_days% came from human users
  • +
+
+ +

File access

+ File ranking +
+ +

Platforms and browsers

+ Operating system ranking + Browser ranking +

Mobile users: %mobile_user_percentage_last_x_days%

+
+ +

Referrers

+ Referer ranking +
+
+
+
+
+

Total (since %earliest_date)

+
+

User and request count (per month)

+ Monthly Statistics +
    +
  • Total user count: %user_count_total, of which %human_user_percentage_total% are human
  • +
  • Total request count: %request_count_total, of which %human_request_percentage_total% came from human users
  • +
+
+ +

File access

+ File ranking +
+ +

Platforms and browsers

+ Operating system ranking + Browser ranking +

Mobile users: %mobile_user_percentage_total%

+
+ +

Referrers

+ Referer ranking +
+
+
+

These analytics were generated by regina %regina_version at %generation_date

+ +