This commit is contained in:
matthias@arch 2022-11-23 20:02:19 +01:00
parent 56c867a6b3
commit 5064df1f0e
10 changed files with 343 additions and 204 deletions

View File

@ -4,10 +4,10 @@
<element> <element>
<id>UMLClass</id> <id>UMLClass</id>
<coordinates> <coordinates>
<x>38</x> <x>133</x>
<y>323</y> <y>247</y>
<w>342</w> <w>342</w>
<h>190</h> <h>266</h>
</coordinates> </coordinates>
<panel_attributes>User <panel_attributes>User
-- --
@ -15,14 +15,17 @@
-- --
- ip address - ip address
- user agent string - user agent string
- platform
- browser
- mobile
style=autoresize</panel_attributes> style=autoresize</panel_attributes>
<additional_attributes/> <additional_attributes/>
</element> </element>
<element> <element>
<id>UMLClass</id> <id>UMLClass</id>
<coordinates> <coordinates>
<x>1064</x> <x>1159</x>
<y>323</y> <y>247</y>
<w>247</w> <w>247</w>
<h>152</h> <h>152</h>
</coordinates> </coordinates>
@ -37,8 +40,8 @@ style=autoresize</panel_attributes>
<element> <element>
<id>Relation</id> <id>Relation</id>
<coordinates> <coordinates>
<x>874</x> <x>969</x>
<y>323</y> <y>247</y>
<w>228</w> <w>228</w>
<h>95</h> <h>95</h>
</coordinates> </coordinates>
@ -51,8 +54,8 @@ m2=1
<element> <element>
<id>UMLClass</id> <id>UMLClass</id>
<coordinates> <coordinates>
<x>608</x> <x>703</x>
<y>304</y> <y>228</y>
<w>285</w> <w>285</w>
<h>285</h> <h>285</h>
</coordinates> </coordinates>
@ -72,8 +75,8 @@ style=autoresize</panel_attributes>
<element> <element>
<id>Relation</id> <id>Relation</id>
<coordinates> <coordinates>
<x>361</x> <x>456</x>
<y>323</y> <y>247</y>
<w>285</w> <w>285</w>
<h>95</h> <h>95</h>
</coordinates> </coordinates>
@ -86,8 +89,8 @@ m2=n
<element> <element>
<id>UMLClass</id> <id>UMLClass</id>
<coordinates> <coordinates>
<x>1064</x> <x>1159</x>
<y>722</y> <y>646</y>
<w>190</w> <w>190</w>
<h>152</h> <h>152</h>
</coordinates> </coordinates>
@ -103,8 +106,8 @@ style=autoresize</panel_attributes>
<element> <element>
<id>Relation</id> <id>Relation</id>
<coordinates> <coordinates>
<x>1121</x> <x>1216</x>
<y>456</y> <y>380</y>
<w>76</w> <w>76</w>
<h>304</h> <h>304</h>
</coordinates> </coordinates>
@ -117,8 +120,8 @@ m2=1
<element> <element>
<id>UMLNote</id> <id>UMLNote</id>
<coordinates> <coordinates>
<x>779</x> <x>874</x>
<y>95</y> <y>19</y>
<w>570</w> <w>570</w>
<h>133</h> <h>133</h>
</coordinates> </coordinates>
@ -131,8 +134,8 @@ style=autoresize</panel_attributes>
<element> <element>
<id>Relation</id> <id>Relation</id>
<coordinates> <coordinates>
<x>1083</x> <x>1178</x>
<y>209</y> <y>133</y>
<w>57</w> <w>57</w>
<h>152</h> <h>152</h>
</coordinates> </coordinates>

View File

@ -20,4 +20,4 @@ plot_dpi = 300
img_dir = /www/analytics/images img_dir = /www/analytics/images
template_html = /home/my-user/analytics/template.html template_html = /home/my-user/analytics/template.html
html_out_path = /www/analytics/statistics.html html_out_path = /www/analytics/statistics.html
# filegroups = # filegroups = start:/index.html,/about.html,/img_on_index.png;music:/music.html,song.mp3

View File

@ -1,9 +1,13 @@
import sqlite3 as sql import sqlite3 as sql
from re import match from re import match
from time import mktime from time import mktime
from datetime import datetime as dt
from database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup from database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup
from sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize from sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
from datetime import datetime as dt
"""
collect information from the access log and put it into the database
"""
DEBUG = True DEBUG = True
def pdebug(*args): def pdebug(*args):
@ -15,6 +19,12 @@ def warning(w):
months = ["Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aut", "Sep", "Oct", "Nov", "Dez"] months = ["Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aut", "Sep", "Oct", "Nov", "Dez"]
# these operating systems and browsers can be detected:
# elements with a lower index take precedence (the first match in the list wins)
user_agent_operating_systems = ["Windows", "Android", "Linux", "iPhone", "iPad", "Mac", "BSD"]
user_agent_browsers = ["Firefox", "DuckDuckGo", "SeaMonkey", "Vivaldi", "Yandex", "Brave", "SamsungBrowser", "Lynx", "Epiphany", "Chromium", "Chrome", "Safari", "Opera", "Edge"]
class Request: class Request:
def __init__(self, ip_address="", time_local="", request_type="", request_file="", request_protocol="", status="", bytes_sent="", referer="", user_agent=""): def __init__(self, ip_address="", time_local="", request_type="", request_file="", request_protocol="", status="", bytes_sent="", referer="", user_agent=""):
self.ip_address = sanitize(ip_address) self.ip_address = sanitize(ip_address)
@ -39,9 +49,6 @@ class Request:
self.referer = sanitize(referer) self.referer = sanitize(referer)
self.user_agent = sanitize(user_agent) self.user_agent = sanitize(user_agent)
def insert_user_sql_str(self, user_id, user_table="user"):
return f"INSERT INTO {user_table} (user_id, ip_address, user_agent) VALUES ({user_id}, '{self.ip_address}', '{self.user_agent}');"
def __repr__(self): def __repr__(self):
return f"{self.ip_address} - {self.time_local} - {self.request_file} - {self.user_agent} - {self.status}" return f"{self.ip_address} - {self.time_local} - {self.request_file} - {self.user_agent} - {self.status}"
@ -88,9 +95,28 @@ def get_user_id(request: Request, cursor: sql.Cursor) -> int:
# new user_id is number of elements # new user_id is number of elements
user_id: int = sql_tablesize(cursor, t_user) user_id: int = sql_tablesize(cursor, t_user)
pdebug("new user:", user_id, request.ip_address) pdebug("new user:", user_id, request.ip_address)
cursor.execute(request.insert_user_sql_str(user_id)) platform, browser, mobile = get_os_browser_pairs_from_agent(request.user_agent)
cursor.execute(f"INSERT INTO {t_user} (user_id, ip_address, user_agent, platform, browser, mobile) VALUES ({user_id}, '{request.ip_address}', '{request.user_agent}', '{platform}', '{browser}', '{int(mobile)}');")
return user_id return user_id
# re_user_agent = r"(?: ?([\w\- ]+)(?:\/([\w.]+))?(?: \(([^()]*)\))?)"
# 1: platform, 2: version, 3: details
def get_os_browser_pairs_from_agent(user_agent):
    """
    Guess operating system, browser and mobile flag from a user agent string.

    Detection is a plain substring search: the first entry of
    user_agent_operating_systems / user_agent_browsers that occurs in
    user_agent is taken, so the order of those lists decides ambiguous agents.

    :returns (operating_system, browser, mobile)
             operating_system and browser are "" when no entry matched,
             mobile is True when the agent contains "Mobi"
    """
    mobile = "Mobi" in user_agent
    # next() with a default replaces the search loop: first hit or ""
    operating_system = next(
        (candidate for candidate in user_agent_operating_systems if candidate in user_agent),
        "",
    )
    browser = next(
        (candidate for candidate in user_agent_browsers if candidate in user_agent),
        "",
    )
    return operating_system, browser, mobile
def add_requests_to_db(requests: list[Request], db_name: str): def add_requests_to_db(requests: list[Request], db_name: str):
conn = sql.connect(db_name) conn = sql.connect(db_name)

View File

@ -2,6 +2,11 @@ import sqlite3 as sql
from sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize from sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
from os import path, listdir from os import path, listdir
"""
create regina's database as shown in the UML diagram database.uxf
"""
DEBUG = True DEBUG = True
def pdebug(*args): def pdebug(*args):
if DEBUG: print(*args) if DEBUG: print(*args)
@ -43,7 +48,7 @@ filegroup_id = Entry("group_id", "INTEGER")
ip_address_entry = Entry("ip_address", "TEXT") ip_address_entry = Entry("ip_address", "TEXT")
filename_entry = Entry("filename", "TEXT") filename_entry = Entry("filename", "TEXT")
database_tables = { database_tables = {
t_user: Table(t_user, user_id, [Entry("ip_address", "TEXT"), Entry("user_agent", "TEXT")], [f"UNIQUE({user_id.name})"]), t_user: Table(t_user, user_id, [Entry("ip_address", "TEXT"), Entry("user_agent", "TEXT"), Entry("platform", "TEXT"), Entry("browser", "TEXT"), Entry("mobile", "INTEGER")], [f"UNIQUE({user_id.name})"]),
t_file: Table(t_file, filename_entry, [filegroup_id], [f"UNIQUE({filename_entry.name})"]), t_file: Table(t_file, filename_entry, [filegroup_id], [f"UNIQUE({filename_entry.name})"]),
t_filegroup: Table(t_filegroup, filegroup_id, [Entry("groupname", "TEXT")], [f"UNIQUE({filegroup_id.name})"]), t_filegroup: Table(t_filegroup, filegroup_id, [Entry("groupname", "TEXT")], [f"UNIQUE({filegroup_id.name})"]),
t_request: Table(t_request, request_id, [ t_request: Table(t_request, request_id, [
@ -124,12 +129,12 @@ def create_db(name, filegroup_str="", location_and_dirs:list[tuple[str, str]]=[]
""" """
create the name with database_tables create the name with database_tables
""" """
print(f"creating database: '{name}'")
conn = sql.connect(f"{name}") conn = sql.connect(f"{name}")
cursor = conn.cursor() cursor = conn.cursor()
for table in database_tables.values(): for table in database_tables.values():
cursor.execute(table.create_sql_str()) cursor.execute(table.create_sql_str())
filegroup_str = filegroup_str.strip("; ") + ";" + get_auto_filegroup_str(location_and_dirs, auto_group_filetypes) filegroup_str = filegroup_str.strip("; ") + ";" + get_auto_filegroup_str(location_and_dirs, auto_group_filetypes)
print(filegroup_str)
create_filegroups(cursor, filegroup_str) create_filegroups(cursor, filegroup_str)
conn.commit() conn.commit()
conn.close() conn.close()

View File

@ -6,6 +6,10 @@ from os.path import isfile, isdir
from visualize import visualize from visualize import visualize
from settings_manager import read_settings_file from settings_manager import read_settings_file
"""
start regina, launch either collect or visualize
"""
version = "1.0" version = "1.0"
# default settings, these are overwriteable through a config file # default settings, these are overwriteable through a config file
@ -75,7 +79,6 @@ if __name__ == '__main__':
exit(0) exit(0)
elif argv[i] == "--collect": elif argv[i] == "--collect":
collect = True collect = True
exit(0)
elif argv[i] == "--visualize": elif argv[i] == "--visualize":
visualize_ = True visualize_ = True
else: else:
@ -90,19 +93,20 @@ if __name__ == '__main__':
error(f"Not a file: '{config_file}'") error(f"Not a file: '{config_file}'")
read_settings_file(config_file, settings) read_settings_file(config_file, settings)
settings["version"] = version settings["version"] = version
print(f"regina version {version} with server-name '{settings['server-name']}' and database '{settings['db']}'")
if not settings["server-name"]: missing_arg("server-name") if not settings["server-name"]: missing_arg("server-name")
if not settings["access-log"]: missing_arg("log") if not settings["access-log"]: missing_arg("log")
if not settings["db"]: missing_arg("db") if not settings["db"]: missing_arg("db")
if type(settings["auto-group-filetypes"]) == str: if isinstance(settings["auto-group-filetypes"], str):
settings["auto-group-filetypes"] = settings["auto-group-filetypes"].split(",") settings["auto-group-filetypes"] = settings["auto-group-filetypes"].split(",")
if type(settings["locs-and-dirs"]) == str: if isinstance(settings["locs-and-dirs"], str):
settings["locs-and-dirs"] = [ loc_and_dir.split(":") for loc_and_dir in settings["locs-and-dirs"].split(",") ] settings["locs-and-dirs"] = [ loc_and_dir.split(":") for loc_and_dir in settings["locs-and-dirs"].split(",") ]
if collect: if collect:
if not isfile(settings["db"]): if not isfile(settings["db"]):
create_db(settings["db"], settings["filegroups"], settings["locs-and-dirs"], settings["auto-group-filetypes"]) create_db(settings["db"], settings["filegroups"], settings["locs-and-dirs"], settings["auto-group-filetypes"])
requests = parse_log(settings["access-log"]) requests = parse_log(settings["access-log"])
add_requests_to_db(requests, settings["db"]) add_requests_to_db(requests, settings["db"])
if visualize: if visualize_:
if not isfile(settings["db"]): error(f"Invalid database path: '{settings['db']}'") if not isfile(settings["db"]): error(f"Invalid database path: '{settings['db']}'")
visualize(settings) visualize(settings)

View File

@ -20,8 +20,8 @@ def read_settings_file(filepath: str, settings:dict, ignore_invalid_lines=True,
if not allow_new_keys and vals[0] not in settings.keys(): if not allow_new_keys and vals[0] not in settings.keys():
if ignore_invalid_lines: continue if ignore_invalid_lines: continue
else: raise KeyError(f"Invalid key: '{vals[0]}'") else: raise KeyError(f"Invalid key: '{vals[0]}'")
if convert_to_type and type(settings[vals[0]]) not in [str, None]: if convert_to_type and not isinstance(settings[vals[0]], str|list|None):
if type(settings[vals[0]]) == bool: if isinstance(settings[vals[0]], bool):
settings[vals[0]] = get_bool(vals[1].strip(" "), fallback=settings[vals[0]]) settings[vals[0]] = get_bool(vals[1].strip(" "), fallback=settings[vals[0]])
continue continue
try: try:

View File

@ -1,6 +1,5 @@
import sqlite3 as sql import sqlite3 as sql
"""Various sql utilities""" """Various utilities"""
def sanitize(s): def sanitize(s):
if type(s) != str: return s if type(s) != str: return s
return s\ return s\

View File

@ -1,39 +0,0 @@
<html>
<head>
<meta http-equiv="content-type" content="text/html">
<meta charset="utf-8">
<meta name="description" content="Regina - Nginx Analytics">
<meta name="keywords" content="">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Analytics for %server_name</title>
<link rel="stylesheet" href="style.css">
</head>
<body>
<h1>Analytics for %server-name</h1>
<div class=box>
<center>
<h2>Last %last_x_days days</h2>
<img src="%img_daily" alt="Daily Statistics", title="User and request count for the last %last_x_days days">
<ul>
<li>Unique user count: <b>%total_user_count_x_days</b>, from which <b>%human_user_percentage_x_days%</b> are human</li>
<li>Unique request count: <b>%total_request_count_x_days</b>, from which <b>%human_request_percentage_x_days%</b> came from human users </li>
</ul>
</center>
</div>
<div class=box>
<center>
<h2>All times</h2>
<img src="%img_operating_system_ranking" alt="Operating system ranking", title="Operating system ranking">
<img src="%img_browser_ranking" alt="Browser ranking", title="Browser ranking">
<ul>
<li>Mobile user percentage: %mobile_user_percentage</li>
<li>Total user count: <b>%total_user_count</b>, from which <b>%human_user_percentage%</b> are human</li>
<li>Total request count: <b>%total_request_count</b>, from which <b>%human_request_percentage%</b> came from human users </li>
</ul>
<img src="%img_file_ranking" alt="File ranking", title="File ranking">
<img src="%img_referer_ranking" alt="Referer ranking", title="Referer ranking">
</center>
</div>
<p>These analytics were generated by <a href="https://git.quintern.xyz/MatthiasQuintern/regina">regina %regina_version</a></p>
</body>
</html>

View File

@ -6,7 +6,9 @@ from re import fullmatch, findall
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import matplotlib as mpl import matplotlib as mpl
from os.path import isdir from os.path import isdir
from datetime import datetime as dt
""" """
visualize information from the database
TODO: TODO:
- bei referrers &auml;hnliche zusammenlegen, z.b. www.google.de und https://google.com - bei referrers &auml;hnliche zusammenlegen, z.b. www.google.de und https://google.com
""" """
@ -29,10 +31,6 @@ color_settings_filetypes = {
} }
color_settings_alternate = list(palette.values()) color_settings_alternate = list(palette.values())
# these oses and browser can be detected:
# lower element takes precedence
user_agent_operating_systems = ["Windows", "Android", "Linux", "iPhone", "iPad", "Mac", "BSD"]
user_agent_browsers = ["Firefox", "DuckDuckGo", "SeaMonkey", "Vivaldi", "Yandex", "Brave", "SamsungBrowser", "Lynx", "Epiphany", "Chromium", "Chrome", "Safari", "Opera", "Edge"]
color_settings_browsers = { color_settings_browsers = {
palette["red"]: ["Safari"], palette["red"]: ["Safari"],
palette["orange"]: ["Firefox"], palette["orange"]: ["Firefox"],
@ -61,25 +59,7 @@ def len_list_list(l: list[list]):
# #
# FILTERS # FILTERS
# #
# re_user_agent = r"(?: ?([\w\- ]+)(?:\/([\w.]+))?(?: \(([^()]*)\))?)" def get_os_browser_mobile_rankings(cur: sql.Cursor, user_ids: list[int]):
# 1: platform, 2: version, 3: details
def get_os_browser_pairs_from_agent(user_agent):
# for groups in findall(re_user_agent, user_agent):
operating_system = ""
browser = ""
mobile = "Mobi" in user_agent
for os in user_agent_operating_systems:
if os in user_agent:
operating_system = os
break
for br in user_agent_browsers:
if br in user_agent:
browser = br
break
# if not operating_system or not browser: print(f"Warning: get_os_browser_pairs_from_agent: Could not find all information for agent '{user_agent}', found os: '{operating_system}' and browser: '{browser}'")
return operating_system, browser, mobile
def get_os_browser_mobile_rankings(user_agent_ranking):
""" """
returns [(count, operating_system)], [(count, browser)], mobile_user_percentage returns [(count, operating_system)], [(count, browser)], mobile_user_percentage
""" """
@ -88,8 +68,10 @@ def get_os_browser_mobile_rankings(user_agent_ranking):
browser_ranking = {} browser_ranking = {}
browser_count = 0.0 browser_count = 0.0
mobile_ranking = { True: 0.0, False: 0.0 } mobile_ranking = { True: 0.0, False: 0.0 }
for count, agent in user_agent_ranking: for user_id in user_ids:
os, browser, mobile = get_os_browser_pairs_from_agent(agent) cur.execute(f"SELECT platform,browser,mobile FROM {t_user} WHERE user_id = {user_id}")
os, browser, mobile = cur.fetchone()
mobile = bool(mobile)
if os: if os:
if os in os_ranking: os_ranking[os] += 1 if os in os_ranking: os_ranking[os] += 1
else: os_ranking[os] = 1 else: os_ranking[os] = 1
@ -114,40 +96,91 @@ def get_os_browser_mobile_rankings(user_agent_ranking):
# #
# GETTERS # GETTERS
# #
def get_where_date_str(at_date=None, min_date=None, max_date=None):
    """
    Build the condition of a sqlite WHERE clause that restricts the `date` column.

    Each argument that is not None contributes one condition, the conditions
    are joined with " AND " in the order at_date, min_date, max_date:
    - str:        compared with DATE(date, 'unixepoch'), the value is passed through sanitize()
    - int/float:  compared with the raw `date` column, the value is truncated with int()
    - other type: a warning is printed and the argument is ignored

    :param at_date:  date must be equal to this value
    :param min_date: date must be greater than or equal to this value
    :param max_date: date must be less than or equal to this value
    :returns the condition string without the leading "WHERE";
             "date > 0" (after printing a warning) if no condition could be generated
    """
    # (argument name for the warning, sql comparison operator, value)
    constraints = (
        ("at_date", "=", at_date),
        ("min_date", ">=", min_date),
        ("max_date", "<=", max_date),
    )
    conditions = []
    for arg_name, operator, value in constraints:
        if value is None:
            continue
        if isinstance(value, str):
            conditions.append(f"DATE(date, 'unixepoch') {operator} '{sanitize(value)}'")
        elif isinstance(value, (int, float)):
            conditions.append(f"date {operator} {int(value)}")
        else:
            print(f"WARNING: get_where_date_str: Invalid type of argument {arg_name}: {type(value)}")
    if not conditions:
        # fall back to a condition that is valid sql so callers can always embed the result
        print(f"WARNING: get_where_date_str: no date_str generated. Returing 'date > 0'. at_date={at_date}, min_date={min_date}, max_date={max_date}")
        return "date > 0"
    return " AND ".join(conditions)
# get the earliest date
def get_earliest_date(cur: sql.Cursor) -> int:
    """
    Look up the smallest `date` value in the request table.

    :returns the earliest request time as unixepoch
             (sqlite's MIN() yields NULL on an empty table, the result is None then)
    """
    cur.execute(f"SELECT MIN(date) FROM {t_request}")
    row = cur.fetchone()  # aggregate query: exactly one row with one column
    return row[0]
# get the latest date
def get_latest_date(cur: sql.Cursor) -> int:
    """
    Look up the largest `date` value in the request table.

    :returns the latest request time as unixepoch
             (sqlite's MAX() yields NULL on an empty table, the result is None then)
    """
    cur.execute(f"SELECT MAX(date) FROM {t_request}")
    row = cur.fetchone()  # aggregate query: exactly one row with one column
    return row[0]
# get all dates # get all dates
def get_dates(cur: sql.Cursor) -> list[str]: # the date:str parameter in all these function must be a sqlite constraint
cur.execute(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request}") def get_days(cur: sql.Cursor, date:str) -> list[str]:
"""get a list of all dates in yyyy-mm-dd format"""
cur.execute(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}")
return [ date[0] for date in cur.fetchall() ] # fetchall returns tuples (date, ) return [ date[0] for date in cur.fetchall() ] # fetchall returns tuples (date, )
def get_unique_user_ids_for_date(cur: sql.Cursor, date:str) -> list[int]: def get_months(cur: sql.Cursor, date:str) -> list[str]:
cur.execute(f"SELECT DISTINCT user_id FROM {t_request} WHERE DATE(date, 'unixepoch') = '{sanitize(date)}'") """get a list of all dates in yyyy-mm format"""
return [ user_id[0] for user_id in cur.fetchall() ] cur.execute(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}")
dates = get_days(cur, date)
date_dict = {}
for date in dates:
date_without_day = date[0:date.rfind('-')]
date_dict[date_without_day] = 0
return list(date_dict.keys())
def get_user_agent(cur: sql.Cursor, user_id: int): def get_user_agent(cur: sql.Cursor, user_id: int):
return sql_select(cur, t_user, [("user_id", user_id)])[0][2] return sql_select(cur, t_user, [("user_id", user_id)])[0][2]
def get_unique_user_ids_for_date_human(cur: sql.Cursor, date: str): def get_unique_user_ids_for_date(cur: sql.Cursor, date:str) -> list[int]:
cur.execute(f"SELECT DISTINCT user_id FROM {t_request} WHERE DATE(date, 'unixepoch') = '{sanitize(date)}'") cur.execute(f"SELECT DISTINCT user_id FROM {t_request} WHERE {date}")
return [ user_id[0] for user_id in cur.fetchall() ]
def get_human_users(cur: sql.Cursor, unique_user_ids):
human_user_ids = [] human_user_ids = []
for user_id in cur.fetchall(): for user_id in unique_user_ids:
user_agent = get_user_agent(cur, user_id[0]) cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_user} WHERE user_id = {user_id} AND platform IS NOT NULL AND browser IS NOT NULL)")
os, browser, mobile = get_os_browser_pairs_from_agent(user_agent) if cur.fetchone()[0] == 1:
# print("get_unique_user_ids_for_date", user_id[0], os, browser, user_agent) human_user_ids.append(user_id)
if os and browser:
human_user_ids.append(user_id[0])
return human_user_ids return human_user_ids
def get_unique_request_ids_for_date(cur: sql.Cursor, date:str) -> list[int]: def get_unique_request_ids_for_date(cur: sql.Cursor, date:str) -> list[int]:
cur.execute(f"SELECT DISTINCT request_id FROM {t_request} WHERE DATE(date, 'unixepoch') = '{sanitize(date)}'") cur.execute(f"SELECT DISTINCT request_id FROM {t_request} WHERE {date}")
return [ request_id[0] for request_id in cur.fetchall() ] return [ request_id[0] for request_id in cur.fetchall() ]
def get_unique_request_ids_for_date_and_user(cur: sql.Cursor, date:str, user_id: int) -> list[int]: def get_unique_request_ids_for_date_and_user(cur: sql.Cursor, date:str, user_id: int) -> list[int]:
cur.execute(f"SELECT DISTINCT request_id FROM {t_request} WHERE DATE(date, 'unixepoch') = '{sanitize(date)}' AND user_id = {user_id}") cur.execute(f"SELECT DISTINCT request_id FROM {t_request} WHERE {date} AND user_id = {user_id}")
return [ request_id[0] for request_id in cur.fetchall() ] return [ request_id[0] for request_id in cur.fetchall() ]
# get number of requests per day # get number of requests per day
def get_request_count_for_date(cur: sql.Cursor, date:str) -> int: def get_request_count_for_date(cur: sql.Cursor, date:str) -> int:
return sql_get_count_where(cur, t_request, [("DATE(date, 'unixepoch')", date)]) cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE {date}")
return cur.fetchone()[0]
def get_unique_user_count(cur: sql.Cursor) -> int: def get_unique_user_count(cur: sql.Cursor) -> int:
return sql_tablesize(cur, t_user) return sql_tablesize(cur, t_user)
@ -157,7 +190,7 @@ def get_unique_user_count(cur: sql.Cursor) -> int:
# #
# RANKINGS # RANKINGS
# #
def get_file_ranking(cur: sql.Cursor, min_date_unix_time = 0) -> list[tuple[int, str]]: def get_file_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]:
global settings global settings
""" """
:returns [(request_count, filename)] :returns [(request_count, filename)]
@ -173,18 +206,18 @@ def get_file_ranking(cur: sql.Cursor, min_date_unix_time = 0) -> list[tuple[int,
if not fullmatch(settings["file_ranking_regex_whitelist"], filename): if not fullmatch(settings["file_ranking_regex_whitelist"], filename):
continue continue
# ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename)) # ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename))
cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE group_id = {group} AND date >= {min_date_unix_time}") cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE group_id = {group} AND {date}")
ranking.append((cur.fetchone()[0], filename)) ranking.append((cur.fetchone()[0], filename))
ranking.sort() ranking.sort()
# print(ranking) # print(ranking)
return ranking return ranking
def get_user_agent_ranking(cur: sql.Cursor, min_date_unix_time = 0) -> list[tuple[int, str]]: def get_user_agent_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]:
""" """
:returns [(request_count, user_agent)] :returns [(request_count, user_agent)]
""" """
ranking = [] ranking = []
cur.execute(f"SELECT DISTINCT user_id FROM {t_request} WHERE date >= {min_date_unix_time}") cur.execute(f"SELECT DISTINCT user_id FROM {t_request} WHERE {date}")
for user_id in cur.fetchall(): for user_id in cur.fetchall():
user_id = user_id[0] user_id = user_id[0]
user_agent = sql_select(cur, t_user, [("user_id", user_id)]) user_agent = sql_select(cur, t_user, [("user_id", user_id)])
@ -194,13 +227,13 @@ def get_user_agent_ranking(cur: sql.Cursor, min_date_unix_time = 0) -> list[tupl
if not fullmatch(settings["user_agent_ranking_regex_whitelist"], user_agent): if not fullmatch(settings["user_agent_ranking_regex_whitelist"], user_agent):
continue continue
# ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename)) # ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename))
cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE user_id = {user_id} AND date >= {min_date_unix_time}") cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE user_id = {user_id} AND {date}")
ranking.append((cur.fetchone()[0], user_agent)) ranking.append((cur.fetchone()[0], user_agent))
ranking.sort() ranking.sort()
# print(ranking) # print(ranking)
return ranking return ranking
def get_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Cursor, min_date_unix_time = 0) -> list[tuple[int, str]]: def get_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Cursor, date:str) -> list[tuple[int, str]]:
""" """
1) get all the distinct entries for field_name after min_date_unix_time 1) get all the distinct entries for field_name after min_date_unix_time
2) call get_name_function with the distinct entry 2) call get_name_function with the distinct entry
@ -209,14 +242,14 @@ def get_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Curs
:returns [(request_count, name)] :returns [(request_count, name)]
""" """
ranking = [] ranking = []
cur.execute(f"SELECT DISTINCT {field_name} FROM {table} WHERE date >= {min_date_unix_time}") cur.execute(f"SELECT DISTINCT {field_name} FROM {table} WHERE {date}")
for name in cur.fetchall(): for name in cur.fetchall():
name = name[0] name = name[0]
if whitelist_regex: if whitelist_regex:
if not fullmatch(whitelist_regex, name): if not fullmatch(whitelist_regex, name):
continue continue
# ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename)) # ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename))
cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {field_name} = '{name}' AND date >= {min_date_unix_time}") cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {field_name} = '{name}' AND {date}")
ranking.append((cur.fetchone()[0], name)) ranking.append((cur.fetchone()[0], name))
ranking.sort() ranking.sort()
# print(ranking) # print(ranking)
@ -256,11 +289,11 @@ def plot_ranking(ranking: list[tuple[int, str]], fig=None, xlabel="", ylabel="",
ft = ranking[i][1].split(".")[-1] ft = ranking[i][1].split(".")[-1]
color = "blue" color = "blue"
if not color_settings: color = "blue" if not color_settings: color = "blue"
elif type(color_settings) == dict: elif isinstance(color_settings, dict):
for key, val in color_settings.items(): for key, val in color_settings.items():
if ft in val: color = key if ft in val: color = key
if not color: color = "blue" if not color: color = "blue"
elif type(color_settings) == list: elif isinstance(color_settings, list):
# print(color_settings, (i - start_index) % len(color_settings)) # print(color_settings, (i - start_index) % len(color_settings))
color = color_settings[(i - start_index) % len(color_settings)] color = color_settings[(i - start_index) % len(color_settings)]
colors.append(color) colors.append(color)
@ -340,27 +373,35 @@ def visualize(loaded_settings: dict):
img_dir = settings["img_dir"] img_dir = settings["img_dir"]
img_filetype = settings["img_filetype"] img_filetype = settings["img_filetype"]
names = { names = {
# general
"regina_version": settings["version"]
# paths # paths
"img_file_ranking": f"{img_dir}/ranking_all_time_files.{img_filetype}", "img_file_ranking_last_x_days": f"{img_dir}/ranking_all_time_files_last_x_days.{img_filetype}",
"img_referer_ranking": f"{img_dir}/ranking_all_time_referers.{img_filetype}", "img_referer_ranking_last_x_days": f"{img_dir}/ranking_all_time_referers_last_x_days.{img_filetype}",
"img_browser_ranking": f"{img_dir}/ranking_all_time_browsers.{img_filetype}", "img_browser_ranking_last_x_days": f"{img_dir}/ranking_all_time_browsers_last_x_days.{img_filetype}",
"img_operating_system_ranking": f"{img_dir}/ranking_all_time_operating_systems.{img_filetype}", "img_operating_system_ranking_last_x_days": f"{img_dir}/ranking_all_time_operating_systems_last_x_days.{img_filetype}",
"img_daily": f"{img_dir}/user_request_count_daily.{img_filetype}", "img_users_and_requests_last_x_days": f"{img_dir}/user_request_count_daily_last_x_days.{img_filetype}",
"img_file_ranking_total": f"{img_dir}/ranking_all_time_files_total.{img_filetype}",
"img_referer_ranking_total": f"{img_dir}/ranking_all_time_referers_total.{img_filetype}",
"img_browser_ranking_total": f"{img_dir}/ranking_all_time_browsers_total.{img_filetype}",
"img_operating_system_ranking_total": f"{img_dir}/ranking_all_time_operating_systems_total.{img_filetype}",
"img_users_and_requests_total": f"{img_dir}/user_request_count_daily_total.{img_filetype}",
# values # values
"mobile_user_percentage": 0.0, "mobile_user_percentage_total": 0.0,
"server-name": settings["server-name"], "mobile_user_percentage_last_x_days": 0.0,
"last_x_days": settings["last_x_days"], "user_count_x_days": 0,
# order matters! "user_count_total": 0,
"total_user_count_x_days": 0, "request_count_x_days": 0,
"total_request_count_x_days": 0, "request_count_total": 0,
"total_user_count": 0,
"total_request_count": 0,
"human_user_percentage_x_days": 0, "human_user_percentage_x_days": 0,
"human_request_percentage_x_days": 0, "human_request_percentage_x_days": 0,
"human_user_percentage": 0, "human_user_percentage_total": 0,
"human_request_percentage": 0, "human_request_percentage_total": 0,
# general
"regina_version": settings["version"],
"server-name": settings["server-name"],
"last_x_days": settings["last_x_days"], # must be after all the things with last_x_days!
"earliest_date": "1990-1-1",
"generation_date": "1990-1-1 0:0:0",
} }
conn = sql.connect(settings["db"]) conn = sql.connect(settings["db"])
@ -372,55 +413,94 @@ def visualize(loaded_settings: dict):
cur = conn.cursor() cur = conn.cursor()
get_humans = settings["get-human-percentage"] get_humans = settings["get-human-percentage"]
print("\t>>>>>>", get_humans) # DATE STRINGS
names["earliest_date"] = dt.fromtimestamp(get_earliest_date(cur)).strftime("%Y-%m-%d")
names["generation_date"] = dt.now().strftime("%Y-%m-%d %H:%M:%S")
# LAST_X_DAYS
# last_x_days_min_date: latest_date - last_x_days
secs_per_day = 86400
last_x_days_min_date = get_latest_date(cur) - settings["last_x_days"] * secs_per_day
last_x_days_str = get_where_date_str(min_date=last_x_days_min_date)
days = get_days(cur, last_x_days_str)
days_strs = [get_where_date_str(at_date=day) for day in days]
# files
file_ranking = get_file_ranking(cur) # ALL DATES
all_time_str = get_where_date_str(min_date=0)
# all months in yyyy-mm format
months_all_time = get_months(cur, all_time_str)
# sqlite constrict to month string
months_strs = []
for year_month in months_all_time:
year, month = year_month.split("-")
# first day of the month
min_date = dt(int(year), int(month), 1).timestamp()
month = (int(month) % 12) + 1 # + 1 month
year = int(year)
if month == 1: year += 1
# first day of the next month - 1 sec
max_date = dt(year, month, 1).timestamp() - 1
months_strs.append(get_where_date_str(min_date=min_date, max_date=max_date))
for i in range(2):
suffix = ["_total", "_last_x_days"][i]
date_str = [all_time_str, last_x_days_str][i]
date_names = [months_all_time, days][i]
date_strs = [months_strs, days_strs][i]
assert(len(date_names) == len(date_strs))
# FILES
file_ranking = get_file_ranking(cur, date_str)
if gen_img: if gen_img:
fig_file_ranking = plot_ranking(file_ranking, xlabel="Filename/Filegroup", ylabel="Number of requests", color_settings=color_settings_filetypes) fig_file_ranking = plot_ranking(file_ranking, xlabel="Filename/Filegroup", ylabel="Number of requests", color_settings=color_settings_filetypes)
fig_file_ranking.savefig(names["img_file_ranking"]) fig_file_ranking.savefig(names[f'img_file_ranking{suffix}'])
# referer # REFERER
referer_ranking = get_ranking("referer", t_request, settings["referer_ranking_regex_whitelist"], cur) referer_ranking = get_ranking("referer", t_request, settings["referer_ranking_regex_whitelist"], cur, date_str)
print("Referer ranking", referer_ranking)
if gen_img: if gen_img:
fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate) fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate)
fig_referer_ranking.savefig(names["img_referer_ranking"]) fig_referer_ranking.savefig(names[f'img_referer_ranking{suffix}'])
# dates # USER
dates = get_dates(cur) # user_agent_ranking = get_user_agent_ranking(cur, date_str)
# user # for the time span
user_agent_ranking = get_user_agent_ranking(cur) unique_user_ids = get_unique_user_ids_for_date(cur, date_str)
unique_user_ids_for_dates = [] unique_user_ids_human = get_human_users(cur, unique_user_ids)
unique_request_ids_for_dates = [] # for each date
unique_user_ids_for_dates_human = [] unique_user_ids_dates: list[list[int]] = []
unique_request_ids_for_dates_human = [] unique_request_ids_dates: list[list[int]] = []
for date in dates: unique_user_ids_human_dates: list[list[int]] = []
unique_user_ids_for_dates.append(get_unique_user_ids_for_date(cur, date)) unique_request_ids_human_dates: list[list[int]] = []
unique_request_ids_for_dates.append(get_unique_request_ids_for_date(cur, date)) for i in range(len(date_strs)):
date_str_ = date_strs[i]
unique_user_ids_dates.append(get_unique_user_ids_for_date(cur, date_str_))
unique_request_ids_dates.append(get_unique_request_ids_for_date(cur, date_str_))
if get_humans: if get_humans:
unique_user_ids_for_dates_human.append(get_unique_user_ids_for_date_human(cur, date)) unique_user_ids_human_dates.append(get_human_users(cur, unique_user_ids_dates[-1]))
unique_request_ids_for_dates_human.append([]) unique_request_ids_human_dates.append([])
for human in unique_user_ids_for_dates_human[-1]: for human in unique_user_ids_human_dates[-1]:
unique_request_ids_for_dates_human[-1] += get_unique_request_ids_for_date_and_user(cur, date, human) unique_request_ids_human_dates[-1] += get_unique_request_ids_for_date_and_user(cur, date_str_, human)
# print("\n\tuu", unique_user_ids_dates, "\n\tur",unique_request_ids_dates, "\n\tuuh", unique_user_ids_human_dates, "\n\turh", unique_request_ids_human_dates)
if get_humans: if get_humans:
try: try:
names["human_user_percentage_x_days"] = round(100 * len_list_list(unique_user_ids_for_dates_human) / len_list_list(unique_user_ids_for_dates), 2) names[f"human_user_percentage{suffix}"] = round(100 * len_list_list(unique_user_ids_human_dates) / len_list_list(unique_user_ids_dates), 2)
names["human_request_percentage_x_days"] = round(100 * len_list_list(unique_request_ids_for_dates_human) / len_list_list(unique_request_ids_for_dates), 2) names[f"human_request_percentage{suffix}"] = round(100 * len_list_list(unique_request_ids_human_dates) / len_list_list(unique_request_ids_dates), 2)
except: pass except: pass
print(">>>", len_list_list(unique_request_ids_for_dates), len_list_list(unique_request_ids_for_dates_human)) names[f"user_count{suffix}"] = len_list_list(unique_user_ids_dates)
names["total_user_count"] = sql_tablesize(cur, t_user) names[f"request_count{suffix}"] = len_list_list(unique_request_ids_dates)
names["total_request_count"] = sql_tablesize(cur, t_request) if gen_img:
names["total_user_count_x_days"] = len_list_list(unique_user_ids_for_dates) fig_daily, ax1, ax2, plots = plot2y(date_names, [len(user_ids) for user_ids in unique_user_ids_dates], [len(request_ids) for request_ids in unique_request_ids_dates], xlabel="Date", ylabel1="User count", label1="Unique users", ylabel2="Request count", label2="Unique requests", color1=palette["red"], color2=palette["blue"])
names["total_request_count_x_days"] = len_list_list(unique_request_ids_for_dates) if get_humans:
fig_daily, ax1, ax2, plots = plot2y(date_names, [len(user_ids) for user_ids in unique_user_ids_human_dates], [len(request_ids) for request_ids in unique_request_ids_human_dates], label1="Unique users (human)", ylabel2="Einzigartige Anfragen", label2="Unique requests (human)", color1=palette["orange"], color2=palette["green"], fig=fig_daily, ax1=ax1, ax2=ax2, plots=plots)
fig_daily.savefig(names[f"img_users_and_requests{suffix}"])
# os & browser # os & browser
os_ranking, browser_ranking, names["mobile_user_percentage"] = get_os_browser_mobile_rankings(user_agent_ranking) os_ranking, browser_ranking, names[f"mobile_user_percentage{suffix}"] = get_os_browser_mobile_rankings(cur, unique_user_ids_human)
if gen_img: if gen_img:
fig_os_rating = plot_ranking(os_ranking, xlabel="Platform", ylabel="Share [%]", color_settings=color_settings_operating_systems) fig_os_rating = plot_ranking(os_ranking, xlabel="Platform", ylabel="Share [%]", color_settings=color_settings_operating_systems)
fig_os_rating.savefig(names["img_operating_system_ranking"]) fig_os_rating.savefig(names[f"img_operating_system_ranking{suffix}"])
fig_browser_rating = plot_ranking(browser_ranking, xlabel="Browsers", ylabel="Share [%]", color_settings=color_settings_browsers) fig_browser_rating = plot_ranking(browser_ranking, xlabel="Browsers", ylabel="Share [%]", color_settings=color_settings_browsers)
fig_browser_rating.savefig(names["img_browser_ranking"]) fig_browser_rating.savefig(names[f"img_browser_ranking{suffix}"])
# print("File Ranking", file_ranking) # print("File Ranking", file_ranking)
# print("referer Ranking", referer_ranking) # print("referer Ranking", referer_ranking)
@ -429,15 +509,9 @@ def visualize(loaded_settings: dict):
# fig_daily, ax_daily_users = plot(dates, [len(user_ids) for user_ids in unique_user_ids_for_dates], xlabel="Datum", ylabel="Einzigartige Nutzer", label="Einzigartige Nutzer", color="blue") # fig_daily, ax_daily_users = plot(dates, [len(user_ids) for user_ids in unique_user_ids_for_dates], xlabel="Datum", ylabel="Einzigartige Nutzer", label="Einzigartige Nutzer", color="blue")
# fig_daily, ax_daily_requests = plot(dates, [len(request_ids) for request_ids in unique_request_ids_for_dates], fig=fig_daily, ax=ax_daily_users, xlabel="Datum", ylabel="Einzigartige Anfragen", label="Einzigartige Anfragen", color="orange") # fig_daily, ax_daily_requests = plot(dates, [len(request_ids) for request_ids in unique_request_ids_for_dates], fig=fig_daily, ax=ax_daily_users, xlabel="Datum", ylabel="Einzigartige Anfragen", label="Einzigartige Anfragen", color="orange")
# fig_daily.savefig(f"{img_dir}/daily.{img_filetype}") # fig_daily.savefig(f"{img_dir}/daily.{img_filetype}")
if gen_img: # print("OS ranking", os_ranking)
fig_daily, ax1, ax2, plots = plot2y(dates, [len(user_ids) for user_ids in unique_user_ids_for_dates], [len(request_ids) for request_ids in unique_request_ids_for_dates], xlabel="Date", ylabel1="User count", label1="Unique users", ylabel2="Request count", label2="Unique requests", color1=palette["red"], color2=palette["blue"]) # print("Browser ranking", browser_ranking)
if get_humans: # print("Mobile percentage", names["mobile_user_percentage"])
fig_daily, ax1, ax2, plots = plot2y(dates, [len(user_ids) for user_ids in unique_user_ids_for_dates_human], [len(request_ids) for request_ids in unique_request_ids_for_dates_human], label1="Unique users (human)", ylabel2="Einzigartige Anfragen", label2="Unique requests (human)", color1=palette["orange"], color2=palette["green"], fig=fig_daily, ax1=ax1, ax2=ax2, plots=plots)
fig_daily.savefig(names["img_daily"])
print("OS ranking", os_ranking)
print("Browser ranking", browser_ranking)
print("Mobile percentage", names["mobile_user_percentage"])
print(dates, "\n\tuu", unique_user_ids_for_dates, "\n\tur",unique_request_ids_for_dates, "\n\tuuh", unique_user_ids_for_dates_human, "\n\turh", unique_request_ids_for_dates_human)
if settings["template_html"] and settings["html_out_path"]: if settings["template_html"] and settings["html_out_path"]:
with open(settings["template_html"], "r") as file: with open(settings["template_html"], "r") as file:
html = file.read() html = file.read()
@ -445,5 +519,3 @@ def visualize(loaded_settings: dict):
html = html.replace(f"%{name}", str(value)) html = html.replace(f"%{name}", str(value))
with open(settings["html_out_path"], "w") as file: with open(settings["html_out_path"], "w") as file:
file.write(html) file.write(html)

69
template.html Normal file
View File

@ -0,0 +1,69 @@
<html>
<head>
<meta http-equiv="content-type" content="text/html">
<meta charset="utf-8">
<meta name="description" content="Regina - Nginx Analytics">
<meta name="keywords" content="">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Analytics for %server-name</title>
<link rel="stylesheet" href="style.css">
</head>
<body>
<h1>Analytics for %server-name</h1>
<div class=box>
<center>
<h2>Last %last_x_days days</h2>
<hr>
<h3>User and request count (per day)</h3>
<img src="%img_users_and_requests_last_x_days" alt="Daily Statistics" title="User and request count for the last %last_x_days days">
<ul>
<li>user count: <b>%user_count_last_x_days</b>, from which <b>%human_user_percentage_last_x_days%</b> are human</li>
<li>request count: <b>%request_count_last_x_days</b>, from which <b>%human_request_percentage_last_x_days%</b> came from human users </li>
</ul>
<hr>
<h3>File access</h3>
<img src="%img_file_ranking_last_x_days" alt="File ranking" title="File ranking">
<hr>
<h3>Platforms and browsers</h3>
<img src="%img_operating_system_ranking_last_x_days" alt="Operating system ranking" title="Operating system ranking">
<img src="%img_browser_ranking_last_x_days" alt="Browser ranking" title="Browser ranking">
<h4>Mobile users: %mobile_user_percentage_last_x_days%</h4>
<hr>
<h3>Referrers</h3>
<img src="%img_referer_ranking_last_x_days" alt="Referer ranking" title="Referer ranking">
<hr>
</center>
</div>
<div class=box>
<center>
<h2>Total (since %earliest_date)</h2>
<hr>
<h3>User and request count (per month)</h3>
<img src="%img_users_and_requests_total" alt="Monthly Statistics" title="User and request count">
<ul>
<li>Total user count: <b>%user_count_total</b>, from which <b>%human_user_percentage_total%</b> are human</li>
<li>Total request count: <b>%request_count_total</b>, from which <b>%human_request_percentage_total%</b> came from human users </li>
</ul>
<hr>
<h3>File access</h3>
<img src="%img_file_ranking_total" alt="File ranking" title="File ranking">
<hr>
<h3>Platforms and browsers</h3>
<img src="%img_operating_system_ranking_total" alt="Operating system ranking" title="Operating system ranking">
<img src="%img_browser_ranking_total" alt="Browser ranking" title="Browser ranking">
<h4>Mobile users: %mobile_user_percentage_total%</h4>
<hr>
<h3>Referrers</h3>
<img src="%img_referer_ranking_total" alt="Referer ranking" title="Referer ranking">
<hr>
</center>
</div>
<p>These analytics were generated by <a href="https://git.quintern.xyz/MatthiasQuintern/regina">regina %regina_version</a> at %generation_date</p>
</body>
</html>