From 3007f1ff8db5d29edb5988798b93f6dfe46b2320 Mon Sep 17 00:00:00 2001 From: "matthias@arch" Date: Mon, 28 Nov 2022 01:01:06 +0100 Subject: [PATCH] changed import, added referer cleanup --- default.conf | 31 +++++++++++---- regina/db_operation/collect.py | 26 ++++++++++--- regina/db_operation/database.py | 4 +- regina/db_operation/visualize.py | 67 +++++++++++++++++++++++++++----- regina/main.py | 23 +++++++---- regina/utility/globals.py | 5 +++ regina/utility/utility.py | 2 +- 7 files changed, 125 insertions(+), 33 deletions(-) diff --git a/default.conf b/default.conf index 2be8d2a..d351099 100644 --- a/default.conf +++ b/default.conf @@ -8,29 +8,45 @@ db = /home/my_user/analytics/my_website.db # these changes will only apply to newly collected data/creation of new database # path to the nginx access log to parse. access_log = /home/my_user/analytics/access.log + # nginx locations and their root directory: location:directory,location:directory,... locs_and_dirs = /:/www/my_website,/error:/www/error # filetypes that should be grouped (comma separated) auto_group_filetypes = png,jpg,jpeg,gif,svg,css,ico,pdf,txt -# wether a request with 30x http status counts as success -status_300_is_success = False -# wether a user needs to make at least 1 successful request to be a human -humans_need_success = True -# filegroups, eg group index.html and home.html +# group certain files filegroups = home:index.html,home.html;images:image1.png,image2.png # filegroups = +# whether a request with 30x http status counts as success +status_300_is_success = False +# if False, unique user is (ip-address - user agent) pair, if True only ip address +unique_user_is_ip_address = False +# whether a user needs to make at least 1 successful request to be a human +humans_need_success = True + +# don't collect requests to locations matched by this +request_location_regex_blacklist = /analytics.* + # VISUALIZATION # separate users into all and humans get_human_percentage = True # regex expression as 
whitelist for file ranking -# file_ranking_regex_whitelist = .*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif)) -file_ranking_regex_whitelist = + +# ignore the protocol in referers, so https://url.com = http://url.com -> url.com +referer_ranking_ignore_protocol = True +# ignore the subdomains in referers, so foo.url.com = bar.url.com -> url.com +referer_ranking_ignore_subdomain = False +# ignore the location in referers, so url.com/foo = url.com/bar -> url.com +referer_ranking_ignore_location = True # regex expression as whitelist for referer ranking, minus means empty # eg: exclude empty referers referer_ranking_regex_whitelist = ^[^\-].* + # regex expression as whitelist for user agent ranking user_agent_ranking_regex_whitelist = + +# file_ranking_regex_whitelist = .*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif)) +file_ranking_regex_whitelist = # maximum number of file(group)s on the file ranking file_ranking_plot_max_files = 20 # wether to ignore non existing files in the ranking @@ -38,6 +54,7 @@ file_ranking_ignore_error_files = True # "plot_figsize" = (60 40), plot_dpi = 300 + # output directory for the generated plots img_dir = /www/analytics/images # nginx location for the generated images, its root must be img_dir diff --git a/regina/db_operation/collect.py b/regina/db_operation/collect.py index 5e16885..3b50e39 100644 --- a/regina/db_operation/collect.py +++ b/regina/db_operation/collect.py @@ -2,10 +2,10 @@ import sqlite3 as sql from re import match from time import mktime from datetime import datetime as dt -from db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup -from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize -from utility.utility import pdebug, warning -from utility.globals import user_agent_operating_systems, user_agent_browsers, settings +from regina.db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, 
get_filegroup +from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize +from regina.utility.utility import pdebug, warning +from regina.utility.globals import user_agent_operating_systems, user_agent_browsers, settings """ collect information from the access log and put it into the database @@ -73,13 +73,23 @@ def parse_log(logfile:str) -> list[Request]: status=g[4], bytes_sent=g[5], referer=g[6], user_agent=g[7])) return requests + +def user_exists(cursor, request) -> bool: + if settings["unique_user_is_ip_address"]: + return sql_exists(cursor, t_user, [("ip_address", request.ip_address)]) + else: + return sql_exists(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)]) + def get_user_id(request: Request, cursor: sql.Cursor) -> int: """ get the user_id. Adds the user if not already existing """ # if user exists - if sql_exists(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)]): - user_id = sql_select(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)])[0][0] + if user_exists(cursor, request): + if settings["unique_user_is_ip_address"]: + user_id = sql_select(cursor, t_user, [("ip_address", request.ip_address)])[0][0] + else: + user_id = sql_select(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)])[0][0] else: # new user # new user_id is number of elements user_id: int = sql_tablesize(cursor, t_user) @@ -139,8 +149,12 @@ def add_requests_to_db(requests: list[Request], db_name: str): cursor = conn.cursor() # check the new users later max_user_id = sql_tablesize(cursor, t_user) + request_blacklist = settings["request_location_regex_blacklist"] for i in range(len(requests)): request = requests[i] + # skip requests to blacklisted locations + if request_blacklist: + if match(request_blacklist, request.request_file): continue # pdebug("add_requests_to_db:", i, "request:", request) 
user_id = get_user_id(request, cursor) conn.commit() diff --git a/regina/db_operation/database.py b/regina/db_operation/database.py index 0819a8f..8e86374 100644 --- a/regina/db_operation/database.py +++ b/regina/db_operation/database.py @@ -3,8 +3,8 @@ import sqlite3 as sql from os import path, listdir # local -from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize -from utility.utility import pdebug +from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize +from regina.utility.utility import pdebug """ create reginas database as shown in the uml diagram database.uxf diff --git a/regina/db_operation/visualize.py b/regina/db_operation/visualize.py index 262486a..fd4be19 100644 --- a/regina/db_operation/visualize.py +++ b/regina/db_operation/visualize.py @@ -9,17 +9,14 @@ from datetime import datetime as dt from numpy import empty # local -from db_operation.database import t_request, t_user, t_file, t_filegroup -from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where -from utility.utility import pdebug, warning, missing_arg -from utility.globals import settings +from regina.db_operation.database import t_request, t_user, t_file, t_filegroup +from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where +from regina.utility.utility import pdebug, warning, missing_arg +from regina.utility.globals import settings """ visualize information from the databse -TODO: -- bei referrers ähnliche zusammenlegen, z.b. 
www.google.de und https://google.com -- ignore 404 """ palette = { @@ -141,12 +138,18 @@ def get_where_date_str(at_date=None, min_date=None, max_date=None): def get_earliest_date(cur: sql.Cursor) -> int: """return the earliest time as unixepoch""" cur.execute(f"SELECT MIN(date) FROM {t_request}") - return cur.fetchone()[0] + date = cur.fetchone()[0] + if not isinstance(date, int): return 0 + else: return date + # get the latest date def get_latest_date(cur: sql.Cursor) -> int: """return the latest time as unixepoch""" cur.execute(f"SELECT MAX(date) FROM {t_request}") - return cur.fetchone()[0] + date = cur.fetchone()[0] + if not isinstance(date, int): return 0 + else: return date + # get all dates # the date:str parameter in all these function must be a sqlite constraint def get_days(cur: sql.Cursor, date:str) -> list[str]: @@ -296,6 +299,48 @@ def get_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Curs # print(ranking) return ranking +re_uri_protocol = f"(https?)://" +re_uri_ipv4 = r"(?:(?:(?:\d{1,3}\.?){4})(?::\d+)?)" +# re_uri_ipv6 = "" +re_uri_domain = r"(?:([^/]+\.)*[^/]+\.[a-zA-Z]{2,})" +re_uri_location = r"(?:/(.*))?" 
+re_uri_full = f"{re_uri_protocol}({re_uri_domain}|{re_uri_ipv4})({re_uri_location})" + +def cleanup_referer(referer: str) -> str: + """ + split the referer uri into its parts and reassemble them depending on settings + """ + m = fullmatch(re_uri_full, referer) + if not m: + pdebug(f"cleanup_referer: Could not match referer '{referer}'") + return referer + # pdebug(f"cleanup_referer: {referer} - {m.groups()}") + protocol = m.groups()[0] + subdomains = m.groups()[2] + if not subdomains: subdomains = "" + domain = m.groups()[1].replace(subdomains, "") + location = m.groups()[3] + + referer = domain + if not settings["referer_ranking_ignore_subdomain"]: referer = subdomains + referer + if not settings["referer_ranking_ignore_protocol"]: referer = protocol + "://" + referer + if not settings["referer_ranking_ignore_location"]: referer += location + # pdebug(f"cleanup_referer: cleaned up: {referer}") + return referer + +def cleanup_referer_ranking(referer_ranking: list[tuple[int, str]]): + unique_referers = dict() + for count, referer in referer_ranking: + referer = cleanup_referer(referer) + if referer in unique_referers: + unique_referers[referer] += count + else: + unique_referers[referer] = count + referer_ranking.clear() + for referer, count in unique_referers.items(): + referer_ranking.append((count, referer)) + referer_ranking.sort() + # # PLOTTING @@ -463,7 +508,8 @@ def visualize(loaded_settings: dict): get_humans = settings["get_human_percentage"] # pdebug(f"visualize: settings {settings}") # DATE STRINGS - names["earliest_date"] = dt.fromtimestamp(get_earliest_date(cur)).strftime("%Y-%m-%d") + earliest_date = get_earliest_date(cur) + names["earliest_date"] = dt.fromtimestamp(earliest_date).strftime("%Y-%m-%d") names["generation_date"] = dt.now().strftime("%Y-%m-%d %H:%M:%S") # LAST_X_DAYS # last_x_days_min_date: latest_date - last_x_days @@ -506,6 +552,7 @@ def visualize(loaded_settings: dict): # REFERER referer_ranking = get_ranking("referer", t_request, 
settings["referer_ranking_regex_whitelist"], cur, date_str) + cleanup_referer_ranking(referer_ranking) if gen_img: fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate) fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_referer_ranking{suffix}']}") diff --git a/regina/main.py b/regina/main.py index fc0d744..8e457cf 100644 --- a/regina/main.py +++ b/regina/main.py @@ -3,14 +3,23 @@ # __package__="." from sys import argv, exit from os.path import isfile -from db_operation.collect import parse_log, add_requests_to_db -from db_operation.database import create_db -from db_operation.visualize import visualize -from utility.settings_manager import read_settings_file -from utility.globals import settings, version +from regina.db_operation.collect import parse_log, add_requests_to_db +from regina.db_operation.database import create_db +from regina.db_operation.visualize import visualize +from regina.utility.settings_manager import read_settings_file +from regina.utility.globals import settings, version """ start regina, launch either collect or visualize +TODO: +- bei referrers ähnliche zusammenlegen, z.b. 
www.google.de und https://google.com +- optionen: + - unique user = ip address + - max requests/time + +- wenn datenbankgröße zum problem wird: + - referrer table die die schon zusammengelegten referrer enthält, request verlinkt nur mit id + - selbes für platforms und browsers """ @@ -71,9 +80,9 @@ def main(): error(f"Not a file: '{config_file}'") read_settings_file(config_file, settings) settings["version"] = version - if log_file: settings["access-log"] = log_file + if log_file: settings["access_log"] = log_file - print(f"regina version {version} with server-name '{settings['server_name']}' and database '{settings['db']}'") + print(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}' and logfile '{settings['access_log']}'") if not settings["server_name"]: missing_arg("server-name") if not settings["access_log"]: missing_arg("log") diff --git a/regina/utility/globals.py b/regina/utility/globals.py index 14a2165..7355bd9 100644 --- a/regina/utility/globals.py +++ b/regina/utility/globals.py @@ -12,6 +12,8 @@ settings = { "locs_and_dirs": [], "auto_group_filetypes": [], "filegroups": "", + "request_location_regex_blacklist": "", + "unique_user_is_ip_address": False, # VISUALIZATION "get_human_percentage": False, @@ -20,6 +22,9 @@ settings = { # "file_ranking_regex_whitelist": r".*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))", "file_ranking_regex_whitelist": r".*\.(html)", "file_ranking_ignore_error_files": False, # skip files that only had unsuccessful requests (status < 300) + "referer_ranking_ignore_protocol": True, + "referer_ranking_ignore_subdomain": False, + "referer_ranking_ignore_location": True, "referer_ranking_regex_whitelist": r"^[^\-].*", # minus means empty "user_agent_ranking_regex_whitelist": r"", "file_ranking_plot_max_files": 15, diff --git a/regina/utility/utility.py b/regina/utility/utility.py index 42a4299..00a31e8 100644 --- a/regina/utility/utility.py +++ 
b/regina/utility/utility.py @@ -6,7 +6,7 @@ from sys import exit Various utitity """ -DEBUG = False +DEBUG = True def pdebug(*args): if DEBUG: print(*args)