changed import, added referer cleanup

This commit is contained in:
matthias@arch 2022-11-28 01:01:06 +01:00
parent 691d45d311
commit 3007f1ff8d
7 changed files with 125 additions and 33 deletions

View File

@@ -8,29 +8,45 @@ db = /home/my_user/analytics/my_website.db
# these changes only apply to newly collected data / a newly created database
# path to the nginx access log to parse.
access_log = /home/my_user/analytics/access.log
# nginx locations and their root directory: location:directory,location:directory,...
locs_and_dirs = /:/www/my_website,/error:/www/error
# filetypes that should be grouped (comma separated)
auto_group_filetypes = png,jpg,jpeg,gif,svg,css,ico,pdf,txt
# whether a request with 30x http status counts as success
status_300_is_success = False
# whether a user needs to make at least 1 successful request to be a human
humans_need_success = True
# filegroups, e.g. group index.html and home.html
# group certain files
filegroups = home:index.html,home.html;images:image1.png,image2.png
# filegroups =
# whether a request with 30x http status counts as success
status_300_is_success = False
# if False, a unique user is an (ip address, user agent) pair; if True, only the ip address
unique_user_is_ip_address = False
# whether a user needs to make at least 1 successful request to be a human
humans_need_success = True
# don't collect requests to locations matched by this regex
request_location_regex_blacklist = /analytics.*
# VISUALIZATION
# separate users into all and humans
get_human_percentage = True
# regex whitelist for the file ranking
# file_ranking_regex_whitelist = .*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))
file_ranking_regex_whitelist =
# ignore the protocol in referers, so https://url.com = http://url.com -> url.com
referer_ranking_ignore_protocol = True
# ignore the subdomains in referers, so foo.url.com = bar.url.com -> url.com
referer_ranking_ignore_subdomain = False
# ignore the location in referers, so url.com/foo = url.com/bar -> url.com
referer_ranking_ignore_location = True
# regex whitelist for the referer ranking; a minus ('-') stands for an empty referer
# e.g.: exclude empty referers
referer_ranking_regex_whitelist = ^[^\-].*
# regex whitelist for the user agent ranking
user_agent_ranking_regex_whitelist =
# file_ranking_regex_whitelist = .*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))
file_ranking_regex_whitelist =
# maximum number of file(group)s on the file ranking
file_ranking_plot_max_files = 20
# whether to ignore non-existing files in the ranking
@@ -38,6 +54,7 @@ file_ranking_ignore_error_files = True
# "plot_figsize" = (60 40),
plot_dpi = 300
# output directory for the generated plots
img_dir = /www/analytics/images
# nginx location for the generated images, its root must be img_dir

View File

@@ -2,10 +2,10 @@ import sqlite3 as sql
from re import match
from time import mktime
from datetime import datetime as dt
from db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup
from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
from utility.utility import pdebug, warning
from utility.globals import user_agent_operating_systems, user_agent_browsers, settings
from regina.db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
from regina.utility.utility import pdebug, warning
from regina.utility.globals import user_agent_operating_systems, user_agent_browsers, settings
"""
collect information from the access log and put it into the database
@@ -73,13 +73,23 @@ def parse_log(logfile:str) -> list[Request]:
status=g[4], bytes_sent=g[5], referer=g[6], user_agent=g[7]))
return requests
def user_exists(cursor, request) -> bool:
if settings["unique_user_is_ip_address"]:
return sql_exists(cursor, t_user, [("ip_address", request.ip_address)])
else:
return sql_exists(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)])
def get_user_id(request: Request, cursor: sql.Cursor) -> int:
"""
get the user_id. Add the user if not already present
"""
# if user exists
if sql_exists(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)]):
user_id = sql_select(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)])[0][0]
if user_exists(cursor, request):
if settings["unique_user_is_ip_address"]:
user_id = sql_select(cursor, t_user, [("ip_address", request.ip_address)])[0][0]
else:
user_id = sql_select(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)])[0][0]
else: # new user
# new user_id is number of elements
user_id: int = sql_tablesize(cursor, t_user)
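The new unique_user_is_ip_address switch only changes which columns identify a user: the ip address alone, or the (ip address, user agent) pair used before. A conceptual sketch with plain Python sets rather than the sql_* helpers (the sample values are made up):

requests = [("1.2.3.4", "Firefox"), ("1.2.3.4", "Chrome"), ("5.6.7.8", "Firefox")]

def count_unique_users(requests, unique_user_is_ip_address):
    users = set()
    for ip_address, user_agent in requests:
        # key by ip address only, or by the (ip address, user agent) pair,
        # mirroring the lookups in user_exists/get_user_id above
        users.add(ip_address if unique_user_is_ip_address else (ip_address, user_agent))
    return len(users)

print(count_unique_users(requests, True))   # 2: the same ip with two browsers counts once
print(count_unique_users(requests, False))  # 3: each (ip, user agent) pair is its own user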
@@ -139,8 +149,12 @@ def add_requests_to_db(requests: list[Request], db_name: str):
cursor = conn.cursor()
# check the new users later
max_user_id = sql_tablesize(cursor, t_user)
request_blacklist = settings["request_location_regex_blacklist"]
for i in range(len(requests)):
request = requests[i]
# skip requests to blacklisted locations
if request_blacklist:
if match(request_blacklist, request.request_file): continue
# pdebug("add_requests_to_db:", i, "request:", request)
user_id = get_user_id(request, cursor)
conn.commit()
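The blacklist is applied with re.match, which anchors at the start of the requested location, so a value like the /analytics.* from the example config skips everything under that path. A minimal standalone sketch (pattern and paths are illustrative):

import re

request_blacklist = r"/analytics.*"  # same shape as request_location_regex_blacklist above

for location in ["/analytics/index.html", "/blog/post.html"]:
    if request_blacklist and re.match(request_blacklist, location):
        print("skipped:  ", location)  # matched the blacklist, not added to the database
    else:
        print("collected:", location)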

View File

@@ -3,8 +3,8 @@
import sqlite3 as sql
from os import path, listdir
# local
from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
from utility.utility import pdebug
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
from regina.utility.utility import pdebug
"""
create regina's database as shown in the UML diagram database.uxf

View File

@@ -9,17 +9,14 @@ from datetime import datetime as dt
from numpy import empty
# local
from db_operation.database import t_request, t_user, t_file, t_filegroup
from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where
from utility.utility import pdebug, warning, missing_arg
from utility.globals import settings
from regina.db_operation.database import t_request, t_user, t_file, t_filegroup
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where
from regina.utility.utility import pdebug, warning, missing_arg
from regina.utility.globals import settings
"""
visualize information from the database
TODO:
- merge similar referrers, e.g. www.google.de and https://google.com
- ignore 404
"""
palette = {
@@ -141,12 +138,18 @@ def get_where_date_str(at_date=None, min_date=None, max_date=None):
def get_earliest_date(cur: sql.Cursor) -> int:
"""return the earliest time as unixepoch"""
cur.execute(f"SELECT MIN(date) FROM {t_request}")
return cur.fetchone()[0]
date = cur.fetchone()[0]
if not isinstance(date, int): return 0
else: return date
# get the latest date
def get_latest_date(cur: sql.Cursor) -> int:
"""return the latest time as unixepoch"""
cur.execute(f"SELECT MAX(date) FROM {t_request}")
return cur.fetchone()[0]
date = cur.fetchone()[0]
if not isinstance(date, int): return 0
else: return date
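With an empty request table, MIN(date)/MAX(date) return NULL, which arrives as None in Python; falling back to 0 keeps dt.fromtimestamp in visualize from raising and simply yields the unix epoch as the earliest/latest date. A tiny illustration (the None stands in for cur.fetchone()[0] on an empty table):

from datetime import datetime as dt

fetched = None  # what MIN(date)/MAX(date) yield when there are no requests yet
date = fetched if isinstance(fetched, int) else 0
print(dt.fromtimestamp(date).strftime("%Y-%m-%d"))  # 1970-01-01 (for timezones at or east of UTC)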
# get all dates
# the date:str parameter in all these functions must be a sqlite constraint
def get_days(cur: sql.Cursor, date:str) -> list[str]:
@@ -296,6 +299,48 @@ def get_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Curs
# print(ranking)
return ranking
re_uri_protocol = f"(https?)://"
re_uri_ipv4 = r"(?:(?:(?:\d{1,3}\.?){4})(?::\d+)?)"
# re_uri_ipv6 = ""
re_uri_domain = r"(?:([^/]+\.)*[^/]+\.[a-zA-Z]{2,})"
re_uri_location = r"(?:/(.*))?"
re_uri_full = f"{re_uri_protocol}({re_uri_domain}|{re_uri_ipv4})({re_uri_location})"
def cleanup_referer(referer: str) -> str:
"""
split the referer URI into its parts and reassemble them depending on the settings
"""
m = fullmatch(re_uri_full, referer)
if not m:
pdebug(f"cleanup_referer: Could not match referer '{referer}'")
return referer
# pdebug(f"cleanup_referer: {referer} - {m.groups()}")
protocol = m.groups()[0]
subdomains = m.groups()[2]
if not subdomains: subdomains = ""
domain = m.groups()[1].replace(subdomains, "")
location = m.groups()[3]
referer = domain
if not settings["referer_ranking_ignore_subdomain"]: referer = subdomains + referer
if not settings["referer_ranking_ignore_protocol"]: referer = protocol + "://" + referer
if not settings["referer_ranking_ignore_location"]: referer += location
# pdebug(f"cleanup_referer: cleaned up: {referer}")
return referer
def cleanup_referer_ranking(referer_ranking: list[tuple[int, str]]):
unique_referers = dict()
for count, referer in referer_ranking:
referer = cleanup_referer(referer)
if referer in unique_referers:
unique_referers[referer] += count
else:
unique_referers[referer] = count
referer_ranking.clear()
for referer, count in unique_referers.items():
referer_ranking.append((count, referer))
referer_ranking.sort()
#
# PLOTTING
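Taken together, cleanup_referer normalizes a single referer according to the three referer_ranking_ignore_* settings, and cleanup_referer_ranking merges the counts of referers that become identical after normalization. A rough usage sketch, assuming the defaults shown in the config above (ignore protocol and location, keep the subdomain); the URLs and counts are made up:

# cleanup_referer("https://www.example.com/some/page") -> "www.example.com"
# cleanup_referer("http://www.example.com/")           -> "www.example.com"

referer_ranking = [(3, "https://www.example.com/some/page"), (2, "http://www.example.com/")]
cleanup_referer_ranking(referer_ranking)
print(referer_ranking)  # [(5, 'www.example.com')] - counts of referers that clean up to the same value are merged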
@@ -463,7 +508,8 @@ def visualize(loaded_settings: dict):
get_humans = settings["get_human_percentage"]
# pdebug(f"visualize: settings {settings}")
# DATE STRINGS
names["earliest_date"] = dt.fromtimestamp(get_earliest_date(cur)).strftime("%Y-%m-%d")
earliest_date = get_earliest_date(cur)
names["earliest_date"] = dt.fromtimestamp(earliest_date).strftime("%Y-%m-%d")
names["generation_date"] = dt.now().strftime("%Y-%m-%d %H:%M:%S")
# LAST_X_DAYS
# last_x_days_min_date: latest_date - last_x_days
@@ -506,6 +552,7 @@ def visualize(loaded_settings: dict):
# REFERER
referer_ranking = get_ranking("referer", t_request, settings["referer_ranking_regex_whitelist"], cur, date_str)
cleanup_referer_ranking(referer_ranking)
if gen_img:
fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate)
fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_referer_ranking{suffix}']}")

View File

@@ -3,14 +3,23 @@
# __package__="."
from sys import argv, exit
from os.path import isfile
from db_operation.collect import parse_log, add_requests_to_db
from db_operation.database import create_db
from db_operation.visualize import visualize
from utility.settings_manager import read_settings_file
from utility.globals import settings, version
from regina.db_operation.collect import parse_log, add_requests_to_db
from regina.db_operation.database import create_db
from regina.db_operation.visualize import visualize
from regina.utility.settings_manager import read_settings_file
from regina.utility.globals import settings, version
"""
start regina, launch either collect or visualize
TODO:
- merge similar referrers, e.g. www.google.de and https://google.com
- options:
- unique user = ip address
- max requests/time
- if the database size becomes a problem:
- referrer table that contains the already merged referrers, requests only link to it by id
- same for platforms and browsers
"""
@@ -71,9 +80,9 @@ def main():
error(f"Not a file: '{config_file}'")
read_settings_file(config_file, settings)
settings["version"] = version
if log_file: settings["access-log"] = log_file
if log_file: settings["access_log"] = log_file
print(f"regina version {version} with server-name '{settings['server_name']}' and database '{settings['db']}'")
print(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}' and logfile '{settings['access_log']}'")
if not settings["server_name"]: missing_arg("server-name")
if not settings["access_log"]: missing_arg("log")

View File

@@ -12,6 +12,8 @@ settings = {
"locs_and_dirs": [],
"auto_group_filetypes": [],
"filegroups": "",
"request_location_regex_blacklist": "",
"unique_user_is_ip_address": False,
# VISUALIZATION
"get_human_percentage": False,
@@ -20,6 +22,9 @@ settings = {
# "file_ranking_regex_whitelist": r".*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))",
"file_ranking_regex_whitelist": r".*\.(html)",
"file_ranking_ignore_error_files": False, # skip files that only had unsuccessful requests (status < 300)
"referer_ranking_ignore_protocol": True,
"referer_ranking_ignore_subdomain": False,
"referer_ranking_ignore_location": True,
"referer_ranking_regex_whitelist": r"^[^\-].*", # minus means empty
"user_agent_ranking_regex_whitelist": r"",
"file_ranking_plot_max_files": 15,

View File

@@ -6,7 +6,7 @@ from sys import exit
Various utilities
"""
DEBUG = False
DEBUG = True
def pdebug(*args):
if DEBUG: print(*args)