changed import, added referer cleanup

This commit is contained in:
matthias@arch 2022-11-28 01:01:06 +01:00
parent 691d45d311
commit 3007f1ff8d
7 changed files with 125 additions and 33 deletions

View File

@ -8,29 +8,45 @@ db = /home/my_user/analytics/my_website.db
# these changes will only apply to newly collected data/creation of new database # these changes will only apply to newly collected data/creation of new database
# path to the nginx access log to parse. # path to the nginx access log to parse.
access_log = /home/my_user/analytics/access.log access_log = /home/my_user/analytics/access.log
# nginx locations and their root directory: location:directory,location:directory,... # nginx locations and their root directory: location:directory,location:directory,...
locs_and_dirs = /:/www/my_website,/error:/www/error locs_and_dirs = /:/www/my_website,/error:/www/error
# filetypes that should be grouped (comma separated) # filetypes that should be grouped (comma separated)
auto_group_filetypes = png,jpg,jpeg,gif,svg,css,ico,pdf,txt auto_group_filetypes = png,jpg,jpeg,gif,svg,css,ico,pdf,txt
# wether a request with 30x http status counts as success # group certain files
status_300_is_success = False
# wether a user needs to make at least 1 successful request to be a human
humans_need_success = True
# filegroups, eg group index.html and home.html
filegroups = home:index.html,home.html;images:image1.png,image2.png filegroups = home:index.html,home.html;images:image1.png,image2.png
# filegroups = # filegroups =
# whether a request with 30x http status counts as success
status_300_is_success = False
# if False, a unique user is an (ip address, user agent) pair; if True, only the ip address
unique_user_is_ip_address = False
# whether a user needs to make at least 1 successful request to be a human
humans_need_success = True
# don't collect requests to locations matched by this regex
request_location_regex_blacklist = /analytics.*
# VISUALIZATION # VISUALIZATION
# separate users into all and humans # separate users into all and humans
get_human_percentage = True get_human_percentage = True
# regex expression as whitelist for file ranking # regex expression as whitelist for file ranking
# file_ranking_regex_whitelist = .*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))
file_ranking_regex_whitelist = # ignore the protocol in referers, so https://url.com = http://url.com -> url.com
referer_ranking_ignore_protocol = True
# ignore the subdomains in referers, so foo.url.com = bar.url.com -> url.com
referer_ranking_ignore_subdomain = False
# ignore the location in referers, so url.com/foo = url.com/bar -> url.com
referer_ranking_ignore_location = True
# regex expression as whitelist for referer ranking, minus means empty # regex expression as whitelist for referer ranking, minus means empty
# eg: exclude empty referers # eg: exclude empty referers
referer_ranking_regex_whitelist = ^[^\-].* referer_ranking_regex_whitelist = ^[^\-].*
# regex expression as whitelist for user agent ranking # regex expression as whitelist for user agent ranking
user_agent_ranking_regex_whitelist = user_agent_ranking_regex_whitelist =
# file_ranking_regex_whitelist = .*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))
file_ranking_regex_whitelist =
# maximum number of file(group)s on the file ranking # maximum number of file(group)s on the file ranking
file_ranking_plot_max_files = 20 file_ranking_plot_max_files = 20
# whether to ignore non existing files in the ranking # whether to ignore non existing files in the ranking
@ -38,6 +54,7 @@ file_ranking_ignore_error_files = True
# "plot_figsize" = (60 40), # "plot_figsize" = (60 40),
plot_dpi = 300 plot_dpi = 300
# output directory for the generated plots # output directory for the generated plots
img_dir = /www/analytics/images img_dir = /www/analytics/images
# nginx location for the generated images, its root must be img_dir # nginx location for the generated images, its root must be img_dir

View File

@ -2,10 +2,10 @@ import sqlite3 as sql
from re import match from re import match
from time import mktime from time import mktime
from datetime import datetime as dt from datetime import datetime as dt
from db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup from regina.db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup
from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
from utility.utility import pdebug, warning from regina.utility.utility import pdebug, warning
from utility.globals import user_agent_operating_systems, user_agent_browsers, settings from regina.utility.globals import user_agent_operating_systems, user_agent_browsers, settings
""" """
collect information from the access log and put it into the database collect information from the access log and put it into the database
@ -73,12 +73,22 @@ def parse_log(logfile:str) -> list[Request]:
status=g[4], bytes_sent=g[5], referer=g[6], user_agent=g[7])) status=g[4], bytes_sent=g[5], referer=g[6], user_agent=g[7]))
return requests return requests
def user_exists(cursor, request) -> bool:
    """
    Check whether the user that made the request is already in the user table.
    If the setting "unique_user_is_ip_address" is set, a user is looked up by
    ip address only, otherwise by ip address and user agent.
    """
    ip_only = settings["unique_user_is_ip_address"]
    constraints = [("ip_address", request.ip_address)]
    if not ip_only:
        # the user agent is part of the user identity in this mode
        constraints.append(("user_agent", request.user_agent))
    return sql_exists(cursor, t_user, constraints)
def get_user_id(request: Request, cursor: sql.Cursor) -> int: def get_user_id(request: Request, cursor: sql.Cursor) -> int:
""" """
get the user_id. Adds the user if not already existing get the user_id. Adds the user if not already existing
""" """
# if user exists # if user exists
if sql_exists(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)]): if user_exists(cursor, request):
if settings["unique_user_is_ip_address"]:
user_id = sql_select(cursor, t_user, [("ip_address", request.ip_address)])[0][0]
else:
user_id = sql_select(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)])[0][0] user_id = sql_select(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)])[0][0]
else: # new user else: # new user
# new user_id is number of elements # new user_id is number of elements
@ -139,8 +149,12 @@ def add_requests_to_db(requests: list[Request], db_name: str):
cursor = conn.cursor() cursor = conn.cursor()
# check the new users later # check the new users later
max_user_id = sql_tablesize(cursor, t_user) max_user_id = sql_tablesize(cursor, t_user)
request_blacklist = settings["request_location_regex_blacklist"]
for i in range(len(requests)): for i in range(len(requests)):
request = requests[i] request = requests[i]
# skip requests to blacklisted locations
if request_blacklist:
if match(request_blacklist, request.request_file): continue
# pdebug("add_requests_to_db:", i, "request:", request) # pdebug("add_requests_to_db:", i, "request:", request)
user_id = get_user_id(request, cursor) user_id = get_user_id(request, cursor)
conn.commit() conn.commit()

View File

@ -3,8 +3,8 @@
import sqlite3 as sql import sqlite3 as sql
from os import path, listdir from os import path, listdir
# local # local
from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
from utility.utility import pdebug from regina.utility.utility import pdebug
""" """
create reginas database as shown in the uml diagram database.uxf create reginas database as shown in the uml diagram database.uxf

View File

@ -9,17 +9,14 @@ from datetime import datetime as dt
from numpy import empty from numpy import empty
# local # local
from db_operation.database import t_request, t_user, t_file, t_filegroup from regina.db_operation.database import t_request, t_user, t_file, t_filegroup
from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where
from utility.utility import pdebug, warning, missing_arg from regina.utility.utility import pdebug, warning, missing_arg
from utility.globals import settings from regina.utility.globals import settings
""" """
visualize information from the database visualize information from the database
TODO:
- bei referrers ähnliche zusammenlegen, z.b. www.google.de und https://google.com
- ignore 404
""" """
palette = { palette = {
@ -141,12 +138,18 @@ def get_where_date_str(at_date=None, min_date=None, max_date=None):
def get_earliest_date(cur: sql.Cursor) -> int: def get_earliest_date(cur: sql.Cursor) -> int:
"""return the earliest time as unixepoch""" """return the earliest time as unixepoch"""
cur.execute(f"SELECT MIN(date) FROM {t_request}") cur.execute(f"SELECT MIN(date) FROM {t_request}")
return cur.fetchone()[0] date = cur.fetchone()[0]
if not isinstance(date, int): return 0
else: return date
# get the latest date # get the latest date
def get_latest_date(cur: sql.Cursor) -> int: def get_latest_date(cur: sql.Cursor) -> int:
"""return the latest time as unixepoch""" """return the latest time as unixepoch"""
cur.execute(f"SELECT MAX(date) FROM {t_request}") cur.execute(f"SELECT MAX(date) FROM {t_request}")
return cur.fetchone()[0] date = cur.fetchone()[0]
if not isinstance(date, int): return 0
else: return date
# get all dates # get all dates
# the date:str parameter in all these function must be a sqlite constraint # the date:str parameter in all these function must be a sqlite constraint
def get_days(cur: sql.Cursor, date:str) -> list[str]: def get_days(cur: sql.Cursor, date:str) -> list[str]:
@ -296,6 +299,48 @@ def get_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Curs
# print(ranking) # print(ranking)
return ranking return ranking
# building blocks of the referer uri pattern, combined into re_uri_full below
# protocol: captures "http" or "https"
re_uri_protocol = r"(https?)://"
# four groups of 1-3 digits, each optionally followed by a dot, plus an optional ":port"
re_uri_ipv4 = r"(?:(?:(?:\d{1,3}\.?){4})(?::\d+)?)"
# re_uri_ipv6 = ""
# optional subdomains (captured) followed by "name.tld", the tld being at least two letters
re_uri_domain = r"(?:([^/]+\.)*[^/]+\.[a-zA-Z]{2,})"
# optional location: a slash and everything after it (captured without the slash)
re_uri_location = r"(?:/(.*))?"
# full pattern: protocol, (domain or ipv4) as one group, location as one group
re_uri_full = "".join((
    re_uri_protocol,
    "(", re_uri_domain, "|", re_uri_ipv4, ")",
    "(", re_uri_location, ")",
))
def cleanup_referer(referer: str) -> str:
    """
    split the referer uri into its parts and reassemble them depending on settings

    The referer is matched against re_uri_full. Protocol, subdomains and
    location are kept only if the corresponding "referer_ranking_ignore_*"
    setting is False.
    A referer that does not match re_uri_full is returned unchanged.

    :param referer: the referer string to clean up
    :return: the reassembled referer
    """
    m = fullmatch(re_uri_full, referer)
    if not m:
        pdebug(f"cleanup_referer: Could not match referer '{referer}'")
        return referer
    # groups of re_uri_full:
    #   1: protocol, 2: host (domain including subdomains, or ip address),
    #   3: subdomain part (None if there is none), 4: location including the leading "/"
    protocol = m.group(1)
    location = m.group(4)
    host_start, host_end = m.span(2)
    # Split the host at the end of the subdomain group instead of using
    # str.replace on the host: replace removes every occurrence of the
    # subdomain text, so "api.myapi.com" would turn into "mycom".
    split_at = m.end(3) if m.group(3) is not None else host_start
    subdomains = referer[host_start:split_at]
    domain = referer[split_at:host_end]

    cleaned = domain
    if not settings["referer_ranking_ignore_subdomain"]:
        cleaned = subdomains + cleaned
    if not settings["referer_ranking_ignore_protocol"]:
        cleaned = protocol + "://" + cleaned
    if not settings["referer_ranking_ignore_location"]:
        cleaned += location
    return cleaned
def cleanup_referer_ranking(referer_ranking: list[tuple[int, str]]):
    """
    Clean up every referer of the ranking with cleanup_referer and merge the
    entries that end up with the same cleaned referer by adding their counts.
    The list is modified in place and sorted in ascending order afterwards.
    """
    merged_counts = {}
    for hits, raw_referer in referer_ranking:
        cleaned = cleanup_referer(raw_referer)
        merged_counts[cleaned] = merged_counts.get(cleaned, 0) + hits
    # slice assignment keeps the caller's list object, like clear() + append()
    referer_ranking[:] = sorted((hits, cleaned) for cleaned, hits in merged_counts.items())
# #
# PLOTTING # PLOTTING
@ -463,7 +508,8 @@ def visualize(loaded_settings: dict):
get_humans = settings["get_human_percentage"] get_humans = settings["get_human_percentage"]
# pdebug(f"visualize: settings {settings}") # pdebug(f"visualize: settings {settings}")
# DATE STRINGS # DATE STRINGS
names["earliest_date"] = dt.fromtimestamp(get_earliest_date(cur)).strftime("%Y-%m-%d") earliest_date = get_earliest_date(cur)
names["earliest_date"] = dt.fromtimestamp(earliest_date).strftime("%Y-%m-%d")
names["generation_date"] = dt.now().strftime("%Y-%m-%d %H:%M:%S") names["generation_date"] = dt.now().strftime("%Y-%m-%d %H:%M:%S")
# LAST_X_DAYS # LAST_X_DAYS
# last_x_days_min_date: latest_date - last_x_days # last_x_days_min_date: latest_date - last_x_days
@ -506,6 +552,7 @@ def visualize(loaded_settings: dict):
# REFERER # REFERER
referer_ranking = get_ranking("referer", t_request, settings["referer_ranking_regex_whitelist"], cur, date_str) referer_ranking = get_ranking("referer", t_request, settings["referer_ranking_regex_whitelist"], cur, date_str)
cleanup_referer_ranking(referer_ranking)
if gen_img: if gen_img:
fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate) fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate)
fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_referer_ranking{suffix}']}") fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_referer_ranking{suffix}']}")

View File

@ -3,14 +3,23 @@
# __package__="." # __package__="."
from sys import argv, exit from sys import argv, exit
from os.path import isfile from os.path import isfile
from db_operation.collect import parse_log, add_requests_to_db from regina.db_operation.collect import parse_log, add_requests_to_db
from db_operation.database import create_db from regina.db_operation.database import create_db
from db_operation.visualize import visualize from regina.db_operation.visualize import visualize
from utility.settings_manager import read_settings_file from regina.utility.settings_manager import read_settings_file
from utility.globals import settings, version from regina.utility.globals import settings, version
""" """
start regina, launch either collect or visualize start regina, launch either collect or visualize
TODO:
- merge similar referrers, e.g. www.google.de and https://google.com
- options:
- unique user = ip address
- max requests/time
- if the database size becomes a problem:
- referrer table that contains the already merged referrers, requests link to it only by id
- same for platforms and browsers
""" """
@ -71,9 +80,9 @@ def main():
error(f"Not a file: '{config_file}'") error(f"Not a file: '{config_file}'")
read_settings_file(config_file, settings) read_settings_file(config_file, settings)
settings["version"] = version settings["version"] = version
if log_file: settings["access-log"] = log_file if log_file: settings["access_log"] = log_file
print(f"regina version {version} with server-name '{settings['server_name']}' and database '{settings['db']}'") print(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}' and logfile '{settings['access_log']}'")
if not settings["server_name"]: missing_arg("server-name") if not settings["server_name"]: missing_arg("server-name")
if not settings["access_log"]: missing_arg("log") if not settings["access_log"]: missing_arg("log")

View File

@ -12,6 +12,8 @@ settings = {
"locs_and_dirs": [], "locs_and_dirs": [],
"auto_group_filetypes": [], "auto_group_filetypes": [],
"filegroups": "", "filegroups": "",
"request_location_regex_blacklist": "",
"unique_user_is_ip_address": False,
# VISUALIZATION # VISUALIZATION
"get_human_percentage": False, "get_human_percentage": False,
@ -20,6 +22,9 @@ settings = {
# "file_ranking_regex_whitelist": r".*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))", # "file_ranking_regex_whitelist": r".*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))",
"file_ranking_regex_whitelist": r".*\.(html)", "file_ranking_regex_whitelist": r".*\.(html)",
"file_ranking_ignore_error_files": False, # skip files that only had unsuccessful requests (status < 300) "file_ranking_ignore_error_files": False, # skip files that only had unsuccessful requests (status < 300)
"referer_ranking_ignore_protocol": True,
"referer_ranking_ignore_subdomain": False,
"referer_ranking_ignore_location": True,
"referer_ranking_regex_whitelist": r"^[^\-].*", # minus means empty "referer_ranking_regex_whitelist": r"^[^\-].*", # minus means empty
"user_agent_ranking_regex_whitelist": r"", "user_agent_ranking_regex_whitelist": r"",
"file_ranking_plot_max_files": 15, "file_ranking_plot_max_files": 15,

View File

@ -6,7 +6,7 @@ from sys import exit
Various utilities Various utilities
""" """
DEBUG = False DEBUG = True
def pdebug(*args): def pdebug(*args):
if DEBUG: print(*args) if DEBUG: print(*args)