changed import, added referer cleanup
This commit is contained in:
parent
691d45d311
commit
3007f1ff8d
31
default.conf
31
default.conf
@ -8,29 +8,45 @@ db = /home/my_user/analytics/my_website.db
|
||||
# these changes will only apply to newly collected data/creation of new database
|
||||
# path to the nginx access log to parse.
|
||||
access_log = /home/my_user/analytics/access.log
|
||||
|
||||
# nginx locations and their root directory: location:directory,location:directory,...
|
||||
locs_and_dirs = /:/www/my_website,/error:/www/error
|
||||
# filetypes that should be grouped (comma separated)
|
||||
auto_group_filetypes = png,jpg,jpeg,gif,svg,css,ico,pdf,txt
|
||||
# whether a request with 30x http status counts as success
|
||||
status_300_is_success = False
|
||||
# whether a user needs to make at least 1 successful request to be a human
|
||||
humans_need_success = True
|
||||
# filegroups, eg group index.html and home.html
|
||||
# group certain files
|
||||
filegroups = home:index.html,home.html;images:image1.png,image2.png
|
||||
# filegroups =
|
||||
|
||||
# whether a request with 30x http status counts as success
|
||||
status_300_is_success = False
|
||||
# if False, a unique user is an (ip address, user agent) pair, if True only the ip address
|
||||
unique_user_is_ip_address = False
|
||||
# whether a user needs to make at least 1 successful request to be a human
|
||||
humans_need_success = True
|
||||
|
||||
# dont collect requests to locations matched by this
|
||||
request_location_regex_blacklist = /analytics.*
|
||||
|
||||
# VISUALIZATION
|
||||
# separate users into all and humans
|
||||
get_human_percentage = True
|
||||
# regex expression as whitelist for file ranking
|
||||
# file_ranking_regex_whitelist = .*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))
|
||||
file_ranking_regex_whitelist =
|
||||
|
||||
# ignore the protocol in referers, so https://url.com = http://url.com -> url.com
|
||||
referer_ranking_ignore_protocol = True
|
||||
# ignore the subdomains in referers, so foo.url.com = bar.url.com -> url.com
|
||||
referer_ranking_ignore_subdomain = False
|
||||
# ignore the location in referers, so url.com/foo = url.com/bar -> url.com
|
||||
referer_ranking_ignore_location = True
|
||||
# regex expression as whitelist for referer ranking, minus means empty
|
||||
# eg: exclude empty referers
|
||||
referer_ranking_regex_whitelist = ^[^\-].*
|
||||
|
||||
# regex expression as whitelist for user agent ranking
|
||||
user_agent_ranking_regex_whitelist =
|
||||
|
||||
# file_ranking_regex_whitelist = .*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))
|
||||
file_ranking_regex_whitelist =
|
||||
# maximum number of file(group)s on the file ranking
|
||||
file_ranking_plot_max_files = 20
|
||||
# whether to ignore non-existing files in the ranking
|
||||
@ -38,6 +54,7 @@ file_ranking_ignore_error_files = True
|
||||
|
||||
# "plot_figsize" = (60 40),
|
||||
plot_dpi = 300
|
||||
|
||||
# output directory for the generated plots
|
||||
img_dir = /www/analytics/images
|
||||
# nginx location for the generated images, its root must be img_dir
|
||||
|
@ -2,10 +2,10 @@ import sqlite3 as sql
|
||||
from re import match
|
||||
from time import mktime
|
||||
from datetime import datetime as dt
|
||||
from db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup
|
||||
from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
|
||||
from utility.utility import pdebug, warning
|
||||
from utility.globals import user_agent_operating_systems, user_agent_browsers, settings
|
||||
from regina.db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup
|
||||
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
|
||||
from regina.utility.utility import pdebug, warning
|
||||
from regina.utility.globals import user_agent_operating_systems, user_agent_browsers, settings
|
||||
|
||||
"""
|
||||
collect information from the access log and put it into the database
|
||||
@ -73,13 +73,23 @@ def parse_log(logfile:str) -> list[Request]:
|
||||
status=g[4], bytes_sent=g[5], referer=g[6], user_agent=g[7]))
|
||||
return requests
|
||||
|
||||
|
||||
def user_exists(cursor, request) -> bool:
    """
    check whether the user that sent `request` is already present in the user table
    if the setting "unique_user_is_ip_address" is set, a user is identified by the ip address alone,
    otherwise by the (ip address, user agent) pair
    """
    constraints = [("ip_address", request.ip_address)]
    if not settings["unique_user_is_ip_address"]:
        # the user agent is part of the identity as well
        constraints.append(("user_agent", request.user_agent))
    return sql_exists(cursor, t_user, constraints)
|
||||
|
||||
def get_user_id(request: Request, cursor: sql.Cursor) -> int:
|
||||
"""
|
||||
get the user_id. Adds the user if not already existing
|
||||
"""
|
||||
# if user exists
|
||||
if sql_exists(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)]):
|
||||
user_id = sql_select(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)])[0][0]
|
||||
if user_exists(cursor, request):
|
||||
if settings["unique_user_is_ip_address"]:
|
||||
user_id = sql_select(cursor, t_user, [("ip_address", request.ip_address)])[0][0]
|
||||
else:
|
||||
user_id = sql_select(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)])[0][0]
|
||||
else: # new user
|
||||
# new user_id is number of elements
|
||||
user_id: int = sql_tablesize(cursor, t_user)
|
||||
@ -139,8 +149,12 @@ def add_requests_to_db(requests: list[Request], db_name: str):
|
||||
cursor = conn.cursor()
|
||||
# check the new users later
|
||||
max_user_id = sql_tablesize(cursor, t_user)
|
||||
request_blacklist = settings["request_location_regex_blacklist"]
|
||||
for i in range(len(requests)):
|
||||
request = requests[i]
|
||||
# skip requests to blacklisted locations
|
||||
if request_blacklist:
|
||||
if match(request_blacklist, request.request_file): continue
|
||||
# pdebug("add_requests_to_db:", i, "request:", request)
|
||||
user_id = get_user_id(request, cursor)
|
||||
conn.commit()
|
||||
|
@ -3,8 +3,8 @@
|
||||
import sqlite3 as sql
|
||||
from os import path, listdir
|
||||
# local
|
||||
from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
|
||||
from utility.utility import pdebug
|
||||
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
|
||||
from regina.utility.utility import pdebug
|
||||
|
||||
"""
|
||||
create reginas database as shown in the uml diagram database.uxf
|
||||
|
@ -9,17 +9,14 @@ from datetime import datetime as dt
|
||||
|
||||
from numpy import empty
|
||||
# local
|
||||
from db_operation.database import t_request, t_user, t_file, t_filegroup
|
||||
from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where
|
||||
from utility.utility import pdebug, warning, missing_arg
|
||||
from utility.globals import settings
|
||||
from regina.db_operation.database import t_request, t_user, t_file, t_filegroup
|
||||
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where
|
||||
from regina.utility.utility import pdebug, warning, missing_arg
|
||||
from regina.utility.globals import settings
|
||||
|
||||
|
||||
"""
|
||||
visualize information from the database
|
||||
TODO:
|
||||
- bei referrers ähnliche zusammenlegen, z.b. www.google.de und https://google.com
|
||||
- ignore 404
|
||||
"""
|
||||
|
||||
palette = {
|
||||
@ -141,12 +138,18 @@ def get_where_date_str(at_date=None, min_date=None, max_date=None):
|
||||
def get_earliest_date(cur: sql.Cursor) -> int:
    """
    return the earliest request time as unixepoch
    returns 0 if the query does not yield an integer, eg because the request table is empty
    """
    cur.execute(f"SELECT MIN(date) FROM {t_request}")
    date = cur.fetchone()[0]
    # no early return before this check: the result is handed to dt.fromtimestamp,
    # which must never receive None
    return date if isinstance(date, int) else 0
|
||||
|
||||
# get the latest date
|
||||
def get_latest_date(cur: sql.Cursor) -> int:
    """
    return the latest request time as unixepoch
    returns 0 if the query does not yield an integer, eg because the request table is empty
    """
    cur.execute(f"SELECT MAX(date) FROM {t_request}")
    date = cur.fetchone()[0]
    # no early return before this check, otherwise the fallback below is never reached
    return date if isinstance(date, int) else 0
|
||||
|
||||
# get all dates
|
||||
# the date:str parameter in all these function must be a sqlite constraint
|
||||
def get_days(cur: sql.Cursor, date:str) -> list[str]:
|
||||
@ -296,6 +299,48 @@ def get_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Curs
|
||||
# print(ranking)
|
||||
return ranking
|
||||
|
||||
# building blocks for matching a referer uri
# capture groups of re_uri_full (cleanup_referer reads them by position):
#   group 1: protocol, "http" or "https"
#   group 2: host, either a domain name or an ipv4 address with optional port
#   group 3: leading subdomains including the trailing dot, eg "foo." (None for ip addresses or if there are none)
#   group 4: location including the leading slash, "" if there is none
#   group 5: location without the leading slash (None if there is none)
re_uri_protocol = r"(https?)://"
re_uri_ipv4 = r"(?:(?:(?:\d{1,3}\.?){4})(?::\d+)?)"
# re_uri_ipv6 = ""
re_uri_domain = r"(?:([^/]+\.)*[^/]+\.[a-zA-Z]{2,})"
re_uri_location = r"(?:/(.*))?"
re_uri_full = f"{re_uri_protocol}({re_uri_domain}|{re_uri_ipv4})({re_uri_location})"
|
||||
|
||||
def cleanup_referer(referer: str) -> str:
    """
    split the referer uri into its parts and reassemble them depending on the settings:
    - "referer_ranking_ignore_subdomain": leave out the subdomains
    - "referer_ranking_ignore_protocol": leave out the protocol
    - "referer_ranking_ignore_location": leave out the location
    a referer that does not match re_uri_full is returned unchanged
    """
    m = fullmatch(re_uri_full, referer)
    if not m:
        pdebug(f"cleanup_referer: Could not match referer '{referer}'")
        return referer
    # pdebug(f"cleanup_referer: {referer} - {m.groups()}")
    protocol, host, subdomains, location = m.groups()[:4]
    # the subdomain group did not take part in the match (ip address or no subdomain)
    if not subdomains: subdomains = ""
    # strip the subdomains only from the front of the host: str.replace would also remove
    # later occurrences, eg "m.forum.com" with subdomain "m." would become "forucom"
    if subdomains and host.startswith(subdomains):
        domain = host[len(subdomains):]
    else:
        domain = host

    referer = domain
    if not settings["referer_ranking_ignore_subdomain"]: referer = subdomains + referer
    if not settings["referer_ranking_ignore_protocol"]: referer = protocol + "://" + referer
    if not settings["referer_ranking_ignore_location"]: referer += location
    # pdebug(f"cleanup_referer: cleaned up: {referer}")
    return referer
|
||||
|
||||
def cleanup_referer_ranking(referer_ranking: list[tuple[int, str]]):
    """
    run every referer of the ranking through cleanup_referer and merge the entries that end up
    with the same name by adding up their counts
    the ranking is modified in place and sorted ascending by (count, referer)
    """
    merged_counts = {}
    for hits, raw_referer in referer_ranking:
        cleaned = cleanup_referer(raw_referer)
        merged_counts[cleaned] = merged_counts.get(cleaned, 0) + hits
    # slice assignment so the list object held by the caller is updated
    referer_ranking[:] = sorted((hits, cleaned) for cleaned, hits in merged_counts.items())
|
||||
|
||||
|
||||
#
|
||||
# PLOTTING
|
||||
@ -463,7 +508,8 @@ def visualize(loaded_settings: dict):
|
||||
get_humans = settings["get_human_percentage"]
|
||||
# pdebug(f"visualize: settings {settings}")
|
||||
# DATE STRINGS
|
||||
names["earliest_date"] = dt.fromtimestamp(get_earliest_date(cur)).strftime("%Y-%m-%d")
|
||||
earliest_date = get_earliest_date(cur)
|
||||
names["earliest_date"] = dt.fromtimestamp(earliest_date).strftime("%Y-%m-%d")
|
||||
names["generation_date"] = dt.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
# LAST_X_DAYS
|
||||
# last_x_days_min_date: latest_date - last_x_days
|
||||
@ -506,6 +552,7 @@ def visualize(loaded_settings: dict):
|
||||
|
||||
# REFERER
|
||||
referer_ranking = get_ranking("referer", t_request, settings["referer_ranking_regex_whitelist"], cur, date_str)
|
||||
cleanup_referer_ranking(referer_ranking)
|
||||
if gen_img:
|
||||
fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate)
|
||||
fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_referer_ranking{suffix}']}")
|
||||
|
@ -3,14 +3,23 @@
|
||||
# __package__="."
|
||||
from sys import argv, exit
|
||||
from os.path import isfile
|
||||
from db_operation.collect import parse_log, add_requests_to_db
|
||||
from db_operation.database import create_db
|
||||
from db_operation.visualize import visualize
|
||||
from utility.settings_manager import read_settings_file
|
||||
from utility.globals import settings, version
|
||||
from regina.db_operation.collect import parse_log, add_requests_to_db
|
||||
from regina.db_operation.database import create_db
|
||||
from regina.db_operation.visualize import visualize
|
||||
from regina.utility.settings_manager import read_settings_file
|
||||
from regina.utility.globals import settings, version
|
||||
|
||||
"""
|
||||
start regina, launch either collect or visualize
|
||||
TODO:
|
||||
- bei referrers ähnliche zusammenlegen, z.b. www.google.de und https://google.com
|
||||
- optionen:
|
||||
- unique user = ip address
|
||||
- max requests/time
|
||||
|
||||
- wenn datenbankgröße zum problem wird:
|
||||
- referrer table die die schon zusammengelegten referrer enthält, request verlinkt nur mit id
|
||||
- selbes für platforms und browsers
|
||||
"""
|
||||
|
||||
|
||||
@ -71,9 +80,9 @@ def main():
|
||||
error(f"Not a file: '{config_file}'")
|
||||
read_settings_file(config_file, settings)
|
||||
settings["version"] = version
|
||||
if log_file: settings["access-log"] = log_file
|
||||
if log_file: settings["access_log"] = log_file
|
||||
|
||||
print(f"regina version {version} with server-name '{settings['server_name']}' and database '{settings['db']}'")
|
||||
print(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}' and logfile '{settings['access_log']}'")
|
||||
|
||||
if not settings["server_name"]: missing_arg("server-name")
|
||||
if not settings["access_log"]: missing_arg("log")
|
||||
|
@ -12,6 +12,8 @@ settings = {
|
||||
"locs_and_dirs": [],
|
||||
"auto_group_filetypes": [],
|
||||
"filegroups": "",
|
||||
"request_location_regex_blacklist": "",
|
||||
"unique_user_is_ip_address": False,
|
||||
|
||||
# VISUALIZATION
|
||||
"get_human_percentage": False,
|
||||
@ -20,6 +22,9 @@ settings = {
|
||||
# "file_ranking_regex_whitelist": r".*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))",
|
||||
"file_ranking_regex_whitelist": r".*\.(html)",
|
||||
"file_ranking_ignore_error_files": False, # skip files that only had unsuccessful requests (status < 300)
|
||||
"referer_ranking_ignore_protocol": True,
|
||||
"referer_ranking_ignore_subdomain": False,
|
||||
"referer_ranking_ignore_location": True,
|
||||
"referer_ranking_regex_whitelist": r"^[^\-].*", # minus means empty
|
||||
"user_agent_ranking_regex_whitelist": r"",
|
||||
"file_ranking_plot_max_files": 15,
|
||||
|
@ -6,7 +6,7 @@ from sys import exit
|
||||
Various utilities
|
||||
"""
|
||||
|
||||
DEBUG = False
|
||||
DEBUG = True
|
||||
def pdebug(*args):
    """print all arguments like print() does, but only if the module level DEBUG flag is set"""
    if not DEBUG: return
    print(*args)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user