changed imports, added referer cleanup
This commit is contained in:
parent
691d45d311
commit
3007f1ff8d
default.conf (31 changed lines)

@@ -8,29 +8,45 @@ db = /home/my_user/analytics/my_website.db
 # these changes will only apply to newly collected data/creation of a new database
 # path to the nginx access log to parse.
 access_log = /home/my_user/analytics/access.log

 # nginx locations and their root directory: location:directory,location:directory,...
 locs_and_dirs = /:/www/my_website,/error:/www/error
 # filetypes that should be grouped (comma separated)
 auto_group_filetypes = png,jpg,jpeg,gif,svg,css,ico,pdf,txt
-# whether a request with a 30x http status counts as success
-status_300_is_success = False
-# whether a user needs to make at least 1 successful request to be a human
-humans_need_success = True
-# filegroups, e.g. group index.html and home.html
+# group certain files
 filegroups = home:index.html,home.html;images:image1.png,image2.png
 # filegroups =

+# whether a request with a 30x http status counts as success
+status_300_is_success = False
+# if False, a unique user is an (ip address, user agent) pair; if True, only the ip address
+unique_user_is_ip_address = False
+# whether a user needs to make at least 1 successful request to be a human
+humans_need_success = True
+
+# don't collect requests to locations matched by this regex
+request_location_regex_blacklist = /analytics.*
+
 # VISUALIZATION
 # separate users into all and humans
 get_human_percentage = True
 # regex expression as whitelist for file ranking
-# file_ranking_regex_whitelist = .*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))
-file_ranking_regex_whitelist =
+# ignore the protocol in referers, so https://url.com = http://url.com -> url.com
+referer_ranking_ignore_protocol = True
+# ignore the subdomains in referers, so foo.url.com = bar.url.com -> url.com
+referer_ranking_ignore_subdomain = False
+# ignore the location in referers, so url.com/foo = url.com/bar -> url.com
+referer_ranking_ignore_location = True
 # regex expression as whitelist for referer ranking, a minus means an empty referer
 # e.g.: exclude empty referers
 referer_ranking_regex_whitelist = ^[^\-].*

 # regex expression as whitelist for user agent ranking
 user_agent_ranking_regex_whitelist =

+# file_ranking_regex_whitelist = .*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))
+file_ranking_regex_whitelist =
 # maximum number of file(group)s on the file ranking
 file_ranking_plot_max_files = 20
 # whether to ignore non-existing files in the ranking
@@ -38,6 +54,7 @@ file_ranking_ignore_error_files = True
+
 # "plot_figsize" = (60 40),
 plot_dpi = 300

 # output directory for the generated plots
 img_dir = /www/analytics/images
 # nginx location for the generated images, its root must be img_dir
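The compound values above pack several mappings into one string: locs_and_dirs is a comma-separated list of location:directory pairs, while filegroups separates groups with ";", the group name with ":" and the files with ",". regina's real parser (read_settings_file and friends) is not part of this diff; a minimal sketch of how strings of this shape decompose, with made-up helper names:

    # minimal sketch, not regina's actual parser; both helper names are hypothetical
    def parse_locs_and_dirs(value: str) -> dict[str, str]:
        # "/:/www/my_website,/error:/www/error" -> {"/": "/www/my_website", "/error": "/www/error"}
        return dict(pair.split(":", 1) for pair in value.split(","))

    def parse_filegroups(value: str) -> dict[str, list[str]]:
        # "home:index.html,home.html;images:image1.png,image2.png" -> {"home": [...], "images": [...]}
        groups = {}
        for group in value.split(";"):
            name, _, files = group.partition(":")
            groups[name] = files.split(",")
        return groups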
regina/db_operation/collect.py
@@ -2,10 +2,10 @@ import sqlite3 as sql
 from re import match
 from time import mktime
 from datetime import datetime as dt
-from db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup
-from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
-from utility.utility import pdebug, warning
-from utility.globals import user_agent_operating_systems, user_agent_browsers, settings
+from regina.db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup
+from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
+from regina.utility.utility import pdebug, warning
+from regina.utility.globals import user_agent_operating_systems, user_agent_browsers, settings

 """
 collect information from the access log and put it into the database
@@ -73,12 +73,22 @@ def parse_log(logfile:str) -> list[Request]:
         status=g[4], bytes_sent=g[5], referer=g[6], user_agent=g[7]))
     return requests


+def user_exists(cursor, request) -> bool:
+    # unique_user_is_ip_address decides whether a user is just an ip address
+    # or an (ip address, user agent) pair
+    if settings["unique_user_is_ip_address"]:
+        return sql_exists(cursor, t_user, [("ip_address", request.ip_address)])
+    else:
+        return sql_exists(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)])
+
+
 def get_user_id(request: Request, cursor: sql.Cursor) -> int:
     """
     get the user_id. Adds the user if not already existing
     """
     # if user exists
-    if sql_exists(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)]):
+    if user_exists(cursor, request):
+        if settings["unique_user_is_ip_address"]:
+            user_id = sql_select(cursor, t_user, [("ip_address", request.ip_address)])[0][0]
+        else:
             user_id = sql_select(cursor, t_user, [("ip_address", request.ip_address), ("user_agent", request.user_agent)])[0][0]
     else:  # new user
         # new user_id is number of elements
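sql_exists and sql_select come from regina.utility.sql_util, which this commit only touches at the import line. Judging from the call sites, they presumably turn a list of (column, value) pairs into a WHERE clause; a rough sketch under that assumption:

    import sqlite3 as sql

    # rough sketch of the assumed sql_exists helper; the real implementation is not shown in this diff
    def sql_exists(cur: sql.Cursor, table: str, pairs: list[tuple[str, str]]) -> bool:
        where = " AND ".join(f"{column} = ?" for column, _ in pairs)
        cur.execute(f"SELECT EXISTS (SELECT 1 FROM {table} WHERE {where})", [value for _, value in pairs])
        return cur.fetchone()[0] == 1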
@@ -139,8 +149,12 @@ def add_requests_to_db(requests: list[Request], db_name: str):
     cursor = conn.cursor()
     # check the new users later
     max_user_id = sql_tablesize(cursor, t_user)
+    request_blacklist = settings["request_location_regex_blacklist"]
     for i in range(len(requests)):
         request = requests[i]
+        # skip requests to blacklisted locations
+        if request_blacklist:
+            if match(request_blacklist, request.request_file): continue
         # pdebug("add_requests_to_db:", i, "request:", request)
         user_id = get_user_id(request, cursor)
     conn.commit()
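Note that re.match anchors at the beginning of the string, so the default blacklist /analytics.* only skips locations that start with /analytics:

    from re import match

    print(bool(match(r"/analytics.*", "/analytics/images/ranking.png")))  # True -> request is skipped
    print(bool(match(r"/analytics.*", "/blog/analytics-post.html")))      # False -> request is kept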
regina/db_operation/database.py
@@ -3,8 +3,8 @@
 import sqlite3 as sql
 from os import path, listdir
 # local
-from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
-from utility.utility import pdebug
+from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
+from regina.utility.utility import pdebug

 """
 create regina's database as shown in the uml diagram database.uxf
regina/db_operation/visualize.py
@@ -9,17 +9,14 @@ from datetime import datetime as dt

 from numpy import empty
 # local
-from db_operation.database import t_request, t_user, t_file, t_filegroup
-from utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where
-from utility.utility import pdebug, warning, missing_arg
-from utility.globals import settings
+from regina.db_operation.database import t_request, t_user, t_file, t_filegroup
+from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where
+from regina.utility.utility import pdebug, warning, missing_arg
+from regina.utility.globals import settings


 """
 visualize information from the database
-TODO:
-- merge similar referrers, e.g. www.google.de and https://google.com
-- ignore 404
 """

 palette = {
@@ -141,12 +138,18 @@ def get_where_date_str(at_date=None, min_date=None, max_date=None):
 def get_earliest_date(cur: sql.Cursor) -> int:
     """return the earliest time as unixepoch"""
     cur.execute(f"SELECT MIN(date) FROM {t_request}")
-    return cur.fetchone()[0]
+    date = cur.fetchone()[0]
+    if not isinstance(date, int): return 0
+    else: return date

 # get the latest date
 def get_latest_date(cur: sql.Cursor) -> int:
     """return the latest time as unixepoch"""
     cur.execute(f"SELECT MAX(date) FROM {t_request}")
-    return cur.fetchone()[0]
+    date = cur.fetchone()[0]
+    if not isinstance(date, int): return 0
+    else: return date

 # get all dates
 # the date:str parameter in all these functions must be a sqlite constraint
 def get_days(cur: sql.Cursor, date:str) -> list[str]:
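The isinstance guard covers a freshly created or empty database: SQLite's MIN and MAX aggregates return NULL when the table has no rows, which sqlite3 maps to None, and that would otherwise crash the dt.fromtimestamp call in visualize. A quick standalone check:

    import sqlite3

    con = sqlite3.connect(":memory:")
    con.execute("CREATE TABLE request (date INTEGER)")
    print(con.execute("SELECT MIN(date) FROM request").fetchone()[0])  # None -> the guard returns 0
    con.execute("INSERT INTO request VALUES (1668000000)")
    print(con.execute("SELECT MIN(date) FROM request").fetchone()[0])  # 1668000000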
@@ -296,6 +299,48 @@ def get_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Cursor
     # print(ranking)
     return ranking

+
+re_uri_protocol = f"(https?)://"
+re_uri_ipv4 = r"(?:(?:(?:\d{1,3}\.?){4})(?::\d+)?)"
+# re_uri_ipv6 = ""
+re_uri_domain = r"(?:([^/]+\.)*[^/]+\.[a-zA-Z]{2,})"
+re_uri_location = r"(?:/(.*))?"
+re_uri_full = f"{re_uri_protocol}({re_uri_domain}|{re_uri_ipv4})({re_uri_location})"
+
+
+def cleanup_referer(referer: str) -> str:
+    """
+    split the referer uri into its parts and reassemble them depending on the referer_ranking_ignore_* settings
+    """
+    m = fullmatch(re_uri_full, referer)
+    if not m:
+        pdebug(f"cleanup_referer: Could not match referer '{referer}'")
+        return referer
+    # pdebug(f"cleanup_referer: {referer} - {m.groups()}")
+    protocol = m.groups()[0]
+    subdomains = m.groups()[2]
+    if not subdomains: subdomains = ""
+    domain = m.groups()[1].replace(subdomains, "")
+    location = m.groups()[3]
+
+    referer = domain
+    if not settings["referer_ranking_ignore_subdomain"]: referer = subdomains + referer
+    if not settings["referer_ranking_ignore_protocol"]: referer = protocol + "://" + referer
+    if not settings["referer_ranking_ignore_location"]: referer += location
+    # pdebug(f"cleanup_referer: cleaned up: {referer}")
+    return referer
+
+
+def cleanup_referer_ranking(referer_ranking: list[tuple[int, str]]):
+    # merge entries whose referers clean up to the same uri and sum their counts
+    unique_referers = dict()
+    for count, referer in referer_ranking:
+        referer = cleanup_referer(referer)
+        if referer in unique_referers:
+            unique_referers[referer] += count
+        else:
+            unique_referers[referer] = count
+    referer_ranking.clear()
+    for referer, count in unique_referers.items():
+        referer_ranking.append((count, referer))
+    referer_ranking.sort()
+
+
 #
 # PLOTTING
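With the default settings (ignore protocol and location, keep subdomains), https://www.google.com/search and http://www.google.com/ both clean up to www.google.com, so their rows merge in the ranking. A standalone check of the regex, reusing the same patterns as the diff above:

    from re import fullmatch

    re_uri_protocol = "(https?)://"
    re_uri_ipv4 = r"(?:(?:(?:\d{1,3}\.?){4})(?::\d+)?)"
    re_uri_domain = r"(?:([^/]+\.)*[^/]+\.[a-zA-Z]{2,})"
    re_uri_location = r"(?:/(.*))?"
    re_uri_full = f"{re_uri_protocol}({re_uri_domain}|{re_uri_ipv4})({re_uri_location})"

    m = fullmatch(re_uri_full, "https://www.google.com/search")
    # groups: protocol, full domain, last subdomain part, location
    print(m.groups()[:4])  # ('https', 'www.google.com', 'www.', '/search')

One caveat: a repeated capture group only keeps its last repetition, so for foo.bar.url.com the subdomain group holds "bar." and the replace() in cleanup_referer strips the wrong part.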
@@ -463,7 +508,8 @@ def visualize(loaded_settings: dict):
     get_humans = settings["get_human_percentage"]
     # pdebug(f"visualize: settings {settings}")
     # DATE STRINGS
-    names["earliest_date"] = dt.fromtimestamp(get_earliest_date(cur)).strftime("%Y-%m-%d")
+    earliest_date = get_earliest_date(cur)
+    names["earliest_date"] = dt.fromtimestamp(earliest_date).strftime("%Y-%m-%d")
     names["generation_date"] = dt.now().strftime("%Y-%m-%d %H:%M:%S")
     # LAST_X_DAYS
     # last_x_days_min_date: latest_date - last_x_days
@@ -506,6 +552,7 @@ def visualize(loaded_settings: dict):

     # REFERER
     referer_ranking = get_ranking("referer", t_request, settings["referer_ranking_regex_whitelist"], cur, date_str)
+    cleanup_referer_ranking(referer_ranking)
     if gen_img:
         fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate)
         fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_referer_ranking{suffix}']}")
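cleanup_referer_ranking mutates the ranking in place, so plot_ranking already receives the merged list, sorted by ascending count (the tuples compare on their first element). A toy run with a crude stand-in for cleanup_referer:

    # toy demo of the in-place merge; cleanup() is a stand-in, not regina's cleanup_referer
    def cleanup(referer: str) -> str:
        return referer.split("://", 1)[-1].split("/", 1)[0]

    ranking = [(3, "https://www.google.com/search"), (2, "http://www.google.com/"), (4, "https://duckduckgo.com/")]
    unique: dict[str, int] = {}
    for count, referer in ranking:
        unique[cleanup(referer)] = unique.get(cleanup(referer), 0) + count
    ranking = sorted((count, referer) for referer, count in unique.items())
    print(ranking)  # [(4, 'duckduckgo.com'), (5, 'www.google.com')]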
regina/main.py
@@ -3,14 +3,23 @@
 # __package__="."
 from sys import argv, exit
 from os.path import isfile
-from db_operation.collect import parse_log, add_requests_to_db
-from db_operation.database import create_db
-from db_operation.visualize import visualize
-from utility.settings_manager import read_settings_file
-from utility.globals import settings, version
+from regina.db_operation.collect import parse_log, add_requests_to_db
+from regina.db_operation.database import create_db
+from regina.db_operation.visualize import visualize
+from regina.utility.settings_manager import read_settings_file
+from regina.utility.globals import settings, version

 """
 start regina, launch either collect or visualize
+TODO:
+- merge similar referrers, e.g. www.google.de and https://google.com
+- options:
+    - unique user = ip address
+    - max requests/time
+
+- if the database size becomes a problem:
+    - a referrer table that holds the already merged referrers; requests then only link to the referrer id
+    - the same for platforms and browsers
 """

@@ -71,9 +80,9 @@ def main():
         error(f"Not a file: '{config_file}'")
     read_settings_file(config_file, settings)
     settings["version"] = version
-    if log_file: settings["access-log"] = log_file
+    if log_file: settings["access_log"] = log_file

-    print(f"regina version {version} with server-name '{settings['server_name']}' and database '{settings['db']}'")
+    print(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}' and logfile '{settings['access_log']}'")

     if not settings["server_name"]: missing_arg("server-name")
     if not settings["access_log"]: missing_arg("log")
regina/utility/globals.py
@@ -12,6 +12,8 @@ settings = {
     "locs_and_dirs": [],
     "auto_group_filetypes": [],
     "filegroups": "",
+    "request_location_regex_blacklist": "",
+    "unique_user_is_ip_address": False,

     # VISUALIZATION
     "get_human_percentage": False,
@@ -20,6 +22,9 @@ settings = {
     # "file_ranking_regex_whitelist": r".*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))",
     "file_ranking_regex_whitelist": r".*\.(html)",
     "file_ranking_ignore_error_files": False,  # skip files that only had unsuccessful requests (success = status < 300)
+    "referer_ranking_ignore_protocol": True,
+    "referer_ranking_ignore_subdomain": False,
+    "referer_ranking_ignore_location": True,
     "referer_ranking_regex_whitelist": r"^[^\-].*",  # minus means empty
     "user_agent_ranking_regex_whitelist": r"",
     "file_ranking_plot_max_files": 15,
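These entries are the in-code defaults for the new keys that default.conf now sets; read_settings_file (unchanged in this commit) overlays the values from the config file on top of this dict. A minimal sketch of that overlay, under the assumption that it coerces "True"/"False" strings into booleans:

    # hypothetical sketch; regina's actual read_settings_file is not part of this diff
    def apply_line(settings: dict, line: str):
        if not line.strip() or line.lstrip().startswith("#"):
            return  # skip blank lines and comments
        key, _, value = (part.strip() for part in line.partition("="))
        if value in ("True", "False"):
            settings[key] = (value == "True")  # booleans like unique_user_is_ip_address
        else:
            settings[key] = value

    apply_line(settings, "unique_user_is_ip_address = False")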
regina/utility/utility.py
@@ -6,7 +6,7 @@ from sys import exit
 Various utility
 """

-DEBUG = False
+DEBUG = True
 def pdebug(*args):
     if DEBUG: print(*args)