From 7be6e67aaff59c0c439dfa5e6e296a1b8fffa14b Mon Sep 17 00:00:00 2001 From: "matthias@arch" Date: Mon, 28 Nov 2022 23:29:32 +0100 Subject: [PATCH] multiple fixes --- .gitignore | 1 + database.uxf | 36 +++++++++++----------- default.conf | 4 +++ regina/db_operation/__init__.py | 3 ++ regina/db_operation/collect.py | 53 ++++++++++++++++++++++++++------ regina/db_operation/database.py | 13 ++++++-- regina/db_operation/visualize.py | 5 ++- regina/main.py | 26 +++++++++++++--- regina/utility/globals.py | 2 ++ regina/utility/utility.py | 13 +++++--- setup.py | 2 ++ 11 files changed, 118 insertions(+), 40 deletions(-) diff --git a/.gitignore b/.gitignore index d336186..e81f3bd 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ testing/ __pycache__ build/ regina.egg-info/ +regina/test/ diff --git a/database.uxf b/database.uxf index 5e99d88..281314c 100644 --- a/database.uxf +++ b/database.uxf @@ -4,8 +4,8 @@ UMLClass - 364 - 273 + 247 + 312 299 234 @@ -26,8 +26,8 @@ style=autoresize UMLClass - 1092 - 273 + 975 + 312 234 130 @@ -43,8 +43,8 @@ style=autoresize Relation - 988 - 273 + 871 + 312 130 65 @@ -57,8 +57,8 @@ m2=1 UMLClass - 754 - 260 + 637 + 299 247 221 @@ -79,8 +79,8 @@ style=autoresize Relation - 650 - 273 + 533 + 312 130 65 @@ -93,8 +93,8 @@ m2=n UMLClass - 1092 - 546 + 975 + 585 234 130 @@ -111,8 +111,8 @@ style=autoresize Relation - 1131 - 390 + 1014 + 429 52 182 @@ -125,8 +125,8 @@ m2=1 UMLNote - 897 - 117 + 780 + 156 390 91 @@ -139,8 +139,8 @@ style=autoresize Relation - 1105 - 195 + 988 + 234 39 104 diff --git a/default.conf b/default.conf index d351099..275a7e6 100644 --- a/default.conf +++ b/default.conf @@ -27,6 +27,10 @@ humans_need_success = True # dont collect requests to locations matched by this request_location_regex_blacklist = /analytics.* +# get nation +user_get_country = True + + # VISUALIZATION # separate users into all and humans get_human_percentage = True diff --git a/regina/db_operation/__init__.py b/regina/db_operation/__init__.py index 48f4e1a..0185ded 100644 --- a/regina/db_operation/__init__.py +++ b/regina/db_operation/__init__.py @@ -1,3 +1,6 @@ """Gather analytics from nginx access logs and visualize them through generated images and a generated html""" # __package__ = 'regina' import regina.utility + +from importlib import resources +# ip2nation_db_path = resources.path("regina", "ip2nation.db") diff --git a/regina/db_operation/collect.py b/regina/db_operation/collect.py index 3b50e39..961d8e4 100644 --- a/regina/db_operation/collect.py +++ b/regina/db_operation/collect.py @@ -4,13 +4,13 @@ from time import mktime from datetime import datetime as dt from regina.db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize -from regina.utility.utility import pdebug, warning +from regina.utility.utility import pdebug, warning, pmessage from regina.utility.globals import user_agent_operating_systems, user_agent_browsers, settings """ collect information from the access log and put it into the database """ -months = ["Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aut", "Sep", "Oct", "Nov", "Dez"] +months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aut", "Sep", "Oct", "Nov", "Dez"] @@ -23,13 +23,16 @@ class Request: if m: g = m.groups() try: - datetime_ = dt(int(g[2]), months.index(g[1])+1, int(g[0]), int(g[3]), int(g[4]), int(g[5])) - self.time_local = int(mktime(datetime_.timetuple())) + if g[1] in months: + datetime_ = dt(int(g[2]), months.index(g[1])+1, int(g[0]), int(g[3]), int(g[4]), int(g[5])) + # pdebug(f"Request __init__: datetime {datetime_}, from {g}") + self.time_local = int(mktime(datetime_.timetuple())) + else: + warning(f"Request:__init__: Unkown month: '{g[1]}'. Using timestamp {self.time_local}") except Exception as e: warning(f"Request:__init__: {e}") else: warning(f"Request:__init__: Could not match time: '{time_local}'") - self.request_type = sanitize(request_type) self.request_file = sanitize(request_file) self.request_protocol = sanitize(request_protocol) @@ -93,7 +96,7 @@ def get_user_id(request: Request, cursor: sql.Cursor) -> int: else: # new user # new user_id is number of elements user_id: int = sql_tablesize(cursor, t_user) - pdebug("new user:", user_id, request.ip_address) + # pdebug("new user:", user_id, request.ip_address) platform, browser, mobile = get_os_browser_pairs_from_agent(request.user_agent) is_human = 0 # is_user_human cannot be called until user is in db int(is_user_human(cursor, user_id)) cursor.execute(f"INSERT INTO {t_user} (user_id, ip_address, user_agent, platform, browser, mobile, is_human) VALUES ({user_id}, '{request.ip_address}', '{request.user_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}');") @@ -107,10 +110,21 @@ def is_user_human(cur: sql.Cursor, user_id: int): """ max_success_status = 400 if settings["status_300_is_success"]: max_success_status = 300 + cur.execute(f"SELECT browser, platform FROM {t_user} WHERE user_id = {user_id}") + browsers_and_platforms = cur.fetchall() + if len(browsers_and_platforms) != 1: + pdebug(f"is_user_human: {user_id} - could not find user or found too many") + return False + if not browsers_and_platforms[0][0] in user_agent_browsers: + return False + if not browsers_and_platforms[0][1] in user_agent_operating_systems: + return False # check if has browser - cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_user} WHERE user_id = {user_id} AND platform IS NOT NULL AND browser IS NOT NULL)") + # cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_user} WHERE user_id = {user_id} AND platform IS NOT NULL AND browser IS NOT NULL)") # if no browser and platform - if cur.fetchone()[0] == 0: return False + # exists = cur.fetchone() + # if exists is None or exists[0] == 0: + # return False # if human needs successful request if settings["human_needs_success"]: # check if at least request was successful (status < 400) @@ -144,9 +158,21 @@ def get_os_browser_pairs_from_agent(user_agent): return operating_system, browser, mobile +# def set_countries(cur: sql.Cursor, user_ids: list[int]): +# if settings["user_get_country"]: +# ipconn = sql.connect(ip2nation_db_path) +# ipcur = ipconn.cursor() +# for user_id in user_ids: +# ip_address = sql_select(cur, t_user, [("user_id", user_id)]) +# cur.execute(f"SELECT ip_address FROM {t_user} WHERE user_id = {user_id}") +# ip_address = cur.fetchall()[0][0] +# ipcur.execute("SELECT iso_code_3 FROM ip2nationCountries WHERE ip") + + def add_requests_to_db(requests: list[Request], db_name: str): conn = sql.connect(db_name) cursor = conn.cursor() + added_requests = 0 # check the new users later max_user_id = sql_tablesize(cursor, t_user) request_blacklist = settings["request_location_regex_blacklist"] @@ -154,7 +180,9 @@ def add_requests_to_db(requests: list[Request], db_name: str): request = requests[i] # skip requests to blacklisted locations if request_blacklist: - if match(request_blacklist, request.request_file): continue + if match(request_blacklist, request.request_file): + # pdebug(f"add_requests_to_db: request on blacklist '{request.request_file}'") + continue # pdebug("add_requests_to_db:", i, "request:", request) user_id = get_user_id(request, cursor) conn.commit() @@ -169,9 +197,14 @@ def add_requests_to_db(requests: list[Request], db_name: str): # pdebug("new request:", request) request_id = sql_tablesize(cursor, t_request) sql_insert(cursor, t_request, [[request_id, user_id, group_id, request.time_local, request.referer, request.status]]) - for user_id in range(max_user_id, sql_tablesize(cursor, t_user)): + added_requests += 1 + user_count = sql_tablesize(cursor, t_user) + for user_id in range(max_user_id, user_count): is_human = is_user_human(cursor, user_id) + cursor.execute(f"SELECT * FROM {t_user} WHERE user_id = {user_id}") + # pdebug(f"add_rq_to_db: {user_id} is_human? {is_human}, {cursor.fetchall()}") if is_human: cursor.execute(f"UPDATE {t_user} SET is_human = 1 WHERE user_id = {user_id}") cursor.close() conn.commit() + pmessage(f"Collection Summary: Added {user_count - max_user_id} new users and {added_requests} new requests.") diff --git a/regina/db_operation/database.py b/regina/db_operation/database.py index 8e86374..c3d6c16 100644 --- a/regina/db_operation/database.py +++ b/regina/db_operation/database.py @@ -47,7 +47,16 @@ filegroup_id = Entry("group_id", "INTEGER") ip_address_entry = Entry("ip_address", "TEXT") filename_entry = Entry("filename", "TEXT") database_tables = { - t_user: Table(t_user, user_id, [Entry("ip_address", "TEXT"), Entry("user_agent", "TEXT"), Entry("platform", "TEXT"), Entry("browser", "TEXT"), Entry("mobile", "INTEGER"), Entry("is_human", "INTEGER")], [f"UNIQUE({user_id.name})"]), + t_user: Table(t_user, user_id, [ + Entry("ip_address", "TEXT"), + Entry("user_agent", "TEXT"), + Entry("platform", "TEXT"), + Entry("browser", "TEXT"), + Entry("mobile", "INTEGER"), + Entry("is_human", "INTEGER"), + # Entry("country_iso_code_3", "TEXT") + ], + [f"UNIQUE({user_id.name})"]), t_file: Table(t_file, filename_entry, [filegroup_id], [f"UNIQUE({filename_entry.name})"]), t_filegroup: Table(t_filegroup, filegroup_id, [Entry("groupname", "TEXT")], [f"UNIQUE({filegroup_id.name})"]), t_request: Table(t_request, request_id, [ @@ -73,7 +82,7 @@ def get_filegroup(filename: str, cursor: sql.Cursor) -> int: cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname = '{suffix}'") # cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname LIKE '%.{suffix}'") group_id_candidates = cursor.fetchall() - pdebug(f"get_filegroup: file={filename} candidates={group_id_candidates}") + # pdebug(f"get_filegroup: file={filename} candidates={group_id_candidates}") if group_id_candidates: return group_id_candidates[0][0] else: # add new group file filename diff --git a/regina/db_operation/visualize.py b/regina/db_operation/visualize.py index fd4be19..856306b 100644 --- a/regina/db_operation/visualize.py +++ b/regina/db_operation/visualize.py @@ -321,7 +321,10 @@ def cleanup_referer(referer: str) -> str: domain = m.groups()[1].replace(subdomains, "") location = m.groups()[3] - referer = domain + assert(len(domain.split(".")) == 2) + referer = domain.split(".")[0] + if not settings["referer_ranking_ignore_tld"]: referer += "." + domain.split(".")[1] + if not settings["referer_ranking_ignore_subdomain"]: referer = subdomains + referer if not settings["referer_ranking_ignore_subdomain"]: referer = subdomains + referer if not settings["referer_ranking_ignore_protocol"]: referer = protocol + "://" + referer if not settings["referer_ranking_ignore_location"]: referer += location diff --git a/regina/main.py b/regina/main.py index 8e457cf..f48faed 100644 --- a/regina/main.py +++ b/regina/main.py @@ -8,18 +8,31 @@ from regina.db_operation.database import create_db from regina.db_operation.visualize import visualize from regina.utility.settings_manager import read_settings_file from regina.utility.globals import settings, version +from regina.utility.utility import pmessage """ start regina, launch either collect or visualize TODO: -- bei referrers ähnliche zusammenlegen, z.b. www.google.de und https://google.com - optionen: - unique user = ip address - max requests/time - +- fix datum im user and request count plot +- fix datum monat is 1 zu wenig +- checken warum last x days und total counts abweichen +- länder aus ip addresse +- "manuelle" datenbank beabeitung in cli: + - user + alle seine requests löschen +- user agents: + - android vor linux suchen, oder linux durch X11 ersetzen + - alles was bot drin hat als bot betrachten - wenn datenbankgröße zum problem wird: - referrer table die die schon zusammengelegten referrer enthält, request verlinkt nur mit id - selbes für platforms und browsers +- test: + - human detection + - referer cleanup +- schöne log nachrichten für die cron mail +- testing! """ @@ -82,7 +95,6 @@ def main(): settings["version"] = version if log_file: settings["access_log"] = log_file - print(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}' and logfile '{settings['access_log']}'") if not settings["server_name"]: missing_arg("server-name") if not settings["access_log"]: missing_arg("log") @@ -91,14 +103,20 @@ def main(): settings["auto_group_filetypes"] = settings["auto_group_filetypes"].split(",") if isinstance(settings["locs_and_dirs"], str): settings["locs_and_dirs"] = [ loc_and_dir.split(":") for loc_and_dir in settings["locs_and_dirs"].split(",") ] + + if collect: + pmessage(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}' and logfile '{settings['access_log']}'") if not isfile(settings["db"]): create_db(settings["db"], settings["filegroups"], settings["locs_and_dirs"], settings["auto_group_filetypes"]) requests = parse_log(settings["access_log"]) add_requests_to_db(requests, settings["db"]) - if visualize_: + elif visualize_: + pmessage(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}'") if not isfile(settings["db"]): error(f"Invalid database path: '{settings['db']}'") visualize(settings) + else: + error("Either --collect --visualize has to be provided") if __name__ == '__main__': main() diff --git a/regina/utility/globals.py b/regina/utility/globals.py index 7355bd9..0e4d61a 100644 --- a/regina/utility/globals.py +++ b/regina/utility/globals.py @@ -14,6 +14,7 @@ settings = { "filegroups": "", "request_location_regex_blacklist": "", "unique_user_is_ip_address": False, + "user_get_country": True, # VISUALIZATION "get_human_percentage": False, @@ -25,6 +26,7 @@ settings = { "referer_ranking_ignore_protocol": True, "referer_ranking_ignore_subdomain": False, "referer_ranking_ignore_location": True, + "referer_ranking_ignore_tld": False, "referer_ranking_regex_whitelist": r"^[^\-].*", # minus means empty "user_agent_ranking_regex_whitelist": r"", "file_ranking_plot_max_files": 15, diff --git a/regina/utility/utility.py b/regina/utility/utility.py index 00a31e8..bd81d04 100644 --- a/regina/utility/utility.py +++ b/regina/utility/utility.py @@ -6,12 +6,15 @@ from sys import exit Various utitity """ -DEBUG = True -def pdebug(*args): - if DEBUG: print(*args) +DEBUG = False +def pdebug(*args, **keys): + if DEBUG: print(*args, **keys) -def warning(*w): - print("Warning:", *w) +def warning(*w, **k): + print("Warning:", *w, **k) + +def pmessage(*args, **keys): + print(*args, **keys) def error(*arg): print("Error:", *arg) diff --git a/setup.py b/setup.py index 6cd0f63..7cd99b4 100755 --- a/setup.py +++ b/setup.py @@ -24,6 +24,8 @@ setup( "Topic :: Utilities", ], + # data_files=[("ip2nation", ["ip2nation.sql", "ip2nation.db"])], + # scripts=["bin/nicole"], entry_points={ "console_scripts": [ "regina=regina.main:main" ],