diff --git a/database.uxf b/database.uxf index cf19979..4794468 100644 --- a/database.uxf +++ b/database.uxf @@ -1,13 +1,13 @@ - - 13 + + 10 UMLClass - 299 - 221 - 299 - 247 + 70 + 220 + 250 + 190 visitor -- @@ -27,10 +27,10 @@ style=autoresize UMLClass - 1040 - 221 - 234 - 130 + 640 + 220 + 180 + 100 filegroup -- @@ -44,10 +44,10 @@ style=autoresize Relation - 936 - 221 - 130 - 65 + 560 + 220 + 100 + 50 lt=- m1=n @@ -58,10 +58,10 @@ m2=1 UMLClass - 702 - 208 - 247 - 221 + 380 + 210 + 190 + 170 request -- @@ -80,29 +80,29 @@ style=autoresize Relation - 585 - 221 - 143 - 65 + 310 + 220 + 90 + 50 lt=- m1=1 m2=n - 10.0;20.0;90.0;20.0 + 10.0;20.0;70.0;20.0 UMLClass - 1040 - 455 - 234 - 130 + 640 + 400 + 180 + 100 file -- <<PK>> -- name: TEXT +- filename: TEXT -- - group_id: INTEGER -- @@ -112,10 +112,10 @@ style=autoresize Relation - 1079 - 338 - 52 - 143 + 670 + 310 + 40 + 110 lt=- m1=n @@ -126,10 +126,10 @@ m2=1 UMLNote - 845 - 65 - 390 - 91 + 490 + 100 + 300 + 70 One group contains multiple files. Lets you group the images from a @@ -140,10 +140,10 @@ style=autoresize Relation - 1053 - 143 - 39 - 104 + 650 + 160 + 30 + 80 lt=<- 10.0;60.0;10.0;10.0 @@ -151,10 +151,10 @@ style=autoresize UMLClass - 676 - 611 - 247 - 169 + 360 + 520 + 190 + 130 city -- @@ -170,10 +170,10 @@ style=autoresize UMLClass - 1014 - 611 - 156 - 143 + 620 + 520 + 120 + 110 country -- @@ -188,10 +188,10 @@ style=autoresize Relation - 910 - 637 - 130 - 65 + 540 + 540 + 100 + 50 lt=- m1=1 @@ -202,10 +202,10 @@ m2=n Relation - 572 - 637 - 130 - 65 + 280 + 540 + 100 + 50 lt=- m1=1 @@ -216,10 +216,10 @@ m2=n UMLClass - 364 - 611 - 221 - 169 + 120 + 520 + 170 + 130 ip_range -- @@ -235,10 +235,10 @@ style=autoresize Relation - 429 - 455 - 52 - 182 + 170 + 400 + 40 + 140 lt=- m1=1 diff --git a/default.conf b/default.conf index 270971d..4eed783 100644 --- a/default.conf +++ b/default.conf @@ -58,6 +58,8 @@ get_visitor_location = False # eg for EU: AT, BE, BG, HR, CY, CZ, DK, EE, FI, FR, DE, GZ, HU, IE, IT, LV, LT, LU, MT, NL, PL, PT, RO, SK, SI, ES, SE get_cities_for_countries = +hash_ip_address = True + # ***************************************** VISUALIZATION ***************************************** # these changes can be changed at any point in time as they only affect the visualization of the data diff --git a/regina/db_operation/collect.py b/regina/db_operation/collect.py index 5df7583..7cc9fdc 100644 --- a/regina/db_operation/collect.py +++ b/regina/db_operation/collect.py @@ -4,7 +4,7 @@ from ipaddress import IPv4Address, ip_address from time import mktime from datetime import datetime as dt from regina.db_operation.database import t_request, t_visitor, t_file, t_filegroup, t_ip_range, database_tables, get_filegroup, ip_range_id -from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize +from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max from regina.utility.utility import pdebug, warning, pmessage from regina.utility.globals import visitor_agent_operating_systems, visitor_agent_browsers, settings @@ -79,31 +79,39 @@ def parse_log(logfile:str) -> list[Request]: def visitor_exists(cursor, request) -> bool: - if settings["unique_visitor_is_ip_address"]: - return sql_exists(cursor, t_visitor, [("ip_address", request.ip_address)]) + if settings["hash_ip_address"]: + ip_address = hash(request.ip_address) else: - return sql_exists(cursor, t_visitor, [("ip_address", request.ip_address), ("visitor_agent", request.visitor_agent)]) + ip_address = request.ip_address + if settings["unique_visitor_is_ip_address"]: + return sql_exists(cursor, t_visitor, [("ip_address", ip_address)]) + else: + return sql_exists(cursor, t_visitor, [("ip_address", ip_address), ("visitor_agent", request.visitor_agent)]) def get_visitor_id(request: Request, cursor: sql.Cursor) -> int: """ get the visitor_id. Adds the visitor if not already existing """ - # if visitor exists + if settings["hash_ip_address"]: + ip_address = hash(request.ip_address) + else: + ip_address = request.ip_address + if visitor_exists(cursor, request): if settings["unique_visitor_is_ip_address"]: - visitor_id = sql_select(cursor, t_visitor, [("ip_address", request.ip_address)])[0][0] + visitor_id = sql_select(cursor, t_visitor, [("ip_address", ip_address)])[0][0] else: - visitor_id = sql_select(cursor, t_visitor, [("ip_address", request.ip_address), ("visitor_agent", request.visitor_agent)])[0][0] + visitor_id = sql_select(cursor, t_visitor, [("ip_address", ip_address), ("visitor_agent", request.visitor_agent)])[0][0] else: # new visitor # new visitor_id is number of elements - visitor_id: int = sql_tablesize(cursor, t_visitor) + visitor_id = sql_max(cursor, t_visitor, "visitor_id") + 1 # pdebug("new visitor:", visitor_id, request.ip_address) platform, browser, mobile = get_os_browser_pairs_from_agent(request.visitor_agent) ip_range_id_val = 0 if settings["get_visitor_location"]: ip_range_id_val = get_ip_range_id(cursor, request.ip_address) is_human = 0 # is_visitor_human cannot be called until visitor is in db int(is_visitor_human(cursor, visitor_id)) - cursor.execute(f"INSERT INTO {t_visitor} (visitor_id, ip_address, visitor_agent, platform, browser, mobile, is_human, {ip_range_id.name}) VALUES ({visitor_id}, '{request.ip_address}', '{request.visitor_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}', '{ip_range_id_val}');") + cursor.execute(f"INSERT INTO {t_visitor} (visitor_id, ip_address, visitor_agent, platform, browser, mobile, is_human, {ip_range_id.name}) VALUES ({visitor_id}, '{ip_address}', '{request.visitor_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}', '{ip_range_id_val}');") return visitor_id def is_visitor_human(cur: sql.Cursor, visitor_id: int): @@ -204,7 +212,7 @@ def add_requests_to_db(requests: list[Request], db_name: str): cursor = conn.cursor() added_requests = 0 # check the new visitors later - max_visitor_id = sql_tablesize(cursor, t_visitor) + max_visitor_id = sql_max(cursor, t_visitor, "visitor_id") request_blacklist = settings["request_location_regex_blacklist"] for i in range(len(requests)): request = requests[i] @@ -223,11 +231,12 @@ def add_requests_to_db(requests: list[Request], db_name: str): pass else: # pdebug("new request:", request) - request_id = sql_tablesize(cursor, t_request) + request_id = sql_max(cursor, t_request, "request_id") + 1 sql_insert(cursor, t_request, [[request_id, visitor_id, group_id, request.time_local, request.referer, request.status]]) added_requests += 1 visitor_count = sql_tablesize(cursor, t_visitor) for visitor_id in range(max_visitor_id, visitor_count): + if not sql_exists(cursor, t_visitor, [(str(visitor_id), "visitor_id")]): continue is_human = is_visitor_human(cursor, visitor_id) cursor.execute(f"SELECT * FROM {t_visitor} WHERE visitor_id = {visitor_id}") # pdebug(f"add_rq_to_db: {visitor_id} is_human? {is_human}, {cursor.fetchall()}") diff --git a/regina/db_operation/database.py b/regina/db_operation/database.py index 49a61b9..b157c11 100644 --- a/regina/db_operation/database.py +++ b/regina/db_operation/database.py @@ -3,7 +3,7 @@ import sqlite3 as sql from csv import reader from os import path, listdir # local -from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize +from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max from regina.utility.utility import pdebug from regina.utility.globals import settings @@ -48,7 +48,7 @@ t_ip_range = "ip_range" visitor_id = Entry("visitor_id", "INTEGER") request_id = Entry("request_id", "INTEGER") filegroup_id = Entry("group_id", "INTEGER") -ip_address_entry = Entry("ip_address", "TEXT") +ip_address_entry = Entry("ip_address", "INTEGER") filename_entry = Entry("filename", "TEXT") city_id = Entry("city_id", "INTEGER") country_id = Entry("country_id", "INTEGER") @@ -120,7 +120,8 @@ def get_filegroup(filename: str, cursor: sql.Cursor) -> int: if group_id_candidates: return group_id_candidates[0][0] else: # add new group file filename - group_id = sql_tablesize(cursor, t_filegroup) + group_id = sql_max(cursor, t_filegroup, "group_id") + 1 + # pdebug("new file(group):", group_id, filename) # add group sql_insert(cursor, t_filegroup, [[group_id, filename]]) @@ -138,7 +139,7 @@ def create_filegroups(cursor: sql.Cursor, filegroup_str: str): if sql_exists(cursor, t_filegroup, [("groupname", name)]): group_id = sql_select(cursor, t_filegroup, [("groupname", name)])[0][0] else: - group_id = sql_tablesize(cursor, t_filegroup) + group_id = sql_max(cursor, t_filegroup, "group_id") + 1 sql_insert(cursor, t_filegroup, [(group_id, name)]) # pdebug("create_filegroups: group_id", group_id) # create/edit file diff --git a/regina/db_operation/visualize.py b/regina/db_operation/visualize.py index 17a77ce..6191a61 100644 --- a/regina/db_operation/visualize.py +++ b/regina/db_operation/visualize.py @@ -128,7 +128,7 @@ def get_where_date_str(at_date=None, min_date=None, max_date=None): else: print(f"WARNING: get_where_date_str: Invalid type of argument max_date: {type(max_date)}") if s == "": - print(f"WARNING: get_where_date_str: no date_str generated. Returing 'date > 0'. at_date={at_date}, min_date={min_date}, max_date={max_date}") + print(f"WARNING: get_where_date_str: no date_str generated. Returning 'date > 0'. at_date={at_date}, min_date={min_date}, max_date={max_date}") return "date > 0" return s.removesuffix(" AND ") @@ -353,7 +353,6 @@ def get_city_and_country_ranking(cur:sql.Cursor, require_humans=True, regex_city cities = cur.fetchall() cities_dict = {} country_dict = {} - # TODO: find out why regex_blacklist does not work pdebug(f"get_city_and_country_ranking: found {len(cities)} ip_ranges") validate_city_cmd = lambda _ : True diff --git a/regina/main.py b/regina/main.py index e3a8545..8bdd62f 100644 --- a/regina/main.py +++ b/regina/main.py @@ -48,6 +48,7 @@ def help(): --update-geoip path to IP-COUNTRY-REGION-CITY database in csv format --visualize generate the visualization website --collect fill the database from the nginx access log + --log-file use alternate logfile """ print(helpstring) diff --git a/regina/utility/globals.py b/regina/utility/globals.py index 5f6805b..7073e5d 100644 --- a/regina/utility/globals.py +++ b/regina/utility/globals.py @@ -17,6 +17,7 @@ settings = { "unique_visitor_is_ip_address": False, "get_visitor_location": False, "get_cities_for_countries": [""], # list if country codes for which the ip address ranges need to be collected at city level, not country level + "hash_ip_address": True, # VISUALIZATION "get_human_percentage": False, diff --git a/regina/utility/sql_util.py b/regina/utility/sql_util.py index 2e3f9a8..de06c9b 100644 --- a/regina/utility/sql_util.py +++ b/regina/utility/sql_util.py @@ -2,8 +2,7 @@ import sqlite3 as sql """Various utilities""" def sanitize(s): if type(s) != str: return s - return s\ - .replace("''", "'").replace("'", r"''").strip(" ") + return s.replace("'", r"''").strip(" ") # .replace('"', r'\"')\ def sql_get_constaint_str(constraints: list[tuple[str, str|int]], logic="AND") -> str: @@ -35,6 +34,12 @@ def sql_tablesize(cur: sql.Cursor, table: str) -> int: cur.execute(f"SELECT Count(*) FROM {table}") return cur.fetchone()[0] +def sql_max(cur: sql.Cursor, table: str, column: str) -> int: + cur.execute(f"SELECT MAX({column}) FROM {table}") + val = cur.fetchone()[0] + if not type(val) == int: val = 0 + return val + def sql_get_count_where(cur: sql.Cursor, table, constraints) -> int: cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {sql_get_constaint_str(constraints)}") return cur.fetchone()[0]