diff --git a/database.uxf b/database.uxf index 4794468..e4e262c 100644 --- a/database.uxf +++ b/database.uxf @@ -1,13 +1,13 @@ - 10 + 8 UMLClass - 70 - 220 - 250 - 190 + 96 + 248 + 160 + 144 visitor -- @@ -15,53 +15,35 @@ - visitor_id: INTEGER -- - ip_address: INTEGER -- visitor agent string: TEXT -- platform: TEXT -- browser: TEXT +- platform_id: INTEGER +- browser_id: INTEGER - mobile: INTEGER - is_human: INTEGER - range_id: INTEGER -style=autoresize - - - - UMLClass - - 640 - 220 - 180 - 100 - - filegroup --- -<<PK>> -- group_id: INTEGER --- -- group_name: TEXT style=autoresize Relation - 560 - 220 - 100 - 50 + 216 + 168 + 32 + 96 lt=- m1=n m2=1 - 10.0;20.0;80.0;20.0 + 10.0;100.0;10.0;10.0 UMLClass - 380 - 210 - 190 - 170 + 352 + 240 + 152 + 136 request -- @@ -69,10 +51,10 @@ m2=1 - request_id: INTEGER -- - visitor_id: INTEGER -- group_id: INTEGER +- route_id: INTEGER +- referer_id: INTEGER -- -- date: TEXT -- referer: TEXT +- time: INTEGER - status: INTEGER style=autoresize @@ -80,81 +62,42 @@ style=autoresize Relation - 310 - 220 - 90 - 50 + 248 + 248 + 120 + 40 lt=- m1=1 m2=n - 10.0;20.0;70.0;20.0 + 10.0;20.0;130.0;20.0 UMLClass - 640 - 400 - 180 - 100 - - file --- -<<PK>> -- filename: TEXT --- -- group_id: INTEGER --- -style=autoresize - - - - Relation - - 670 - 310 - 40 - 110 - - lt=- -m1=n -m2=1 - - 10.0;90.0;10.0;10.0 - - - UMLNote - - 490 - 100 - 300 - 70 - - One group contains multiple files. -Lets you group the images from a -html with the html itself -style=autoresize - - - - Relation - - 650 - 160 - 30 + 16 + 96 + 160 80 - lt=<- - 10.0;60.0;10.0;10.0 + platform +-- +<<PK>> +- platform_id: INTEGER +-- +- name: TEXT UNIQUE +-- +style=autoresize + UMLClass - 360 - 520 - 190 - 130 + 328 + 488 + 152 + 104 city -- @@ -170,15 +113,15 @@ style=autoresize UMLClass - 620 - 520 - 120 - 110 + 536 + 488 + 152 + 88 country -- <<PK>> -- country_id +- country_id: INTEGER -- - name: TEXT - code: TEXT @@ -188,10 +131,10 @@ style=autoresize Relation - 540 - 540 - 100 - 50 + 472 + 504 + 80 + 40 lt=- m1=1 @@ -202,10 +145,10 @@ m2=n Relation - 280 - 540 - 100 - 50 + 264 + 504 + 80 + 40 lt=- m1=1 @@ -216,10 +159,10 @@ m2=n UMLClass - 120 - 520 - 170 - 130 + 136 + 488 + 136 + 104 ip_range -- @@ -235,15 +178,111 @@ style=autoresize Relation - 170 - 400 - 40 - 140 + 176 + 384 + 32 + 120 lt=- m1=1 m2=n - 10.0;120.0;10.0;10.0 + 10.0;130.0;10.0;10.0 + + + UMLClass + + 576 + 264 + 144 + 80 + + route +-- +<<PK>> +- route_id: INTEGER +-- +- name: TEXT UNIQUE +-- +style=autoresize + + + + UMLClass + + 208 + 96 + 152 + 80 + + browser +-- +<<PK>> +- browser_id: INTEGER +-- +- name: TEXT UNIQUE +-- +style=autoresize + + + + Relation + + 144 + 168 + 32 + 96 + + lt=- +m1=n +m2=1 + + 10.0;100.0;10.0;10.0 + + + UMLClass + + 392 + 96 + 152 + 80 + + referer +-- +<<PK>> +- referer_id: INTEGER +-- +- name: TEXT UNIQUE +-- +style=autoresize + + + + Relation + + 400 + 168 + 32 + 88 + + lt=- +m1=n +m2=1 + + 10.0;90.0;10.0;10.0 + + + Relation + + 496 + 288 + 96 + 40 + + lt=- +m1=n +m2=1 + + 10.0;20.0;100.0;20.0 diff --git a/regina/db_operation/database.py b/regina/db_operation/database.py index e3b7948..a4dacab 100644 --- a/regina/db_operation/database.py +++ b/regina/db_operation/database.py @@ -5,8 +5,18 @@ from os import path, listdir import pkg_resources import re from datetime import datetime as dt + +if __name__ == "__main__": # make relative imports work as described here: https://peps.python.org/pep-0366/#proposed-change + if __package__ is None: + __package__ = "regina" + import sys + from os import path + 
filepath = path.realpath(path.abspath(__file__)) + print(path.dirname(path.dirname(path.dirname(filepath)))) + sys.path.insert(0, path.dirname(path.dirname(path.dirname(filepath)))) + # local -from .utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max +from .utility.sql_util import replace_null, sanitize, sql_select, sql_exists from .utility.utility import pdebug, get_filepath, warning, pmessage from .utility.globals import settings from .db_operation.request import Request @@ -16,97 +26,6 @@ from .utility.globals import visitor_agent_operating_systems, visitor_agent_brow create reginas database as shown in the uml diagram database.uxf """ -class Entry: - """ - represents an sql entry - type_ is INTEGER, TEXT, REAL... - """ - def __init__(self, name, type_) -> None: - self.name = name - self.type_ = type_ - def __repr__(self): - return f"[{self.name}] {self.type_}" - -class Table: - def __init__(self, name, key: Entry, entries: list[Entry]=[], constaints: list[str]=[]): - self.name = name - self.key = key - self.entries = entries - self.constaints = constaints - def create_sql_str(self): - return f"CREATE TABLE IF NOT EXISTS {self.name}\n({self})\n" - def __repr__(self): - s = f"{self.key} PRIMARY KEY" - for entry in self.entries: - s += f", {entry}" - for c in self.constaints: - s += f", {c}" - return s - - -t_request = "request" -t_file = "file" -t_filegroup = "filegroup" -t_visitor = "visitor" -t_city = "city" -t_country = "country" -t_ip_range = "ip_range" - -visitor_id = Entry("visitor_id", "INTEGER") -request_id = Entry("request_id", "INTEGER") -filegroup_id = Entry("group_id", "INTEGER") -ip_address_entry = Entry("ip_address", "INTEGER") -filename_entry = Entry("filename", "TEXT") -city_id = Entry("city_id", "INTEGER") -country_id = Entry("country_id", "INTEGER") -ip_range_id = Entry("ip_range_id", "INTEGER") - -database_tables = { - t_visitor: Table(t_visitor, visitor_id, [ - Entry("ip_address", "INTEGER"), - Entry("visitor_agent", "TEXT"), - Entry("platform", "TEXT"), - Entry("browser", "TEXT"), - Entry("mobile", "INTEGER"), - Entry("is_human", "INTEGER"), - ip_range_id, - ], - [f"UNIQUE({visitor_id.name})"]), - t_file: Table(t_file, filename_entry, - [filegroup_id], - [f"UNIQUE({filename_entry.name})"]), - t_filegroup: Table(t_filegroup, filegroup_id, - [Entry("groupname", "TEXT")], - [f"UNIQUE({filegroup_id.name})"]), - t_request: Table(t_request, request_id, [ - visitor_id, - filegroup_id, - Entry("date", "INTEGER"), - Entry("referer", "TEXT"), - Entry("status", "INTEGER") - ], - ["UNIQUE(request_id)"]), - t_ip_range: Table(t_ip_range, ip_range_id, [ - Entry("lower", "INTEGER"), - Entry("upper", "INTEGER"), - city_id, - ], - [f"UNIQUE({ip_range_id.name})"]), - t_city: Table(t_city, city_id, [ - country_id, - Entry("name", "TEXT"), - Entry("region", "TEXT"), - ], - [f"UNIQUE({city_id.name})"]), - t_country: Table(t_country, country_id, [ - Entry("name", "TEXT"), - Entry("code", "TEXT"), - ], - [f"UNIQUE({country_id.name})"]), -} - - - class Database: def __init__(self, database_path): self.conn = sql.connect(database_path) @@ -118,6 +37,7 @@ class Database: with open(pkg_resources.resource_filename("regina", "sql/create_db.sql"), "r") as file: create_db = file.read() self.cur.execute(create_db) + self.conn.commit() def __call__(self, s): """execute a command and return fetchall()""" @@ -127,42 +47,27 @@ class Database: # # VISITOR # - def visitor_exists(self, request) -> bool: - if settings["hash_ip_address"]: - ip_address = 
hash(request.ip_address) - else: - ip_address = request.ip_address - if settings["unique_visitor_is_ip_address"]: - return sql_exists(self.cur, t_visitor, [("ip_address", ip_address)]) - else: - return sql_exists(self.cur, t_visitor, [("ip_address", ip_address), ("visitor_agent", request.visitor_agent)]) - def is_visitor_human(self, visitor_id: int): """ check if they have a known platform AND browser - check if at least one request did not result in an error (http status >= 400) + if settings "human_needs_success": check if at least one request did not result in an error (http status >= 400) """ max_success_status = 400 if settings["status_300_is_success"]: max_success_status = 300 - self.cur.execute(f"SELECT browser, platform FROM {t_visitor} WHERE visitor_id = {visitor_id}") + self.cur.execute(f"SELECT browser_id, platform_id FROM visitor WHERE visitor_id = {visitor_id}") browsers_and_platforms = self.cur.fetchall() if len(browsers_and_platforms) != 1: pdebug(f"is_visitor_human: {visitor_id} - could not find visitor or found too many") return False - if not browsers_and_platforms[0][0] in visitor_agent_browsers: + browser = self.get_name("browser", browsers_and_platforms[0][0]) + if not browser in visitor_agent_browsers: return False - if not browsers_and_platforms[0][1] in visitor_agent_operating_systems: + platform = self.get_name("platform", browsers_and_platforms[0][1]) + if not platform in visitor_agent_operating_systems: return False - # check if has browser - # self.cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_visitor} WHERE visitor_id = {visitor_id} AND platform IS NOT NULL AND browser IS NOT NULL)") - # if no browser and platform - # exists = self.cur.fetchone() - # if exists is None or exists[0] == 0: - # return False - # if human needs successful request if settings["human_needs_success"]: # check if at least request was successful (status < 400) - self.cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_request} WHERE visitor_id = {visitor_id} AND status < {max_success_status})") + self.cur.execute(f"SELECT EXISTS (SELECT 1 FROM request WHERE visitor_id = {visitor_id} AND status < {max_success_status})") if self.cur.fetchone()[0] == 1: # pdebug(f"is_visitor_human: Visitor {visitor_id} is human") pass @@ -171,67 +76,85 @@ class Database: return False return True - def get_visitor_id(self, request: Request) -> int: + def get_visitor_id(self, request: Request, insert=True) -> int | None: """ get the visitor_id. 
Adds the visitor if not already existing
         """
+        """
+        get the visitor_id:
+        If the setting "unique_visitor_is_ip_address" is set, a visitor is identified by its ip address alone,
+        otherwise by ip address, browser and platform.
+
+        If the visitor does not exist yet and insert is True, it is inserted and its new id is returned.
+        If it does not exist and insert is False, None is returned.
+        """
         if settings["hash_ip_address"]:
             ip_address = hash(request.ip_address)
         else:
             ip_address = request.ip_address
-        if self.visitor_exists(request):
-            if settings["unique_visitor_is_ip_address"]:
-                visitor_id = sql_select(self.cur, t_visitor, [("ip_address", ip_address)])[0][0]
-            else:
-                visitor_id = sql_select(self.cur, t_visitor, [("ip_address", ip_address), ("visitor_agent", request.visitor_agent)])[0][0]
-        else: # new visitor
-            # new visitor_id is number of elements
-            visitor_id = sql_max(self.cur, t_visitor, "visitor_id") + 1
-            # pdebug("new visitor:", visitor_id, request.ip_address)
-            platform, browser, mobile = get_os_browser_pairs_from_agent(request.visitor_agent)
-            ip_range_id_val = 0
+        # if insert == True, ids will be int
+        browser_id: int | None = self.get_id("browser", request.get_browser(), insert=insert)
+        platform_id: int | None = self.get_id("platform", request.get_platform(), insert=insert)
+        constraints = [("ip_address", ip_address)]
+        if not settings["unique_visitor_is_ip_address"]:
+            if browser_id: constraints.append(("browser_id", browser_id))
+            if platform_id: constraints.append(("platform_id", platform_id))
+        require_update_is_human = False
+        if not sql_exists(self.cur, "visitor", constraints):
+            require_update_is_human = True
+            if not insert:
+                return None
+            is_mobile = int(request.get_mobile())
+            ip_range_id = 0
             if settings["get_visitor_location"]:
-            ip_range_id_val = get_ip_range_id(self.cur, request.ip_address)
-            is_human = 0 # is_visitor_human cannot be called until visitor is in db int(is_visitor_human(self.cur, visitor_id))
-            self.cur.execute(f"INSERT INTO {t_visitor} (visitor_id, ip_address, visitor_agent, platform, browser, mobile, is_human, {ip_range_id.name}) VALUES ({visitor_id}, '{ip_address}', '{request.visitor_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}', '{ip_range_id_val}');")
+                ip_range_id = self.get_ip_range_id(request.ip_address)
+            is_human = 0  # is_visitor_human cannot be called until visitor is in db
+            self.cur.execute(f"INSERT INTO visitor (ip_address, ip_range_id, platform_id, browser_id, is_mobile, is_human) VALUES ('{ip_address}', '{ip_range_id}', '{platform_id}', '{browser_id}', '{is_mobile}', '{is_human}');")
+        visitor_id = sql_select(self.cur, "visitor", constraints)[0][0]
+        # TODO: if requests are not added yet, the visitor might not be recognized since it does not have a successful request yet
+        if require_update_is_human:
+            is_human = self.is_visitor_human(visitor_id)
+            if is_human:
+                self.cur.execute(f"UPDATE visitor SET is_human = 1 WHERE visitor_id = {visitor_id}")
         return visitor_id
     #
     # REQUEST
     #
-    def request_exists(self, request: Request, visitor_id: int, group_id: int):
-        # get all requests from same visitor to same location
-        # TODO this looks wrong
-        self.cur.execute(f"SELECT request_id, date FROM {t_request} WHERE visitor_id = '{visitor_id}' AND group_id = '{group_id}'")
+    def request_exists(self, request: Request, visitor_id: int, route_id: int):
+        """
+        Check if the same visitor already made a request to the same route on the same day (only if the setting "request_is_same_on_same_day" is enabled).
+        If the setting is disabled, always returns False.
+        """
+        if not settings["request_is_same_on_same_day"]: return False
+        # get all requests from same visitor to same route
+        self.cur.execute(f"SELECT request_id, time FROM request WHERE visitor_id = '{visitor_id}' AND route_id = '{route_id}'")
+        # check if on same day
         date0 = dt.fromtimestamp(request.time_local).strftime("%Y-%m-%d")
         for request_id, date1 in self.cur.fetchall():
-            if settings["request_is_same_on_same_day"]:
-                date1 = dt.fromtimestamp(date1).strftime("%Y-%m-%d")
-                if date0 == date1:
-                    pdebug(f"request_exists: Request is on same day as request {request_id}")
-                    return True
+            date1 = dt.fromtimestamp(date1).strftime("%Y-%m-%d")
+            if date0 == date1:
+                pdebug(f"request_exists: Request is on same day as request {request_id}")
+                return True
         return False
     def add_request(self, request: Request) -> (int | None):
         """returns visitor_id if new request was added, else None"""
-        # skip requests to blacklisted locations
-        if request_blacklist:
-            if re.fullmatch(request_blacklist, request.request_file):
-                # pdebug(f"add_requests_to_db: request on blacklist '{request.request_file}'")
-                return None
-        # pdebug("add_requests_to_db:", i, "request:", request)
         visitor_id = self.get_visitor_id(request)
         self.conn.commit()
-        group_id: int = self.get_filegroup(request.request_file)
+        # browser_id = self.get_id("browser", request.get_browser())
+        # platform_id = self.get_id("platform", request.get_platform())
+        referer_id = self.get_id("referer", request.referer)
+        route_id = self.get_id("route", request.request_route)
         # check if request is unique
-        if self.request_exists(request, visitor_id, group_id):
+        if self.request_exists(request, visitor_id, route_id):
             # pdebug("request exists:", request)
             return None
         else:
             # pdebug("new request:", request)
-            sql_insert(t_request, [[None, visitor_id, group_id, request.time_local, request.referer, request.status]])
+            self.cur.execute(f"INSERT INTO request (visitor_id, route_id, referer_id, time, status) VALUES ({visitor_id}, {route_id}, {referer_id}, {request.time_local}, {request.status})")
         return visitor_id
     def add_requests(self, requests: list[Request]):
@@ -246,53 +169,50 @@
         # update the is_human column for all new visitors
         for visitor_id in new_visitors:
-            if not sql_exists(self.cur, t_visitor, [(str(visitor_id), "visitor_id")]): continue
-            is_human = self.is_visitor_human(visitor_id)
-            self.cur.execute(f"SELECT * FROM {t_visitor} WHERE visitor_id = {visitor_id}")
+            # TODO this does not look right
+            if not sql_exists(self.cur, "visitor", [("visitor_id", visitor_id)]): continue
            # pdebug(f"add_rq_to_db: {visitor_id} is_human? 
{is_human}, {self.cur.fetchall()}") - if is_human: - self.cur.execute(f"UPDATE {t_visitor} SET is_human = 1 WHERE visitor_id = {visitor_id}") self.conn.commit() pmessage(f"Collection Summary: Added {len(new_visitors)} new visitors and {added_requests} new requests.") - # - # FILE(GROUP) - # - def get_filegroup(self, filename: str) -> int: - """ - get the filegroup - returns the group where - 1) filename is the groupname - 2) the filetype of filename is the groupname - 3) new group with filename as gorupname - """ - # pdebug(f"get_filegroup: {filename}") - if sql_exists(self.cur, t_file, [("filename", filename)]): - return sql_select(self.cur, t_file, [("filename", filename)])[0][1] - else: - suffix = filename.split('.')[-1] - self.cur.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname = '{suffix}'") - # self.cur.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname LIKE '%.{suffix}'") - group_id_candidates = self.cur.fetchall() - # pdebug(f"get_filegroup: file={filename} candidates={group_id_candidates}") - if group_id_candidates: - return group_id_candidates[0][0] - else: # add new group file filename - group_id = sql_max(self.cur, t_filegroup, "group_id") + 1 - # pdebug("new file(group):", group_id, filename) - # add group - sql_insert(self.cur, t_filegroup, [[group_id, filename]]) - # add file - sql_insert(self.cur, t_file, [[filename, group_id]]) - return group_id + def get_id(self, table: str, name: str, insert=True) -> int | None: + """ + get the id of name in table + if name is not in table: + if insert: add and return id + else: return None + supported tables: platform, browser, referer, route, city + """ + supported_tables = ["platform", "browser", "referer", "route", "city"] + if not table in supported_tables: raise ValueError(f"table '{table}' is not supported ({supported_tables})") + name = sanitize(replace_null(name)) + # if non existent, add name + if not sql_exists(self.cur, table, [("name", name)]): + if not insert: return None + self.cur.execute(f"INSERT INTO {table} (name) VALUES ('{name}')") + return self(f"SELECT {table}_id FROM {table} WHERE name = '{name}'")[0][0] + + def get_name(self, table: str, id_: int) -> (str | None): + """ + get the name of id in table + if id is not in table, returns None + supported tables: platform, browser, referer, route, city + """ + supported_tables = ["platform", "browser", "referer", "route", "city"] + if not table in supported_tables: raise ValueError(f"table '{table}' is not supported ({supported_tables})") + ret = self(f"SELECT name FROM {table} WHERE {table}_id = '{id_}'") + if len(ret) == 0: return None + # TODO check if this returns tuple or value + return ret[0] + + # # GEOIP # - def get_ip_range_id(self, ip_address: int): - self.cur.execute(f"SELECT {ip_range_id.name} FROM {t_ip_range} WHERE '{ip_address}' BETWEEN lower AND upper") - results = self.cur.fetchall() + def get_ip_range_id(self, ip_address: int) -> int: + results = self(f"SELECT ip_range_id FROM ip_range WHERE '{ip_address}' BETWEEN low AND high") ip_range_id_val = 0 if len(results) == 0: pass @@ -302,174 +222,114 @@ class Database: ip_range_id_val = results[0][0] return ip_range_id_val + def update_ip_range_id(self, visitor_id: int): - self.cur.execute(f"SELECT ip_address FROM {t_visitor} WHERE visitor_id = {visitor_id}") - results = self.cur.fetchall() - if len(results) == 0: + """ + update the ip_range_id column of visitor with visitor_id + """ + results = self(f"SELECT ip_address FROM visitor WHERE visitor_id = {visitor_id}") + if len(results) 
== 0: # sanity checks warning(f"update_ip_range_id: Invalid visitor_id={visitor_id}") return elif len(results) > 1: warning(f"update_ip_range_id: Found multiple ip_addresses for visitor_id={visitor_id}: results={results}") return ip_address = results[0][0] - self.cur.execute(f"UPDATE {t_visitor} SET {ip_range_id.name} = '{get_ip_range_id(self.cur, ip_address)}' WHERE visitor_id = '{visitor_id}'") + self.cur.execute(f"UPDATE visitor SET ip_range_id = '{self.get_ip_range_id(ip_address)}' WHERE visitor_id = '{visitor_id}'") -def create_filegroups(cursor: sql.Cursor, filegroup_str: str): - """ - TODO: make re-usable (alter groups when config changes) - """ - # filegroup_str: 'name1: file1, file2, file3; name2: file33' - groups = filegroup_str.strip(";").split(";") - pdebug("create_filegroups:", groups) - for group in groups: - name, vals = group.split(":") - # create/get group - if sql_exists(cursor, t_filegroup, [("groupname", name)]): - group_id = sql_select(cursor, t_filegroup, [("groupname", name)])[0][0] + + + def get_country_id(self, name, code) -> int: + """ + get the id of country of name + if not present, insert and return id + """ + if not sql_exists(self.cur, "country", [("name", name)]): + self.cur.execute(f"INSERT INTO country (name, code) VALUES ('{name}', '{code}')") + countries = self(f"SELECT country_id FROM country WHERE name = '{name}'") + if len(countries) > 0: + country_id_val = countries[0][0] else: - group_id = sql_max(cursor, t_filegroup, "group_id") + 1 - sql_insert(cursor, t_filegroup, [(group_id, name)]) - # pdebug("create_filegroups: group_id", group_id) - # create/edit file - for filename in vals.split(","): - if sql_exists(cursor, t_file, [("filename", filename)]): # if exist, update - cursor.execute(f"UPDATE {t_file} SET group_id = {group_id} WHERE filename = '{filename}'") - else: - sql_insert(cursor, t_file, [[filename, group_id]]) + warning(f"get_country_id: Could not get country_id for name='{name}'.") + return 0 + assert(type(country_id_val) == int) + return country_id_val -def get_files_from_dir_rec(p: str, files: list[str]): - """recursivly append all files to files""" - pdebug("get_files_from_dir_rec:",p) - if path.isfile(p): - files.append(p) - elif path.isdir(p): - for p_ in listdir(p): - get_files_from_dir_rec(p + "/" + p_, files) + def get_city_id(self, name, region, country_id) -> int: + if not sql_exists(self.cur, "city", [("name", name), ("region", region), ("country_id", country_id)]): + self.cur.execute(f"INSERT INTO city (name, region, country_id) VALUES ('{name}', '{region}', '{country_id}')") + cities = sql_select(self.cur, "city", [("name", name), ("region", region), ("country_id", country_id)]) + if len(cities) > 0: + city_id_val = cities[0][0] + else: + warning(f"get_city_id: Could not get city_id for name='{name}', region='{region}' and country_id='{country_id}'.") + return 0 + assert(type(city_id_val) == int) + return city_id_val -def get_auto_filegroup_str(location_and_dirs:list[tuple[str, str]], auto_group_filetypes:list[str]) -> str: - """ - :param list of nginx locations and the corresponding directories - :param auto_filetype_groups list of filetypes for auto grouping - """ - files: list[str] = [] - start_i = 0 - if len(location_and_dirs) > 0 and len(location_and_dirs[0]) == 2: - for location, dir_ in location_and_dirs: - get_files_from_dir_rec(dir_, files) - # replace dir_ with location, eg /www/website with / - for i in range(start_i, len(files)): - files[i] = files[i].replace(dir_, location).replace("//", "/") - filegroups = "" - 
# create groups for each filetype - for ft in auto_group_filetypes: - filegroups += f"{ft}:" - for file in files: - if file.endswith(f".{ft}"): - filegroups += f"{file}," - filegroups = filegroups.strip(",") + ";" - pdebug("get_auto_filegroup_str: found files:", files, "filegroups_str:", filegroups) - return filegroups + def update_geoip_tables(self, geoip_city_csv_path: str): + """ + update the geoip data with the contents of the geoip_city_csv file -def get_country_id(cur:sql.Cursor, name, code, country_tablesize): - # countries = sql_select(cur, t_country, [("name", name)]) - cur.execute(f"SELECT {country_id.name} FROM {t_country} WHERE name = '{name}'") - countries = cur.fetchall() - if len(countries) > 0: - country_id_val = countries[0][0] - else: # insert new country - country_id_val = country_tablesize - # pdebug(f"update_geoip_tables: Adding country #{country_id_val}, name={name}") - cur.execute(f"INSERT INTO {t_country} ({country_id.name}, name, code) VALUES ({country_id_val}, '{name}', '{code}')") - country_tablesize += 1 - return country_id_val, country_tablesize + Make sure to update the visitor.ip_range_id column for all visitors. + In case something changed, they might point to a different city. (won't fix) + """ + # indices for the csv + FROM = 0; TO = 1; CODE = 2; COUNTRY = 3; REGION = 4; CITY = 5 + with open(geoip_city_csv_path, 'r') as file: + csv = reader(file, delimiter=',', quotechar='"') + # execute only if file could be opened + # delete all previous data + self.cur.execute(f"DELETE FROM ip_range") + self.cur.execute(f"DELETE FROM city") + self.cur.execute(f"DELETE FROM country") + self.cur.execute(f"VACUUM") -def get_city_id(cur: sql.Cursor, name, region, country_id, city_tablesize): - # cities = sql_select(cur, t_city, [("name", name)]) - cur.execute(f"SELECT {city_id.name} FROM {t_city} WHERE name = '{name}'") - cities = cur.fetchall() - if len(cities) > 0: - city_id_val = cities[0][0] - else: # insert new city - city_id_val = city_tablesize - # pdebug(f"update_geoip_tables: Adding city #{city_id_val}, name={row[CITY]}, country={country_id_val}") - cur.execute(f"INSERT INTO {t_city} ({city_id.name}, name, region, country_id) VALUES ({city_id_val}, '{name}', '{region}', '{country_id}')") - city_tablesize += 1 - return city_id_val, city_tablesize + # guarantees that unkown city/country will have id 0 + self.cur.execute(f"INSERT INTO country (country_id, name, code) VALUES (0, 'Unknown', 'XX') ") + self.cur.execute(f"INSERT INTO city (city_id, name, region) VALUES (0, 'Unknown', 'Unkown') ") + print(f"Recreating the geoip database from {geoip_city_csv_path}. 
This might take a long time...") -def update_geoip_tables(cur: sql.Cursor, geoip_city_csv: str): - FROM = 0; TO = 1; CODE = 2; COUNTRY = 3; REGION = 4; CITY = 5 - ip_range_id_val = 0 - with open(geoip_city_csv, 'r') as file: - # delete all previous data - cur.execute(f"DELETE FROM {t_ip_range}") - cur.execute(f"VACUUM") - csv = reader(file, delimiter=',', quotechar='"') + # for combining city ranges into a 'City in ' range + # country_id for the range that was last added (for combining multiple csv rows in one ip_range) + RANGE_DONE = -1 + combine_range_country_id = RANGE_DONE + combine_range_country_name = "" + combine_range_low = RANGE_DONE + combine_range_high = RANGE_DONE + def add_range(low, high, city_name, region, country_id): + city_id = self.get_city_id(city_name, region, country_id) + pdebug(f"update_ip_range_id: Adding range for city={city_name}, country_id={country_id}, low={low}, high={high}") + self.cur.execute(f"INSERT INTO ip_range (low, high, city_id) VALUES ({low}, {high}, {city_id})") + for row in csv: + # these might contain problematic characters (') + row[CITY] = sanitize(row[CITY]) + row[COUNTRY] = sanitize(row[COUNTRY]) + row[REGION] = sanitize(row[REGION]) - # guarantees that unkown city/country will have id 0 - if not sql_exists(cur, t_country, [("name", "Unknown")]): - cur.execute(f"INSERT INTO {t_country} ({country_id.name}, name, code) VALUES (0, 'Unknown', 'XX') ") - if not sql_exists(cur, t_city, [("name", "Unknown")]): - cur.execute(f"INSERT INTO {t_city} ({city_id.name}, name, region) VALUES (0, 'Unknown', 'Unkown') ") - country_tablesize = sql_tablesize(cur, t_country) - city_tablesize = sql_tablesize(cur, t_city) - print(f"Recreating the geoip database from {geoip_city_csv}. This might take a long time...") - combine_range_country_id = 0 - combine_range_lower = -1 - combine_range_upper = -1 - combine_range_country_name = "" - for row in csv: - # these might contain problematic characters (') - row[CITY] = sanitize(row[CITY]) - row[COUNTRY] = sanitize(row[COUNTRY]) - row[REGION] = sanitize(row[REGION]) - - # make sure country exists - country_id_val, country_tablesize = get_country_id(cur, row[COUNTRY], row[CODE], country_tablesize) - if row[CODE] in settings["get_cities_for_countries"]: - # make sure city exists - city_id_val, city_tablesize = get_city_id(cur, row[CITY], row[REGION], country_id_val, city_tablesize) - pdebug(f"update_ip_range_id: ip_range_id={ip_range_id_val}, Adding range for city={row[CITY]}, country={row[COUNTRY]}, lower={row[FROM]}, upper={row[TO]}") - cur.execute(f"INSERT INTO {t_ip_range} ({ip_range_id.name}, lower, upper, {city_id.name}) VALUES ({ip_range_id_val}, {row[FROM]}, {row[TO]}, {city_id_val})") - ip_range_id_val += 1 - else: - if combine_range_country_id >= 0: - if combine_range_country_id == country_id_val: combine_range_upper = row[TO] - else: # new range for country, append - # get id for dummy city - pdebug(f"update_ip_range_id: ip_range_id={ip_range_id_val}, Adding combined range for country={combine_range_country_name}, lower={combine_range_lower}, upper={combine_range_upper}") - city_id_val, city_tablesize = get_city_id(cur, f"City in {combine_range_country_name}", f"Region in {combine_range_country_name}", combine_range_country_id, city_tablesize) - cur.execute(f"INSERT INTO {t_ip_range} ({ip_range_id.name}, lower, upper, {city_id.name}) VALUES ({ip_range_id_val}, {combine_range_lower}, {combine_range_upper}, {city_id_val})") - ip_range_id_val += 1 - combine_range_country_id = -1 - if combine_range_country_id < 0 : 
# combine with later ranges - combine_range_country_id = country_id_val - combine_range_lower = row[FROM] - combine_range_upper = row[TO] - combine_range_country_name = row[COUNTRY] - if combine_range_country_id >= 0: # last range , append - # get id for dummy city - pdebug(f"update_ip_range_id: ip_range_id={ip_range_id_val}, Adding combined range for country={combine_range_country_name}, lower={combine_range_lower}, upper={combine_range_upper}") - city_id_val, city_tablesize = get_city_id(cur, f"City in {combine_range_country_name}", f"Region in {combine_range_country_name}", combine_range_country_id, city_tablesize) - cur.execute(f"INSERT INTO {t_ip_range} ({ip_range_id.name}, lower, upper, {city_id.name}) VALUES ({ip_range_id_val}, {combine_range_lower}, {combine_range_upper}, {city_id_val})") - ip_range_id_val += 1 - - -def create_db(db_name, filegroup_str="", location_and_dirs:list[tuple[str, str]]=[], auto_group_filetypes=[]): - """ - create the name with database_tables - """ - print(f"creating database: '{db_name}'") - conn = sql.connect(f"{db_name}") - cursor = conn.cursor() - for table in database_tables.values(): - cursor.execute(table.create_sql_str()) - filegroup_str = filegroup_str.strip("; ") + ";" + get_auto_filegroup_str(location_and_dirs, auto_group_filetypes) - create_filegroups(cursor, filegroup_str) - cursor.close() - conn.commit() - conn.close() - + # make sure country exists + country_id = self.get_country_id(row[COUNTRY], row[CODE]) + # only add cities for countries the user is interested in + if row[CODE] in settings["get_cities_for_countries"]: + add_range(row[FROM], row[TO], row[CITY], row[REGION], country_id) + else: + # if continuing + if combine_range_country_id != RANGE_DONE: + # if continuing previous range, extend the upper range limit + if combine_range_country_id == country_id: + combine_range_high = row[TO] + else: # new range for country, append + add_range(combine_range_low, combine_range_high, f"City in {combine_range_country_name}", f"Region in {combine_range_country_name}", combine_range_country_id) + combine_range_country_id = RANGE_DONE + # not elif, this has to be executed if previous else was executed + if combine_range_country_id == RANGE_DONE : # currently in new range, combine with later ranges + combine_range_country_id = country_id + combine_range_country_name = row[COUNTRY] + combine_range_low = row[FROM] + combine_range_high = row[TO] + if combine_range_country_id >= 0: # last range , append + add_range(combine_range_low, combine_range_high, f"City in {combine_range_country_name}", f"Region in {combine_range_country_name}", combine_range_country_id) if __name__ == '__main__': - create_db("test.db") + db = Database("test.db") diff --git a/regina/db_operation/request.py b/regina/db_operation/request.py index 9586ba0..261e0bc 100644 --- a/regina/db_operation/request.py +++ b/regina/db_operation/request.py @@ -13,7 +13,7 @@ class Request: def __init__(self, ip_address="", time_local="", request_type="", request_file="", request_protocol="", status="", bytes_sent="", referer="", visitor_agent=""): self.ip_address = int(IPv4Address(sanitize(ip_address))) self.time_local = 0 - #[20/Nov/2022:00:47:36 +0100] + # turn [20/Nov/2022:00:47:36 +0100] to unix time m = match(r"\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\]", time_local) if m: g = m.groups() @@ -29,7 +29,7 @@ class Request: else: warning(f"Request:__init__: Could not match time: '{time_local}'") self.request_type = sanitize(request_type) - self.request_file = sanitize(request_file) + 
self.request_route = sanitize(request_file) self.request_protocol = sanitize(request_protocol) self.status = sanitize(status) self.bytes_sent = sanitize(bytes_sent) @@ -37,9 +37,9 @@ class Request: self.visitor_agent = sanitize(visitor_agent) def __repr__(self): - return f"{self.ip_address} - {self.time_local} - {self.request_file} - {self.visitor_agent} - {self.status}" + return f"{self.ip_address} - {self.time_local} - {self.request_route} - {self.visitor_agent} - {self.status}" - def get_os(self): + def get_platform(self): # for groups in findall(re_visitor_agent, visitor_agent): operating_system = "" for os in visitor_agent_operating_systems:
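
Illustrative usage sketch for the reworked Database API (not part of the diff). It assumes the packaged sql/create_db.sql creates the tables shown in database.uxf, that the Request helpers referenced by the new code (get_browser(), get_mobile()) exist alongside the renamed get_platform(), and uses a made-up database file name and log values:

    from regina.db_operation.database import Database
    from regina.db_operation.request import Request

    db = Database("example.db")  # runs sql/create_db.sql on first use

    # lookup tables are deduplicated: the same name always maps to the same id
    route_id = db.get_id("route", "/index.html")
    assert db.get_id("route", "/index.html") == route_id
    assert db.get_id("route", "/not-stored.html", insert=False) is None  # lookup only, no insert

    # one parsed nginx access-log line, inserted through the normalized schema
    request = Request(ip_address="203.0.113.7",
                      time_local="[20/Nov/2022:00:47:36 +0100]",
                      request_type="GET",
                      request_file="/index.html",
                      request_protocol="HTTP/1.1",
                      status="200",
                      bytes_sent="512",
                      referer="https://example.org/",
                      visitor_agent="Mozilla/5.0 (X11; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0")
    visitor_id = db.add_request(request)  # returns the visitor_id, or None if the request is a same-day duplicate
    db.conn.commit()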