From 0e0ece77ea16b757ac821bd73c03cdb79d71ebd7 Mon Sep 17 00:00:00 2001 From: "matthias@arch" Date: Mon, 15 May 2023 22:01:30 +0200 Subject: [PATCH] Changed database structure added new tables to reduce the size of the database --- database.svg | 280 +++++++++++++++++++++++++++++++++++++ database.uxf | 191 +++++++++++++------------ regina/database.py | 245 +++++++++++++++++++------------- regina/sql/create_db.sql | 8 +- regina/utility/sql_util.py | 47 ++++++- 5 files changed, 571 insertions(+), 200 deletions(-) create mode 100644 database.svg diff --git a/database.svg b/database.svg new file mode 100644 index 0000000..714d817 --- /dev/null +++ b/database.svg @@ -0,0 +1,280 @@ + + +referer«PK»- referer_id: INTEGER- name: TEXT UNIQUEbrowser«PK»- browser_id: INTEGER- name: TEXT UNIQUEroute«PK»- route_id: INTEGER- name: TEXT UNIQUEip_range«PK»- ip_range_id- low: INTEGER UNIQUE- high: INTEGER UNIQUE- city_id: INTEGERcountry«PK»- country_id: INTEGER- name: TEXT UNIQUE- code: TEXT UNIQUEcity«PK»- city_id: INTEGER- country_id: INTEGER- name: TEXT- region: TEXTplatform«PK»- platform_id: INTEGER- name: TEXT UNIQUErequest«PK»- request_id: INTEGER- visitor_id: INTEGER- route_id: INTEGER- referer_id: INTEGER- time: INTEGER- status: INTEGERvisitor«PK»- visitor_id: INTEGER- ip_address: INTEGER- platform_id: INTEGER- browser_id: INTEGER- is_mobile: INTEGER- is_human: INTEGER- ip_range_id: INTEGERn1n1n11n1n1n1nn1 diff --git a/database.uxf b/database.uxf index e4e262c..215a626 100644 --- a/database.uxf +++ b/database.uxf @@ -1,13 +1,13 @@ - 8 + 9 UMLClass - 96 - 248 - 160 - 144 + 441 + 306 + 180 + 162 visitor -- @@ -17,19 +17,20 @@ - ip_address: INTEGER - platform_id: INTEGER - browser_id: INTEGER -- mobile: INTEGER +- is_mobile: INTEGER - is_human: INTEGER -- range_id: INTEGER -style=autoresize +- ip_range_id: INTEGER +style=autoresize +bg=MAGENTA Relation - 216 - 168 - 32 - 96 + 558 + 216 + 36 + 108 lt=- m1=n @@ -40,10 +41,10 @@ m2=1 UMLClass - 352 - 240 - 152 - 136 + 702 + 306 + 171 + 153 request -- @@ -56,30 +57,31 @@ m2=1 -- - time: INTEGER - status: INTEGER -style=autoresize +style=autoresize +bg=CYAN Relation - 248 - 248 - 120 - 40 + 612 + 333 + 108 + 45 lt=- m1=1 m2=n - 10.0;20.0;130.0;20.0 + 10.0;20.0;100.0;20.0 UMLClass - 16 - 96 - 160 - 80 + 333 + 135 + 180 + 90 platform -- @@ -88,16 +90,17 @@ m2=n -- - name: TEXT UNIQUE -- -style=autoresize +style=autoresize +bg=MAGENTA UMLClass - 328 - 488 - 152 - 104 + 702 + 549 + 171 + 117 city -- @@ -107,95 +110,98 @@ style=autoresize - country_id: INTEGER - name: TEXT - region: TEXT -style=autoresize +style=autoresize +bg=ORANGE UMLClass - 536 - 488 - 152 - 88 + 945 + 549 + 171 + 99 country -- <<PK>> - country_id: INTEGER -- -- name: TEXT -- code: TEXT -style=autoresize +- name: TEXT UNIQUE +- code: TEXT UNIQUE +style=autoresize +bg=ORANGE Relation - 472 - 504 - 80 - 40 + 864 + 567 + 99 + 45 lt=- m1=1 m2=n - 80.0;20.0;10.0;20.0 + 90.0;20.0;10.0;20.0 Relation - 264 - 504 - 80 - 40 + 612 + 567 + 108 + 45 lt=- m1=1 m2=n - 80.0;20.0;10.0;20.0 + 100.0;20.0;10.0;20.0 UMLClass - 136 - 488 - 136 - 104 + 441 + 549 + 180 + 117 ip_range -- <<PK>> -- range_id +- ip_range_id -- -- from: INTEGER -- to: INTEGER +- low: INTEGER UNIQUE +- high: INTEGER UNIQUE - city_id: INTEGER -style=autoresize +style=autoresize +bg=ORANGE Relation - 176 - 384 - 32 - 120 + 522 + 459 + 36 + 108 lt=- m1=1 m2=n - 10.0;130.0;10.0;10.0 + 10.0;100.0;10.0;10.0 UMLClass - 576 - 264 - 144 - 80 + 945 + 306 + 162 + 90 route -- @@ -204,16 +210,17 @@ m2=n -- - name: TEXT UNIQUE -- -style=autoresize +style=autoresize +bg=CYAN UMLClass - 208 - 96 - 152 - 80 + 549 + 135 + 171 + 90 browser -- @@ -222,16 +229,17 @@ style=autoresize -- - name: TEXT UNIQUE -- -style=autoresize +style=autoresize +bg=MAGENTA Relation - 144 - 168 - 32 - 96 + 486 + 216 + 36 + 108 lt=- m1=n @@ -242,10 +250,10 @@ m2=1 UMLClass - 392 - 96 - 152 - 80 + 756 + 135 + 171 + 90 referer -- @@ -254,35 +262,36 @@ m2=1 -- - name: TEXT UNIQUE -- -style=autoresize +style=autoresize +bg=CYAN Relation - 400 - 168 - 32 - 88 + 783 + 216 + 36 + 108 lt=- m1=n m2=1 - 10.0;90.0;10.0;10.0 + 10.0;100.0;10.0;10.0 Relation - 496 - 288 - 96 - 40 + 864 + 333 + 99 + 45 lt=- m1=n m2=1 - 10.0;20.0;100.0;20.0 + 10.0;20.0;90.0;20.0 diff --git a/regina/database.py b/regina/database.py index f62a1a7..0d4ac00 100644 --- a/regina/database.py +++ b/regina/database.py @@ -15,11 +15,11 @@ if __name__ == "__main__": # make relative imports work as described here: http sys.path.insert(0, path.dirname(path.dirname(filepath))) # local -from regina.utility.sql_util import replace_null, sanitize, sql_select, sql_exists +from regina.utility.sql_util import replace_null, sanitize, sql_select, sql_exists, sql_tablesize from regina.utility.utility import pdebug, get_filepath, warning, pmessage, is_blacklisted, is_whitelisted from regina.utility.globals import settings from regina.data_collection.request import Request -from regina.utility.globals import visitor_agent_operating_systems, visitor_agent_browsers, settings +from regina.utility.globals import user_agent_platforms, user_agent_browsers, settings """ create reginas database as shown in the uml diagram database.uxf @@ -32,14 +32,22 @@ class Database: # verify that the database is created self.cur.execute("pragma schema_version") if self.cur.fetchone()[0] == 0: # not created - pdebug(f"Database.__init__: Creating database at {database_path}") + pdebug(f"Database.__init__: Creating new databse at {database_path}", lvl=1) with open(pkg_resources.resource_filename("regina", "sql/create_db.sql"), "r") as file: create_db = file.read() self.cur.executescript(create_db) self.conn.commit() + else: + pdebug(f"Database.__init__: Opening existing database at {database_path}", lvl=1) + + def __del__(self): + self.cur.close() + self.conn.commit() + self.conn.close() def __call__(self, s): """execute a command and return fetchall()""" + pdebug(f"Database: execute: \"{s}\"", lvl=4) self.cur.execute(s) return self.cur.fetchall() def execute(self, s): @@ -51,133 +59,147 @@ class Database: # VISITOR # def is_visitor_human(self, visitor_id: int): + self.execute(f"SELECT is_human FROM visitor WHERE visitor_id = {visitor_id}") + if self.fetchone()[0] == 1: + return True + return False + + def update_is_visitor_human(self, visitor_id: int): """ check if they have a known platform AND browser if settings "human_needs_success": check if at least one request did not result in an error (http status >= 400) + + updates the visitor.is_human column + @returns True if human, else False """ - max_success_status = 400 - if settings["status_300_is_success"]: max_success_status = 300 + def set_not_human(debug_str=""): + pdebug(f"update_is_visitor_human: visitor_id={visitor_id:5} is not human: Failed check: {debug_str}", lvl=3) + self.cur.execute(f"UPDATE visitor SET is_human = 0 WHERE visitor_id = {visitor_id}") + return False + self.cur.execute(f"SELECT browser_id, platform_id FROM visitor WHERE visitor_id = {visitor_id}") - browsers_and_platforms = self.cur.fetchall() - if len(browsers_and_platforms) != 1: - pdebug(f"is_visitor_human: {visitor_id} - could not find visitor or found too many") - return False - browser = self.get_name("browser", browsers_and_platforms[0][0]) - if not browser in visitor_agent_browsers: - return False - platform = self.get_name("platform", browsers_and_platforms[0][1]) - if not platform in visitor_agent_operating_systems: - return False - if settings["human_needs_success"]: + browser_id, platform_id = self.cur.fetchall()[0] + browser = self.get_name("browser", browser_id) + if not browser in user_agent_browsers: + return set_not_human("browser") + + platform = self.get_name("platform", platform_id) + if not platform in user_agent_platforms: + return set_not_human("platform") + + max_success_status = 300 + if settings["data-collection"]["status_300_is_success"]: max_success_status = 400 + + if settings["data-collection"]["human_needs_successful_request"]: # check if at least request was successful (status < 400) self.cur.execute(f"SELECT EXISTS (SELECT 1 FROM request WHERE visitor_id = {visitor_id} AND status < {max_success_status})") - if self.cur.fetchone()[0] == 1: - # pdebug(f"is_visitor_human: Visitor {visitor_id} is human") - pass - else: - # pdebug(f"is_visitor_human: Visitor {visitor_id} only had unsuccessful requests") - return False + if self.cur.fetchone()[0] == 0: + return set_not_human("successful request") + # if here, is human + self.cur.execute(f"UPDATE visitor SET is_human = 1 WHERE visitor_id = {visitor_id}") return True - def get_visitor_id(self, request: Request, insert=True) -> int | None: - """ - get the visitor_id. Adds the visitor if not already existing - """ + def get_visitor_id(self, request: Request, insert=True) -> tuple[int | None, bool]: """ get the visitor_id: - If settings unique_visitor_is_ip_address: Check if visitor with ip address exists - Else: check if visitor with ip_address, browser and platform exists + if settings unique_visitor_is_ip_address: Check if visitor with ip address exists + else: check if visitor with ip_address, browser and platform exists - If visitor does not exist and insert: insert, return id - Else: return None + @return visitor_id, is_new_visitor + if visitor does not exist: + if insert: return visitor_id, True + else: return None, False + else: return visitor_id, False """ - if settings["hash_ip_address"]: - ip_address = hash(request.ip_address) - else: - ip_address = request.ip_address + ip_address = request.ip_address # if insert == True, ids will be int browser_id: int | None = self.get_id("browser", request.get_browser(), insert=insert) platform_id: int | None = self.get_id("platform", request.get_platform(), insert=insert) constraints = [("ip_address", ip_address)] - if not settings["unique_visitor_is_ip_address"]: + if not settings["data-collection"]["unique_visitor_is_ip_address"]: if browser_id: constraints.append(("browser_id", browser_id)) if platform_id: constraints.append(("platform_id", platform_id)) - require_update_is_human = False + is_new_visitor = False if not sql_exists(self.cur, "visitor", constraints): - require_update_is_human = True + is_new_visitor = True if not insert: - return None + return None, False is_mobile = int(request.get_mobile()) ip_range_id = 0 - if settings["get_visitor_location"]: + if settings["data-collection"]["get_visitor_location"]: ip_range_id = self.get_ip_range_id(request.ip_address) - is_human = 0 # is_visitor_human cannot be called until visitor is in db - self.cur.execute(f"INSERT INTO visitor (ip_address, ip_range_id, platform_id, browser_id, is_mobile, is_human, ip_range_id) VALUES ('{ip_address}', '{ip_range_id}', '{platform_id}', '{browser_id}', '{is_mobile}', '{is_human}');") + is_human = 0 # update_is_visitor_human cannot be called until visitor is in db + self.cur.execute(f"INSERT INTO visitor (ip_address, ip_range_id, platform_id, browser_id, is_mobile, is_human) VALUES ('{ip_address}', '{ip_range_id}', '{platform_id}', '{browser_id}', '{is_mobile}', '{is_human}');") visitor_id = sql_select(self.cur, "visitor", constraints)[0][0] - # TODO: if requests are not added yet, visitor might not be recognized since it does not have a successful requets yet - if require_update_is_human: - is_human = self.is_visitor_human(visitor_id) - if is_human: - self.cur.execute(f"UPDATE visitor SET is_human = 1 WHERE visitor_id = {visitor_id}") - return visitor_id + return visitor_id, is_new_visitor + def get_visitor_ids_for_date(self, date:str) -> list[int]: + return [ visitor_id[0] for visitor_id in self(f"SELECT DISTINCT visitor_id FROM request WHERE {date}") ] + + def get_visitor_count(self) -> int: + return sql_tablesize(self.cur, "visitor") # # REQUEST # - def request_exists(self, request: Request, visitor_id: int, route_id: int): + def get_request_count(self) -> int: + return sql_tablesize(self.cur, "request") + + def request_exists(self, request_timestamp: int, visitor_id: int, route_id: int): """ - Check if a request from same visitor was made to same location in the same day, if setting "request_is_same_on_same_day" is True - If not, always returns False + Return if a request from same visitor was made to same route within the timespan set by the 'ignore_duplicate_requests_within_x_seconds' option """ - if not settings["request_is_same_on_same_day"]: return False - # get all requests from same visitor to same route - self.cur.execute(f"SELECT request_id, time FROM request WHERE visitor_id = '{visitor_id}' AND = route_id = '{route_id}'") - # check if on same day - date0 = dt.fromtimestamp(request.time_local).strftime("%Y-%m-%d") - for request_id, date1 in self.cur.fetchall(): - date1 = dt.fromtimestamp(date1).strftime("%Y-%m-%d") - if date0 == date1: - pdebug(f"request_exists: Request is on same day as request {request_id}") - return True + ignore_seconds = settings["data-collection"]["ignore_duplicate_requests_within_x_seconds"] + time_min, time_max = max(0, request_timestamp - ignore_seconds), request_timestamp + ignore_seconds + requests = self(f"SELECT request_id, time FROM request WHERE visitor_id = '{visitor_id}' AND route_id = '{route_id}' AND time BETWEEN {time_min} AND {time_max}") + if len(requests) > 0: + pdebug(f"request_exists: Found {len(requests)} requests within {ignore_seconds} minutes (v_id={visitor_id}, r_id={route_id}, t={request_timestamp})") + return True return False - def add_request(self, request: Request) -> (int | None): - """returns visitor_id if new request was added, else None""" - visitor_id = self.get_visitor_id(request) - self.conn.commit() - # browser_id = self.get_id("browser", request.get_browser()) - # platform_id = self.get_id("platform", request.get_platform()) + def add_request(self, request: Request) -> tuple[int | None, bool]: + """ + @returns visitor_id, is_new_visitor + if new request was added, else None + """ + visitor_id, is_new_visitor = self.get_visitor_id(request) referer_id = self.get_id("referer", request.referer) route_id = self.get_id("route", request.route) # check if request is unique - if self.request_exists(request, visitor_id, route_id): - # pdebug("request exists:", request) - return None + if self.request_exists(request.time_local, visitor_id, route_id): + pdebug("add_request: exists:", request, lvl=3) + return None, is_new_visitor else: - # pdebug("new request:", request) + pdebug("add_request: added", request, lvl=3) self.cur.execute(f"INSERT INTO request (visitor_id, route_id, referer_id, time, status) VALUES ({visitor_id}, {route_id}, {referer_id}, {request.time_local}, {request.status})") - return visitor_id + return visitor_id, is_new_visitor def add_requests(self, requests: list[Request]): - added_requests = 0 + """ + Add a list of requests to the database + Adds the visitors, if needed + @returs added_request_count, visitors_count, new_visitors_count + """ + added_request_count = 0 # check the new visitors later - new_visitors = [] + visitors: set[int] = set() + new_visitors: set[int] = set() for i in range(len(requests)): - if is_blacklisted(requests[i].request_route, settings["request_route_blacklist"]): continue - if not is_whitelisted(requests[i].request_route, settings["request_route_whitelist"]): continue - visitor = self.add_request(requests[i]) - if visitor: - new_visitors.append(visitor) + if is_blacklisted(requests[i].route, settings["data-collection"]["request_route_blacklist"]): continue + if not is_whitelisted(requests[i].route, settings["data-collection"]["request_route_whitelist"]): continue + visitor_id, is_new_visitor = self.add_request(requests[i]) + if visitor_id: + added_request_count += 1 + visitors.add(visitor_id) + if is_new_visitor: + new_visitors.add(visitor_id) # update the is_human column for all new visitors for visitor_id in new_visitors: - # TODO this does not look right - if not sql_exists(self.cur, "visitor", [("visitor_id", visitor_id)]): continue - # pdebug(f"add_rq_to_db: {visitor_id} is_human? {is_human}, {self.cur.fetchall()}") - self.conn.commit() - pmessage(f"Collection Summary: Added {len(new_visitors)} new visitors and {added_requests} new requests.") + self.update_is_visitor_human(visitor_id) + + return added_request_count, len(visitors), len(new_visitors) def get_id(self, table: str, name: str, insert=True) -> int | None: @@ -192,7 +214,8 @@ class Database: if not table in supported_tables: raise ValueError(f"table '{table}' is not supported ({supported_tables})") name = sanitize(replace_null(name)) # if non existent, add name - if not sql_exists(self.cur, table, [("name", name)]): + pdebug(f"get_id(table={table},\tname={name}", lvl=4) + if not sql_exists(self.cur, table, [("name", name)], do_sanitize=False): # double sanitizing might lead to problems with quotes if not insert: return None self.cur.execute(f"INSERT INTO {table} (name) VALUES ('{name}')") return self(f"SELECT {table}_id FROM {table} WHERE name = '{name}'")[0][0] @@ -207,8 +230,7 @@ class Database: if not table in supported_tables: raise ValueError(f"table '{table}' is not supported ({supported_tables})") ret = self(f"SELECT name FROM {table} WHERE {table}_id = '{id_}'") if len(ret) == 0: return None - # TODO check if this returns tuple or value - return ret[0] + return ret[0][0] @@ -231,7 +253,7 @@ class Database: """ update the ip_range_id column of visitor with visitor_id """ - results = self(f"SELECT ip_address FROM visitor WHERE visitor_id = {visitor_id}") + results = self(f"SELECT ip_address FROM visitor WHERE visitor_id = '{visitor_id}'") if len(results) == 0: # sanity checks warning(f"update_ip_range_id: Invalid visitor_id={visitor_id}") return @@ -248,7 +270,9 @@ class Database: get the id of country of name if not present, insert and return id """ - if not sql_exists(self.cur, "country", [("name", name)]): + name = sanitize(name) + code = sanitize(code) + if not sql_exists(self.cur, "country", [("name", name)], do_sanitize=False): self.cur.execute(f"INSERT INTO country (name, code) VALUES ('{name}', '{code}')") countries = self(f"SELECT country_id FROM country WHERE name = '{name}'") if len(countries) > 0: @@ -260,9 +284,11 @@ class Database: return country_id_val def get_city_id(self, name, region, country_id) -> int: - if not sql_exists(self.cur, "city", [("name", name), ("region", region), ("country_id", country_id)]): + name = sanitize(name) + region = sanitize(region) + if not sql_exists(self.cur, "city", [("name", name), ("region", region), ("country_id", country_id)], do_sanitize=False): self.cur.execute(f"INSERT INTO city (name, region, country_id) VALUES ('{name}', '{region}', '{country_id}')") - cities = sql_select(self.cur, "city", [("name", name), ("region", region), ("country_id", country_id)]) + cities = sql_select(self.cur, "city", [("name", name), ("region", region), ("country_id", country_id)], do_sanitize=False) if len(cities) > 0: city_id_val = cities[0][0] else: @@ -283,19 +309,36 @@ class Database: """ # indices for the csv FROM = 0; TO = 1; CODE = 2; COUNTRY = 3; REGION = 4; CITY = 5 + + # FROM https://stackoverflow.com/questions/845058/how-to-get-line-count-of-a-large-file-cheaply-in-python (Quentin Pradet) + def _count_generator(reader): + b = reader(1024 * 1024) + while b: + yield b + b = reader(1024*1024) + def rawgencount(filename): + with open(filename, "rb") as file: + f_gen = _count_generator(file.raw.read) + return sum( buf.count(b'\n') for buf in f_gen ) + + pmessage(f"Recreating the GeoIP database from {geoip_city_csv_path}. This might take a long time...") + row_count = rawgencount(geoip_city_csv_path) + pmessage(f"Total rows: {row_count}") + with open(geoip_city_csv_path, 'r') as file: csv = reader(file, delimiter=',', quotechar='"') + file.seek(0) # execute only if file could be opened # delete all previous data self.cur.execute(f"DELETE FROM ip_range") self.cur.execute(f"DELETE FROM city") self.cur.execute(f"DELETE FROM country") + self.conn.commit() self.cur.execute(f"VACUUM") # guarantees that unkown city/country will have id 0 self.cur.execute(f"INSERT INTO country (country_id, name, code) VALUES (0, 'Unknown', 'XX') ") self.cur.execute(f"INSERT INTO city (city_id, name, region) VALUES (0, 'Unknown', 'Unkown') ") - print(f"Recreating the geoip database from {geoip_city_csv_path}. This might take a long time...") # for combining city ranges into a 'City in ' range # country_id for the range that was last added (for combining multiple csv rows in one ip_range) @@ -307,18 +350,22 @@ class Database: def add_range(low, high, city_name, region, country_id): city_id = self.get_city_id(city_name, region, country_id) - pdebug(f"update_ip_range_id: Adding range for city={city_name}, country_id={country_id}, low={low}, high={high}") + pdebug(f"update_ip_range_id: Adding range for city={city_name:20}, country_id={country_id:3}, low={low:16}, high={high:16}", lvl=2) self.cur.execute(f"INSERT INTO ip_range (low, high, city_id) VALUES ({low}, {high}, {city_id})") - for row in csv: + for i, row in enumerate(csv, 1): + # if i % 100 == 0: + pmessage(f"Updating GeoIP database: {i:7}/{row_count} ({100.0*i/row_count:.2f}%)", end="\r") # these might contain problematic characters (') - row[CITY] = sanitize(row[CITY]) - row[COUNTRY] = sanitize(row[COUNTRY]) - row[REGION] = sanitize(row[REGION]) + # row[CITY] = sanitize(row[CITY]) + if row[COUNTRY] == "United Kingdom of Great Britain and Northern Ireland": + row[COUNTRY] = "United Kingdom" + # row[COUNTRY] = sanitize(row[COUNTRY]) + # row[REGION] = sanitize(row[REGION]) # make sure country exists country_id = self.get_country_id(row[COUNTRY], row[CODE]) # only add cities for countries the user is interested in - if row[CODE] in settings["get_cities_for_countries"]: + if row[CODE] in settings["data-collection"]["get_cities_for_countries"]: add_range(row[FROM], row[TO], row[CITY], row[REGION], country_id) else: # if continuing @@ -343,13 +390,13 @@ class Database: # REQUEST # # TIME/DATE - def get_earliest_date(self) -> int: + def get_earliest_timestamp(self) -> int: """return the earliest time as unixepoch""" date = self(f"SELECT MIN(time) FROM request")[0][0] if not isinstance(date, int): return 0 else: return date - def get_latest_date(self) -> int: + def get_latest_timestamp(self) -> int: """return the latest time as unixepoch""" date = self(f"SELECT MAX(time) FROM request")[0][0] if not isinstance(date, int): return 0 diff --git a/regina/sql/create_db.sql b/regina/sql/create_db.sql index a712aac..973c609 100644 --- a/regina/sql/create_db.sql +++ b/regina/sql/create_db.sql @@ -1,7 +1,7 @@ -- see database.uxf CREATE TABLE IF NOT EXISTS visitor( visitor_id INTEGER PRIMARY KEY, - + ip_address INTEGER, ip_range_id INTEGER, platform_id INTEGER, browser_id INTEGER, @@ -28,12 +28,12 @@ CREATE TABLE IF NOT EXISTS request( request_id INTEGER PRIMARY KEY, visitor_id INTEGER, route_id INTEGER, - referer INTEGER, + referer_id INTEGER, time INTEGER, status INTEGER, FOREIGN KEY(visitor_id) REFERENCES visitor(visitor_id), FOREIGN KEY(route_id) REFERENCES route(route_id), - FOREIGN KEY(referer) REFERENCES referer(referer_id) + FOREIGN KEY(referer_id) REFERENCES referer(referer_id) ) STRICT; CREATE TABLE IF NOT EXISTS referer( @@ -57,7 +57,7 @@ CREATE TABLE IF NOT EXISTS ip_range( ) STRICT; CREATE TABLE IF NOT EXISTS city( - city INTEGER PRIMARY KEY, + city_id INTEGER PRIMARY KEY, name TEXT, region TEXT, country_id INTEGER, diff --git a/regina/utility/sql_util.py b/regina/utility/sql_util.py index b399705..808da10 100644 --- a/regina/utility/sql_util.py +++ b/regina/utility/sql_util.py @@ -1,6 +1,40 @@ import sqlite3 as sql """Various utilities""" +def get_date_constraint(at_date=None, min_date=None, max_date=None): + """ + get a condition string that sets a condition on the time to a certain date + the conditions can be a string representing a date or an int/float in unixepoch + """ + # dates in unix time + s = "" + if at_date is not None: + if isinstance(at_date, str): + s += f"DATE(time, 'unixepoch') = '{sanitize(at_date)}' AND " + elif isinstance(at_date, int|float): + s += f"time = {int(at_date)} AND " + else: + print(f"WARNING: get_where_date_str: Invalid type of argument at_date: {type(at_date)}") + if min_date is not None: + if isinstance(min_date, str): + s += f"DATE(time, 'unixepoch') >= '{sanitize(min_date)}' AND " + elif isinstance(min_date, int|float): + s += f"time >= {int(min_date)} AND " + else: + print(f"WARNING: get_where_date_str: Invalid type of argument min_date: {type(min_date)}") + if max_date is not None: + if isinstance(max_date, str): + s += f"DATE(time, 'unixepoch') <= '{sanitize(max_date)}' AND " + elif isinstance(max_date, int|float): + s += f"time <= {int(max_date)} AND " + else: + print(f"WARNING: get_where_date_str: Invalid type of argument max_date: {type(max_date)}") + if s == "": + print(f"WARNING: get_where_date_str: no date_str generated. Returning 'time > 0'. at_date={at_date}, min_date={min_date}, max_date={max_date}") + return "time > 0" + return s.removesuffix(" AND ") + + def replace_null(s): if not s: return "None" @@ -11,10 +45,11 @@ def sanitize(s): return s.replace("'", r"''").strip(" ") # .replace('"', r'\"')\ -def sql_get_constaint_str(constraints: list[tuple[str, str|int]], logic="AND") -> str: +def sql_get_constaint_str(constraints: list[tuple[str, str|int]], logic="AND", do_sanitize=True) -> str: c_str = "" for name, val in constraints: - c_str += f"{name} = '{sanitize(val)}' {logic} " + if do_sanitize: val = sanitize(val) + c_str += f"{name} = '{val}' {logic} " return c_str.strip(logic + " ") def sql_get_value_str(values: list[list]) -> str: @@ -25,12 +60,12 @@ def sql_get_value_str(values: list[list]) -> str: c_str = c_str.strip(", ") + "), " return c_str.strip(", ") -def sql_exists(cur: sql.Cursor, table: str, constraints: list[tuple[str, str|int]], logic="AND") -> bool: - cur.execute(f"SELECT EXISTS (SELECT 1 FROM {table} WHERE {sql_get_constaint_str(constraints, logic)})") +def sql_exists(cur: sql.Cursor, table: str, constraints: list[tuple[str, str|int]], logic="AND", do_sanitize=True) -> bool: + cur.execute(f"SELECT EXISTS (SELECT 1 FROM {table} WHERE {sql_get_constaint_str(constraints, logic, do_sanitize=do_sanitize)})") return cur.fetchone()[0] == 1 -def sql_select(cur: sql.Cursor, table: str, constraints: list[tuple[str, str|int]], logic="AND"): - cur.execute(f"SELECT * FROM {table} WHERE {sql_get_constaint_str(constraints, logic)}") +def sql_select(cur: sql.Cursor, table: str, constraints: list[tuple[str, str|int]], logic="AND", do_sanitize=True): + cur.execute(f"SELECT * FROM {table} WHERE {sql_get_constaint_str(constraints, logic, do_sanitize=do_sanitize)}") return cur.fetchall() def sql_insert(cur: sql.Cursor, table: str, values: list[list]):