diff --git a/regina/db_operation/collect.py b/regina/db_operation/collect.py index 7cc9fdc..de79710 100644 --- a/regina/db_operation/collect.py +++ b/regina/db_operation/collect.py @@ -1,49 +1,12 @@ import sqlite3 as sql from re import fullmatch, match -from ipaddress import IPv4Address, ip_address -from time import mktime -from datetime import datetime as dt from regina.db_operation.database import t_request, t_visitor, t_file, t_filegroup, t_ip_range, database_tables, get_filegroup, ip_range_id from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max from regina.utility.utility import pdebug, warning, pmessage -from regina.utility.globals import visitor_agent_operating_systems, visitor_agent_browsers, settings """ collect information from the access log and put it into the database """ -months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aut", "Sep", "Oct", "Nov", "Dec"] - - - -class Request: - def __init__(self, ip_address="", time_local="", request_type="", request_file="", request_protocol="", status="", bytes_sent="", referer="", visitor_agent=""): - self.ip_address = int(IPv4Address(sanitize(ip_address))) - self.time_local = 0 - #[20/Nov/2022:00:47:36 +0100] - m = match(r"\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\]", time_local) - if m: - g = m.groups() - try: - if g[1] in months: - datetime_ = dt(int(g[2]), months.index(g[1])+1, int(g[0]), int(g[3]), int(g[4]), int(g[5])) - # pdebug(f"Request __init__: datetime {datetime_}, from {g}") - self.time_local = int(mktime(datetime_.timetuple())) - else: - warning(f"Request:__init__: Unkown month: '{g[1]}'. Using timestamp {self.time_local}") - except Exception as e: - warning(f"Request:__init__: {e}") - else: - warning(f"Request:__init__: Could not match time: '{time_local}'") - self.request_type = sanitize(request_type) - self.request_file = sanitize(request_file) - self.request_protocol = sanitize(request_protocol) - self.status = sanitize(status) - self.bytes_sent = sanitize(bytes_sent) - self.referer = sanitize(referer) - self.visitor_agent = sanitize(visitor_agent) - - def __repr__(self): - return f"{self.ip_address} - {self.time_local} - {self.request_file} - {self.visitor_agent} - {self.status}" re_remote_addr = r"[0-9a-fA-F.:]+" re_remote_visitor = ".*" @@ -54,6 +17,7 @@ re_body_bytes_sent = r'\d+' re_http_referer = r'"([^"]*)"' re_http_visitor_agent = r'"([^"]*)"' re_log_format: str = f'({re_remote_addr}) - ({re_remote_visitor}) ({re_time_local}) ({re_request}) ({re_status}) ({re_body_bytes_sent}) {re_http_referer} {re_http_visitor_agent}' + def parse_log(logfile:str) -> list[Request]: """ create Request objects from each line in the logfile @@ -77,171 +41,3 @@ def parse_log(logfile:str) -> list[Request]: status=g[4], bytes_sent=g[5], referer=g[6], visitor_agent=g[7])) return requests - -def visitor_exists(cursor, request) -> bool: - if settings["hash_ip_address"]: - ip_address = hash(request.ip_address) - else: - ip_address = request.ip_address - if settings["unique_visitor_is_ip_address"]: - return sql_exists(cursor, t_visitor, [("ip_address", ip_address)]) - else: - return sql_exists(cursor, t_visitor, [("ip_address", ip_address), ("visitor_agent", request.visitor_agent)]) - -def get_visitor_id(request: Request, cursor: sql.Cursor) -> int: - """ - get the visitor_id. Adds the visitor if not already existing - """ - if settings["hash_ip_address"]: - ip_address = hash(request.ip_address) - else: - ip_address = request.ip_address - - if visitor_exists(cursor, request): - if settings["unique_visitor_is_ip_address"]: - visitor_id = sql_select(cursor, t_visitor, [("ip_address", ip_address)])[0][0] - else: - visitor_id = sql_select(cursor, t_visitor, [("ip_address", ip_address), ("visitor_agent", request.visitor_agent)])[0][0] - else: # new visitor - # new visitor_id is number of elements - visitor_id = sql_max(cursor, t_visitor, "visitor_id") + 1 - # pdebug("new visitor:", visitor_id, request.ip_address) - platform, browser, mobile = get_os_browser_pairs_from_agent(request.visitor_agent) - ip_range_id_val = 0 - if settings["get_visitor_location"]: - ip_range_id_val = get_ip_range_id(cursor, request.ip_address) - is_human = 0 # is_visitor_human cannot be called until visitor is in db int(is_visitor_human(cursor, visitor_id)) - cursor.execute(f"INSERT INTO {t_visitor} (visitor_id, ip_address, visitor_agent, platform, browser, mobile, is_human, {ip_range_id.name}) VALUES ({visitor_id}, '{ip_address}', '{request.visitor_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}', '{ip_range_id_val}');") - return visitor_id - -def is_visitor_human(cur: sql.Cursor, visitor_id: int): - global settings - """ - check if they have a known platform AND browser - check if at least one request did not result in an error (http status >= 400) - """ - max_success_status = 400 - if settings["status_300_is_success"]: max_success_status = 300 - cur.execute(f"SELECT browser, platform FROM {t_visitor} WHERE visitor_id = {visitor_id}") - browsers_and_platforms = cur.fetchall() - if len(browsers_and_platforms) != 1: - pdebug(f"is_visitor_human: {visitor_id} - could not find visitor or found too many") - return False - if not browsers_and_platforms[0][0] in visitor_agent_browsers: - return False - if not browsers_and_platforms[0][1] in visitor_agent_operating_systems: - return False - # check if has browser - # cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_visitor} WHERE visitor_id = {visitor_id} AND platform IS NOT NULL AND browser IS NOT NULL)") - # if no browser and platform - # exists = cur.fetchone() - # if exists is None or exists[0] == 0: - # return False - # if human needs successful request - if settings["human_needs_success"]: - # check if at least request was successful (status < 400) - cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_request} WHERE visitor_id = {visitor_id} AND status < {max_success_status})") - if cur.fetchone()[0] == 1: - # pdebug(f"is_visitor_human: Visitor {visitor_id} is human") - pass - else: - # pdebug(f"is_visitor_human: Visitor {visitor_id} only had unsuccessful requests") - return False - # visitor is human - return True - -def request_exists(cur: sql.Cursor, request: Request, visitor_id: int, group_id: int): - # get all requests from same visitor to same location - cur.execute(f"SELECT request_id, date FROM {t_request} WHERE visitor_id = '{visitor_id}' AND group_id = '{group_id}'") - date0 = dt.fromtimestamp(request.time_local).strftime("%Y-%m-%d") - for request_id, date1 in cur.fetchall(): - if settings["request_is_same_on_same_day"]: - date1 = dt.fromtimestamp(date1).strftime("%Y-%m-%d") - if date0 == date1: - pdebug(f"request_exists: Request is on same day as request {request_id}") - return True - return False - - -# re_visitor_agent = r"(?: ?([\w\- ]+)(?:\/([\w.]+))?(?: \(([^()]*)\))?)" -# 1: platform, 2: version, 3: details -def get_os_browser_pairs_from_agent(visitor_agent): - # for groups in findall(re_visitor_agent, visitor_agent): - operating_system = "" - browser = "" - mobile = "Mobi" in visitor_agent - for os in visitor_agent_operating_systems: - if os in visitor_agent: - operating_system = os - break - for br in visitor_agent_browsers: - if br in visitor_agent: - browser = br - break - # if not operating_system or not browser: print(f"Warning: get_os_browser_pairs_from_agent: Could not find all information for agent '{visitor_agent}', found os: '{operating_system}' and browser: '{browser}'") - return operating_system, browser, mobile - - -def get_ip_range_id(cur: sql.Cursor, ip_address: int): - cur.execute(f"SELECT {ip_range_id.name} FROM {t_ip_range} WHERE '{ip_address}' BETWEEN lower AND upper") - results = cur.fetchall() - ip_range_id_val = 0 - if len(results) == 0: - pass - elif len(results) > 1: - warning(f"get_countries: Found multiple ip_ranges for ip_address={ip_address}: results={results}") - else: - ip_range_id_val = results[0][0] - return ip_range_id_val - -def update_ip_range_id(cur: sql.Cursor, visitor_id: int): - cur.execute(f"SELECT ip_address FROM {t_visitor} WHERE visitor_id = {visitor_id}") - results = cur.fetchall() - if len(results) == 0: - warning(f"update_ip_range_id: Invalid visitor_id={visitor_id}") - return - elif len(results) > 1: - warning(f"update_ip_range_id: Found multiple ip_addresses for visitor_id={visitor_id}: results={results}") - return - ip_address = results[0][0] - cur.execute(f"UPDATE {t_visitor} SET {ip_range_id.name} = '{get_ip_range_id(cur, ip_address)}' WHERE visitor_id = '{visitor_id}'") - - -def add_requests_to_db(requests: list[Request], db_name: str): - conn = sql.connect(db_name) - cursor = conn.cursor() - added_requests = 0 - # check the new visitors later - max_visitor_id = sql_max(cursor, t_visitor, "visitor_id") - request_blacklist = settings["request_location_regex_blacklist"] - for i in range(len(requests)): - request = requests[i] - # skip requests to blacklisted locations - if request_blacklist: - if fullmatch(request_blacklist, request.request_file): - # pdebug(f"add_requests_to_db: request on blacklist '{request.request_file}'") - continue - # pdebug("add_requests_to_db:", i, "request:", request) - visitor_id = get_visitor_id(request, cursor) - conn.commit() - group_id: int = get_filegroup(request.request_file, cursor) - # check if request is unique - if request_exists(cursor, request, visitor_id, group_id): - # pdebug("request exists:", request) - pass - else: - # pdebug("new request:", request) - request_id = sql_max(cursor, t_request, "request_id") + 1 - sql_insert(cursor, t_request, [[request_id, visitor_id, group_id, request.time_local, request.referer, request.status]]) - added_requests += 1 - visitor_count = sql_tablesize(cursor, t_visitor) - for visitor_id in range(max_visitor_id, visitor_count): - if not sql_exists(cursor, t_visitor, [(str(visitor_id), "visitor_id")]): continue - is_human = is_visitor_human(cursor, visitor_id) - cursor.execute(f"SELECT * FROM {t_visitor} WHERE visitor_id = {visitor_id}") - # pdebug(f"add_rq_to_db: {visitor_id} is_human? {is_human}, {cursor.fetchall()}") - if is_human: - cursor.execute(f"UPDATE {t_visitor} SET is_human = 1 WHERE visitor_id = {visitor_id}") - cursor.close() - conn.commit() - pmessage(f"Collection Summary: Added {visitor_count - max_visitor_id} new visitors and {added_requests} new requests.") diff --git a/regina/db_operation/database.py b/regina/db_operation/database.py index b157c11..e3b7948 100644 --- a/regina/db_operation/database.py +++ b/regina/db_operation/database.py @@ -2,10 +2,15 @@ import sqlite3 as sql from csv import reader from os import path, listdir +import pkg_resources +import re +from datetime import datetime as dt # local -from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max -from regina.utility.utility import pdebug -from regina.utility.globals import settings +from .utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max +from .utility.utility import pdebug, get_filepath, warning, pmessage +from .utility.globals import settings +from .db_operation.request import Request +from .utility.globals import visitor_agent_operating_systems, visitor_agent_browsers, settings """ create reginas database as shown in the uml diagram database.uxf @@ -37,6 +42,8 @@ class Table: for c in self.constaints: s += f", {c}" return s + + t_request = "request" t_file = "file" t_filegroup = "filegroup" @@ -100,36 +107,217 @@ database_tables = { -def get_filegroup(filename: str, cursor: sql.Cursor) -> int: - """ - get the filegroup - returns the group where - 1) filename is the groupname - 2) the filetype of filename is the groupname - 3) new group with filename as gorupname - """ - # pdebug(f"get_filegroup: {filename}") - if sql_exists(cursor, t_file, [("filename", filename)]): - return sql_select(cursor, t_file, [("filename", filename)])[0][1] - else: - suffix = filename.split('.')[-1] - cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname = '{suffix}'") - # cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname LIKE '%.{suffix}'") - group_id_candidates = cursor.fetchall() - # pdebug(f"get_filegroup: file={filename} candidates={group_id_candidates}") - if group_id_candidates: - return group_id_candidates[0][0] - else: # add new group file filename - group_id = sql_max(cursor, t_filegroup, "group_id") + 1 +class Database: + def __init__(self, database_path): + self.conn = sql.connect(database_path) + self.cur = self.conn.cursor() + # verify that the database is created + self.cur.execute("pragma schema_version") + if self.cur.fetchone()[0] == 0: # not created + pdebug(f"Database.__init__: Creating database at {database_path}") + with open(pkg_resources.resource_filename("regina", "sql/create_db.sql"), "r") as file: + create_db = file.read() + self.cur.execute(create_db) - # pdebug("new file(group):", group_id, filename) - # add group - sql_insert(cursor, t_filegroup, [[group_id, filename]]) - # add file - sql_insert(cursor, t_file, [[filename, group_id]]) - return group_id + def __call__(self, s): + """execute a command and return fetchall()""" + self.cur.execute(s) + return self.cur.fetchall() + + # + # VISITOR + # + def visitor_exists(self, request) -> bool: + if settings["hash_ip_address"]: + ip_address = hash(request.ip_address) + else: + ip_address = request.ip_address + if settings["unique_visitor_is_ip_address"]: + return sql_exists(self.cur, t_visitor, [("ip_address", ip_address)]) + else: + return sql_exists(self.cur, t_visitor, [("ip_address", ip_address), ("visitor_agent", request.visitor_agent)]) + + def is_visitor_human(self, visitor_id: int): + """ + check if they have a known platform AND browser + check if at least one request did not result in an error (http status >= 400) + """ + max_success_status = 400 + if settings["status_300_is_success"]: max_success_status = 300 + self.cur.execute(f"SELECT browser, platform FROM {t_visitor} WHERE visitor_id = {visitor_id}") + browsers_and_platforms = self.cur.fetchall() + if len(browsers_and_platforms) != 1: + pdebug(f"is_visitor_human: {visitor_id} - could not find visitor or found too many") + return False + if not browsers_and_platforms[0][0] in visitor_agent_browsers: + return False + if not browsers_and_platforms[0][1] in visitor_agent_operating_systems: + return False + # check if has browser + # self.cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_visitor} WHERE visitor_id = {visitor_id} AND platform IS NOT NULL AND browser IS NOT NULL)") + # if no browser and platform + # exists = self.cur.fetchone() + # if exists is None or exists[0] == 0: + # return False + # if human needs successful request + if settings["human_needs_success"]: + # check if at least request was successful (status < 400) + self.cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_request} WHERE visitor_id = {visitor_id} AND status < {max_success_status})") + if self.cur.fetchone()[0] == 1: + # pdebug(f"is_visitor_human: Visitor {visitor_id} is human") + pass + else: + # pdebug(f"is_visitor_human: Visitor {visitor_id} only had unsuccessful requests") + return False + return True + + def get_visitor_id(self, request: Request) -> int: + """ + get the visitor_id. Adds the visitor if not already existing + """ + if settings["hash_ip_address"]: + ip_address = hash(request.ip_address) + else: + ip_address = request.ip_address + + if self.visitor_exists(request): + if settings["unique_visitor_is_ip_address"]: + visitor_id = sql_select(self.cur, t_visitor, [("ip_address", ip_address)])[0][0] + else: + visitor_id = sql_select(self.cur, t_visitor, [("ip_address", ip_address), ("visitor_agent", request.visitor_agent)])[0][0] + else: # new visitor + # new visitor_id is number of elements + visitor_id = sql_max(self.cur, t_visitor, "visitor_id") + 1 + # pdebug("new visitor:", visitor_id, request.ip_address) + platform, browser, mobile = get_os_browser_pairs_from_agent(request.visitor_agent) + ip_range_id_val = 0 + if settings["get_visitor_location"]: + ip_range_id_val = get_ip_range_id(self.cur, request.ip_address) + is_human = 0 # is_visitor_human cannot be called until visitor is in db int(is_visitor_human(self.cur, visitor_id)) + self.cur.execute(f"INSERT INTO {t_visitor} (visitor_id, ip_address, visitor_agent, platform, browser, mobile, is_human, {ip_range_id.name}) VALUES ({visitor_id}, '{ip_address}', '{request.visitor_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}', '{ip_range_id_val}');") + return visitor_id + + + # + # REQUEST + # + def request_exists(self, request: Request, visitor_id: int, group_id: int): + # get all requests from same visitor to same location + # TODO this looks wrong + self.cur.execute(f"SELECT request_id, date FROM {t_request} WHERE visitor_id = '{visitor_id}' AND group_id = '{group_id}'") + date0 = dt.fromtimestamp(request.time_local).strftime("%Y-%m-%d") + for request_id, date1 in self.cur.fetchall(): + if settings["request_is_same_on_same_day"]: + date1 = dt.fromtimestamp(date1).strftime("%Y-%m-%d") + if date0 == date1: + pdebug(f"request_exists: Request is on same day as request {request_id}") + return True + return False + + def add_request(self, request: Request) -> (int | None): + """returns visitor_id if new request was added, else None""" + # skip requests to blacklisted locations + if request_blacklist: + if re.fullmatch(request_blacklist, request.request_file): + # pdebug(f"add_requests_to_db: request on blacklist '{request.request_file}'") + return None + # pdebug("add_requests_to_db:", i, "request:", request) + visitor_id = self.get_visitor_id(request) + self.conn.commit() + group_id: int = self.get_filegroup(request.request_file) + # check if request is unique + if self.request_exists(request, visitor_id, group_id): + # pdebug("request exists:", request) + return None + else: + # pdebug("new request:", request) + sql_insert(t_request, [[None, visitor_id, group_id, request.time_local, request.referer, request.status]]) + return visitor_id + + def add_requests(self, requests: list[Request]): + added_requests = 0 + # check the new visitors later + request_blacklist = settings["request_location_regex_blacklist"] + new_visitors = [] + for i in range(len(requests)): + visitor = self.add_request(requests[i]) + if visitor: + new_visitors.append(visitor) + + # update the is_human column for all new visitors + for visitor_id in new_visitors: + if not sql_exists(self.cur, t_visitor, [(str(visitor_id), "visitor_id")]): continue + is_human = self.is_visitor_human(visitor_id) + self.cur.execute(f"SELECT * FROM {t_visitor} WHERE visitor_id = {visitor_id}") + # pdebug(f"add_rq_to_db: {visitor_id} is_human? {is_human}, {self.cur.fetchall()}") + if is_human: + self.cur.execute(f"UPDATE {t_visitor} SET is_human = 1 WHERE visitor_id = {visitor_id}") + self.conn.commit() + pmessage(f"Collection Summary: Added {len(new_visitors)} new visitors and {added_requests} new requests.") + + # + # FILE(GROUP) + # + def get_filegroup(self, filename: str) -> int: + """ + get the filegroup + returns the group where + 1) filename is the groupname + 2) the filetype of filename is the groupname + 3) new group with filename as gorupname + """ + # pdebug(f"get_filegroup: {filename}") + if sql_exists(self.cur, t_file, [("filename", filename)]): + return sql_select(self.cur, t_file, [("filename", filename)])[0][1] + else: + suffix = filename.split('.')[-1] + self.cur.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname = '{suffix}'") + # self.cur.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname LIKE '%.{suffix}'") + group_id_candidates = self.cur.fetchall() + # pdebug(f"get_filegroup: file={filename} candidates={group_id_candidates}") + if group_id_candidates: + return group_id_candidates[0][0] + else: # add new group file filename + group_id = sql_max(self.cur, t_filegroup, "group_id") + 1 + + # pdebug("new file(group):", group_id, filename) + # add group + sql_insert(self.cur, t_filegroup, [[group_id, filename]]) + # add file + sql_insert(self.cur, t_file, [[filename, group_id]]) + return group_id + + # + # GEOIP + # + def get_ip_range_id(self, ip_address: int): + self.cur.execute(f"SELECT {ip_range_id.name} FROM {t_ip_range} WHERE '{ip_address}' BETWEEN lower AND upper") + results = self.cur.fetchall() + ip_range_id_val = 0 + if len(results) == 0: + pass + elif len(results) > 1: + warning(f"get_ip_range_id: Found multiple ip_ranges for ip_address={ip_address}: results={results}") + else: + ip_range_id_val = results[0][0] + return ip_range_id_val + + def update_ip_range_id(self, visitor_id: int): + self.cur.execute(f"SELECT ip_address FROM {t_visitor} WHERE visitor_id = {visitor_id}") + results = self.cur.fetchall() + if len(results) == 0: + warning(f"update_ip_range_id: Invalid visitor_id={visitor_id}") + return + elif len(results) > 1: + warning(f"update_ip_range_id: Found multiple ip_addresses for visitor_id={visitor_id}: results={results}") + return + ip_address = results[0][0] + self.cur.execute(f"UPDATE {t_visitor} SET {ip_range_id.name} = '{get_ip_range_id(self.cur, ip_address)}' WHERE visitor_id = '{visitor_id}'") def create_filegroups(cursor: sql.Cursor, filegroup_str: str): + """ + TODO: make re-usable (alter groups when config changes) + """ # filegroup_str: 'name1: file1, file2, file3; name2: file33' groups = filegroup_str.strip(";").split(";") pdebug("create_filegroups:", groups) diff --git a/regina/db_operation/request.py b/regina/db_operation/request.py new file mode 100644 index 0000000..9586ba0 --- /dev/null +++ b/regina/db_operation/request.py @@ -0,0 +1,62 @@ +from ipaddress import IPv4Address, ip_address +from time import mktime +from re import fullmatch, match +from datetime import datetime as dt + +from .utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max +from .utility.utility import pdebug, warning, pmessage +from .utility.globals import visitor_agent_operating_systems, visitor_agent_browsers, settings + +months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aut", "Sep", "Oct", "Nov", "Dec"] + +class Request: + def __init__(self, ip_address="", time_local="", request_type="", request_file="", request_protocol="", status="", bytes_sent="", referer="", visitor_agent=""): + self.ip_address = int(IPv4Address(sanitize(ip_address))) + self.time_local = 0 + #[20/Nov/2022:00:47:36 +0100] + m = match(r"\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\]", time_local) + if m: + g = m.groups() + try: + if g[1] in months: + datetime_ = dt(int(g[2]), months.index(g[1])+1, int(g[0]), int(g[3]), int(g[4]), int(g[5])) + # pdebug(f"Request __init__: datetime {datetime_}, from {g}") + self.time_local = int(mktime(datetime_.timetuple())) + else: + warning(f"Request:__init__: Unkown month: '{g[1]}'. Using timestamp {self.time_local}") + except Exception as e: + warning(f"Request:__init__: {e}") + else: + warning(f"Request:__init__: Could not match time: '{time_local}'") + self.request_type = sanitize(request_type) + self.request_file = sanitize(request_file) + self.request_protocol = sanitize(request_protocol) + self.status = sanitize(status) + self.bytes_sent = sanitize(bytes_sent) + self.referer = sanitize(referer) + self.visitor_agent = sanitize(visitor_agent) + + def __repr__(self): + return f"{self.ip_address} - {self.time_local} - {self.request_file} - {self.visitor_agent} - {self.status}" + + def get_os(self): + # for groups in findall(re_visitor_agent, visitor_agent): + operating_system = "" + for os in visitor_agent_operating_systems: + if os in self.visitor_agent: + operating_system = os + break + return operating_system + + def get_browser(self): + browser = "" + for br in visitor_agent_browsers: + if br in self.visitor_agent: + browser = br + break + return browser + + def get_mobile(self): + return "Mobi" in self.visitor_agent + + diff --git a/regina/db_operation/visualize.py b/regina/db_operation/visualize.py index 6191a61..92e47bf 100644 --- a/regina/db_operation/visualize.py +++ b/regina/db_operation/visualize.py @@ -9,7 +9,7 @@ from datetime import datetime as dt from numpy import empty # local -from regina.db_operation.database import t_request, t_visitor, t_file, t_filegroup, t_ip_range, t_city, t_country +from regina.db_operation.database import Database, t_request, t_visitor, t_file, t_filegroup, t_ip_range, t_city, t_country from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where from regina.utility.utility import pdebug, warning, missing_arg from regina.utility.globals import settings @@ -66,7 +66,7 @@ def valid_status(status: int): # # FILTERS # -def get_os_browser_mobile_rankings(cur: sql.Cursor, visitor_ids: list[int]): +def get_os_browser_mobile_rankings(db: Database, visitor_ids: list[int]): """ returns [(count, operating_system)], [(count, browser)], mobile_visitor_percentage """ @@ -76,8 +76,7 @@ def get_os_browser_mobile_rankings(cur: sql.Cursor, visitor_ids: list[int]): browser_count = 0.0 mobile_ranking = { True: 0.0, False: 0.0 } for visitor_id in visitor_ids: - cur.execute(f"SELECT platform,browser,mobile FROM {t_visitor} WHERE visitor_id = {visitor_id}") - os, browser, mobile = cur.fetchone() + os, browser, mobile = db(f"SELECT platform,browser,mobile FROM {t_visitor} WHERE visitor_id = {visitor_id}")[0] mobile = bool(mobile) if os: if os in os_ranking: os_ranking[os] += 1 @@ -134,34 +133,30 @@ def get_where_date_str(at_date=None, min_date=None, max_date=None): # get the earliest date -def get_earliest_date(cur: sql.Cursor) -> int: +def get_earliest_date(db: Database) -> int: """return the earliest time as unixepoch""" - cur.execute(f"SELECT MIN(date) FROM {t_request}") - date = cur.fetchone()[0] + date = db(f"SELECT MIN(date) FROM {t_request}")[0][0] if not isinstance(date, int): return 0 else: return date # get the latest date -def get_latest_date(cur: sql.Cursor) -> int: +def get_latest_date(db: Database) -> int: """return the latest time as unixepoch""" - cur.execute(f"SELECT MAX(date) FROM {t_request}") - date = cur.fetchone()[0] + date = db(f"SELECT MAX(date) FROM {t_request}")[0][0] if not isinstance(date, int): return 0 else: return date # get all dates # the date:str parameter in all these function must be a sqlite constraint -def get_days(cur: sql.Cursor, date:str) -> list[str]: +def get_days(db: Database, date:str) -> list[str]: """get a list of all dates in yyyy-mm-dd format""" - cur.execute(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}") - days = [ date[0] for date in cur.fetchall() ] # fetchall returns tuples (date, ) + days = [ date[0] for date in db(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}")] # fetchall returns tuples (date, ) days.sort() return days -def get_months(cur: sql.Cursor, date:str) -> list[str]: +def get_months(db: Database, date:str) -> list[str]: """get a list of all dates in yyyy-mm format""" - cur.execute(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}") - dates = get_days(cur, date) + dates = get_days(db, date) date_dict = {} for date in dates: date_without_day = date[0:date.rfind('-')] @@ -169,14 +164,13 @@ def get_months(cur: sql.Cursor, date:str) -> list[str]: return list(date_dict.keys()) -def get_visitor_agent(cur: sql.Cursor, visitor_id: int): - return sql_select(cur, t_visitor, [("visitor_id", visitor_id)])[0][2] +def get_visitor_agent(db: Database, visitor_id: int): + return sql_select(db.cur, t_visitor, [("visitor_id", visitor_id)])[0][2] -def get_unique_visitor_ids_for_date(cur: sql.Cursor, date:str) -> list[int]: - cur.execute(f"SELECT DISTINCT visitor_id FROM {t_request} WHERE {date}") - return [ visitor_id[0] for visitor_id in cur.fetchall() ] +def get_unique_visitor_ids_for_date(db: Database, date:str) -> list[int]: + return [ visitor_id[0] for visitor_id in db(f"SELECT DISTINCT visitor_id FROM {t_request} WHERE {date}") ] -def get_human_visitors(cur: sql.Cursor, unique_visitor_ids, unique_visitor_ids_human: list): +def get_human_visitors(db: Database, unique_visitor_ids, unique_visitor_ids_human: list): """ check if they have a known platform AND browser check if at least one request did not result in an error (http status >= 400) @@ -195,22 +189,22 @@ def get_human_visitors(cur: sql.Cursor, unique_visitor_ids, unique_visitor_ids_h unique_visitor_ids_human.append(visitor_id) # pdebug("get_human_visitors: (2)", unique_visitor_ids_human) -def get_unique_request_ids_for_date(cur: sql.Cursor, date:str): +def get_unique_request_ids_for_date(db: Database, date:str): cur.execute(f"SELECT DISTINCT request_id FROM {t_request} WHERE {date}") return [ request_id[0] for request_id in cur.fetchall()] -def get_unique_request_ids_for_date_and_visitor(cur: sql.Cursor, date:str, visitor_id: int, unique_request_ids_human: list): +def get_unique_request_ids_for_date_and_visitor(db: Database, date:str, visitor_id: int, unique_request_ids_human: list): cur.execute(f"SELECT DISTINCT request_id FROM {t_request} WHERE {date} AND visitor_id = {visitor_id}") # all unique requests for visitor_id for request_id in cur.fetchall(): unique_request_ids_human.append(request_id[0]) # get number of requests per day -def get_request_count_for_date(cur: sql.Cursor, date:str) -> int: +def get_request_count_for_date(db: Database, date:str) -> int: cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE {date}") return cur.fetchone()[0] -def get_unique_visitor_count(cur: sql.Cursor) -> int: +def get_unique_visitor_count(db: Database) -> int: return sql_tablesize(cur, t_visitor) @@ -218,7 +212,7 @@ def get_unique_visitor_count(cur: sql.Cursor) -> int: # # RANKINGS # -def get_file_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]: +def get_file_ranking(db: Database, date:str) -> list[tuple[int, str]]: global settings """ :returns [(request_count, groupname)] @@ -255,7 +249,7 @@ def get_file_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]: # print(ranking) return ranking -def get_visitor_agent_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]: +def get_visitor_agent_ranking(db: Database, date:str) -> list[tuple[int, str]]: """ :returns [(request_count, visitor_agent)] """ @@ -276,7 +270,7 @@ def get_visitor_agent_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str] # print(ranking) return ranking -def get_request_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Cursor, date_condition:str) -> list[tuple[int, str]]: +def get_request_ranking(field_name: str, table: str, whitelist_regex: str, db: Database, date_condition:str) -> list[tuple[int, str]]: """ 1) get all the distinct entries for field_name after min_date_unix_time 2) call get_name_function with the distinct entry diff --git a/regina/main.py b/regina/main.py index 8bdd62f..fe28c24 100644 --- a/regina/main.py +++ b/regina/main.py @@ -4,13 +4,23 @@ from sys import argv, exit from os.path import isfile import sqlite3 as sql -from regina.db_operation.collect import parse_log, add_requests_to_db, update_ip_range_id -from regina.db_operation.database import create_db, update_geoip_tables, t_visitor -from regina.db_operation.visualize import visualize -from regina.utility.settings_manager import read_settings_file -from regina.utility.globals import settings, version -from regina.utility.utility import pmessage -from regina.utility.sql_util import sql_tablesize + +if __name__ == "__main__": + if __package__ is None: + # make relative imports work as described here: https://peps.python.org/pep-0366/#proposed-change + __package__ = "regina" + import sys + from os import path + filepath = path.realpath(path.abspath(__file__)) + sys.path.insert(0, path.dirname(path.dirname(filepath))) + +from .db_operation.collect import parse_log, add_requests_to_db, update_ip_range_id +from .db_operation.database import create_db, update_geoip_tables, t_visitor +from .db_operation.visualize import visualize +from .utility.settings_manager import read_settings_file +from .utility.globals import settings, version +from .utility.utility import pmessage +from .utility.sql_util import sql_tablesize """ start regina, launch either collect or visualize diff --git a/regina/sql/create_db.sql b/regina/sql/create_db.sql new file mode 100644 index 0000000..56adae8 --- /dev/null +++ b/regina/sql/create_db.sql @@ -0,0 +1,50 @@ +CREATE TABLE IF NOT EXISTS visitor( + visitor_id INTEGER PRIMARY KEY, + platform TEXT, + browser TEXT, + is_human INTEGER, + range_id INTEGER +) STRICT; + +CREATE TABLE IF NOT EXISTS request( + request_id INTEGER PRIMARY KEY, + visitor_id INTEGER, + FOREIGN KEY(visitor_id) REFERENCES visitor(visitor_id), + group_id INTEGER, + FOREIGN KEY(group_id) REFERENCES filegroup(group_id), + date INTEGER, + referer TEXT, + status INTEGER +) STRICT; + +CREATE TABLE IF NOT EXISTS filegroup( + group_id INTEGER PRIMARY KEY, + groupname TEXT +) STRICT; +CREATE TABLE IF NOT EXISTS file( + filename TEXT, + group_id INTEGER, + FOREIGN KEY(group_id) REFERENCES filegroup(group_id) +) STRICT; + +CREATE TABLE IF NOT EXISTS ip_range( + range_id INTEGER PRIMARY KEY, + from INTEGER, + to INTEGER, + city_id INTEGER, + FOREIGN KEY(city_id) REFERENCES city(city_id) +) STRICT; + +CREATE TABLE IF NOT EXISTS city( + city INTEGER PRIMARY KEY, + name TEXT, + region TEXT, + country_id INTEGER, + FOREIGN KEY(country_id) REFERENCES country(country_id) +) STRICT; + +CREATE TABLE IF NOT EXISTS country( + country_id INTEGER PRIMARY KEY, + name TEXT, + code TEXT +) STRICT; diff --git a/regina/utility/globals.py b/regina/utility/globals.py index 7073e5d..395ead2 100644 --- a/regina/utility/globals.py +++ b/regina/utility/globals.py @@ -1,5 +1,7 @@ """global variables for regina""" +import os + version = "1.0" # default settings, these are overwriteable through a config file @@ -74,3 +76,12 @@ visitor_agent_browsers = [ ] +# set directories +config_dir = os.path.join(os.environ.get("XDG_CONFIG_HOME", os.path.expanduser("~/.config")), "regina") +data_dir = os.path.join(os.environ.get("XDG_DATA_HOME", os.path.expanduser("~/.local/share")), "regina") +cache_dir = os.path.join(os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")), "regina") + +# check if environment variables are set and use them if they are +if 'REGINA_CONFIG_DIR' in os.environ: config_dir = os.environ['REGINA_CONFIG_DIR'] +if 'REGINA_DATA_DIR' in os.environ: data_dir = os.environ['REGINA_DATA_DIR'] +if 'REGINA_CACHE_DIR' in os.environ: cache_dir = os.environ['REGINA_CACHE_DIR'] diff --git a/regina/utility/utility.py b/regina/utility/utility.py index 6788174..90a4d70 100644 --- a/regina/utility/utility.py +++ b/regina/utility/utility.py @@ -1,6 +1,7 @@ # from sys import path # print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}") from sys import exit +from os import path from regina.utility.globals import settings @@ -29,3 +30,11 @@ def missing_arg(arg): print("Missing ", arg) exit(1) + +def get_filepath(filename, directories: list): + """search directories for file and return the full path to the file""" + for d in directories: + p = f"{path.expanduser(d)}/{filename}" + if path.isfile(p): + return p + raise FileNotFoundError(f"{filename} not in {directories}") diff --git a/setup.py b/setup.py index 7cd99b4..14acb72 100755 --- a/setup.py +++ b/setup.py @@ -1,3 +1,4 @@ +from matplotlib.pyplot import matplotlib from setuptools import setup, find_packages setup( @@ -12,8 +13,8 @@ setup( license="GPLv3", - packages=find_packages(), - install_requires=[], + packages=["regina"], + install_requires=["matplotlib"], python_requires='>=3.10', classifiers=[