class wrapper for the db

This commit is contained in:
matthias@arch 2023-05-05 13:14:52 +02:00
parent 3457fff2c6
commit ecc75560e3
9 changed files with 394 additions and 273 deletions

View File

@ -1,49 +1,12 @@
import sqlite3 as sql import sqlite3 as sql
from re import fullmatch, match from re import fullmatch, match
from ipaddress import IPv4Address, ip_address
from time import mktime
from datetime import datetime as dt
from regina.db_operation.database import t_request, t_visitor, t_file, t_filegroup, t_ip_range, database_tables, get_filegroup, ip_range_id from regina.db_operation.database import t_request, t_visitor, t_file, t_filegroup, t_ip_range, database_tables, get_filegroup, ip_range_id
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max
from regina.utility.utility import pdebug, warning, pmessage from regina.utility.utility import pdebug, warning, pmessage
from regina.utility.globals import visitor_agent_operating_systems, visitor_agent_browsers, settings
""" """
collect information from the access log and put it into the database collect information from the access log and put it into the database
""" """
# English month abbreviations as they appear in the access log time stamp.
# The list position + 1 is used as the month number.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]


class Request:
    """
    One entry of the access log.

    All string fields are passed through sanitize() before they are stored.
    ip_address is stored as the integer value of the IPv4 address,
    time_local as a unix timestamp (0 if the time could not be parsed).
    """
    def __init__(self, ip_address="", time_local="", request_type="", request_file="", request_protocol="", status="", bytes_sent="", referer="", visitor_agent=""):
        # IPv4Address raises for anything that is not a valid IPv4 address (including the default "")
        self.ip_address = int(IPv4Address(sanitize(ip_address)))
        self.time_local = 0
        # expected format: [20/Nov/2022:00:47:36 +0100]
        # the utc offset is matched by '.*' and not used
        m = match(r"\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\]", time_local)
        if m:
            g = m.groups()
            try:
                if g[1] in months:
                    datetime_ = dt(int(g[2]), months.index(g[1])+1, int(g[0]), int(g[3]), int(g[4]), int(g[5]))
                    # mktime interprets the time tuple as local time of this machine
                    self.time_local = int(mktime(datetime_.timetuple()))
                else:
                    warning(f"Request:__init__: Unkown month: '{g[1]}'. Using timestamp {self.time_local}")
            except Exception as e:
                # best effort: an invalid date keeps time_local = 0
                warning(f"Request:__init__: {e}")
        else:
            warning(f"Request:__init__: Could not match time: '{time_local}'")
        self.request_type = sanitize(request_type)
        self.request_file = sanitize(request_file)
        self.request_protocol = sanitize(request_protocol)
        self.status = sanitize(status)
        self.bytes_sent = sanitize(bytes_sent)
        self.referer = sanitize(referer)
        self.visitor_agent = sanitize(visitor_agent)

    def __repr__(self):
        return f"{self.ip_address} - {self.time_local} - {self.request_file} - {self.visitor_agent} - {self.status}"
re_remote_addr = r"[0-9a-fA-F.:]+" re_remote_addr = r"[0-9a-fA-F.:]+"
re_remote_visitor = ".*" re_remote_visitor = ".*"
@ -54,6 +17,7 @@ re_body_bytes_sent = r'\d+'
re_http_referer = r'"([^"]*)"' re_http_referer = r'"([^"]*)"'
re_http_visitor_agent = r'"([^"]*)"' re_http_visitor_agent = r'"([^"]*)"'
re_log_format: str = f'({re_remote_addr}) - ({re_remote_visitor}) ({re_time_local}) ({re_request}) ({re_status}) ({re_body_bytes_sent}) {re_http_referer} {re_http_visitor_agent}' re_log_format: str = f'({re_remote_addr}) - ({re_remote_visitor}) ({re_time_local}) ({re_request}) ({re_status}) ({re_body_bytes_sent}) {re_http_referer} {re_http_visitor_agent}'
def parse_log(logfile:str) -> list[Request]: def parse_log(logfile:str) -> list[Request]:
""" """
create Request objects from each line in the logfile create Request objects from each line in the logfile
@ -77,171 +41,3 @@ def parse_log(logfile:str) -> list[Request]:
status=g[4], bytes_sent=g[5], referer=g[6], visitor_agent=g[7])) status=g[4], bytes_sent=g[5], referer=g[6], visitor_agent=g[7]))
return requests return requests
def visitor_exists(cursor, request) -> bool:
    """
    return whether the visitor that sent the request is already in the visitor table
    a visitor is identified by the ip address, and additionally by the
    visitor agent unless settings["unique_visitor_is_ip_address"] is set
    """
    # NOTE(review): request.ip_address is an int; hash() of an int of that size is
    # normally the int itself, so "hash_ip_address" may not change anything - confirm
    lookup_ip = hash(request.ip_address) if settings["hash_ip_address"] else request.ip_address
    constraints = [("ip_address", lookup_ip)]
    if not settings["unique_visitor_is_ip_address"]:
        constraints.append(("visitor_agent", request.visitor_agent))
    return sql_exists(cursor, t_visitor, constraints)
def get_visitor_id(request: Request, cursor: sql.Cursor) -> int:
    """
    get the visitor_id. Adds the visitor if not already existing
    a new visitor gets the id max(visitor_id) + 1 and is inserted with is_human = 0
    """
    stored_ip = hash(request.ip_address) if settings["hash_ip_address"] else request.ip_address
    if visitor_exists(cursor, request):
        # known visitor: look it up with the same constraints visitor_exists uses
        constraints = [("ip_address", stored_ip)]
        if not settings["unique_visitor_is_ip_address"]:
            constraints.append(("visitor_agent", request.visitor_agent))
        return sql_select(cursor, t_visitor, constraints)[0][0]

    # new visitor
    visitor_id = sql_max(cursor, t_visitor, "visitor_id") + 1
    platform, browser, mobile = get_os_browser_pairs_from_agent(request.visitor_agent)
    ip_range_id_val = get_ip_range_id(cursor, request.ip_address) if settings["get_visitor_location"] else 0
    # is_visitor_human cannot be called until the visitor is in the db
    is_human = 0
    cursor.execute(f"INSERT INTO {t_visitor} (visitor_id, ip_address, visitor_agent, platform, browser, mobile, is_human, {ip_range_id.name}) VALUES ({visitor_id}, '{ip_address}', '{request.visitor_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}', '{ip_range_id_val}');".replace("'{ip_address}'", f"'{stored_ip}'") if False else f"INSERT INTO {t_visitor} (visitor_id, ip_address, visitor_agent, platform, browser, mobile, is_human, {ip_range_id.name}) VALUES ({visitor_id}, '{stored_ip}', '{request.visitor_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}', '{ip_range_id_val}');")
    return visitor_id
def is_visitor_human(cur: sql.Cursor, visitor_id: int):
    """
    check if they have a known platform AND browser
    check if at least one request did not result in an error (http status >= 400)
    the second check is only done if settings["human_needs_success"] is set,
    with settings["status_300_is_success"] the limit is 300 instead of 400
    returns False if the visitor_id is not found exactly once
    """
    max_success_status = 300 if settings["status_300_is_success"] else 400
    cur.execute(f"SELECT browser, platform FROM {t_visitor} WHERE visitor_id = {visitor_id}")
    browsers_and_platforms = cur.fetchall()
    if len(browsers_and_platforms) != 1:
        pdebug(f"is_visitor_human: {visitor_id} - could not find visitor or found too many")
        return False
    browser, platform = browsers_and_platforms[0]
    if browser not in visitor_agent_browsers:
        return False
    if platform not in visitor_agent_operating_systems:
        return False
    if settings["human_needs_success"]:
        # check if at least one request was successful (status < max_success_status)
        cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_request} WHERE visitor_id = {visitor_id} AND status < {max_success_status})")
        if cur.fetchone()[0] != 1:
            # visitor only had unsuccessful requests
            return False
    # visitor is human
    return True
def request_exists(cur: sql.Cursor, request: Request, visitor_id: int, group_id: int):
    """
    return True if the visitor already has a request to the same filegroup on the same day
    always returns False if settings["request_is_same_on_same_day"] is not set
    """
    # get all requests from same visitor to same location
    cur.execute(f"SELECT request_id, date FROM {t_request} WHERE visitor_id = '{visitor_id}' AND group_id = '{group_id}'")
    request_day = dt.fromtimestamp(request.time_local).strftime("%Y-%m-%d")
    for request_id, stored_date in cur.fetchall():
        if not settings["request_is_same_on_same_day"]:
            continue
        stored_day = dt.fromtimestamp(stored_date).strftime("%Y-%m-%d")
        if stored_day == request_day:
            pdebug(f"request_exists: Request is on same day as request {request_id}")
            return True
    return False
# re_visitor_agent = r"(?: ?([\w\- ]+)(?:\/([\w.]+))?(?: \(([^()]*)\))?)"
# 1: platform, 2: version, 3: details
def get_os_browser_pairs_from_agent(visitor_agent):
    """
    returns (operating_system, browser, mobile)
    operating_system and browser are the first entry of the known lists that is
    contained in the visitor agent string, "" if none is contained
    mobile is True if the visitor agent contains "Mobi"
    """
    is_mobile = "Mobi" in visitor_agent
    found_os = next((candidate for candidate in visitor_agent_operating_systems if candidate in visitor_agent), "")
    found_browser = next((candidate for candidate in visitor_agent_browsers if candidate in visitor_agent), "")
    return found_os, found_browser, is_mobile
def get_ip_range_id(cur: sql.Cursor, ip_address: int):
    """
    return the id of the ip range that contains ip_address
    returns 0 if no range or more than one range contains the address
    """
    cur.execute(f"SELECT {ip_range_id.name} FROM {t_ip_range} WHERE '{ip_address}' BETWEEN lower AND upper")
    results = cur.fetchall()
    if len(results) == 1:
        return results[0][0]
    if len(results) > 1:
        # ambiguous: do not guess, report and fall back to 0
        warning(f"get_ip_range_id: Found multiple ip_ranges for ip_address={ip_address}: results={results}")
    return 0
def update_ip_range_id(cur: sql.Cursor, visitor_id: int):
    """
    look up the ip range for the stored ip address of the visitor and write it to the visitor table
    does nothing but warn if the visitor_id is not found exactly once
    """
    cur.execute(f"SELECT ip_address FROM {t_visitor} WHERE visitor_id = {visitor_id}")
    results = cur.fetchall()
    if not results:
        warning(f"update_ip_range_id: Invalid visitor_id={visitor_id}")
        return
    if len(results) > 1:
        warning(f"update_ip_range_id: Found multiple ip_addresses for visitor_id={visitor_id}: results={results}")
        return
    (ip_address, *_), = results
    cur.execute(f"UPDATE {t_visitor} SET {ip_range_id.name} = '{get_ip_range_id(cur, ip_address)}' WHERE visitor_id = '{visitor_id}'")
def add_requests_to_db(requests: list[Request], db_name: str):
    """
    add the requests (and their visitors) to the database db_name
    - requests whose request_file fully matches settings["request_location_regex_blacklist"] are skipped
    - requests for which request_exists() is True are skipped
    - afterwards is_human is set for all visitors that were added during this call
    """
    conn = sql.connect(db_name)
    try:
        cursor = conn.cursor()
        added_requests = 0
        # visitors with a larger id are the ones added below (get_visitor_id uses max + 1)
        max_visitor_id = sql_max(cursor, t_visitor, "visitor_id")
        request_blacklist = settings["request_location_regex_blacklist"]
        for request in requests:
            # skip requests to blacklisted locations
            if request_blacklist and fullmatch(request_blacklist, request.request_file):
                continue
            visitor_id = get_visitor_id(request, cursor)
            conn.commit()
            group_id: int = get_filegroup(request.request_file, cursor)
            # check if request is unique
            if request_exists(cursor, request, visitor_id, group_id):
                continue
            request_id = sql_max(cursor, t_request, "request_id") + 1
            sql_insert(cursor, t_request, [[request_id, visitor_id, group_id, request.time_local, request.referer, request.status]])
            added_requests += 1
        # check the new visitors now that their requests are in the db
        # the ids are read from the table instead of being derived from the table size,
        # so the result does not depend on whether ids start at 0 or 1
        cursor.execute(f"SELECT visitor_id FROM {t_visitor} WHERE visitor_id > {max_visitor_id}")
        new_visitor_ids = [row[0] for row in cursor.fetchall()]
        for visitor_id in new_visitor_ids:
            if is_visitor_human(cursor, visitor_id):
                cursor.execute(f"UPDATE {t_visitor} SET is_human = 1 WHERE visitor_id = {visitor_id}")
        cursor.close()
        conn.commit()
    finally:
        # always release the database file, also when an error occurred
        conn.close()
    pmessage(f"Collection Summary: Added {len(new_visitor_ids)} new visitors and {added_requests} new requests.")

View File

@ -2,10 +2,15 @@
import sqlite3 as sql import sqlite3 as sql
from csv import reader from csv import reader
from os import path, listdir from os import path, listdir
import pkg_resources
import re
from datetime import datetime as dt
# local # local
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max from .utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max
from regina.utility.utility import pdebug from .utility.utility import pdebug, get_filepath, warning, pmessage
from regina.utility.globals import settings from .utility.globals import settings
from .db_operation.request import Request
from .utility.globals import visitor_agent_operating_systems, visitor_agent_browsers, settings
""" """
create reginas database as shown in the uml diagram database.uxf create reginas database as shown in the uml diagram database.uxf
@ -37,6 +42,8 @@ class Table:
for c in self.constaints: for c in self.constaints:
s += f", {c}" s += f", {c}"
return s return s
t_request = "request" t_request = "request"
t_file = "file" t_file = "file"
t_filegroup = "filegroup" t_filegroup = "filegroup"
@ -100,7 +107,158 @@ database_tables = {
def get_filegroup(filename: str, cursor: sql.Cursor) -> int: class Database:
def __init__(self, database_path):
self.conn = sql.connect(database_path)
self.cur = self.conn.cursor()
# verify that the database is created
self.cur.execute("pragma schema_version")
if self.cur.fetchone()[0] == 0: # not created
pdebug(f"Database.__init__: Creating database at {database_path}")
with open(pkg_resources.resource_filename("regina", "sql/create_db.sql"), "r") as file:
create_db = file.read()
self.cur.execute(create_db)
def __call__(self, s):
"""execute a command and return fetchall()"""
self.cur.execute(s)
return self.cur.fetchall()
#
# VISITOR
#
def visitor_exists(self, request) -> bool:
    """
    return whether the visitor that sent the request is already in the visitor table
    a visitor is identified by the ip address, and additionally by the
    visitor agent unless settings["unique_visitor_is_ip_address"] is set
    """
    # NOTE(review): request.ip_address is an int; hash() of an int of that size is
    # normally the int itself, so "hash_ip_address" may not change anything - confirm
    lookup_ip = hash(request.ip_address) if settings["hash_ip_address"] else request.ip_address
    constraints = [("ip_address", lookup_ip)]
    if not settings["unique_visitor_is_ip_address"]:
        constraints.append(("visitor_agent", request.visitor_agent))
    return sql_exists(self.cur, t_visitor, constraints)
def is_visitor_human(self, visitor_id: int):
    """
    a visitor is human if
    - the stored browser AND platform are in the lists of known browsers / operating systems
    - settings["human_needs_success"] is not set, or at least one request of the visitor
      has a status below 400 (below 300 if settings["status_300_is_success"] is set)
    returns False if the visitor_id is not found exactly once
    """
    max_success_status = 300 if settings["status_300_is_success"] else 400
    self.cur.execute(f"SELECT browser, platform FROM {t_visitor} WHERE visitor_id = {visitor_id}")
    rows = self.cur.fetchall()
    if len(rows) != 1:
        pdebug(f"is_visitor_human: {visitor_id} - could not find visitor or found too many")
        return False
    browser, platform = rows[0][0], rows[0][1]
    if browser not in visitor_agent_browsers:
        return False
    if platform not in visitor_agent_operating_systems:
        return False
    if not settings["human_needs_success"]:
        return True
    # human needs successful request: check if at least one request was successful
    self.cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_request} WHERE visitor_id = {visitor_id} AND status < {max_success_status})")
    return self.cur.fetchone()[0] == 1
def get_visitor_id(self, request: Request) -> int:
    """
    get the visitor_id. Adds the visitor if not already existing
    a new visitor gets the id max(visitor_id) + 1 and is inserted with is_human = 0
    """
    # NOTE(review): request.ip_address is an int; hash() of an int of that size is
    # normally the int itself, so "hash_ip_address" may not change anything - confirm
    ip_address = hash(request.ip_address) if settings["hash_ip_address"] else request.ip_address
    if self.visitor_exists(request):
        # known visitor: look it up with the same constraints visitor_exists uses
        constraints = [("ip_address", ip_address)]
        if not settings["unique_visitor_is_ip_address"]:
            constraints.append(("visitor_agent", request.visitor_agent))
        return sql_select(self.cur, t_visitor, constraints)[0][0]

    # new visitor
    visitor_id = sql_max(self.cur, t_visitor, "visitor_id") + 1
    # os / browser / mobile detection are methods of Request
    platform, browser, mobile = request.get_os(), request.get_browser(), request.get_mobile()
    ip_range_id_val = 0
    if settings["get_visitor_location"]:
        ip_range_id_val = self.get_ip_range_id(request.ip_address)
    # is_visitor_human cannot be called until the visitor is in the db
    is_human = 0
    # the values are embedded in the sql string, this relies on visitor_agent having been sanitized by Request
    self.cur.execute(f"INSERT INTO {t_visitor} (visitor_id, ip_address, visitor_agent, platform, browser, mobile, is_human, {ip_range_id.name}) VALUES ({visitor_id}, '{ip_address}', '{request.visitor_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}', '{ip_range_id_val}');")
    return visitor_id
#
# REQUEST
#
def request_exists(self, request: Request, visitor_id: int, group_id: int):
    """
    return True if the visitor already has a request to the same filegroup on the same day
    always returns False if settings["request_is_same_on_same_day"] is not set
    """
    # get all requests from same visitor to same location
    # TODO this looks wrong
    self.cur.execute(f"SELECT request_id, date FROM {t_request} WHERE visitor_id = '{visitor_id}' AND group_id = '{group_id}'")
    request_day = dt.fromtimestamp(request.time_local).strftime("%Y-%m-%d")
    for request_id, stored_date in self.cur.fetchall():
        if not settings["request_is_same_on_same_day"]:
            continue
        stored_day = dt.fromtimestamp(stored_date).strftime("%Y-%m-%d")
        if stored_day == request_day:
            pdebug(f"request_exists: Request is on same day as request {request_id}")
            return True
    return False
def add_request(self, request: Request) -> (int | None):
    """
    add a single request to the database
    returns visitor_id if new request was added, else None
    a request is not added if its request_file fully matches
    settings["request_location_regex_blacklist"] or if request_exists() is True
    """
    # skip requests to blacklisted locations
    request_blacklist = settings["request_location_regex_blacklist"]
    if request_blacklist and re.fullmatch(request_blacklist, request.request_file):
        return None
    # adds the visitor if it is not in the db yet
    visitor_id = self.get_visitor_id(request)
    self.conn.commit()
    group_id: int = self.get_filegroup(request.request_file)
    # check if request is unique
    if self.request_exists(request, visitor_id, group_id):
        return None
    # request_id is passed as None - presumably the database assigns it, confirm against create_db.sql
    sql_insert(self.cur, t_request, [[None, visitor_id, group_id, request.time_local, request.referer, request.status]])
    return visitor_id
def add_requests(self, requests: list[Request]):
    """
    add all requests to the database with add_request()
    afterwards is_human is set for every visitor that got a new request
    and a summary is printed
    """
    # get_visitor_id assigns max(visitor_id) + 1 to new visitors,
    # so every visitor with a larger id than this was added during this call
    max_visitor_id_before = sql_max(self.cur, t_visitor, "visitor_id")
    added_requests = 0
    visitors_with_new_requests = set()
    for request in requests:
        visitor_id = self.add_request(request)
        if visitor_id is None:  # blacklisted or already in the db
            continue
        added_requests += 1
        visitors_with_new_requests.add(visitor_id)

    # update the is_human column
    # is_visitor_human returns False for ids that are not in the table, no separate check required
    for visitor_id in sorted(visitors_with_new_requests):
        if self.is_visitor_human(visitor_id):
            self.cur.execute(f"UPDATE {t_visitor} SET is_human = 1 WHERE visitor_id = {visitor_id}")
    self.conn.commit()
    new_visitor_count = sum(1 for visitor_id in visitors_with_new_requests if visitor_id > max_visitor_id_before)
    pmessage(f"Collection Summary: Added {new_visitor_count} new visitors and {added_requests} new requests.")
#
# FILE(GROUP)
#
def get_filegroup(self, filename: str) -> int:
""" """
get the filegroup get the filegroup
returns the group where returns the group where
@ -109,27 +267,57 @@ def get_filegroup(filename: str, cursor: sql.Cursor) -> int:
3) new group with filename as gorupname 3) new group with filename as gorupname
""" """
# pdebug(f"get_filegroup: {filename}") # pdebug(f"get_filegroup: {filename}")
if sql_exists(cursor, t_file, [("filename", filename)]): if sql_exists(self.cur, t_file, [("filename", filename)]):
return sql_select(cursor, t_file, [("filename", filename)])[0][1] return sql_select(self.cur, t_file, [("filename", filename)])[0][1]
else: else:
suffix = filename.split('.')[-1] suffix = filename.split('.')[-1]
cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname = '{suffix}'") self.cur.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname = '{suffix}'")
# cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname LIKE '%.{suffix}'") # self.cur.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname LIKE '%.{suffix}'")
group_id_candidates = cursor.fetchall() group_id_candidates = self.cur.fetchall()
# pdebug(f"get_filegroup: file={filename} candidates={group_id_candidates}") # pdebug(f"get_filegroup: file={filename} candidates={group_id_candidates}")
if group_id_candidates: if group_id_candidates:
return group_id_candidates[0][0] return group_id_candidates[0][0]
else: # add new group file filename else: # add new group file filename
group_id = sql_max(cursor, t_filegroup, "group_id") + 1 group_id = sql_max(self.cur, t_filegroup, "group_id") + 1
# pdebug("new file(group):", group_id, filename) # pdebug("new file(group):", group_id, filename)
# add group # add group
sql_insert(cursor, t_filegroup, [[group_id, filename]]) sql_insert(self.cur, t_filegroup, [[group_id, filename]])
# add file # add file
sql_insert(cursor, t_file, [[filename, group_id]]) sql_insert(self.cur, t_file, [[filename, group_id]])
return group_id return group_id
#
# GEOIP
#
def get_ip_range_id(self, ip_address: int):
    """
    return the id of the ip range that contains ip_address
    returns 0 if no range or more than one range contains the address
    """
    self.cur.execute(f"SELECT {ip_range_id.name} FROM {t_ip_range} WHERE '{ip_address}' BETWEEN lower AND upper")
    results = self.cur.fetchall()
    if len(results) == 1:
        return results[0][0]
    if results:
        # more than one match: report it and use the fallback
        warning(f"get_ip_range_id: Found multiple ip_ranges for ip_address={ip_address}: results={results}")
    return 0
def update_ip_range_id(self, visitor_id: int):
    """
    look up the ip range for the stored ip address of the visitor and write it to the visitor table
    does nothing but warn if the visitor_id is not found exactly once
    """
    self.cur.execute(f"SELECT ip_address FROM {t_visitor} WHERE visitor_id = {visitor_id}")
    results = self.cur.fetchall()
    if len(results) == 0:
        warning(f"update_ip_range_id: Invalid visitor_id={visitor_id}")
        return
    if len(results) > 1:
        warning(f"update_ip_range_id: Found multiple ip_addresses for visitor_id={visitor_id}: results={results}")
        return
    ip_address = results[0][0]
    # get_ip_range_id is a method of this class, there is no free function of that name anymore
    self.cur.execute(f"UPDATE {t_visitor} SET {ip_range_id.name} = '{self.get_ip_range_id(ip_address)}' WHERE visitor_id = '{visitor_id}'")
def create_filegroups(cursor: sql.Cursor, filegroup_str: str): def create_filegroups(cursor: sql.Cursor, filegroup_str: str):
"""
TODO: make re-usable (alter groups when config changes)
"""
# filegroup_str: 'name1: file1, file2, file3; name2: file33' # filegroup_str: 'name1: file1, file2, file3; name2: file33'
groups = filegroup_str.strip(";").split(";") groups = filegroup_str.strip(";").split(";")
pdebug("create_filegroups:", groups) pdebug("create_filegroups:", groups)

View File

@ -0,0 +1,62 @@
from ipaddress import IPv4Address, ip_address
from time import mktime
from re import fullmatch, match
from datetime import datetime as dt
from .utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max
from .utility.utility import pdebug, warning, pmessage
from .utility.globals import visitor_agent_operating_systems, visitor_agent_browsers, settings
# English month abbreviations as they appear in the access log time stamp.
# The list position + 1 is used as the month number.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]


class Request:
    """
    One entry of the access log.

    All string fields are passed through sanitize() before they are stored.
    ip_address is stored as the integer value of the IPv4 address,
    time_local as a unix timestamp (0 if the time could not be parsed).
    """
    def __init__(self, ip_address="", time_local="", request_type="", request_file="", request_protocol="", status="", bytes_sent="", referer="", visitor_agent=""):
        # IPv4Address raises for anything that is not a valid IPv4 address (including the default "")
        self.ip_address = int(IPv4Address(sanitize(ip_address)))
        self.time_local = 0
        # expected format: [20/Nov/2022:00:47:36 +0100]
        # the utc offset is matched by '.*' and not used
        m = match(r"\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\]", time_local)
        if m:
            g = m.groups()
            try:
                if g[1] in months:
                    datetime_ = dt(int(g[2]), months.index(g[1])+1, int(g[0]), int(g[3]), int(g[4]), int(g[5]))
                    # mktime interprets the time tuple as local time of this machine
                    self.time_local = int(mktime(datetime_.timetuple()))
                else:
                    warning(f"Request:__init__: Unkown month: '{g[1]}'. Using timestamp {self.time_local}")
            except Exception as e:
                # best effort: an invalid date keeps time_local = 0
                warning(f"Request:__init__: {e}")
        else:
            warning(f"Request:__init__: Could not match time: '{time_local}'")
        self.request_type = sanitize(request_type)
        self.request_file = sanitize(request_file)
        self.request_protocol = sanitize(request_protocol)
        self.status = sanitize(status)
        self.bytes_sent = sanitize(bytes_sent)
        self.referer = sanitize(referer)
        self.visitor_agent = sanitize(visitor_agent)

    def __repr__(self):
        return f"{self.ip_address} - {self.time_local} - {self.request_file} - {self.visitor_agent} - {self.status}"

    def get_os(self):
        """return the first known operating system contained in the visitor agent, "" if there is none"""
        operating_system = ""
        for os in visitor_agent_operating_systems:
            if os in self.visitor_agent:
                operating_system = os
                break
        return operating_system

    def get_browser(self):
        """return the first known browser contained in the visitor agent, "" if there is none"""
        browser = ""
        for br in visitor_agent_browsers:
            if br in self.visitor_agent:
                browser = br
                break
        return browser

    def get_mobile(self):
        """return True if the visitor agent contains "Mobi" """
        return "Mobi" in self.visitor_agent

View File

@ -9,7 +9,7 @@ from datetime import datetime as dt
from numpy import empty from numpy import empty
# local # local
from regina.db_operation.database import t_request, t_visitor, t_file, t_filegroup, t_ip_range, t_city, t_country from regina.db_operation.database import Database, t_request, t_visitor, t_file, t_filegroup, t_ip_range, t_city, t_country
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where
from regina.utility.utility import pdebug, warning, missing_arg from regina.utility.utility import pdebug, warning, missing_arg
from regina.utility.globals import settings from regina.utility.globals import settings
@ -66,7 +66,7 @@ def valid_status(status: int):
# #
# FILTERS # FILTERS
# #
def get_os_browser_mobile_rankings(cur: sql.Cursor, visitor_ids: list[int]): def get_os_browser_mobile_rankings(db: Database, visitor_ids: list[int]):
""" """
returns [(count, operating_system)], [(count, browser)], mobile_visitor_percentage returns [(count, operating_system)], [(count, browser)], mobile_visitor_percentage
""" """
@ -76,8 +76,7 @@ def get_os_browser_mobile_rankings(cur: sql.Cursor, visitor_ids: list[int]):
browser_count = 0.0 browser_count = 0.0
mobile_ranking = { True: 0.0, False: 0.0 } mobile_ranking = { True: 0.0, False: 0.0 }
for visitor_id in visitor_ids: for visitor_id in visitor_ids:
cur.execute(f"SELECT platform,browser,mobile FROM {t_visitor} WHERE visitor_id = {visitor_id}") os, browser, mobile = db(f"SELECT platform,browser,mobile FROM {t_visitor} WHERE visitor_id = {visitor_id}")[0]
os, browser, mobile = cur.fetchone()
mobile = bool(mobile) mobile = bool(mobile)
if os: if os:
if os in os_ranking: os_ranking[os] += 1 if os in os_ranking: os_ranking[os] += 1
@ -134,34 +133,30 @@ def get_where_date_str(at_date=None, min_date=None, max_date=None):
# get the earliest date # get the earliest date
def get_earliest_date(cur: sql.Cursor) -> int: def get_earliest_date(db: Database) -> int:
"""return the earliest time as unixepoch""" """return the earliest time as unixepoch"""
cur.execute(f"SELECT MIN(date) FROM {t_request}") date = db(f"SELECT MIN(date) FROM {t_request}")[0][0]
date = cur.fetchone()[0]
if not isinstance(date, int): return 0 if not isinstance(date, int): return 0
else: return date else: return date
# get the latest date # get the latest date
def get_latest_date(cur: sql.Cursor) -> int: def get_latest_date(db: Database) -> int:
"""return the latest time as unixepoch""" """return the latest time as unixepoch"""
cur.execute(f"SELECT MAX(date) FROM {t_request}") date = db(f"SELECT MAX(date) FROM {t_request}")[0][0]
date = cur.fetchone()[0]
if not isinstance(date, int): return 0 if not isinstance(date, int): return 0
else: return date else: return date
# get all dates # get all dates
# the date:str parameter in all these function must be a sqlite constraint # the date:str parameter in all these function must be a sqlite constraint
def get_days(cur: sql.Cursor, date:str) -> list[str]: def get_days(db: Database, date:str) -> list[str]:
"""get a list of all dates in yyyy-mm-dd format""" """get a list of all dates in yyyy-mm-dd format"""
cur.execute(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}") days = [ date[0] for date in db(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}")] # fetchall returns tuples (date, )
days = [ date[0] for date in cur.fetchall() ] # fetchall returns tuples (date, )
days.sort() days.sort()
return days return days
def get_months(cur: sql.Cursor, date:str) -> list[str]: def get_months(db: Database, date:str) -> list[str]:
"""get a list of all dates in yyyy-mm format""" """get a list of all dates in yyyy-mm format"""
cur.execute(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}") dates = get_days(db, date)
dates = get_days(cur, date)
date_dict = {} date_dict = {}
for date in dates: for date in dates:
date_without_day = date[0:date.rfind('-')] date_without_day = date[0:date.rfind('-')]
@ -169,14 +164,13 @@ def get_months(cur: sql.Cursor, date:str) -> list[str]:
return list(date_dict.keys()) return list(date_dict.keys())
def get_visitor_agent(cur: sql.Cursor, visitor_id: int): def get_visitor_agent(db: Database, visitor_id: int):
return sql_select(cur, t_visitor, [("visitor_id", visitor_id)])[0][2] return sql_select(db.cur, t_visitor, [("visitor_id", visitor_id)])[0][2]
def get_unique_visitor_ids_for_date(cur: sql.Cursor, date:str) -> list[int]: def get_unique_visitor_ids_for_date(db: Database, date:str) -> list[int]:
cur.execute(f"SELECT DISTINCT visitor_id FROM {t_request} WHERE {date}") return [ visitor_id[0] for visitor_id in db(f"SELECT DISTINCT visitor_id FROM {t_request} WHERE {date}") ]
return [ visitor_id[0] for visitor_id in cur.fetchall() ]
def get_human_visitors(cur: sql.Cursor, unique_visitor_ids, unique_visitor_ids_human: list): def get_human_visitors(db: Database, unique_visitor_ids, unique_visitor_ids_human: list):
""" """
check if they have a known platform AND browser check if they have a known platform AND browser
check if at least one request did not result in an error (http status >= 400) check if at least one request did not result in an error (http status >= 400)
@ -195,22 +189,22 @@ def get_human_visitors(cur: sql.Cursor, unique_visitor_ids, unique_visitor_ids_h
unique_visitor_ids_human.append(visitor_id) unique_visitor_ids_human.append(visitor_id)
# pdebug("get_human_visitors: (2)", unique_visitor_ids_human) # pdebug("get_human_visitors: (2)", unique_visitor_ids_human)
def get_unique_request_ids_for_date(db: Database, date:str):
    """
    return the distinct request_ids of all requests that satisfy the sqlite constraint date
    """
    # the function receives the Database wrapper, there is no cursor named cur in scope
    return [ request_id[0] for request_id in db(f"SELECT DISTINCT request_id FROM {t_request} WHERE {date}") ]
def get_unique_request_ids_for_date_and_visitor(db: Database, date:str, visitor_id: int, unique_request_ids_human: list):
    """
    append the distinct request_ids of all requests of visitor_id that satisfy
    the sqlite constraint date to the list unique_request_ids_human
    """
    # the function receives the Database wrapper, there is no cursor named cur in scope
    rows = db(f"SELECT DISTINCT request_id FROM {t_request} WHERE {date} AND visitor_id = {visitor_id}")
    # all unique requests for visitor_id
    unique_request_ids_human.extend(request_id[0] for request_id in rows)
# get number of requests per day # get number of requests per day
def get_request_count_for_date(db: Database, date:str) -> int:
    """return the number of requests that satisfy the sqlite constraint date"""
    # the function receives the Database wrapper, there is no cursor named cur in scope
    # COUNT(*) always yields exactly one row with one column
    return db(f"SELECT COUNT(*) FROM {t_request} WHERE {date}")[0][0]
def get_unique_visitor_count(db: Database) -> int:
    """return the size of the visitor table"""
    # sql_tablesize expects a cursor, the Database wrapper provides it as db.cur
    return sql_tablesize(db.cur, t_visitor)
@ -218,7 +212,7 @@ def get_unique_visitor_count(cur: sql.Cursor) -> int:
# #
# RANKINGS # RANKINGS
# #
def get_file_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]: def get_file_ranking(db: Database, date:str) -> list[tuple[int, str]]:
global settings global settings
""" """
:returns [(request_count, groupname)] :returns [(request_count, groupname)]
@ -255,7 +249,7 @@ def get_file_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]:
# print(ranking) # print(ranking)
return ranking return ranking
def get_visitor_agent_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]: def get_visitor_agent_ranking(db: Database, date:str) -> list[tuple[int, str]]:
""" """
:returns [(request_count, visitor_agent)] :returns [(request_count, visitor_agent)]
""" """
@ -276,7 +270,7 @@ def get_visitor_agent_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]
# print(ranking) # print(ranking)
return ranking return ranking
def get_request_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Cursor, date_condition:str) -> list[tuple[int, str]]: def get_request_ranking(field_name: str, table: str, whitelist_regex: str, db: Database, date_condition:str) -> list[tuple[int, str]]:
""" """
1) get all the distinct entries for field_name after min_date_unix_time 1) get all the distinct entries for field_name after min_date_unix_time
2) call get_name_function with the distinct entry 2) call get_name_function with the distinct entry

View File

@ -4,13 +4,23 @@
from sys import argv, exit from sys import argv, exit
from os.path import isfile from os.path import isfile
import sqlite3 as sql import sqlite3 as sql
from regina.db_operation.collect import parse_log, add_requests_to_db, update_ip_range_id
from regina.db_operation.database import create_db, update_geoip_tables, t_visitor if __name__ == "__main__":
from regina.db_operation.visualize import visualize if __package__ is None:
from regina.utility.settings_manager import read_settings_file # make relative imports work as described here: https://peps.python.org/pep-0366/#proposed-change
from regina.utility.globals import settings, version __package__ = "regina"
from regina.utility.utility import pmessage import sys
from regina.utility.sql_util import sql_tablesize from os import path
filepath = path.realpath(path.abspath(__file__))
sys.path.insert(0, path.dirname(path.dirname(filepath)))
from .db_operation.collect import parse_log, add_requests_to_db, update_ip_range_id
from .db_operation.database import create_db, update_geoip_tables, t_visitor
from .db_operation.visualize import visualize
from .utility.settings_manager import read_settings_file
from .utility.globals import settings, version
from .utility.utility import pmessage
from .utility.sql_util import sql_tablesize
""" """
start regina, launch either collect or visualize start regina, launch either collect or visualize

50
regina/sql/create_db.sql Normal file
View File

@ -0,0 +1,50 @@
-- visitor table
CREATE TABLE IF NOT EXISTS visitor (
    visitor_id  INTEGER PRIMARY KEY,
    platform    TEXT,
    browser     TEXT,
    is_human    INTEGER,  -- presumably a 0/1 flag (STRICT tables have no boolean type) - confirm
    range_id    INTEGER   -- presumably refers to ip_range(range_id); no constraint is declared
) STRICT;
-- request table; every request points at a visitor and a filegroup
CREATE TABLE IF NOT EXISTS request (
    request_id  INTEGER PRIMARY KEY,
    visitor_id  INTEGER,
    group_id    INTEGER,
    date        INTEGER,
    referer     TEXT,
    status      INTEGER,
    -- table constraints must follow the last column definition, otherwise
    -- SQLite rejects the statement with a syntax error
    FOREIGN KEY(visitor_id) REFERENCES visitor(visitor_id),
    FOREIGN KEY(group_id) REFERENCES filegroup(group_id)
) STRICT;
-- filegroup table: maps a group id to its name
CREATE TABLE IF NOT EXISTS filegroup (
    group_id   INTEGER PRIMARY KEY,
    groupname  TEXT
) STRICT;
-- file table: assigns a filename to a filegroup
CREATE TABLE IF NOT EXISTS file (
    filename  TEXT,
    group_id  INTEGER REFERENCES filegroup(group_id)
) STRICT;
-- ip_range table: a range of integer values tied to a city
CREATE TABLE IF NOT EXISTS ip_range (
    range_id  INTEGER PRIMARY KEY,
    -- FROM and TO are reserved SQL keywords; unquoted they are a syntax error.
    -- Queries that touch these columns have to quote them as well.
    "from"    INTEGER,
    "to"      INTEGER,
    city_id   INTEGER,
    FOREIGN KEY(city_id) REFERENCES city(city_id)
) STRICT;
-- city table
CREATE TABLE IF NOT EXISTS city (
    -- named city_id (not city) because ip_range declares
    -- FOREIGN KEY(city_id) REFERENCES city(city_id); a key column called
    -- `city` would leave that reference pointing at a missing column
    city_id     INTEGER PRIMARY KEY,
    name        TEXT,
    region      TEXT,
    country_id  INTEGER,
    FOREIGN KEY(country_id) REFERENCES country(country_id)
) STRICT;
-- country table
CREATE TABLE IF NOT EXISTS country (
    country_id  INTEGER PRIMARY KEY,
    name        TEXT,
    code        TEXT
) STRICT;

View File

@ -1,5 +1,7 @@
"""global variables for regina""" """global variables for regina"""
import os
version = "1.0" version = "1.0"
# default settings, these are overwriteable through a config file # default settings, these are overwriteable through a config file
@ -74,3 +76,12 @@ visitor_agent_browsers = [
] ]
# set directories
# An XDG variable that is set but empty must be treated like an unset one,
# otherwise the result would be the relative path "regina".
config_dir = os.path.join(os.environ.get("XDG_CONFIG_HOME") or os.path.expanduser("~/.config"), "regina")
data_dir = os.path.join(os.environ.get("XDG_DATA_HOME") or os.path.expanduser("~/.local/share"), "regina")
cache_dir = os.path.join(os.environ.get("XDG_CACHE_HOME") or os.path.expanduser("~/.cache"), "regina")
# check if environment variables are set and use them if they are
# (the REGINA_* variables replace the whole path, nothing is appended to them)
if 'REGINA_CONFIG_DIR' in os.environ:
    config_dir = os.environ['REGINA_CONFIG_DIR']
if 'REGINA_DATA_DIR' in os.environ:
    data_dir = os.environ['REGINA_DATA_DIR']
if 'REGINA_CACHE_DIR' in os.environ:
    cache_dir = os.environ['REGINA_CACHE_DIR']

View File

@ -1,6 +1,7 @@
# from sys import path # from sys import path
# print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}") # print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}")
from sys import exit from sys import exit
from os import path
from regina.utility.globals import settings from regina.utility.globals import settings
@ -29,3 +30,11 @@ def missing_arg(arg):
print("Missing ", arg) print("Missing ", arg)
exit(1) exit(1)
def get_filepath(filename, directories: list):
    """
    Locate `filename` in the first of `directories` that contains it.

    :param filename: name of the file to look for
    :param directories: directories to probe in order, a leading `~` is expanded
    :returns: "<expanded directory>/<filename>" of the first existing file
    :raises FileNotFoundError: when none of the directories contains the file
    """
    # lazy generator: candidates are built and checked one by one, in order
    candidates = (f"{path.expanduser(directory)}/{filename}" for directory in directories)
    found = next((candidate for candidate in candidates if path.isfile(candidate)), None)
    if found is None:
        raise FileNotFoundError(f"{filename} not in {directories}")
    return found

View File

@ -1,3 +1,4 @@
from matplotlib.pyplot import matplotlib
from setuptools import setup, find_packages from setuptools import setup, find_packages
setup( setup(
@ -12,8 +13,8 @@ setup(
license="GPLv3", license="GPLv3",
packages=find_packages(), packages=["regina"],
install_requires=[], install_requires=["matplotlib"],
python_requires='>=3.10', python_requires='>=3.10',
classifiers=[ classifiers=[