class wrapper for the db
This commit is contained in:
parent
3457fff2c6
commit
ecc75560e3
@@ -1,49 +1,12 @@
import sqlite3 as sql
from re import fullmatch, match
from ipaddress import IPv4Address, ip_address
from time import mktime
from datetime import datetime as dt
from regina.db_operation.database import t_request, t_visitor, t_file, t_filegroup, t_ip_range, database_tables, get_filegroup, ip_range_id
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max
from regina.utility.utility import pdebug, warning, pmessage
from regina.utility.globals import visitor_agent_operating_systems, visitor_agent_browsers, settings

"""
collect information from the access log and put it into the database
"""
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]


class Request:
    def __init__(self, ip_address="", time_local="", request_type="", request_file="", request_protocol="", status="", bytes_sent="", referer="", visitor_agent=""):
        self.ip_address = int(IPv4Address(sanitize(ip_address)))
        self.time_local = 0
        # [20/Nov/2022:00:47:36 +0100]
        m = match(r"\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\]", time_local)
        if m:
            g = m.groups()
            try:
                if g[1] in months:
                    datetime_ = dt(int(g[2]), months.index(g[1])+1, int(g[0]), int(g[3]), int(g[4]), int(g[5]))
                    # pdebug(f"Request __init__: datetime {datetime_}, from {g}")
                    self.time_local = int(mktime(datetime_.timetuple()))
                else:
                    warning(f"Request:__init__: Unknown month: '{g[1]}'. Using timestamp {self.time_local}")
            except Exception as e:
                warning(f"Request:__init__: {e}")
        else:
            warning(f"Request:__init__: Could not match time: '{time_local}'")
        self.request_type = sanitize(request_type)
        self.request_file = sanitize(request_file)
        self.request_protocol = sanitize(request_protocol)
        self.status = sanitize(status)
        self.bytes_sent = sanitize(bytes_sent)
        self.referer = sanitize(referer)
        self.visitor_agent = sanitize(visitor_agent)

    def __repr__(self):
        return f"{self.ip_address} - {self.time_local} - {self.request_file} - {self.visitor_agent} - {self.status}"


re_remote_addr = r"[0-9a-fA-F.:]+"
re_remote_visitor = ".*"
@@ -54,6 +17,7 @@ re_body_bytes_sent = r'\d+'
re_http_referer = r'"([^"]*)"'
re_http_visitor_agent = r'"([^"]*)"'
re_log_format: str = f'({re_remote_addr}) - ({re_remote_visitor}) ({re_time_local}) ({re_request}) ({re_status}) ({re_body_bytes_sent}) {re_http_referer} {re_http_visitor_agent}'

def parse_log(logfile:str) -> list[Request]:
    """
    create Request objects from each line in the logfile
@@ -77,171 +41,3 @@ def parse_log(logfile:str) -> list[Request]:
                status=g[4], bytes_sent=g[5], referer=g[6], visitor_agent=g[7]))
    return requests


def visitor_exists(cursor, request) -> bool:
    if settings["hash_ip_address"]:
        ip_address = hash(request.ip_address)
    else:
        ip_address = request.ip_address
    if settings["unique_visitor_is_ip_address"]:
        return sql_exists(cursor, t_visitor, [("ip_address", ip_address)])
    else:
        return sql_exists(cursor, t_visitor, [("ip_address", ip_address), ("visitor_agent", request.visitor_agent)])

def get_visitor_id(request: Request, cursor: sql.Cursor) -> int:
    """
    get the visitor_id. Adds the visitor if not already existing
    """
    if settings["hash_ip_address"]:
        ip_address = hash(request.ip_address)
    else:
        ip_address = request.ip_address

    if visitor_exists(cursor, request):
        if settings["unique_visitor_is_ip_address"]:
            visitor_id = sql_select(cursor, t_visitor, [("ip_address", ip_address)])[0][0]
        else:
            visitor_id = sql_select(cursor, t_visitor, [("ip_address", ip_address), ("visitor_agent", request.visitor_agent)])[0][0]
    else:  # new visitor
        # new visitor_id is number of elements
        visitor_id = sql_max(cursor, t_visitor, "visitor_id") + 1
        # pdebug("new visitor:", visitor_id, request.ip_address)
        platform, browser, mobile = get_os_browser_pairs_from_agent(request.visitor_agent)
        ip_range_id_val = 0
        if settings["get_visitor_location"]:
            ip_range_id_val = get_ip_range_id(cursor, request.ip_address)
        is_human = 0  # is_visitor_human cannot be called until visitor is in db int(is_visitor_human(cursor, visitor_id))
        cursor.execute(f"INSERT INTO {t_visitor} (visitor_id, ip_address, visitor_agent, platform, browser, mobile, is_human, {ip_range_id.name}) VALUES ({visitor_id}, '{ip_address}', '{request.visitor_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}', '{ip_range_id_val}');")
    return visitor_id

def is_visitor_human(cur: sql.Cursor, visitor_id: int):
    global settings
    """
    check if they have a known platform AND browser
    check if at least one request did not result in an error (http status >= 400)
    """
    max_success_status = 400
    if settings["status_300_is_success"]: max_success_status = 300
    cur.execute(f"SELECT browser, platform FROM {t_visitor} WHERE visitor_id = {visitor_id}")
    browsers_and_platforms = cur.fetchall()
    if len(browsers_and_platforms) != 1:
        pdebug(f"is_visitor_human: {visitor_id} - could not find visitor or found too many")
        return False
    if not browsers_and_platforms[0][0] in visitor_agent_browsers:
        return False
    if not browsers_and_platforms[0][1] in visitor_agent_operating_systems:
        return False
    # check if has browser
    # cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_visitor} WHERE visitor_id = {visitor_id} AND platform IS NOT NULL AND browser IS NOT NULL)")
    # if no browser and platform
    # exists = cur.fetchone()
    # if exists is None or exists[0] == 0:
    #     return False
    # if human needs successful request
    if settings["human_needs_success"]:
        # check if at least one request was successful (status < 400)
        cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_request} WHERE visitor_id = {visitor_id} AND status < {max_success_status})")
        if cur.fetchone()[0] == 1:
            # pdebug(f"is_visitor_human: Visitor {visitor_id} is human")
            pass
        else:
            # pdebug(f"is_visitor_human: Visitor {visitor_id} only had unsuccessful requests")
            return False
    # visitor is human
    return True

def request_exists(cur: sql.Cursor, request: Request, visitor_id: int, group_id: int):
    # get all requests from same visitor to same location
    cur.execute(f"SELECT request_id, date FROM {t_request} WHERE visitor_id = '{visitor_id}' AND group_id = '{group_id}'")
    date0 = dt.fromtimestamp(request.time_local).strftime("%Y-%m-%d")
    for request_id, date1 in cur.fetchall():
        if settings["request_is_same_on_same_day"]:
            date1 = dt.fromtimestamp(date1).strftime("%Y-%m-%d")
            if date0 == date1:
                pdebug(f"request_exists: Request is on same day as request {request_id}")
                return True
    return False


# re_visitor_agent = r"(?: ?([\w\- ]+)(?:\/([\w.]+))?(?: \(([^()]*)\))?)"
# 1: platform, 2: version, 3: details
def get_os_browser_pairs_from_agent(visitor_agent):
    # for groups in findall(re_visitor_agent, visitor_agent):
    operating_system = ""
    browser = ""
    mobile = "Mobi" in visitor_agent
    for os in visitor_agent_operating_systems:
        if os in visitor_agent:
            operating_system = os
            break
    for br in visitor_agent_browsers:
        if br in visitor_agent:
            browser = br
            break
    # if not operating_system or not browser: print(f"Warning: get_os_browser_pairs_from_agent: Could not find all information for agent '{visitor_agent}', found os: '{operating_system}' and browser: '{browser}'")
    return operating_system, browser, mobile

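Since the detection above is plain substring matching over the agent string, here is a self-contained sketch of the same idea (the lists and the user-agent string below are made-up examples, not the actual contents of regina.utility.globals):

visitor_agent_operating_systems = ["Windows", "Android", "Linux", "iPhone", "Mac"]  # hypothetical
visitor_agent_browsers = ["Firefox", "Chrome", "Safari", "Edge"]                    # hypothetical
agent = "Mozilla/5.0 (X11; Linux x86_64; rv:107.0) Gecko/20100101 Firefox/107.0"
os_ = next((o for o in visitor_agent_operating_systems if o in agent), "")
browser = next((b for b in visitor_agent_browsers if b in agent), "")
mobile = "Mobi" in agent
print(os_, browser, mobile)  # Linux Firefox False
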

def get_ip_range_id(cur: sql.Cursor, ip_address: int):
    cur.execute(f"SELECT {ip_range_id.name} FROM {t_ip_range} WHERE '{ip_address}' BETWEEN lower AND upper")
    results = cur.fetchall()
    ip_range_id_val = 0
    if len(results) == 0:
        pass
    elif len(results) > 1:
        warning(f"get_countries: Found multiple ip_ranges for ip_address={ip_address}: results={results}")
    else:
        ip_range_id_val = results[0][0]
    return ip_range_id_val

def update_ip_range_id(cur: sql.Cursor, visitor_id: int):
    cur.execute(f"SELECT ip_address FROM {t_visitor} WHERE visitor_id = {visitor_id}")
    results = cur.fetchall()
    if len(results) == 0:
        warning(f"update_ip_range_id: Invalid visitor_id={visitor_id}")
        return
    elif len(results) > 1:
        warning(f"update_ip_range_id: Found multiple ip_addresses for visitor_id={visitor_id}: results={results}")
        return
    ip_address = results[0][0]
    cur.execute(f"UPDATE {t_visitor} SET {ip_range_id.name} = '{get_ip_range_id(cur, ip_address)}' WHERE visitor_id = '{visitor_id}'")


def add_requests_to_db(requests: list[Request], db_name: str):
    conn = sql.connect(db_name)
    cursor = conn.cursor()
    added_requests = 0
    # check the new visitors later
    max_visitor_id = sql_max(cursor, t_visitor, "visitor_id")
    request_blacklist = settings["request_location_regex_blacklist"]
    for i in range(len(requests)):
        request = requests[i]
        # skip requests to blacklisted locations
        if request_blacklist:
            if fullmatch(request_blacklist, request.request_file):
                # pdebug(f"add_requests_to_db: request on blacklist '{request.request_file}'")
                continue
        # pdebug("add_requests_to_db:", i, "request:", request)
        visitor_id = get_visitor_id(request, cursor)
        conn.commit()
        group_id: int = get_filegroup(request.request_file, cursor)
        # check if request is unique
        if request_exists(cursor, request, visitor_id, group_id):
            # pdebug("request exists:", request)
            pass
        else:
            # pdebug("new request:", request)
            request_id = sql_max(cursor, t_request, "request_id") + 1
            sql_insert(cursor, t_request, [[request_id, visitor_id, group_id, request.time_local, request.referer, request.status]])
            added_requests += 1
    visitor_count = sql_tablesize(cursor, t_visitor)
    for visitor_id in range(max_visitor_id, visitor_count):
        if not sql_exists(cursor, t_visitor, [(str(visitor_id), "visitor_id")]): continue
        is_human = is_visitor_human(cursor, visitor_id)
        cursor.execute(f"SELECT * FROM {t_visitor} WHERE visitor_id = {visitor_id}")
        # pdebug(f"add_rq_to_db: {visitor_id} is_human? {is_human}, {cursor.fetchall()}")
        if is_human:
            cursor.execute(f"UPDATE {t_visitor} SET is_human = 1 WHERE visitor_id = {visitor_id}")
    cursor.close()
    conn.commit()
    pmessage(f"Collection Summary: Added {visitor_count - max_visitor_id} new visitors and {added_requests} new requests.")

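Taken together, the two entry points of this module are used roughly like this (a sketch; both paths are placeholders, and this mirrors how regina's main module calls them):

requests = parse_log("/var/log/nginx/access.log")   # list[Request], one per matched log line
add_requests_to_db(requests, "/path/to/regina.db")  # deduplicates requests and flags human visitors
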
@@ -2,10 +2,15 @@
import sqlite3 as sql
from csv import reader
from os import path, listdir
import pkg_resources
import re
from datetime import datetime as dt
# local
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max
from regina.utility.utility import pdebug
from regina.utility.globals import settings
from .utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max
from .utility.utility import pdebug, get_filepath, warning, pmessage
from .utility.globals import settings
from .db_operation.request import Request
from .utility.globals import visitor_agent_operating_systems, visitor_agent_browsers, settings

"""
create regina's database as shown in the UML diagram database.uxf
@@ -37,6 +42,8 @@ class Table:
        for c in self.constaints:
            s += f", {c}"
        return s


t_request = "request"
t_file = "file"
t_filegroup = "filegroup"
@@ -100,36 +107,217 @@ database_tables = {


def get_filegroup(filename: str, cursor: sql.Cursor) -> int:
    """
    get the filegroup
    returns the group where
    1) filename is the groupname
    2) the filetype of filename is the groupname
    3) new group with filename as groupname
    """
    # pdebug(f"get_filegroup: {filename}")
    if sql_exists(cursor, t_file, [("filename", filename)]):
        return sql_select(cursor, t_file, [("filename", filename)])[0][1]
    else:
        suffix = filename.split('.')[-1]
        cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname = '{suffix}'")
        # cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname LIKE '%.{suffix}'")
        group_id_candidates = cursor.fetchall()
        # pdebug(f"get_filegroup: file={filename} candidates={group_id_candidates}")
        if group_id_candidates:
            return group_id_candidates[0][0]
        else:  # add new group for filename
            group_id = sql_max(cursor, t_filegroup, "group_id") + 1
class Database:
    def __init__(self, database_path):
        self.conn = sql.connect(database_path)
        self.cur = self.conn.cursor()
        # verify that the database is created
        self.cur.execute("pragma schema_version")
        if self.cur.fetchone()[0] == 0:  # not created
            pdebug(f"Database.__init__: Creating database at {database_path}")
            with open(pkg_resources.resource_filename("regina", "sql/create_db.sql"), "r") as file:
                create_db = file.read()
            self.cur.execute(create_db)

            # pdebug("new file(group):", group_id, filename)
            # add group
            sql_insert(cursor, t_filegroup, [[group_id, filename]])
            # add file
            sql_insert(cursor, t_file, [[filename, group_id]])
            return group_id

    def __call__(self, s):
        """execute a command and return fetchall()"""
        self.cur.execute(s)
        return self.cur.fetchall()

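The callable wrapper above is what the rest of the code builds on (e.g. db(f"SELECT ...") in visualize.py); a minimal usage sketch, assuming a regina database already exists at the given placeholder path:

db = Database("/path/to/regina.db")
human_ids = db(f"SELECT visitor_id FROM {t_visitor} WHERE is_human = 1")
print(len(human_ids), "human visitors")  # rows come back as fetchall() tuples
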
    #
    # VISITOR
    #
    def visitor_exists(self, request) -> bool:
        if settings["hash_ip_address"]:
            ip_address = hash(request.ip_address)
        else:
            ip_address = request.ip_address
        if settings["unique_visitor_is_ip_address"]:
            return sql_exists(self.cur, t_visitor, [("ip_address", ip_address)])
        else:
            return sql_exists(self.cur, t_visitor, [("ip_address", ip_address), ("visitor_agent", request.visitor_agent)])

    def is_visitor_human(self, visitor_id: int):
        """
        check if they have a known platform AND browser
        check if at least one request did not result in an error (http status >= 400)
        """
        max_success_status = 400
        if settings["status_300_is_success"]: max_success_status = 300
        self.cur.execute(f"SELECT browser, platform FROM {t_visitor} WHERE visitor_id = {visitor_id}")
        browsers_and_platforms = self.cur.fetchall()
        if len(browsers_and_platforms) != 1:
            pdebug(f"is_visitor_human: {visitor_id} - could not find visitor or found too many")
            return False
        if not browsers_and_platforms[0][0] in visitor_agent_browsers:
            return False
        if not browsers_and_platforms[0][1] in visitor_agent_operating_systems:
            return False
        # check if has browser
        # self.cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_visitor} WHERE visitor_id = {visitor_id} AND platform IS NOT NULL AND browser IS NOT NULL)")
        # if no browser and platform
        # exists = self.cur.fetchone()
        # if exists is None or exists[0] == 0:
        #     return False
        # if human needs successful request
        if settings["human_needs_success"]:
            # check if at least one request was successful (status < 400)
            self.cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_request} WHERE visitor_id = {visitor_id} AND status < {max_success_status})")
            if self.cur.fetchone()[0] == 1:
                # pdebug(f"is_visitor_human: Visitor {visitor_id} is human")
                pass
            else:
                # pdebug(f"is_visitor_human: Visitor {visitor_id} only had unsuccessful requests")
                return False
        return True

    def get_visitor_id(self, request: Request) -> int:
        """
        get the visitor_id. Adds the visitor if not already existing
        """
        if settings["hash_ip_address"]:
            ip_address = hash(request.ip_address)
        else:
            ip_address = request.ip_address

        if self.visitor_exists(request):
            if settings["unique_visitor_is_ip_address"]:
                visitor_id = sql_select(self.cur, t_visitor, [("ip_address", ip_address)])[0][0]
            else:
                visitor_id = sql_select(self.cur, t_visitor, [("ip_address", ip_address), ("visitor_agent", request.visitor_agent)])[0][0]
        else:  # new visitor
            # new visitor_id is number of elements
            visitor_id = sql_max(self.cur, t_visitor, "visitor_id") + 1
            # pdebug("new visitor:", visitor_id, request.ip_address)
            platform, browser, mobile = get_os_browser_pairs_from_agent(request.visitor_agent)
            ip_range_id_val = 0
            if settings["get_visitor_location"]:
                ip_range_id_val = get_ip_range_id(self.cur, request.ip_address)
            is_human = 0  # is_visitor_human cannot be called until visitor is in db int(is_visitor_human(self.cur, visitor_id))
            self.cur.execute(f"INSERT INTO {t_visitor} (visitor_id, ip_address, visitor_agent, platform, browser, mobile, is_human, {ip_range_id.name}) VALUES ({visitor_id}, '{ip_address}', '{request.visitor_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}', '{ip_range_id_val}');")
        return visitor_id


    #
    # REQUEST
    #
    def request_exists(self, request: Request, visitor_id: int, group_id: int):
        # get all requests from same visitor to same location
        # TODO this looks wrong
        self.cur.execute(f"SELECT request_id, date FROM {t_request} WHERE visitor_id = '{visitor_id}' AND group_id = '{group_id}'")
        date0 = dt.fromtimestamp(request.time_local).strftime("%Y-%m-%d")
        for request_id, date1 in self.cur.fetchall():
            if settings["request_is_same_on_same_day"]:
                date1 = dt.fromtimestamp(date1).strftime("%Y-%m-%d")
                if date0 == date1:
                    pdebug(f"request_exists: Request is on same day as request {request_id}")
                    return True
        return False

    def add_request(self, request: Request) -> (int | None):
        """returns visitor_id if new request was added, else None"""
        # skip requests to blacklisted locations
        if request_blacklist:
            if re.fullmatch(request_blacklist, request.request_file):
                # pdebug(f"add_requests_to_db: request on blacklist '{request.request_file}'")
                return None
        # pdebug("add_requests_to_db:", i, "request:", request)
        visitor_id = self.get_visitor_id(request)
        self.conn.commit()
        group_id: int = self.get_filegroup(request.request_file)
        # check if request is unique
        if self.request_exists(request, visitor_id, group_id):
            # pdebug("request exists:", request)
            return None
        else:
            # pdebug("new request:", request)
            sql_insert(t_request, [[None, visitor_id, group_id, request.time_local, request.referer, request.status]])
            return visitor_id

    def add_requests(self, requests: list[Request]):
        added_requests = 0
        # check the new visitors later
        request_blacklist = settings["request_location_regex_blacklist"]
        new_visitors = []
        for i in range(len(requests)):
            visitor = self.add_request(requests[i])
            if visitor:
                new_visitors.append(visitor)

        # update the is_human column for all new visitors
        for visitor_id in new_visitors:
            if not sql_exists(self.cur, t_visitor, [(str(visitor_id), "visitor_id")]): continue
            is_human = self.is_visitor_human(visitor_id)
            self.cur.execute(f"SELECT * FROM {t_visitor} WHERE visitor_id = {visitor_id}")
            # pdebug(f"add_rq_to_db: {visitor_id} is_human? {is_human}, {self.cur.fetchall()}")
            if is_human:
                self.cur.execute(f"UPDATE {t_visitor} SET is_human = 1 WHERE visitor_id = {visitor_id}")
        self.conn.commit()
        pmessage(f"Collection Summary: Added {len(new_visitors)} new visitors and {added_requests} new requests.")

    #
    # FILE(GROUP)
    #
    def get_filegroup(self, filename: str) -> int:
        """
        get the filegroup
        returns the group where
        1) filename is the groupname
        2) the filetype of filename is the groupname
        3) new group with filename as groupname
        """
        # pdebug(f"get_filegroup: {filename}")
        if sql_exists(self.cur, t_file, [("filename", filename)]):
            return sql_select(self.cur, t_file, [("filename", filename)])[0][1]
        else:
            suffix = filename.split('.')[-1]
            self.cur.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname = '{suffix}'")
            # self.cur.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname LIKE '%.{suffix}'")
            group_id_candidates = self.cur.fetchall()
            # pdebug(f"get_filegroup: file={filename} candidates={group_id_candidates}")
            if group_id_candidates:
                return group_id_candidates[0][0]
            else:  # add new group for filename
                group_id = sql_max(self.cur, t_filegroup, "group_id") + 1

                # pdebug("new file(group):", group_id, filename)
                # add group
                sql_insert(self.cur, t_filegroup, [[group_id, filename]])
                # add file
                sql_insert(self.cur, t_file, [[filename, group_id]])
                return group_id
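A short walk-through of the grouping policy above, with hypothetical data:

# get_filegroup("/img/logo.png")  -> file not yet in t_file, but a group named "png" exists  -> returns that group_id
# get_filegroup("/about.html")    -> file unknown and no "html" group                        -> creates a new group named
#                                    "/about.html", registers the file in it, returns the fresh group_id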

    #
    # GEOIP
    #
    def get_ip_range_id(self, ip_address: int):
        self.cur.execute(f"SELECT {ip_range_id.name} FROM {t_ip_range} WHERE '{ip_address}' BETWEEN lower AND upper")
        results = self.cur.fetchall()
        ip_range_id_val = 0
        if len(results) == 0:
            pass
        elif len(results) > 1:
            warning(f"get_ip_range_id: Found multiple ip_ranges for ip_address={ip_address}: results={results}")
        else:
            ip_range_id_val = results[0][0]
        return ip_range_id_val

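The lookup relies on IPv4 addresses being stored as integers (see Request.__init__), so finding a visitor's range is an interval-containment query. A self-contained sketch of that idea with a made-up table:

import sqlite3
from ipaddress import IPv4Address

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE ip_range(range_id INTEGER PRIMARY KEY, lower INTEGER, upper INTEGER)")
# one made-up range covering 203.0.113.0/24
conn.execute("INSERT INTO ip_range VALUES (1, ?, ?)",
             (int(IPv4Address("203.0.113.0")), int(IPv4Address("203.0.113.255"))))
ip = int(IPv4Address("203.0.113.7"))
print(conn.execute("SELECT range_id FROM ip_range WHERE ? BETWEEN lower AND upper", (ip,)).fetchone())  # (1,)
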

    def update_ip_range_id(self, visitor_id: int):
        self.cur.execute(f"SELECT ip_address FROM {t_visitor} WHERE visitor_id = {visitor_id}")
        results = self.cur.fetchall()
        if len(results) == 0:
            warning(f"update_ip_range_id: Invalid visitor_id={visitor_id}")
            return
        elif len(results) > 1:
            warning(f"update_ip_range_id: Found multiple ip_addresses for visitor_id={visitor_id}: results={results}")
            return
        ip_address = results[0][0]
        self.cur.execute(f"UPDATE {t_visitor} SET {ip_range_id.name} = '{get_ip_range_id(self.cur, ip_address)}' WHERE visitor_id = '{visitor_id}'")

def create_filegroups(cursor: sql.Cursor, filegroup_str: str):
    """
    TODO: make re-usable (alter groups when config changes)
    """
    # filegroup_str: 'name1: file1, file2, file3; name2: file33'
    groups = filegroup_str.strip(";").split(";")
    pdebug("create_filegroups:", groups)

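Putting the new wrapper together, collection now reads roughly like this (a sketch; the paths are placeholders and it assumes parse_log from the collect module still produces the list of Request objects):

from regina.db_operation.collect import parse_log
from regina.db_operation.database import Database

db = Database("/path/to/regina.db")  # creates the schema from sql/create_db.sql on first use
db.add_requests(parse_log("/var/log/nginx/access.log"))
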
regina/db_operation/request.py (new file, 62 lines)
@@ -0,0 +1,62 @@
from ipaddress import IPv4Address, ip_address
from time import mktime
from re import fullmatch, match
from datetime import datetime as dt

from .utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max
from .utility.utility import pdebug, warning, pmessage
from .utility.globals import visitor_agent_operating_systems, visitor_agent_browsers, settings

months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

class Request:
    def __init__(self, ip_address="", time_local="", request_type="", request_file="", request_protocol="", status="", bytes_sent="", referer="", visitor_agent=""):
        self.ip_address = int(IPv4Address(sanitize(ip_address)))
        self.time_local = 0
        # [20/Nov/2022:00:47:36 +0100]
        m = match(r"\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\]", time_local)
        if m:
            g = m.groups()
            try:
                if g[1] in months:
                    datetime_ = dt(int(g[2]), months.index(g[1])+1, int(g[0]), int(g[3]), int(g[4]), int(g[5]))
                    # pdebug(f"Request __init__: datetime {datetime_}, from {g}")
                    self.time_local = int(mktime(datetime_.timetuple()))
                else:
                    warning(f"Request:__init__: Unknown month: '{g[1]}'. Using timestamp {self.time_local}")
            except Exception as e:
                warning(f"Request:__init__: {e}")
        else:
            warning(f"Request:__init__: Could not match time: '{time_local}'")
        self.request_type = sanitize(request_type)
        self.request_file = sanitize(request_file)
        self.request_protocol = sanitize(request_protocol)
        self.status = sanitize(status)
        self.bytes_sent = sanitize(bytes_sent)
        self.referer = sanitize(referer)
        self.visitor_agent = sanitize(visitor_agent)

    def __repr__(self):
        return f"{self.ip_address} - {self.time_local} - {self.request_file} - {self.visitor_agent} - {self.status}"

    def get_os(self):
        # for groups in findall(re_visitor_agent, visitor_agent):
        operating_system = ""
        for os in visitor_agent_operating_systems:
            if os in self.visitor_agent:
                operating_system = os
                break
        return operating_system

    def get_browser(self):
        browser = ""
        for br in visitor_agent_browsers:
            if br in self.visitor_agent:
                browser = br
                break
        return browser

    def get_mobile(self):
        return "Mobi" in self.visitor_agent

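The only subtle part of the constructor is the time_local conversion; a self-contained sketch of what it does to an nginx-style timestamp (the sample value is made up):

from datetime import datetime as dt
from time import mktime
from re import match

months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
time_local = "[20/Nov/2022:00:47:36 +0100]"
g = match(r"\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\]", time_local).groups()
timestamp = int(mktime(dt(int(g[2]), months.index(g[1]) + 1, int(g[0]),
                          int(g[3]), int(g[4]), int(g[5])).timetuple()))
# unix epoch seconds; note the regex drops the "+0100" offset, so mktime
# interprets the wall-clock time in the machine's local timezone
print(timestamp)
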
@@ -9,7 +9,7 @@ from datetime import datetime as dt

from numpy import empty
# local
from regina.db_operation.database import t_request, t_visitor, t_file, t_filegroup, t_ip_range, t_city, t_country
from regina.db_operation.database import Database, t_request, t_visitor, t_file, t_filegroup, t_ip_range, t_city, t_country
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where
from regina.utility.utility import pdebug, warning, missing_arg
from regina.utility.globals import settings
@@ -66,7 +66,7 @@ def valid_status(status: int):
#
# FILTERS
#
def get_os_browser_mobile_rankings(cur: sql.Cursor, visitor_ids: list[int]):
def get_os_browser_mobile_rankings(db: Database, visitor_ids: list[int]):
    """
    returns [(count, operating_system)], [(count, browser)], mobile_visitor_percentage
    """
@@ -76,8 +76,7 @@ def get_os_browser_mobile_rankings(cur: sql.Cursor, visitor_ids: list[int]):
    browser_count = 0.0
    mobile_ranking = { True: 0.0, False: 0.0 }
    for visitor_id in visitor_ids:
        cur.execute(f"SELECT platform,browser,mobile FROM {t_visitor} WHERE visitor_id = {visitor_id}")
        os, browser, mobile = cur.fetchone()
        os, browser, mobile = db(f"SELECT platform,browser,mobile FROM {t_visitor} WHERE visitor_id = {visitor_id}")[0]
        mobile = bool(mobile)
        if os:
            if os in os_ranking: os_ranking[os] += 1
@@ -134,34 +133,30 @@ def get_where_date_str(at_date=None, min_date=None, max_date=None):


# get the earliest date
def get_earliest_date(cur: sql.Cursor) -> int:
def get_earliest_date(db: Database) -> int:
    """return the earliest time as unixepoch"""
    cur.execute(f"SELECT MIN(date) FROM {t_request}")
    date = cur.fetchone()[0]
    date = db(f"SELECT MIN(date) FROM {t_request}")[0][0]
    if not isinstance(date, int): return 0
    else: return date

# get the latest date
def get_latest_date(cur: sql.Cursor) -> int:
def get_latest_date(db: Database) -> int:
    """return the latest time as unixepoch"""
    cur.execute(f"SELECT MAX(date) FROM {t_request}")
    date = cur.fetchone()[0]
    date = db(f"SELECT MAX(date) FROM {t_request}")[0][0]
    if not isinstance(date, int): return 0
    else: return date

# get all dates
# the date:str parameter in all these functions must be a sqlite constraint
def get_days(cur: sql.Cursor, date:str) -> list[str]:
def get_days(db: Database, date:str) -> list[str]:
    """get a list of all dates in yyyy-mm-dd format"""
    cur.execute(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}")
    days = [ date[0] for date in cur.fetchall() ]  # fetchall returns tuples (date, )
    days = [ date[0] for date in db(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}")]  # fetchall returns tuples (date, )
    days.sort()
    return days

def get_months(cur: sql.Cursor, date:str) -> list[str]:
def get_months(db: Database, date:str) -> list[str]:
    """get a list of all dates in yyyy-mm format"""
    cur.execute(f"SELECT DISTINCT DATE(date, 'unixepoch') FROM {t_request} WHERE {date}")
    dates = get_days(cur, date)
    dates = get_days(db, date)
    date_dict = {}
    for date in dates:
        date_without_day = date[0:date.rfind('-')]
@@ -169,14 +164,13 @@ def get_months(cur: sql.Cursor, date:str) -> list[str]:
    return list(date_dict.keys())


def get_visitor_agent(cur: sql.Cursor, visitor_id: int):
    return sql_select(cur, t_visitor, [("visitor_id", visitor_id)])[0][2]
def get_visitor_agent(db: Database, visitor_id: int):
    return sql_select(db.cur, t_visitor, [("visitor_id", visitor_id)])[0][2]

def get_unique_visitor_ids_for_date(cur: sql.Cursor, date:str) -> list[int]:
    cur.execute(f"SELECT DISTINCT visitor_id FROM {t_request} WHERE {date}")
    return [ visitor_id[0] for visitor_id in cur.fetchall() ]
def get_unique_visitor_ids_for_date(db: Database, date:str) -> list[int]:
    return [ visitor_id[0] for visitor_id in db(f"SELECT DISTINCT visitor_id FROM {t_request} WHERE {date}") ]

def get_human_visitors(cur: sql.Cursor, unique_visitor_ids, unique_visitor_ids_human: list):
def get_human_visitors(db: Database, unique_visitor_ids, unique_visitor_ids_human: list):
    """
    check if they have a known platform AND browser
    check if at least one request did not result in an error (http status >= 400)
@@ -195,22 +189,22 @@ def get_human_visitors(cur: sql.Cursor, unique_visitor_ids, unique_visitor_ids_h
        unique_visitor_ids_human.append(visitor_id)
    # pdebug("get_human_visitors: (2)", unique_visitor_ids_human)

def get_unique_request_ids_for_date(cur: sql.Cursor, date:str):
def get_unique_request_ids_for_date(db: Database, date:str):
    cur.execute(f"SELECT DISTINCT request_id FROM {t_request} WHERE {date}")
    return [ request_id[0] for request_id in cur.fetchall()]

def get_unique_request_ids_for_date_and_visitor(cur: sql.Cursor, date:str, visitor_id: int, unique_request_ids_human: list):
def get_unique_request_ids_for_date_and_visitor(db: Database, date:str, visitor_id: int, unique_request_ids_human: list):
    cur.execute(f"SELECT DISTINCT request_id FROM {t_request} WHERE {date} AND visitor_id = {visitor_id}")
    # all unique requests for visitor_id
    for request_id in cur.fetchall():
        unique_request_ids_human.append(request_id[0])

# get number of requests per day
def get_request_count_for_date(cur: sql.Cursor, date:str) -> int:
def get_request_count_for_date(db: Database, date:str) -> int:
    cur.execute(f"SELECT COUNT(*) FROM {t_request} WHERE {date}")
    return cur.fetchone()[0]

def get_unique_visitor_count(cur: sql.Cursor) -> int:
def get_unique_visitor_count(db: Database) -> int:
    return sql_tablesize(cur, t_visitor)


@@ -218,7 +212,7 @@ def get_unique_visitor_count(cur: sql.Cursor) -> int:
#
# RANKINGS
#
def get_file_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]:
def get_file_ranking(db: Database, date:str) -> list[tuple[int, str]]:
    global settings
    """
    :returns [(request_count, groupname)]
@@ -255,7 +249,7 @@ def get_file_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]:
    # print(ranking)
    return ranking

def get_visitor_agent_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]:
def get_visitor_agent_ranking(db: Database, date:str) -> list[tuple[int, str]]:
    """
    :returns [(request_count, visitor_agent)]
    """
@@ -276,7 +270,7 @@ def get_visitor_agent_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]
    # print(ranking)
    return ranking

def get_request_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Cursor, date_condition:str) -> list[tuple[int, str]]:
def get_request_ranking(field_name: str, table: str, whitelist_regex: str, db: Database, date_condition:str) -> list[tuple[int, str]]:
    """
    1) get all the distinct entries for field_name after min_date_unix_time
    2) call get_name_function with the distinct entry

@@ -4,13 +4,23 @@
from sys import argv, exit
from os.path import isfile
import sqlite3 as sql
from regina.db_operation.collect import parse_log, add_requests_to_db, update_ip_range_id
from regina.db_operation.database import create_db, update_geoip_tables, t_visitor
from regina.db_operation.visualize import visualize
from regina.utility.settings_manager import read_settings_file
from regina.utility.globals import settings, version
from regina.utility.utility import pmessage
from regina.utility.sql_util import sql_tablesize

if __name__ == "__main__":
    if __package__ is None:
        # make relative imports work as described here: https://peps.python.org/pep-0366/#proposed-change
        __package__ = "regina"
        import sys
        from os import path
        filepath = path.realpath(path.abspath(__file__))
        sys.path.insert(0, path.dirname(path.dirname(filepath)))

from .db_operation.collect import parse_log, add_requests_to_db, update_ip_range_id
from .db_operation.database import create_db, update_geoip_tables, t_visitor
from .db_operation.visualize import visualize
from .utility.settings_manager import read_settings_file
from .utility.globals import settings, version
from .utility.utility import pmessage
from .utility.sql_util import sql_tablesize

"""
start regina, launch either collect or visualize
regina/sql/create_db.sql (new file, 50 lines)
@@ -0,0 +1,50 @@
CREATE TABLE IF NOT EXISTS visitor(
    visitor_id INTEGER PRIMARY KEY,
    platform TEXT,
    browser TEXT,
    is_human INTEGER,
    range_id INTEGER
) STRICT;

CREATE TABLE IF NOT EXISTS request(
    request_id INTEGER PRIMARY KEY,
    visitor_id INTEGER,
    FOREIGN KEY(visitor_id) REFERENCES visitor(visitor_id),
    group_id INTEGER,
    FOREIGN KEY(group_id) REFERENCES filegroup(group_id),
    date INTEGER,
    referer TEXT,
    status INTEGER
) STRICT;

CREATE TABLE IF NOT EXISTS filegroup(
    group_id INTEGER PRIMARY KEY,
    groupname TEXT
) STRICT;
CREATE TABLE IF NOT EXISTS file(
    filename TEXT,
    group_id INTEGER,
    FOREIGN KEY(group_id) REFERENCES filegroup(group_id)
) STRICT;

CREATE TABLE IF NOT EXISTS ip_range(
    range_id INTEGER PRIMARY KEY,
    lower INTEGER,
    upper INTEGER,
    city_id INTEGER,
    FOREIGN KEY(city_id) REFERENCES city(city_id)
) STRICT;

CREATE TABLE IF NOT EXISTS city(
    city INTEGER PRIMARY KEY,
    name TEXT,
    region TEXT,
    country_id INTEGER,
    FOREIGN KEY(country_id) REFERENCES country(country_id)
) STRICT;

CREATE TABLE IF NOT EXISTS country(
    country_id INTEGER PRIMARY KEY,
    name TEXT,
    code TEXT
) STRICT;
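For reference, a minimal self-contained sketch of applying a multi-statement schema file like this one (it reuses the packaged path from database.py; executescript is used here because the file holds several statements, and STRICT tables require SQLite 3.37 or newer):

import sqlite3
import pkg_resources

conn = sqlite3.connect("regina.db")  # path is a placeholder
with open(pkg_resources.resource_filename("regina", "sql/create_db.sql"), "r") as f:
    conn.executescript(f.read())     # runs every CREATE TABLE statement in the file
conn.commit()
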
@@ -1,5 +1,7 @@
"""global variables for regina"""

import os

version = "1.0"

# default settings, these are overwriteable through a config file
@@ -74,3 +76,12 @@ visitor_agent_browsers = [
]


# set directories
config_dir = os.path.join(os.environ.get("XDG_CONFIG_HOME", os.path.expanduser("~/.config")), "regina")
data_dir = os.path.join(os.environ.get("XDG_DATA_HOME", os.path.expanduser("~/.local/share")), "regina")
cache_dir = os.path.join(os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")), "regina")

# check if environment variables are set and use them if they are
if 'REGINA_CONFIG_DIR' in os.environ: config_dir = os.environ['REGINA_CONFIG_DIR']
if 'REGINA_DATA_DIR' in os.environ: data_dir = os.environ['REGINA_DATA_DIR']
if 'REGINA_CACHE_DIR' in os.environ: cache_dir = os.environ['REGINA_CACHE_DIR']
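The resolution order is therefore: REGINA_* variable if set, otherwise the XDG variable, otherwise the hard-coded fallback. A hypothetical example:

# with REGINA_CONFIG_DIR=/etc/regina set:      config_dir == "/etc/regina" (used as-is, no "regina" suffix appended)
# with only XDG_DATA_HOME=/srv/xdg set:        data_dir   == "/srv/xdg/regina"
# with neither set:                            cache_dir  == os.path.expanduser("~/.cache/regina")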
@@ -1,6 +1,7 @@
# from sys import path
# print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}")
from sys import exit
from os import path

from regina.utility.globals import settings

@@ -29,3 +30,11 @@ def missing_arg(arg):
    print("Missing ", arg)
    exit(1)


def get_filepath(filename, directories: list):
    """search directories for file and return the full path to the file"""
    for d in directories:
        p = f"{path.expanduser(d)}/{filename}"
        if path.isfile(p):
            return p
    raise FileNotFoundError(f"{filename} not in {directories}")
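A usage sketch (the filename and the search directories are placeholders; the first directory that actually contains the file wins):

# looks for "default.cfg" in the user config dir first, then a system-wide fallback
cfg = get_filepath("default.cfg", ["~/.config/regina", "/etc/regina"])
print(cfg)  # e.g. "/home/user/.config/regina/default.cfg"; raises FileNotFoundError if absent everywhere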

setup.py
@@ -1,3 +1,4 @@
from matplotlib.pyplot import matplotlib
from setuptools import setup, find_packages

setup(
@@ -12,8 +13,8 @@ setup(

    license="GPLv3",

    packages=find_packages(),
    install_requires=[],
    packages=["regina"],
    install_requires=["matplotlib"],
    python_requires='>=3.10',

    classifiers=[