Changed database structure

This commit is contained in:
matthias@arch 2023-05-11 13:05:01 +02:00
parent ecc75560e3
commit a49f15b9f0
3 changed files with 360 additions and 461 deletions

View File

@ -1,13 +1,13 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?> <?xml version="1.0" encoding="UTF-8" standalone="no"?>
<diagram program="umlet" version="15.1"> <diagram program="umlet" version="15.1">
<zoom_level>10</zoom_level> <zoom_level>8</zoom_level>
<element> <element>
<id>UMLClass</id> <id>UMLClass</id>
<coordinates> <coordinates>
<x>70</x> <x>96</x>
<y>220</y> <y>248</y>
<w>250</w> <w>160</w>
<h>190</h> <h>144</h>
</coordinates> </coordinates>
<panel_attributes>visitor <panel_attributes>visitor
-- --
@ -15,53 +15,35 @@
- visitor_id: INTEGER - visitor_id: INTEGER
-- --
- ip_address: INTEGER - ip_address: INTEGER
- visitor agent string: TEXT - platform_id: INTEGER
- platform: TEXT - browser_id: INTEGER
- browser: TEXT
- mobile: INTEGER - mobile: INTEGER
- is_human: INTEGER - is_human: INTEGER
- range_id: INTEGER - range_id: INTEGER
style=autoresize</panel_attributes>
<additional_attributes/>
</element>
<element>
<id>UMLClass</id>
<coordinates>
<x>640</x>
<y>220</y>
<w>180</w>
<h>100</h>
</coordinates>
<panel_attributes>filegroup
--
&lt;&lt;PK&gt;&gt;
- group_id: INTEGER
--
- group_name: TEXT
style=autoresize</panel_attributes> style=autoresize</panel_attributes>
<additional_attributes/> <additional_attributes/>
</element> </element>
<element> <element>
<id>Relation</id> <id>Relation</id>
<coordinates> <coordinates>
<x>560</x> <x>216</x>
<y>220</y> <y>168</y>
<w>100</w> <w>32</w>
<h>50</h> <h>96</h>
</coordinates> </coordinates>
<panel_attributes>lt=- <panel_attributes>lt=-
m1=n m1=n
m2=1 m2=1
</panel_attributes> </panel_attributes>
<additional_attributes>10.0;20.0;80.0;20.0</additional_attributes> <additional_attributes>10.0;100.0;10.0;10.0</additional_attributes>
</element> </element>
<element> <element>
<id>UMLClass</id> <id>UMLClass</id>
<coordinates> <coordinates>
<x>380</x> <x>352</x>
<y>210</y> <y>240</y>
<w>190</w> <w>152</w>
<h>170</h> <h>136</h>
</coordinates> </coordinates>
<panel_attributes>request <panel_attributes>request
-- --
@ -69,10 +51,10 @@ m2=1
- request_id: INTEGER - request_id: INTEGER
-- --
- visitor_id: INTEGER - visitor_id: INTEGER
- group_id: INTEGER - route_id: INTEGER
- referer_id: INTEGER
-- --
- date: TEXT - time: INTEGER
- referer: TEXT
- status: INTEGER - status: INTEGER
style=autoresize</panel_attributes> style=autoresize</panel_attributes>
<additional_attributes/> <additional_attributes/>
@ -80,81 +62,42 @@ style=autoresize</panel_attributes>
<element> <element>
<id>Relation</id> <id>Relation</id>
<coordinates> <coordinates>
<x>310</x> <x>248</x>
<y>220</y> <y>248</y>
<w>90</w> <w>120</w>
<h>50</h> <h>40</h>
</coordinates> </coordinates>
<panel_attributes>lt=- <panel_attributes>lt=-
m1=1 m1=1
m2=n m2=n
</panel_attributes> </panel_attributes>
<additional_attributes>10.0;20.0;70.0;20.0</additional_attributes> <additional_attributes>10.0;20.0;130.0;20.0</additional_attributes>
</element> </element>
<element> <element>
<id>UMLClass</id> <id>UMLClass</id>
<coordinates> <coordinates>
<x>640</x> <x>16</x>
<y>400</y> <y>96</y>
<w>180</w> <w>160</w>
<h>100</h>
</coordinates>
<panel_attributes>file
--
&lt;&lt;PK&gt;&gt;
- filename: TEXT
--
- group_id: INTEGER
--
style=autoresize</panel_attributes>
<additional_attributes/>
</element>
<element>
<id>Relation</id>
<coordinates>
<x>670</x>
<y>310</y>
<w>40</w>
<h>110</h>
</coordinates>
<panel_attributes>lt=-
m1=n
m2=1
</panel_attributes>
<additional_attributes>10.0;90.0;10.0;10.0</additional_attributes>
</element>
<element>
<id>UMLNote</id>
<coordinates>
<x>490</x>
<y>100</y>
<w>300</w>
<h>70</h>
</coordinates>
<panel_attributes>One group contains multiple files.
Lets you group the images from a
html with the html itself
style=autoresize</panel_attributes>
<additional_attributes/>
</element>
<element>
<id>Relation</id>
<coordinates>
<x>650</x>
<y>160</y>
<w>30</w>
<h>80</h> <h>80</h>
</coordinates> </coordinates>
<panel_attributes>lt=&lt;-</panel_attributes> <panel_attributes>platform
<additional_attributes>10.0;60.0;10.0;10.0</additional_attributes> --
&lt;&lt;PK&gt;&gt;
- platform_id: INTEGER
--
- name: TEXT UNIQUE
--
style=autoresize</panel_attributes>
<additional_attributes/>
</element> </element>
<element> <element>
<id>UMLClass</id> <id>UMLClass</id>
<coordinates> <coordinates>
<x>360</x> <x>328</x>
<y>520</y> <y>488</y>
<w>190</w> <w>152</w>
<h>130</h> <h>104</h>
</coordinates> </coordinates>
<panel_attributes>city <panel_attributes>city
-- --
@ -170,15 +113,15 @@ style=autoresize</panel_attributes>
<element> <element>
<id>UMLClass</id> <id>UMLClass</id>
<coordinates> <coordinates>
<x>620</x> <x>536</x>
<y>520</y> <y>488</y>
<w>120</w> <w>152</w>
<h>110</h> <h>88</h>
</coordinates> </coordinates>
<panel_attributes>country <panel_attributes>country
-- --
&lt;&lt;PK&gt;&gt; &lt;&lt;PK&gt;&gt;
- country_id - country_id: INTEGER
-- --
- name: TEXT - name: TEXT
- code: TEXT - code: TEXT
@ -188,10 +131,10 @@ style=autoresize</panel_attributes>
<element> <element>
<id>Relation</id> <id>Relation</id>
<coordinates> <coordinates>
<x>540</x> <x>472</x>
<y>540</y> <y>504</y>
<w>100</w> <w>80</w>
<h>50</h> <h>40</h>
</coordinates> </coordinates>
<panel_attributes>lt=- <panel_attributes>lt=-
m1=1 m1=1
@ -202,10 +145,10 @@ m2=n
<element> <element>
<id>Relation</id> <id>Relation</id>
<coordinates> <coordinates>
<x>280</x> <x>264</x>
<y>540</y> <y>504</y>
<w>100</w> <w>80</w>
<h>50</h> <h>40</h>
</coordinates> </coordinates>
<panel_attributes>lt=- <panel_attributes>lt=-
m1=1 m1=1
@ -216,10 +159,10 @@ m2=n
<element> <element>
<id>UMLClass</id> <id>UMLClass</id>
<coordinates> <coordinates>
<x>120</x> <x>136</x>
<y>520</y> <y>488</y>
<w>170</w> <w>136</w>
<h>130</h> <h>104</h>
</coordinates> </coordinates>
<panel_attributes>ip_range <panel_attributes>ip_range
-- --
@ -235,15 +178,111 @@ style=autoresize</panel_attributes>
<element> <element>
<id>Relation</id> <id>Relation</id>
<coordinates> <coordinates>
<x>170</x> <x>176</x>
<y>400</y> <y>384</y>
<w>40</w> <w>32</w>
<h>140</h> <h>120</h>
</coordinates> </coordinates>
<panel_attributes>lt=- <panel_attributes>lt=-
m1=1 m1=1
m2=n m2=n
</panel_attributes> </panel_attributes>
<additional_attributes>10.0;120.0;10.0;10.0</additional_attributes> <additional_attributes>10.0;130.0;10.0;10.0</additional_attributes>
</element>
<element>
<id>UMLClass</id>
<coordinates>
<x>576</x>
<y>264</y>
<w>144</w>
<h>80</h>
</coordinates>
<panel_attributes>route
--
&lt;&lt;PK&gt;&gt;
- route_id: INTEGER
--
- name: TEXT UNIQUE
--
style=autoresize</panel_attributes>
<additional_attributes/>
</element>
<element>
<id>UMLClass</id>
<coordinates>
<x>208</x>
<y>96</y>
<w>152</w>
<h>80</h>
</coordinates>
<panel_attributes>browser
--
&lt;&lt;PK&gt;&gt;
- browser_id: INTEGER
--
- name: TEXT UNIQUE
--
style=autoresize</panel_attributes>
<additional_attributes/>
</element>
<element>
<id>Relation</id>
<coordinates>
<x>144</x>
<y>168</y>
<w>32</w>
<h>96</h>
</coordinates>
<panel_attributes>lt=-
m1=n
m2=1
</panel_attributes>
<additional_attributes>10.0;100.0;10.0;10.0</additional_attributes>
</element>
<element>
<id>UMLClass</id>
<coordinates>
<x>392</x>
<y>96</y>
<w>152</w>
<h>80</h>
</coordinates>
<panel_attributes>referer
--
&lt;&lt;PK&gt;&gt;
- referer_id: INTEGER
--
- name: TEXT UNIQUE
--
style=autoresize</panel_attributes>
<additional_attributes/>
</element>
<element>
<id>Relation</id>
<coordinates>
<x>400</x>
<y>168</y>
<w>32</w>
<h>88</h>
</coordinates>
<panel_attributes>lt=-
m1=n
m2=1
</panel_attributes>
<additional_attributes>10.0;90.0;10.0;10.0</additional_attributes>
</element>
<element>
<id>Relation</id>
<coordinates>
<x>496</x>
<y>288</y>
<w>96</w>
<h>40</h>
</coordinates>
<panel_attributes>lt=-
m1=n
m2=1
</panel_attributes>
<additional_attributes>10.0;20.0;100.0;20.0</additional_attributes>
</element> </element>
</diagram> </diagram>

View File

@ -5,8 +5,18 @@ from os import path, listdir
import pkg_resources import pkg_resources
import re import re
from datetime import datetime as dt from datetime import datetime as dt
if __name__ == "__main__": # make relative imports work as described here: https://peps.python.org/pep-0366/#proposed-change
if __package__ is None:
__package__ = "regina"
import sys
from os import path
filepath = path.realpath(path.abspath(__file__))
print(path.dirname(path.dirname(path.dirname(filepath))))
sys.path.insert(0, path.dirname(path.dirname(path.dirname(filepath))))
# local # local
from .utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max from .utility.sql_util import replace_null, sanitize, sql_select, sql_exists
from .utility.utility import pdebug, get_filepath, warning, pmessage from .utility.utility import pdebug, get_filepath, warning, pmessage
from .utility.globals import settings from .utility.globals import settings
from .db_operation.request import Request from .db_operation.request import Request
@ -16,97 +26,6 @@ from .utility.globals import visitor_agent_operating_systems, visitor_agent_brow
create reginas database as shown in the uml diagram database.uxf create reginas database as shown in the uml diagram database.uxf
""" """
class Entry:
"""
represents an sql entry
type_ is INTEGER, TEXT, REAL...
"""
def __init__(self, name, type_) -> None:
self.name = name
self.type_ = type_
def __repr__(self):
return f"[{self.name}] {self.type_}"
class Table:
def __init__(self, name, key: Entry, entries: list[Entry]=[], constaints: list[str]=[]):
self.name = name
self.key = key
self.entries = entries
self.constaints = constaints
def create_sql_str(self):
return f"CREATE TABLE IF NOT EXISTS {self.name}\n({self})\n"
def __repr__(self):
s = f"{self.key} PRIMARY KEY"
for entry in self.entries:
s += f", {entry}"
for c in self.constaints:
s += f", {c}"
return s
t_request = "request"
t_file = "file"
t_filegroup = "filegroup"
t_visitor = "visitor"
t_city = "city"
t_country = "country"
t_ip_range = "ip_range"
visitor_id = Entry("visitor_id", "INTEGER")
request_id = Entry("request_id", "INTEGER")
filegroup_id = Entry("group_id", "INTEGER")
ip_address_entry = Entry("ip_address", "INTEGER")
filename_entry = Entry("filename", "TEXT")
city_id = Entry("city_id", "INTEGER")
country_id = Entry("country_id", "INTEGER")
ip_range_id = Entry("ip_range_id", "INTEGER")
database_tables = {
t_visitor: Table(t_visitor, visitor_id, [
Entry("ip_address", "INTEGER"),
Entry("visitor_agent", "TEXT"),
Entry("platform", "TEXT"),
Entry("browser", "TEXT"),
Entry("mobile", "INTEGER"),
Entry("is_human", "INTEGER"),
ip_range_id,
],
[f"UNIQUE({visitor_id.name})"]),
t_file: Table(t_file, filename_entry,
[filegroup_id],
[f"UNIQUE({filename_entry.name})"]),
t_filegroup: Table(t_filegroup, filegroup_id,
[Entry("groupname", "TEXT")],
[f"UNIQUE({filegroup_id.name})"]),
t_request: Table(t_request, request_id, [
visitor_id,
filegroup_id,
Entry("date", "INTEGER"),
Entry("referer", "TEXT"),
Entry("status", "INTEGER")
],
["UNIQUE(request_id)"]),
t_ip_range: Table(t_ip_range, ip_range_id, [
Entry("lower", "INTEGER"),
Entry("upper", "INTEGER"),
city_id,
],
[f"UNIQUE({ip_range_id.name})"]),
t_city: Table(t_city, city_id, [
country_id,
Entry("name", "TEXT"),
Entry("region", "TEXT"),
],
[f"UNIQUE({city_id.name})"]),
t_country: Table(t_country, country_id, [
Entry("name", "TEXT"),
Entry("code", "TEXT"),
],
[f"UNIQUE({country_id.name})"]),
}
class Database: class Database:
def __init__(self, database_path): def __init__(self, database_path):
self.conn = sql.connect(database_path) self.conn = sql.connect(database_path)
@ -118,6 +37,7 @@ class Database:
with open(pkg_resources.resource_filename("regina", "sql/create_db.sql"), "r") as file: with open(pkg_resources.resource_filename("regina", "sql/create_db.sql"), "r") as file:
create_db = file.read() create_db = file.read()
self.cur.execute(create_db) self.cur.execute(create_db)
self.conn.commit()
def __call__(self, s): def __call__(self, s):
"""execute a command and return fetchall()""" """execute a command and return fetchall()"""
@ -127,42 +47,27 @@ class Database:
# #
# VISITOR # VISITOR
# #
def visitor_exists(self, request) -> bool:
if settings["hash_ip_address"]:
ip_address = hash(request.ip_address)
else:
ip_address = request.ip_address
if settings["unique_visitor_is_ip_address"]:
return sql_exists(self.cur, t_visitor, [("ip_address", ip_address)])
else:
return sql_exists(self.cur, t_visitor, [("ip_address", ip_address), ("visitor_agent", request.visitor_agent)])
def is_visitor_human(self, visitor_id: int): def is_visitor_human(self, visitor_id: int):
""" """
check if they have a known platform AND browser check if they have a known platform AND browser
check if at least one request did not result in an error (http status >= 400) if settings "human_needs_success": check if at least one request did not result in an error (http status >= 400)
""" """
max_success_status = 400 max_success_status = 400
if settings["status_300_is_success"]: max_success_status = 300 if settings["status_300_is_success"]: max_success_status = 300
self.cur.execute(f"SELECT browser, platform FROM {t_visitor} WHERE visitor_id = {visitor_id}") self.cur.execute(f"SELECT browser_id, platform_id FROM visitor WHERE visitor_id = {visitor_id}")
browsers_and_platforms = self.cur.fetchall() browsers_and_platforms = self.cur.fetchall()
if len(browsers_and_platforms) != 1: if len(browsers_and_platforms) != 1:
pdebug(f"is_visitor_human: {visitor_id} - could not find visitor or found too many") pdebug(f"is_visitor_human: {visitor_id} - could not find visitor or found too many")
return False return False
if not browsers_and_platforms[0][0] in visitor_agent_browsers: browser = self.get_name("browser", browsers_and_platforms[0][0])
if not browser in visitor_agent_browsers:
return False return False
if not browsers_and_platforms[0][1] in visitor_agent_operating_systems: platform = self.get_name("platform", browsers_and_platforms[0][1])
if not platform in visitor_agent_operating_systems:
return False return False
# check if has browser
# self.cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_visitor} WHERE visitor_id = {visitor_id} AND platform IS NOT NULL AND browser IS NOT NULL)")
# if no browser and platform
# exists = self.cur.fetchone()
# if exists is None or exists[0] == 0:
# return False
# if human needs successful request
if settings["human_needs_success"]: if settings["human_needs_success"]:
# check if at least request was successful (status < 400) # check if at least request was successful (status < 400)
self.cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_request} WHERE visitor_id = {visitor_id} AND status < {max_success_status})") self.cur.execute(f"SELECT EXISTS (SELECT 1 FROM request WHERE visitor_id = {visitor_id} AND status < {max_success_status})")
if self.cur.fetchone()[0] == 1: if self.cur.fetchone()[0] == 1:
# pdebug(f"is_visitor_human: Visitor {visitor_id} is human") # pdebug(f"is_visitor_human: Visitor {visitor_id} is human")
pass pass
@ -171,67 +76,85 @@ class Database:
return False return False
return True return True
def get_visitor_id(self, request: Request, insert=True) -> int | None:
    """
    Return the visitor_id of the visitor that made `request`.

    The visitor is looked up by ip_address only if settings["unique_visitor_is_ip_address"]
    is set, otherwise by ip_address plus the browser_id and platform_id (where known).
    If settings["hash_ip_address"] is set, the hashed ip address is used instead of the plain one.

    :param request: the request whose visitor is looked up
    :param insert:  if True, an unknown browser, platform or visitor is added to the database
    :returns:       the visitor_id, or None if the visitor does not exist and insert is False
    """
    if settings["hash_ip_address"]:
        ip_address = hash(request.ip_address)
    else:
        ip_address = request.ip_address

    # get_id only returns None when insert is False and the name is unknown
    browser_id: int | None = self.get_id("browser", request.get_browser(), insert=insert)
    platform_id: int | None = self.get_id("platform", request.get_platform(), insert=insert)

    constraints = [("ip_address", ip_address)]
    if not settings["unique_visitor_is_ip_address"]:
        if browser_id: constraints.append(("browser_id", browser_id))
        if platform_id: constraints.append(("platform_id", platform_id))

    require_update_is_human = False
    if not sql_exists(self.cur, "visitor", constraints):
        if not insert:
            return None
        require_update_is_human = True
        is_mobile = int(request.get_mobile())
        ip_range_id = 0
        if settings["get_visitor_location"]:
            ip_range_id = self.get_ip_range_id(request.ip_address)
        is_human = 0  # is_visitor_human cannot be called until visitor is in db
        # The column list must match the six values one-to-one; it used to name
        # ip_range_id twice, which makes SQLite reject the statement.
        # NOTE(review): the UML diagram calls the mobile column "mobile", not "is_mobile" - confirm against sql/create_db.sql
        self.cur.execute(f"INSERT INTO visitor (ip_address, ip_range_id, platform_id, browser_id, is_mobile, is_human) VALUES ('{ip_address}', '{ip_range_id}', '{platform_id}', '{browser_id}', '{is_mobile}', '{is_human}');")

    visitor_id = sql_select(self.cur, "visitor", constraints)[0][0]
    # TODO: if requests are not added yet, visitor might not be recognized since it does not have a successful request yet
    if require_update_is_human:
        is_human = self.is_visitor_human(visitor_id)
        if is_human:
            self.cur.execute(f"UPDATE visitor SET is_human = 1 WHERE visitor_id = {visitor_id}")
    return visitor_id
# #
# REQUEST # REQUEST
# #
def request_exists(self, request: Request, visitor_id: int, route_id: int):
    """
    Check whether the same visitor already requested the same route on the same day.

    The check is only performed if settings["request_is_same_on_same_day"] is True,
    otherwise this always returns False.

    :param request:    the new request, its time_local is compared against the stored requests
    :param visitor_id: id of the visitor that made the request
    :param route_id:   id of the requested route
    :returns:          True if a stored request of this visitor to this route falls on the same day
    """
    if not settings["request_is_same_on_same_day"]: return False
    # get all requests from same visitor to same route
    # (the WHERE clause used to read "AND = route_id = ...", which is a syntax error)
    self.cur.execute(f"SELECT request_id, time FROM request WHERE visitor_id = '{visitor_id}' AND route_id = '{route_id}'")
    # compare the calendar days of the timestamps
    day_of_request = dt.fromtimestamp(request.time_local).strftime("%Y-%m-%d")
    for request_id, timestamp in self.cur.fetchall():
        if dt.fromtimestamp(timestamp).strftime("%Y-%m-%d") == day_of_request:
            pdebug(f"request_exists: Request is on same day as request {request_id}")
            return True
    return False
def add_request(self, request: Request) -> (int | None):
    """
    Add `request` to the request table unless request_exists reports it as already present.

    The visitor, referer and route of the request are looked up first
    and added to the database if they are not known yet.

    :returns: the visitor_id if a new request was added, else None
    """
    visitor_id = self.get_visitor_id(request)
    self.conn.commit()
    referer_id = self.get_id("referer", request.referer)
    route_id = self.get_id("route", request.route)

    # only unique requests are stored
    if self.request_exists(request, visitor_id, route_id):
        return None

    self.cur.execute(f"INSERT INTO request (visitor_id, route_id, referer_id, time, status) VALUES ({visitor_id}, {route_id}, {referer_id}, {request.time_local}, {request.status})")
    return visitor_id
def add_requests(self, requests: list[Request]): def add_requests(self, requests: list[Request]):
@ -246,53 +169,50 @@ class Database:
# update the is_human column for all new visitors # update the is_human column for all new visitors
for visitor_id in new_visitors: for visitor_id in new_visitors:
if not sql_exists(self.cur, t_visitor, [(str(visitor_id), "visitor_id")]): continue # TODO this does not look right
is_human = self.is_visitor_human(visitor_id) if not sql_exists(self.cur, "visitor", [("visitor_id", visitor_id)]): continue
self.cur.execute(f"SELECT * FROM {t_visitor} WHERE visitor_id = {visitor_id}")
# pdebug(f"add_rq_to_db: {visitor_id} is_human? {is_human}, {self.cur.fetchall()}") # pdebug(f"add_rq_to_db: {visitor_id} is_human? {is_human}, {self.cur.fetchall()}")
if is_human:
self.cur.execute(f"UPDATE {t_visitor} SET is_human = 1 WHERE visitor_id = {visitor_id}")
self.conn.commit() self.conn.commit()
pmessage(f"Collection Summary: Added {len(new_visitors)} new visitors and {added_requests} new requests.") pmessage(f"Collection Summary: Added {len(new_visitors)} new visitors and {added_requests} new requests.")
#
# FILE(GROUP)
#
def get_id(self, table: str, name: str, insert=True) -> int | None:
    """
    Look up the id that belongs to `name` in `table`.

    If the name is not in the table yet:
        insert is True:  the name is added and the new id is returned
        insert is False: None is returned
    supported tables: platform, browser, referer, route, city

    :raises ValueError: if `table` is not one of the supported tables
    """
    supported_tables = ["platform", "browser", "referer", "route", "city"]
    if table not in supported_tables:
        raise ValueError(f"table '{table}' is not supported ({supported_tables})")

    # the name is cleaned with replace_null and sanitize before it is put into the sql strings
    clean_name = sanitize(replace_null(name))
    known = sql_exists(self.cur, table, [("name", clean_name)])
    if not known:
        if not insert:
            return None
        self.cur.execute(f"INSERT INTO {table} (name) VALUES ('{clean_name}')")

    rows = self(f"SELECT {table}_id FROM {table} WHERE name = '{clean_name}'")
    return rows[0][0]
def get_name(self, table: str, id_: int) -> (str | None):
"""
get the name of id in table
if id is not in table, returns None
supported tables: platform, browser, referer, route, city
"""
supported_tables = ["platform", "browser", "referer", "route", "city"]
if not table in supported_tables: raise ValueError(f"table '{table}' is not supported ({supported_tables})")
ret = self(f"SELECT name FROM {table} WHERE {table}_id = '{id_}'")
if len(ret) == 0: return None
# TODO check if this returns tuple or value
return ret[0]
# #
# GEOIP # GEOIP
# #
def get_ip_range_id(self, ip_address: int): def get_ip_range_id(self, ip_address: int) -> int:
self.cur.execute(f"SELECT {ip_range_id.name} FROM {t_ip_range} WHERE '{ip_address}' BETWEEN lower AND upper") results = self(f"SELECT ip_range_id FROM ip_range WHERE '{ip_address}' BETWEEN low AND high")
results = self.cur.fetchall()
ip_range_id_val = 0 ip_range_id_val = 0
if len(results) == 0: if len(results) == 0:
pass pass
@ -302,174 +222,114 @@ class Database:
ip_range_id_val = results[0][0] ip_range_id_val = results[0][0]
return ip_range_id_val return ip_range_id_val
def update_ip_range_id(self, visitor_id: int):
    """
    Recompute the ip_range_id column of the visitor with `visitor_id`.

    The ip address stored for the visitor is passed to get_ip_range_id and the result is written back.
    Nothing is changed (only a warning is issued) unless exactly one visitor row is found.
    """
    rows = self(f"SELECT ip_address FROM visitor WHERE visitor_id = {visitor_id}")
    # sanity checks: exactly one row is expected
    if len(rows) == 0:
        warning(f"update_ip_range_id: Invalid visitor_id={visitor_id}")
        return
    if len(rows) > 1:
        warning(f"update_ip_range_id: Found multiple ip_addresses for visitor_id={visitor_id}: results={rows}")
        return

    ip_address = rows[0][0]
    self.cur.execute(f"UPDATE visitor SET ip_range_id = '{self.get_ip_range_id(ip_address)}' WHERE visitor_id = '{visitor_id}'")
def get_country_id(self, name, code) -> int:
    """
    Return the country_id of the country called `name`.

    If no country with that name is in the country table, it is inserted (together with `code`) first.
    If the id can still not be selected afterwards, a warning is issued and 0 is returned.
    """
    country_known = sql_exists(self.cur, "country", [("name", name)])
    if not country_known:
        self.cur.execute(f"INSERT INTO country (name, code) VALUES ('{name}', '{code}')")

    rows = self(f"SELECT country_id FROM country WHERE name = '{name}'")
    if len(rows) == 0:
        warning(f"get_country_id: Could not get country_id for name='{name}'.")
        return 0

    country_id_val = rows[0][0]
    assert(type(country_id_val) == int)
    return country_id_val
def get_city_id(self, name, region, country_id) -> int:
    """
    Return the city_id of the city identified by `name`, `region` and `country_id`.

    If no such city is in the city table, it is inserted first.
    If the id can still not be selected afterwards, a warning is issued and 0 is returned.
    """
    criteria = [("name", name), ("region", region), ("country_id", country_id)]
    if not sql_exists(self.cur, "city", criteria):
        self.cur.execute(f"INSERT INTO city (name, region, country_id) VALUES ('{name}', '{region}', '{country_id}')")

    rows = sql_select(self.cur, "city", criteria)
    if len(rows) == 0:
        warning(f"get_city_id: Could not get city_id for name='{name}', region='{region}' and country_id='{country_id}'.")
        return 0

    city_id_val = rows[0][0]
    assert(type(city_id_val) == int)
    return city_id_val
def get_auto_filegroup_str(location_and_dirs:list[tuple[str, str]], auto_group_filetypes:list[str]) -> str: def update_geoip_tables(self, geoip_city_csv_path: str):
""" """
:param list of nginx locations and the corresponding directories update the geoip data with the contents of the geoip_city_csv file
:param auto_filetype_groups list of filetypes for auto grouping
"""
files: list[str] = []
start_i = 0
if len(location_and_dirs) > 0 and len(location_and_dirs[0]) == 2:
for location, dir_ in location_and_dirs:
get_files_from_dir_rec(dir_, files)
# replace dir_ with location, eg /www/website with /
for i in range(start_i, len(files)):
files[i] = files[i].replace(dir_, location).replace("//", "/")
filegroups = ""
# create groups for each filetype
for ft in auto_group_filetypes:
filegroups += f"{ft}:"
for file in files:
if file.endswith(f".{ft}"):
filegroups += f"{file},"
filegroups = filegroups.strip(",") + ";"
pdebug("get_auto_filegroup_str: found files:", files, "filegroups_str:", filegroups)
return filegroups
def get_country_id(cur:sql.Cursor, name, code, country_tablesize): Make sure to update the visitor.ip_range_id column for all visitors.
# countries = sql_select(cur, t_country, [("name", name)]) In case something changed, they might point to a different city. (won't fix)
cur.execute(f"SELECT {country_id.name} FROM {t_country} WHERE name = '{name}'") """
countries = cur.fetchall() # indices for the csv
if len(countries) > 0: FROM = 0; TO = 1; CODE = 2; COUNTRY = 3; REGION = 4; CITY = 5
country_id_val = countries[0][0] with open(geoip_city_csv_path, 'r') as file:
else: # insert new country csv = reader(file, delimiter=',', quotechar='"')
country_id_val = country_tablesize # execute only if file could be opened
# pdebug(f"update_geoip_tables: Adding country #{country_id_val}, name={name}") # delete all previous data
cur.execute(f"INSERT INTO {t_country} ({country_id.name}, name, code) VALUES ({country_id_val}, '{name}', '{code}')") self.cur.execute(f"DELETE FROM ip_range")
country_tablesize += 1 self.cur.execute(f"DELETE FROM city")
return country_id_val, country_tablesize self.cur.execute(f"DELETE FROM country")
self.cur.execute(f"VACUUM")
def get_city_id(cur: sql.Cursor, name, region, country_id, city_tablesize): # guarantees that unkown city/country will have id 0
# cities = sql_select(cur, t_city, [("name", name)]) self.cur.execute(f"INSERT INTO country (country_id, name, code) VALUES (0, 'Unknown', 'XX') ")
cur.execute(f"SELECT {city_id.name} FROM {t_city} WHERE name = '{name}'") self.cur.execute(f"INSERT INTO city (city_id, name, region) VALUES (0, 'Unknown', 'Unkown') ")
cities = cur.fetchall() print(f"Recreating the geoip database from {geoip_city_csv_path}. This might take a long time...")
if len(cities) > 0:
city_id_val = cities[0][0]
else: # insert new city
city_id_val = city_tablesize
# pdebug(f"update_geoip_tables: Adding city #{city_id_val}, name={row[CITY]}, country={country_id_val}")
cur.execute(f"INSERT INTO {t_city} ({city_id.name}, name, region, country_id) VALUES ({city_id_val}, '{name}', '{region}', '{country_id}')")
city_tablesize += 1
return city_id_val, city_tablesize
def update_geoip_tables(cur: sql.Cursor, geoip_city_csv: str): # for combining city ranges into a 'City in <Country>' range
FROM = 0; TO = 1; CODE = 2; COUNTRY = 3; REGION = 4; CITY = 5 # country_id for the range that was last added (for combining multiple csv rows in one ip_range)
ip_range_id_val = 0 RANGE_DONE = -1
with open(geoip_city_csv, 'r') as file: combine_range_country_id = RANGE_DONE
# delete all previous data combine_range_country_name = ""
cur.execute(f"DELETE FROM {t_ip_range}") combine_range_low = RANGE_DONE
cur.execute(f"VACUUM") combine_range_high = RANGE_DONE
csv = reader(file, delimiter=',', quotechar='"')
def add_range(low, high, city_name, region, country_id):
city_id = self.get_city_id(city_name, region, country_id)
pdebug(f"update_ip_range_id: Adding range for city={city_name}, country_id={country_id}, low={low}, high={high}")
self.cur.execute(f"INSERT INTO ip_range (low, high, city_id) VALUES ({low}, {high}, {city_id})")
for row in csv:
# these might contain problematic characters (')
row[CITY] = sanitize(row[CITY])
row[COUNTRY] = sanitize(row[COUNTRY])
row[REGION] = sanitize(row[REGION])
# guarantees that unknown city/country will have id 0 # make sure country exists
if not sql_exists(cur, t_country, [("name", "Unknown")]): country_id = self.get_country_id(row[COUNTRY], row[CODE])
cur.execute(f"INSERT INTO {t_country} ({country_id.name}, name, code) VALUES (0, 'Unknown', 'XX') ") # only add cities for countries the user is interested in
if not sql_exists(cur, t_city, [("name", "Unknown")]): if row[CODE] in settings["get_cities_for_countries"]:
cur.execute(f"INSERT INTO {t_city} ({city_id.name}, name, region) VALUES (0, 'Unknown', 'Unkown') ") add_range(row[FROM], row[TO], row[CITY], row[REGION], country_id)
country_tablesize = sql_tablesize(cur, t_country) else:
city_tablesize = sql_tablesize(cur, t_city) # if continuing
print(f"Recreating the geoip database from {geoip_city_csv}. This might take a long time...") if combine_range_country_id != RANGE_DONE:
combine_range_country_id = 0 # if continuing previous range, extend the upper range limit
combine_range_lower = -1 if combine_range_country_id == country_id:
combine_range_upper = -1 combine_range_high = row[TO]
combine_range_country_name = "" else: # new range for country, append
for row in csv: add_range(combine_range_low, combine_range_high, f"City in {combine_range_country_name}", f"Region in {combine_range_country_name}", combine_range_country_id)
# these might contain problematic characters (') combine_range_country_id = RANGE_DONE
row[CITY] = sanitize(row[CITY]) # not elif, this has to be executed if previous else was executed
row[COUNTRY] = sanitize(row[COUNTRY]) if combine_range_country_id == RANGE_DONE : # currently in new range, combine with later ranges
row[REGION] = sanitize(row[REGION]) combine_range_country_id = country_id
combine_range_country_name = row[COUNTRY]
# make sure country exists combine_range_low = row[FROM]
country_id_val, country_tablesize = get_country_id(cur, row[COUNTRY], row[CODE], country_tablesize) combine_range_high = row[TO]
if row[CODE] in settings["get_cities_for_countries"]: if combine_range_country_id >= 0: # last range , append
# make sure city exists add_range(combine_range_low, combine_range_high, f"City in {combine_range_country_name}", f"Region in {combine_range_country_name}", combine_range_country_id)
city_id_val, city_tablesize = get_city_id(cur, row[CITY], row[REGION], country_id_val, city_tablesize)
pdebug(f"update_ip_range_id: ip_range_id={ip_range_id_val}, Adding range for city={row[CITY]}, country={row[COUNTRY]}, lower={row[FROM]}, upper={row[TO]}")
cur.execute(f"INSERT INTO {t_ip_range} ({ip_range_id.name}, lower, upper, {city_id.name}) VALUES ({ip_range_id_val}, {row[FROM]}, {row[TO]}, {city_id_val})")
ip_range_id_val += 1
else:
if combine_range_country_id >= 0:
if combine_range_country_id == country_id_val: combine_range_upper = row[TO]
else: # new range for country, append
# get id for dummy city
pdebug(f"update_ip_range_id: ip_range_id={ip_range_id_val}, Adding combined range for country={combine_range_country_name}, lower={combine_range_lower}, upper={combine_range_upper}")
city_id_val, city_tablesize = get_city_id(cur, f"City in {combine_range_country_name}", f"Region in {combine_range_country_name}", combine_range_country_id, city_tablesize)
cur.execute(f"INSERT INTO {t_ip_range} ({ip_range_id.name}, lower, upper, {city_id.name}) VALUES ({ip_range_id_val}, {combine_range_lower}, {combine_range_upper}, {city_id_val})")
ip_range_id_val += 1
combine_range_country_id = -1
if combine_range_country_id < 0 : # combine with later ranges
combine_range_country_id = country_id_val
combine_range_lower = row[FROM]
combine_range_upper = row[TO]
combine_range_country_name = row[COUNTRY]
if combine_range_country_id >= 0: # last range , append
# get id for dummy city
pdebug(f"update_ip_range_id: ip_range_id={ip_range_id_val}, Adding combined range for country={combine_range_country_name}, lower={combine_range_lower}, upper={combine_range_upper}")
city_id_val, city_tablesize = get_city_id(cur, f"City in {combine_range_country_name}", f"Region in {combine_range_country_name}", combine_range_country_id, city_tablesize)
cur.execute(f"INSERT INTO {t_ip_range} ({ip_range_id.name}, lower, upper, {city_id.name}) VALUES ({ip_range_id_val}, {combine_range_lower}, {combine_range_upper}, {city_id_val})")
ip_range_id_val += 1
def create_db(db_name, filegroup_str="", location_and_dirs:list[tuple[str, str]]=[], auto_group_filetypes=[]):
    """
    Create the database `db_name` and fill it with its initial structure.

    Runs the create statement of every table in `database_tables`, then
    creates the filegroups described by `filegroup_str` combined with the
    automatically generated filegroup string.

    :param db_name: name/path of the database, handed to `sql.connect`
    :param filegroup_str: user-defined filegroup string; surrounding ';' and spaces are stripped before use
    :param location_and_dirs: forwarded unchanged to `get_auto_filegroup_str` (presumably (location, directory) pairs - TODO confirm)
    :param auto_group_filetypes: forwarded unchanged to `get_auto_filegroup_str`
    """
    print(f"creating database: '{db_name}'")
    conn = sql.connect(f"{db_name}")
    # try/finally: the connection and cursor must not leak if a create
    # statement or the filegroup creation raises
    try:
        cursor = conn.cursor()
        try:
            for table in database_tables.values():
                cursor.execute(table.create_sql_str())
            # user-defined groups first, then the automatically detected ones, separated by ';'
            filegroup_str = filegroup_str.strip("; ") + ";" + get_auto_filegroup_str(location_and_dirs, auto_group_filetypes)
            create_filegroups(cursor, filegroup_str)
        finally:
            cursor.close()
        # only reached when everything above succeeded
        conn.commit()
    finally:
        conn.close()
if __name__ == '__main__': if __name__ == '__main__':
create_db("test.db") db = Database("test.db")

View File

@ -13,7 +13,7 @@ class Request:
def __init__(self, ip_address="", time_local="", request_type="", request_file="", request_protocol="", status="", bytes_sent="", referer="", visitor_agent=""): def __init__(self, ip_address="", time_local="", request_type="", request_file="", request_protocol="", status="", bytes_sent="", referer="", visitor_agent=""):
self.ip_address = int(IPv4Address(sanitize(ip_address))) self.ip_address = int(IPv4Address(sanitize(ip_address)))
self.time_local = 0 self.time_local = 0
#[20/Nov/2022:00:47:36 +0100] # turn [20/Nov/2022:00:47:36 +0100] to unix time
m = match(r"\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\]", time_local) m = match(r"\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\]", time_local)
if m: if m:
g = m.groups() g = m.groups()
@ -29,7 +29,7 @@ class Request:
else: else:
warning(f"Request:__init__: Could not match time: '{time_local}'") warning(f"Request:__init__: Could not match time: '{time_local}'")
self.request_type = sanitize(request_type) self.request_type = sanitize(request_type)
self.request_file = sanitize(request_file) self.request_route = sanitize(request_file)
self.request_protocol = sanitize(request_protocol) self.request_protocol = sanitize(request_protocol)
self.status = sanitize(status) self.status = sanitize(status)
self.bytes_sent = sanitize(bytes_sent) self.bytes_sent = sanitize(bytes_sent)
@ -37,9 +37,9 @@ class Request:
self.visitor_agent = sanitize(visitor_agent) self.visitor_agent = sanitize(visitor_agent)
def __repr__(self): def __repr__(self):
return f"{self.ip_address} - {self.time_local} - {self.request_file} - {self.visitor_agent} - {self.status}" return f"{self.ip_address} - {self.time_local} - {self.request_route} - {self.visitor_agent} - {self.status}"
def get_os(self): def get_platform(self):
# for groups in findall(re_visitor_agent, visitor_agent): # for groups in findall(re_visitor_agent, visitor_agent):
operating_system = "" operating_system = ""
for os in visitor_agent_operating_systems: for os in visitor_agent_operating_systems: