2022-12-14 19:15:43 +01:00

287 lines
12 KiB
Python

# from sys import path
import sqlite3 as sql
from csv import reader
from os import path, listdir
# local
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
from regina.utility.utility import pdebug
from regina.utility.globals import settings
"""
Create regina's database as shown in the UML diagram database.uxf.
"""
class Entry:
    """
    One column of an sql table.

    name is the column name,
    type_ is the sql column type: INTEGER, TEXT, REAL...
    """
    def __init__(self, name, type_) -> None:
        self.name, self.type_ = name, type_

    def __repr__(self):
        # rendered like a column definition, eg "[visitor_id] INTEGER"
        return "[{}] {}".format(self.name, self.type_)
class Table:
    """
    Describes an sql table.

    name:       name of the table
    key:        the column used as PRIMARY KEY
    entries:    all further columns, in the order they appear in the table
    constaints: table constraints as sql text, eg "UNIQUE(visitor_id)"
                (the parameter name is misspelled, it is kept for callers
                that pass it by keyword)
    """
    def __init__(self, name, key: "Entry", entries: "list[Entry] | None" = None, constaints: "list[str] | None" = None):
        self.name = name
        self.key = key
        # create a fresh list per instance: a `[]` default would be one list
        # object shared by every table created without the argument
        self.entries = [] if entries is None else entries
        self.constaints = [] if constaints is None else constaints

    def create_sql_str(self):
        """Return the CREATE TABLE statement for this table."""
        return f"CREATE TABLE IF NOT EXISTS {self.name}\n({self})\n"

    def __repr__(self):
        # comma separated column definitions and constraints, key first
        parts = [f"{self.key} PRIMARY KEY"]
        parts.extend(f"{entry}" for entry in self.entries)
        parts.extend(f"{c}" for c in self.constaints)
        return ", ".join(parts)
# names of the tables in the database
t_request = "request"
t_file = "file"
t_filegroup = "filegroup"
t_visitor = "visitor"
t_city = "city"
t_country = "country"
t_ip_range = "ip_range"

# columns that are shared between table definitions (as key of one table
# and/or as reference in another one)
visitor_id = Entry("visitor_id", "INTEGER")
request_id = Entry("request_id", "INTEGER")
# note: the column is called "group_id", not "filegroup_id"
filegroup_id = Entry("group_id", "INTEGER")
# NOTE(review): declared as TEXT here, while the visitor table defines its
# own "ip_address" column as INTEGER - confirm which one is intended
ip_address_entry = Entry("ip_address", "TEXT")
filename_entry = Entry("filename", "TEXT")
city_id = Entry("city_id", "INTEGER")
country_id = Entry("country_id", "INTEGER")
ip_range_id = Entry("ip_range_id", "INTEGER")
# definition of every table, by table name.
# create_db creates the tables in the order they are listed here
database_tables = {
    # one row per distinct visitor
    t_visitor: Table(
        t_visitor,
        visitor_id,
        entries=[
            Entry("ip_address", "INTEGER"),
            Entry("visitor_agent", "TEXT"),
            Entry("platform", "TEXT"),
            Entry("browser", "TEXT"),
            Entry("mobile", "INTEGER"),
            Entry("is_human", "INTEGER"),
            ip_range_id,
        ],
        constaints=[f"UNIQUE({visitor_id.name})"],
    ),
    # maps a filename to the group it belongs to
    t_file: Table(
        t_file,
        filename_entry,
        entries=[filegroup_id],
        constaints=[f"UNIQUE({filename_entry.name})"],
    ),
    t_filegroup: Table(
        t_filegroup,
        filegroup_id,
        entries=[Entry("groupname", "TEXT")],
        constaints=[f"UNIQUE({filegroup_id.name})"],
    ),
    # one row per request, references the visitor and the requested filegroup
    t_request: Table(
        t_request,
        request_id,
        entries=[
            visitor_id,
            filegroup_id,
            Entry("date", "INTEGER"),
            Entry("referer", "TEXT"),
            Entry("status", "INTEGER"),
        ],
        constaints=[f"UNIQUE({request_id.name})"],
    ),
    # ip range [lower, upper] and the city it is assigned to
    t_ip_range: Table(
        t_ip_range,
        ip_range_id,
        entries=[
            Entry("lower", "INTEGER"),
            Entry("upper", "INTEGER"),
            city_id,
        ],
        constaints=[f"UNIQUE({ip_range_id.name})"],
    ),
    t_city: Table(
        t_city,
        city_id,
        entries=[
            country_id,
            Entry("name", "TEXT"),
            Entry("region", "TEXT"),
        ],
        constaints=[f"UNIQUE({city_id.name})"],
    ),
    t_country: Table(
        t_country,
        country_id,
        entries=[
            Entry("name", "TEXT"),
            Entry("code", "TEXT"),
        ],
        constaints=[f"UNIQUE({country_id.name})"],
    ),
}
def get_filegroup(filename: str, cursor: sql.Cursor) -> int:
    """
    Get the id of the filegroup that filename belongs to.

    The group is determined in this order:
    1) filename is already in the file table: the group stored for it
    2) a group whose groupname is the suffix of filename (the part after
       the last '.', the whole filename if it contains no '.')
    3) a new group with filename as groupname is created and filename is
       added to the file table

    :param filename: name of the requested file
    :param cursor:   cursor of the database
    :returns: the group_id
    """
    # 1) file is known
    if sql_exists(cursor, t_file, [("filename", filename)]):
        return sql_select(cursor, t_file, [("filename", filename)])[0][1]

    # 2) group named like the suffix
    suffix = filename.split('.')[-1]
    # the suffix is part of a requested filename and must not be pasted into
    # the sql text, bind it as parameter instead
    cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname = ?", (suffix,))
    group_id_candidates = cursor.fetchall()
    if group_id_candidates:
        return group_id_candidates[0][0]

    # 3) new group for the file; the current table size is used as new id
    group_id = sql_tablesize(cursor, t_filegroup)
    sql_insert(cursor, t_filegroup, [[group_id, filename]])
    sql_insert(cursor, t_file, [[filename, group_id]])
    return group_id
def create_filegroups(cursor: sql.Cursor, filegroup_str: str):
    """
    Create the filegroups described by filegroup_str and assign the files to them.

    :param cursor:        cursor of the database
    :param filegroup_str: 'name1: file1, file2, file3; name2: file33'
                          Whitespace around names and files is ignored, empty
                          groups (eg from ';;' or an empty string) and empty
                          filenames are skipped.
    :raises ValueError: if a non-empty group does not contain a ':'

    Groups that already exist are reused, files that already exist are moved
    to the new group.
    """
    # drop empty segments: "" and ";" must not be treated as a group
    groups = [group.strip() for group in filegroup_str.split(";")]
    groups = [group for group in groups if group]
    pdebug("create_filegroups:", groups)
    for group in groups:
        # only split at the first ':' so that filenames may contain ':'
        name, separator, vals = group.partition(":")
        if not separator:
            raise ValueError(f"create_filegroups: invalid filegroup '{group}', expected 'name: file1, file2'")
        name = name.strip()
        # create/get group
        if sql_exists(cursor, t_filegroup, [("groupname", name)]):
            group_id = sql_select(cursor, t_filegroup, [("groupname", name)])[0][0]
        else:
            # the current table size is used as new id
            group_id = sql_tablesize(cursor, t_filegroup)
            sql_insert(cursor, t_filegroup, [(group_id, name)])
        # create/edit file
        for filename in vals.split(","):
            filename = filename.strip()
            if not filename:  # 'name:' without files or a trailing ','
                continue
            if sql_exists(cursor, t_file, [("filename", filename)]):  # if exist, update
                # bind the values instead of pasting them into the sql text
                cursor.execute(f"UPDATE {t_file} SET group_id = ? WHERE filename = ?", (group_id, filename))
            else:
                sql_insert(cursor, t_file, [[filename, group_id]])
def get_files_from_dir_rec(p: str, files: list[str]):
    """
    Append p to files if p is a file.
    If p is a directory, do the same for everything inside of it, recursively.
    Anything that is neither a file nor a directory is ignored.
    """
    pdebug("get_files_from_dir_rec:",p)
    if path.isfile(p):
        files.append(p)
        return
    if not path.isdir(p):
        return
    # descend into the directory, children are always joined with "/"
    for child in listdir(p):
        get_files_from_dir_rec(f"{p}/{child}", files)
def get_auto_filegroup_str(location_and_dirs:list[tuple[str, str]], auto_group_filetypes:list[str]) -> str:
    """
    Build a filegroup string that groups all files found in the given
    directories by their filetype.

    :param location_and_dirs:    list of (nginx location, corresponding directory).
                                 It is ignored if the first item is not a pair.
    :param auto_group_filetypes: list of filetypes (without '.') for auto grouping
    :returns: 'filetype1:file1,file2;filetype2:file3;', one group per filetype.
              A filetype without files results in an empty group 'filetype:;'
    """
    files: list[str] = []
    if len(location_and_dirs) > 0 and len(location_and_dirs[0]) == 2:
        for location, dir_ in location_and_dirs:
            # collect the files of this directory separately, so that files
            # found in earlier directories are not rewritten a second time
            found: list[str] = []
            get_files_from_dir_rec(dir_, found)
            # replace dir_ with location, eg /www/website with /
            # every found path starts with dir_, only this prefix is replaced
            files.extend(file.replace(dir_, location, 1).replace("//", "/") for file in found)
    # create groups for each filetype
    group_strs = []
    for ft in auto_group_filetypes:
        suffix = f".{ft}"
        matching = [file for file in files if file.endswith(suffix)]
        group_strs.append(f"{ft}:{','.join(matching)};")
    filegroups = "".join(group_strs)
    pdebug("get_auto_filegroup_str: found files:", files, "filegroups_str:", filegroups)
    return filegroups
def get_country_id(cur:sql.Cursor, name, code, country_tablesize):
    """
    Get the id of the country with the given name, the country is inserted
    if it is not in the country table yet.

    name and code are placed into the sql text as they are, so they must be
    safe to use inside a quoted sql string.

    :param country_tablesize: current number of rows in the country table,
                              used as id for a new country
    :returns: (id of the country, country_tablesize), where
              country_tablesize is increased by one if a country was inserted
    """
    cur.execute(f"SELECT {country_id.name} FROM {t_country} WHERE name = '{name}'")
    matches = cur.fetchall()
    if matches:
        # known country, table is unchanged
        return matches[0][0], country_tablesize
    # insert new country
    new_id = country_tablesize
    cur.execute(f"INSERT INTO {t_country} ({country_id.name}, name, code) VALUES ({new_id}, '{name}', '{code}')")
    return new_id, country_tablesize + 1
def get_city_id(cur: sql.Cursor, name, region, country_id, city_tablesize):
    """
    Get the id of the city with the given name, the city is inserted
    if it is not in the city table yet.

    name, region and country_id are placed into the sql text as they are,
    so they must be safe to use inside a quoted sql string.

    :param country_id:     id of the country, only used when a new city is inserted
    :param city_tablesize: current number of rows in the city table,
                           used as id for a new city
    :returns: (id of the city, city_tablesize), where
              city_tablesize is increased by one if a city was inserted
    """
    # NOTE(review): the lookup is by name only, cities with the same name in
    # different countries/regions get the same id - confirm this is intended
    cur.execute(f"SELECT {city_id.name} FROM {t_city} WHERE name = '{name}'")
    matches = cur.fetchall()
    if matches:
        # known city, table is unchanged
        return matches[0][0], city_tablesize
    # insert new city
    new_id = city_tablesize
    cur.execute(f"INSERT INTO {t_city} ({city_id.name}, name, region, country_id) VALUES ({new_id}, '{name}', '{region}', '{country_id}')")
    return new_id, city_tablesize + 1
def _insert_combined_range(cur: sql.Cursor, ip_range_id_val, country_id_val, country_name, lower, upper, city_tablesize):
    """
    Insert one ip range [lower, upper] for a whole country.
    The range is assigned to the placeholder city "City in <country_name>".

    :returns: the city_tablesize, increased by one if the placeholder city was inserted
    """
    pdebug(f"update_ip_range_id: ip_range_id={ip_range_id_val}, Adding combined range for country={country_name}, lower={lower}, upper={upper}")
    # get id for dummy city
    city_id_val, city_tablesize = get_city_id(cur, f"City in {country_name}", f"Region in {country_name}", country_id_val, city_tablesize)
    cur.execute(f"INSERT INTO {t_ip_range} ({ip_range_id.name}, lower, upper, {city_id.name}) VALUES ({ip_range_id_val}, {lower}, {upper}, {city_id_val})")
    return city_tablesize


def update_geoip_tables(cur: sql.Cursor, geoip_city_csv: str):
    """
    Recreate the ip_range table from a geoip csv file.

    Each row of the csv is expected to have the columns
    from, to, country code, country, region, city (in this order).

    - All previous ip ranges are deleted.
    - For countries whose code is in settings["get_cities_for_countries"],
      every row becomes one ip range assigned to its city.
    - For all other countries, rows of the same country are combined to one
      range which is assigned to a placeholder city "City in <country>".
    - Missing countries and cities are added to their tables, the "Unknown"
      country and city get id 0 if they do not exist yet.

    :param cur:            cursor of the database
    :param geoip_city_csv: path to the csv file
    """
    FROM, TO, CODE, COUNTRY, REGION, CITY = range(6)
    ip_range_id_val = 0
    # newline='' is what the csv module expects for files passed to reader
    with open(geoip_city_csv, 'r', newline='') as file:
        # delete all previous data
        cur.execute(f"DELETE FROM {t_ip_range}")
        # NOTE(review): sqlite refuses VACUUM inside an open transaction -
        # confirm that the connection of cur does not open one for the DELETE
        cur.execute("VACUUM")
        csv = reader(file, delimiter=',', quotechar='"')

        # guarantees that unknown city/country will have id 0
        if not sql_exists(cur, t_country, [("name", "Unknown")]):
            cur.execute(f"INSERT INTO {t_country} ({country_id.name}, name, code) VALUES (0, 'Unknown', 'XX') ")
        if not sql_exists(cur, t_city, [("name", "Unknown")]):
            cur.execute(f"INSERT INTO {t_city} ({city_id.name}, name, region) VALUES (0, 'Unknown', 'Unkown') ")
        country_tablesize = sql_tablesize(cur, t_country)
        city_tablesize = sql_tablesize(cur, t_city)

        print(f"Recreating the geoip database from {geoip_city_csv}. This might take a long time...")
        # the range that is currently being combined, country id -1 means
        # there is none. Starting with -1 (not 0) makes sure that no range
        # (-1, -1) for a country without name is inserted before the first
        # real range
        combine_range_country_id = -1
        combine_range_lower = -1
        combine_range_upper = -1
        combine_range_country_name = ""
        for row in csv:
            # these might contain problematic characters (')
            row[CITY] = sanitize(row[CITY])
            row[COUNTRY] = sanitize(row[COUNTRY])
            row[REGION] = sanitize(row[REGION])

            # make sure country exists
            country_id_val, country_tablesize = get_country_id(cur, row[COUNTRY], row[CODE], country_tablesize)
            if row[CODE] in settings["get_cities_for_countries"]:
                # make sure city exists
                city_id_val, city_tablesize = get_city_id(cur, row[CITY], row[REGION], country_id_val, city_tablesize)
                pdebug(f"update_ip_range_id: ip_range_id={ip_range_id_val}, Adding range for city={row[CITY]}, country={row[COUNTRY]}, lower={row[FROM]}, upper={row[TO]}")
                cur.execute(f"INSERT INTO {t_ip_range} ({ip_range_id.name}, lower, upper, {city_id.name}) VALUES ({ip_range_id_val}, {row[FROM]}, {row[TO]}, {city_id_val})")
                ip_range_id_val += 1
                # NOTE(review): a pending combined range is not written here,
                # so it can still be extended by a later row of its country
                # and then spans the city ranges in between - confirm intended
                continue

            if combine_range_country_id == country_id_val:
                # same country as the pending range: extend it
                combine_range_upper = row[TO]
                continue

            if combine_range_country_id >= 0:
                # different country: write the pending range
                city_tablesize = _insert_combined_range(cur, ip_range_id_val, combine_range_country_id, combine_range_country_name, combine_range_lower, combine_range_upper, city_tablesize)
                ip_range_id_val += 1
            # start a new range, to be combined with later ranges
            combine_range_country_id = country_id_val
            combine_range_lower = row[FROM]
            combine_range_upper = row[TO]
            combine_range_country_name = row[COUNTRY]

        if combine_range_country_id >= 0:  # last range, append
            city_tablesize = _insert_combined_range(cur, ip_range_id_val, combine_range_country_id, combine_range_country_name, combine_range_lower, combine_range_upper, city_tablesize)
            ip_range_id_val += 1
def create_db(db_name, filegroup_str="", location_and_dirs: "list[tuple[str, str]] | None" = None, auto_group_filetypes: "list[str] | None" = None):
    """
    Create the database db_name with all tables from database_tables
    and create the filegroups.

    :param db_name:              path of the database file
    :param filegroup_str:        user defined filegroups: 'name1: file1, file2; name2: file33'
    :param location_and_dirs:    list of (nginx location, corresponding directory),
                                 searched for files for the automatic filegroups
    :param auto_group_filetypes: filetypes for which a filegroup is created automatically
    """
    # no mutable default arguments, they would be shared between calls
    if location_and_dirs is None:
        location_and_dirs = []
    if auto_group_filetypes is None:
        auto_group_filetypes = []

    print(f"creating database: '{db_name}'")
    conn = sql.connect(f"{db_name}")
    try:
        cursor = conn.cursor()
        for table in database_tables.values():
            cursor.execute(table.create_sql_str())
        filegroup_str = filegroup_str.strip("; ") + ";" + get_auto_filegroup_str(location_and_dirs, auto_group_filetypes)
        # without any group the string is only ";", there is nothing to create then
        if filegroup_str.strip("; "):
            create_filegroups(cursor, filegroup_str)
        cursor.close()
        conn.commit()
    finally:
        # close the connection even if something failed, nothing is committed in that case
        conn.close()
# when run as script: create the database "test.db" without any filegroups
if __name__ == '__main__':
    create_db("test.db")