multiple fixes

This commit is contained in:
matthias@arch 2022-11-28 23:29:32 +01:00
parent 3007f1ff8d
commit 7be6e67aaf
11 changed files with 118 additions and 40 deletions

1
.gitignore vendored
View File

@ -2,3 +2,4 @@ testing/
__pycache__ __pycache__
build/ build/
regina.egg-info/ regina.egg-info/
regina/test/

View File

@ -4,8 +4,8 @@
<element> <element>
<id>UMLClass</id> <id>UMLClass</id>
<coordinates> <coordinates>
<x>364</x> <x>247</x>
<y>273</y> <y>312</y>
<w>299</w> <w>299</w>
<h>234</h> <h>234</h>
</coordinates> </coordinates>
@ -26,8 +26,8 @@ style=autoresize</panel_attributes>
<element> <element>
<id>UMLClass</id> <id>UMLClass</id>
<coordinates> <coordinates>
<x>1092</x> <x>975</x>
<y>273</y> <y>312</y>
<w>234</w> <w>234</w>
<h>130</h> <h>130</h>
</coordinates> </coordinates>
@ -43,8 +43,8 @@ style=autoresize</panel_attributes>
<element> <element>
<id>Relation</id> <id>Relation</id>
<coordinates> <coordinates>
<x>988</x> <x>871</x>
<y>273</y> <y>312</y>
<w>130</w> <w>130</w>
<h>65</h> <h>65</h>
</coordinates> </coordinates>
@ -57,8 +57,8 @@ m2=1
<element> <element>
<id>UMLClass</id> <id>UMLClass</id>
<coordinates> <coordinates>
<x>754</x> <x>637</x>
<y>260</y> <y>299</y>
<w>247</w> <w>247</w>
<h>221</h> <h>221</h>
</coordinates> </coordinates>
@ -79,8 +79,8 @@ style=autoresize</panel_attributes>
<element> <element>
<id>Relation</id> <id>Relation</id>
<coordinates> <coordinates>
<x>650</x> <x>533</x>
<y>273</y> <y>312</y>
<w>130</w> <w>130</w>
<h>65</h> <h>65</h>
</coordinates> </coordinates>
@ -93,8 +93,8 @@ m2=n
<element> <element>
<id>UMLClass</id> <id>UMLClass</id>
<coordinates> <coordinates>
<x>1092</x> <x>975</x>
<y>546</y> <y>585</y>
<w>234</w> <w>234</w>
<h>130</h> <h>130</h>
</coordinates> </coordinates>
@ -111,8 +111,8 @@ style=autoresize</panel_attributes>
<element> <element>
<id>Relation</id> <id>Relation</id>
<coordinates> <coordinates>
<x>1131</x> <x>1014</x>
<y>390</y> <y>429</y>
<w>52</w> <w>52</w>
<h>182</h> <h>182</h>
</coordinates> </coordinates>
@ -125,8 +125,8 @@ m2=1
<element> <element>
<id>UMLNote</id> <id>UMLNote</id>
<coordinates> <coordinates>
<x>897</x> <x>780</x>
<y>117</y> <y>156</y>
<w>390</w> <w>390</w>
<h>91</h> <h>91</h>
</coordinates> </coordinates>
@ -139,8 +139,8 @@ style=autoresize</panel_attributes>
<element> <element>
<id>Relation</id> <id>Relation</id>
<coordinates> <coordinates>
<x>1105</x> <x>988</x>
<y>195</y> <y>234</y>
<w>39</w> <w>39</w>
<h>104</h> <h>104</h>
</coordinates> </coordinates>

View File

@ -27,6 +27,10 @@ humans_need_success = True
# dont collect requests to locations matched by this # dont collect requests to locations matched by this
request_location_regex_blacklist = /analytics.* request_location_regex_blacklist = /analytics.*
# get nation
user_get_country = True
# VISUALIZATION # VISUALIZATION
# separate users into all and humans # separate users into all and humans
get_human_percentage = True get_human_percentage = True

View File

@ -1,3 +1,6 @@
"""Gather analytics from nginx access logs and visualize them through generated images and a generated html""" """Gather analytics from nginx access logs and visualize them through generated images and a generated html"""
# __package__ = 'regina' # __package__ = 'regina'
import regina.utility import regina.utility
from importlib import resources
# ip2nation_db_path = resources.path("regina", "ip2nation.db")

View File

@ -4,13 +4,13 @@ from time import mktime
from datetime import datetime as dt from datetime import datetime as dt
from regina.db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup from regina.db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
from regina.utility.utility import pdebug, warning from regina.utility.utility import pdebug, warning, pmessage
from regina.utility.globals import user_agent_operating_systems, user_agent_browsers, settings from regina.utility.globals import user_agent_operating_systems, user_agent_browsers, settings
""" """
collect information from the access log and put it into the database collect information from the access log and put it into the database
""" """
months = ["Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aut", "Sep", "Oct", "Nov", "Dez"] months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aut", "Sep", "Oct", "Nov", "Dez"]
@ -23,13 +23,16 @@ class Request:
if m: if m:
g = m.groups() g = m.groups()
try: try:
if g[1] in months:
datetime_ = dt(int(g[2]), months.index(g[1])+1, int(g[0]), int(g[3]), int(g[4]), int(g[5])) datetime_ = dt(int(g[2]), months.index(g[1])+1, int(g[0]), int(g[3]), int(g[4]), int(g[5]))
# pdebug(f"Request __init__: datetime {datetime_}, from {g}")
self.time_local = int(mktime(datetime_.timetuple())) self.time_local = int(mktime(datetime_.timetuple()))
else:
warning(f"Request:__init__: Unkown month: '{g[1]}'. Using timestamp {self.time_local}")
except Exception as e: except Exception as e:
warning(f"Request:__init__: {e}") warning(f"Request:__init__: {e}")
else: else:
warning(f"Request:__init__: Could not match time: '{time_local}'") warning(f"Request:__init__: Could not match time: '{time_local}'")
self.request_type = sanitize(request_type) self.request_type = sanitize(request_type)
self.request_file = sanitize(request_file) self.request_file = sanitize(request_file)
self.request_protocol = sanitize(request_protocol) self.request_protocol = sanitize(request_protocol)
@ -93,7 +96,7 @@ def get_user_id(request: Request, cursor: sql.Cursor) -> int:
else: # new user else: # new user
# new user_id is number of elements # new user_id is number of elements
user_id: int = sql_tablesize(cursor, t_user) user_id: int = sql_tablesize(cursor, t_user)
pdebug("new user:", user_id, request.ip_address) # pdebug("new user:", user_id, request.ip_address)
platform, browser, mobile = get_os_browser_pairs_from_agent(request.user_agent) platform, browser, mobile = get_os_browser_pairs_from_agent(request.user_agent)
is_human = 0 # is_user_human cannot be called until user is in db int(is_user_human(cursor, user_id)) is_human = 0 # is_user_human cannot be called until user is in db int(is_user_human(cursor, user_id))
cursor.execute(f"INSERT INTO {t_user} (user_id, ip_address, user_agent, platform, browser, mobile, is_human) VALUES ({user_id}, '{request.ip_address}', '{request.user_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}');") cursor.execute(f"INSERT INTO {t_user} (user_id, ip_address, user_agent, platform, browser, mobile, is_human) VALUES ({user_id}, '{request.ip_address}', '{request.user_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}');")
@ -107,10 +110,21 @@ def is_user_human(cur: sql.Cursor, user_id: int):
""" """
max_success_status = 400 max_success_status = 400
if settings["status_300_is_success"]: max_success_status = 300 if settings["status_300_is_success"]: max_success_status = 300
cur.execute(f"SELECT browser, platform FROM {t_user} WHERE user_id = {user_id}")
browsers_and_platforms = cur.fetchall()
if len(browsers_and_platforms) != 1:
pdebug(f"is_user_human: {user_id} - could not find user or found too many")
return False
if not browsers_and_platforms[0][0] in user_agent_browsers:
return False
if not browsers_and_platforms[0][1] in user_agent_operating_systems:
return False
# check if has browser # check if has browser
cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_user} WHERE user_id = {user_id} AND platform IS NOT NULL AND browser IS NOT NULL)") # cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_user} WHERE user_id = {user_id} AND platform IS NOT NULL AND browser IS NOT NULL)")
# if no browser and platform # if no browser and platform
if cur.fetchone()[0] == 0: return False # exists = cur.fetchone()
# if exists is None or exists[0] == 0:
# return False
# if human needs successful request # if human needs successful request
if settings["human_needs_success"]: if settings["human_needs_success"]:
# check if at least request was successful (status < 400) # check if at least request was successful (status < 400)
@ -144,9 +158,21 @@ def get_os_browser_pairs_from_agent(user_agent):
return operating_system, browser, mobile return operating_system, browser, mobile
# def set_countries(cur: sql.Cursor, user_ids: list[int]):
# if settings["user_get_country"]:
# ipconn = sql.connect(ip2nation_db_path)
# ipcur = ipconn.cursor()
# for user_id in user_ids:
# ip_address = sql_select(cur, t_user, [("user_id", user_id)])
# cur.execute(f"SELECT ip_address FROM {t_user} WHERE user_id = {user_id}")
# ip_address = cur.fetchall()[0][0]
# ipcur.execute("SELECT iso_code_3 FROM ip2nationCountries WHERE ip")
def add_requests_to_db(requests: list[Request], db_name: str): def add_requests_to_db(requests: list[Request], db_name: str):
conn = sql.connect(db_name) conn = sql.connect(db_name)
cursor = conn.cursor() cursor = conn.cursor()
added_requests = 0
# check the new users later # check the new users later
max_user_id = sql_tablesize(cursor, t_user) max_user_id = sql_tablesize(cursor, t_user)
request_blacklist = settings["request_location_regex_blacklist"] request_blacklist = settings["request_location_regex_blacklist"]
@ -154,7 +180,9 @@ def add_requests_to_db(requests: list[Request], db_name: str):
request = requests[i] request = requests[i]
# skip requests to blacklisted locations # skip requests to blacklisted locations
if request_blacklist: if request_blacklist:
if match(request_blacklist, request.request_file): continue if match(request_blacklist, request.request_file):
# pdebug(f"add_requests_to_db: request on blacklist '{request.request_file}'")
continue
# pdebug("add_requests_to_db:", i, "request:", request) # pdebug("add_requests_to_db:", i, "request:", request)
user_id = get_user_id(request, cursor) user_id = get_user_id(request, cursor)
conn.commit() conn.commit()
@ -169,9 +197,14 @@ def add_requests_to_db(requests: list[Request], db_name: str):
# pdebug("new request:", request) # pdebug("new request:", request)
request_id = sql_tablesize(cursor, t_request) request_id = sql_tablesize(cursor, t_request)
sql_insert(cursor, t_request, [[request_id, user_id, group_id, request.time_local, request.referer, request.status]]) sql_insert(cursor, t_request, [[request_id, user_id, group_id, request.time_local, request.referer, request.status]])
for user_id in range(max_user_id, sql_tablesize(cursor, t_user)): added_requests += 1
user_count = sql_tablesize(cursor, t_user)
for user_id in range(max_user_id, user_count):
is_human = is_user_human(cursor, user_id) is_human = is_user_human(cursor, user_id)
cursor.execute(f"SELECT * FROM {t_user} WHERE user_id = {user_id}")
# pdebug(f"add_rq_to_db: {user_id} is_human? {is_human}, {cursor.fetchall()}")
if is_human: if is_human:
cursor.execute(f"UPDATE {t_user} SET is_human = 1 WHERE user_id = {user_id}") cursor.execute(f"UPDATE {t_user} SET is_human = 1 WHERE user_id = {user_id}")
cursor.close() cursor.close()
conn.commit() conn.commit()
pmessage(f"Collection Summary: Added {user_count - max_user_id} new users and {added_requests} new requests.")

View File

@ -47,7 +47,16 @@ filegroup_id = Entry("group_id", "INTEGER")
ip_address_entry = Entry("ip_address", "TEXT") ip_address_entry = Entry("ip_address", "TEXT")
filename_entry = Entry("filename", "TEXT") filename_entry = Entry("filename", "TEXT")
database_tables = { database_tables = {
t_user: Table(t_user, user_id, [Entry("ip_address", "TEXT"), Entry("user_agent", "TEXT"), Entry("platform", "TEXT"), Entry("browser", "TEXT"), Entry("mobile", "INTEGER"), Entry("is_human", "INTEGER")], [f"UNIQUE({user_id.name})"]), t_user: Table(t_user, user_id, [
Entry("ip_address", "TEXT"),
Entry("user_agent", "TEXT"),
Entry("platform", "TEXT"),
Entry("browser", "TEXT"),
Entry("mobile", "INTEGER"),
Entry("is_human", "INTEGER"),
# Entry("country_iso_code_3", "TEXT")
],
[f"UNIQUE({user_id.name})"]),
t_file: Table(t_file, filename_entry, [filegroup_id], [f"UNIQUE({filename_entry.name})"]), t_file: Table(t_file, filename_entry, [filegroup_id], [f"UNIQUE({filename_entry.name})"]),
t_filegroup: Table(t_filegroup, filegroup_id, [Entry("groupname", "TEXT")], [f"UNIQUE({filegroup_id.name})"]), t_filegroup: Table(t_filegroup, filegroup_id, [Entry("groupname", "TEXT")], [f"UNIQUE({filegroup_id.name})"]),
t_request: Table(t_request, request_id, [ t_request: Table(t_request, request_id, [
@ -73,7 +82,7 @@ def get_filegroup(filename: str, cursor: sql.Cursor) -> int:
cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname = '{suffix}'") cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname = '{suffix}'")
# cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname LIKE '%.{suffix}'") # cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname LIKE '%.{suffix}'")
group_id_candidates = cursor.fetchall() group_id_candidates = cursor.fetchall()
pdebug(f"get_filegroup: file={filename} candidates={group_id_candidates}") # pdebug(f"get_filegroup: file={filename} candidates={group_id_candidates}")
if group_id_candidates: if group_id_candidates:
return group_id_candidates[0][0] return group_id_candidates[0][0]
else: # add new group file filename else: # add new group file filename

View File

@ -321,7 +321,10 @@ def cleanup_referer(referer: str) -> str:
domain = m.groups()[1].replace(subdomains, "") domain = m.groups()[1].replace(subdomains, "")
location = m.groups()[3] location = m.groups()[3]
referer = domain assert(len(domain.split(".")) == 2)
referer = domain.split(".")[0]
if not settings["referer_ranking_ignore_tld"]: referer += "." + domain.split(".")[1]
if not settings["referer_ranking_ignore_subdomain"]: referer = subdomains + referer
if not settings["referer_ranking_ignore_subdomain"]: referer = subdomains + referer if not settings["referer_ranking_ignore_subdomain"]: referer = subdomains + referer
if not settings["referer_ranking_ignore_protocol"]: referer = protocol + "://" + referer if not settings["referer_ranking_ignore_protocol"]: referer = protocol + "://" + referer
if not settings["referer_ranking_ignore_location"]: referer += location if not settings["referer_ranking_ignore_location"]: referer += location

View File

@ -8,18 +8,31 @@ from regina.db_operation.database import create_db
from regina.db_operation.visualize import visualize from regina.db_operation.visualize import visualize
from regina.utility.settings_manager import read_settings_file from regina.utility.settings_manager import read_settings_file
from regina.utility.globals import settings, version from regina.utility.globals import settings, version
from regina.utility.utility import pmessage
""" """
start regina, launch either collect or visualize start regina, launch either collect or visualize
TODO: TODO:
- bei referrers ähnliche zusammenlegen, z.b. www.google.de und https://google.com
- optionen: - optionen:
- unique user = ip address - unique user = ip address
- max requests/time - max requests/time
- fix datum im user and request count plot
- fix datum monat is 1 zu wenig
- checken warum last x days und total counts abweichen
- länder aus ip addresse
- "manuelle" datenbank bearbeitung in cli:
- user + alle seine requests löschen
- user agents:
- android vor linux suchen, oder linux durch X11 ersetzen
- alles was bot drin hat als bot betrachten
- wenn datenbankgröße zum problem wird: - wenn datenbankgröße zum problem wird:
- referrer table die die schon zusammengelegten referrer enthält, request verlinkt nur mit id - referrer table die die schon zusammengelegten referrer enthält, request verlinkt nur mit id
- selbes für platforms und browsers - selbes für platforms und browsers
- test:
- human detection
- referer cleanup
- schöne log nachrichten für die cron mail
- testing!
""" """
@ -82,7 +95,6 @@ def main():
settings["version"] = version settings["version"] = version
if log_file: settings["access_log"] = log_file if log_file: settings["access_log"] = log_file
print(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}' and logfile '{settings['access_log']}'")
if not settings["server_name"]: missing_arg("server-name") if not settings["server_name"]: missing_arg("server-name")
if not settings["access_log"]: missing_arg("log") if not settings["access_log"]: missing_arg("log")
@ -91,14 +103,20 @@ def main():
settings["auto_group_filetypes"] = settings["auto_group_filetypes"].split(",") settings["auto_group_filetypes"] = settings["auto_group_filetypes"].split(",")
if isinstance(settings["locs_and_dirs"], str): if isinstance(settings["locs_and_dirs"], str):
settings["locs_and_dirs"] = [ loc_and_dir.split(":") for loc_and_dir in settings["locs_and_dirs"].split(",") ] settings["locs_and_dirs"] = [ loc_and_dir.split(":") for loc_and_dir in settings["locs_and_dirs"].split(",") ]
if collect: if collect:
pmessage(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}' and logfile '{settings['access_log']}'")
if not isfile(settings["db"]): if not isfile(settings["db"]):
create_db(settings["db"], settings["filegroups"], settings["locs_and_dirs"], settings["auto_group_filetypes"]) create_db(settings["db"], settings["filegroups"], settings["locs_and_dirs"], settings["auto_group_filetypes"])
requests = parse_log(settings["access_log"]) requests = parse_log(settings["access_log"])
add_requests_to_db(requests, settings["db"]) add_requests_to_db(requests, settings["db"])
if visualize_: elif visualize_:
pmessage(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}'")
if not isfile(settings["db"]): error(f"Invalid database path: '{settings['db']}'") if not isfile(settings["db"]): error(f"Invalid database path: '{settings['db']}'")
visualize(settings) visualize(settings)
else:
error("Either --collect --visualize has to be provided")
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View File

@ -14,6 +14,7 @@ settings = {
"filegroups": "", "filegroups": "",
"request_location_regex_blacklist": "", "request_location_regex_blacklist": "",
"unique_user_is_ip_address": False, "unique_user_is_ip_address": False,
"user_get_country": True,
# VISUALIZATION # VISUALIZATION
"get_human_percentage": False, "get_human_percentage": False,
@ -25,6 +26,7 @@ settings = {
"referer_ranking_ignore_protocol": True, "referer_ranking_ignore_protocol": True,
"referer_ranking_ignore_subdomain": False, "referer_ranking_ignore_subdomain": False,
"referer_ranking_ignore_location": True, "referer_ranking_ignore_location": True,
"referer_ranking_ignore_tld": False,
"referer_ranking_regex_whitelist": r"^[^\-].*", # minus means empty "referer_ranking_regex_whitelist": r"^[^\-].*", # minus means empty
"user_agent_ranking_regex_whitelist": r"", "user_agent_ranking_regex_whitelist": r"",
"file_ranking_plot_max_files": 15, "file_ranking_plot_max_files": 15,

View File

@ -6,12 +6,15 @@ from sys import exit
Various utility Various utility
""" """
DEBUG = True DEBUG = False
def pdebug(*args): def pdebug(*args, **keys):
if DEBUG: print(*args) if DEBUG: print(*args, **keys)
def warning(*w): def warning(*w, **k):
print("Warning:", *w) print("Warning:", *w, **k)
def pmessage(*args, **keys):
print(*args, **keys)
def error(*arg): def error(*arg):
print("Error:", *arg) print("Error:", *arg)

View File

@ -24,6 +24,8 @@ setup(
"Topic :: Utilities", "Topic :: Utilities",
], ],
# data_files=[("ip2nation", ["ip2nation.sql", "ip2nation.db"])],
# scripts=["bin/nicole"], # scripts=["bin/nicole"],
entry_points={ entry_points={
"console_scripts": [ "regina=regina.main:main" ], "console_scripts": [ "regina=regina.main:main" ],