multiple fixes

This commit is contained in:
matthias@arch 2022-11-28 23:29:32 +01:00
parent 3007f1ff8d
commit 7be6e67aaf
11 changed files with 118 additions and 40 deletions

1
.gitignore vendored
View File

@ -2,3 +2,4 @@ testing/
__pycache__
build/
regina.egg-info/
regina/test/

View File

@ -4,8 +4,8 @@
<element>
<id>UMLClass</id>
<coordinates>
<x>364</x>
<y>273</y>
<x>247</x>
<y>312</y>
<w>299</w>
<h>234</h>
</coordinates>
@ -26,8 +26,8 @@ style=autoresize</panel_attributes>
<element>
<id>UMLClass</id>
<coordinates>
<x>1092</x>
<y>273</y>
<x>975</x>
<y>312</y>
<w>234</w>
<h>130</h>
</coordinates>
@ -43,8 +43,8 @@ style=autoresize</panel_attributes>
<element>
<id>Relation</id>
<coordinates>
<x>988</x>
<y>273</y>
<x>871</x>
<y>312</y>
<w>130</w>
<h>65</h>
</coordinates>
@ -57,8 +57,8 @@ m2=1
<element>
<id>UMLClass</id>
<coordinates>
<x>754</x>
<y>260</y>
<x>637</x>
<y>299</y>
<w>247</w>
<h>221</h>
</coordinates>
@ -79,8 +79,8 @@ style=autoresize</panel_attributes>
<element>
<id>Relation</id>
<coordinates>
<x>650</x>
<y>273</y>
<x>533</x>
<y>312</y>
<w>130</w>
<h>65</h>
</coordinates>
@ -93,8 +93,8 @@ m2=n
<element>
<id>UMLClass</id>
<coordinates>
<x>1092</x>
<y>546</y>
<x>975</x>
<y>585</y>
<w>234</w>
<h>130</h>
</coordinates>
@ -111,8 +111,8 @@ style=autoresize</panel_attributes>
<element>
<id>Relation</id>
<coordinates>
<x>1131</x>
<y>390</y>
<x>1014</x>
<y>429</y>
<w>52</w>
<h>182</h>
</coordinates>
@ -125,8 +125,8 @@ m2=1
<element>
<id>UMLNote</id>
<coordinates>
<x>897</x>
<y>117</y>
<x>780</x>
<y>156</y>
<w>390</w>
<h>91</h>
</coordinates>
@ -139,8 +139,8 @@ style=autoresize</panel_attributes>
<element>
<id>Relation</id>
<coordinates>
<x>1105</x>
<y>195</y>
<x>988</x>
<y>234</y>
<w>39</w>
<h>104</h>
</coordinates>

View File

@ -27,6 +27,10 @@ humans_need_success = True
# don't collect requests to locations matched by this regex
request_location_regex_blacklist = /analytics.*
# get the nation (country) of each user
user_get_country = True
# VISUALIZATION
# separate users into all and humans
get_human_percentage = True

View File

@ -1,3 +1,6 @@
"""Gather analytics from nginx access logs and visualize them through generated images and a generated html"""
# __package__ = 'regina'
import regina.utility
from importlib import resources
# ip2nation_db_path = resources.path("regina", "ip2nation.db")

View File

@ -4,13 +4,13 @@ from time import mktime
from datetime import datetime as dt
from regina.db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
from regina.utility.utility import pdebug, warning
from regina.utility.utility import pdebug, warning, pmessage
from regina.utility.globals import user_agent_operating_systems, user_agent_browsers, settings
"""
collect information from the access log and put it into the database
"""
months = ["Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aut", "Sep", "Oct", "Nov", "Dez"]
# Month abbreviations as written in the nginx access log time field (English names).
# The list position + 1 is used as the month number when building the datetime,
# so the order and spelling must match the log exactly ("Aug", "Dec" - not "Aut", "Dez").
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
@ -23,13 +23,16 @@ class Request:
if m:
g = m.groups()
try:
datetime_ = dt(int(g[2]), months.index(g[1])+1, int(g[0]), int(g[3]), int(g[4]), int(g[5]))
self.time_local = int(mktime(datetime_.timetuple()))
if g[1] in months:
datetime_ = dt(int(g[2]), months.index(g[1])+1, int(g[0]), int(g[3]), int(g[4]), int(g[5]))
# pdebug(f"Request __init__: datetime {datetime_}, from {g}")
self.time_local = int(mktime(datetime_.timetuple()))
else:
warning(f"Request:__init__: Unkown month: '{g[1]}'. Using timestamp {self.time_local}")
except Exception as e:
warning(f"Request:__init__: {e}")
else:
warning(f"Request:__init__: Could not match time: '{time_local}'")
self.request_type = sanitize(request_type)
self.request_file = sanitize(request_file)
self.request_protocol = sanitize(request_protocol)
@ -93,7 +96,7 @@ def get_user_id(request: Request, cursor: sql.Cursor) -> int:
else: # new user
# new user_id is number of elements
user_id: int = sql_tablesize(cursor, t_user)
pdebug("new user:", user_id, request.ip_address)
# pdebug("new user:", user_id, request.ip_address)
platform, browser, mobile = get_os_browser_pairs_from_agent(request.user_agent)
is_human = 0 # is_user_human cannot be called until user is in db int(is_user_human(cursor, user_id))
cursor.execute(f"INSERT INTO {t_user} (user_id, ip_address, user_agent, platform, browser, mobile, is_human) VALUES ({user_id}, '{request.ip_address}', '{request.user_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}');")
@ -107,10 +110,21 @@ def is_user_human(cur: sql.Cursor, user_id: int):
"""
max_success_status = 400
if settings["status_300_is_success"]: max_success_status = 300
cur.execute(f"SELECT browser, platform FROM {t_user} WHERE user_id = {user_id}")
browsers_and_platforms = cur.fetchall()
if len(browsers_and_platforms) != 1:
pdebug(f"is_user_human: {user_id} - could not find user or found too many")
return False
if not browsers_and_platforms[0][0] in user_agent_browsers:
return False
if not browsers_and_platforms[0][1] in user_agent_operating_systems:
return False
# check if has browser
cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_user} WHERE user_id = {user_id} AND platform IS NOT NULL AND browser IS NOT NULL)")
# cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_user} WHERE user_id = {user_id} AND platform IS NOT NULL AND browser IS NOT NULL)")
# if no browser and platform
if cur.fetchone()[0] == 0: return False
# exists = cur.fetchone()
# if exists is None or exists[0] == 0:
# return False
# if human needs successful request
if settings["human_needs_success"]:
# check if at least one request was successful (status < 400)
@ -144,9 +158,21 @@ def get_os_browser_pairs_from_agent(user_agent):
return operating_system, browser, mobile
# def set_countries(cur: sql.Cursor, user_ids: list[int]):
# if settings["user_get_country"]:
# ipconn = sql.connect(ip2nation_db_path)
# ipcur = ipconn.cursor()
# for user_id in user_ids:
# ip_address = sql_select(cur, t_user, [("user_id", user_id)])
# cur.execute(f"SELECT ip_address FROM {t_user} WHERE user_id = {user_id}")
# ip_address = cur.fetchall()[0][0]
# ipcur.execute("SELECT iso_code_3 FROM ip2nationCountries WHERE ip")
def add_requests_to_db(requests: list[Request], db_name: str):
conn = sql.connect(db_name)
cursor = conn.cursor()
added_requests = 0
# check the new users later
max_user_id = sql_tablesize(cursor, t_user)
request_blacklist = settings["request_location_regex_blacklist"]
@ -154,7 +180,9 @@ def add_requests_to_db(requests: list[Request], db_name: str):
request = requests[i]
# skip requests to blacklisted locations
if request_blacklist:
if match(request_blacklist, request.request_file): continue
if match(request_blacklist, request.request_file):
# pdebug(f"add_requests_to_db: request on blacklist '{request.request_file}'")
continue
# pdebug("add_requests_to_db:", i, "request:", request)
user_id = get_user_id(request, cursor)
conn.commit()
@ -169,9 +197,14 @@ def add_requests_to_db(requests: list[Request], db_name: str):
# pdebug("new request:", request)
request_id = sql_tablesize(cursor, t_request)
sql_insert(cursor, t_request, [[request_id, user_id, group_id, request.time_local, request.referer, request.status]])
for user_id in range(max_user_id, sql_tablesize(cursor, t_user)):
added_requests += 1
user_count = sql_tablesize(cursor, t_user)
for user_id in range(max_user_id, user_count):
is_human = is_user_human(cursor, user_id)
cursor.execute(f"SELECT * FROM {t_user} WHERE user_id = {user_id}")
# pdebug(f"add_rq_to_db: {user_id} is_human? {is_human}, {cursor.fetchall()}")
if is_human:
cursor.execute(f"UPDATE {t_user} SET is_human = 1 WHERE user_id = {user_id}")
cursor.close()
conn.commit()
pmessage(f"Collection Summary: Added {user_count - max_user_id} new users and {added_requests} new requests.")

View File

@ -47,7 +47,16 @@ filegroup_id = Entry("group_id", "INTEGER")
ip_address_entry = Entry("ip_address", "TEXT")
filename_entry = Entry("filename", "TEXT")
database_tables = {
t_user: Table(t_user, user_id, [Entry("ip_address", "TEXT"), Entry("user_agent", "TEXT"), Entry("platform", "TEXT"), Entry("browser", "TEXT"), Entry("mobile", "INTEGER"), Entry("is_human", "INTEGER")], [f"UNIQUE({user_id.name})"]),
t_user: Table(t_user, user_id, [
Entry("ip_address", "TEXT"),
Entry("user_agent", "TEXT"),
Entry("platform", "TEXT"),
Entry("browser", "TEXT"),
Entry("mobile", "INTEGER"),
Entry("is_human", "INTEGER"),
# Entry("country_iso_code_3", "TEXT")
],
[f"UNIQUE({user_id.name})"]),
t_file: Table(t_file, filename_entry, [filegroup_id], [f"UNIQUE({filename_entry.name})"]),
t_filegroup: Table(t_filegroup, filegroup_id, [Entry("groupname", "TEXT")], [f"UNIQUE({filegroup_id.name})"]),
t_request: Table(t_request, request_id, [
@ -73,7 +82,7 @@ def get_filegroup(filename: str, cursor: sql.Cursor) -> int:
cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname = '{suffix}'")
# cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname LIKE '%.{suffix}'")
group_id_candidates = cursor.fetchall()
pdebug(f"get_filegroup: file={filename} candidates={group_id_candidates}")
# pdebug(f"get_filegroup: file={filename} candidates={group_id_candidates}")
if group_id_candidates:
return group_id_candidates[0][0]
else: # add new group file filename

View File

@ -321,7 +321,10 @@ def cleanup_referer(referer: str) -> str:
domain = m.groups()[1].replace(subdomains, "")
location = m.groups()[3]
referer = domain
assert(len(domain.split(".")) == 2)
referer = domain.split(".")[0]
if not settings["referer_ranking_ignore_tld"]: referer += "." + domain.split(".")[1]
if not settings["referer_ranking_ignore_subdomain"]: referer = subdomains + referer
if not settings["referer_ranking_ignore_subdomain"]: referer = subdomains + referer
if not settings["referer_ranking_ignore_protocol"]: referer = protocol + "://" + referer
if not settings["referer_ranking_ignore_location"]: referer += location

View File

@ -8,18 +8,31 @@ from regina.db_operation.database import create_db
from regina.db_operation.visualize import visualize
from regina.utility.settings_manager import read_settings_file
from regina.utility.globals import settings, version
from regina.utility.utility import pmessage
"""
start regina, launch either collect or visualize
TODO:
- bei referrers ähnliche zusammenlegen, z.B. www.google.de und https://google.com
- optionen:
- unique user = ip address
- max requests/time
- fix datum im user and request count plot
- fix datum monat is 1 zu wenig
- checken warum last x days und total counts abweichen
- länder aus ip addresse
- "manuelle" datenbank bearbeitung in cli:
- user + alle seine requests löschen
- user agents:
- android vor linux suchen, oder linux durch X11 ersetzen
- alles was bot drin hat als bot betrachten
- wenn datenbankgröße zum problem wird:
- referrer table die die schon zusammengelegten referrer enthält, request verlinkt nur mit id
- selbes für platforms und browsers
- test:
- human detection
- referer cleanup
- schöne log nachrichten für die cron mail
- testing!
"""
@ -82,7 +95,6 @@ def main():
settings["version"] = version
if log_file: settings["access_log"] = log_file
print(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}' and logfile '{settings['access_log']}'")
if not settings["server_name"]: missing_arg("server-name")
if not settings["access_log"]: missing_arg("log")
@ -91,14 +103,20 @@ def main():
settings["auto_group_filetypes"] = settings["auto_group_filetypes"].split(",")
if isinstance(settings["locs_and_dirs"], str):
settings["locs_and_dirs"] = [ loc_and_dir.split(":") for loc_and_dir in settings["locs_and_dirs"].split(",") ]
if collect:
pmessage(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}' and logfile '{settings['access_log']}'")
if not isfile(settings["db"]):
create_db(settings["db"], settings["filegroups"], settings["locs_and_dirs"], settings["auto_group_filetypes"])
requests = parse_log(settings["access_log"])
add_requests_to_db(requests, settings["db"])
if visualize_:
elif visualize_:
pmessage(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}'")
if not isfile(settings["db"]): error(f"Invalid database path: '{settings['db']}'")
visualize(settings)
else:
error("Either --collect --visualize has to be provided")
if __name__ == '__main__':
main()

View File

@ -14,6 +14,7 @@ settings = {
"filegroups": "",
"request_location_regex_blacklist": "",
"unique_user_is_ip_address": False,
"user_get_country": True,
# VISUALIZATION
"get_human_percentage": False,
@ -25,6 +26,7 @@ settings = {
"referer_ranking_ignore_protocol": True,
"referer_ranking_ignore_subdomain": False,
"referer_ranking_ignore_location": True,
"referer_ranking_ignore_tld": False,
"referer_ranking_regex_whitelist": r"^[^\-].*", # minus means empty
"user_agent_ranking_regex_whitelist": r"",
"file_ranking_plot_max_files": 15,

View File

@ -6,12 +6,15 @@ from sys import exit
Various utilities
"""
DEBUG = True
def pdebug(*args):
if DEBUG: print(*args)
DEBUG = False
def pdebug(*args, **keys):
if DEBUG: print(*args, **keys)
def warning(*w):
print("Warning:", *w)
def warning(*w, **k):
print("Warning:", *w, **k)
def pmessage(*args, **keys):
print(*args, **keys)
def error(*arg):
print("Error:", *arg)

View File

@ -24,6 +24,8 @@ setup(
"Topic :: Utilities",
],
# data_files=[("ip2nation", ["ip2nation.sql", "ip2nation.db"])],
# scripts=["bin/nicole"],
entry_points={
"console_scripts": [ "regina=regina.main:main" ],