multiple fixes
This commit is contained in:
parent
3007f1ff8d
commit
7be6e67aaf
1
.gitignore
vendored
1
.gitignore
vendored
@ -2,3 +2,4 @@ testing/
|
||||
__pycache__
|
||||
build/
|
||||
regina.egg-info/
|
||||
regina/test/
|
||||
|
36
database.uxf
36
database.uxf
@ -4,8 +4,8 @@
|
||||
<element>
|
||||
<id>UMLClass</id>
|
||||
<coordinates>
|
||||
<x>364</x>
|
||||
<y>273</y>
|
||||
<x>247</x>
|
||||
<y>312</y>
|
||||
<w>299</w>
|
||||
<h>234</h>
|
||||
</coordinates>
|
||||
@ -26,8 +26,8 @@ style=autoresize</panel_attributes>
|
||||
<element>
|
||||
<id>UMLClass</id>
|
||||
<coordinates>
|
||||
<x>1092</x>
|
||||
<y>273</y>
|
||||
<x>975</x>
|
||||
<y>312</y>
|
||||
<w>234</w>
|
||||
<h>130</h>
|
||||
</coordinates>
|
||||
@ -43,8 +43,8 @@ style=autoresize</panel_attributes>
|
||||
<element>
|
||||
<id>Relation</id>
|
||||
<coordinates>
|
||||
<x>988</x>
|
||||
<y>273</y>
|
||||
<x>871</x>
|
||||
<y>312</y>
|
||||
<w>130</w>
|
||||
<h>65</h>
|
||||
</coordinates>
|
||||
@ -57,8 +57,8 @@ m2=1
|
||||
<element>
|
||||
<id>UMLClass</id>
|
||||
<coordinates>
|
||||
<x>754</x>
|
||||
<y>260</y>
|
||||
<x>637</x>
|
||||
<y>299</y>
|
||||
<w>247</w>
|
||||
<h>221</h>
|
||||
</coordinates>
|
||||
@ -79,8 +79,8 @@ style=autoresize</panel_attributes>
|
||||
<element>
|
||||
<id>Relation</id>
|
||||
<coordinates>
|
||||
<x>650</x>
|
||||
<y>273</y>
|
||||
<x>533</x>
|
||||
<y>312</y>
|
||||
<w>130</w>
|
||||
<h>65</h>
|
||||
</coordinates>
|
||||
@ -93,8 +93,8 @@ m2=n
|
||||
<element>
|
||||
<id>UMLClass</id>
|
||||
<coordinates>
|
||||
<x>1092</x>
|
||||
<y>546</y>
|
||||
<x>975</x>
|
||||
<y>585</y>
|
||||
<w>234</w>
|
||||
<h>130</h>
|
||||
</coordinates>
|
||||
@ -111,8 +111,8 @@ style=autoresize</panel_attributes>
|
||||
<element>
|
||||
<id>Relation</id>
|
||||
<coordinates>
|
||||
<x>1131</x>
|
||||
<y>390</y>
|
||||
<x>1014</x>
|
||||
<y>429</y>
|
||||
<w>52</w>
|
||||
<h>182</h>
|
||||
</coordinates>
|
||||
@ -125,8 +125,8 @@ m2=1
|
||||
<element>
|
||||
<id>UMLNote</id>
|
||||
<coordinates>
|
||||
<x>897</x>
|
||||
<y>117</y>
|
||||
<x>780</x>
|
||||
<y>156</y>
|
||||
<w>390</w>
|
||||
<h>91</h>
|
||||
</coordinates>
|
||||
@ -139,8 +139,8 @@ style=autoresize</panel_attributes>
|
||||
<element>
|
||||
<id>Relation</id>
|
||||
<coordinates>
|
||||
<x>1105</x>
|
||||
<y>195</y>
|
||||
<x>988</x>
|
||||
<y>234</y>
|
||||
<w>39</w>
|
||||
<h>104</h>
|
||||
</coordinates>
|
||||
|
@ -27,6 +27,10 @@ humans_need_success = True
|
||||
# don't collect requests to locations matched by this regex
|
||||
request_location_regex_blacklist = /analytics.*
|
||||
|
||||
# get the country of each user
|
||||
user_get_country = True
|
||||
|
||||
|
||||
# VISUALIZATION
|
||||
# separate users into all and humans
|
||||
get_human_percentage = True
|
||||
|
@ -1,3 +1,6 @@
|
||||
"""Gather analytics from nginx access logs and visualize them through generated images and a generated html"""
|
||||
# __package__ = 'regina'
|
||||
import regina.utility
|
||||
|
||||
from importlib import resources
|
||||
# ip2nation_db_path = resources.path("regina", "ip2nation.db")
|
||||
|
@ -4,13 +4,13 @@ from time import mktime
|
||||
from datetime import datetime as dt
|
||||
from regina.db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup
|
||||
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
|
||||
from regina.utility.utility import pdebug, warning
|
||||
from regina.utility.utility import pdebug, warning, pmessage
|
||||
from regina.utility.globals import user_agent_operating_systems, user_agent_browsers, settings
|
||||
|
||||
"""
|
||||
collect information from the access log and put it into the database
|
||||
"""
|
||||
months = ["Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aut", "Sep", "Oct", "Nov", "Dez"]
|
||||
# English month abbreviations as written in the access log timestamp;
# the list position + 1 is used as the calendar month number, so the order
# and spelling must match the log exactly ("Aug"/"Dec", not "Aut"/"Dez").
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
|
||||
|
||||
|
||||
|
||||
@ -23,13 +23,16 @@ class Request:
|
||||
if m:
|
||||
g = m.groups()
|
||||
try:
|
||||
datetime_ = dt(int(g[2]), months.index(g[1])+1, int(g[0]), int(g[3]), int(g[4]), int(g[5]))
|
||||
self.time_local = int(mktime(datetime_.timetuple()))
|
||||
if g[1] in months:
|
||||
datetime_ = dt(int(g[2]), months.index(g[1])+1, int(g[0]), int(g[3]), int(g[4]), int(g[5]))
|
||||
# pdebug(f"Request __init__: datetime {datetime_}, from {g}")
|
||||
self.time_local = int(mktime(datetime_.timetuple()))
|
||||
else:
|
||||
warning(f"Request:__init__: Unkown month: '{g[1]}'. Using timestamp {self.time_local}")
|
||||
except Exception as e:
|
||||
warning(f"Request:__init__: {e}")
|
||||
else:
|
||||
warning(f"Request:__init__: Could not match time: '{time_local}'")
|
||||
|
||||
self.request_type = sanitize(request_type)
|
||||
self.request_file = sanitize(request_file)
|
||||
self.request_protocol = sanitize(request_protocol)
|
||||
@ -93,7 +96,7 @@ def get_user_id(request: Request, cursor: sql.Cursor) -> int:
|
||||
else: # new user
|
||||
# new user_id is number of elements
|
||||
user_id: int = sql_tablesize(cursor, t_user)
|
||||
pdebug("new user:", user_id, request.ip_address)
|
||||
# pdebug("new user:", user_id, request.ip_address)
|
||||
platform, browser, mobile = get_os_browser_pairs_from_agent(request.user_agent)
|
||||
is_human = 0 # is_user_human cannot be called until user is in db int(is_user_human(cursor, user_id))
|
||||
cursor.execute(f"INSERT INTO {t_user} (user_id, ip_address, user_agent, platform, browser, mobile, is_human) VALUES ({user_id}, '{request.ip_address}', '{request.user_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}');")
|
||||
@ -107,10 +110,21 @@ def is_user_human(cur: sql.Cursor, user_id: int):
|
||||
"""
|
||||
max_success_status = 400
|
||||
if settings["status_300_is_success"]: max_success_status = 300
|
||||
cur.execute(f"SELECT browser, platform FROM {t_user} WHERE user_id = {user_id}")
|
||||
browsers_and_platforms = cur.fetchall()
|
||||
if len(browsers_and_platforms) != 1:
|
||||
pdebug(f"is_user_human: {user_id} - could not find user or found too many")
|
||||
return False
|
||||
if not browsers_and_platforms[0][0] in user_agent_browsers:
|
||||
return False
|
||||
if not browsers_and_platforms[0][1] in user_agent_operating_systems:
|
||||
return False
|
||||
# check if has browser
|
||||
cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_user} WHERE user_id = {user_id} AND platform IS NOT NULL AND browser IS NOT NULL)")
|
||||
# cur.execute(f"SELECT EXISTS (SELECT 1 FROM {t_user} WHERE user_id = {user_id} AND platform IS NOT NULL AND browser IS NOT NULL)")
|
||||
# if no browser and platform
|
||||
if cur.fetchone()[0] == 0: return False
|
||||
# exists = cur.fetchone()
|
||||
# if exists is None or exists[0] == 0:
|
||||
# return False
|
||||
# if human needs successful request
|
||||
if settings["human_needs_success"]:
|
||||
# check if at least one request was successful (status < 400)
|
||||
@ -144,9 +158,21 @@ def get_os_browser_pairs_from_agent(user_agent):
|
||||
return operating_system, browser, mobile
|
||||
|
||||
|
||||
# def set_countries(cur: sql.Cursor, user_ids: list[int]):
|
||||
# if settings["user_get_country"]:
|
||||
# ipconn = sql.connect(ip2nation_db_path)
|
||||
# ipcur = ipconn.cursor()
|
||||
# for user_id in user_ids:
|
||||
# ip_address = sql_select(cur, t_user, [("user_id", user_id)])
|
||||
# cur.execute(f"SELECT ip_address FROM {t_user} WHERE user_id = {user_id}")
|
||||
# ip_address = cur.fetchall()[0][0]
|
||||
# ipcur.execute("SELECT iso_code_3 FROM ip2nationCountries WHERE ip")
|
||||
|
||||
|
||||
def add_requests_to_db(requests: list[Request], db_name: str):
|
||||
conn = sql.connect(db_name)
|
||||
cursor = conn.cursor()
|
||||
added_requests = 0
|
||||
# check the new users later
|
||||
max_user_id = sql_tablesize(cursor, t_user)
|
||||
request_blacklist = settings["request_location_regex_blacklist"]
|
||||
@ -154,7 +180,9 @@ def add_requests_to_db(requests: list[Request], db_name: str):
|
||||
request = requests[i]
|
||||
# skip requests to blacklisted locations
|
||||
if request_blacklist:
|
||||
if match(request_blacklist, request.request_file): continue
|
||||
if match(request_blacklist, request.request_file):
|
||||
# pdebug(f"add_requests_to_db: request on blacklist '{request.request_file}'")
|
||||
continue
|
||||
# pdebug("add_requests_to_db:", i, "request:", request)
|
||||
user_id = get_user_id(request, cursor)
|
||||
conn.commit()
|
||||
@ -169,9 +197,14 @@ def add_requests_to_db(requests: list[Request], db_name: str):
|
||||
# pdebug("new request:", request)
|
||||
request_id = sql_tablesize(cursor, t_request)
|
||||
sql_insert(cursor, t_request, [[request_id, user_id, group_id, request.time_local, request.referer, request.status]])
|
||||
for user_id in range(max_user_id, sql_tablesize(cursor, t_user)):
|
||||
added_requests += 1
|
||||
user_count = sql_tablesize(cursor, t_user)
|
||||
for user_id in range(max_user_id, user_count):
|
||||
is_human = is_user_human(cursor, user_id)
|
||||
cursor.execute(f"SELECT * FROM {t_user} WHERE user_id = {user_id}")
|
||||
# pdebug(f"add_rq_to_db: {user_id} is_human? {is_human}, {cursor.fetchall()}")
|
||||
if is_human:
|
||||
cursor.execute(f"UPDATE {t_user} SET is_human = 1 WHERE user_id = {user_id}")
|
||||
cursor.close()
|
||||
conn.commit()
|
||||
pmessage(f"Collection Summary: Added {user_count - max_user_id} new users and {added_requests} new requests.")
|
||||
|
@ -47,7 +47,16 @@ filegroup_id = Entry("group_id", "INTEGER")
|
||||
ip_address_entry = Entry("ip_address", "TEXT")
|
||||
filename_entry = Entry("filename", "TEXT")
|
||||
database_tables = {
|
||||
t_user: Table(t_user, user_id, [Entry("ip_address", "TEXT"), Entry("user_agent", "TEXT"), Entry("platform", "TEXT"), Entry("browser", "TEXT"), Entry("mobile", "INTEGER"), Entry("is_human", "INTEGER")], [f"UNIQUE({user_id.name})"]),
|
||||
t_user: Table(t_user, user_id, [
|
||||
Entry("ip_address", "TEXT"),
|
||||
Entry("user_agent", "TEXT"),
|
||||
Entry("platform", "TEXT"),
|
||||
Entry("browser", "TEXT"),
|
||||
Entry("mobile", "INTEGER"),
|
||||
Entry("is_human", "INTEGER"),
|
||||
# Entry("country_iso_code_3", "TEXT")
|
||||
],
|
||||
[f"UNIQUE({user_id.name})"]),
|
||||
t_file: Table(t_file, filename_entry, [filegroup_id], [f"UNIQUE({filename_entry.name})"]),
|
||||
t_filegroup: Table(t_filegroup, filegroup_id, [Entry("groupname", "TEXT")], [f"UNIQUE({filegroup_id.name})"]),
|
||||
t_request: Table(t_request, request_id, [
|
||||
@ -73,7 +82,7 @@ def get_filegroup(filename: str, cursor: sql.Cursor) -> int:
|
||||
cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname = '{suffix}'")
|
||||
# cursor.execute(f"SELECT group_id FROM {t_filegroup} WHERE groupname LIKE '%.{suffix}'")
|
||||
group_id_candidates = cursor.fetchall()
|
||||
pdebug(f"get_filegroup: file={filename} candidates={group_id_candidates}")
|
||||
# pdebug(f"get_filegroup: file={filename} candidates={group_id_candidates}")
|
||||
if group_id_candidates:
|
||||
return group_id_candidates[0][0]
|
||||
else: # add new group file filename
|
||||
|
@ -321,7 +321,10 @@ def cleanup_referer(referer: str) -> str:
|
||||
domain = m.groups()[1].replace(subdomains, "")
|
||||
location = m.groups()[3]
|
||||
|
||||
referer = domain
|
||||
assert(len(domain.split(".")) == 2)
|
||||
referer = domain.split(".")[0]
|
||||
if not settings["referer_ranking_ignore_tld"]: referer += "." + domain.split(".")[1]
|
||||
if not settings["referer_ranking_ignore_subdomain"]: referer = subdomains + referer
|
||||
if not settings["referer_ranking_ignore_subdomain"]: referer = subdomains + referer
|
||||
if not settings["referer_ranking_ignore_protocol"]: referer = protocol + "://" + referer
|
||||
if not settings["referer_ranking_ignore_location"]: referer += location
|
||||
|
@ -8,18 +8,31 @@ from regina.db_operation.database import create_db
|
||||
from regina.db_operation.visualize import visualize
|
||||
from regina.utility.settings_manager import read_settings_file
|
||||
from regina.utility.globals import settings, version
|
||||
from regina.utility.utility import pmessage
|
||||
|
||||
"""
|
||||
start regina, launch either collect or visualize
|
||||
TODO:
|
||||
- merge similar referrers, e.g. www.google.de and https://google.com
|
||||
- options:
|
||||
- unique user = ip address
|
||||
- max requests/time
|
||||
|
||||
- fix the date in the user and request count plot
|
||||
- fix the date: the month is 1 too low
|
||||
- check why the last x days and total counts differ
|
||||
- get countries from the ip address
|
||||
- "manual" database editing in the cli:
|
||||
- delete a user + all of their requests
|
||||
- user agents:
|
||||
- search for android before linux, or replace linux with X11
|
||||
- treat everything that contains "bot" as a bot
|
||||
- if database size becomes a problem:
|
||||
- referrer table that contains the already merged referrers, request only links to it by id
|
||||
- same for platforms and browsers
|
||||
- test:
|
||||
- human detection
|
||||
- referer cleanup
|
||||
- nice log messages for the cron mail
|
||||
- testing!
|
||||
"""
|
||||
|
||||
|
||||
@ -82,7 +95,6 @@ def main():
|
||||
settings["version"] = version
|
||||
if log_file: settings["access_log"] = log_file
|
||||
|
||||
print(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}' and logfile '{settings['access_log']}'")
|
||||
|
||||
if not settings["server_name"]: missing_arg("server-name")
|
||||
if not settings["access_log"]: missing_arg("log")
|
||||
@ -91,14 +103,20 @@ def main():
|
||||
settings["auto_group_filetypes"] = settings["auto_group_filetypes"].split(",")
|
||||
if isinstance(settings["locs_and_dirs"], str):
|
||||
settings["locs_and_dirs"] = [ loc_and_dir.split(":") for loc_and_dir in settings["locs_and_dirs"].split(",") ]
|
||||
|
||||
|
||||
if collect:
|
||||
pmessage(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}' and logfile '{settings['access_log']}'")
|
||||
if not isfile(settings["db"]):
|
||||
create_db(settings["db"], settings["filegroups"], settings["locs_and_dirs"], settings["auto_group_filetypes"])
|
||||
requests = parse_log(settings["access_log"])
|
||||
add_requests_to_db(requests, settings["db"])
|
||||
if visualize_:
|
||||
elif visualize_:
|
||||
pmessage(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}'")
|
||||
if not isfile(settings["db"]): error(f"Invalid database path: '{settings['db']}'")
|
||||
visualize(settings)
|
||||
else:
|
||||
error("Either --collect --visualize has to be provided")
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
@ -14,6 +14,7 @@ settings = {
|
||||
"filegroups": "",
|
||||
"request_location_regex_blacklist": "",
|
||||
"unique_user_is_ip_address": False,
|
||||
"user_get_country": True,
|
||||
|
||||
# VISUALIZATION
|
||||
"get_human_percentage": False,
|
||||
@ -25,6 +26,7 @@ settings = {
|
||||
"referer_ranking_ignore_protocol": True,
|
||||
"referer_ranking_ignore_subdomain": False,
|
||||
"referer_ranking_ignore_location": True,
|
||||
"referer_ranking_ignore_tld": False,
|
||||
"referer_ranking_regex_whitelist": r"^[^\-].*", # minus means empty
|
||||
"user_agent_ranking_regex_whitelist": r"",
|
||||
"file_ranking_plot_max_files": 15,
|
||||
|
@ -6,12 +6,15 @@ from sys import exit
|
||||
Various utilities
|
||||
"""
|
||||
|
||||
DEBUG = True
|
||||
def pdebug(*args):
|
||||
if DEBUG: print(*args)
|
||||
DEBUG = False
|
||||
def pdebug(*args, **keys):
|
||||
if DEBUG: print(*args, **keys)
|
||||
|
||||
def warning(*w):
|
||||
print("Warning:", *w)
|
||||
def warning(*w, **k):
|
||||
print("Warning:", *w, **k)
|
||||
|
||||
def pmessage(*args, **keys):
|
||||
print(*args, **keys)
|
||||
|
||||
def error(*arg):
|
||||
print("Error:", *arg)
|
||||
|
Loading…
x
Reference in New Issue
Block a user