Refactored
This commit is contained in:
parent
0e0ece77ea
commit
0f9a46a115
@ -7,14 +7,14 @@ collect information from the access log and put it into the database
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
re_remote_addr = r"[0-9a-fA-F.:]+"
|
re_remote_addr = r"[0-9a-fA-F.:]+"
|
||||||
re_remote_visitor = ".*"
|
re_remote_user = ".*"
|
||||||
re_time_local = r"\[.+\]"
|
re_time_local = r"\[.+\]"
|
||||||
re_request = r'"[^"]+"'
|
re_request = r'"[^"]+"'
|
||||||
re_status = r'\d+'
|
re_status = r'\d+'
|
||||||
re_body_bytes_sent = r'\d+'
|
re_body_bytes_sent = r'\d+'
|
||||||
re_http_referer = r'"([^"]*)"'
|
re_http_referer = r'"([^"]*)"'
|
||||||
re_http_visitor_agent = r'"([^"]*)"'
|
re_http_user_agent = r'"([^"]*)"'
|
||||||
re_log_format: str = f'({re_remote_addr}) - ({re_remote_visitor}) ({re_time_local}) ({re_request}) ({re_status}) ({re_body_bytes_sent}) {re_http_referer} {re_http_visitor_agent}'
|
re_log_format: str = f'({re_remote_addr}) - ({re_remote_user}) ({re_time_local}) ({re_request}) ({re_status}) ({re_body_bytes_sent}) {re_http_referer} {re_http_user_agent}'
|
||||||
|
|
||||||
def parse_log(logfile_path:str) -> list[Request]:
|
def parse_log(logfile_path:str) -> list[Request]:
|
||||||
"""
|
"""
|
||||||
@ -23,19 +23,21 @@ def parse_log(logfile_path:str) -> list[Request]:
|
|||||||
requests = []
|
requests = []
|
||||||
with open(logfile_path, "r") as file:
|
with open(logfile_path, "r") as file:
|
||||||
lines = file.readlines()
|
lines = file.readlines()
|
||||||
for line in lines:
|
for i in range(len(lines)):
|
||||||
m = match(re_log_format, line)
|
m = match(re_log_format, lines[i])
|
||||||
if m is None:
|
if m is None:
|
||||||
warning(f"parse_log: Unmatched line: '{line}'")
|
warning(f"parse_log: Could not match line {i:3}: '{lines[i]}'")
|
||||||
continue
|
continue
|
||||||
# print(m.groups())
|
pdebug(f"parse_log: line {i:3} match groups:", m.groups(), lvl=4)
|
||||||
g = m.groups()
|
# _ is user
|
||||||
request_ = m.groups()[3].split(" ")
|
ip_address, _, timestamp, request_, status, bytes_sent, referer, user_agent = m.groups()
|
||||||
if len(request_) != 3:
|
request_parts = request_.split(" ")
|
||||||
warning(f"parse_log: len('{m.groups()[3]}'.split(' ')) is {len(request_)} and not 3")
|
if len(request_parts) != 3:
|
||||||
|
warning(f"parse_log: Could not parse request of line {i:3}: '{request_}'")
|
||||||
continue
|
continue
|
||||||
requests.append(Request(ip_address=g[0], time_local=g[2],
|
http_function, route, protocol = request_parts
|
||||||
request_type=request_[0], request_route=request_[1], request_protocol=request_[2],
|
requests.append(Request(ip_address=ip_address, time_local=timestamp,
|
||||||
status=g[4], bytes_sent=g[5], referer=g[6], user_agent=g[7]))
|
request_type=http_function, request_route=route, request_protocol=protocol,
|
||||||
|
status=status, bytes_sent=bytes_sent, referer=referer, user_agent=user_agent))
|
||||||
return requests
|
return requests
|
||||||
|
|
||||||
|
@ -5,7 +5,7 @@ from datetime import datetime as dt
|
|||||||
|
|
||||||
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max
|
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_max
|
||||||
from regina.utility.utility import pdebug, warning, pmessage
|
from regina.utility.utility import pdebug, warning, pmessage
|
||||||
from regina.utility.globals import visitor_agent_operating_systems, visitor_agent_browsers, settings
|
from regina.utility.globals import user_agent_platforms, user_agent_browsers, settings
|
||||||
|
|
||||||
# English month abbreviations as they appear in access-log timestamps; the list index is month number - 1
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
|
# English month abbreviations as they appear in access-log timestamps; the list index is month number - 1
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
|
||||||
|
|
||||||
@ -28,21 +28,21 @@ class Request:
|
|||||||
warning(f"Request:__init__: {e}")
|
warning(f"Request:__init__: {e}")
|
||||||
else:
|
else:
|
||||||
warning(f"Request:__init__: Could not match time: '{time_local}'")
|
warning(f"Request:__init__: Could not match time: '{time_local}'")
|
||||||
self.request_type = sanitize(request_type)
|
self.type = sanitize(request_type) # GET, POST, ...
|
||||||
self.request_route = sanitize(request_route)
|
self.route = sanitize(request_route) # eg. /index.html
|
||||||
self.request_protocol = sanitize(request_protocol)
|
self.protocol = sanitize(request_protocol) # eg. HTTP/1.1
|
||||||
self.status = sanitize(status)
|
self.status = sanitize(status) # http status code
|
||||||
self.bytes_sent = sanitize(bytes_sent)
|
self.bytes_sent = sanitize(bytes_sent)
|
||||||
self.referer = sanitize(referer)
|
self.referer = sanitize(referer)
|
||||||
self.user_agent = sanitize(user_agent)
|
self.user_agent = sanitize(user_agent)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return f"{self.ip_address} - {self.time_local} - {self.request_route} - {self.user_agent} - {self.status}"
|
return f"{self.ip_address} - {self.time_local} - {self.route} - {self.user_agent} - {self.status}"
|
||||||
|
|
||||||
def get_platform(self):
|
def get_platform(self):
|
||||||
# for groups in findall(re_visitor_agent, visitor_agent):
|
# for groups in findall(re_visitor_agent, visitor_agent):
|
||||||
operating_system = ""
|
operating_system = ""
|
||||||
for os in visitor_agent_operating_systems:
|
for os in user_agent_platforms:
|
||||||
if os in self.user_agent:
|
if os in self.user_agent:
|
||||||
operating_system = os
|
operating_system = os
|
||||||
break
|
break
|
||||||
@ -50,7 +50,7 @@ class Request:
|
|||||||
|
|
||||||
def get_browser(self):
|
def get_browser(self):
|
||||||
browser = ""
|
browser = ""
|
||||||
for br in visitor_agent_browsers:
|
for br in user_agent_browsers:
|
||||||
if br in self.user_agent:
|
if br in self.user_agent:
|
||||||
browser = br
|
browser = br
|
||||||
break
|
break
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
# from sys import path
|
# from sys import path
|
||||||
# print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}")
|
# print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}")
|
||||||
from sys import exit
|
from sys import exit, stderr
|
||||||
from os import path
|
from os import path, makedirs
|
||||||
from re import fullmatch
|
from re import fullmatch, Pattern
|
||||||
|
|
||||||
from regina.utility.globals import settings
|
from regina.utility.globals import settings
|
||||||
|
|
||||||
@ -10,49 +10,73 @@ from regina.utility.globals import settings
|
|||||||
Various utilities
|
Various utilities
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def is_whitelisted(val: str, whitelist: str|list[str]|None):
|
def _fullmatch(val, regexp, match_none=True):
    """
    Check if val fully matches regexp.

    regexp can be:
        None (or anything empty)  -> return match_none
        str                       -> interpreted as a regular expression
        re.Pattern                -> compiled expression; an empty pattern returns match_none
        list/tuple of the above   -> True is returned if val matches any of the entries

    match_none is handed down to the entries of a list, so that an empty entry
    behaves like an empty filter of the same kind (whitelist: matches everything,
    blacklist: matches nothing) instead of always matching.

    Any other type is reported with a warning and counts as "no match".
    Always returns a bool.
    """
    if not regexp:
        return match_none
    if isinstance(regexp, str):
        return fullmatch(regexp, val) is not None
    if isinstance(regexp, (list, tuple)):
        return any(_fullmatch(val, entry, match_none=match_none) for entry in regexp)
    if isinstance(regexp, Pattern):
        # compiled patterns are always truthy, so re.compile('') has to be caught here
        if not regexp.pattern:
            return match_none
        return regexp.fullmatch(val) is not None
    warning(f"_fullmatch: Unsupported regexp type: {type(regexp)}")
    return False
|
||||||
|
|
||||||
|
def is_whitelisted(val: str, whitelist: str|Pattern|None|list) -> bool:
|
||||||
"""
|
"""
|
||||||
Check if val is in a regex whitelist
|
Check if val is in a regex whitelist
|
||||||
whitelist: regexp, list of regexp or None
|
whitelist: regexp as str or compiled pattern or None, or a list of them
|
||||||
if whitelist is None, always return True
|
if whitelist is None, always return True
|
||||||
"""
|
"""
|
||||||
if not whitelist: return True
|
wl = _fullmatch(val, whitelist)
|
||||||
if type(whitelist) == str:
|
if not wl: pdebug(f"is_whitelisted: value='{val}' is not on whitelist: '{whitelist}'", lvl=4)
|
||||||
return fullmatch(whitelist, val)
|
return wl
|
||||||
if type(whitelist) == list:
|
|
||||||
for w in whitelist:
|
|
||||||
if not fullmatch(w, val): return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
def is_blacklisted(val: str, blacklist: str|list[str]|None):
|
def is_blacklisted(val: str, blacklist: str|Pattern|None|list):
|
||||||
"""
|
"""
|
||||||
Check if val is in a regex blacklist
|
Check if val is in a regex blacklist
|
||||||
blacklist: regexp, list of regexp or None
|
blacklist: regexp as str or compiled pattern or None, or a list of them
|
||||||
if blacklist is None, always return False
|
if blacklist is None, always return False
|
||||||
"""
|
"""
|
||||||
return not is_whitelisted(val, blacklist)
|
bl = _fullmatch(val, blacklist, match_none=False)
|
||||||
|
if bl: pdebug(f"is_blacklisted: value='{val}' is blacklisted: '{blacklist}'", lvl=4)
|
||||||
|
return bl
|
||||||
|
|
||||||
|
|
||||||
def pdebug(*args, **keys):
|
def pdebug(*args, lvl=2, **keys):
|
||||||
if settings["debug"]: print(*args, **keys)
|
if settings["debug"]["debug_level"] >= lvl: print(*args, **keys)
|
||||||
|
|
||||||
def warning(*w, **k):
|
def warning(*w, **k):
|
||||||
print("Warning:", *w, **k)
|
print("Warning:", *w, file=stderr, **k)
|
||||||
|
|
||||||
def pmessage(*args, **keys):
|
def pmessage(*args, **keys):
|
||||||
print(*args, **keys)
|
print(*args, **keys)
|
||||||
|
|
||||||
def error(*arg):
|
def error(*args, errno: int=1, **k):
|
||||||
print("Error:", *arg)
|
print("Error:", *args, file=stderr, **k)
|
||||||
exit(1)
|
exit(errno)
|
||||||
|
|
||||||
def missing_arg_val(arg):
|
def dict_str(d: dict):
|
||||||
print("Missing argument for", arg)
|
"""nicer string for dictionaries"""
|
||||||
exit(1)
|
s = ""
|
||||||
|
for k, v in d.items():
|
||||||
def missing_arg(arg):
|
s += f"{k}:\t{v}\n"
|
||||||
print("Missing ", arg)
|
return s.strip("\n")
|
||||||
exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
def get_filepath(filename, directories: list):
|
def get_filepath(filename, directories: list):
|
||||||
@ -62,3 +86,9 @@ def get_filepath(filename, directories: list):
|
|||||||
if path.isfile(p):
|
if path.isfile(p):
|
||||||
return p
|
return p
|
||||||
raise FileNotFoundError(f"{filename} not in {directories}")
|
raise FileNotFoundError(f"{filename} not in {directories}")
|
||||||
|
|
||||||
|
def make_parent_dirs(p):
    """
    Make sure the directory that contains path p exists.

    Missing ancestors are created as well. A path without a directory
    component (e.g. "file.txt") has no parent to create and is left alone.
    """
    parent = path.dirname(p)
    # dirname() is '' for a bare filename, and makedirs('') raises FileNotFoundError
    if not parent or path.isdir(parent):
        return
    pdebug(f"make_parent_dirs: Making directory '{parent}'", lvl=2)
    # exist_ok: do not fail if the directory appears between the isdir check and this call
    makedirs(parent, exist_ok=True)
|
||||||
|
Loading…
Reference in New Issue
Block a user