From 4a97335b9608dbccc751bfeec4272dd9509253d2 Mon Sep 17 00:00:00 2001 From: "matthias@arch" Date: Mon, 15 May 2023 21:58:02 +0200 Subject: [PATCH] refactored visualization rankings and statistics now use more sql features for better performance added data export changed html variable names --- regina/data_visualization/history.py | 53 +++ regina/data_visualization/ranking.py | 251 ++++++++----- regina/data_visualization/utility.py | 116 ++---- regina/data_visualization/visualize.py | 483 ++++++++++++++----------- template.html | 49 +-- 5 files changed, 545 insertions(+), 407 deletions(-) create mode 100644 regina/data_visualization/history.py diff --git a/regina/data_visualization/history.py b/regina/data_visualization/history.py new file mode 100644 index 0000000..414540d --- /dev/null +++ b/regina/data_visualization/history.py @@ -0,0 +1,53 @@ +from regina.database import Database + +def get_visitor_count_between(db: Database, timestamps: tuple[int, int], only_human=False): + return db(f"""SELECT COUNT(visitor_id) + FROM visitor AS v + WHERE EXISTS ( + SELECT 1 + FROM request as r + WHERE r.visitor_id = v.visitor_id + AND r.time BETWEEN {timestamps[0]} AND {timestamps[1]} + ) + {'AND v.is_human = 1' if only_human else ''}""")[0][0] + +def get_request_count_between(db: Database, timestamps: tuple[int, int], only_human=False): + return db(f"""SELECT COUNT(r.request_id) + FROM request AS r, visitor AS v + WHERE r.time BETWEEN {timestamps[0]} AND {timestamps[1]} + {'AND v.is_human = 1' if only_human else ''}""")[0][0] + + +def get_new_visitor_count_between(db: Database, timestamps: tuple[int, int]): + return db(f"""SELECT COUNT(*) + FROM visitor AS v + JOIN ( + SELECT visitor_id, MIN(time) AS first_request_time + FROM request + GROUP BY visitor_id + ) AS r ON v.visitor_id = r.visitor_id + WHERE r.first_request_time BETWEEN {timestamps[0]} AND {timestamps[1]}""")[0][0] + +def get_request_from_new_visitor_count_between(db: Database, timestamps: tuple[int, int]): + 
return db(f"""SELECT COUNT(*) + FROM request AS r + JOIN ( + SELECT visitor_id, MIN(time) AS first_request_time + FROM request + GROUP BY visitor_id + ) AS v ON r.visitor_id = v.visitor_id + WHERE v.first_request_time BETWEEN {timestamps[0]} AND {timestamps[1]}""")[0][0] + + +def get_mobile_visitor_count_between(db: Database, timestamps: tuple[int, int], only_human=True) -> float: + return db(f"""SELECT COUNT(*) + FROM visitor AS v + WHERE EXISTS ( + SELECT 1 + FROM request as r + WHERE r.visitor_id = v.visitor_id + AND r.time BETWEEN {timestamps[0]} AND {timestamps[1]} + ) + {'AND v.is_human = 1' if only_human else ''} + AND v.is_mobile = 1""")[0][0] + diff --git a/regina/data_visualization/ranking.py b/regina/data_visualization/ranking.py index 273957c..07c953f 100644 --- a/regina/data_visualization/ranking.py +++ b/regina/data_visualization/ranking.py @@ -2,54 +2,68 @@ from re import fullmatch from regina.database import Database from regina.utility.globals import settings -from regina.utility.utility import pdebug, warning, missing_arg, is_blacklisted, is_whitelisted +from regina.utility.utility import pdebug, warning, is_blacklisted, is_whitelisted +from regina.utility.sql_util import sanitize from regina.data_visualization.utility import is_valid_status, cleanup_referer -def get_route_ranking(db: Database, date_condition:str) -> list[tuple[int, str]]: +def get_route_ranking(db: Database, timestamps: tuple[int, int]) -> list[tuple[int, str]]: """ :returns [(request_count, route name)] """ ranking = [] for (route_id, name) in db(f"SELECT route_id, name FROM route"): - if is_blacklisted(name, settings["route_ranking_blacklist"]): continue - if not is_whitelisted(name, settings["route_ranking_whitelist"]): continue - if settings["route_ranking_ignore_404"]: # use only succesful routes + if is_blacklisted(name, settings["rankings"]["route_blacklist"]): continue + if not is_whitelisted(name, settings["rankings"]["route_whitelist"]): continue + if 
settings["rankings"]["route_ignore_404"]: # use only succesful routes success = False - for (status) in db(f"SELECT status FROM request WHERE route_id = {route_id}"): + for (status, ) in db(f"SELECT status FROM request WHERE route_id = {route_id}"): if is_valid_status(status): - pdebug(f"get_route_ranking: success code {status} for route with route_id {route_id} and name {name}") + pdebug(f"get_route_ranking: success code {status} for route with route_id {route_id} and name {name}", lvl=4) success = True break if not success: - pdebug(f"get_route_ranking: route with route_id {route_id} and name {name} has only requests resulting in error") + pdebug(f"get_route_ranking: route with route_id {route_id} and name {name} has only requests resulting in error", lvl=3) continue - db.execute(f"SELECT COUNT(*) FROM request WHERE route_id = {route_id} AND {date_condition}") + db.execute(f"SELECT COUNT(*) FROM request WHERE route_id = {route_id} AND time BETWEEN {timestamps[0]} AND {timestamps[1]}") ranking.append((db.fetchone()[0], name)) ranking.sort() return ranking - -def get_ranking(db: Database, table: str, field_name: str, date_condition:str, whitelist_regex: str|list[str]|None=None, blacklist_regex: str|list[str]|None=None) -> list[tuple[int, str]]: +def route_ranking_group_routes(route_ranking: list[tuple[int, str]]): """ - 1) get all the distinct entries for field_name after min_date_unix_time - 2) call get_name_function with the distinct entry - 3) skip if not fully matching regex whitelist - 4) skip if fully matching regex blacklist - 5) for every entry, get the count in table after min_date_unix_time - 6) sort by count in ascending order - @returns [(count, name)] + group the routes in the route ranking according the groups defined in the config section "route-groups" + """ + ranking = {} + for count, route in route_ranking: + ingroup = False + for group_name, group_regexp in settings["route-groups"].items(): + if fullmatch(group_regexp, route): + if group_name in 
ranking: + ranking[group_name] += count + else: + ranking[group_name] = count + ingroup = True + if not ingroup: + ranking[route] = count + ranking = [ (c, name) for name, c in ranking.items() ] + ranking.sort() + return ranking + + +def get_referer_ranking(db: Database, timestamps: tuple[int, int]) -> list[tuple[int, str]]: + """ + @returns [(count, referer)] """ ranking = [] - for (name) in db(f"SELECT DISTINCT {field_name} FROM {table} WHERE {date_condition}"): - if is_blacklisted(name, blacklist_regex): continue - if not is_whitelisted(name, whitelist_regex): continue - db.execute(f"SELECT COUNT(*) FROM {table} WHERE {field_name} = '{name}' AND {date_condition}") + for referer_id, name in db(f"SELECT referer_id, name FROM referer"): + if is_blacklisted(name, settings["rankings"]["referer_blacklist"]): continue + if not is_whitelisted(name, settings["rankings"]["referer_whitelist"]): continue + db.execute(f"SELECT COUNT(*) FROM request WHERE referer_id = {referer_id} AND time BETWEEN {timestamps[0]} AND {timestamps[1]}") ranking.append((db.fetchone()[0], name)) ranking.sort() return ranking - def cleanup_referer_ranking(referer_ranking: list[tuple[int, str]]): unique_referers = dict() for count, referer in referer_ranking: @@ -64,88 +78,129 @@ def cleanup_referer_ranking(referer_ranking: list[tuple[int, str]]): referer_ranking.sort() -def get_city_and_country_ranking(db: Database, require_humans=True): +def get_city_ranking(db: Database, timestamps: tuple[int, int], add_country_code=True, only_human=True): """ - @returns [(count, "city (CO)")], [(count, country)] + @returns [(count, city (Country Code))] """ - cities_dict = {} - country_dict = {} - - sql_cmd = f"SELECT ci.name, co.code, co.name FROM country AS co, city as ci, visitor as v, ip_range as i WHERE v.ip_range_id = i.ip_range_id AND i.city_id = ci.city_id AND ci.country_id = co.country_id" - if require_humans: sql_cmd += " AND v.is_human = 1" - result = db(sql_cmd) - - for (city, country_code, country) 
in result: - if city in cities_dict: - cities_dict[city][0] += 1 - else: - if is_blacklisted(city, settings["city_ranking_blacklist"]): continue - if not is_whitelisted(city, settings["city_ranking_whitelist"]): continue - cities_dict[city] = [1, country_code, country] # count, country code - - if country in country_dict: - country_dict[country] += 1 - else: - if is_blacklisted(country, settings["country_ranking_blacklist"]): continue - if not is_whitelisted(country, settings["country_ranking_whitelist"]): continue - country_dict[country] = 1 # count, country code - - city_ranking = [(v[0], f"{city} ({v[1]})") for city,v in cities_dict.items()] - city_ranking.sort() - country_ranking = [(count, country) for country,count in country_dict.items()] - country_ranking.sort() - return city_ranking, country_ranking + ranking = [] + results = db(f"""SELECT co.code, ci.name,COUNT(v.visitor_id) + FROM country as co, city as ci, visitor as v, ip_range as i + WHERE ci.city_id = i.city_id + AND co.country_id = ci.country_id + AND i.ip_range_id = v.ip_range_id + AND EXISTS( + SELECT 1 + FROM request AS r + WHERE r.visitor_id = v.visitor_id + AND r.time BETWEEN {timestamps[0]} AND {timestamps[1]} + ) + {'AND v.is_human = 1' if only_human else ''} + GROUP BY ci.name + ORDER BY COUNT(v.visitor_id) + """) + for code, name, count in results: + if is_blacklisted(name, settings["rankings"]["city_blacklist"]): continue + if not is_whitelisted(name, settings["rankings"]["city_whitelist"]): continue + if add_country_code: + name = f"{name} ({code})" + ranking.append((count, name)) + # for (city_id, name) in db(f"SELECT city_id, name FROM city"): + # if is_blacklisted(name, settings["rankings"]["city_blacklist"]): continue + # if not is_whitelisted(name, settings["rankings"]["city_whitelist"]): continue + # db.execute(f"""SELECT COUNT(v.visitor_id) + # FROM visitor AS v, ip_range AS i + # WHERE i.city_id = {city_id} + # AND i.ip_range_id = v.ip_range_id + # AND EXISTS( + # SELECT 1 + # 
FROM request AS r + # WHERE r.visitor_id = v.visitor_id + # AND r.time BETWEEN {timestamps[0]} AND {timestamps[1]} + # ) + # {'AND v.is_human = 1' if only_human else ''}""") + # ranking.append((db.fetchone()[0], name)) + ranking.sort() + return ranking -def get_platform_browser_mobile_rankings(db: Database, visitor_ids: list[int]) -> tuple[list[tuple[int, str]], list[tuple[int, str]], float]: +def get_country_ranking(db: Database, timestamps: tuple[int, int], only_human=True): """ - returns [(count, operating_system)], [(count, browser)], mobile_visitor_percentage + @returns [(count, country)] """ - platform_ranking = {} - platform_count = 0.0 - browser_ranking = {} - browser_count = 0.0 - mobile_ranking = { True: 0.0, False: 0.0 } - for visitor_id in visitor_ids: - platform_id, browser_id, is_mobile = db(f"SELECT platform_id, browser_id, is_mobile FROM visitor WHERE visitor_id = {visitor_id}")[0] - is_mobile = bool(is_mobile) - if platform_id: - if platform_id in platform_ranking: platform_ranking[platform_id] += 1 - else: platform_ranking[platform_id] = 1 - platform_count += 1 - if browser_id: - if browser_id in browser_ranking: browser_ranking[browser_id] += 1 - else: browser_ranking[browser_id] = 1 - browser_count += 1 - if (platform_id or browser_id): - mobile_ranking[is_mobile] += 1 - try: - mobile_visitor_percentage = mobile_ranking[True] / (mobile_ranking[True] + mobile_ranking[False]) - except ZeroDivisionError: - mobile_visitor_percentage = 0.0 - - platform_ranking = [(c * 100/platform_count, db.get_name("platform", p_id)) for p_id, c in platform_ranking.items()] - platform_ranking.sort() - browser_ranking = [(c * 100/browser_count, db.get_name("browser", b_id)) for b_id, c in browser_ranking.items()] - browser_ranking.sort() - return platform_ranking, browser_ranking, mobile_visitor_percentage*100 + ranking = [] + # for (country_id, name) in db(f"SELECT country_id, name FROM country"): + # if is_blacklisted(name, 
settings["rankings"]["country_blacklist"]): continue + # if not is_whitelisted(name, settings["rankings"]["country_whitelist"]): continue + # db.execute(f"""SELECT COUNT(v.visitor_id) + # FROM visitor AS v, ip_range AS i, city AS ci + # WHERE ci.country_id = {country_id} + # AND ci.city_id = i.city_id + # AND i.ip_range_id = v.ip_range_id + # AND EXISTS( + # SELECT 1 + # FROM request AS r + # WHERE r.visitor_id = v.visitor_id + # AND r.time BETWEEN {timestamps[0]} AND {timestamps[1]} + # ) + # {'AND v.is_human = 1' if only_human else ''}""") + # ranking.append((db.fetchone()[0], name)) + results = db(f"""SELECT co.name,COUNT(v.visitor_id) + FROM country as co, city as ci, visitor as v, ip_range as i + WHERE co.country_id = ci.country_id + AND ci.city_id = i.city_id + AND i.ip_range_id = v.ip_range_id + AND EXISTS( + SELECT 1 + FROM request AS r + WHERE r.visitor_id = v.visitor_id + AND r.time BETWEEN {timestamps[0]} AND {timestamps[1]} + ) + {'AND v.is_human = 1' if only_human else ''} + GROUP BY co.name + ORDER BY COUNT(v.visitor_id) + """) + for name, count in results: + if is_blacklisted(name, settings["rankings"]["country_blacklist"]): continue + if not is_whitelisted(name, settings["rankings"]["country_whitelist"]): continue + ranking.append((count, name)) + ranking.sort() + return ranking -# Store ranking in results class and dump with pickle -# class Results: -# def __init__(self, timespan_name, -# r_routes: list[tuple[int, str]], -# r_referrers: list[tuple[int, str]], -# r_platforms: list[tuple[int, str]], -# r_browsers: list[tuple[int, str]], -# r_cities: list[tuple[int, str]], -# r_countries: list[tuple[int, str]], -# ): -# self.r_routes = r_routes -# self.r_referrers= r_referrers -# self.r_platforms= r_platforms -# self.r_browsers = r_browsers -# self.r_cities = r_cities -# self.r_countries= r_countries +def _get_platform_or_browser_ranking(db: Database, timestamps: tuple[int, int], table: str, only_human=False): + ranking = [] + for (table_id, name) in 
db(f"SELECT {table}_id, name FROM {table}"): + # if is_blacklisted(name, settings["rankings"][f"{table}_blacklist"]): continue + # if not is_whitelisted(name, settings["rankings"][f"{table}_whitelist"]): continue + if name == "None": continue + db.execute(f"""SELECT COUNT(v.visitor_id) + FROM visitor AS v, {table} AS t + WHERE v.{table}_id = {table_id} + AND EXISTS( + SELECT 1 + FROM request AS r + WHERE r.visitor_id = v.visitor_id + AND r.time BETWEEN {timestamps[0]} AND {timestamps[1]} + ) + {'AND v.is_human = 1' if only_human else ''}""") + ranking.append((db.fetchone()[0], name)) + ranking.sort() + return ranking + +def get_platform_ranking(db: Database, timestamps: tuple[int, int], only_human=False): + return _get_platform_or_browser_ranking(db, timestamps, "platform", only_human=only_human) + +def get_browser_ranking(db: Database, timestamps: tuple[int, int], only_human=False): + return _get_platform_or_browser_ranking(db, timestamps, "browser", only_human=only_human) + + +def make_ranking_relative(ranking: list[tuple[int, str]]) -> list[tuple[float, str]]: + total_count = sum([ c for c, _ in ranking ]) + if total_count == 0: + warning(f"make_ranking_relative: Can not make ranking relative, total_count is 0") + return [ (float(c), name) for c, name in ranking ] + rel_ranking = [ (100.0*c/total_count, name) for c, name in ranking ] + return rel_ranking + diff --git a/regina/data_visualization/utility.py b/regina/data_visualization/utility.py index efa78a3..59e084c 100644 --- a/regina/data_visualization/utility.py +++ b/regina/data_visualization/utility.py @@ -2,17 +2,21 @@ from re import fullmatch from regina.database import Database from regina.utility.globals import settings -from regina.utility.utility import pdebug, warning, missing_arg +from regina.utility.utility import pdebug, warning +from regina.utility.sql_util import sanitize, sql_tablesize # re_uri_protocol = f"(https?)://" re_uri_protocol = f"(https?://)?" 
-re_uri_ipv4 = r"(?:(?:(?:\d{1,3}\.?){4})(?::\d+)?)" +re_uri_ipv4 = r"(?:\d{1,3}\.?){4}" # re_uri_ipv6 = "" -re_uri_domain = r"(?:([^/]+\.)*[^/]+\.[a-zA-Z]{2,})" -re_uri_route = r"(?:/(.*))?" -re_uri_full = f"{re_uri_protocol}({re_uri_domain}|{re_uri_ipv4})({re_uri_route})" +re_uri_domain = r"(?:[^/:]+)" +re_uri_port = r"(?::\d+)?" +re_uri_route = r"(?:/.*)?" +re_uri_full = f"{re_uri_protocol}({re_uri_domain}|{re_uri_ipv4})({re_uri_port})({re_uri_route})" # (https?://)?((?:([^/]+\.)*[^/]+\.[a-zA-Z]{2,})|(?:(?:(?:\d{1,3}\.?){4})(?::\d+)?))((?:/(.*))?) +re_domain = r"[^/:]+\.[a-z]{2,}" + def cleanup_referer(referer: str) -> str: """ split the referer uri into its parts and reassemeble them depending on settings @@ -21,90 +25,40 @@ def cleanup_referer(referer: str) -> str: if not m: warning(f"cleanup_referer: Could not match referer '{referer}'") return referer - # pdebug(f"cleanup_referer: {referer} - {m.groups()}") - protocol = m.groups()[0] - subdomains = m.groups()[2] - if not subdomains: subdomains = "" - domain = m.groups()[1].replace(subdomains, "") - route = m.groups()[3] + pdebug(f"cleanup_referer: {referer} - {m.groups()}", lvl=4) + protocol, domain, port, route = m.groups() + if not protocol: protocol = "" + if not port: port = "" - referer = domain - if settings["referer_ranking_ignore_tld"]: - if len(domain.split(".")) == 2: # if domain.tld - referer = domain.split(".")[0] - if not settings["referer_ranking_ignore_subdomain"]: referer = subdomains + referer - if not settings["referer_ranking_ignore_protocol"]: referer = protocol + referer - if not settings["referer_ranking_ignore_route"]: referer += route + if fullmatch(re_domain, domain): # no ip address + parts = domain.split(".") + if len(parts) < 2: + warning(f"cleanup_referer: Domain has not enough parts: '{domain}'") + tld = parts[-1] + referer = parts[-2] + subdomains = "" + for sd in parts[:-2]: + subdomains += f"{sd}." + if not settings["rankings"]["referer_ignore_tld"]: referer += "." 
+ tld + if not settings["rankings"]["referer_ignore_subdomain"]: referer = subdomains + referer + else: + referer = domain + if not settings["rankings"]["referer_ignore_protocol"]: referer = protocol + referer + if not settings["rankings"]["referer_ignore_port"]: referer += port + if not settings["rankings"]["referer_ignore_route"]: referer += route # pdebug(f"cleanup_referer: cleaned up: {referer}") return referer - - -def get_where_date_str(at_date=None, min_date=None, max_date=None): - """ - get a condition string that sets a condition on the time - """ - # dates in unix time - s = "" - if at_date is not None: - if isinstance(at_date, str): - s += f"DATE(time, 'unixepoch') = '{sanitize(at_date)}' AND " - elif isinstance(at_date, int|float): - s += f"time = {int(at_date)} AND " - else: - print(f"WARNING: get_where_date_str: Invalid type of argument at_date: {type(at_date)}") - if min_date is not None: - if isinstance(min_date, str): - s += f"DATE(time, 'unixepoch') >= '{sanitize(min_date)}' AND " - elif isinstance(min_date, int|float): - s += f"time >= {int(min_date)} AND " - else: - print(f"WARNING: get_where_date_str: Invalid type of argument min_date: {type(min_date)}") - if max_date is not None: - if isinstance(max_date, str): - s += f"DATE(time, 'unixepoch') <= '{sanitize(max_date)}' AND " - elif isinstance(max_date, int|float): - s += f"time <= {int(max_date)} AND " - else: - print(f"WARNING: get_where_date_str: Invalid type of argument max_date: {type(max_date)}") - if s == "": - print(f"WARNING: get_where_date_str: no date_str generated. Returning 'time > 0'. 
at_date={at_date}, min_date={min_date}, max_date={max_date}") - return "time > 0" - return s.removesuffix(" AND ") - def is_valid_status(status: int): if status >= 400: return False - if settings["status_300_is_success"] and status >= 300: return True + if settings["data-collection"]["status_300_is_success"] and status >= 300: return True return status < 300 -# -# GETTERS -# -def get_unique_visitor_ids_for_date(db: Database, date:str) -> list[int]: - return [ visitor_id[0] for visitor_id in db(f"SELECT DISTINCT visitor_id FROM request WHERE {date}") ] -def append_human_visitors(db: Database, unique_visitor_ids, unique_visitor_ids_human: list): - """ - for visitor in unique_visitor_ids: - if human -> append to unique_visitor_ids_human - """ - for visitor_id in unique_visitor_ids: - db.execute(f"SELECT is_human FROM visitor WHERE visitor_id = {visitor_id}") - if db.fetchone()[0] == 1: - unique_visitor_ids_human.append(visitor_id) +def len_list_list(l: list[list]): + size = 0 + for i in range(len(l)): + size += len(l[i]) + return size -def get_unique_request_ids_for_date(db: Database, date_constraint:str): - return [ request_id[0] for request_id in db(f"SELECT DISTINCT request_id FROM request WHERE {date_constraint}")] -def append_unique_request_ids_for_date_and_visitor(db: Database, date_constraint:str, visitor_id: int, unique_request_ids_human: list): - """append all unique requests for visitor_id at date_constraint to unique_request_ids_human""" - for request_id in db(f"SELECT DISTINCT request_id FROM request WHERE {date_constraint} AND visitor_id = {visitor_id}"): - unique_request_ids_human.append(request_id[0]) - -# get number of requests per day -def get_request_count_for_date(db: Database, date_constraint:str) -> int: - db.execute(f"SELECT COUNT(*) FROM request WHERE {date_constraint}") - return db.fetchone()[0] - -def get_unique_visitor_count(db: Database) -> int: - return sql_tablesize(db.cur, "visitor") diff --git a/regina/data_visualization/visualize.py 
b/regina/data_visualization/visualize.py index 96e295f..e0ad08d 100644 --- a/regina/data_visualization/visualize.py +++ b/regina/data_visualization/visualize.py @@ -1,20 +1,18 @@ # from sys import path # print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}") -import sqlite3 as sql -from sys import exit -from re import fullmatch import matplotlib.pyplot as plt -from os.path import isdir +from pickle import dump +from os import path, makedirs from datetime import datetime as dt -from numpy import empty # local from regina.database import Database -from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where -from regina.utility.utility import pdebug, warning, missing_arg +from regina.utility.sql_util import get_date_constraint, sanitize +from regina.utility.utility import pdebug, warning, error, make_parent_dirs, dict_str from regina.utility.globals import settings -from regina.data_visualization.utility import cleanup_referer, get_where_date_str, get_unique_visitor_ids_for_date, get_unique_request_ids_for_date, append_human_visitors, append_unique_request_ids_for_date_and_visitor -from regina.data_visualization.ranking import get_city_and_country_ranking, get_platform_browser_mobile_rankings, get_ranking, cleanup_referer_ranking, get_route_ranking +from regina.data_visualization.utility import len_list_list +from regina.data_visualization.ranking import get_referer_ranking, cleanup_referer_ranking, get_route_ranking, route_ranking_group_routes, get_browser_ranking, get_platform_ranking, get_city_ranking, get_country_ranking, make_ranking_relative +import regina.data_visualization.history as h """ visualize information from the databse @@ -53,12 +51,14 @@ color_settings_platforms = { palette["blue"]: ["Windows"], } - -def len_list_list(l: list[list]): - size = 0 - for i in range(len(l)): - size += len(l[i]) - return size +color_settings_history = { + "visitors": "#000050", + 
"visitors_human": "#3366ff", + "visitors_new": "#66ccff", + "requests": "#770000", + "requests_human": "#ff3500", + "requests_new": "#ff9999", +} # @@ -86,18 +86,18 @@ def add_labels_at_top_of_bar(xdata, ydata, max_y_val, ax, bar_plot): for idx,rect in enumerate(bar_plot): ax.text(rect.get_x() + rect.get_width()/2, ydata[idx] - y_offset, round(ydata[idx], 1), ha='center', bbox=dict(facecolor='white', alpha=0.8)) -def plot_ranking(ranking: list[tuple[int, str]], fig=None, xlabel="", ylabel="", color_settings:dict|list=[], figsize=None): +def plot_ranking(ranking: list[tuple[int or float, str]], fig=None, xlabel="", ylabel="", color_settings:dict|list=[], figsize=None): """ make a bar plot of the ranking """ # pdebug(f"plot_ranking: ranking={ranking}") if not fig: - fig = plt.figure(figsize=figsize, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None) + fig = plt.figure(figsize=figsize, dpi=settings["plot-generation"]["dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None) # create new axis if none is given ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel) # fill x y data - if len(ranking) > settings["file_ranking_plot_max_files"]: - start_index = len(ranking) - settings["file_ranking_plot_max_files"] + if len(ranking) > settings["rankings"]["route_plot_max_routes"]: + start_index = len(ranking) - settings["rankings"]["route_plot_max_routes"] else: start_index = 0 x_names = [] y_counts = [] @@ -120,14 +120,14 @@ def plot_ranking(ranking: list[tuple[int, str]], fig=None, xlabel="", ylabel="", if len(y_counts) > 0: add_vertikal_labels_in_bar_plot(x_names, y_counts[-1], ax, bar) - if settings["plot_add_count_label"]: add_labels_at_top_of_bar(x_names, y_counts, y_counts[-1], ax, bar) + if settings["plot-generation"]["add_count_label"]: add_labels_at_top_of_bar(x_names, y_counts, y_counts[-1], ax, bar) # ax.ylabel(y_counts) return fig # def plot(xdata, ydata, fig=None, ax=None, xlabel="", ylabel="", label="", linestyle='-', 
marker="", color="blue", rotate_xlabel=0): # if not fig: -# fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None) +# fig = plt.figure(figsize=None, dpi=settings["plot-generation"]["dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None) # if not ax: # ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel) # else: @@ -139,29 +139,39 @@ def plot_ranking(ranking: list[tuple[int, str]], fig=None, xlabel="", ylabel="", # if label: ax.legend() # return fig, ax -def plot2y(xdata, ydata1, ydata2, fig=None, ax1=None, ax2=None, plots=None, xlabel="", ylabel1="", ylabel2="", label1="", label2="", linestyle='-', marker="", color1="blue", color2="orange", grid="major", rotate_xlabel=0, figsize=None): - if not fig: - fig = plt.figure(figsize=figsize, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None) - if not (ax1 and ax2): - ax1 = fig.add_subplot(xlabel=xlabel, ylabel=ylabel1) - ax2 = ax1.twinx() - ax2.set_ylabel(ylabel2) - ax1.tick_params(axis="x", rotation=90) - plot1 = ax1.plot(xdata, ydata1, marker=marker, label=label1, linestyle=linestyle, color=color1) - plot2 = ax2.plot(xdata, ydata2, marker=marker, label=label2, linestyle=linestyle, color=color2) - # ax1.set_xticks(ax1.get_xticks()) - # ax1.set_xticklabels(xdata, rotation=rotate_xlabel, rotation_mode="anchor") - # if label1 or label2: ax1.legend() - if plots: plots += plot1 + plot2 - else: plots = plot1 + plot2 - plt.legend(plots, [ l.get_label() for l in plots]) - if grid == "major" or grid == "minor" or grid == "both": - if grid == "minor" or "both": - ax1.minorticks_on() - ax1.grid(visible=True, which=grid, linestyle="-", color="#888") +class Plot2Y: + def __init__(self, xlabel, ylabel_left, ylabel_right, grid="major", rotate_xlabel=0, figsize=None): + self.fig, self.ax1 = plt.subplots(figsize=figsize, dpi=settings["plot-generation"]["dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None) + 
self.ax1.set_xlabel(xlabel=xlabel) #, ylabel=ylabel_left) + self.ax1.set_ylabel(ylabel=ylabel_left) #, ylabel=ylabel_left) + self.ax2 = self.ax1.twinx() + self.ax2.set_ylabel(ylabel_right) + self.ax1.tick_params(axis="x", rotation=90) + self.plots = None + if grid == "major" or grid == "minor" or grid == "both": + if grid == "minor" or "both": + self.ax1.minorticks_on() + self.ax1.grid(visible=True, which=grid, linestyle="-", color="#888") - return fig, ax1, ax2, plots + def _plot(self, ax, xdata, ydata, label="", linestyle="-", marker="", color="blue"): + plot = ax.plot(xdata, ydata, marker=marker, label=label, linestyle=linestyle, color=color) + # ax1.set_xticks(ax1.get_xticks()) + # ax1.set_xticklabels(xdata, rotation=rotate_xlabel, rotation_mode="anchor") + # if label1 or label2: ax1.legend() + if self.plots: self.plots += plot + else: self.plots = plot + plt.legend(self.plots, [ l.get_label() for l in self.plots ]) + + + def plot_left(self, xdata, ydata, label="", linestyle="-", marker="", color="blue"): + self._plot(self.ax1, xdata, ydata, label, linestyle, marker, color) + + def plot_right(self, xdata, ydata, label="", linestyle="-", marker="", color="blue"): + self._plot(self.ax2, xdata, ydata, label, linestyle, marker, color) + + def get_fig(self): + return self.fig # @@ -172,194 +182,259 @@ def visualize(db: Database): This assumes sanity checks have been done """ pdebug("visualizing...") - if not settings["db"]: missing_arg("db") - if not settings["server_name"]: missing_arg("server_name") - img_dir = settings["img_dir"] - pdebug("img_dir:", img_dir) - img_filetype = settings["img_filetype"] - if isdir(img_dir) and img_filetype: - gen_img = True - else: - print(f"Warning: Not generating images since at least one required variable is invalid: img_dir='{img_dir}', img_filetype='{img_filetype}'") - gen_img = False + def make_dir_if_not_None(d): + if d: + if not path.isdir(d): + makedirs(d) - img_location = settings["img_location"] - names = { - # paths - 
"img_route_ranking_last_x_days": f"ranking_routes_last_x_days.{img_filetype}", - "img_referer_ranking_last_x_days": f"ranking_referers_last_x_days.{img_filetype}", - "img_countries_last_x_days": f"ranking_countries_last_x_days.{img_filetype}", - "img_cities_last_x_days": f"ranking_cities_last_x_days.{img_filetype}", - "img_browser_ranking_last_x_days": f"ranking_browsers_last_x_days.{img_filetype}", - "img_platform_ranking_last_x_days": f"ranking_platforms_last_x_days.{img_filetype}", - "img_visitors_and_requests_last_x_days": f"visitor_request_count_daily_last_x_days.{img_filetype}", + # plot generation + img_out_dir = settings["plot-generation"]["img_out_dir"] + make_dir_if_not_None(img_out_dir) + img_filetype = settings["plot-generation"]["filetype"] + img_location = settings["html-generation"]["img_location"] + pdebug(f"visualize: img_out_dir='{img_out_dir}', filetype='{img_filetype}', {img_location}='{img_location}'", lvl=2) + if not img_out_dir: + pdebug(f"visualize: Not generating images since img_out_dir is None", lvl=1) - "img_route_ranking_total": f"ranking_routes_total.{img_filetype}", - "img_referer_ranking_total": f"ranking_referers_total.{img_filetype}", - "img_countries_total": f"ranking_countries_total.{img_filetype}", - "img_cities_total": f"ranking_cities_total.{img_filetype}", - "img_browser_ranking_total": f"ranking_browsers_total.{img_filetype}", - "img_platform_ranking_total": f"ranking_platforms_total.{img_filetype}", - "img_visitors_and_requests_total": f"visitor_request_count_daily_total.{img_filetype}", + # data export + data_out_dir = settings["data-export"]["data_out_dir"] + make_dir_if_not_None(data_out_dir) + data_filetype = settings["data-export"]["filetype"] + pdebug(f"visualize: data_out_dir='{data_out_dir}', filetype='{data_filetype}'", lvl=2) + if not data_out_dir: + pdebug(f"visualize: Not exporting data since data_out_dir is None", lvl=1) + + if not data_out_dir and not img_out_dir: + warning(f"data_out_dir and img_out_dir are 
both None. No data will be exported and no plots will be generated!") + + html_variables = { # values - "mobile_visitor_percentage_total": 0.0, - "mobile_visitor_percentage_last_x_days": 0.0, - "visitor_count_last_x_days": 0, - "visitor_count_total": 0, - "request_count_last_x_days": 0, - "request_count_total": 0, - "human_visitor_percentage_last_x_days": 0.0, - "human_visitor_percentage_total": 0.0, - "human_request_percentage_last_x_days": 0.0, - "human_request_percentage_total": 0.0, + "visitor_count_last_x_days": "NaN", + "visitor_count_total": "NaN", + "request_count_last_x_days": "NaN", + "request_count_total": "NaN", + "visitor_count_human_last_x_days": "NaN", + "visitor_count_human_total": "NaN", + "request_count_human_last_x_days": "NaN", + "request_count_human_total": "NaN", + "human_visitor_percentage_last_x_days": "NaN", + "human_visitor_percentage_total": "NaN", + "human_request_percentage_last_x_days": "NaN", + "human_request_percentage_total": "NaN", + "mobile_visitor_percentage_total": "NaN", + "mobile_visitor_percentage_last_x_days": "NaN", # general - "regina_version": settings["version"], - "server_name": settings["server_name"], - "last_x_days": settings["last_x_days"], # must be after all the things with last_x_days! 
- "earliest_date": "1990-1-1", - "generation_date": "1990-1-1 0:0:0", + "regina_version": settings["regina"]["version"], + "server_name": settings["regina"]["server_name"], + "last_x_days": settings["data-visualization"]["last_x_days"], + "earliest_date": "1990-1-1", + "generation_date": "1990-1-1 0:0:0", } - db = Database(database_path=settings["db"]) + for suffix in ["last_x_days", "total"]: + # add all plot paths as variables: img_plot_suffix -> plot_suffix.filetype + # not adding img_location or img_out_dir since these names are needed for both + html_variables.update((f"img_{plot_}_{suffix}", f"{plot_}_{suffix}.{img_filetype}") for plot_ in ["ranking_platform", "ranking_browser", "ranking_country", "ranking_city", "ranking_referer", "ranking_route", "history_visitor_request"]) + + get_humans_visitors = settings["data-visualization"]["history_track_human_visitors"] + get_new_visitors = settings["data-visualization"]["history_track_new_visitors"] - get_humans = settings["get_human_percentage"] - # pdebug(f"visualize: settings {settings}") # DATE STRINGS - earliest_date = db.get_earliest_date() - names["earliest_date"] = dt.fromtimestamp(earliest_date).strftime("%Y-%m-%d") - names["generation_date"] = dt.now().strftime("%Y-%m-%d %H:%M:%S") - # LAST_X_DAYS - # last_x_days_min_date: latest_date - last_x_days - secs_per_day = 86400 - last_x_days_min_date = db.get_latest_date() - settings["last_x_days"] * secs_per_day - last_x_days_constraint = get_where_date_str(min_date=last_x_days_min_date) - last_x_days = db.get_days_where(last_x_days_constraint) - last_x_days_contraints = [get_where_date_str(at_date=day) for day in last_x_days] + earliest_timestamp = db.get_earliest_timestamp() + html_variables["earliest_date"] = dt.fromtimestamp(earliest_timestamp).strftime("%Y-%m-%d") + html_variables["generation_date"] = dt.now().strftime("%Y-%m-%d %H:%M:%S") - # ALL DATES - all_time_constraint = get_where_date_str(min_date=0) - # all months in yyyy-mm format - 
months_all_time = db.get_months_where(all_time_constraint) - # sqlite constrict to month string - months_strs = [] - for year_month in months_all_time: - year, month = year_month.split("-") - # first day of the month - min_date = dt(int(year), int(month), 1).timestamp() - month = (int(month) % 12) + 1 # + 1 month - year = int(year) - if month == 1: year += 1 - # first day of the next month - 1 sec - max_date = dt(year, month, 1).timestamp() - 1 - months_strs.append(get_where_date_str(min_date=min_date, max_date=max_date)) + todos: list[tuple[str, tuple[int, int], list[str], list[str], list[tuple[int, int]]]] = [] # suffix, whole_time_timestamps, history_date_constraints, history_date_names, history_date_timestamps - for i in range(2): - suffix = ["_total", "_last_x_days"][i] - date_constraint = [all_time_constraint, last_x_days_constraint][i] - date_names = [months_all_time, last_x_days][i] - date_constraints = [months_strs, last_x_days_contraints][i] - assert(len(date_names) == len(date_constraints)) + now_stamp = int(dt.now().timestamp()) + total: bool = settings["data-visualization"]["total"] + if total: + all_time_timestamps = (0, now_stamp) + # all months in yyyy-mm format + month_names = db.get_months_where(get_date_constraint(min_date=0)) + month_timestamps = [] + # sqlite constrict to month string + month_constraints = [] + for year_month in month_names: + year, month = year_month.split("-") + # timestamp of first day of the month + min_date = int(dt(int(year), int(month), 1).timestamp()) + month = (int(month) % 12) + 1 # + 1 month + year = int(year) + # first day of the next month - 1 sec + if month == 1: year += 1 + max_date = int(dt(year, month, 1).timestamp()) - 1 + month_constraints.append(get_date_constraint(min_date=min_date, max_date=max_date)) + month_timestamps.append((min_date, max_date)) + todos.append(("total", all_time_timestamps, month_constraints, month_names, month_timestamps)) - # FILES + last_x_days: int = 
settings["data-visualization"]["last_x_days"] + if last_x_days > 0: + secs_per_day = 86400 + last_x_days_min_date = db.get_latest_timestamp() - last_x_days * secs_per_day + last_x_days_timestamps = (last_x_days_min_date, now_stamp) + last_x_days_constraint = get_date_constraint(min_date=last_x_days_min_date) + days = db.get_days_where(last_x_days_constraint) # yyyy-mm-dd + day_constrains = [ get_date_constraint(at_date=day) for day in days ] + day_timestamps = [] + for day in days: + year, month, day = day.split("-") + min_date = int(dt(int(year), int(month), int(day)).timestamp()) + max_date = min_date + secs_per_day + day_timestamps.append((min_date, max_date)) + + todos.append(("last_x_days", last_x_days_timestamps, day_constrains, days, day_timestamps)) + + def export_ranking(name: str, column_name: str, ranking: list[tuple[int or float, str]]): + filename = f"{data_out_dir}/{name}.{data_filetype}" + if data_filetype == "pkl": + pdebug(f"visualize: Exporting {name} as pickle to '{filename}'", lvl=2) + with open(filename, "wb") as file: + dump(ranking, file) + elif data_filetype == "csv": + pdebug(f"visualize: Exporting {name} as csv to '{filename}'", lvl=2) + s = f'"{name}"\n' + s += f'"count","{column_name}"\n' + for count, item in ranking: + s += f'{count},"{item}"\n' + s = s.strip("\n") + with open(filename, "w") as file: + file.write(s) + else: + error(f"visualize: Unsupported data filetype: '{data_filetype}'") + + def savefig(name: str, figure): + filename = f"{img_out_dir}/{name}.{img_filetype}" + pdebug(f"visualize: Saving plot for {name} as '{filename}'") + figure.savefig(filename, bbox_inches="tight") # bboximg_inches="tight" + + + pdebug(f"visualize: total={total}, last_x_days={last_x_days}", lvl=3) + for suffix, whole_timespan_timestamps, single_date_constraints, single_date_names, single_date_timestamps in todos: + assert(len(single_date_names) == len(single_date_constraints)) + + # STATISTICS + visitor_count = h.get_visitor_count_between(db, 
whole_timespan_timestamps) + request_count = h.get_request_count_between(db, whole_timespan_timestamps) + html_variables[f"visitor_count_{suffix}"] = visitor_count + html_variables[f"request_count_{suffix}"] = request_count + + if get_humans_visitors: + visitor_count_human = h.get_visitor_count_between(db, whole_timespan_timestamps, only_human=True) + request_count_human = h.get_request_count_between(db, whole_timespan_timestamps, only_human=True) + html_variables[f"visitor_count_human_{suffix}"] = visitor_count_human + html_variables[f"request_count_human_{suffix}"] = request_count_human + try: html_variables[f"human_visitor_percentage_{suffix}"] = 100.0 * visitor_count_human / visitor_count + except ZeroDivisionError: pass + try: html_variables[f"human_request_percentage_{suffix}"] = 100.0 * request_count_human / request_count + except ZeroDivisionError: pass + try: html_variables[f"mobile_visitor_percentage_{suffix}"] = 100.0 * h.get_mobile_visitor_count_between(db, whole_timespan_timestamps, only_human=True) / visitor_count_human + except ZeroDivisionError: pass + + # HISTORY + date_count = len(single_date_constraints) + visitor_count_dates = [ h.get_visitor_count_between(db, single_date_timestamps[i], only_human=False) for i in range(date_count) ] + request_count_dates = [ h.get_request_count_between(db, single_date_timestamps[i], only_human=False) for i in range(date_count) ] + + visitor_count_human_dates = [ h.get_visitor_count_between(db, single_date_timestamps[i], only_human=True) for i in range(date_count) ] + request_count_human_dates = [ h.get_request_count_between(db, single_date_timestamps[i], only_human=True) for i in range(date_count) ] + + visitor_count_new_dates = [ h.get_new_visitor_count_between(db, single_date_timestamps[i]) for i in range(date_count) ] + request_count_new_dates = [ h.get_request_from_new_visitor_count_between(db, single_date_timestamps[i]) for i in range(date_count) ] + + if img_out_dir: + plt_history = Plot2Y(xlabel="Date", 
ylabel_left="Visitor count", ylabel_right="Request count", rotate_xlabel=-45, figsize=settings["plot-generation"]["size_broad"]) + # visitors, plot on correct order + plt_history.plot_left(single_date_names, visitor_count_dates, label="Unique visitors", color=color_settings_history["visitors"]) + if get_humans_visitors: + plt_history.plot_left(single_date_names, visitor_count_human_dates, label="Unique visitors (human)", color=color_settings_history["visitors_human"]) + if get_new_visitors: + plt_history.plot_left(single_date_names, visitor_count_new_dates, label="Unique visitors (new)", color=color_settings_history["visitors_new"]) + # requests + plt_history.plot_right(single_date_names, request_count_dates, label="Unique requests", color=color_settings_history["requests"]) + if get_humans_visitors: + plt_history.plot_left(single_date_names, request_count_human_dates, label="Unique requests (human)", color=color_settings_history["requests_human"]) + if get_new_visitors: + plt_history.plot_left(single_date_names, request_count_new_dates, label="Unique requests (new)", color=color_settings_history["requests_new"]) + + savefig(f"history_visitor_request_{suffix}", plt_history.get_fig()) + # if data_out_dir: # TODO export history + # s = "" + + # ROUTES # TODO handle groups - file_ranking = get_route_ranking(db, date_constraint) - if gen_img: - fig_file_ranking = plot_ranking(file_ranking, xlabel="Route Name", ylabel="Number of requests", color_settings=color_settings_filetypes, figsize=settings["plot_size_broad"]) - fig_file_ranking.savefig(f"{img_dir}/{names[f'img_route_ranking{suffix}']}", bbox_inches="tight") + route_ranking = get_route_ranking(db, whole_timespan_timestamps) + route_ranking = route_ranking_group_routes(route_ranking) + pdebug("visualize: route ranking", route_ranking, lvl=3) + if img_out_dir: + fig_file_ranking = plot_ranking(route_ranking, xlabel="Route", ylabel="Number of requests", color_settings=color_settings_filetypes, 
figsize=settings["plot-generation"]["size_broad"]) + savefig(f"ranking_route_{suffix}", fig_file_ranking) + if data_out_dir: + export_ranking(f"ranking_route_{suffix}", "route", route_ranking) + # REFERER - referer_ranking = get_ranking(db, "request", "referer", date_constraint, settings["referer_ranking_whitelist"], settings["referer_ranking_whitelist"]) - pdebug("Referer ranking", referer_ranking) + referer_ranking = get_referer_ranking(db, whole_timespan_timestamps) cleanup_referer_ranking(referer_ranking) - if gen_img: - fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate, figsize=settings["plot_size_broad"]) - fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_referer_ranking{suffix}']}", bbox_inches="tight") + pdebug("visualize: referer ranking", referer_ranking, lvl=3) + if img_out_dir: + fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate, figsize=settings["plot-generation"]["size_broad"]) + savefig(f"ranking_referer_{suffix}", fig_referer_ranking) + if data_out_dir: + export_ranking(f"ranking_referer_{suffix}", "referer", referer_ranking) # GEOIP - if settings["do_geoip_rankings"]: - city_ranking, country_ranking = get_city_and_country_ranking(db, require_humans=settings["geoip_only_humans"]) - pdebug("Country ranking:", country_ranking) - pdebug("City ranking:", city_ranking) - if gen_img: - fig_referer_ranking = plot_ranking(country_ranking, xlabel="Country", ylabel="Number of visitors", color_settings=color_settings_alternate, figsize=settings["plot_size_broad"]) - fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_countries{suffix}']}", bbox_inches="tight") - - fig_referer_ranking = plot_ranking(city_ranking, xlabel="City", ylabel="Number of visitors", color_settings=color_settings_alternate, figsize=settings["plot_size_broad"]) - 
fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_cities{suffix}']}", bbox_inches="tight") - - - # USER - # visitor_agent_ranking = get_visitor_agent_ranking(cur, date_str) - # for the time span - unique_visitor_ids = get_unique_visitor_ids_for_date(db, date_constraint) - unique_visitor_ids_human = [] - append_human_visitors(db, unique_visitor_ids, unique_visitor_ids_human) - # for each date - date_count = len(date_constraints) - unique_visitor_ids_dates: list[list[int]] = [] - unique_request_ids_dates: list[list[int]] = [] - unique_visitor_ids_human_dates: list[list[int]] = [[] for _ in range(date_count)] - unique_request_ids_human_dates: list[list[int]] = [[] for _ in range(date_count)] - for i in range(date_count): - date_constraint_ = date_constraints[i] - unique_visitor_ids_dates.append(get_unique_visitor_ids_for_date(db, date_constraint_)) - unique_request_ids_dates.append(get_unique_request_ids_for_date(db, date_constraint_)) - if get_humans: - # empty_list = [] - # unique_visitor_ids_human_dates.append(empty_list) - append_human_visitors(db, unique_visitor_ids_dates[i], unique_visitor_ids_human_dates[i]) - # unique_request_ids_human_dates.append(list()) - for human in unique_visitor_ids_human_dates[i]: - append_unique_request_ids_for_date_and_visitor(db, date_constraint_, human, unique_request_ids_human_dates[i]) - # print("\n\tuu", unique_visitor_ids_dates, "\n\tur",unique_request_ids_dates, "\n\tuuh", unique_visitor_ids_human_dates, "\n\turh", unique_request_ids_human_dates) - # pdebug("uui", unique_visitor_ids) - # pdebug("uuih", unique_visitor_ids_human) - # pdebug("uuid", unique_visitor_ids_dates) - # pdebug("uuidh", unique_visitor_ids_human_dates) - # pdebug("urid", unique_request_ids_dates) - # pdebug("uridh", unique_visitor_ids_human_dates) - # pdebug(f"human_visitor_precentage: len_list_list(visitor_ids)={len_list_list(unique_visitor_ids_dates)}, len_list_list(visitor_ids_human)={len_list_list(unique_visitor_ids_human_dates)}") - if get_humans: 
- try: - names[f"human_visitor_percentage{suffix}"] = round(100 * len_list_list(unique_visitor_ids_human_dates) / len_list_list(unique_visitor_ids_dates), 2) - except: - names[f"human_visitor_percentage{suffix}"] = -1.0 - try: - names[f"human_request_percentage{suffix}"] = round(100 * len_list_list(unique_request_ids_human_dates) / len_list_list(unique_request_ids_dates), 2) - except: - names[f"human_request_percentage{suffix}"] = -1.0 - names[f"visitor_count{suffix}"] = len_list_list(unique_visitor_ids_dates) - names[f"request_count{suffix}"] = len_list_list(unique_request_ids_dates) - if gen_img: - fig_daily, ax1, ax2, plots = plot2y(date_names, [len(visitor_ids) for visitor_ids in unique_visitor_ids_dates], [len(request_ids) for request_ids in unique_request_ids_dates], xlabel="Date", ylabel1="Visitor count", label1="Unique visitors", ylabel2="Request count", label2="Unique requests", color1=palette["red"], color2=palette["blue"], rotate_xlabel=-45, figsize=settings["plot_size_broad"]) - if get_humans: - fig_daily, ax1, ax2, plots = plot2y(date_names, [len(visitor_ids) for visitor_ids in unique_visitor_ids_human_dates], [len(request_ids) for request_ids in unique_request_ids_human_dates], label1="Unique visitors (human)", label2="Unique requests (human)", color1=palette["orange"], color2=palette["green"], fig=fig_daily, ax1=ax1, ax2=ax2, plots=plots, rotate_xlabel=-45, figsize=settings["plot_size_broad"]) - fig_daily.savefig(f"{img_dir}/{names[f'img_visitors_and_requests{suffix}']}", bbox_inches="tight") + if settings["data-collection"]["get_visitor_location"]: + country_ranking = get_country_ranking(db, whole_timespan_timestamps, only_human=settings["rankings"]["geoip_only_humans"]) + pdebug("visualize: country ranking:", country_ranking, lvl=3) + city_ranking = get_city_ranking(db, whole_timespan_timestamps, add_country_code=settings["rankings"]["city_add_country_code"], only_human=settings["rankings"]["geoip_only_humans"]) + pdebug("visualize: city ranking:", 
city_ranking, lvl=3) + if img_out_dir: + fig_referer_ranking = plot_ranking(country_ranking, xlabel="Country", ylabel="Number of visitors", color_settings=color_settings_alternate, figsize=settings["plot-generation"]["size_broad"]) + savefig(f"ranking_country_{suffix}", fig_referer_ranking) + fig_referer_ranking = plot_ranking(city_ranking, xlabel="City", ylabel="Number of visitors", color_settings=color_settings_alternate, figsize=settings["plot-generation"]["size_broad"]) + savefig(f"ranking_city_{suffix}", fig_referer_ranking) + if data_out_dir: + export_ranking(f"ranking_country_{suffix}", "country", country_ranking) + export_ranking(f"ranking_city_{suffix}", "city", city_ranking) # os & browser - platform_ranking, browser_ranking, names[f"mobile_visitor_percentage{suffix}"] = get_platform_browser_mobile_rankings(db, unique_visitor_ids_human) - if gen_img: - fig_os_rating = plot_ranking(platform_ranking, xlabel="Platform", ylabel="Share [%]", color_settings=color_settings_platforms, figsize=settings["plot_size_narrow"]) - fig_os_rating.savefig(f"{img_dir}/{names[f'img_platform_ranking{suffix}']}", bbox_inches="tight") - fig_browser_rating = plot_ranking(browser_ranking, xlabel="Browser", ylabel="Share [%]", color_settings=color_settings_browsers, figsize=settings["plot_size_narrow"]) - fig_browser_rating.savefig(f"{img_dir}/{names[f'img_browser_ranking{suffix}']}", bbox_inches="tight") + browser_ranking = get_browser_ranking(db, whole_timespan_timestamps, only_human=False) + browser_ranking = make_ranking_relative(browser_ranking) + pdebug("visualize: browser ranking:", browser_ranking, lvl=3) + platform_ranking = get_platform_ranking(db, whole_timespan_timestamps, only_human=False) + platform_ranking = make_ranking_relative(platform_ranking) + pdebug("visualize: platform ranking:", platform_ranking, lvl=3) + if img_out_dir: + fig_os_rating = plot_ranking(platform_ranking, xlabel="Platform", ylabel="Share [%]", color_settings=color_settings_platforms, 
figsize=settings["plot-generation"]["size_narrow"]) + savefig(f"ranking_platform_{suffix}", fig_os_rating) + fig_browser_rating = plot_ranking(browser_ranking, xlabel="Browser", ylabel="Share [%]", color_settings=color_settings_browsers, figsize=settings["plot-generation"]["size_narrow"]) + savefig(f"ranking_browser_{suffix}", fig_browser_rating) + if data_out_dir: + export_ranking(f"ranking_platform_{suffix}", "platform", platform_ranking) + export_ranking(f"ranking_browser_{suffix}", "browser", browser_ranking) - # print("OS ranking", os_ranking) - # print("Browser ranking", browser_ranking) - # print("Mobile percentage", names["mobile_visitor_percentage"]) - if settings["template_html"] and settings["html_out_path"]: - pdebug(f"visualize: writing to html: {settings['html_out_path']}") - with open(settings["template_html"], "r") as file: + html_variables_str = dict_str(html_variables).replace('\n', '\n\t') + pdebug(f"visualize: html_variables:\n\t{html_variables_str}", lvl=2) + + template_html: str|None = settings["html-generation"]["template_html"] + html_out_path: str|None = settings["html-generation"]["html_out_path"] + if template_html and html_out_path: + pdebug(f"visualize: generating from template '{template_html}' to '{html_out_path}'", lvl=2) + if not path.isfile(template_html): + error(f"Invalid template file path: '{template_html}'") + with open(template_html, "r") as file: html = file.read() - for name, value in names.items(): + for name, value in html_variables.items(): if "img" in name: value = f"{img_location}/{value}" - if type(value) == float: + elif type(value) == float: value = f"{value:.2f}" html = html.replace(f"%{name}", str(value)) - with open(settings["html_out_path"], "w") as file: + make_parent_dirs(html_out_path) + with open(html_out_path, "w") as file: file.write(html) else: - warning(f"Skipping html generation because either template_html or html_out_path is invalid: template_html='{settings['template_html']}', 
html_out_path='{settings['html_out_path']}'") + pdebug(f"visualize: skipping html generation because either template_html or html_out_path is None: template_html='{template_html}', html_out_path='{html_out_path}'", lvl=1) diff --git a/template.html b/template.html index 3ac6b12..c232365 100644 --- a/template.html +++ b/template.html @@ -48,43 +48,44 @@

Analytics for %server_name

-
+

Last %last_x_days days


Visitor and request count (per day)

- Daily Statistics + Daily Statistics
    -
  • visitor count: %visitor_count_last_x_days, from which %human_visitor_percentage_last_x_days% are human
  • -
  • request count: %request_count_last_x_days, from which %human_request_percentage_last_x_days% came from human visitors
  • +
  • visitor count: %visitor_count_last_x_days, of which %visitor_count_human_last_x_days (%human_visitor_percentage_last_x_days%) are human
  • +
  • request count: %request_count_last_x_days, of which %request_count_human_last_x_days (%human_request_percentage_last_x_days%) came from human visitors

File access

- File ranking for the last %last_x_days days + File ranking for the last %last_x_days days

Platforms and browsers

- Operating system ranking for the last %last_x_days days - Browser ranking for the last %last_x_days days + Operating system ranking for the last %last_x_days days + Browser ranking for the last %last_x_days days

Mobile visitors: %mobile_visitor_percentage_last_x_days%


Referrers

- Referer ranking for the last %last_x_days days + Referer ranking for the last %last_x_days days
- - - - -
+

GeoIP

+ Country ranking for the last %last_x_days days + City ranking for the last %last_x_days days +
+ +
-
+

Total (since %earliest_date)


Visitor and request count (per month)

- Monthly Statistics + Monthly Statistics
  • Total visitor count: %visitor_count_total, of which %human_visitor_percentage_total% are human
  • Total request count: %request_count_total, of which %human_request_percentage_total% came from human visitors
  • @@ -92,24 +93,24 @@

    File access

    - File ranking + File ranking

    Platforms and browsers

    - Operating system ranking - Browser ranking + Operating system ranking + Browser ranking

    Mobile visitors: %mobile_visitor_percentage_total%


    Referrers

    - Referer ranking + Referer ranking
    - - - - -
+

GeoIP

+ Country ranking + City ranking +
+

These analytics were generated by regina %regina_version at %generation_date