2023-05-13 00:28:07 +02:00

366 lines
18 KiB
Python

# from sys import path
# print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}")
import sqlite3 as sql
from sys import exit
from re import fullmatch
import matplotlib.pyplot as plt
from os.path import isdir
from datetime import datetime as dt
from numpy import empty
# local
from regina.database import Database
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where
from regina.utility.utility import pdebug, warning, missing_arg
from regina.utility.globals import settings
from regina.data_visualization.utility import cleanup_referer, get_where_date_str, get_unique_visitor_ids_for_date, get_unique_request_ids_for_date, append_human_visitors, append_unique_request_ids_for_date_and_visitor
from regina.data_visualization.ranking import get_city_and_country_ranking, get_platform_browser_mobile_rankings, get_ranking, cleanup_referer_ranking, get_route_ranking
"""
Visualize information from the database.
"""
# Base colors (hex RGB) shared by all plots. The insertion order matters:
# color_settings_alternate cycles through the values in exactly this order.
palette = dict(
    red="#ee4035",
    orange="#f37736",
    yellow="#fdf458",
    green="#7bc043",
    blue="#0392cf",
    purple="#b044a0",
)

# Each color_settings_* dict maps a bar color to the names drawn in that color.

# file extension -> color
color_settings_filetypes = {
    palette["red"]: ["html", "php"],
    palette["green"]: ["jpg", "png", "jpeg", "gif", "svg", "webp"],
    palette["yellow"]: ["css"],
    "grey": ["txt"],
}

# plain list of colors, for plots where consecutive bars just alternate
color_settings_alternate = [*palette.values()]

# browser name -> color
color_settings_browsers = {
    palette["red"]: ["Safari"],
    palette["orange"]: ["Firefox"],
    palette["yellow"]: ["Chrome"],
    "grey": ["Edge"],
    palette["green"]: ["Chromium"],
    palette["purple"]: ["Brave"],
}

# platform name -> color
color_settings_platforms = {
    palette["red"]: ["Mac"],
    palette["green"]: ["Android"],
    "grey": ["iPhone", "iPad"],
    palette["yellow"]: ["Linux"],
    palette["purple"]: ["BSD"],
    palette["blue"]: ["Windows"],
}
def len_list_list(l: list[list]) -> int:
    """
    Return the total number of elements in all inner lists of `l`.

    `l` may be any iterable of sized objects; an empty `l` gives 0.
    """
    return sum(len(inner) for inner in l)
#
# PLOTTING
#
def add_vertikal_labels_in_bar_plot(labels, max_y_val, ax, bar_plot):
    """
    Write the label of every bar vertically into the plot.

    The label is placed on top of the bar, unless the bar is higher than 60% of
    max_y_val: then the label starts near the bottom, inside the bar.
    labels[i] belongs to the i-th bar of bar_plot.
    """
    tall_bar_limit = 0.6 * max_y_val
    inside_bar_base = 0.05 * max_y_val
    padding = 0.025 * max_y_val
    for position, bar in enumerate(bar_plot):
        bar_top = bar.get_height()
        # large bars get their label inside, small ones on top
        anchor = inside_bar_base if bar_top > tall_bar_limit else bar_top
        ax.text(
            bar.get_x() + bar.get_width() / 2.,
            anchor + padding,
            labels[position],
            ha='center', va='bottom', rotation=90,
        )
def add_labels_at_top_of_bar(xdata, ydata, max_y_val, ax, bar_plot):
    """
    Write the value of every bar (rounded to one digit) into a white box
    slightly below the top of the bar.

    ydata[i] belongs to the i-th bar of bar_plot; xdata is not used.
    """
    drop = 0.05 * max_y_val  # distance between the top of the bar and the label
    for position, bar in enumerate(bar_plot):
        value = ydata[position]
        center_x = bar.get_x() + bar.get_width() / 2
        ax.text(
            center_x,
            value - drop,
            round(value, 1),
            ha='center',
            bbox={"facecolor": 'white', "alpha": 0.8},
        )
def plot_ranking(ranking: list[tuple[int, str]], fig=None, xlabel="", ylabel="", color_settings:dict|list=[], figsize=None):
    """
    Make a bar plot of the ranking.

    Only the last settings["file_ranking_plot_max_files"] entries of `ranking` are drawn.

    ranking:        (count, name) tuples. The count of the last entry is used as the
                    reference height when placing the labels.
    fig:            figure to draw into. A new figure is created if none is given.
                    A new subplot is added to the figure in both cases.
    xlabel, ylabel: axis labels of the new subplot
    color_settings: - dict {color: [names]}: a bar gets `color` if the part of its name
                      after the last "." is in `names` (the last matching entry wins)
                    - non-empty list of colors: the bars cycle through the colors
                    - anything else, including the empty default: palette["blue"]
    figsize:        only used when a new figure is created

    Returns the figure.
    """
    # the mutable default of color_settings is never modified in here
    if not fig:
        fig = plt.figure(figsize=figsize, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
    ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel)

    # only plot the last entries if the ranking is too long
    start_index = max(0, len(ranking) - settings["file_ranking_plot_max_files"])
    shown = ranking[start_index:]

    x_names = [entry[1] for entry in shown]
    y_counts = [entry[0] for entry in shown]
    colors = []
    for position, name in enumerate(x_names):
        ft = name.split(".")[-1]
        color = palette["blue"]
        if isinstance(color_settings, dict):
            for key, val in color_settings.items():
                if ft in val: color = key
            if not color: color = palette["blue"]
        elif isinstance(color_settings, list) and color_settings:
            # an empty list must not get here: `% 0` raises ZeroDivisionError
            color = color_settings[position % len(color_settings)]
        colors.append(color)

    bar = ax.bar(x_names, y_counts, tick_label="", color=colors)

    if y_counts:
        # NOTE(review): presumably the ranking is sorted ascending, so that the last count is the largest - confirm
        add_vertikal_labels_in_bar_plot(x_names, y_counts[-1], ax, bar)
        if settings["plot_add_count_label"]: add_labels_at_top_of_bar(x_names, y_counts, y_counts[-1], ax, bar)
    return fig
# def plot(xdata, ydata, fig=None, ax=None, xlabel="", ylabel="", label="", linestyle='-', marker="", color="blue", rotate_xlabel=0):
# if not fig:
# fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
# if not ax:
# ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel)
# else:
# ax = ax.twinx()
# ax.set_ylabel(ylabel)
# # ax.tick_params(axis="y", labelcolor="r")
# ax.plot(xdata, ydata, marker=marker, label=label, linestyle=linestyle, color=color)
# plt.xticks(rotation=rotate_xlabel)
# if label: ax.legend()
# return fig, ax
def plot2y(xdata, ydata1, ydata2, fig=None, ax1=None, ax2=None, plots=None, xlabel="", ylabel1="", ylabel2="", label1="", label2="", linestyle='-', marker="", color1="blue", color2="orange", grid="major", rotate_xlabel=0, figsize=None):
    """
    Plot two data series over the same x values, each with its own y axis.

    ydata1 is plotted on the left axis (ax1), ydata2 on the right axis (ax2).
    To add more series to an existing plot, pass the fig, ax1, ax2 and plots
    returned by an earlier call.

    fig:            figure to draw into. A new figure is created if none is given.
    ax1, ax2:       axes to draw into. Unless both are given, new axes are created
                    and labeled with xlabel, ylabel1 and ylabel2.
    plots:          lines of earlier calls, so that the legend lists them too.
                    A non-empty list is extended in place.
    grid:           "major", "minor" or "both" to draw a grid on ax1, anything else for no grid
    rotate_xlabel:  currently unused, the x tick labels of new axes are always rotated by 90 degrees
    figsize:        only used when a new figure is created

    Returns (fig, ax1, ax2, plots).
    """
    if not fig:
        fig = plt.figure(figsize=figsize, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
    if not (ax1 and ax2):
        ax1 = fig.add_subplot(xlabel=xlabel, ylabel=ylabel1)
        ax2 = ax1.twinx()
        ax2.set_ylabel(ylabel2)
        ax1.tick_params(axis="x", rotation=90)

    plot1 = ax1.plot(xdata, ydata1, marker=marker, label=label1, linestyle=linestyle, color=color1)
    plot2 = ax2.plot(xdata, ydata2, marker=marker, label=label2, linestyle=linestyle, color=color2)

    # one legend for the lines of both axes
    if plots: plots += plot1 + plot2
    else: plots = plot1 + plot2
    plt.legend(plots, [line.get_label() for line in plots])

    if grid in ("major", "minor", "both"):
        # minor ticks are only needed when the grid uses them
        if grid in ("minor", "both"):
            ax1.minorticks_on()
        ax1.grid(visible=True, which=grid, linestyle="-", color="#888")

    return fig, ax1, ax2, plots
#
# MAIN
#
def _save_and_close_figure(fig, path: str):
    """Save fig to path and close it, since pyplot keeps every figure alive until it is closed."""
    fig.savefig(path, bbox_inches="tight")
    plt.close(fig)


def _percentage(part: int, whole: int) -> float:
    """Return part / whole in percent rounded to 2 digits, or -1.0 if whole is 0."""
    try:
        return round(100 * part / whole, 2)
    except ZeroDivisionError:
        return -1.0


def visualize(db: Database):
    """
    Generate the plots and the html page from the database.
    This assumes sanity checks have been done.

    The rankings and counts are generated for two time spans: all time (suffix "_total",
    one data point per month) and the last x days (suffix "_last_x_days", one data point per day).
    If settings["img_dir"] is a directory and settings["img_filetype"] is set, the plots are saved in img_dir.
    If settings["template_html"] and settings["html_out_path"] are set, the template is read, every
    "%<name>" in it is replaced by the matching value from the names dict, and the result is written to html_out_path.
    """
    pdebug("visualizing...")
    if not settings["db"]: missing_arg("db")
    if not settings["server_name"]: missing_arg("server_name")

    img_dir = settings["img_dir"]
    pdebug("img_dir:", img_dir)
    img_filetype = settings["img_filetype"]
    if isdir(img_dir) and img_filetype:
        gen_img = True
    else:
        print(f"Warning: Not generating images since at least one required variable is invalid: img_dir='{img_dir}', img_filetype='{img_filetype}'")
        gen_img = False

    img_location = settings["img_location"]

    # every "%<key>" in the html template is replaced by the value
    names = {
        # paths
        "img_route_ranking_last_x_days": f"ranking_routes_last_x_days.{img_filetype}",
        "img_referer_ranking_last_x_days": f"ranking_referers_last_x_days.{img_filetype}",
        "img_countries_last_x_days": f"ranking_countries_last_x_days.{img_filetype}",
        "img_cities_last_x_days": f"ranking_cities_last_x_days.{img_filetype}",
        "img_browser_ranking_last_x_days": f"ranking_browsers_last_x_days.{img_filetype}",
        "img_platform_ranking_last_x_days": f"ranking_platforms_last_x_days.{img_filetype}",
        "img_visitors_and_requests_last_x_days": f"visitor_request_count_daily_last_x_days.{img_filetype}",
        "img_route_ranking_total": f"ranking_routes_total.{img_filetype}",
        "img_referer_ranking_total": f"ranking_referers_total.{img_filetype}",
        "img_countries_total": f"ranking_countries_total.{img_filetype}",
        "img_cities_total": f"ranking_cities_total.{img_filetype}",
        "img_browser_ranking_total": f"ranking_browsers_total.{img_filetype}",
        "img_platform_ranking_total": f"ranking_platforms_total.{img_filetype}",
        "img_visitors_and_requests_total": f"visitor_request_count_daily_total.{img_filetype}",
        # values
        "mobile_visitor_percentage_total": 0.0,
        "mobile_visitor_percentage_last_x_days": 0.0,
        "visitor_count_last_x_days": 0,
        "visitor_count_total": 0,
        "request_count_last_x_days": 0,
        "request_count_total": 0,
        "human_visitor_percentage_last_x_days": 0.0,
        "human_visitor_percentage_total": 0.0,
        "human_request_percentage_last_x_days": 0.0,
        "human_request_percentage_total": 0.0,
        # general
        "regina_version": settings["version"],
        "server_name": settings["server_name"],
        "last_x_days": settings["last_x_days"],  # must be after all the things with last_x_days!
        "earliest_date": "1990-1-1",
        "generation_date": "1990-1-1 0:0:0",
    }

    # NOTE(review): this replaces the db passed by the caller with a new connection - confirm this is intended
    db = Database(database_path=settings["db"])

    get_humans = settings["get_human_percentage"]

    # DATE STRINGS
    earliest_date = db.get_earliest_date()
    names["earliest_date"] = dt.fromtimestamp(earliest_date).strftime("%Y-%m-%d")
    names["generation_date"] = dt.now().strftime("%Y-%m-%d %H:%M:%S")

    # LAST_X_DAYS
    # last_x_days_min_date: latest_date - last_x_days
    secs_per_day = 86400
    last_x_days_min_date = db.get_latest_date() - settings["last_x_days"] * secs_per_day
    last_x_days_constraint = get_where_date_str(min_date=last_x_days_min_date)
    last_x_days = db.get_days_where(last_x_days_constraint)
    last_x_days_constraints = [get_where_date_str(at_date=day) for day in last_x_days]

    # ALL DATES
    all_time_constraint = get_where_date_str(min_date=0)
    # all months in yyyy-mm format
    months_all_time = db.get_months_where(all_time_constraint)
    # one date constraint per month
    months_strs = []
    for year_month in months_all_time:
        year, month = year_month.split("-")
        # first day of the month
        min_date = dt(int(year), int(month), 1).timestamp()
        month = (int(month) % 12) + 1  # + 1 month
        year = int(year)
        if month == 1: year += 1  # the month after december is in the next year
        # first day of the next month - 1 sec
        max_date = dt(year, month, 1).timestamp() - 1
        months_strs.append(get_where_date_str(min_date=min_date, max_date=max_date))

    # (suffix, constraint for the whole span, names of the data points, constraint for each data point)
    time_spans = (
        ("_total", all_time_constraint, months_all_time, months_strs),
        ("_last_x_days", last_x_days_constraint, last_x_days, last_x_days_constraints),
    )
    for suffix, date_constraint, date_names, date_constraints in time_spans:
        assert(len(date_names) == len(date_constraints))

        # FILES
        # TODO handle groups
        file_ranking = get_route_ranking(db, date_constraint)
        if gen_img:
            fig_file_ranking = plot_ranking(file_ranking, xlabel="Route Name", ylabel="Number of requests", color_settings=color_settings_filetypes, figsize=settings["plot_size_broad"])
            _save_and_close_figure(fig_file_ranking, f"{img_dir}/{names[f'img_route_ranking{suffix}']}")

        # REFERER
        # NOTE(review): the whitelist setting is passed twice - presumably the last argument should be a blacklist, confirm against get_ranking
        referer_ranking = get_ranking(db, "request", "referer", date_constraint, settings["referer_ranking_whitelist"], settings["referer_ranking_whitelist"])
        pdebug("Referer ranking", referer_ranking)
        cleanup_referer_ranking(referer_ranking)
        if gen_img:
            fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate, figsize=settings["plot_size_broad"])
            _save_and_close_figure(fig_referer_ranking, f"{img_dir}/{names[f'img_referer_ranking{suffix}']}")

        # GEOIP
        if settings["do_geoip_rankings"]:
            # NOTE(review): no date constraint is passed here, so both time spans presumably get the same ranking - confirm
            city_ranking, country_ranking = get_city_and_country_ranking(db, require_humans=settings["geoip_only_humans"])
            pdebug("Country ranking:", country_ranking)
            pdebug("City ranking:", city_ranking)
            if gen_img:
                fig_country_ranking = plot_ranking(country_ranking, xlabel="Country", ylabel="Number of visitors", color_settings=color_settings_alternate, figsize=settings["plot_size_broad"])
                _save_and_close_figure(fig_country_ranking, f"{img_dir}/{names[f'img_countries{suffix}']}")

                fig_city_ranking = plot_ranking(city_ranking, xlabel="City", ylabel="Number of visitors", color_settings=color_settings_alternate, figsize=settings["plot_size_broad"])
                _save_and_close_figure(fig_city_ranking, f"{img_dir}/{names[f'img_cities{suffix}']}")

        # USER
        # for the time span
        unique_visitor_ids = get_unique_visitor_ids_for_date(db, date_constraint)
        unique_visitor_ids_human = []
        append_human_visitors(db, unique_visitor_ids, unique_visitor_ids_human)

        # for each date
        date_count = len(date_constraints)
        unique_visitor_ids_dates: list[list[int]] = []
        unique_request_ids_dates: list[list[int]] = []
        unique_visitor_ids_human_dates: list[list[int]] = [[] for _ in range(date_count)]
        unique_request_ids_human_dates: list[list[int]] = [[] for _ in range(date_count)]
        for date_index, date_constraint_ in enumerate(date_constraints):
            unique_visitor_ids_dates.append(get_unique_visitor_ids_for_date(db, date_constraint_))
            unique_request_ids_dates.append(get_unique_request_ids_for_date(db, date_constraint_))
            if get_humans:
                append_human_visitors(db, unique_visitor_ids_dates[date_index], unique_visitor_ids_human_dates[date_index])
                for human in unique_visitor_ids_human_dates[date_index]:
                    append_unique_request_ids_for_date_and_visitor(db, date_constraint_, human, unique_request_ids_human_dates[date_index])

        visitor_count = len_list_list(unique_visitor_ids_dates)
        request_count = len_list_list(unique_request_ids_dates)
        if get_humans:
            # -1.0 if there were no visitors / requests
            names[f"human_visitor_percentage{suffix}"] = _percentage(len_list_list(unique_visitor_ids_human_dates), visitor_count)
            names[f"human_request_percentage{suffix}"] = _percentage(len_list_list(unique_request_ids_human_dates), request_count)
        names[f"visitor_count{suffix}"] = visitor_count
        names[f"request_count{suffix}"] = request_count

        if gen_img:
            fig_daily, ax1, ax2, plots = plot2y(date_names, [len(visitor_ids) for visitor_ids in unique_visitor_ids_dates], [len(request_ids) for request_ids in unique_request_ids_dates], xlabel="Date", ylabel1="Visitor count", label1="Unique visitors", ylabel2="Request count", label2="Unique requests", color1=palette["red"], color2=palette["blue"], rotate_xlabel=-45, figsize=settings["plot_size_broad"])
            if get_humans:
                # add the human visitors and requests to the same axes
                fig_daily, ax1, ax2, plots = plot2y(date_names, [len(visitor_ids) for visitor_ids in unique_visitor_ids_human_dates], [len(request_ids) for request_ids in unique_request_ids_human_dates], label1="Unique visitors (human)", label2="Unique requests (human)", color1=palette["orange"], color2=palette["green"], fig=fig_daily, ax1=ax1, ax2=ax2, plots=plots, rotate_xlabel=-45, figsize=settings["plot_size_broad"])
            _save_and_close_figure(fig_daily, f"{img_dir}/{names[f'img_visitors_and_requests{suffix}']}")

        # os & browser
        platform_ranking, browser_ranking, names[f"mobile_visitor_percentage{suffix}"] = get_platform_browser_mobile_rankings(db, unique_visitor_ids_human)
        if gen_img:
            fig_os_rating = plot_ranking(platform_ranking, xlabel="Platform", ylabel="Share [%]", color_settings=color_settings_platforms, figsize=settings["plot_size_narrow"])
            _save_and_close_figure(fig_os_rating, f"{img_dir}/{names[f'img_platform_ranking{suffix}']}")
            fig_browser_rating = plot_ranking(browser_ranking, xlabel="Browser", ylabel="Share [%]", color_settings=color_settings_browsers, figsize=settings["plot_size_narrow"])
            _save_and_close_figure(fig_browser_rating, f"{img_dir}/{names[f'img_browser_ranking{suffix}']}")

    if settings["template_html"] and settings["html_out_path"]:
        pdebug(f"visualize: writing to html: {settings['html_out_path']}")

        with open(settings["template_html"], "r") as file:
            html = file.read()
        for name, value in names.items():
            # image names become paths relative to the image location
            if "img" in name:
                value = f"{img_location}/{value}"
            if isinstance(value, float):
                value = f"{value:.2f}"
            html = html.replace(f"%{name}", str(value))
        with open(settings["html_out_path"], "w") as file:
            file.write(html)
    else:
        warning(f"Skipping html generation because either template_html or html_out_path is invalid: template_html='{settings['template_html']}', html_out_path='{settings['html_out_path']}'")