added geoip

This commit is contained in:
matthias@arch 2022-12-14 14:55:04 +01:00
parent ba7cd1c22b
commit 5b7fae371e
12 changed files with 587 additions and 259 deletions

View File

@ -1,30 +1,25 @@
#compdef nicole
#compdef regina
# https://zsh.sourceforge.io/Doc/Release/Completion-System.html#Completion-Functions
_lyrics-site()
{
_values "lyrics site" \
'genius[use only genius.com]' \
'azlyrics[use only azlyrics.com]' \
'all[use all supported sites (default)]'
_config-file() {
# offer every regular file below the current directory whose name ends in .conf
# _values arguments: -s sets the separator between values (here ","),
# followed by the description and the list of candidate values
_values -s , 'config files' $(find . -type f -name '*.conf')
}
_csv-file() {
# offer every regular file below the current directory whose name ends in .csv
# (candidates for the geoip city database passed to --update-geoip)
_values -s , 'geoip city database as csv' $(find . -type f -name '*.csv')
}
_nicole()
{
_regina() {
# each argument is
# n:message:action
# option[description]:message:action
# # -s allow stacking, eg -inr
_arguments -s \
'-d[process directory]':directory:_directories \
'-f[process file]':file:_files \
'-r[go through directories recursively]' \
'-s[silent]' \
'-i[ignore history]' \
'-n[do not write to history]' \
'-o[overwrite if the file already has lyrics]' \
'-t[test, only print lyrics, dont write to tags]' \
'-h[show this]' \
'--rm_explicit[remove the "Explicit" lyrics warning from the title tag]' \
'--site[specify lyrics site]':lyrics-site:_lyrics-site
{--help,-h}'[show help]' \
{--config,-c}'[use this config file]':config:_config-file \
'--visualize[visualize the data in the database]' \
'--collect[collect requests from the nginx log]' \
'--access-log[source this logfile]':logfile:_files \
'--update-geoip[recreate the geoip database from csv]':csv:_csv-file
}
_nicole "$@"
_regina "$@"

View File

@ -4,34 +4,35 @@
<element>
<id>UMLClass</id>
<coordinates>
<x>247</x>
<y>312</y>
<x>299</x>
<y>221</y>
<w>299</w>
<h>234</h>
<h>247</h>
</coordinates>
<panel_attributes>User
<panel_attributes>user
--
&lt;&lt;PK&gt;&gt;
- user_id: INTEGER
--
- ip address: TEXT
- ip_address: INTEGER
- user agent string: TEXT
- platform: TEXT
- browser: TEXT
- mobile: INTEGER
- is_human: INTEGER
- range_id: INTEGER
style=autoresize</panel_attributes>
<additional_attributes/>
</element>
<element>
<id>UMLClass</id>
<coordinates>
<x>975</x>
<y>312</y>
<x>1040</x>
<y>221</y>
<w>234</w>
<h>130</h>
</coordinates>
<panel_attributes>FileGroup
<panel_attributes>filegroup
--
&lt;&lt;PK&gt;&gt;
- group_id: INTEGER
@ -43,8 +44,8 @@ style=autoresize</panel_attributes>
<element>
<id>Relation</id>
<coordinates>
<x>871</x>
<y>312</y>
<x>936</x>
<y>221</y>
<w>130</w>
<h>65</h>
</coordinates>
@ -57,12 +58,12 @@ m2=1
<element>
<id>UMLClass</id>
<coordinates>
<x>637</x>
<y>299</y>
<x>702</x>
<y>208</y>
<w>247</w>
<h>221</h>
</coordinates>
<panel_attributes>Request
<panel_attributes>request
--
&lt;&lt;PK&gt;&gt;
- request_id: INTEGER
@ -79,26 +80,26 @@ style=autoresize</panel_attributes>
<element>
<id>Relation</id>
<coordinates>
<x>533</x>
<y>312</y>
<w>130</w>
<x>585</x>
<y>221</y>
<w>143</w>
<h>65</h>
</coordinates>
<panel_attributes>lt=-
m1=1
m2=n
</panel_attributes>
<additional_attributes>10.0;20.0;80.0;20.0</additional_attributes>
<additional_attributes>10.0;20.0;90.0;20.0</additional_attributes>
</element>
<element>
<id>UMLClass</id>
<coordinates>
<x>975</x>
<y>585</y>
<x>1040</x>
<y>455</y>
<w>234</w>
<h>130</h>
</coordinates>
<panel_attributes>File
<panel_attributes>file
--
&lt;&lt;PK&gt;&gt;
- name: TEXT
@ -111,22 +112,22 @@ style=autoresize</panel_attributes>
<element>
<id>Relation</id>
<coordinates>
<x>1014</x>
<y>429</y>
<x>1079</x>
<y>338</y>
<w>52</w>
<h>182</h>
<h>143</h>
</coordinates>
<panel_attributes>lt=-
m1=n
m2=1
</panel_attributes>
<additional_attributes>10.0;120.0;10.0;10.0</additional_attributes>
<additional_attributes>10.0;90.0;10.0;10.0</additional_attributes>
</element>
<element>
<id>UMLNote</id>
<coordinates>
<x>780</x>
<y>156</y>
<x>845</x>
<y>65</y>
<w>390</w>
<h>91</h>
</coordinates>
@ -139,12 +140,110 @@ style=autoresize</panel_attributes>
<element>
<id>Relation</id>
<coordinates>
<x>988</x>
<y>234</y>
<x>1053</x>
<y>143</y>
<w>39</w>
<h>104</h>
</coordinates>
<panel_attributes>lt=&lt;-</panel_attributes>
<additional_attributes>10.0;60.0;10.0;10.0</additional_attributes>
</element>
<element>
<id>UMLClass</id>
<coordinates>
<x>676</x>
<y>611</y>
<w>247</w>
<h>169</h>
</coordinates>
<panel_attributes>city
--
&lt;&lt;PK&gt;&gt;
- city_id: INTEGER
--
- country_id: INTEGER
- name: TEXT
- region: TEXT
style=autoresize</panel_attributes>
<additional_attributes/>
</element>
<element>
<id>UMLClass</id>
<coordinates>
<x>1014</x>
<y>611</y>
<w>156</w>
<h>143</h>
</coordinates>
<panel_attributes>country
--
&lt;&lt;PK&gt;&gt;
- country_id: INTEGER
--
- name: TEXT
- code: TEXT
style=autoresize</panel_attributes>
<additional_attributes/>
</element>
<element>
<id>Relation</id>
<coordinates>
<x>910</x>
<y>637</y>
<w>130</w>
<h>65</h>
</coordinates>
<panel_attributes>lt=-
m1=1
m2=n
</panel_attributes>
<additional_attributes>80.0;20.0;10.0;20.0</additional_attributes>
</element>
<element>
<id>Relation</id>
<coordinates>
<x>572</x>
<y>637</y>
<w>130</w>
<h>65</h>
</coordinates>
<panel_attributes>lt=-
m1=1
m2=n
</panel_attributes>
<additional_attributes>80.0;20.0;10.0;20.0</additional_attributes>
</element>
<element>
<id>UMLClass</id>
<coordinates>
<x>364</x>
<y>611</y>
<w>221</w>
<h>169</h>
</coordinates>
<panel_attributes>ip_range
--
&lt;&lt;PK&gt;&gt;
- range_id: INTEGER
--
- from: INTEGER
- to: INTEGER
- city_id: INTEGER
style=autoresize</panel_attributes>
<additional_attributes/>
</element>
<element>
<id>Relation</id>
<coordinates>
<x>429</x>
<y>455</y>
<w>52</w>
<h>182</h>
</coordinates>
<panel_attributes>lt=-
m1=1
m2=n
</panel_attributes>
<additional_attributes>10.0;120.0;10.0;10.0</additional_attributes>
</element>
</diagram>

View File

@ -1,13 +1,27 @@
# default configuration for regina
# GENERAL
server_name = default_sever
# path to the database
db = /home/my_user/analytics/my_website.db
# ************************************* REGINA CONFIGURATION **************************************
# .__
# _______ ____ ____ |__| ____ _____
# \_ __ \_/ __ \ / ___\| |/ \\__ \
# | | \/\ ___// /_/ > | | \/ __ \_
# |__| \___ >___ /|__|___| (____ /
# \/_____/ \/ \/
# *************************************************************************************************
# File format:
# Assign value
# key = value
# Lists
# key = el1, el2, el3
# - do not use quotation marks (unless you literally want one)
# - leading and trailing whitespaces will be ignored
# ******************************************* GENERAL *********************************************
# path to the database eg. /home/my_user/analytics/my_website.db
db =
# DATA COLLECTION
# **************************************** DATA COLLECTION ****************************************
# these changes will only apply to newly collected data/creation of new database
# path to the nginx access log to parse.
access_log = /home/my_user/analytics/access.log
# *************************************************************************************************
# path to the nginx access log to parse, eg. /var/log/nginx/access.log. Make sure you have write permissions!
access_log =
# nginx locations and their root directory: location:directory,location:directory,...
locs_and_dirs = /:/www/my_website,/error:/www/error
@ -22,19 +36,31 @@ status_300_is_success = False
# if False, unique user is (ip-address - user agent) pair, if True only ip address
unique_user_is_ip_address = False
# whether a user needs to make at least 1 successful request to be a human
humans_need_success = True
human_needs_success = True
# dont collect requests to locations matched by this
# don't collect requests to locations that fully match this regex
request_location_regex_blacklist = /analytics.*
# get nation
user_get_country = True
# list of capitalized ISO 3166-1 alpha-2 country codes for which the ip address ranges need to be collected at city level, not country level
# eg for EU: AT, BE, BG, HR, CY, CZ, DK, EE, FI, FR, DE, GR, HU, IE, IT, LV, LT, LU, MT, NL, PL, PT, RO, SK, SI, ES, SE
get_cities_for_countries =
# VISUALIZATION
# ***************************************** VISUALIZATION *****************************************
# these settings can be changed at any point in time as they only affect the visualization of the data
# *************************************************************************************************
# will be available as variable for the generated website as %server_name
server_name = default_sever
# separate users into all and humans
get_human_percentage = True
# regex expression as whitelist for file ranking
# generate a country and city ranking
do_geoip_rankings = False
# only use humans for geoip rankings
geoip_only_humans = True
city_ranking_regex_blacklist = City in .*
country_ranking_regex_blacklist =
# ignore the protocol in referers, so https://url.com = http://url.com -> url.com
referer_ranking_ignore_protocol = True
@ -49,15 +75,19 @@ referer_ranking_regex_whitelist = ^[^\-].*
# regex expression as whitelist for user agent ranking
user_agent_ranking_regex_whitelist =
# file_ranking_regex_whitelist = .*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))
# regex expression as whitelist for file ranking
# eg .*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif)) to only show these files
file_ranking_regex_whitelist =
# maximum number of file(group)s on the file ranking
file_ranking_plot_max_files = 20
# whether to ignore non existing files in the ranking
file_ranking_ignore_error_files = True
# "plot_figsize" = (60 40),
plot_dpi = 300
# affects user/request count plot, file ranking and referer ranking
plot_size_broad = 10, 6
# affects platform and browser ranking
plot_size_narrow = 7, 5
# output directory for the generated plots
img_dir = /www/analytics/images
@ -67,3 +97,9 @@ img_location = images
template_html = /home/my_user/analytics/template.html
# output for the generated html
html_out_path = /www/analytics/statistics.html
# ******************************************** REGINA *********************************************
# these settings affect the behavior of regina
# *************************************************************************************************
# print lots! of debug messages to help you find problems
debug = False

View File

@ -2,96 +2,62 @@
% Matthias Quintern
% April 2022
# Name
# NAME
**R**uling **E**mpress **G**enerating **I**n-depth **N**ginx **A**nalytics (obviously)
Regina is an analytics tool for nginx.
## About
# SYNOPSIS
| **regina** --config CONFIG_FILE [OPTION...]
# DESCRIPTION
It collects information from the nginx access.log and stores it in a sqlite3 database.
Regina supports several data visualization configurations and can generate an admin-analytics page from an html template file.
# SYNOPSIS
| With config file:
| **regina** [OPTION...]
## Visualization options:
- Line plot: Einmal seit Beginn der Aufzeichnung(pro Monat), einmal letzte 30 Tage (pro Tag)
x: date
y: #unique users, #unique requests
- Bar charts:
- unique user information:
- used browsers (in percent)
- used operating systems (in percent)
- countries (in percent)
- unique request information:
- requested files (in counts)
- HTTP referrers (in counts)
A unique user is a IP-address - user agent pair.
A unique request is a unique-user - requested file - date (day) - combination.
## Command line options
**-d** directory
: process directory [directory]
**-h**, **--help**
: Show the possible command line arguments
**-f** file
: process file [file]
**-c**, **--config** config-file
: Retrieve settings from the config-file
**-r**
: go through directories recursively
**--access-log** log-file
: Overrides the access_log from the configuration
**-s**
: silent, no command-line output
**--collect**
: Collect information from the access_log and store it in the database
**-i**
: ignore history
**--visualize**
: Visualize the data from the database
**-n**
: do not write to history
**-o**
: overwrite if the file already has lyrics
**-t**
: test, do not write lyrics to file, but print to stdout
**-h**
: show this
**--rm_explicit**
: remove the "[Explicit]" lyrics warning from the song's title tag
**--site** site
: onlysearch [site] for lyrics (genius or azlyrics)
If you do not specify a directory or file, the program will ask you if you want to use the current working directory.
Example: `nicole -ior -d ~/music/artist --rm_explicit`
**--update-geoip** geoip-db
: Recreate the geoip part of the database from the geoip-db csv. The csv must have this form: lower, upper, country-code, country-name, region, city
# INSTALLATION AND UPDATING
To update nicole, simply follow the installation instructions.
To update regina, simply follow the installation instructions.
## pacman (Arch Linux)
Installing nicole using the Arch Build System also installs the man-page and a zsh completion script, if you have zsh installed.
Installing regina using the Arch Build System also installs the man-page and a zsh completion script, if you have zsh installed.
```shell
git clone https://github.com/MatthiasQuintern/nicole.git
cd nicole
git clone https://github.com/MatthiasQuintern/regina.git
cd regina
makepkg -si
```
## pip
You can also install nicole with python-pip:
You can also install regina with python-pip:
```shell
git clone https://github.com/MatthiasQuintern/nicole.git
cd nicole
git clone https://github.com/MatthiasQuintern/regina.git
cd regina
python3 -m pip install .
```
You can also install it system-wide using `sudo python3 -m pip install.`
You can also install it system-wide using `sudo python3 -m pip install .`
If you also want to install the man-page and the zsh completion script:
```shell
sudo cp nicole.1.man /usr/share/man/man1/nicole.1
sudo gzip /usr/share/man/man1/nicole.1
sudo cp _nicole.compdef.zsh /usr/share/zsh/site-functions/_nicole
sudo chmod +x /usr/share/zsh/site-functions/_nicole
sudo cp regina.1.man /usr/share/man/man1/regina.1
sudo gzip /usr/share/man/man1/regina.1
sudo cp _regina.compdef.zsh /usr/share/zsh/site-functions/_regina
sudo chmod +x /usr/share/zsh/site-functions/_regina
```
# CHANGELOG

View File

@ -1,8 +1,9 @@
import sqlite3 as sql
from re import fullmatch, match
from ipaddress import IPv4Address, ip_address
from time import mktime
from datetime import datetime as dt
from regina.db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup
from regina.db_operation.database import t_request, t_user, t_file, t_filegroup, t_ip_range, database_tables, get_filegroup, ip_range_id
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
from regina.utility.utility import pdebug, warning, pmessage
from regina.utility.globals import user_agent_operating_systems, user_agent_browsers, settings
@ -16,7 +17,7 @@ months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aut", "Sep", "Oct",
class Request:
def __init__(self, ip_address="", time_local="", request_type="", request_file="", request_protocol="", status="", bytes_sent="", referer="", user_agent=""):
self.ip_address = sanitize(ip_address)
self.ip_address = int(IPv4Address(sanitize(ip_address)))
self.time_local = 0
#[20/Nov/2022:00:47:36 +0100]
m = match(r"\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\]", time_local)
@ -98,8 +99,11 @@ def get_user_id(request: Request, cursor: sql.Cursor) -> int:
user_id: int = sql_tablesize(cursor, t_user)
# pdebug("new user:", user_id, request.ip_address)
platform, browser, mobile = get_os_browser_pairs_from_agent(request.user_agent)
ip_range_id_val = 0
if settings["user_get_location"]:
ip_range_id_val = get_ip_range_id(cursor, request.ip_address)
is_human = 0 # is_user_human cannot be called until user is in db int(is_user_human(cursor, user_id))
cursor.execute(f"INSERT INTO {t_user} (user_id, ip_address, user_agent, platform, browser, mobile, is_human) VALUES ({user_id}, '{request.ip_address}', '{request.user_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}');")
cursor.execute(f"INSERT INTO {t_user} (user_id, ip_address, user_agent, platform, browser, mobile, is_human, {ip_range_id.name}) VALUES ({user_id}, '{request.ip_address}', '{request.user_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}', '{ip_range_id_val}');")
return user_id
def is_user_human(cur: sql.Cursor, user_id: int):
@ -170,15 +174,30 @@ def get_os_browser_pairs_from_agent(user_agent):
return operating_system, browser, mobile
# def set_countries(cur: sql.Cursor, user_ids: list[int]):
# if settings["user_get_country"]:
# ipconn = sql.connect(ip2nation_db_path)
# ipcur = ipconn.cursor()
# for user_id in user_ids:
# ip_address = sql_select(cur, t_user, [("user_id", user_id)])
# cur.execute(f"SELECT ip_address FROM {t_user} WHERE user_id = {user_id}")
# ip_address = cur.fetchall()[0][0]
# ipcur.execute("SELECT iso_code_3 FROM ip2nationCountries WHERE ip")
def get_ip_range_id(cur: sql.Cursor, ip_address: int):
    """
    Find the id of the ip_range row whose [lower, upper] interval contains ip_address.

    :param cur: cursor on the regina database
    :param ip_address: the address as an integer (see Request.__init__, which stores int(IPv4Address(...)))
    :returns the matching ip_range id, or 0 if no range or more than one range matches
    """
    # bind the address as a parameter instead of pasting it into the statement as a quoted string
    query = f"SELECT {ip_range_id.name} FROM {t_ip_range} WHERE ? BETWEEN lower AND upper"
    pdebug(f"get_ip_range_id: {query} (ip_address={ip_address})")
    cur.execute(query, (ip_address,))
    results = cur.fetchall()
    if len(results) == 1:
        return results[0][0]
    if len(results) > 1:
        # overlapping ranges: there is no way to pick the right one, so report it and fall back to 0
        warning(f"get_ip_range_id: Found multiple ip_ranges for ip_address={ip_address}: results={results}")
    return 0
def update_ip_range_id(cur: sql.Cursor, user_id: int):
    """
    Re-resolve the ip range of one user and store it in the user table.

    Reads the stored ip_address of user_id, looks up the matching range with
    get_ip_range_id and writes the result back. If the user does not exist or
    is stored more than once, a warning is emitted and nothing is written.

    :param cur: cursor on the regina database
    :param user_id: id of the user to update
    """
    cur.execute(f"SELECT ip_address FROM {t_user} WHERE user_id = {user_id}")
    rows = cur.fetchall()
    if not rows:
        warning(f"update_ip_range_id: Invalid user_id={user_id}")
        return
    if len(rows) > 1:
        warning(f"update_ip_range_id: Found multiple ip_addresses for user_id={user_id}: results={rows}")
        return
    stored_ip = rows[0][0]
    new_range_id = get_ip_range_id(cur, stored_ip)
    cur.execute(f"UPDATE {t_user} SET {ip_range_id.name} = '{new_range_id}' WHERE user_id = '{user_id}'")
def add_requests_to_db(requests: list[Request], db_name: str):

View File

@ -1,10 +1,11 @@
# from sys import path
# print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}")
import sqlite3 as sql
from csv import reader
from os import path, listdir
# local
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize
from regina.utility.utility import pdebug
from regina.utility.globals import settings
"""
create reginas database as shown in the uml diagram database.uxf
@ -40,28 +41,61 @@ t_request = "request"
t_file = "file"
t_filegroup = "filegroup"
t_user = "user"
t_city = "city"
t_country = "country"
t_ip_range = "ip_range"
user_id = Entry("user_id", "INTEGER")
request_id = Entry("request_id", "INTEGER")
filegroup_id = Entry("group_id", "INTEGER")
ip_address_entry = Entry("ip_address", "TEXT")
filename_entry = Entry("filename", "TEXT")
city_id = Entry("city_id", "INTEGER")
country_id = Entry("country_id", "INTEGER")
ip_range_id = Entry("ip_range_id", "INTEGER")
database_tables = {
t_user: Table(t_user, user_id, [
Entry("ip_address", "TEXT"),
Entry("ip_address", "INTEGER"),
Entry("user_agent", "TEXT"),
Entry("platform", "TEXT"),
Entry("browser", "TEXT"),
Entry("mobile", "INTEGER"),
Entry("is_human", "INTEGER"),
# Entry("country_iso_code_3", "TEXT")
ip_range_id,
],
[f"UNIQUE({user_id.name})"]),
t_file: Table(t_file, filename_entry, [filegroup_id], [f"UNIQUE({filename_entry.name})"]),
t_filegroup: Table(t_filegroup, filegroup_id, [Entry("groupname", "TEXT")], [f"UNIQUE({filegroup_id.name})"]),
t_file: Table(t_file, filename_entry,
[filegroup_id],
[f"UNIQUE({filename_entry.name})"]),
t_filegroup: Table(t_filegroup, filegroup_id,
[Entry("groupname", "TEXT")],
[f"UNIQUE({filegroup_id.name})"]),
t_request: Table(t_request, request_id, [
user_id, filegroup_id, Entry("date", "INTEGER"), Entry("referer", "TEXT"), Entry("status", "INTEGER")
], ["UNIQUE(request_id)"]),
user_id,
filegroup_id,
Entry("date", "INTEGER"),
Entry("referer", "TEXT"),
Entry("status", "INTEGER")
],
["UNIQUE(request_id)"]),
t_ip_range: Table(t_ip_range, ip_range_id, [
Entry("lower", "INTEGER"),
Entry("upper", "INTEGER"),
city_id,
],
[f"UNIQUE({ip_range_id.name})"]),
t_city: Table(t_city, city_id, [
country_id,
Entry("name", "TEXT"),
Entry("region", "TEXT"),
],
[f"UNIQUE({city_id.name})"]),
t_country: Table(t_country, country_id, [
Entry("name", "TEXT"),
Entry("code", "TEXT"),
],
[f"UNIQUE({country_id.name})"]),
}
@ -146,17 +180,103 @@ def get_auto_filegroup_str(location_and_dirs:list[tuple[str, str]], auto_group_f
pdebug("get_auto_filegroup_str: found files:", files, "filegroups_str:", filegroups)
return filegroups
def create_db(name, filegroup_str="", location_and_dirs:list[tuple[str, str]]=[], auto_group_filetypes=[]):
def get_country_id(cur:sql.Cursor, name, code, country_tablesize):
    """
    Return the id of the country called name, inserting the country if it is not stored yet.

    :param cur: cursor on the regina database
    :param name: country name; it is pasted into the SQL text, so the caller has to sanitize it
    :param code: country code, only used when a new row is inserted
    :param country_tablesize: id that a newly inserted country receives
    :returns (country id, country_tablesize), the latter incremented by one if a row was inserted
    """
    cur.execute(f"SELECT {country_id.name} FROM {t_country} WHERE name = '{name}'")
    known = cur.fetchall()
    if len(known) > 0:
        # already stored: reuse the first match, the table size is unchanged
        return known[0][0], country_tablesize
    # unknown country: it gets the next free id
    new_id = country_tablesize
    cur.execute(f"INSERT INTO {t_country} ({country_id.name}, name, code) VALUES ({new_id}, '{name}', '{code}')")
    return new_id, country_tablesize + 1
def get_city_id(cur: sql.Cursor, name, region, country_id, city_tablesize):
    """
    Return the id of the city called name in the given country, inserting the city if it is not stored yet.

    A city is identified by its name together with its country, so that cities
    with the same name in different countries do not end up sharing one row
    (and with it the wrong country).

    :param cur: cursor on the regina database
    :param name: city name; it is pasted into the SQL text, so the caller has to sanitize it
    :param region: region name, only used when a new row is inserted (also has to be sanitized)
    :param country_id: id of the country the city belongs to
    :param city_tablesize: id that a newly inserted city receives
    :returns (city id, city_tablesize), the latter incremented by one if a row was inserted
    """
    # note: the parameter country_id shadows the module level country_id entry inside this function
    cur.execute(f"SELECT {city_id.name} FROM {t_city} WHERE name = '{name}' AND country_id = '{country_id}'")
    cities = cur.fetchall()
    if len(cities) > 0:
        city_id_val = cities[0][0]
    else:  # insert new city
        city_id_val = city_tablesize
        cur.execute(f"INSERT INTO {t_city} ({city_id.name}, name, region, country_id) VALUES ({city_id_val}, '{name}', '{region}', '{country_id}')")
        city_tablesize += 1
    return city_id_val, city_tablesize
def _insert_ip_range(cur: sql.Cursor, range_id: int, lower, upper, city_id_val: int):
    """Insert one row into the ip_range table; the bounds are converted to int and bound as parameters."""
    cur.execute(
        f"INSERT INTO {t_ip_range} ({ip_range_id.name}, lower, upper, {city_id.name}) VALUES (?, ?, ?, ?)",
        (range_id, int(lower), int(upper), city_id_val))


def _insert_combined_country_range(cur: sql.Cursor, range_id: int, lower, upper, country_id_val: int, country_name: str, city_tablesize: int):
    """Store a country-level range under the dummy city 'City in <country>' and return the updated city_tablesize."""
    pdebug(f"update_geoip_tables: ip_range_id={range_id}, Adding combined range for country={country_name}, lower={lower}, upper={upper}")
    city_id_val, city_tablesize = get_city_id(cur, f"City in {country_name}", f"Region in {country_name}", country_id_val, city_tablesize)
    _insert_ip_range(cur, range_id, lower, upper, city_id_val)
    return city_tablesize


def update_geoip_tables(cur: sql.Cursor, geoip_city_csv: str):
    """
    Recreate the ip_range table from a geoip csv and add missing countries and cities.

    Every csv row must have the columns lower, upper, country-code, country-name, region, city.
    Rows whose country code is listed in settings["get_cities_for_countries"] are stored
    one range per row with their real city. Consecutive rows of any other country are
    merged into one range that points to a dummy city "City in <country>".

    Existing rows in the country and city tables are kept, only ip_range is emptied.

    :param cur: cursor on the regina database
    :param geoip_city_csv: path to the csv file
    """
    FROM = 0; TO = 1; CODE = 2; COUNTRY = 3; REGION = 4; CITY = 5
    # id 0 means "no range found" (see get_ip_range_id), so real ranges start at 1
    ip_range_id_val = 1
    # newline='' is what the csv module expects for files passed to reader()
    with open(geoip_city_csv, 'r', newline='') as file:
        # delete all previous ranges
        cur.execute(f"DELETE FROM {t_ip_range}")
        # VACUUM is rejected inside a transaction and the DELETE may have opened one
        cur.connection.commit()
        cur.execute("VACUUM")
        csv = reader(file, delimiter=',', quotechar='"')

        # guarantees that unknown city/country will have id 0
        if not sql_exists(cur, t_country, [("name", "Unknown")]):
            cur.execute(f"INSERT INTO {t_country} ({country_id.name}, name, code) VALUES (0, 'Unknown', 'XX') ")
        if not sql_exists(cur, t_city, [("name", "Unknown")]):
            cur.execute(f"INSERT INTO {t_city} ({city_id.name}, name, region) VALUES (0, 'Unknown', 'Unkown') ")
        country_tablesize = sql_tablesize(cur, t_country)
        city_tablesize = sql_tablesize(cur, t_city)
        print(f"Recreating the geoip database from {geoip_city_csv}. This might take a long time...")

        city_level_codes = settings["get_cities_for_countries"]
        # state of the country-level range that is currently being merged, a negative id means "none pending"
        combine_range_country_id = -1
        combine_range_lower = -1
        combine_range_upper = -1
        combine_range_country_name = ""
        for row in csv:
            if len(row) <= CITY:
                # blank or truncated line
                continue
            # these might contain problematic characters (')
            row[CITY] = sanitize(row[CITY])
            row[COUNTRY] = sanitize(row[COUNTRY])
            row[REGION] = sanitize(row[REGION])

            # make sure country exists
            country_id_val, country_tablesize = get_country_id(cur, row[COUNTRY], row[CODE], country_tablesize)
            if row[CODE] in city_level_codes:
                # write out the pending combined range first, otherwise a later row of the
                # same country would extend it across this city-level range
                if combine_range_country_id >= 0:
                    city_tablesize = _insert_combined_country_range(cur, ip_range_id_val, combine_range_lower, combine_range_upper, combine_range_country_id, combine_range_country_name, city_tablesize)
                    ip_range_id_val += 1
                    combine_range_country_id = -1
                # make sure city exists
                city_id_val, city_tablesize = get_city_id(cur, row[CITY], row[REGION], country_id_val, city_tablesize)
                pdebug(f"update_geoip_tables: ip_range_id={ip_range_id_val}, Adding range for city={row[CITY]}, country={row[COUNTRY]}, lower={row[FROM]}, upper={row[TO]}")
                _insert_ip_range(cur, ip_range_id_val, row[FROM], row[TO], city_id_val)
                ip_range_id_val += 1
            elif combine_range_country_id == country_id_val:
                # same country as the pending range: extend it
                combine_range_upper = row[TO]
            else:
                # different country: write out the pending range (if any) and start a new one
                if combine_range_country_id >= 0:
                    city_tablesize = _insert_combined_country_range(cur, ip_range_id_val, combine_range_lower, combine_range_upper, combine_range_country_id, combine_range_country_name, city_tablesize)
                    ip_range_id_val += 1
                combine_range_country_id = country_id_val
                combine_range_lower = row[FROM]
                combine_range_upper = row[TO]
                combine_range_country_name = row[COUNTRY]
        if combine_range_country_id >= 0:  # last range, append
            city_tablesize = _insert_combined_country_range(cur, ip_range_id_val, combine_range_lower, combine_range_upper, combine_range_country_id, combine_range_country_name, city_tablesize)
            ip_range_id_val += 1
def create_db(db_name, filegroup_str="", location_and_dirs:list[tuple[str, str]]=[], auto_group_filetypes=[]):
"""
create the name with database_tables
"""
print(f"creating database: '{name}'")
conn = sql.connect(f"{name}")
print(f"creating database: '{db_name}'")
conn = sql.connect(f"{db_name}")
cursor = conn.cursor()
for table in database_tables.values():
cursor.execute(table.create_sql_str())
filegroup_str = filegroup_str.strip("; ") + ";" + get_auto_filegroup_str(location_and_dirs, auto_group_filetypes)
create_filegroups(cursor, filegroup_str)
cursor.close()
conn.commit()
conn.close()

View File

@ -9,7 +9,7 @@ from datetime import datetime as dt
from numpy import empty
# local
from regina.db_operation.database import t_request, t_user, t_file, t_filegroup
from regina.db_operation.database import t_request, t_user, t_file, t_filegroup, t_ip_range, t_city, t_country
from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where
from regina.utility.utility import pdebug, warning, missing_arg
from regina.utility.globals import settings
@ -277,7 +277,7 @@ def get_user_agent_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]:
# print(ranking)
return ranking
def get_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Cursor, date:str) -> list[tuple[int, str]]:
def get_request_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Cursor, date_condition:str) -> list[tuple[int, str]]:
"""
1) get all the distinct entries for field_name after min_date_unix_time
2) call get_name_function with the distinct entry
@ -286,25 +286,27 @@ def get_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Curs
:returns [(request_count, name)]
"""
ranking = []
cur.execute(f"SELECT DISTINCT {field_name} FROM {table} WHERE {date}")
cur.execute(f"SELECT DISTINCT {field_name} FROM {table} WHERE {date_condition}")
for name in cur.fetchall():
name = name[0]
if whitelist_regex:
if not fullmatch(whitelist_regex, name):
continue
# ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename))
cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {field_name} = '{name}' AND {date}")
cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {field_name} = '{name}' AND {date_condition}")
ranking.append((cur.fetchone()[0], name))
ranking.sort()
# print(ranking)
return ranking
re_uri_protocol = f"(https?)://"
# re_uri_protocol = f"(https?)://"
re_uri_protocol = f"(https?://)?"
re_uri_ipv4 = r"(?:(?:(?:\d{1,3}\.?){4})(?::\d+)?)"
# re_uri_ipv6 = ""
re_uri_domain = r"(?:([^/]+\.)*[^/]+\.[a-zA-Z]{2,})"
re_uri_location = r"(?:/(.*))?"
re_uri_full = f"{re_uri_protocol}({re_uri_domain}|{re_uri_ipv4})({re_uri_location})"
# (https?://)?((?:([^/]+\.)*[^/]+\.[a-zA-Z]{2,})|(?:(?:(?:\d{1,3}\.?){4})(?::\d+)?))((?:/(.*))?)
def cleanup_referer(referer: str) -> str:
"""
@ -326,7 +328,7 @@ def cleanup_referer(referer: str) -> str:
if len(domain.split(".")) == 2: # if domain.tld
referer = domain.split(".")[0]
if not settings["referer_ranking_ignore_subdomain"]: referer = subdomains + referer
if not settings["referer_ranking_ignore_protocol"]: referer = protocol + "://" + referer
if not settings["referer_ranking_ignore_protocol"]: referer = protocol + referer
if not settings["referer_ranking_ignore_location"]: referer += location
# pdebug(f"cleanup_referer: cleaned up: {referer}")
return referer
@ -344,6 +346,37 @@ def cleanup_referer_ranking(referer_ranking: list[tuple[int, str]]):
referer_ranking.append((count, referer))
referer_ranking.sort()
def get_city_and_country_ranking(cur:sql.Cursor, require_humans=True, regex_city_blacklist="", regex_country_blacklist=""):
    """
    Count the users per city and per country.

    :param cur: cursor on the regina database
    :param require_humans: if True, only users with is_human = 1 are counted
    :param regex_city_blacklist: city names that fully match this regex are left out, empty disables the filter
    :param regex_country_blacklist: country names that fully match this regex are left out, empty disables the filter
    :returns (city_ranking, country_ranking), both sorted ascending:
        city_ranking is [(count, "city name (country code)")], country_ranking is [(count, country name)]
    """
    query = f"SELECT ci.name, c.code, c.name FROM {t_country} AS c, {t_city} as ci, {t_user} as u, {t_ip_range} as i WHERE u.ip_range_id = i.ip_range_id AND i.city_id = ci.city_id AND ci.country_id = c.country_id"
    if require_humans:
        query += " AND u.is_human = 1"
    cur.execute(query)
    pdebug(f"get_city_and_country_ranking: require_humans={require_humans}, regex_city_blacklist='{regex_city_blacklist}', regex_country_blacklist='{regex_country_blacklist}'")
    rows = cur.fetchall()
    # TODO: find out why regex_blacklist does not work
    pdebug(f"get_city_and_country_ranking: found {len(rows)} ip_ranges")

    # an empty blacklist switches the respective filter off
    filter_cities = len(regex_city_blacklist) > 0
    filter_countries = len(regex_country_blacklist) > 0

    city_stats = {}      # city name -> [count, country code, country name]
    country_counts = {}  # country name -> count
    for city_name, country_code, country_name in rows:
        known_city = city_stats.get(city_name)
        if known_city is not None:
            known_city[0] += 1
        elif not filter_cities or fullmatch(regex_city_blacklist, city_name) is None:
            city_stats[city_name] = [1, country_code, country_name]

        if country_name in country_counts:
            country_counts[country_name] += 1
        elif not filter_countries or fullmatch(regex_country_blacklist, country_name) is None:
            country_counts[country_name] = 1

    city_ranking = sorted((stats[0], f"{city_name} ({stats[1]})") for city_name, stats in city_stats.items())
    country_ranking = sorted((count, country_name) for country_name, count in country_counts.items())
    return city_ranking, country_ranking
#
# PLOTTING
@ -365,13 +398,13 @@ def add_labels_at_top_of_bar(xdata, ydata, max_y_val, ax, bar_plot):
for idx,rect in enumerate(bar_plot):
ax.text(rect.get_x() + rect.get_width()/2, ydata[idx] - y_offset, round(ydata[idx], 1), ha='center', bbox=dict(facecolor='white', alpha=0.8))
def plot_ranking(ranking: list[tuple[int, str]], fig=None, xlabel="", ylabel="", color_settings:dict|list=[]):
def plot_ranking(ranking: list[tuple[int, str]], fig=None, xlabel="", ylabel="", color_settings:dict|list=[], figsize=None):
"""
make a bar plot of the most requested files
"""
# pdebug(f"plot_ranking: ranking={ranking}")
if not fig:
fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
fig = plt.figure(figsize=figsize, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
# create new axis if none is given
ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel)
# fill x y data
@ -404,36 +437,32 @@ def plot_ranking(ranking: list[tuple[int, str]], fig=None, xlabel="", ylabel="",
return fig
def plot(xdata, ydata, fig=None, ax=None, xlabel="", ylabel="", label="", linestyle='-', marker="", color="blue"):
if not fig:
fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
if not ax:
ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel)
else:
ax = ax.twinx()
ax.set_ylabel(ylabel)
# ax.tick_params(axis="y", labelcolor="r")
ax.plot(xdata, ydata, marker=marker, label=label, linestyle=linestyle, color=color)
if label: ax.legend()
# if xlim:
# if xlim[0] != xlim[1]:
# ax.set_xlim(*xlim)
# def plot(xdata, ydata, fig=None, ax=None, xlabel="", ylabel="", label="", linestyle='-', marker="", color="blue", rotate_xlabel=0):
# if not fig:
# fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
# if not ax:
# ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel)
# else:
# ax = ax.twinx()
# ax.set_ylabel(ylabel)
# # ax.tick_params(axis="y", labelcolor="r")
# ax.plot(xdata, ydata, marker=marker, label=label, linestyle=linestyle, color=color)
# plt.xticks(rotation=rotate_xlabel)
# if label: ax.legend()
# return fig, ax
# if ylim:
# if ylim[0] != ylim[1]:
# ax.set_ylim(*ylim)
return fig, ax
def plot2y(xdata, ydata1, ydata2, fig=None, ax1=None, ax2=None, plots=None, xlabel="", ylabel1="", ylabel2="", label1="", label2="", linestyle='-', marker="", color1="blue", color2="orange", grid="major"):
def plot2y(xdata, ydata1, ydata2, fig=None, ax1=None, ax2=None, plots=None, xlabel="", ylabel1="", ylabel2="", label1="", label2="", linestyle='-', marker="", color1="blue", color2="orange", grid="major", rotate_xlabel=0, figsize=None):
if not fig:
fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
fig = plt.figure(figsize=figsize, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None)
if not (ax1 and ax2):
ax1 = fig.add_subplot(xlabel=xlabel, ylabel=ylabel1)
ax2 = ax1.twinx()
ax2.set_ylabel(ylabel2)
# ax.tick_params(axis="y", labelcolor="r")
ax1.tick_params(axis="x", rotation=90)
plot1 = ax1.plot(xdata, ydata1, marker=marker, label=label1, linestyle=linestyle, color=color1)
plot2 = ax2.plot(xdata, ydata2, marker=marker, label=label2, linestyle=linestyle, color=color2)
# ax1.set_xticks(ax1.get_xticks())
# ax1.set_xticklabels(xdata, rotation=rotate_xlabel, rotation_mode="anchor")
# if label1 or label2: ax1.legend()
if plots: plots += plot1 + plot2
else: plots = plot1 + plot2
@ -444,13 +473,6 @@ def plot2y(xdata, ydata1, ydata2, fig=None, ax1=None, ax2=None, plots=None, xlab
ax1.minorticks_on()
ax1.grid(visible=True, which=grid, linestyle="-", color="#888")
# if xlim:
# if xlim[0] != xlim[1]:
# ax.set_xlim(*xlim)
# if ylim:
# if ylim[0] != ylim[1]:
# ax.set_ylim(*ylim)
return fig, ax1, ax2, plots
@ -470,16 +492,20 @@ def visualize(loaded_settings: dict):
img_location = settings["img_location"]
names = {
# paths
"img_file_ranking_last_x_days": f"ranking_all_time_files_last_x_days.{img_filetype}",
"img_referer_ranking_last_x_days": f"ranking_all_time_referers_last_x_days.{img_filetype}",
"img_browser_ranking_last_x_days": f"ranking_all_time_browsers_last_x_days.{img_filetype}",
"img_operating_system_ranking_last_x_days": f"ranking_all_time_operating_systems_last_x_days.{img_filetype}",
"img_file_ranking_last_x_days": f"ranking_files_last_x_days.{img_filetype}",
"img_referer_ranking_last_x_days": f"ranking_referers_last_x_days.{img_filetype}",
"img_countries_last_x_days": f"ranking_countries_last_x_days.{img_filetype}",
"img_cities_last_x_days": f"ranking_cities_last_x_days.{img_filetype}",
"img_browser_ranking_last_x_days": f"ranking_browsers_last_x_days.{img_filetype}",
"img_operating_system_ranking_last_x_days": f"ranking_operating_systems_last_x_days.{img_filetype}",
"img_users_and_requests_last_x_days": f"user_request_count_daily_last_x_days.{img_filetype}",
"img_file_ranking_total": f"ranking_all_time_files_total.{img_filetype}",
"img_referer_ranking_total": f"ranking_all_time_referers_total.{img_filetype}",
"img_browser_ranking_total": f"ranking_all_time_browsers_total.{img_filetype}",
"img_operating_system_ranking_total": f"ranking_all_time_operating_systems_total.{img_filetype}",
"img_file_ranking_total": f"ranking_files_total.{img_filetype}",
"img_referer_ranking_total": f"ranking_referers_total.{img_filetype}",
"img_countries_total": f"ranking_countries_total.{img_filetype}",
"img_cities_total": f"ranking_cities_total.{img_filetype}",
"img_browser_ranking_total": f"ranking_browsers_total.{img_filetype}",
"img_operating_system_ranking_total": f"ranking_operating_systems_total.{img_filetype}",
"img_users_and_requests_total": f"user_request_count_daily_total.{img_filetype}",
# values
"mobile_user_percentage_total": 0.0,
@ -522,7 +548,6 @@ def visualize(loaded_settings: dict):
days = get_days(cur, last_x_days_str)
days_strs = [get_where_date_str(at_date=day) for day in days]
# ALL DATES
all_time_str = get_where_date_str(min_date=0)
# all months in yyyy-mm format
@ -550,15 +575,29 @@ def visualize(loaded_settings: dict):
# FILES
file_ranking = get_file_ranking(cur, date_str)
if gen_img:
fig_file_ranking = plot_ranking(file_ranking, xlabel="Filename/Filegroup", ylabel="Number of requests", color_settings=color_settings_filetypes)
fig_file_ranking.savefig(f"{img_dir}/{names[f'img_file_ranking{suffix}']}")
fig_file_ranking = plot_ranking(file_ranking, xlabel="Filename/Filegroup", ylabel="Number of requests", color_settings=color_settings_filetypes, figsize=settings["plot_size_broad"])
fig_file_ranking.savefig(f"{img_dir}/{names[f'img_file_ranking{suffix}']}", bbox_inches="tight")
# REFERER
referer_ranking = get_ranking("referer", t_request, settings["referer_ranking_regex_whitelist"], cur, date_str)
referer_ranking = get_request_ranking("referer", t_request, settings["referer_ranking_regex_whitelist"], cur, date_str)
pdebug("Referer ranking", referer_ranking)
cleanup_referer_ranking(referer_ranking)
if gen_img:
fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate)
fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_referer_ranking{suffix}']}")
fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate, figsize=settings["plot_size_broad"])
fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_referer_ranking{suffix}']}", bbox_inches="tight")
# GEOIP
if settings["do_geoip_rankings"]:
city_ranking, country_ranking = get_city_and_country_ranking(cur, require_humans=settings["geoip_only_humans"], regex_city_blacklist=settings["city_ranking_regex_blacklist"], regex_country_blacklist=settings["country_ranking_regex_blacklist"])
pdebug("Country ranking:", country_ranking)
pdebug("City ranking:", city_ranking)
if gen_img:
fig_referer_ranking = plot_ranking(country_ranking, xlabel="Country", ylabel="Number of users", color_settings=color_settings_alternate, figsize=settings["plot_size_broad"])
fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_countries{suffix}']}", bbox_inches="tight")
fig_referer_ranking = plot_ranking(city_ranking, xlabel="City", ylabel="Number of users", color_settings=color_settings_alternate, figsize=settings["plot_size_broad"])
fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_cities{suffix}']}", bbox_inches="tight")
# USER
# user_agent_ranking = get_user_agent_ranking(cur, date_str)
@ -570,8 +609,8 @@ def visualize(loaded_settings: dict):
date_count = len(date_strs)
unique_user_ids_dates: list[list[int]] = []
unique_request_ids_dates: list[list[int]] = []
unique_user_ids_human_dates: list[list[int]] = [[] for i in range(date_count)]
unique_request_ids_human_dates: list[list[int]] = [[] for i in range(date_count)]
unique_user_ids_human_dates: list[list[int]] = [[] for _ in range(date_count)]
unique_request_ids_human_dates: list[list[int]] = [[] for _ in range(date_count)]
for i in range(date_count):
date_str_ = date_strs[i]
unique_user_ids_dates.append(get_unique_user_ids_for_date(cur, date_str_))
@ -603,26 +642,19 @@ def visualize(loaded_settings: dict):
names[f"user_count{suffix}"] = len_list_list(unique_user_ids_dates)
names[f"request_count{suffix}"] = len_list_list(unique_request_ids_dates)
if gen_img:
fig_daily, ax1, ax2, plots = plot2y(date_names, [len(user_ids) for user_ids in unique_user_ids_dates], [len(request_ids) for request_ids in unique_request_ids_dates], xlabel="Date", ylabel1="User count", label1="Unique users", ylabel2="Request count", label2="Unique requests", color1=palette["red"], color2=palette["blue"])
fig_daily, ax1, ax2, plots = plot2y(date_names, [len(user_ids) for user_ids in unique_user_ids_dates], [len(request_ids) for request_ids in unique_request_ids_dates], xlabel="Date", ylabel1="User count", label1="Unique users", ylabel2="Request count", label2="Unique requests", color1=palette["red"], color2=palette["blue"], rotate_xlabel=-45, figsize=settings["plot_size_broad"])
if get_humans:
fig_daily, ax1, ax2, plots = plot2y(date_names, [len(user_ids) for user_ids in unique_user_ids_human_dates], [len(request_ids) for request_ids in unique_request_ids_human_dates], label1="Unique users (human)", label2="Unique requests (human)", color1=palette["orange"], color2=palette["green"], fig=fig_daily, ax1=ax1, ax2=ax2, plots=plots)
fig_daily.savefig(f"{img_dir}/{names[f'img_users_and_requests{suffix}']}")
fig_daily, ax1, ax2, plots = plot2y(date_names, [len(user_ids) for user_ids in unique_user_ids_human_dates], [len(request_ids) for request_ids in unique_request_ids_human_dates], label1="Unique users (human)", label2="Unique requests (human)", color1=palette["orange"], color2=palette["green"], fig=fig_daily, ax1=ax1, ax2=ax2, plots=plots, rotate_xlabel=-45, figsize=settings["plot_size_broad"])
fig_daily.savefig(f"{img_dir}/{names[f'img_users_and_requests{suffix}']}", bbox_inches="tight")
# os & browser
os_ranking, browser_ranking, names[f"mobile_user_percentage{suffix}"] = get_os_browser_mobile_rankings(cur, unique_user_ids_human)
if gen_img:
fig_os_rating = plot_ranking(os_ranking, xlabel="Platform", ylabel="Share [%]", color_settings=color_settings_operating_systems)
fig_os_rating.savefig(f"{img_dir}/{names[f'img_operating_system_ranking{suffix}']}")
fig_browser_rating = plot_ranking(browser_ranking, xlabel="Browsers", ylabel="Share [%]", color_settings=color_settings_browsers)
fig_browser_rating.savefig(f"{img_dir}/{names[f'img_browser_ranking{suffix}']}")
fig_os_rating = plot_ranking(os_ranking, xlabel="Platform", ylabel="Share [%]", color_settings=color_settings_operating_systems, figsize=settings["plot_size_narrow"])
fig_os_rating.savefig(f"{img_dir}/{names[f'img_operating_system_ranking{suffix}']}", bbox_inches="tight")
fig_browser_rating = plot_ranking(browser_ranking, xlabel="Browsers", ylabel="Share [%]", color_settings=color_settings_browsers, figsize=settings["plot_size_narrow"])
fig_browser_rating.savefig(f"{img_dir}/{names[f'img_browser_ranking{suffix}']}", bbox_inches="tight")
# print("File Ranking", file_ranking)
# print("referer Ranking", referer_ranking)
# print("user agent ranking", user_agent_ranking)
# print("Unique Users:", get_unique_user_count(cur))
# fig_daily, ax_daily_users = plot(dates, [len(user_ids) for user_ids in unique_user_ids_for_dates], xlabel="Datum", ylabel="Einzigartige Nutzer", label="Einzigartige Nutzer", color="blue")
# fig_daily, ax_daily_requests = plot(dates, [len(request_ids) for request_ids in unique_request_ids_for_dates], fig=fig_daily, ax=ax_daily_users, xlabel="Datum", ylabel="Einzigartige Anfragen", label="Einzigartige Anfragen", color="orange")
# fig_daily.savefig(f"{img_dir}/daily.{img_filetype}")
# print("OS ranking", os_ranking)
# print("Browser ranking", browser_ranking)
# print("Mobile percentage", names["mobile_user_percentage"])

View File

@ -3,12 +3,14 @@
# __package__="."
from sys import argv, exit
from os.path import isfile
from regina.db_operation.collect import parse_log, add_requests_to_db
from regina.db_operation.database import create_db
import sqlite3 as sql
from regina.db_operation.collect import parse_log, add_requests_to_db, update_ip_range_id
from regina.db_operation.database import create_db, update_geoip_tables, t_user
from regina.db_operation.visualize import visualize
from regina.utility.settings_manager import read_settings_file
from regina.utility.globals import settings, version
from regina.utility.utility import pmessage
from regina.utility.sql_util import sql_tablesize
"""
start regina, launch either collect or visualize
@ -17,23 +19,24 @@ TODO:
- unique user = ip address
- max requests/time
- unique request datums unabhängig
- fix datum im user and request count plot
- fix datum monat is 1 zu wenig
- fix ms edge nicht dabei
X fix datum im user and request count plot
X fix datum monat is 1 zu wenig
X fix ms edge nicht dabei
- für letzten Tag: uhrzeit - requests/users plot
- checken warum last x days und total counts abweichen
- länder aus ip addresse
- "manuelle" datenbank beabeitung in cli:
- user + alle seine requests löschen
- user agents:
- android vor linux suchen, oder linux durch X11 ersetzen
X android vor linux suchen, oder linux durch X11 ersetzen
- alles was bot drin hat als bot betrachten
- wenn datenbankgröße zum problem wird:
- referrer table die die schon zusammengelegten referrer enthält, request verlinkt nur mit id
- selbes für platforms und browsers
- test:
- human detection
- referer cleanup
X referer cleanup
X geoip
- schöne log nachrichten für die cron mail
- testing!
"""
@ -41,13 +44,10 @@ TODO:
def help():
helpstring = """Command line options:
--server-name string
--log path to the access.log
--db name of the database
--settings["filegroups"] string describing settings["filegroups"], eg 'name1: file1, file2; name2: file3, file4, file5;'
--auto-group-filetypes comma separated list of filetypes, eg 'css,png,gif'
--locs-and_dirs comma separated list of nginx_location:directory pairs, eg '/:/www/website'
--config-file path to a config file that specifies all the other parameters: param = value, where value has the same formatting as on the command line
--config <path> path to a config file that specifies all the other parameters: param = value, where value has the same formatting as on the command line
--update-geoip <path> path to IP-COUNTRY-REGION-CITY database in csv format
--visualize generate the visualization website
--collect fill the database from the nginx access log
"""
print(helpstring)
@ -68,16 +68,20 @@ def main():
collect = False
visualize_ = False
log_file = ""
geoip_city_csv = ""
# parse args
i = 1
while i in range(1, len(argv)):
if argv[i] == "--config":
if argv[i] in ["--config", "-c"]:
if len(argv) > i + 1: config_file = argv[i+1]
else: missing_arg_val(argv[i])
if argv[i] == "--log-file":
elif argv[i] == "--log-file":
if len(argv) > i + 1: log_file = argv[i+1]
else: missing_arg_val(argv[i])
elif argv[i] == "--help":
if argv[i] == "--update-geoip":
if len(argv) > i + 1: geoip_city_csv = argv[i+1]
else: missing_arg_val(argv[i])
elif argv[i] in ["--help", "-h"]:
help()
exit(0)
elif argv[i] == "--collect":
@ -87,11 +91,11 @@ def main():
else:
pass
i += 1
if not collect and not visualize_:
missing_arg("--visualize or --collect")
if not (collect or visualize_ or geoip_city_csv):
missing_arg("--visualize or --collect or --update-geoip")
if not config_file:
missing_arg("--config_file")
missing_arg("--config")
if not isfile(config_file):
error(f"Not a file: '{config_file}'")
read_settings_file(config_file, settings)
@ -107,19 +111,33 @@ def main():
if isinstance(settings["locs_and_dirs"], str):
settings["locs_and_dirs"] = [ loc_and_dir.split(":") for loc_and_dir in settings["locs_and_dirs"].split(",") ]
if not isfile(config_file):
error(f"Not a file: '{config_file}'")
if not isfile(settings["db"]):
create_db(settings["db"], settings["filegroups"], settings["locs_and_dirs"], settings["auto_group_filetypes"])
if geoip_city_csv:
if not isfile(geoip_city_csv):
error(f"Not a file: '{geoip_city_csv}'")
conn = sql.connect(settings['db'], isolation_level=None) # required for vacuum
cur = conn.cursor()
update_geoip_tables(cur, geoip_city_csv)
# update users
for user_id in range(sql_tablesize(cur, t_user)):
update_ip_range_id(cur, user_id)
cur.close()
conn.commit()
conn.close()
if collect:
pmessage(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}' and logfile '{settings['access_log']}'")
if not isfile(settings["db"]):
create_db(settings["db"], settings["filegroups"], settings["locs_and_dirs"], settings["auto_group_filetypes"])
requests = parse_log(settings["access_log"])
add_requests_to_db(requests, settings["db"])
elif visualize_:
if visualize_:
pmessage(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}'")
if not isfile(settings["db"]): error(f"Invalid database path: '{settings['db']}'")
visualize(settings)
else:
error("Either --collect --visualize has to be provided")
if __name__ == '__main__':
main()

View File

@ -5,7 +5,7 @@ version = "1.0"
# default settings, these are overwriteable through a config file
settings = {
# GENERAL
"server_name": "",
"server_name": "default_sever",
# DATA COLLECTION
"access_log": "",
"db": "",
@ -15,12 +15,16 @@ settings = {
"request_location_regex_blacklist": "",
"request_is_same_on_same_day": True, # multiple requests from the same user to the same file on the same day are counted as 1
"unique_user_is_ip_address": False,
"user_get_country": True,
"get_cities_for_countries": [""], # list of country codes for which the ip address ranges need to be collected at city level, not country level
# VISUALIZATION
"get_human_percentage": False,
"human_needs_success": True, # a human must have at least 1 successful request (status < 300)
"status_300_is_success": False, # 300 codes are success
"do_geoip_rankings": False,
"geoip_only_humans": True,
"city_ranking_regex_blacklist": "",
"country_ranking_regex_blacklist": "",
# "file_ranking_regex_whitelist": r".*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))",
"file_ranking_regex_whitelist": r".*\.(html)",
"file_ranking_ignore_error_files": False, # skip files that only had unsuccessful requests (status < 300)
@ -34,12 +38,16 @@ settings = {
# "plot_figsize": (60, 40),
"plot_dpi": 300,
"plot_add_count_label": True,
"plot_size_broad": (10, 5),
"plot_size_narrow": (6.5, 5),
"img_dir": "",
"img_location": "",
"img_filetype": "svg",
"template_html": "",
"html_out_path": "",
"last_x_days": 30,
# regina
"debug": False
}
# these oses and browser can be detected:

View File

@ -4,14 +4,24 @@ def get_bool(bool_str: str, fallback=False):
elif bool_str in ["false", "False"]: return False
return fallback
def get_iterable(s, original_iterable, require_same_length=False):
    """
    Parse a comma separated string into an iterable that mirrors original_iterable.

    The result has the same container type as original_iterable (eg tuple or list).
    Each value is stripped of surrounding spaces and converted to the type of the
    first element of original_iterable. If original_iterable is empty, the values
    are kept as strings.

    s:                      comma separated values, eg "10, 5"
    original_iterable:      iterable whose container type and element type are reused
    require_same_length:    if True, the result must have as many elements as original_iterable

    returns:    the new iterable
    raises:     ValueError if a value can not be converted or if the length check fails
    """
    val_type = str
    if len(original_iterable) > 0: val_type = type(original_iterable[0])

    def convert(value: str):
        value = value.strip(" ")
        if val_type is bool:
            # bool("False") is True, so booleans have to be parsed by their name
            if value in ["true", "True"]: return True
            if value in ["false", "False"]: return False
            raise ValueError(f"'{value}' is not a valid boolean")
        return val_type(value)

    new_iter = type(original_iterable)(convert(v) for v in s.split(","))
    if require_same_length and len(original_iterable) != len(new_iter):
        # ValueError is a subclass of Exception, so callers catching Exception keep working
        raise ValueError(f"{new_iter} does not have the same length as {original_iterable}")
    return new_iter
def read_settings_file(filepath: str, settings:dict, ignore_invalid_lines=True, allow_new_keys=False, convert_to_type=True):
ignore_invalid_lines = False
lines = []
with open(filepath, "r") as file:
lines = file.readlines()
for i in range(len(lines)):
line = lines[i].strip("\n ")
if line.startswith("#"): continue
if line.startswith("#") or len(line) == 0: continue
vals = line.split("=")
if not len(vals) == 2:
if ignore_invalid_lines: continue
@ -23,11 +33,23 @@ def read_settings_file(filepath: str, settings:dict, ignore_invalid_lines=True,
if convert_to_type and not isinstance(settings[vals[0]], str|list|None):
if isinstance(settings[vals[0]], bool):
settings[vals[0]] = get_bool(vals[1].strip(" "), fallback=settings[vals[0]])
continue
try:
settings[vals[0]] = type(settings[vals[0]])(vals[1].strip(" "))
except Exception as e:
if not ignore_invalid_lines: raise e
else: continue
elif isinstance(settings[vals[0]], tuple):
try:
settings[vals[0]] = get_iterable(vals[1], settings[vals[0]], require_same_length=True)
except Exception as e:
if not ignore_invalid_lines: raise e
else: continue
elif isinstance(settings[vals[0]], list):
try:
settings[vals[0]] = get_iterable(vals[1], settings[vals[0]], require_same_length=False)
except Exception as e:
if not ignore_invalid_lines: raise e
else: continue
else:
try:
settings[vals[0]] = type(settings[vals[0]])(vals[1].strip(" "))
except Exception as e:
if not ignore_invalid_lines: raise e
else: continue
else:
settings[vals[0]] = vals[1].strip(" ")

View File

@ -2,13 +2,14 @@
# print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}")
from sys import exit
from regina.utility.globals import settings
"""
Various utilities
"""
DEBUG = False
def pdebug(*args, **keys):
if DEBUG: print(*args, **keys)
if settings["debug"]: print(*args, **keys)
def warning(*w, **k):
print("Warning:", *w, **k)

View File

@ -5,11 +5,11 @@
<meta name="description" content="Regina - Nginx Analytics">
<meta name="keywords" content="">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Analytics for %server-name</title>
<title>Analytics for %server_name</title>
<link rel="stylesheet" href="style.css">
</head>
<body>
<h1>Analytics for %server-name</h1>
<h1>Analytics for %server_name</h1>
<div class=box>
<center>
<h2>Last %last_x_days days</h2>
@ -23,17 +23,22 @@
<hr>
<h3>File access</h3>
<img src="%img_file_ranking_last_x_days" alt="File ranking" title="File ranking">
<img src="%img_file_ranking_last_x_days" alt="File ranking for the last %last_x_days days" title="File ranking for the last %last_x_days days">
<hr>
<h3>Platforms and browsers</h3>
<img src="%img_operating_system_ranking_last_x_days" alt="Operating system ranking" title="Operating system ranking">
<img src="%img_browser_ranking_last_x_days" alt="Browser ranking" title="Browser ranking">
<img src="%img_operating_system_ranking_last_x_days" alt="Operating system ranking for the last %last_x_days days" title="Operating system ranking for the last %last_x_days days">
<img src="%img_browser_ranking_last_x_days" alt="Browser ranking for the last %last_x_days days" title="Browser ranking for the last %last_x_days days">
<h4>Mobile users: %mobile_user_percentage_last_x_days%</h4>
<hr>
<h3>Referrers</h3>
<img src="%img_referer_ranking_last_x_days" alt="Referer ranking" title="Referer ranking">
<img src="%img_referer_ranking_last_x_days" alt="Referer ranking for the last %last_x_days days" title="Referer ranking for the last %last_x_days days">
<hr>
<h3>GeoIP</h3>
<img src="%img_countries_last_x_days" alt="Country ranking for the last %last_x_days days" title="Country ranking for the last %last_x_days days">
<img src="%img_cities_last_x_days" alt="City ranking for the last %last_x_days days" title="City ranking for the last %last_x_days days">
<hr>
</center>
</div>
@ -62,8 +67,15 @@
<h3>Referrers</h3>
<img src="%img_referer_ranking_total" alt="Referer ranking" title="Referer ranking">
<hr>
<h3>GeoIP</h3>
<img src="%img_countries_total" alt="Country ranking" title="Country ranking">
<img src="%img_cities_total" alt="City ranking" title="City ranking">
<hr>
</center>
</div>
<p>These analytics were generated by <a href="https://git.quintern.xyz/MatthiasQuintern/regina">regina %regina_version</a> at %generation_date</p>
<!-- Uncomment if you use IP2Location database -->
<p>This site includes IP2Location LITE data available from <a href="https://lite.ip2location.com">https://lite.ip2location.com</a></p>
</body>
</html>