From 5b7fae371e1fd426f19c28e219e1a6e77bdb1b10 Mon Sep 17 00:00:00 2001 From: "matthias@arch" Date: Wed, 14 Dec 2022 14:55:04 +0100 Subject: [PATCH] added geoip --- _regina.compdef.zsh | 37 +++---- database.uxf | 155 ++++++++++++++++++++++----- default.conf | 68 +++++++++--- regina.1.md | 92 +++++----------- regina/db_operation/collect.py | 43 +++++--- regina/db_operation/database.py | 140 ++++++++++++++++++++++-- regina/db_operation/visualize.py | 166 +++++++++++++++++------------ regina/main.py | 68 +++++++----- regina/utility/globals.py | 12 ++- regina/utility/settings_manager.py | 36 +++++-- regina/utility/utility.py | 5 +- template.html | 24 +++-- 12 files changed, 587 insertions(+), 259 deletions(-) diff --git a/_regina.compdef.zsh b/_regina.compdef.zsh index 807bb0d..29dc49a 100644 --- a/_regina.compdef.zsh +++ b/_regina.compdef.zsh @@ -1,30 +1,25 @@ -#compdef nicole +#compdef regina # https://zsh.sourceforge.io/Doc/Release/Completion-System.html#Completion-Functions -_lyrics-site() -{ - _values "lyrics site" \ - 'genius[use only genius.com]' \ - 'azlyrics[use only azlyrics.com]' \ - 'all[use all supported sites (default)]' +_config-file() { + # list all files that end in .conf + # -s separator, descritions options + _values -s , 'config files' $(find . -type f -name '*.conf') +} +_csv-file() { + _values -s , 'geoip city database as csv' $(find . 
-type f -name '*.csv') } -_nicole() -{ +_regina() { # each argument is # n:message:action # option[description]:message:action # # -s allow stacking, eg -inr _arguments -s \ - '-d[process directory]':directory:_directories \ - '-f[process file]':file:_files \ - '-r[go through directories recursively]' \ - '-s[silent]' \ - '-i[ignore history]' \ - '-n[do not write to history]' \ - '-o[overwrite if the file already has lyrics]' \ - '-t[test, only print lyrics, dont write to tags]' \ - '-h[show this]' \ - '--rm_explicit[remove the "Explicit" lyrics warning from the title tag]' \ - '--site[specify lyrics site]':lyrics-site:_lyrics-site + {--help,-h}'[show help]' \ + {--config,-c}'[use this config file]':config:_config-file \ + '--visualize[visualize the data in the database]' \ + '--collect[collect requests from the nginx log]' \ + '--access-log[source this logfile]':logfile:_file \ + '--update-geoip[recreate the geoip database from csv]':csv:_csv-file } -_nicole "$@" +_regina "$@" diff --git a/database.uxf b/database.uxf index 281314c..44c6551 100644 --- a/database.uxf +++ b/database.uxf @@ -4,34 +4,35 @@ UMLClass - 247 - 312 + 299 + 221 299 - 234 + 247 - User + user -- <<PK>> - user_id: INTEGER -- -- ip address: TEXT +- ip_address: INTEGER - user agent string: TEXT - platform: TEXT - browser: TEXT - mobile: INTEGER - is_human: INTEGER +- range_id: INTEGER style=autoresize UMLClass - 975 - 312 + 1040 + 221 234 130 - FileGroup + filegroup -- <<PK>> - group_id: INTEGER @@ -43,8 +44,8 @@ style=autoresize Relation - 871 - 312 + 936 + 221 130 65 @@ -57,12 +58,12 @@ m2=1 UMLClass - 637 - 299 + 702 + 208 247 221 - Request + request -- <<PK>> - request_id: INTEGER @@ -79,26 +80,26 @@ style=autoresize Relation - 533 - 312 - 130 + 585 + 221 + 143 65 lt=- m1=1 m2=n - 10.0;20.0;80.0;20.0 + 10.0;20.0;90.0;20.0 UMLClass - 975 - 585 + 1040 + 455 234 130 - File + file -- <<PK>> - name: TEXT @@ -111,22 +112,22 @@ style=autoresize Relation - 1014 - 429 + 1079 + 338 52 - 182 + 143 lt=- 
m1=n m2=1 - 10.0;120.0;10.0;10.0 + 10.0;90.0;10.0;10.0 UMLNote - 780 - 156 + 845 + 65 390 91 @@ -139,12 +140,110 @@ style=autoresize Relation - 988 - 234 + 1053 + 143 39 104 lt=<- 10.0;60.0;10.0;10.0 + + UMLClass + + 676 + 611 + 247 + 169 + + city +-- +<<PK>> +- city_id: INTEGER +-- +- country_id: INTEGER +- name: TEXT +- region: TEXT +style=autoresize + + + + UMLClass + + 1014 + 611 + 156 + 143 + + country +-- +<<PK>> +- country_id +-- +- name: TEXT +- code: TEXT +style=autoresize + + + + Relation + + 910 + 637 + 130 + 65 + + lt=- +m1=1 +m2=n + + 80.0;20.0;10.0;20.0 + + + Relation + + 572 + 637 + 130 + 65 + + lt=- +m1=1 +m2=n + + 80.0;20.0;10.0;20.0 + + + UMLClass + + 364 + 611 + 221 + 169 + + ip_range +-- +<<PK>> +- range_id +-- +- from: INTEGER +- to: INTEGER +- city_id: INTEGER +style=autoresize + + + + Relation + + 429 + 455 + 52 + 182 + + lt=- +m1=1 +m2=n + + 10.0;120.0;10.0;10.0 + diff --git a/default.conf b/default.conf index 275a7e6..7d3448c 100644 --- a/default.conf +++ b/default.conf @@ -1,13 +1,27 @@ -# default configuration for regina -# GENERAL -server_name = default_sever -# path to the database -db = /home/my_user/analytics/my_website.db +# ************************************* REGINA CONFIGURATION ************************************** +# .__ +# _______ ____ ____ |__| ____ _____ +# \_ __ \_/ __ \ / ___\| |/ \\__ \ +# | | \/\ ___// /_/ > | | \/ __ \_ +# |__| \___ >___ /|__|___| (____ / +# \/_____/ \/ \/ +# ************************************************************************************************* +# File format: +# Assign value +# key = value +# Lists +# key = el1, el2, el3 +# - do not use quotation marks (unless your literally want one) +# - leading and trailing whitespaces will be ignored +# ******************************************* GENERAL ********************************************* +# path to the database eg. 
/home/my_user/analytics/my_website.db +db = -# DATA COLLECTION +# **************************************** DATA COLLECTION **************************************** # these changes will only apply to newly collected data/creation of new database -# path to the nginx access log to parse. -access_log = /home/my_user/analytics/access.log +# ************************************************************************************************* +# path to the nginx access log to parse, eg. /var/log/nginx/access.log. Make sure you have write permissions! +access_log = # nginx locations and their root directory: location:directory,location:directory,... locs_and_dirs = /:/www/my_website,/error:/www/error @@ -22,19 +36,31 @@ status_300_is_success = False # if False, unique user is (ip-address - user agent) pair, if True only ip addess unique_user_is_ip_address = False # wether a user needs to make at least 1 successful request to be a human -humans_need_success = True +human_needs_success = True -# dont collect requests to locations matched by this +# don't collect requests to locations that fully match this request_location_regex_blacklist = /analytics.* -# get nation -user_get_country = True +# list of capitalized ISO 3166-1 alpha-2 country codes for which the ip address ranges need to be collected at city level, not country level +# eg for EU: AT, BE, BG, HR, CY, CZ, DK, EE, FI, FR, DE, GR, HU, IE, IT, LV, LT, LU, MT, NL, PL, PT, RO, SK, SI, ES, SE +get_cities_for_countries = -# VISUALIZATION +# ***************************************** VISUALIZATION ***************************************** +# these settings can be changed at any point in time as they only affect the visualization of the data +# ************************************************************************************************* +# will be available as variable for the generated website as %server_name +server_name = default_sever + # separate users into all and humans get_human_percentage = True -# regex expression as 
whitelist for file ranking + +# generate a country and city ranking +do_geoip_rankings = False +# only use humans for geoip rankings +geoip_only_humans = True +city_ranking_regex_blacklist = City in .* +country_ranking_regex_blacklist = # ignore the protocol in referers, so https://url.com = http://url.com -> url.com referer_ranking_ignore_protocol = True @@ -49,15 +75,19 @@ referer_ranking_regex_whitelist = ^[^\-].* # regex expression as whitelist for user agent ranking user_agent_ranking_regex_whitelist = -# file_ranking_regex_whitelist = .*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif)) +# regex expression as whitelist for file ranking +# eg .*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif)) to only show these files file_ranking_regex_whitelist = # maximum number of file(group)s on the file ranking file_ranking_plot_max_files = 20 # wether to ignore non existing files in the ranking file_ranking_ignore_error_files = True -# "plot_figsize" = (60 40), plot_dpi = 300 +# affects user/request count plot, file ranking and referer ranking +plot_size_broad = 10, 6 +# affects platform and browser ranking +plot_size_narrow = 7, 5 # output directory for the generated plots img_dir = /www/analytics/images @@ -67,3 +97,9 @@ img_location = images template_html = /home/my_user/analytics/template.html # output for the generated html html_out_path = /www/analytics/statistics.html + +# ******************************************** REGINA ********************************************* +# these settings affect the behavior of regina +# ************************************************************************************************* +# print lots! 
of debug messages to help you find problems +debug = False diff --git a/regina.1.md b/regina.1.md index 726d8fc..0b9517b 100644 --- a/regina.1.md +++ b/regina.1.md @@ -2,96 +2,62 @@ % Matthias Quintern % April 2022 -# Name +# NAME **R**uling **E**mpress **G**enerating **I**n-depth **N**ginx **A**nalytics (obviously) Regina is an analytics tool for nginx. -## About +# SYNOPSIS +| **regina** --config CONFIG_FILE [OPTION...] + +# DESCRIPTION It collects information from the nginx access.log and stores it in a sqlite3 database. Regina supports several data visualization configurations and can generate an admin-analytics page from an html template file. -# SYNOPSIS -| With config file: -| **regina** [OPTION...] - -## Visualization options: -- Line plot: Einmal seit Beginn der Aufzeichnung(pro Monat), einmal letzte 30 Tage (pro Tag) - x: date - y: #unique users, #unique requests -- Bar charts: - - unique user information: - - used browsers (in percent) - - used operating systems (in percent) - - countries (in percent) - - unique request information: - - requested files (in counts) - - HTTP referrers (in counts) -A unique user is a IP-address - user agent pair. -A unique request is a unique-user - requested file - date (day) - combination. 
- ## Command line options -**-d** directory -: process directory [directory] +**-h**, **--help** +: Show the possible command line arguments -**-f** file -: process file [file] +**-c**, **--config** config-file +: Retrieve settings from the config-file -**-r** -: go through directories recursively +**--access-log** log-file +: Overrides the access_log from the configuration -**-s** -: silent, no command-line output +**--collect** +: Collect information from the access_log and store them in the database -**-i** -: ignore history +**--visualize** +: Visualize the data from the database -**-n** -: do not write to history - -**-o** -: overwrite if the file already has lyrics - -**-t** -: test, do not write lyrics to file, but print to stdout - -**-h** -: show this - -**--rm_explicit** -: remove the "[Explicit]" lyrics warning from the song's title tag - -**--site** site -: onlysearch [site] for lyrics (genius or azlyrics) - -If you do not specify a directory or file, the program will ask you if you want to use the current working directory. -Example: `nicole -ior -d ~/music/artist --rm_explicit` +**--update-geoip** geoip-db +: Recreate the geoip part of the database from the geoip-db csv. The csv must have this form: lower, upper, country-code, country-name, region, city # INSTALLATION AND UPDATING -To update nicole, simply follow the installation instructions. +To update regina, simply follow the installation instructions. ## pacman (Arch Linux) -Installing nicole using the Arch Build System also installs the man-page and a zsh completion script, if you have zsh installed. +Installing regina using the Arch Build System also installs the man-page and a zsh completion script, if you have zsh installed. 
```shell -git clone https://github.com/MatthiasQuintern/nicole.git -cd nicole +git clone https://github.com/MatthiasQuintern/regina.git +cd regina makepkg -si ``` ## pip -You can also install nicole with python-pip: +You can also install regina with python-pip: ```shell -git clone https://github.com/MatthiasQuintern/nicole.git -cd nicole +git clone https://github.com/MatthiasQuintern/regina.git +cd regina python3 -m pip install . ``` -You can also install it system-wide using `sudo python3 -m pip install.` +You can also install it system-wide using `sudo python3 -m pip install .` If you also want to install the man-page and the zsh completion script: ```shell -sudo cp nicole.1.man /usr/share/man/man1/nicole.1 -sudo gzip /usr/share/man/man1/nicole.1 -sudo cp _nicole.compdef.zsh /usr/share/zsh/site-functions/_nicole -sudo chmod +x /usr/share/zsh/site-functions/_nicole +sudo cp regina.1.man /usr/share/man/man1/regina.1 +sudo gzip /usr/share/man/man1/regina.1 +sudo cp _regina.compdef.zsh /usr/share/zsh/site-functions/_regina +sudo chmod +x /usr/share/zsh/site-functions/_regina ``` # CHANGELOG diff --git a/regina/db_operation/collect.py b/regina/db_operation/collect.py index 6e97450..bbbd9c1 100644 --- a/regina/db_operation/collect.py +++ b/regina/db_operation/collect.py @@ -1,8 +1,9 @@ import sqlite3 as sql from re import fullmatch, match +from ipaddress import IPv4Address, ip_address from time import mktime from datetime import datetime as dt -from regina.db_operation.database import t_request, t_user, t_file, t_filegroup, database_tables, get_filegroup +from regina.db_operation.database import t_request, t_user, t_file, t_filegroup, t_ip_range, database_tables, get_filegroup, ip_range_id from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize from regina.utility.utility import pdebug, warning, pmessage from regina.utility.globals import user_agent_operating_systems, user_agent_browsers, settings @@ -16,7 +17,7 @@ months = 
["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aut", "Sep", "Oct", class Request: def __init__(self, ip_address="", time_local="", request_type="", request_file="", request_protocol="", status="", bytes_sent="", referer="", user_agent=""): - self.ip_address = sanitize(ip_address) + self.ip_address = int(IPv4Address(sanitize(ip_address))) self.time_local = 0 #[20/Nov/2022:00:47:36 +0100] m = match(r"\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\]", time_local) @@ -98,8 +99,11 @@ def get_user_id(request: Request, cursor: sql.Cursor) -> int: user_id: int = sql_tablesize(cursor, t_user) # pdebug("new user:", user_id, request.ip_address) platform, browser, mobile = get_os_browser_pairs_from_agent(request.user_agent) + ip_range_id_val = 0 + if settings["user_get_location"]: + ip_range_id_val = get_ip_range_id(cursor, request.ip_address) is_human = 0 # is_user_human cannot be called until user is in db int(is_user_human(cursor, user_id)) - cursor.execute(f"INSERT INTO {t_user} (user_id, ip_address, user_agent, platform, browser, mobile, is_human) VALUES ({user_id}, '{request.ip_address}', '{request.user_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}');") + cursor.execute(f"INSERT INTO {t_user} (user_id, ip_address, user_agent, platform, browser, mobile, is_human, {ip_range_id.name}) VALUES ({user_id}, '{request.ip_address}', '{request.user_agent}', '{platform}', '{browser}', '{int(mobile)}', '{is_human}', '{ip_range_id_val}');") return user_id def is_user_human(cur: sql.Cursor, user_id: int): @@ -170,15 +174,30 @@ def get_os_browser_pairs_from_agent(user_agent): return operating_system, browser, mobile -# def set_countries(cur: sql.Cursor, user_ids: list[int]): -# if settings["user_get_country"]: -# ipconn = sql.connect(ip2nation_db_path) -# ipcur = ipconn.cursor() -# for user_id in user_ids: -# ip_address = sql_select(cur, t_user, [("user_id", user_id)]) -# cur.execute(f"SELECT ip_address FROM {t_user} WHERE user_id = {user_id}") -# ip_address = 
cur.fetchall()[0][0] -# ipcur.execute("SELECT iso_code_3 FROM ip2nationCountries WHERE ip") +def get_ip_range_id(cur: sql.Cursor, ip_address: int): + print(f"SELECT {ip_range_id.name} FROM {t_ip_range} WHERE lower <= '{ip_address}' AND to >= '{ip_address}'") + cur.execute(f"SELECT {ip_range_id.name} FROM {t_ip_range} WHERE '{ip_address}' BETWEEN lower AND upper") + results = cur.fetchall() + ip_range_id_val = 0 + if len(results) == 0: + pass + elif len(results) > 1: + warning(f"get_countries: Found multiple ip_ranges for ip_address={ip_address}: results={results}") + else: + ip_range_id_val = results[0][0] + return ip_range_id_val + +def update_ip_range_id(cur: sql.Cursor, user_id: int): + cur.execute(f"SELECT ip_address FROM {t_user} WHERE user_id = {user_id}") + results = cur.fetchall() + if len(results) == 0: + warning(f"update_ip_range_id: Invalid user_id={user_id}") + return + elif len(results) > 1: + warning(f"update_ip_range_id: Found multiple ip_addresses for user_id={user_id}: results={results}") + return + ip_address = results[0][0] + cur.execute(f"UPDATE {t_user} SET {ip_range_id.name} = '{get_ip_range_id(cur, ip_address)}' WHERE user_id = '{user_id}'") def add_requests_to_db(requests: list[Request], db_name: str): diff --git a/regina/db_operation/database.py b/regina/db_operation/database.py index c3d6c16..db1f835 100644 --- a/regina/db_operation/database.py +++ b/regina/db_operation/database.py @@ -1,10 +1,11 @@ # from sys import path -# print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}") import sqlite3 as sql +from csv import reader from os import path, listdir # local from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize from regina.utility.utility import pdebug +from regina.utility.globals import settings """ create reginas database as shown in the uml diagram database.uxf @@ -40,28 +41,61 @@ t_request = "request" t_file = "file" t_filegroup = "filegroup" t_user = 
"user" +t_city = "city" +t_country = "country" +t_ip_range = "ip_range" user_id = Entry("user_id", "INTEGER") request_id = Entry("request_id", "INTEGER") filegroup_id = Entry("group_id", "INTEGER") ip_address_entry = Entry("ip_address", "TEXT") filename_entry = Entry("filename", "TEXT") +city_id = Entry("city_id", "INTEGER") +country_id = Entry("country_id", "INTEGER") +ip_range_id = Entry("ip_range_id", "INTEGER") + database_tables = { t_user: Table(t_user, user_id, [ - Entry("ip_address", "TEXT"), + Entry("ip_address", "INTEGER"), Entry("user_agent", "TEXT"), Entry("platform", "TEXT"), Entry("browser", "TEXT"), Entry("mobile", "INTEGER"), Entry("is_human", "INTEGER"), - # Entry("country_iso_code_3", "TEXT") + ip_range_id, ], [f"UNIQUE({user_id.name})"]), - t_file: Table(t_file, filename_entry, [filegroup_id], [f"UNIQUE({filename_entry.name})"]), - t_filegroup: Table(t_filegroup, filegroup_id, [Entry("groupname", "TEXT")], [f"UNIQUE({filegroup_id.name})"]), + t_file: Table(t_file, filename_entry, + [filegroup_id], + [f"UNIQUE({filename_entry.name})"]), + t_filegroup: Table(t_filegroup, filegroup_id, + [Entry("groupname", "TEXT")], + [f"UNIQUE({filegroup_id.name})"]), t_request: Table(t_request, request_id, [ - user_id, filegroup_id, Entry("date", "INTEGER"), Entry("referer", "TEXT"), Entry("status", "INTEGER") - ], ["UNIQUE(request_id)"]), + user_id, + filegroup_id, + Entry("date", "INTEGER"), + Entry("referer", "TEXT"), + Entry("status", "INTEGER") + ], + ["UNIQUE(request_id)"]), + t_ip_range: Table(t_ip_range, ip_range_id, [ + Entry("lower", "INTEGER"), + Entry("upper", "INTEGER"), + city_id, + ], + [f"UNIQUE({ip_range_id.name})"]), + t_city: Table(t_city, city_id, [ + country_id, + Entry("name", "TEXT"), + Entry("region", "TEXT"), + ], + [f"UNIQUE({city_id.name})"]), + t_country: Table(t_country, country_id, [ + Entry("name", "TEXT"), + Entry("code", "TEXT"), + ], + [f"UNIQUE({country_id.name})"]), } @@ -146,17 +180,103 @@ def 
get_auto_filegroup_str(location_and_dirs:list[tuple[str, str]], auto_group_f pdebug("get_auto_filegroup_str: found files:", files, "filegroups_str:", filegroups) return filegroups -def create_db(name, filegroup_str="", location_and_dirs:list[tuple[str, str]]=[], auto_group_filetypes=[]): +def get_country_id(cur:sql.Cursor, name, code, country_tablesize): + # countries = sql_select(cur, t_country, [("name", name)]) + cur.execute(f"SELECT {country_id.name} FROM {t_country} WHERE name = '{name}'") + countries = cur.fetchall() + if len(countries) > 0: + country_id_val = countries[0][0] + else: # insert new country + country_id_val = country_tablesize + # pdebug(f"update_geoip_tables: Adding country #{country_id_val}, name={name}") + cur.execute(f"INSERT INTO {t_country} ({country_id.name}, name, code) VALUES ({country_id_val}, '{name}', '{code}')") + country_tablesize += 1 + return country_id_val, country_tablesize + +def get_city_id(cur: sql.Cursor, name, region, country_id, city_tablesize): + # cities = sql_select(cur, t_city, [("name", name)]) + cur.execute(f"SELECT {city_id.name} FROM {t_city} WHERE name = '{name}'") + cities = cur.fetchall() + if len(cities) > 0: + city_id_val = cities[0][0] + else: # insert new city + city_id_val = city_tablesize + # pdebug(f"update_geoip_tables: Adding city #{city_id_val}, name={row[CITY]}, country={country_id_val}") + cur.execute(f"INSERT INTO {t_city} ({city_id.name}, name, region, country_id) VALUES ({city_id_val}, '{name}', '{region}', '{country_id}')") + city_tablesize += 1 + return city_id_val, city_tablesize + +def update_geoip_tables(cur: sql.Cursor, geoip_city_csv: str): + FROM = 0; TO = 1; CODE = 2; COUNTRY = 3; REGION = 4; CITY = 5 + ip_range_id_val = 0 + with open(geoip_city_csv, 'r') as file: + # delete all previous data + cur.execute(f"DELETE FROM {t_ip_range}") + cur.execute(f"VACUUM") + csv = reader(file, delimiter=',', quotechar='"') + + + # guarantees that unkown city/country will have id 0 + if not 
sql_exists(cur, t_country, [("name", "Unknown")]): + cur.execute(f"INSERT INTO {t_country} ({country_id.name}, name, code) VALUES (0, 'Unknown', 'XX') ") + if not sql_exists(cur, t_city, [("name", "Unknown")]): + cur.execute(f"INSERT INTO {t_city} ({city_id.name}, name, region) VALUES (0, 'Unknown', 'Unkown') ") + country_tablesize = sql_tablesize(cur, t_country) + city_tablesize = sql_tablesize(cur, t_city) + print(f"Recreating the geoip database from {geoip_city_csv}. This might take a long time...") + combine_range_country_id = 0 + combine_range_lower = -1 + combine_range_upper = -1 + combine_range_country_name = "" + for row in csv: + # these might contain problematic characters (') + row[CITY] = sanitize(row[CITY]) + row[COUNTRY] = sanitize(row[COUNTRY]) + row[REGION] = sanitize(row[REGION]) + + # make sure country exists + country_id_val, country_tablesize = get_country_id(cur, row[COUNTRY], row[CODE], country_tablesize) + if row[CODE] in settings["get_cities_for_countries"]: + # make sure city exists + city_id_val, city_tablesize = get_city_id(cur, row[CITY], row[REGION], country_id_val, city_tablesize) + pdebug(f"update_ip_range_id: ip_range_id={ip_range_id_val}, Adding range for city={row[CITY]}, country={row[COUNTRY]}, lower={row[FROM]}, upper={row[TO]}") + cur.execute(f"INSERT INTO {t_ip_range} ({ip_range_id.name}, lower, upper, {city_id.name}) VALUES ({ip_range_id_val}, {row[FROM]}, {row[TO]}, {city_id_val})") + ip_range_id_val += 1 + else: + if combine_range_country_id >= 0: + if combine_range_country_id == country_id_val: combine_range_upper = row[TO] + else: # new range for country, append + # get id for dummy city + pdebug(f"update_ip_range_id: ip_range_id={ip_range_id_val}, Adding combined range for country={combine_range_country_name}, lower={combine_range_lower}, upper={combine_range_upper}") + city_id_val, city_tablesize = get_city_id(cur, f"City in {combine_range_country_name}", f"Region in {combine_range_country_name}", 
combine_range_country_id, city_tablesize) + cur.execute(f"INSERT INTO {t_ip_range} ({ip_range_id.name}, lower, upper, {city_id.name}) VALUES ({ip_range_id_val}, {combine_range_lower}, {combine_range_upper}, {city_id_val})") + ip_range_id_val += 1 + combine_range_country_id = -1 + if combine_range_country_id < 0 : # combine with later ranges + combine_range_country_id = country_id_val + combine_range_lower = row[FROM] + combine_range_upper = row[TO] + combine_range_country_name = row[COUNTRY] + if combine_range_country_id >= 0: # last range , append + # get id for dummy city + pdebug(f"update_ip_range_id: ip_range_id={ip_range_id_val}, Adding combined range for country={combine_range_country_name}, lower={combine_range_lower}, upper={combine_range_upper}") + city_id_val, city_tablesize = get_city_id(cur, f"City in {combine_range_country_name}", f"Region in {combine_range_country_name}", combine_range_country_id, city_tablesize) + cur.execute(f"INSERT INTO {t_ip_range} ({ip_range_id.name}, lower, upper, {city_id.name}) VALUES ({ip_range_id_val}, {combine_range_lower}, {combine_range_upper}, {city_id_val})") + ip_range_id_val += 1 + + +def create_db(db_name, filegroup_str="", location_and_dirs:list[tuple[str, str]]=[], auto_group_filetypes=[]): """ create the name with database_tables """ - print(f"creating database: '{name}'") - conn = sql.connect(f"{name}") + print(f"creating database: '{db_name}'") + conn = sql.connect(f"{db_name}") cursor = conn.cursor() for table in database_tables.values(): cursor.execute(table.create_sql_str()) filegroup_str = filegroup_str.strip("; ") + ";" + get_auto_filegroup_str(location_and_dirs, auto_group_filetypes) create_filegroups(cursor, filegroup_str) + cursor.close() conn.commit() conn.close() diff --git a/regina/db_operation/visualize.py b/regina/db_operation/visualize.py index c3d5644..a81f4fd 100644 --- a/regina/db_operation/visualize.py +++ b/regina/db_operation/visualize.py @@ -9,7 +9,7 @@ from datetime import datetime as dt 
from numpy import empty # local -from regina.db_operation.database import t_request, t_user, t_file, t_filegroup +from regina.db_operation.database import t_request, t_user, t_file, t_filegroup, t_ip_range, t_city, t_country from regina.utility.sql_util import sanitize, sql_select, sql_exists, sql_insert, sql_tablesize, sql_get_count_where from regina.utility.utility import pdebug, warning, missing_arg from regina.utility.globals import settings @@ -277,7 +277,7 @@ def get_user_agent_ranking(cur: sql.Cursor, date:str) -> list[tuple[int, str]]: # print(ranking) return ranking -def get_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Cursor, date:str) -> list[tuple[int, str]]: +def get_request_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Cursor, date_condition:str) -> list[tuple[int, str]]: """ 1) get all the distinct entries for field_name after min_date_unix_time 2) call get_name_function with the distinct entry @@ -286,25 +286,27 @@ def get_ranking(field_name: str, table: str, whitelist_regex: str, cur: sql.Curs :returns [(request_count, name)] """ ranking = [] - cur.execute(f"SELECT DISTINCT {field_name} FROM {table} WHERE {date}") + cur.execute(f"SELECT DISTINCT {field_name} FROM {table} WHERE {date_condition}") for name in cur.fetchall(): name = name[0] if whitelist_regex: if not fullmatch(whitelist_regex, name): continue # ranking.append((sql_get_count_where(cur, t_request, [("group_id", group)]), filename)) - cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {field_name} = '{name}' AND {date}") + cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {field_name} = '{name}' AND {date_condition}") ranking.append((cur.fetchone()[0], name)) ranking.sort() # print(ranking) return ranking -re_uri_protocol = f"(https?)://" +# re_uri_protocol = f"(https?)://" +re_uri_protocol = f"(https?://)?" 
re_uri_ipv4 = r"(?:(?:(?:\d{1,3}\.?){4})(?::\d+)?)" # re_uri_ipv6 = "" re_uri_domain = r"(?:([^/]+\.)*[^/]+\.[a-zA-Z]{2,})" re_uri_location = r"(?:/(.*))?" re_uri_full = f"{re_uri_protocol}({re_uri_domain}|{re_uri_ipv4})({re_uri_location})" +# (https?://)?((?:([^/]+\.)*[^/]+\.[a-zA-Z]{2,})|(?:(?:(?:\d{1,3}\.?){4})(?::\d+)?))((?:/(.*))?) def cleanup_referer(referer: str) -> str: """ @@ -326,7 +328,7 @@ def cleanup_referer(referer: str) -> str: if len(domain.split(".")) == 2: # if domain.tld referer = domain.split(".")[0] if not settings["referer_ranking_ignore_subdomain"]: referer = subdomains + referer - if not settings["referer_ranking_ignore_protocol"]: referer = protocol + "://" + referer + if not settings["referer_ranking_ignore_protocol"]: referer = protocol + referer if not settings["referer_ranking_ignore_location"]: referer += location # pdebug(f"cleanup_referer: cleaned up: {referer}") return referer @@ -344,6 +346,37 @@ def cleanup_referer_ranking(referer_ranking: list[tuple[int, str]]): referer_ranking.append((count, referer)) referer_ranking.sort() +def get_city_and_country_ranking(cur:sql.Cursor, require_humans=True, regex_city_blacklist="", regex_country_blacklist=""): + sql_cmd = f"SELECT ci.name, c.code, c.name FROM {t_country} AS c, {t_city} as ci, {t_user} as u, {t_ip_range} as i WHERE u.ip_range_id = i.ip_range_id AND i.city_id = ci.city_id AND ci.country_id = c.country_id" + if require_humans: sql_cmd += " AND u.is_human = 1" + cur.execute(sql_cmd) + pdebug(f"get_city_and_country_ranking: require_humans={require_humans}, regex_city_blacklist='{regex_city_blacklist}', regex_country_blacklist='{regex_country_blacklist}'") + cities = cur.fetchall() + cities_dict = {} + country_dict = {} + # TODO: find out why regex_blacklist does not work + pdebug(f"get_city_and_country_ranking: found {len(cities)} ip_ranges") + + validate_city_cmd = lambda _ : True + validate_country_cmd = lambda _ : True + if len(regex_city_blacklist) > 0: validate_city_cmd = 
lambda city : fullmatch(regex_city_blacklist, city) is None + if len(regex_country_blacklist) > 0 : validate_country_cmd = lambda country : fullmatch(regex_country_blacklist, country) is None + for i in range(len(cities)): + if cities[i][0] in cities_dict: + cities_dict[cities[i][0]][0] += 1 + else: + if validate_city_cmd(cities[i][0]): + cities_dict[cities[i][0]] = [1, cities[i][1], cities[i][2]] # count, country code + if cities[i][2] in country_dict: + country_dict[cities[i][2]] += 1 + else: + if validate_country_cmd(cities[i][2]): + country_dict[cities[i][2]] = 1 # count, country code + city_ranking = [(v[0], f"{k} ({v[1]})") for k,v in cities_dict.items()] + city_ranking.sort() + country_ranking = [(v, k) for k,v in country_dict.items()] + country_ranking.sort() + return city_ranking, country_ranking # # PLOTTING @@ -365,13 +398,13 @@ def add_labels_at_top_of_bar(xdata, ydata, max_y_val, ax, bar_plot): for idx,rect in enumerate(bar_plot): ax.text(rect.get_x() + rect.get_width()/2, ydata[idx] - y_offset, round(ydata[idx], 1), ha='center', bbox=dict(facecolor='white', alpha=0.8)) -def plot_ranking(ranking: list[tuple[int, str]], fig=None, xlabel="", ylabel="", color_settings:dict|list=[]): +def plot_ranking(ranking: list[tuple[int, str]], fig=None, xlabel="", ylabel="", color_settings:dict|list=[], figsize=None): """ make a bar plot of the most requested files """ # pdebug(f"plot_ranking: ranking={ranking}") if not fig: - fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None) + fig = plt.figure(figsize=figsize, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None) # create new axis if none is given ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel) # fill x y data @@ -404,36 +437,32 @@ def plot_ranking(ranking: list[tuple[int, str]], fig=None, xlabel="", ylabel="", return fig -def plot(xdata, ydata, fig=None, ax=None, xlabel="", ylabel="", label="", linestyle='-', 
marker="", color="blue"): - if not fig: - fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None) - if not ax: - ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel) - else: - ax = ax.twinx() - ax.set_ylabel(ylabel) - # ax.tick_params(axis="y", labelcolor="r") - ax.plot(xdata, ydata, marker=marker, label=label, linestyle=linestyle, color=color) - if label: ax.legend() - # if xlim: - # if xlim[0] != xlim[1]: - # ax.set_xlim(*xlim) +# def plot(xdata, ydata, fig=None, ax=None, xlabel="", ylabel="", label="", linestyle='-', marker="", color="blue", rotate_xlabel=0): +# if not fig: +# fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None) +# if not ax: +# ax = fig.add_subplot(xlabel=xlabel, ylabel=ylabel) +# else: +# ax = ax.twinx() +# ax.set_ylabel(ylabel) +# # ax.tick_params(axis="y", labelcolor="r") +# ax.plot(xdata, ydata, marker=marker, label=label, linestyle=linestyle, color=color) +# plt.xticks(rotation=rotate_xlabel) +# if label: ax.legend() +# return fig, ax - # if ylim: - # if ylim[0] != ylim[1]: - # ax.set_ylim(*ylim) - return fig, ax - -def plot2y(xdata, ydata1, ydata2, fig=None, ax1=None, ax2=None, plots=None, xlabel="", ylabel1="", ylabel2="", label1="", label2="", linestyle='-', marker="", color1="blue", color2="orange", grid="major"): +def plot2y(xdata, ydata1, ydata2, fig=None, ax1=None, ax2=None, plots=None, xlabel="", ylabel1="", ylabel2="", label1="", label2="", linestyle='-', marker="", color1="blue", color2="orange", grid="major", rotate_xlabel=0, figsize=None): if not fig: - fig = plt.figure(figsize=None, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None) + fig = plt.figure(figsize=figsize, dpi=settings["plot_dpi"], linewidth=1.0, frameon=True, subplotpars=None, layout=None) if not (ax1 and ax2): ax1 = fig.add_subplot(xlabel=xlabel, ylabel=ylabel1) ax2 = ax1.twinx() ax2.set_ylabel(ylabel2) - # 
ax.tick_params(axis="y", labelcolor="r") + ax1.tick_params(axis="x", rotation=90) plot1 = ax1.plot(xdata, ydata1, marker=marker, label=label1, linestyle=linestyle, color=color1) plot2 = ax2.plot(xdata, ydata2, marker=marker, label=label2, linestyle=linestyle, color=color2) + # ax1.set_xticks(ax1.get_xticks()) + # ax1.set_xticklabels(xdata, rotation=rotate_xlabel, rotation_mode="anchor") # if label1 or label2: ax1.legend() if plots: plots += plot1 + plot2 else: plots = plot1 + plot2 @@ -444,13 +473,6 @@ def plot2y(xdata, ydata1, ydata2, fig=None, ax1=None, ax2=None, plots=None, xlab ax1.minorticks_on() ax1.grid(visible=True, which=grid, linestyle="-", color="#888") - # if xlim: - # if xlim[0] != xlim[1]: - # ax.set_xlim(*xlim) - - # if ylim: - # if ylim[0] != ylim[1]: - # ax.set_ylim(*ylim) return fig, ax1, ax2, plots @@ -470,16 +492,20 @@ def visualize(loaded_settings: dict): img_location = settings["img_location"] names = { # paths - "img_file_ranking_last_x_days": f"ranking_all_time_files_last_x_days.{img_filetype}", - "img_referer_ranking_last_x_days": f"ranking_all_time_referers_last_x_days.{img_filetype}", - "img_browser_ranking_last_x_days": f"ranking_all_time_browsers_last_x_days.{img_filetype}", - "img_operating_system_ranking_last_x_days": f"ranking_all_time_operating_systems_last_x_days.{img_filetype}", + "img_file_ranking_last_x_days": f"ranking_files_last_x_days.{img_filetype}", + "img_referer_ranking_last_x_days": f"ranking_referers_last_x_days.{img_filetype}", + "img_countries_last_x_days": f"ranking_countries_last_x_days.{img_filetype}", + "img_cities_last_x_days": f"ranking_cities_last_x_days.{img_filetype}", + "img_browser_ranking_last_x_days": f"ranking_browsers_last_x_days.{img_filetype}", + "img_operating_system_ranking_last_x_days": f"ranking_operating_systems_last_x_days.{img_filetype}", "img_users_and_requests_last_x_days": f"user_request_count_daily_last_x_days.{img_filetype}", - "img_file_ranking_total": 
f"ranking_all_time_files_total.{img_filetype}", - "img_referer_ranking_total": f"ranking_all_time_referers_total.{img_filetype}", - "img_browser_ranking_total": f"ranking_all_time_browsers_total.{img_filetype}", - "img_operating_system_ranking_total": f"ranking_all_time_operating_systems_total.{img_filetype}", + "img_file_ranking_total": f"ranking_files_total.{img_filetype}", + "img_referer_ranking_total": f"ranking_referers_total.{img_filetype}", + "img_countries_total": f"ranking_countries_total.{img_filetype}", + "img_cities_total": f"ranking_cities_total.{img_filetype}", + "img_browser_ranking_total": f"ranking_browsers_total.{img_filetype}", + "img_operating_system_ranking_total": f"ranking_operating_systems_total.{img_filetype}", "img_users_and_requests_total": f"user_request_count_daily_total.{img_filetype}", # values "mobile_user_percentage_total": 0.0, @@ -522,7 +548,6 @@ def visualize(loaded_settings: dict): days = get_days(cur, last_x_days_str) days_strs = [get_where_date_str(at_date=day) for day in days] - # ALL DATES all_time_str = get_where_date_str(min_date=0) # all months in yyyy-mm format @@ -550,15 +575,29 @@ def visualize(loaded_settings: dict): # FILES file_ranking = get_file_ranking(cur, date_str) if gen_img: - fig_file_ranking = plot_ranking(file_ranking, xlabel="Filename/Filegroup", ylabel="Number of requests", color_settings=color_settings_filetypes) - fig_file_ranking.savefig(f"{img_dir}/{names[f'img_file_ranking{suffix}']}") + fig_file_ranking = plot_ranking(file_ranking, xlabel="Filename/Filegroup", ylabel="Number of requests", color_settings=color_settings_filetypes, figsize=settings["plot_size_broad"]) + fig_file_ranking.savefig(f"{img_dir}/{names[f'img_file_ranking{suffix}']}", bbox_inches="tight") # REFERER - referer_ranking = get_ranking("referer", t_request, settings["referer_ranking_regex_whitelist"], cur, date_str) + referer_ranking = get_request_ranking("referer", t_request, settings["referer_ranking_regex_whitelist"], cur, 
date_str) + pdebug("Referer ranking", referer_ranking) cleanup_referer_ranking(referer_ranking) if gen_img: - fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate) - fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_referer_ranking{suffix}']}") + fig_referer_ranking = plot_ranking(referer_ranking, xlabel="HTTP Referer", ylabel="Number of requests", color_settings=color_settings_alternate, figsize=settings["plot_size_broad"]) + fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_referer_ranking{suffix}']}", bbox_inches="tight") + + # GEOIP + if settings["do_geoip_rankings"]: + city_ranking, country_ranking = get_city_and_country_ranking(cur, require_humans=settings["geoip_only_humans"], regex_city_blacklist=settings["city_ranking_regex_blacklist"], regex_country_blacklist=settings["country_ranking_regex_blacklist"]) + pdebug("Country ranking:", country_ranking) + pdebug("City ranking:", city_ranking) + if gen_img: + fig_referer_ranking = plot_ranking(country_ranking, xlabel="Country", ylabel="Number of users", color_settings=color_settings_alternate, figsize=settings["plot_size_broad"]) + fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_countries{suffix}']}", bbox_inches="tight") + + fig_referer_ranking = plot_ranking(city_ranking, xlabel="City", ylabel="Number of users", color_settings=color_settings_alternate, figsize=settings["plot_size_broad"]) + fig_referer_ranking.savefig(f"{img_dir}/{names[f'img_cities{suffix}']}", bbox_inches="tight") + # USER # user_agent_ranking = get_user_agent_ranking(cur, date_str) @@ -570,8 +609,8 @@ def visualize(loaded_settings: dict): date_count = len(date_strs) unique_user_ids_dates: list[list[int]] = [] unique_request_ids_dates: list[list[int]] = [] - unique_user_ids_human_dates: list[list[int]] = [[] for i in range(date_count)] - unique_request_ids_human_dates: list[list[int]] = [[] for i in range(date_count)] + 
unique_user_ids_human_dates: list[list[int]] = [[] for _ in range(date_count)] + unique_request_ids_human_dates: list[list[int]] = [[] for _ in range(date_count)] for i in range(date_count): date_str_ = date_strs[i] unique_user_ids_dates.append(get_unique_user_ids_for_date(cur, date_str_)) @@ -603,26 +642,19 @@ def visualize(loaded_settings: dict): names[f"user_count{suffix}"] = len_list_list(unique_user_ids_dates) names[f"request_count{suffix}"] = len_list_list(unique_request_ids_dates) if gen_img: - fig_daily, ax1, ax2, plots = plot2y(date_names, [len(user_ids) for user_ids in unique_user_ids_dates], [len(request_ids) for request_ids in unique_request_ids_dates], xlabel="Date", ylabel1="User count", label1="Unique users", ylabel2="Request count", label2="Unique requests", color1=palette["red"], color2=palette["blue"]) + fig_daily, ax1, ax2, plots = plot2y(date_names, [len(user_ids) for user_ids in unique_user_ids_dates], [len(request_ids) for request_ids in unique_request_ids_dates], xlabel="Date", ylabel1="User count", label1="Unique users", ylabel2="Request count", label2="Unique requests", color1=palette["red"], color2=palette["blue"], rotate_xlabel=-45, figsize=settings["plot_size_broad"]) if get_humans: - fig_daily, ax1, ax2, plots = plot2y(date_names, [len(user_ids) for user_ids in unique_user_ids_human_dates], [len(request_ids) for request_ids in unique_request_ids_human_dates], label1="Unique users (human)", label2="Unique requests (human)", color1=palette["orange"], color2=palette["green"], fig=fig_daily, ax1=ax1, ax2=ax2, plots=plots) - fig_daily.savefig(f"{img_dir}/{names[f'img_users_and_requests{suffix}']}") + fig_daily, ax1, ax2, plots = plot2y(date_names, [len(user_ids) for user_ids in unique_user_ids_human_dates], [len(request_ids) for request_ids in unique_request_ids_human_dates], label1="Unique users (human)", label2="Unique requests (human)", color1=palette["orange"], color2=palette["green"], fig=fig_daily, ax1=ax1, ax2=ax2, plots=plots, 
rotate_xlabel=-45, figsize=settings["plot_size_broad"]) + fig_daily.savefig(f"{img_dir}/{names[f'img_users_and_requests{suffix}']}", bbox_inches="tight") # os & browser os_ranking, browser_ranking, names[f"mobile_user_percentage{suffix}"] = get_os_browser_mobile_rankings(cur, unique_user_ids_human) if gen_img: - fig_os_rating = plot_ranking(os_ranking, xlabel="Platform", ylabel="Share [%]", color_settings=color_settings_operating_systems) - fig_os_rating.savefig(f"{img_dir}/{names[f'img_operating_system_ranking{suffix}']}") - fig_browser_rating = plot_ranking(browser_ranking, xlabel="Browsers", ylabel="Share [%]", color_settings=color_settings_browsers) - fig_browser_rating.savefig(f"{img_dir}/{names[f'img_browser_ranking{suffix}']}") + fig_os_rating = plot_ranking(os_ranking, xlabel="Platform", ylabel="Share [%]", color_settings=color_settings_operating_systems, figsize=settings["plot_size_narrow"]) + fig_os_rating.savefig(f"{img_dir}/{names[f'img_operating_system_ranking{suffix}']}", bbox_inches="tight") + fig_browser_rating = plot_ranking(browser_ranking, xlabel="Browsers", ylabel="Share [%]", color_settings=color_settings_browsers, figsize=settings["plot_size_narrow"]) + fig_browser_rating.savefig(f"{img_dir}/{names[f'img_browser_ranking{suffix}']}", bbox_inches="tight") - # print("File Ranking", file_ranking) - # print("referer Ranking", referer_ranking) - # print("user agent ranking", user_agent_ranking) - # print("Unique Users:", get_unique_user_count(cur)) - # fig_daily, ax_daily_users = plot(dates, [len(user_ids) for user_ids in unique_user_ids_for_dates], xlabel="Datum", ylabel="Einzigartige Nutzer", label="Einzigartige Nutzer", color="blue") - # fig_daily, ax_daily_requests = plot(dates, [len(request_ids) for request_ids in unique_request_ids_for_dates], fig=fig_daily, ax=ax_daily_users, xlabel="Datum", ylabel="Einzigartige Anfragen", label="Einzigartige Anfragen", color="orange") - # fig_daily.savefig(f"{img_dir}/daily.{img_filetype}") # print("OS 
ranking", os_ranking) # print("Browser ranking", browser_ranking) # print("Mobile percentage", names["mobile_user_percentage"]) diff --git a/regina/main.py b/regina/main.py index b50992d..1819b81 100644 --- a/regina/main.py +++ b/regina/main.py @@ -3,12 +3,14 @@ # __package__="." from sys import argv, exit from os.path import isfile -from regina.db_operation.collect import parse_log, add_requests_to_db -from regina.db_operation.database import create_db +import sqlite3 as sql +from regina.db_operation.collect import parse_log, add_requests_to_db, update_ip_range_id +from regina.db_operation.database import create_db, update_geoip_tables, t_user from regina.db_operation.visualize import visualize from regina.utility.settings_manager import read_settings_file from regina.utility.globals import settings, version from regina.utility.utility import pmessage +from regina.utility.sql_util import sql_tablesize """ start regina, launch either collect or visualize @@ -17,23 +19,24 @@ TODO: - unique user = ip address - max requests/time - unique request datums unabhängig -- fix datum im user and request count plot -- fix datum monat is 1 zu wenig -- fix ms edge nicht dabei +X fix datum im user and request count plot +X fix datum monat is 1 zu wenig +X fix ms edge nicht dabei - für letzten Tag: uhrzeit - requests/users plot - checken warum last x days und total counts abweichen - länder aus ip addresse - "manuelle" datenbank beabeitung in cli: - user + alle seine requests löschen - user agents: - - android vor linux suchen, oder linux durch X11 ersetzen + X android vor linux suchen, oder linux durch X11 ersetzen - alles was bot drin hat als bot betrachten - wenn datenbankgröße zum problem wird: - referrer table die die schon zusammengelegten referrer enthält, request verlinkt nur mit id - selbes für platforms und browsers - test: - human detection - - referer cleanup + X referer cleanup +X geoip - schöne log nachrichten für die cron mail - testing! 
""" @@ -41,13 +44,10 @@ TODO: def help(): helpstring = """Command line options: - --server-name string - --log path to the access.log - --db name of the database - --settings["filegroups"] string describing settings["filegroups"], eg 'name1: file1, file2; name2: file3, file4, file5;' - --auto-group-filetypes comma separated list of filetypes, eg 'css,png,gif' - --locs-and_dirs comma separated list of nginx_location:directory pairs, eg '/:/www/website' - --config-file path to a config file that specifies all the other parameters: param = value, where value has the same formatting as on the command line + --config path to a config file that specifies all the other parameters: param = value, where value has the same formatting as on the command line + --update-geoip path to IP-COUNTRY-REGION-CITY database in csv format + --visualize generate the visualization website + --collect fill the database from the nginx access log """ print(helpstring) @@ -68,16 +68,20 @@ def main(): collect = False visualize_ = False log_file = "" + geoip_city_csv = "" # parse args i = 1 while i in range(1, len(argv)): - if argv[i] == "--config": + if argv[i] in ["--config", "-c"]: if len(argv) > i + 1: config_file = argv[i+1] else: missing_arg_val(argv[i]) - if argv[i] == "--log-file": + elif argv[i] == "--log-file": if len(argv) > i + 1: log_file = argv[i+1] else: missing_arg_val(argv[i]) - elif argv[i] == "--help": + if argv[i] == "--update-geoip": + if len(argv) > i + 1: geoip_city_csv = argv[i+1] + else: missing_arg_val(argv[i]) + elif argv[i] in ["--help", "-h"]: help() exit(0) elif argv[i] == "--collect": @@ -87,11 +91,11 @@ def main(): else: pass i += 1 - if not collect and not visualize_: - missing_arg("--visualize or --collect") + if not (collect or visualize_ or geoip_city_csv): + missing_arg("--visualize or --collect or --update-geoip") if not config_file: - missing_arg("--config_file") + missing_arg("--config") if not isfile(config_file): error(f"Not a file: '{config_file}'") 
read_settings_file(config_file, settings) @@ -107,19 +111,33 @@ def main(): if isinstance(settings["locs_and_dirs"], str): settings["locs_and_dirs"] = [ loc_and_dir.split(":") for loc_and_dir in settings["locs_and_dirs"].split(",") ] + if not isfile(config_file): + error(f"Not a file: '{config_file}'") + + if not isfile(settings["db"]): + create_db(settings["db"], settings["filegroups"], settings["locs_and_dirs"], settings["auto_group_filetypes"]) + + if geoip_city_csv: + if not isfile(geoip_city_csv): + error(f"Not a file: '{geoip_city_csv}'") + conn = sql.connect(settings['db'], isolation_level=None) # required vor vacuum + cur = conn.cursor() + update_geoip_tables(cur, geoip_city_csv) + # update users + for user_id in range(sql_tablesize(cur, t_user)): + update_ip_range_id(cur, user_id) + cur.close() + conn.commit() + conn.close() if collect: pmessage(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}' and logfile '{settings['access_log']}'") - if not isfile(settings["db"]): - create_db(settings["db"], settings["filegroups"], settings["locs_and_dirs"], settings["auto_group_filetypes"]) requests = parse_log(settings["access_log"]) add_requests_to_db(requests, settings["db"]) - elif visualize_: + if visualize_: pmessage(f"regina version {version} with server-name '{settings['server_name']}', database '{settings['db']}'") if not isfile(settings["db"]): error(f"Invalid database path: '{settings['db']}'") visualize(settings) - else: - error("Either --collect --visualize has to be provided") if __name__ == '__main__': main() diff --git a/regina/utility/globals.py b/regina/utility/globals.py index 6e6ecb0..8b9970a 100644 --- a/regina/utility/globals.py +++ b/regina/utility/globals.py @@ -5,7 +5,7 @@ version = "1.0" # default settings, these are overwriteable through a config file settings = { # GENERAL - "server_name": "", + "server_name": "default_sever", # DATA COLLECTION "access_log": "", "db": "", @@ -15,12 +15,16 @@ 
settings = { "request_location_regex_blacklist": "", "request_is_same_on_same_day": True, # mutiple requests from same user to same file at same day are counted as 1 "unique_user_is_ip_address": False, - "user_get_country": True, + "get_cities_for_countries": [""], # list if country codes for which the ip address ranges need to be collected at city level, not country level # VISUALIZATION "get_human_percentage": False, "human_needs_success": True, # a human must have at least 1 successful request (status < 300) "status_300_is_success": False, # 300 codes are success + "do_geoip_rankings": False, + "geoip_only_humans": True, + "city_ranking_regex_blacklist": "", + "country_ranking_regex_blacklist": "", # "file_ranking_regex_whitelist": r".*\.((txt)|(html)|(css)|(php)|(png)|(jpeg)|(jpg)|(svg)|(gif))", "file_ranking_regex_whitelist": r".*\.(html)", "file_ranking_ignore_error_files": False, # skip files that only had unsuccessful requests (status < 300) @@ -34,12 +38,16 @@ settings = { # "plot_figsize": (60, 40), "plot_dpi": 300, "plot_add_count_label": True, + "plot_size_broad": (10, 5), + "plot_size_narrow": (6.5, 5), "img_dir": "", "img_location": "", "img_filetype": "svg", "template_html": "", "html_out_path": "", "last_x_days": 30, + # regina + "debug": False } # these oses and browser can be detected: diff --git a/regina/utility/settings_manager.py b/regina/utility/settings_manager.py index cb821d6..92c0300 100644 --- a/regina/utility/settings_manager.py +++ b/regina/utility/settings_manager.py @@ -4,14 +4,24 @@ def get_bool(bool_str: str, fallback=False): elif bool_str in ["false", "False"]: return False return fallback +def get_iterable(s, original_iterable, require_same_length=False): + val_type = str + if len(original_iterable) > 0: val_type = type(original_iterable[0]) + new_iter = type(original_iterable)(val_type(v.strip(" ")) for v in s.split(",")) + if require_same_length and len(original_iterable) != len(new_iter): + raise Exception(f"{new_iter} does not 
have the same length as {original_iterable}") + return new_iter + + def read_settings_file(filepath: str, settings:dict, ignore_invalid_lines=True, allow_new_keys=False, convert_to_type=True): + ignore_invalid_lines = False lines = [] with open(filepath, "r") as file: lines = file.readlines() for i in range(len(lines)): line = lines[i].strip("\n ") - if line.startswith("#"): continue + if line.startswith("#") or len(line) == 0: continue vals = line.split("=") if not len(vals) == 2: if ignore_invalid_lines: continue @@ -23,11 +33,23 @@ def read_settings_file(filepath: str, settings:dict, ignore_invalid_lines=True, if convert_to_type and not isinstance(settings[vals[0]], str|list|None): if isinstance(settings[vals[0]], bool): settings[vals[0]] = get_bool(vals[1].strip(" "), fallback=settings[vals[0]]) - continue - try: - settings[vals[0]] = type(settings[vals[0]])(vals[1].strip(" ")) - except Exception as e: - if not ignore_invalid_lines: raise e - else: continue + elif isinstance(settings[vals[0]], tuple): + try: + settings[vals[0]] = get_iterable(vals[1], settings[vals[0]], require_same_length=True) + except Exception as e: + if not ignore_invalid_lines: raise e + else: continue + elif isinstance(settings[vals[0]], list): + try: + settings[vals[0]] = get_iterable(vals[1], settings[vals[0]], require_same_length=False) + except Exception as e: + if not ignore_invalid_lines: raise e + else: continue + else: + try: + settings[vals[0]] = type(settings[vals[0]])(vals[1].strip(" ")) + except Exception as e: + if not ignore_invalid_lines: raise e + else: continue else: settings[vals[0]] = vals[1].strip(" ") diff --git a/regina/utility/utility.py b/regina/utility/utility.py index bd81d04..6788174 100644 --- a/regina/utility/utility.py +++ b/regina/utility/utility.py @@ -2,13 +2,14 @@ # print(f"{__file__}: __name__={__name__}, __package__={__package__}, sys.path[0]={path[0]}") from sys import exit +from regina.utility.globals import settings + """ Various utitity """ -DEBUG 
= False def pdebug(*args, **keys): - if DEBUG: print(*args, **keys) + if settings["debug"]: print(*args, **keys) def warning(*w, **k): print("Warning:", *w, **k) diff --git a/template.html b/template.html index 0b0e232..623760b 100644 --- a/template.html +++ b/template.html @@ -5,11 +5,11 @@ - Analytics for %server-name + Analytics for %server_name -

Analytics for %server-name

+

Analytics for %server_name

Last %last_x_days days

@@ -23,17 +23,22 @@

File access

- File ranking + File ranking for the last %last_x_days days

Platforms and browsers

- Operating system ranking - Browser ranking + Operating system ranking for the last %last_x_days days + Browser ranking for the last %last_x_days days

Mobile users: %mobile_user_percentage_last_x_days%


Referrers

- Referer ranking + Referer ranking for the last %last_x_days days +
+ +

GeoIP

+ Country ranking for the last %last_x_days days + City ranking for the last %last_x_days days
@@ -62,8 +67,15 @@

Referrers

Referer ranking
+ +

GeoIP

+ Country ranking + City ranking +

These analytics were generated by regina %regina_version at %generation_date

+ +

This site includes IP2Location LITE data available from https://lite.ip2location.com