From f61d88e0d806fbeb579dce246b912b72bd4459d6 Mon Sep 17 00:00:00 2001
From: "matthias@arch"
Date: Wed, 10 May 2023 22:44:14 +0200
Subject: [PATCH] Fixed model, restructured files

---
 .gitignore | 1 +
 readme.md | 11 ++
 teng-ml/main.py | 255 ++++++++++---------------------
 teng-ml/peaks.py | 1 +
 teng-ml/rnn.py | 39 -----
 teng-ml/rnn/rnn.py | 80 ++++++++++
 teng-ml/rnn/training.py | 150 ++++++++++++++++++
 teng-ml/tracker/epoch_tracker.py | 187 +++++++++++++++++++++++
 teng-ml/util/data_loader.py | 90 ++++++++---
 teng-ml/util/epoch_tracker.py | 83 ----------
 teng-ml/util/file_io.py | 34 +++++
 teng-ml/util/model_io.py | 45 ++++++
 teng-ml/util/settings.py | 12 +-
 teng-ml/util/split.py | 23 +++
 teng-ml/util/string.py | 51 +++++++
 teng-ml/util/transform.py | 1 -
 16 files changed, 743 insertions(+), 320 deletions(-)
 delete mode 100644 teng-ml/rnn.py
 create mode 100644 teng-ml/rnn/rnn.py
 create mode 100644 teng-ml/rnn/training.py
 create mode 100644 teng-ml/tracker/epoch_tracker.py
 delete mode 100644 teng-ml/util/epoch_tracker.py
 create mode 100644 teng-ml/util/file_io.py
 create mode 100644 teng-ml/util/model_io.py
 create mode 100644 teng-ml/util/split.py
 create mode 100644 teng-ml/util/string.py

diff --git a/.gitignore b/.gitignore
index cd4c22c..af66213 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 *__pycache__*
+.old
diff --git a/readme.md b/readme.md
index cdd7616..067ffcd 100644
--- a/readme.md
+++ b/readme.md
@@ -1,2 +1,13 @@
 # Machine Learning stuff for TENG project
+(Bi)LSTM for material classification.
+More information on the project is available [on my website](https://quintern.xyz/en/teng.html).
+
+## Model training
+Adjust the parameters in `main.py` and run it.
+All models and the settings they were trained with are automatically serialized with pickle and stored in a subfolder
+of the `models_dir` that was set in `main.py`.
+
+
+## Model evaluation
+Run `find_best_model.py <models_dir>` with the `models_dir` specified in `main.py` during training.
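Editor's note on the evaluation workflow described in the readme above: each training run ends up as one pickled settings/model pair inside its own subfolder of `models_dir`, so an evaluation script can walk those folders and unpickle the artifacts. The sketch below is only an illustration under that assumption; the actual file names are whatever `teng-ml/util/model_io.py` writes, and `iter_run_dirs` / `load_run` are hypothetical helper names, not part of this patch.

```python
import os
import pickle

def iter_run_dirs(models_dir: str):
    """Yield every run subfolder created below models_dir (hypothetical helper)."""
    for entry in sorted(os.listdir(models_dir)):
        run_dir = os.path.join(models_dir, entry)
        if os.path.isdir(run_dir):
            yield run_dir

def load_run(run_dir: str):
    """Unpickle the stored settings and model of one run (file names are assumed)."""
    with open(os.path.join(run_dir, "settings.pkl"), "rb") as f:
        settings = pickle.load(f)
    with open(os.path.join(run_dir, "model.pkl"), "rb") as f:
        model = pickle.load(f)
    return settings, model
```

Since pickle stores classes by reference, unpickling the model requires the `teng-ml` package (and a compatible torch version) to be importable in the evaluation environment.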
diff --git a/teng-ml/main.py b/teng-ml/main.py index 6a0f882..700efda 100644 --- a/teng-ml/main.py +++ b/teng-ml/main.py @@ -7,20 +7,22 @@ if __name__ == "__main__": filepath = path.realpath(path.abspath(__file__)) sys.path.insert(0, path.dirname(path.dirname(filepath))) +from sys import exit import matplotlib.pyplot as plt import pandas as pd import torch import torch.nn as nn -import torch.nn.utils.rnn as rnn_utils from torch.utils.data import DataLoader -import json +import itertools import time -import pickle +from os import makedirs, path from .util.transform import ConstantInterval, Normalize from .util.data_loader import load_datasets, LabelConverter -from .util.epoch_tracker import EpochTracker +from .util.split import DataSplitter from .util.settings import MLSettings +from .rnn.rnn import RNN +from .rnn.training import train_validate_save, select_device def test_interpol(): file = "/home/matth/data/2023-04-27_glass_8.2V_179mm000.csv" @@ -35,187 +37,92 @@ def test_interpol(): ax1.plot(interp_array[:,0], interp_array[:,1], color="r", label="Interpolated") ax1.scatter(array[:,0], array[:,2], color="g", label="Original") ax1.legend() - plt.show() + # plt.show() + if __name__ == "__main__": - device = ( - "cuda" - if torch.cuda.is_available() - else "mps" - if torch.backends.mps.is_available() - else "cpu" - ) - - labels = LabelConverter(["foam", "glass", "kapton", "foil", "cloth", "rigid_foam"]) + models_dir = "/home/matth/Uni/TENG/models" # where to save models, settings and results + if not path.isdir(models_dir): + makedirs(models_dir) + data_dir = "/home/matth/Uni/TENG/data" + + + # Test with + num_layers = [ 3 ] + hidden_size = [ 8 ] + bidirectional = [ True ] t_const_int = ConstantInterval(0.01) t_norm = Normalize(0, 1) - transforms = [ t_const_int, t_norm ] - st = MLSettings(num_features=1, - num_layers=1, - hidden_size=1, - bidirectional=True, - transforms=transforms, - num_epochs=40, - batch_size=3, - labels=labels, - ) + transforms = [[ t_const_int ]] #, [ t_const_int, t_norm ]] + batch_sizes = [ 64 ] # , 16] + splitters = [ DataSplitter(100) ] + num_epochs = [ 80 ] - print(f"Using device: {device}") + # num_layers=1, + # hidden_size=1, + # bidirectional=True, + # optimizer=None, + # scheduler=None, + # loss_func=None, + # transforms=[], + # splitter=None, + # num_epochs=10, + # batch_size=5, + args = [num_layers, hidden_size, bidirectional, [None], [None], [None], transforms, splitters, num_epochs, batch_sizes] + # create settings for every possible combination + settings = [ + MLSettings(1, *params, labels) for params in itertools.product(*args) + ] - train_set, test_set = load_datasets("/home/matth/Uni/TENG/data", labels, voltage=8.2, transforms=st.transforms, train_to_test_ratio=0.7, random_state=42) + loss_func = nn.CrossEntropyLoss() + optimizers = [ + lambda model: torch.optim.Adam(model.parameters(), lr=0.03), + # lambda model: torch.optim.Adam(model.parameters(), lr=0.25), + # lambda model: torch.optim.Adam(model.parameters(), lr=0.50), + ] + schedulers = [ + lambda optimizer, st: torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9), + lambda optimizer, st: torch.optim.lr_scheduler.StepLR(optimizer, step_size=st.num_epochs // 10, gamma=0.40, verbose=False), + # lambda optimizer, st: torch.optim.lr_scheduler.StepLR(optimizer, step_size=st.num_epochs // 10, gamma=0.75, verbose=False), + ] - # train_loader = iter(DataLoader(train_set)) - # test_loader = iter(DataLoader(test_set)) - train_loader = DataLoader(train_set, batch_size=st.batch_size, shuffle=True) - 
test_loader = DataLoader(test_set, batch_size=st.batch_size, shuffle=True) + n_total = len(settings) * len(optimizers) * len(schedulers) + print(f"Testing {n_total} possible configurations") + # scheduler2 = + def create_model(st, optimizer_f, scheduler_f): + model=RNN(input_size=st.num_features, hidden_size=st.hidden_size, num_layers=st.num_layers, num_classes=len(labels), bidirectional=st.bidirectional) + optimizer = optimizer_f(model) + scheduler = scheduler_f(optimizer, st) + return model, optimizer, scheduler - class RNN(nn.Module): - def __init__(self, input_size, hidden_size, num_layers, num_classes, bidirectional): - super(RNN, self).__init__() - self.num_layers = num_layers - self.hidden_size = hidden_size - self.is_bidirectional = bidirectional - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional) - # x = (batch_size, sequence, feature) + t_begin = time.time() + n = 1 + for o in range(len(optimizers)): + for s in range(len(schedulers)): + for i in range(len(settings)): + st = settings[i] + # print(st.get_name()) + train_set, test_set = load_datasets(data_dir, labels, voltage=8.2, transforms=st.transforms, split_function=st.splitter, train_to_test_ratio=0.7, random_state=42, num_workers=4) - if bidirectional == True: - self.fc = nn.Linear(hidden_size * 2, num_classes) - else: - self.fc = nn.Linear(hidden_size, num_classes) + generator = torch.manual_seed(42) + # train_loader = iter(DataLoader(train_set)) + # test_loader = iter(DataLoader(test_set)) + train_loader = DataLoader(train_set, batch_size=st.batch_size, shuffle=True, generator=generator) + test_loader = DataLoader(test_set, batch_size=st.batch_size, shuffle=True, generator=generator) + print(f"Testing {n}/{n_total}: (o={o}, s={s}, i={i})") + model, optimizer, scheduler = create_model(st, optimizers[o], schedulers[s]) + device = select_device(force_device="cpu") + try: + train_validate_save(model, optimizer, scheduler, loss_func, train_loader, test_loader, st, models_dir, print_interval=4) + except KeyboardInterrupt: + if input("Cancelled current training. Quit? 
(q/*): ") == "q": + t_end = time.time() + print(f"Testing took {t_end - t_begin:.2f}s = {(t_end-t_begin)/60:.1f}m") + exit() + n += 1 - self.softmax = nn.Softmax(dim=1) - - def forward(self, x): - # x: batches, length, features - # print(f"forward pass") - D = 2 if self.is_bidirectional == True else 1 - - # print(f"x({x.shape})=...") - batch_size = x.shape[0] - # print(f"batch_size={batch_size}") - - h0 = torch.zeros(D * self.num_layers, batch_size, self.hidden_size).to(device) - # print(f"h1({h0.shape})=...") - c0 = torch.zeros(D * self.num_layers, batch_size, self.hidden_size).to(device) - x.to(device) - _, (h_n, _) = self.lstm(x, (h0, c0)) - # print(f"h_n({h_n.shape})=...") - final_state = h_n.view(self.num_layers, D, batch_size, self.hidden_size)[-1] # num_layers, num_directions, batch, hidden_size - # print(f"final_state({final_state.shape})=...") - - if D == 1: - X = final_state.squeeze() # TODO what if batch_size == 1 - elif D == 2: - h_1, h_2 = final_state[0], final_state[1] # forward & backward pass - #X = h_1 + h_2 # Add both states - X = torch.cat((h_1, h_2), 1) # Concatenate both states, X-size: (Batch, hidden_size * 2) - else: - raise ValueError("D must be 1 or 2") - # print(f"X({X.shape})={X}") - output = self.fc(X) # fully-connected layer - # print(f"out({output.shape})={output}") - output = self.softmax(output) - # print(f"out({output.shape})={output}") - return output - - model=RNN(input_size=st.num_features, hidden_size=st.hidden_size, num_layers=st.num_layers, num_classes=len(labels), bidirectional=st.bidirectional).to(device) - loss_func = torch.nn.CrossEntropyLoss() - optimizer = torch.optim.Adam(model.parameters(), lr=0.02) - scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95) - - print(f"model:", model) - print(f"loss_func={loss_func}") - print(f"optimizer={optimizer}") - print(f"scheduler={scheduler}") - - - - epoch_tracker = EpochTracker(labels) - - print(f"train_loader") - for i, (data, y) in enumerate(train_loader): - print(y) - print(f"{i:3} - {torch.argmax(y, dim=1, keepdim=False)}") - - -# training - epoch_tracker.train_begin() - for ep in range(st.num_epochs): - for i, (data, y) in enumerate(train_loader): - # print(data, y) - # data = batch, seq, features - # print(f"data({data.shape})={data}") - x = data[:,:,[2]].float() # select voltage data - # print(f"x({x.shape}, {x.dtype})=...") - # print(f"y({y.shape}, {y.dtype})=...") - # length = torch.tensor([x.shape[1] for _ in range(x.shape[0])], dtype=torch.int64) - # print(f"length({length.shape})={length}") - # batch_size = x.shape[0] - # print(f"batch_size={batch_size}") - # v = x.view(batch_size, -1, feature_count) - # data = rnn_utils.pack_padded_sequence(v.type(torch.FloatTensor), length, batch_first=True).to(device)[0] - # print(f"data({data.shape})={data}") - # print(data.batch_sizes[0]) - # print(data) - out = model(x) - # print(f"out({out.shape}={out})") - loss = loss_func(out, y) - # print(loss) - - optimizer.zero_grad() # clear gradients for next train - loss.backward() # backpropagation, compute gradients - optimizer.step() # apply gradients - - # predicted = torch.max(torch.nn.functional.softmax(out), 1)[1] - predicted = torch.argmax(out, dim=1, keepdim=False) # -> [ label_indices ] - correct = torch.argmax(y, dim=1, keepdim=False) # -> [ label_indices ] - # print(f"predicted={predicted}, correct={correct}") - # train_total += y.size(0) - # train_correct += (predicted == correct).sum().item() - epoch_tracker.train(correct, predicted) - epoch_tracker.next_epoch(loss) - 
print(epoch_tracker.get_last_epoch_summary_str()) - scheduler.step() t_end = time.time() - - with torch.no_grad(): - for i, (data, y) in enumerate(test_loader): - # print(ep, "Test") - x = data[:,:,[2]].float() - out = model(x) - loss = loss_func(out, y) - - predicted = torch.argmax(out, dim=1, keepdim=False) # -> [ label_indices ] - correct = torch.argmax(y, dim=1, keepdim=False) # -> [ label_indices ] - # print(f"predicted={predicted}, correct={correct}") - # val_total += y.size(0) - # val_correct += (predicted == correct).sum().item() - - epoch_tracker.test(correct, predicted) - - # print(f"train_total={train_total}, val_total={val_total}") - # if train_total == 0: train_total = -1 - # if val_total == 0: val_total = -1 - - # print(f"epoch={ep+1:3}: Testing accuracy={100 * val_correct / val_total:.2f}") - # print(f"End result: Training accuracy={100 * train_correct / train_total:.2f}%, Testing accuracy={100 * val_correct / val_total:.2f}, training took {t_end - t_begin:.2f} seconds") - - epoch_tracker.get_test_statistics() - # epoch_tracker.() - - # print(epoch_tracker.get_training_summary_str()) - print(epoch_tracker.get_training_count_per_label()) - - model_name = st.get_name() - # save the settings, results and model - with open(model_name + "_settings.pkl", "wb") as file: - pickle.dump(st, file) - - with open(model_name + "_results.pkl", "wb") as file: - pickle.dump(epoch_tracker, file) - - with open(model_name + "_model.pkl", "wb") as file: - pickle.dump(model, file) + print(f"Testing took {t_end - t_begin:.2f}s = {(t_end-t_begin)/60:.1f}m") diff --git a/teng-ml/peaks.py b/teng-ml/peaks.py index c999558..35451e1 100644 --- a/teng-ml/peaks.py +++ b/teng-ml/peaks.py @@ -17,6 +17,7 @@ if __name__ == "__main__": sys.path.insert(0, path.dirname(path.dirname(filepath))) from .util.transform import Normalize +from .util.data_loader import get_datafiles file = "/home/matth/data/2023-04-25_kapton_8.2V_179mm002.csv" diff --git a/teng-ml/rnn.py b/teng-ml/rnn.py deleted file mode 100644 index f8425c4..0000000 --- a/teng-ml/rnn.py +++ /dev/null @@ -1,39 +0,0 @@ -import torch -import torch.nn as nn - -# BiLSTM Model - -class RNN(nn.Module): - def __init__(self, input_size, hidden_size, num_layers, num_classes, if_bidirectional): - super(RNN, self).__init__() - self.num_layers = num_layers - self.hidden_size = hidden_size - self.if_bidirectional = if_bidirectional - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=if_bidirectional) - - if if_bidirectional == True: - self.fc = nn.Linear(hidden_size * 2, num_classes) - else: - self.fc = nn.Linear(hidden_size, num_classes) - - - def forward(self, x): - D = 2 if self.if_bidirectional == True else 1 - Batch = x.batch_sizes[0] - - h0 = torch.zeros(D * self.num_layers, Batch, self.hidden_size).to(device) - c0 = torch.zeros(D * self.num_layers, Batch, self.hidden_size).to(device) - x.to(device) - _, (h_n, _) = self.lstm(x, (h0, c0)) - final_state = h_n.view(self.num_layers, D, Batch, self.hidden_size)[-1] # num_layers, num_directions, batch, hidden_size - - if D == 1: - X = final_state.squeeze() - elif D == 2: - h_1, h_2 = final_state[0], final_state[1] # forward & backward pass - # X = h_1 + h_2 # Add both states - X = torch.cat((h_1, h_2), 1) # Concatenate both states, X-size: (Batch, hidden_size * 2) - - output = self.fc(X) # fully-connected layer - - return output diff --git a/teng-ml/rnn/rnn.py b/teng-ml/rnn/rnn.py new file mode 100644 index 0000000..490eaed --- /dev/null +++ b/teng-ml/rnn/rnn.py @@ -0,0 +1,80 
@@
+import torch
+import torch.nn as nn
+
+class RNN(nn.Module):
+    """
+    (Bi)LSTM for material classification
+    """
+    def __init__(self, input_size, hidden_size, num_layers, num_classes, bidirectional):
+        super(RNN, self).__init__()
+        self.num_layers = num_layers
+        self.hidden_size = hidden_size
+        self.is_bidirectional = bidirectional
+        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True, bidirectional=bidirectional)
+        # x = (batch_size, sequence, feature)
+
+        if bidirectional == True:
+            self.fc = nn.Linear(hidden_size * 2, num_classes)
+        else:
+            self.fc = nn.Linear(hidden_size, num_classes)
+
+        self.softmax = nn.Softmax(dim=1)
+
+    def forward(self, x):
+        # x: batches, length, features
+        # print(f"forward pass")
+        D = 2 if self.is_bidirectional == True else 1
+        # print(f"x({x.shape})=...")
+        batch_size = x.shape[0]
+
+        device = x.device
+
+        h0 = torch.zeros(D * self.num_layers, batch_size, self.hidden_size).to(device)
+        # print(f"h1({h0.shape})=...")
+        c0 = torch.zeros(D * self.num_layers, batch_size, self.hidden_size).to(device)
+
+        out, (h_n, c_n) = self.lstm(x, (h0, c0))
+        # out: (N, L, D * hidden_size)
+        # h_n: (D * num_layers, N, hidden_size)
+        # c_n: (D * num_layers, N, hidden_size)
+        # print(f"out({out.shape})={out}")
+        # print(f"h_n({h_n.shape})={h_n}")
+        # print(f"c_n({c_n.shape})={c_n}")
+        # print(f"out({out.shape})=...")
+        # print(f"h_n({h_n.shape})=...")
+        # print(f"c_n({c_n.shape})=...")
+
+        """
+        # select only last layer [-1] -> last layer,
+        last_layer_state = h_n.view(self.num_layers, D, batch_size, self.hidden_size)[-1]
+        if D == 1:
+            # [1, batch_size, hidden_size] -> [batch_size, hidden_size]
+            X = last_layer_state.squeeze()  # TODO what if batch_size == 1
+        elif D == 2:
+            h_1, h_2 = last_layer_state[0], last_layer_state[1]  # states of both directions
+            # concatenate both states, X-size: (Batch, hidden_size * 2)
+            X = torch.cat((h_1, h_2), dim=1)
+        else:
+            raise ValueError("D must be 1 or 2")
+        """ # for D == 1 this is equivalent to the line below; for D == 2, out[:,-1,:] holds the backward direction's first step rather than its final state
+        out = out[:,-1,:] # select last time step
+
+        # fc: (*, hidden_size) -> (*, num_classes)
+        # print(f"X({X.shape})={X}")
+        # print(f"X({X.shape})=...")
+        out = self.fc(out) # fully-connected layer
+        # print(f"out({output.shape})={output}")
+        # print(f"output({output.shape})=...")
+        # softmax: (*) -> (*)
+        # out = self.softmax(out)
+        # print(f"output({output.shape})=...")
+        # print(f"output({output.shape})={output}")
+
+        """
+        out(torch.Size([15, 200, 10]))=...
+        h_n(torch.Size([3, 15, 10]))=...
+        c_n(torch.Size([3, 15, 10]))=...
+        X(torch.Size([3, 1, 15, 10]))=...
+        output(torch.Size([3, 1, 15, 6]))=...
+        output(torch.Size([3, 1, 15, 6]))=..."""
+        return out
diff --git a/teng-ml/rnn/training.py b/teng-ml/rnn/training.py
new file mode 100644
index 0000000..c919d7f
--- /dev/null
+++ b/teng-ml/rnn/training.py
@@ -0,0 +1,150 @@
+from os import makedirs, path
+import torch
+import pickle
+import matplotlib.pyplot as plt
+from torch.utils.data import DataLoader
+
+from ..util.settings import MLSettings
+from ..tracker.epoch_tracker import EpochTracker
+from ..util.file_io import get_next_digits
+from ..util.string import class_str
+
+from ..util import model_io as mio
+
+
+def select_device(force_device=None):
+    """
+    Select the best available device
+    """
+    if force_device is not None:
+        device = force_device
+    else:
+        device = torch.device(
+            "cuda"
+            if torch.cuda.is_available()
+            # else "mps"
+            # if torch.backends.mps.is_available()
+            else "cpu"
+        )
+    # print(device, torch.cuda.get_device_name(device), torch.cuda.get_device_properties(device))
+    return device
+
+
+def train(model, optimizer, scheduler, loss_func, train_loader: DataLoader, st: MLSettings, print_interval=1) -> EpochTracker:
+    epoch_tracker = EpochTracker(st.labels)
+    epoch_tracker.begin()
+    for ep in range(st.num_epochs):
+        loss = -1
+        for i, (data, y) in enumerate(train_loader):
+            # print(data, y)
+            # data = batch, seq, features
+            # print(f"data({data.shape})={data}")
+            x = data[:,:,[2]].float() # select voltage data
+            # print(f"x({x.shape}, {x.dtype})=...")
+            # print(f"y({y.shape}, {y.dtype})=...")
+            # length = torch.tensor([x.shape[1] for _ in range(x.shape[0])], dtype=torch.int64)
+            # print(f"length({length.shape})={length}")
+            # batch_size = x.shape[0]
+            # print(f"batch_size={batch_size}")
+            # v = x.view(batch_size, -1, feature_count)
+            # data = rnn_utils.pack_padded_sequence(v.type(torch.FloatTensor), length, batch_first=True).to(device)[0]
+            # print(f"data({data.shape})={data}")
+            out = model(x)
+
+            # print(f"out({out.shape}={out})")
+            # print(f"  y({y.shape}={y})")
+            with torch.no_grad():
+                predicted = torch.argmax(out, dim=1, keepdim=False) # -> [ label_indices ]
+                correct = torch.argmax(y, dim=1, keepdim=False) # -> [ label_indices ]
+                # print(f"predicted={predicted}, correct={correct}")
+                # train_total += y.size(0)
+                # train_correct += (predicted == correct).sum().item()
+                epoch_tracker.add_prediction(correct, predicted)
+            # predicted2 = torch.argmax(out, dim=1, keepdim=True) # -> [ label_indices ]
+            # print(f"correct={correct}, y={y}")
+            loss = loss_func(out, correct)
+            # loss = loss_func(out, y)
+
+
+            optimizer.zero_grad() # clear gradients for next train
+            loss.backward() # backpropagation, compute gradients
+            optimizer.step() # apply gradients
+
+        # predicted = torch.max(torch.nn.functional.softmax(out), 1)[1]
+        epoch_tracker.end_epoch(loss, optimizer.param_groups[0]["lr"])
+        if (ep + 1) % print_interval == 0:
+            print(f"Training:", epoch_tracker.get_epoch_summary_str())
+        scheduler.step()
+    print("Training:", epoch_tracker.end())
+    return epoch_tracker
+
+
+def validate(model, test_loader: DataLoader, st: MLSettings) -> EpochTracker:
+    epoch_tracker = EpochTracker(st.labels)
+    epoch_tracker.begin()
+    with torch.no_grad():
+        for i, (data, y) in enumerate(test_loader):
+            # print(ep, "Test")
+            x = data[:,:,[2]].float()
+            out = model(x)
+
+            predicted = torch.argmax(out, dim=1, keepdim=False) # -> [ label_indices ]
+            correct = torch.argmax(y, dim=1, keepdim=False) # -> [ label_indices ]
+
+            epoch_tracker.add_prediction(correct, predicted)
+    print("Validation:", epoch_tracker.end())
+    return epoch_tracker
+
+
+def
train_validate_save(model, optimizer, scheduler, loss_func, train_loader: DataLoader, test_loader: DataLoader, st: MLSettings, models_dir, print_interval=1, show_plots=False): + # assumes model and data is already on correct device + # train_loader.to(device) + # test_loader.to(device) + + # store optimizer, scheduler and loss_func in settings + st.optimizer = class_str(optimizer) + st.scheduler = class_str(scheduler) + st.loss_func = class_str(loss_func) + + model_name = st.get_name() + + def add_tab(s): + return "\t" + str(s).replace("\n", "\n\t") + print(100 * '=') + print("Model Name:", model_name) + print(f"model:\n", add_tab(model)) + # print(f"loss_func:\n", add_tab(class_str(loss_func))) + # print(f"optimizer:\n", add_tab(class_str(optimizer))) + # print(f"scheduler:\n", add_tab(class_str(scheduler))) + + + print(100 * '-') + training_tracker = train(model, optimizer, scheduler, loss_func, train_loader, st, print_interval=print_interval) + # print("Training: Count per label:", training_tracker.get_count_per_label()) + # print("Training: Predictions per label:", training_tracker.get_predictions_per_label()) + + print(100 * '-') + validation_tracker = validate(model, test_loader, st) + # print("Validation: Count per label:", validation_tracker.get_count_per_label()) + # print("Validation: Predictions per label:", validation_tracker.get_predictions_per_label()) + + + digits = get_next_digits(f"{model_name}_", models_dir) + model_dir = f"{models_dir}/{model_name}_{digits}" + # do not put earlier, since the dir should not be created if training is interrupted + if not path.isdir(model_dir): # should always run, if not the digits function did not work + makedirs(model_dir) + + fig, _ = validation_tracker.plot_predictions("Validation: Predictions", model_dir=model_dir, name="img_validation_predictions") + fig, _ = training_tracker.plot_predictions("Training: Predictions", model_dir=model_dir, name="img_training_predictions") + fig, _ = training_tracker.plot_training(model_dir=model_dir) + + if show_plots: + plt.show() + plt.close('all') + + # save the settings, results and model + mio.save_settings(model_dir, st) + mio.save_tracker_validation(model_dir, validation_tracker) + mio.save_tracker_training(model_dir, training_tracker) + mio.save_model(model_dir, model) diff --git a/teng-ml/tracker/epoch_tracker.py b/teng-ml/tracker/epoch_tracker.py new file mode 100644 index 0000000..0f9cd3c --- /dev/null +++ b/teng-ml/tracker/epoch_tracker.py @@ -0,0 +1,187 @@ +from ..util.data_loader import LabelConverter +import matplotlib.pyplot as plt +import time +import torch +import numpy as np + +class EpochTracker: + """ + Track accuracy, loss, learning_rate etc. 
during model training
+    Can also be used for validation (which will probably be only one epoch)
+    """
+    def __init__(self, labels: LabelConverter):
+        self.labels = labels
+
+        self.times: list[float] = []  # (epoch)
+        self.predictions = [[]]  # (epoch, batch_nr, (correct_indices | predicted_indices), index_nr)
+        self.loss: list[float] = []  # (epoch)
+        self.learning_rate: list[float] = []  # (epoch)
+        self.epochs: list[int] = []  # 1 based for FINISHED epochs
+        self._current_epoch = 0  # 0 based
+
+        # after training
+        self.accuracies: list[float] = []  # (epoch)
+
+    def begin(self):
+        self.times.append(time.time())
+
+    def end(self):
+        self.times.append(time.time())
+        # if end_epoch was called before end:
+        if len(self.predictions[-1]) == 0:
+            self.predictions.pop()
+            self._current_epoch -= 1
+        else:  # if end_epoch was not called
+            self.epochs.append(len(self.epochs) + 1)
+            self._calculate_accuracies(self._current_epoch)
+
+
+        s = f"Summary: After {self.epochs[-1]} epochs: "
+        s += f"Accuracy={self.accuracies[-1]:.2f}%"
+        s += f", Total time={self.get_total_time():.2f}s"
+        return s
+
+
+
+    def get_total_time(self):
+        if len(self.times) > 1: return self.times[-1] - self.times[0]
+        else: return -1
+
+    #
+    # EPOCH
+    #
+    def end_epoch(self, loss, learning_rate):
+        """
+        loss and learning_rate of last epoch
+        call before scheduler.step()
+        """
+        self.times.append(time.time())
+        self.epochs.append(len(self.epochs) + 1)
+        if type(loss) == torch.Tensor: self.loss.append(loss.item())
+        else: self.loss.append(loss)
+        self.learning_rate.append(learning_rate)
+        self._calculate_accuracies(self._current_epoch)
+
+        self._current_epoch += 1
+        self.predictions.append([])
+
+    def get_epoch_summary_str(self, ep=-1):
+        """call after end_epoch()"""
+        m = max(ep, 0)  # if ep == -1, check if len is > 0
+        assert(len(self.epochs) > m)
+        s = f"Epoch {self.epochs[ep]:3}"
+        if len(self.accuracies) > m: s += f", Accuracy={self.accuracies[ep]:.2f}%"
+        if len(self.loss) > m: s += f", Loss={self.loss[ep]:.3f}"
+        if len(self.learning_rate) > m: s += f", lr={self.learning_rate[ep]:.4f}"
+        if len(self.times) > m+1: s += f", dt={self.times[ep] - self.times[ep-1]:.2f}s"
+        return s
+
+    def add_prediction(self, correct_indices: torch.Tensor, predicted_indices: torch.Tensor):
+        """for accuracy calculation"""
+        self.predictions[self._current_epoch].append((correct_indices.detach().numpy(), predicted_indices.detach().numpy()))
+
+    #
+    # STATISTICS
+    #
+    def get_count_per_label(self, epoch=-1):
+        """
+        the number of times where