From 1d05da3abfea712929cef66f931b279ef5bc49e1 Mon Sep 17 00:00:00 2001
From: "matthias@arch"
Date: Thu, 10 Aug 2023 17:29:09 +0200
Subject: [PATCH] fixed fc missing

---
 teng_ml/main.py                  | 76 +++++++++++++++--------------
 teng_ml/rnn/rnn.py               | 83 +++++++++++++++++---------------
 teng_ml/rnn/training.py          | 59 ++++++++++++-----------
 teng_ml/tracker/epoch_tracker.py | 14 +++++-
 teng_ml/util/data_loader.py      | 51 +++++++++++++++++---
 teng_ml/util/split.py            | 11 ++++-
 teng_ml/util/string.py           | 39 ++++++++++++---
 7 files changed, 217 insertions(+), 116 deletions(-)

diff --git a/teng_ml/main.py b/teng_ml/main.py
index f2915a6..5123a16 100644
--- a/teng_ml/main.py
+++ b/teng_ml/main.py
@@ -18,8 +18,9 @@ import time
 from os import makedirs, path
 
 from .util.transform import ConstantInterval, Normalize
-from .util.data_loader import load_datasets, LabelConverter
+from .util.data_loader import load_datasets, LabelConverter, count_data
 from .util.split import DataSplitter
+from .util.pad import PadSequences
 from .util.settings import MLSettings
 from .rnn.rnn import RNN
 from .rnn.training import train_validate_save, select_device
@@ -41,34 +42,30 @@ def test_interpol():
 
 if __name__ == "__main__":
-    labels = LabelConverter(["white_foam", "black_foam", "rigid_foam", "cardboard", "glass", "Kapton", "bubble_wrap", "cloth_ffp2", ])
-    models_dir = "/home/matth/Uni/TENG/teng_2/models_gen_1" # where to save models, settings and results
+    # labels = LabelConverter(["foam_PDMS_white", "foam_PDMS_black", "foam_PDMS_TX100", "foam_PE", "antistatic_foil", "cardboard", "glass", "kapton", "bubble_wrap_PE", "fabric_PP", ])
+    labels = LabelConverter(["foam_PDMS_white", "foam_PDMS_black", "foam_PDMS_TX100", "foam_PE", "kapton", "bubble_wrap_PE", "fabric_PP", ])
+    models_dir = "/home/matth/Uni/TENG/teng_2/models_gen_8" # where to save models, settings and results
     if not path.isdir(models_dir):
         makedirs(models_dir)
     data_dir = "/home/matth/Uni/TENG/teng_2/sorted_data"
 
+    # gen_5 best options: datasplitter, not bidirectional, lr=0.001, no scheduler
+    # gen_6 best options: no glass, cardboard and antistatic_foil, not bidirectional, lr=0.0007, no datasplitter, 2 layers n_hidden = 10
     # Test with
-    num_layers = [ 3 ]
-    hidden_size = [ 8 ]
-    bidirectional = [ True ]
-    # t_const_int = ConstantInterval(0.01) TODO check if needed: data was taken at equal rate, but it isnt perfect -> maybe just ignore?
+    num_layers = [ 2 ]
+    hidden_size = [ 7, 11, 14 ]
+    bidirectional = [ False, True ]
+    t_const_int = ConstantInterval(0.01) # TODO check if needed: data was taken at equal rate, but it isnt perfect -> maybe just ignore?
     t_norm = Normalize(-1, 1)
-    transforms = [[ t_const_int, t_norm ]]
-    batch_sizes = [ 64 ] # , 16]
-    splitters = [ DataSplitter(100) ] # TODO: try with 0.5-1second snippets
-    num_epochs = [ 60 ]
+    transforms = [[ ], [ t_norm ]] #, [ t_norm, t_const_int ]]
+    batch_sizes = [ 4 ]
+    splitters = [ DataSplitter(50, drop_if_smaller_than=30), DataSplitter(100, drop_if_smaller_than=30) ] # smallest file has length 68 TODO: try with 0.5-1second snippets
+    num_epochs = [ 5 ]
+    # (epoch, min_accuracy)
+    training_cancel_points = [(10, 10), (20, 20), (40, 30)]
+    # training_cancel_points = []
 
-    # num_layers=1,
-    # hidden_size=1,
-    # bidirectional=True,
-    # optimizer=None,
-    # scheduler=None,
-    # loss_func=None,
-    # transforms=[],
-    # splitter=None,
-    # num_epochs=10,
-    # batch_size=5,
     args = [num_layers, hidden_size, bidirectional, [None], [None], [None], transforms, splitters, num_epochs, batch_sizes]
 
     # create settings for every possible combination
@@ -78,23 +75,28 @@ if __name__ == "__main__":
     loss_func = nn.CrossEntropyLoss()
     optimizers = [
-        lambda model: torch.optim.Adam(model.parameters(), lr=0.03),
-        # lambda model: torch.optim.Adam(model.parameters(), lr=0.25),
-        # lambda model: torch.optim.Adam(model.parameters(), lr=0.50),
+        lambda model: torch.optim.Adam(model.parameters(), lr=0.0005),
+        lambda model: torch.optim.Adam(model.parameters(), lr=0.0007),
+        # lambda model: torch.optim.Adam(model.parameters(), lr=0.008),
     ]
     schedulers = [
+        None,
         # lambda optimizer, st: torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9),
-        lambda optimizer, st: torch.optim.lr_scheduler.StepLR(optimizer, step_size=st.num_epochs // 10, gamma=0.40, verbose=False),
+        # lambda optimizer, st: torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.5),
+        lambda optimizer, st: torch.optim.lr_scheduler.StepLR(optimizer, step_size=st.num_epochs // 8, gamma=0.50, verbose=False),
         # lambda optimizer, st: torch.optim.lr_scheduler.StepLR(optimizer, step_size=st.num_epochs // 10, gamma=0.75, verbose=False),
     ]
+    device = select_device(force_device="cpu") # TODO cuda is not supported because something throws NotImplementedError with my gpu
 
     n_total = len(settings) * len(optimizers) * len(schedulers)
-    print(f"Testing {n_total} possible configurations")
+    print(f"Testing {n_total} possible configurations, device='{device}'")
 
     # scheduler2 =
     def create_model(st, optimizer_f, scheduler_f):
         model=RNN(input_size=st.num_features, hidden_size=st.hidden_size, num_layers=st.num_layers, num_classes=len(labels), bidirectional=st.bidirectional)
         optimizer = optimizer_f(model)
-        scheduler = scheduler_f(optimizer, st)
+        if scheduler_f is not None:
+            scheduler = scheduler_f(optimizer, st)
+        else: scheduler = None
         return model, optimizer, scheduler
 
     t_begin = time.time()
@@ -103,19 +105,21 @@ if __name__ == "__main__":
         for s in range(len(schedulers)):
             for i in range(len(settings)):
                 st = settings[i]
-                # print(st.get_name())
-                train_set, test_set = load_datasets(data_dir, labels, voltage=8.2, transforms=st.transforms, split_function=st.splitter, train_to_test_ratio=0.7, random_state=42, num_workers=4)
+                train_set, test_set = load_datasets(data_dir, labels, exclude_n_object=None, voltage=None, transforms=st.transforms, split_function=st.splitter, train_to_test_ratio=0.7, random_state=80, num_workers=4)
                 generator = torch.manual_seed(42)
-                # train_loader = iter(DataLoader(train_set))
-                # test_loader = iter(DataLoader(test_set))
-                train_loader = DataLoader(train_set, batch_size=st.batch_size, shuffle=True, generator=generator)
-                test_loader = DataLoader(test_set, batch_size=st.batch_size, shuffle=True, generator=generator)
-                print(f"Testing {n}/{n_total}: (o={o}, s={s}, i={i})")
+                train_loader = DataLoader(train_set, batch_size=st.batch_size, shuffle=True, generator=generator, collate_fn=PadSequences())
+                test_loader = DataLoader(test_set, batch_size=None, shuffle=True, generator=generator)
+
+                # set batch_size to None and remove collate_fn for this to work
+                # count_data(train_loader, st.labels, print_summary="training data")
+                # count_data(test_loader, st.labels, print_summary="validation data")
+
+                model, optimizer, scheduler = create_model(st, optimizers[o], schedulers[s])
 
-                device = select_device(force_device="cpu")
+                print(f"Testing {n}/{n_total}: (o={o}, s={s}, i={i})")
                 try:
-                    train_validate_save(model, optimizer, scheduler, loss_func, train_loader, test_loader, st, models_dir, print_interval=1)
+                    train_validate_save(model, optimizer, scheduler, loss_func, train_loader, test_loader, st, models_dir, print_interval=1, print_continuous=True, training_cancel_points=training_cancel_points)
                 except KeyboardInterrupt:
                     if input("Cancelled current training. Quit? (q/*): ") == "q":
                         t_end = time.time()
diff --git a/teng_ml/rnn/rnn.py b/teng_ml/rnn/rnn.py
index da267ec..eaad4cb 100644
--- a/teng_ml/rnn/rnn.py
+++ b/teng_ml/rnn/rnn.py
@@ -19,52 +19,59 @@ class RNN(nn.Module):
         self.softmax = nn.Softmax(dim=1)
         self.D = 2 if self.is_bidirectional == True else 1
 
-    def forward(self, x):
-        device = x.device
+    def forward(self, x, unpadded_lengths=None):
+        """
+        @param x:
+            Tensor (seq_length, features) for unbatched inputs
+            Tensor (batch_size, seq_length, features) for batch inputs
+            PackedSequence for padded batched inputs
+        @param unpadded_lengths: Tensor(batch_size) with lengths of the unpadded sequences, when using padding but without PackedSequence
+        @returns (batch_size, num_classes) with batch_size == 1 for unbatched inputs
+        """
+        # if type(x) == torch.Tensor:
+        #     device = x.device
+        #     # h0: initial hidden states
+        #     # c0: initial cell states
+        #     if len(x.shape) == 2: # x: (seq_length, features)
+        #         h0 = torch.zeros(self.D * self.num_layers, self.hidden_size).to(device)
+        #         c0 = torch.zeros(self.D * self.num_layers, self.hidden_size).to(device)
+        #     elif len(x.shape) == 3: # x: (batch, seq_length, features)
+        #         batch_size = x.shape[0]
+        #         h0 = torch.zeros(self.D * self.num_layers, batch_size, self.hidden_size).to(device)
+        #         c0 = torch.zeros(self.D * self.num_layers, batch_size, self.hidden_size).to(device)
+        #     else:
+        #         raise ValueError(f"RNN.forward: invalid input shape: {x.shape}. Must be (batch, seq_length, features) or (seq_length, features)")
+        # elif type(x) == nn.utils.rnn.PackedSequence:
+        #     device = x.data.device
+        #     h0 = torch.zeros(self.D * self.num_layers, self.hidden_size).to(device)
+        #     c0 = torch.zeros(self.D * self.num_layers, self.hidden_size).to(device)
+        # else:
+        #     raise ValueError(f"RNN.forward: invalid input type: {type(x)}. Must be Tensor or PackedSequence")
 
-        # h0: initial hidden states
-        # c0: initial cell states
-        if len(x.shape) == 2: # x: (seq_length, features)
-            h0 = torch.zeros(self.D * self.num_layers, self.hidden_size).to(device)
-            c0 = torch.zeros(self.D * self.num_layers, self.hidden_size).to(device)
-        elif len(x.shape) == 3: # x: (batch, seq_length, features)
-            batch_size = x.shape[0]
-            h0 = torch.zeros(self.D * self.num_layers, batch_size, self.hidden_size).to(device)
-            c0 = torch.zeros(self.D * self.num_layers, batch_size, self.hidden_size).to(device)
-        else:
-            raise ValueError(f"RNN.forward: invalid iput shape: {x.shape}. Must be (batch, seq_length, features) or (seq_length, features)")
         # lstm: (batch_size, seq_length, features) -> (batch_size, hidden_size)
-        out, (h_n, c_n) = self.lstm(x, (h0, c0))
-        print(f"forward: out.shape={out.shape} TODO verify comment")
-        # out: (N, L, D * hidden_size)
-        # h_n: (D * num_layers, hidden_size)
-        # c_n: (D * num_layers, hidden_size)
-        # print(f"out({out.shape})={out}")
-        # print(f"h_n({h_n.shape})={h_n}")
-        # print(f"c_n({c_n.shape})={c_n}")
-        # print(f"out({out.shape})=...")
-        # print(f"h_n({h_n.shape})=...")
-        # print(f"c_n({c_n.shape})=...")
+        # or: packed_sequence -> packed_sequence
+        # out, (h_n, c_n) = self.lstm(x, (h0, c0))
+        out, (h_n, c_n) = self.lstm(x) # (h0, c0) defaults to zeros
 
-        """
-        # select only last layer [-1] -> last layer,
-        last_layer_state = h_n.view(self.num_layers, D, batch_size, self.hidden_size)[-1]
-        if D == 1:
-            # [1, batch_size, hidden_size] -> [batch_size, hidden_size]
-            X = last_layer_state.squeeze() # TODO what if batch_size == 1
-        elif D == 2:
-            h_1, h_2 = last_layer_state[0], last_layer_state[1] # states of both directions
-            # concatenate both states, X-size: (Batch, hidden_size * 2)
-            X = torch.cat((h_1, h_2), dim=1)
+        # select the last state of lstm's neurons
+        if type(out) == nn.utils.rnn.PackedSequence:
+            # padding has to be considered
+            out, lengths = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
+            # the unpadded length of batch i is lengths[i], so that is the last non-zero state
+            out = torch.stack([out[i,lengths[i].item()-1,:] for i in range(len(lengths))])
+        elif unpadded_lengths is not None:
+            out = torch.stack([out[i,unpadded_lengths[i].item()-1,:] for i in range(len(unpadded_lengths))])
         else:
-            raise ValueError("D must be 1 or 2")
-        """ # all this is quivalent to line below
-        out = out[:,-1,:] # select last time step
+            if len(out.shape) == 3: # batched
+                out = out[:,-1,:]
+            else: # unbatched
+                # softmax requires (batch_size, *)
+                out = torch.stack([out[-1,:]])
 
         # fc fully connected layer: (*, hidden_size) -> (*, num_classes)
         out = self.fc(out)
-        # softmax: (*) -> (*)
+        # softmax: (batch_size, *) -> (batch_size, *)
         out = self.softmax(out)
         return out
diff --git a/teng_ml/rnn/training.py b/teng_ml/rnn/training.py
index c919d7f..298ab4f 100644
--- a/teng_ml/rnn/training.py
+++ b/teng_ml/rnn/training.py
@@ -7,7 +7,7 @@ from torch.utils.data import DataLoader
 from ..util.settings import MLSettings
 from ..tracker.epoch_tracker import EpochTracker
 from ..util.file_io import get_next_digits
-from ..util.string import class_str
+from ..util.string import class_str, optimizer_str
 from ..util import model_io as mio
 
@@ -30,29 +30,20 @@ def select_device(force_device=None):
     return device
 
-def train(model, optimizer, scheduler, loss_func, train_loader: DataLoader, st: MLSettings, print_interval=1) -> EpochTracker:
+def train(model, optimizer, scheduler, loss_func, train_loader: DataLoader, st: MLSettings, print_interval=1, print_continuous=False, training_cancel_points=[]) -> EpochTracker:
     epoch_tracker = EpochTracker(st.labels)
     epoch_tracker.begin()
     for ep in range(st.num_epochs):
         loss = -1
-        for i, (data, y) in enumerate(train_loader):
-            # print(data, y)
+        for i, (data, lengths, y) in enumerate(train_loader):
             # data = batch, seq, features
-            # print(f"data({data.shape})={data}")
             x = data[:,:,[2]].float() # select voltage data
             # print(f"x({x.shape}, {x.dtype})=...")
             # print(f"y({y.shape}, {y.dtype})=...")
-            # length = torch.tensor([x.shape[1] for _ in range(x.shape[0])], dtype=torch.int64)
-            # print(f"length({length.shape})={length}")
-            # batch_size = x.shape[0]
-            # print(f"batch_size={batch_size}")
-            # v = x.view(batch_size, -1, feature_count)
-            # data = rnn_utils.pack_padded_sequence(v.type(torch.FloatTensor), length, batch_first=True).to(device)[0]
-            # print(f"data({data.shape})={data}")
-            out = model(x)
+            # pack = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)
+            # out = model(pack) # really slow
+            out = model(x, lengths)
 
-            # print(f"out({out.shape}={out})")
-            # print(f" y({y.shape}={y})")
             with torch.no_grad():
                 predicted = torch.argmax(out, dim=1, keepdim=False) # -> [ label_indices ]
                 correct = torch.argmax(y, dim=1, keepdim=False) # -> [ label_indices ]
@@ -72,9 +63,20 @@ def train(model, optimizer, scheduler, loss_func, train_loader: DataLoader, st:
             # predicted = torch.max(torch.nn.functional.softmax(out), 1)[1]
         epoch_tracker.end_epoch(loss, optimizer.param_groups[0]["lr"])
-        if ep+1 % print_interval == 0:
-            print(f"Training:", epoch_tracker.get_epoch_summary_str())
-        scheduler.step()
+        if (ep+1) % print_interval == 0:
+            if print_continuous: end='\r'
+            else: end='\n'
+            print(f"Training:", epoch_tracker.get_epoch_summary_str(), end=end)
+        # cancel training if model is not good enough
+        if len(training_cancel_points) > 0 and ep == training_cancel_points[0][0]:
+            print(f"Checking training cancel point: epoch={ep}, point={training_cancel_points[0]}, accuracy={epoch_tracker.accuracies[-1]}")
+            if epoch_tracker.accuracies[-1] < training_cancel_points[0][1]:
+                print(f"Training cancelled because the model's accuracy={epoch_tracker.accuracies[-1]:.2f} < {training_cancel_points[0][1]} after {ep} epochs.")
+                break
+            training_cancel_points.pop(0)
+
+        if scheduler is not None:
+            scheduler.step()
     print("Training:", epoch_tracker.end())
     return epoch_tracker
@@ -85,24 +87,27 @@ def validate(model, test_loader: DataLoader, st: MLSettings) -> EpochTracker:
     with torch.no_grad():
         for i, (data, y) in enumerate(test_loader):
             # print(ep, "Test")
-            x = data[:,:,[2]].float()
+            x = data[:,[2]].float()
             out = model(x)
             predicted = torch.argmax(out, dim=1, keepdim=False) # -> [ label_indices ]
-            correct = torch.argmax(y, dim=1, keepdim=False) # -> [ label_indices ]
+            if len(y.shape) == 2: # batched
+                correct = torch.argmax(y, dim=1, keepdim=False) # -> [ label_indices ]
+            else: # unbatched
+                correct = torch.argmax(y, dim=0, keepdim=True) # -> [ label_indices ]
             epoch_tracker.add_prediction(correct, predicted)
     print("Validation:", epoch_tracker.end())
     return epoch_tracker
 
-def train_validate_save(model, optimizer, scheduler, loss_func, train_loader: DataLoader, test_loader: DataLoader, st: MLSettings, models_dir, print_interval=1, show_plots=False):
+def train_validate_save(model, optimizer, scheduler, loss_func, train_loader: DataLoader, test_loader: DataLoader, st: MLSettings, models_dir, print_interval=1, print_continuous=False, show_plots=False, training_cancel_points=[]):
     # assumes model and data is already on correct device
     # train_loader.to(device)
     # test_loader.to(device)
 
     # store optimizer, scheduler and loss_func in settings
-    st.optimizer = class_str(optimizer)
+    st.optimizer = optimizer_str(optimizer)
     st.scheduler = class_str(scheduler)
     st.loss_func = class_str(loss_func)
 
@@ -111,15 +116,15 @@ def train_validate_save(model, optimizer, scheduler, loss_func, train_loader: Da
     def add_tab(s): return "\t" + str(s).replace("\n", "\n\t")
     print(100 * '=')
-    print("Model Name:", model_name)
+    print("model name:", model_name)
     print(f"model:\n", add_tab(model))
-    # print(f"loss_func:\n", add_tab(class_str(loss_func)))
-    # print(f"optimizer:\n", add_tab(class_str(optimizer)))
-    # print(f"scheduler:\n", add_tab(class_str(scheduler)))
+    print(f"loss_func: {st.loss_func}")
+    print(f"optimizer: {st.optimizer}")
+    print(f"scheduler: {st.scheduler}")
     print(100 * '-')
 
-    training_tracker = train(model, optimizer, scheduler, loss_func, train_loader, st, print_interval=print_interval)
+    training_tracker = train(model, optimizer, scheduler, loss_func, train_loader, st, print_interval=print_interval, print_continuous=print_continuous, training_cancel_points=training_cancel_points)
 
     # print("Training: Count per label:", training_tracker.get_count_per_label())
     # print("Training: Predictions per label:", training_tracker.get_predictions_per_label())
diff --git a/teng_ml/tracker/epoch_tracker.py b/teng_ml/tracker/epoch_tracker.py
index 7c87cec..6a9a9ac 100644
--- a/teng_ml/tracker/epoch_tracker.py
+++ b/teng_ml/tracker/epoch_tracker.py
@@ -1,3 +1,4 @@
+from os import stat
 from ..util.data_loader import LabelConverter
 import matplotlib.pyplot as plt
 import matplotlib.colors as colors
@@ -103,7 +104,18 @@ class EpochTracker:
         statistics = [ [ 0 for _ in range(len(self.labels)) ] for _ in range(len(self.labels)) ]
         for corr, pred in self.predictions[epoch]:
             for batch in range(len(corr)):
-                statistics[corr[batch]][pred[batch]] += 1
+                try:
+                    statistics[corr[batch]][pred[batch]] += 1
+                except IndexError as e:
+                    print(f"IndexError in get_predictions_per_label: epoch={epoch}, len(corr)={len(corr)}, len(pred)={len(pred)}, batch={batch}, len(labels)={len(self.labels)}, len(statistics)={len(statistics)}")
+                    print(f"statistics: {statistics}")
+                    print(f"corr: {corr}")
+                    print(f"pred: {pred}")
+                    if batch in range(len(corr)):
+                        if corr[batch] in range(len(statistics)):
+                            print(f"len(statistics[corr[batch]])={len(statistics[corr[batch]])}")
+                        print(f"corr[batch]={corr[batch]}, pred[batch]={pred[batch]}")
+                    raise e
         return statistics
 
     def plot_training(self, title="Training Summary", model_dir=None, name="img_training"):
diff --git a/teng_ml/util/data_loader.py b/teng_ml/util/data_loader.py
index 7e8a770..2c9be7a 100644
--- a/teng_ml/util/data_loader.py
+++ b/teng_ml/util/data_loader.py
@@ -4,6 +4,7 @@ import re
 import numpy as np
 import pandas as pd
 from scipy.sparse import data
+import torch
 
 import threading
@@ -24,7 +25,13 @@ class LabelConverter:
             vec[self.class_labels.index(label)] = 1.0
         return vec
 
+    def get_label_index(self, one_hot: torch.Tensor):
+        """return the index of the class that the given one-hot vector represents"""
+        return int(torch.argmax(one_hot).item())
+
     def __getitem__(self, index):
+        if type(index) == torch.Tensor:
+            return self.class_labels[self.get_label_index(index)]
         return self.class_labels[index]
 
     def __contains__(self, value):
@@ -84,8 +91,11 @@ class Dataset:
             if split_function is None:
                 self.data.append((data, sample.label_vec))
             else:
-                for data_split in split_function(data):
-                    self.data.append((data_split, sample.label_vec))
+                try:
+                    for data_split in split_function(data):
+                        self.data.append((data_split, sample.label_vec))
+                except ValueError as e:
+                    raise ValueError(f"Exception occurred during splitting of sample '{sample.datapath}': {e}")
 
     def apply_transforms(self, data):
         if type(self.transforms) == list:
@@ -111,7 +121,9 @@ def get_datafiles(datadir, labels: LabelConverter, exclude_n_object=None, filter
     files.sort()
     for file in files:
         match = re.fullmatch(re_filename, file)
-        if not match: continue
+        if not match:
+            print(f"get_datafiles: dropping non matching file '{file}'")
+            continue
         label = match.groups()[1]
         if label not in labels: continue
 
@@ -125,16 +137,16 @@ def get_datafiles(datadir, labels: LabelConverter, exclude_n_object=None, filter
     return datafiles
 
-def load_datasets(datadir, labels: LabelConverter, transforms=None, split_function=None, voltage=None, train_to_test_ratio=0.7, random_state=None, num_workers=None):
+def load_datasets(datadir, labels: LabelConverter, transforms=None, split_function=None, exclude_n_object=None, voltage=None, train_to_test_ratio=0.7, random_state=None, num_workers=None):
     """
     load all data from datadir that are in the format: yyyy-mm-dd_label_x.xV_xxxmm.csv
     """
     datasamples = []
     if num_workers == None:
-        for file, match, label in get_datafiles(datadir, labels, voltage):
+        for file, match, label in get_datafiles(datadir, labels, exclude_n_object=exclude_n_object, filter_voltage=voltage):
             datasamples.append(Datasample(*match.groups(), labels.get_one_hot(label), file))
     else:
-        files = get_datafiles(datadir, labels, voltage)
+        files = get_datafiles(datadir, labels, exclude_n_object=exclude_n_object, filter_voltage=voltage)
         def worker():
             while True:
                 try:
@@ -155,3 +167,30 @@ def load_datasets(datadir, labels: LabelConverter, transforms=None, split_functi
     train_dataset = Dataset(train_samples, transforms=transforms, split_function=split_function)
     test_dataset = Dataset(test_samples, transforms=transforms, split_function=split_function)
     return train_dataset, test_dataset
+
+
+def count_data(data_loader, label_converter: LabelConverter, print_summary=False):
+    """
+    @param data_loader: unbatched data loader
+    """
+    n_sequences = 0 # count number of sequences
+    labels = [ 0 for _ in range(len(label_converter)) ] # count number of sequences per label
+    len_data = [ 0 for _ in range(len(label_converter)) ] # count number of datapoints per label
+    for i, (data, y) in enumerate(data_loader):
+        n_sequences = i + 1
+        label_i = label_converter.get_label_index(y)
+        len_data[label_i] += data.shape[0]
+        labels[label_i] += 1
+    if print_summary:
+        print("=" * 50)
+        print("Dataset summary" + (f" for {print_summary}:" if type(print_summary) == str else ":"))
+        print(f"Number of sequences: {n_sequences}")
+        for i in range(len(label_converter)):
+            print(f"- {label_converter[i]:15}: {labels[i]:3} sequences, {len_data[i]:5} datapoints")
+
+    return n_sequences, labels, len_data
+
+
+
+
diff --git a/teng_ml/util/split.py b/teng_ml/util/split.py
index 4711cb2..3752593 100644
--- a/teng_ml/util/split.py
+++ b/teng_ml/util/split.py
@@ -5,8 +5,12 @@ class DataSplitter:
     Split a numpy array into smaller arrays of size datapoints_per_split
     If data.shape(0) % datapoints_per_split != 0, the remaining datapoints are dropped
     """
-    def __init__(self, datapoints_per_split):
+    def __init__(self, datapoints_per_split, drop_if_smaller_than=-1):
+        """
+        @param drop_if_smaller_than: drop the remaining datapoints if the sequence would be smaller than this value. -1 means drop_if_smaller_than=datapoints_per_split
+        """
         self.split_size = datapoints_per_split
+        self.drop_threshhold = datapoints_per_split if drop_if_smaller_than == -1 else drop_if_smaller_than
 
     def __call__(self, data: np.ndarray):
         """
@@ -15,6 +19,11 @@ class DataSplitter:
         ret_data = []
         for i in range(self.split_size, data.shape[0], self.split_size):
             ret_data.append(data[i-self.split_size:i, :])
+
+        rest_start = len(ret_data) * self.split_size
+        if len(data) - rest_start >= self.drop_threshhold:
+            ret_data.append(data[rest_start:,:])
+
         if len(ret_data) == 0:
             raise ValueError(f"data has only {data.shape[0]}, but datapoints_per_split is set to {self.split_size}")
         return ret_data
diff --git a/teng_ml/util/string.py b/teng_ml/util/string.py
index 0d31701..c5c40f2 100644
--- a/teng_ml/util/string.py
+++ b/teng_ml/util/string.py
@@ -13,25 +13,50 @@ def fill_and_center(s: str, fill_char="=", length=100):
     else: return s
 
+
 def class_str(x):
     """ Return the constructor of the class of x with arguemnts """
     name = type(x).__name__
-    signature = inspect.signature(type(x))
     params = []
-    for param_name, param_value in x.__dict__.items():
-        if param_name not in signature.parameters:
-            continue
-        default_value = signature.parameters[param_name].default
-        if param_value != default_value:
-            params.append(f"{param_name}={param_value!r}")
+    try:
+        signature = inspect.signature(type(x))
+        for param_name, param_value in x.__dict__.items():
+            if param_name not in signature.parameters:
+                continue
+            default_value = signature.parameters[param_name].default
+            if param_value != default_value:
+                params.append(f"{param_name}={param_value!r}")
+    except ValueError:
+        pass
     if params:
         return f"{name}({', '.join(params)})"
     else:
         return name
 
+def optimizer_str(x):
+    # optimizer stores everything in 'defaults' dict and is thus not compatible with class_str
+    name = type(x).__name__
+    params = []
+    try:
+        signature = inspect.signature(type(x))
+        for param_name, param_value in x.__dict__["defaults"].items():
+            if param_name not in signature.parameters:
+                continue
+            default_value = signature.parameters[param_name].default
+            if param_value != default_value:
+                params.append(f"{param_name}={param_value!r}")
+    except ValueError:
+        pass
+    if params:
+        return f"{name}({', '.join(params)})"
+    else:
+        return name
+
+
 def cleanup_str(s):
     """ convert to string if necessary and