Compare commits

..

4 Commits

Author SHA1 Message Date
matthias@arch
33d1945de2 added padding function 2023-08-14 18:43:53 +02:00
matthias@arch
37bb1f444e removed debug 2023-08-14 18:43:27 +02:00
matthias@arch
61321e3919 fixed epoch for cancel pts 2023-08-14 18:43:07 +02:00
matthias@arch
1d05da3abf fixed fc missing 2023-08-10 17:29:09 +02:00
8 changed files with 218 additions and 115 deletions

View File

@ -18,8 +18,9 @@ import time
from os import makedirs, path from os import makedirs, path
from .util.transform import ConstantInterval, Normalize from .util.transform import ConstantInterval, Normalize
from .util.data_loader import load_datasets, LabelConverter from .util.data_loader import load_datasets, LabelConverter, count_data
from .util.split import DataSplitter from .util.split import DataSplitter
from .util.pad import PadSequences
from .util.settings import MLSettings from .util.settings import MLSettings
from .rnn.rnn import RNN from .rnn.rnn import RNN
from .rnn.training import train_validate_save, select_device from .rnn.training import train_validate_save, select_device
@ -41,34 +42,30 @@ def test_interpol():
if __name__ == "__main__": if __name__ == "__main__":
labels = LabelConverter(["white_foam", "black_foam", "rigid_foam", "cardboard", "glass", "Kapton", "bubble_wrap", "cloth_ffp2", ]) # labels = LabelConverter(["foam_PDMS_white", "foam_PDMS_black", "foam_PDMS_TX100", "foam_PE", "antistatic_foil", "cardboard", "glass", "kapton", "bubble_wrap_PE", "fabric_PP", ])
models_dir = "/home/matth/Uni/TENG/teng_2/models_gen_1" # where to save models, settings and results labels = LabelConverter(["foam_PDMS_white", "foam_PDMS_black", "foam_PDMS_TX100", "foam_PE", "kapton", "bubble_wrap_PE", "fabric_PP", ])
models_dir = "/home/matth/Uni/TENG/teng_2/models_gen_8" # where to save models, settings and results
if not path.isdir(models_dir): if not path.isdir(models_dir):
makedirs(models_dir) makedirs(models_dir)
data_dir = "/home/matth/Uni/TENG/teng_2/sorted_data" data_dir = "/home/matth/Uni/TENG/teng_2/sorted_data"
# gen_5 best options: datasplitter, not bidirectional, lr=0.001, no scheduler
# gen_6 best options: no glass, cardboard and antistatic_foil, not bidirectional, lr=0.0007, no datasplitter, 2 layers n_hidden = 10
# Test with # Test with
num_layers = [ 3 ] num_layers = [ 2 ]
hidden_size = [ 8 ] hidden_size = [ 7, 11, 14 ]
bidirectional = [ True ] bidirectional = [ False, True ]
# t_const_int = ConstantInterval(0.01) TODO check if needed: data was taken at equal rate, but it isnt perfect -> maybe just ignore? t_const_int = ConstantInterval(0.01) # TODO check if needed: data was taken at equal rate, but it isnt perfect -> maybe just ignore?
t_norm = Normalize(-1, 1) t_norm = Normalize(-1, 1)
transforms = [[ t_const_int, t_norm ]] transforms = [[ ], [ t_norm ]] #, [ t_norm, t_const_int ]]
batch_sizes = [ 64 ] # , 16] batch_sizes = [ 4 ]
splitters = [ DataSplitter(100) ] # TODO: try with 0.5-1second snippets splitters = [ DataSplitter(50, drop_if_smaller_than=30), DataSplitter(100, drop_if_smaller_than=30) ] # smallest file has length 68 TODO: try with 0.5-1second snippets
num_epochs = [ 60 ] num_epochs = [ 5 ]
# (epoch, min_accuracy)
training_cancel_points = [(10, 10), (20, 20), (40, 30)]
# training_cancel_points = []
# num_layers=1,
# hidden_size=1,
# bidirectional=True,
# optimizer=None,
# scheduler=None,
# loss_func=None,
# transforms=[],
# splitter=None,
# num_epochs=10,
# batch_size=5,
args = [num_layers, hidden_size, bidirectional, [None], [None], [None], transforms, splitters, num_epochs, batch_sizes] args = [num_layers, hidden_size, bidirectional, [None], [None], [None], transforms, splitters, num_epochs, batch_sizes]
# create settings for every possible combination # create settings for every possible combination
@ -78,23 +75,28 @@ if __name__ == "__main__":
loss_func = nn.CrossEntropyLoss() loss_func = nn.CrossEntropyLoss()
optimizers = [ optimizers = [
lambda model: torch.optim.Adam(model.parameters(), lr=0.03), lambda model: torch.optim.Adam(model.parameters(), lr=0.0005),
# lambda model: torch.optim.Adam(model.parameters(), lr=0.25), lambda model: torch.optim.Adam(model.parameters(), lr=0.0007),
# lambda model: torch.optim.Adam(model.parameters(), lr=0.50), # lambda model: torch.optim.Adam(model.parameters(), lr=0.008),
] ]
schedulers = [ schedulers = [
None,
# lambda optimizer, st: torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9), # lambda optimizer, st: torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9),
lambda optimizer, st: torch.optim.lr_scheduler.StepLR(optimizer, step_size=st.num_epochs // 10, gamma=0.40, verbose=False), # lambda optimizer, st: torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.5),
lambda optimizer, st: torch.optim.lr_scheduler.StepLR(optimizer, step_size=st.num_epochs // 8, gamma=0.50, verbose=False),
# lambda optimizer, st: torch.optim.lr_scheduler.StepLR(optimizer, step_size=st.num_epochs // 10, gamma=0.75, verbose=False), # lambda optimizer, st: torch.optim.lr_scheduler.StepLR(optimizer, step_size=st.num_epochs // 10, gamma=0.75, verbose=False),
] ]
device = select_device(force_device="cpu") # TODO cuda is not supported because something throws NotImplementedError with my gpu
n_total = len(settings) * len(optimizers) * len(schedulers) n_total = len(settings) * len(optimizers) * len(schedulers)
print(f"Testing {n_total} possible configurations") print(f"Testing {n_total} possible configurations, device='{device}'")
# scheduler2 = # scheduler2 =
def create_model(st, optimizer_f, scheduler_f): def create_model(st, optimizer_f, scheduler_f):
model=RNN(input_size=st.num_features, hidden_size=st.hidden_size, num_layers=st.num_layers, num_classes=len(labels), bidirectional=st.bidirectional) model=RNN(input_size=st.num_features, hidden_size=st.hidden_size, num_layers=st.num_layers, num_classes=len(labels), bidirectional=st.bidirectional)
optimizer = optimizer_f(model) optimizer = optimizer_f(model)
scheduler = scheduler_f(optimizer, st) if scheduler_f is not None:
scheduler = scheduler_f(optimizer, st)
else: scheduler = None
return model, optimizer, scheduler return model, optimizer, scheduler
t_begin = time.time() t_begin = time.time()
@ -103,19 +105,21 @@ if __name__ == "__main__":
for s in range(len(schedulers)): for s in range(len(schedulers)):
for i in range(len(settings)): for i in range(len(settings)):
st = settings[i] st = settings[i]
# print(st.get_name()) train_set, test_set = load_datasets(data_dir, labels, exclude_n_object=None, voltage=None, transforms=st.transforms, split_function=st.splitter, train_to_test_ratio=0.7, random_state=80, num_workers=4)
train_set, test_set = load_datasets(data_dir, labels, voltage=8.2, transforms=st.transforms, split_function=st.splitter, train_to_test_ratio=0.7, random_state=42, num_workers=4)
generator = torch.manual_seed(42) generator = torch.manual_seed(42)
# train_loader = iter(DataLoader(train_set)) train_loader = DataLoader(train_set, batch_size=st.batch_size, shuffle=True, generator=generator, collate_fn=PadSequences())
# test_loader = iter(DataLoader(test_set)) test_loader = DataLoader(test_set, batch_size=None, shuffle=True, generator=generator)
train_loader = DataLoader(train_set, batch_size=st.batch_size, shuffle=True, generator=generator)
test_loader = DataLoader(test_set, batch_size=st.batch_size, shuffle=True, generator=generator) # set batch_size to None and remove collate_fn for this to work
print(f"Testing {n}/{n_total}: (o={o}, s={s}, i={i})") # count_data(train_loader, st.labels, print_summary="training data")
# count_data(test_loader, st.labels, print_summary="validation data")
model, optimizer, scheduler = create_model(st, optimizers[o], schedulers[s]) model, optimizer, scheduler = create_model(st, optimizers[o], schedulers[s])
device = select_device(force_device="cpu") print(f"Testing {n}/{n_total}: (o={o}, s={s}, i={i})")
try: try:
train_validate_save(model, optimizer, scheduler, loss_func, train_loader, test_loader, st, models_dir, print_interval=1) train_validate_save(model, optimizer, scheduler, loss_func, train_loader, test_loader, st, models_dir, print_interval=1, print_continuous=True, training_cancel_points=training_cancel_points)
except KeyboardInterrupt: except KeyboardInterrupt:
if input("Cancelled current training. Quit? (q/*): ") == "q": if input("Cancelled current training. Quit? (q/*): ") == "q":
t_end = time.time() t_end = time.time()

View File

@ -19,52 +19,59 @@ class RNN(nn.Module):
self.softmax = nn.Softmax(dim=1) self.softmax = nn.Softmax(dim=1)
self.D = 2 if self.is_bidirectional == True else 1 self.D = 2 if self.is_bidirectional == True else 1
def forward(self, x, unpadded_lengths=None):
    """
    Run the LSTM over x and classify the last valid time step of every sequence.

    @param x:
        Tensor (seq_length, features) for unbatched inputs
        Tensor (batch_size, seq_length, features) for batch inputs
        PackedSequence for padded batched inputs
    @param unpadded_lengths: Tensor(batch_size) with lengths of the unpadded sequences, when using padding but without PackedSequence
    @returns (batch_size, num_classes) with batch_size == 1 for unbatched inputs
    """
    # lstm: (batch_size, seq_length, features) -> (batch_size, seq_length, D * hidden_size)
    # or: packed_sequence -> packed_sequence
    # no initial states are passed: (h0, c0) default to zeros
    out, _ = self.lstm(x)

    # select the last state of lstm's neurons
    if isinstance(out, nn.utils.rnn.PackedSequence):
        # padding has to be considered
        out, lengths = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
    else:
        lengths = unpadded_lengths

    if lengths is not None:
        # the unpadded length of sequence i is lengths[i], so lengths[i]-1 is its last non-padded step
        last_step = torch.as_tensor(lengths, dtype=torch.long, device=out.device) - 1
        rows = torch.arange(out.shape[0], device=out.device)
        out = out[rows, last_step]
    elif out.dim() == 3:  # batched: check the rank, not the batch size
        out = out[:, -1, :]
    else:  # unbatched
        # softmax requires (batch_size, *)
        out = out[-1, :].unsqueeze(0)

    # fc fully connected layer: (*, hidden_size) -> (*, num_classes)
    out = self.fc(out)

    # softmax: (batch_size, *) -> (batch_size, *)
    out = self.softmax(out)
    return out

View File

@ -7,7 +7,7 @@ from torch.utils.data import DataLoader
from ..util.settings import MLSettings from ..util.settings import MLSettings
from ..tracker.epoch_tracker import EpochTracker from ..tracker.epoch_tracker import EpochTracker
from ..util.file_io import get_next_digits from ..util.file_io import get_next_digits
from ..util.string import class_str from ..util.string import class_str, optimizer_str
from ..util import model_io as mio from ..util import model_io as mio
@ -30,29 +30,20 @@ def select_device(force_device=None):
return device return device
def train(model, optimizer, scheduler, loss_func, train_loader: DataLoader, st: MLSettings, print_interval=1) -> EpochTracker: def train(model, optimizer, scheduler, loss_func, train_loader: DataLoader, st: MLSettings, print_interval=1, print_continuous=False, training_cancel_points=[]) -> EpochTracker:
epoch_tracker = EpochTracker(st.labels) epoch_tracker = EpochTracker(st.labels)
epoch_tracker.begin() epoch_tracker.begin()
for ep in range(st.num_epochs): for ep in range(st.num_epochs):
loss = -1 loss = -1
for i, (data, y) in enumerate(train_loader): for i, (data, lengths, y) in enumerate(train_loader):
# print(data, y)
# data = batch, seq, features # data = batch, seq, features
# print(f"data({data.shape})={data}")
x = data[:,:,[2]].float() # select voltage data x = data[:,:,[2]].float() # select voltage data
# print(f"x({x.shape}, {x.dtype})=...") # print(f"x({x.shape}, {x.dtype})=...")
# print(f"y({y.shape}, {y.dtype})=...") # print(f"y({y.shape}, {y.dtype})=...")
# length = torch.tensor([x.shape[1] for _ in range(x.shape[0])], dtype=torch.int64) # pack = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)
# print(f"length({length.shape})={length}") # out = model(pack) # really slow
# batch_size = x.shape[0] out = model(x, lengths)
# print(f"batch_size={batch_size}")
# v = x.view(batch_size, -1, feature_count)
# data = rnn_utils.pack_padded_sequence(v.type(torch.FloatTensor), length, batch_first=True).to(device)[0]
# print(f"data({data.shape})={data}")
out = model(x)
# print(f"out({out.shape}={out})")
# print(f" y({y.shape}={y})")
with torch.no_grad(): with torch.no_grad():
predicted = torch.argmax(out, dim=1, keepdim=False) # -> [ label_indices ] predicted = torch.argmax(out, dim=1, keepdim=False) # -> [ label_indices ]
correct = torch.argmax(y, dim=1, keepdim=False) # -> [ label_indices ] correct = torch.argmax(y, dim=1, keepdim=False) # -> [ label_indices ]
@ -72,9 +63,19 @@ def train(model, optimizer, scheduler, loss_func, train_loader: DataLoader, st:
# predicted = torch.max(torch.nn.functional.softmax(out), 1)[1] # predicted = torch.max(torch.nn.functional.softmax(out), 1)[1]
epoch_tracker.end_epoch(loss, optimizer.param_groups[0]["lr"]) epoch_tracker.end_epoch(loss, optimizer.param_groups[0]["lr"])
if ep+1 % print_interval == 0: if (ep+1) % print_interval == 0:
print(f"Training:", epoch_tracker.get_epoch_summary_str()) if print_continuous: end='\r'
scheduler.step() else: end='\n'
print(f"Training:", epoch_tracker.get_epoch_summary_str(), end=end)
# cancel training if model is not good enough
if len(training_cancel_points) > 0 and ep+1 == training_cancel_points[0][0]:
if epoch_tracker.accuracies[-1] < training_cancel_points[0][1]:
print(f"Training cancelled because the models accuracy={epoch_tracker.accuracies[-1]:.2f} < {training_cancel_points[0][1]} after {ep+1} epochs.")
break;
training_cancel_points.pop(0)
if scheduler is not None:
scheduler.step()
print("Training:", epoch_tracker.end()) print("Training:", epoch_tracker.end())
return epoch_tracker return epoch_tracker
@ -85,24 +86,27 @@ def validate(model, test_loader: DataLoader, st: MLSettings) -> EpochTracker:
with torch.no_grad(): with torch.no_grad():
for i, (data, y) in enumerate(test_loader): for i, (data, y) in enumerate(test_loader):
# print(ep, "Test") # print(ep, "Test")
x = data[:,:,[2]].float() x = data[:,[2]].float()
out = model(x) out = model(x)
predicted = torch.argmax(out, dim=1, keepdim=False) # -> [ label_indices ] predicted = torch.argmax(out, dim=1, keepdim=False) # -> [ label_indices ]
correct = torch.argmax(y, dim=1, keepdim=False) # -> [ label_indices ] if y.shape[0] == 2: # batched
correct = torch.argmax(y, dim=1, keepdim=False) # -> [ label_indices ]
else: # unbatched
correct = torch.argmax(y, dim=0, keepdim=True) # -> [ label_indices ]
epoch_tracker.add_prediction(correct, predicted) epoch_tracker.add_prediction(correct, predicted)
print("Validation:", epoch_tracker.end()) print("Validation:", epoch_tracker.end())
return epoch_tracker return epoch_tracker
def train_validate_save(model, optimizer, scheduler, loss_func, train_loader: DataLoader, test_loader: DataLoader, st: MLSettings, models_dir, print_interval=1, show_plots=False): def train_validate_save(model, optimizer, scheduler, loss_func, train_loader: DataLoader, test_loader: DataLoader, st: MLSettings, models_dir, print_interval=1, print_continuous=False, show_plots=False, training_cancel_points=[]):
# assumes model and data is already on correct device # assumes model and data is already on correct device
# train_loader.to(device) # train_loader.to(device)
# test_loader.to(device) # test_loader.to(device)
# store optimizer, scheduler and loss_func in settings # store optimizer, scheduler and loss_func in settings
st.optimizer = class_str(optimizer) st.optimizer = optimizer_str(optimizer)
st.scheduler = class_str(scheduler) st.scheduler = class_str(scheduler)
st.loss_func = class_str(loss_func) st.loss_func = class_str(loss_func)
@ -111,15 +115,15 @@ def train_validate_save(model, optimizer, scheduler, loss_func, train_loader: Da
def add_tab(s): def add_tab(s):
return "\t" + str(s).replace("\n", "\n\t") return "\t" + str(s).replace("\n", "\n\t")
print(100 * '=') print(100 * '=')
print("Model Name:", model_name) print("model name:", model_name)
print(f"model:\n", add_tab(model)) print(f"model:\n", add_tab(model))
# print(f"loss_func:\n", add_tab(class_str(loss_func))) print(f"loss_func: {st.loss_func}")
# print(f"optimizer:\n", add_tab(class_str(optimizer))) print(f"optimizer: {st.optimizer}")
# print(f"scheduler:\n", add_tab(class_str(scheduler))) print(f"scheduler: {st.scheduler}")
print(100 * '-') print(100 * '-')
training_tracker = train(model, optimizer, scheduler, loss_func, train_loader, st, print_interval=print_interval) training_tracker = train(model, optimizer, scheduler, loss_func, train_loader, st, print_interval=print_interval, print_continuous=print_continuous, training_cancel_points=training_cancel_points)
# print("Training: Count per label:", training_tracker.get_count_per_label()) # print("Training: Count per label:", training_tracker.get_count_per_label())
# print("Training: Predictions per label:", training_tracker.get_predictions_per_label()) # print("Training: Predictions per label:", training_tracker.get_predictions_per_label())

View File

@ -1,3 +1,4 @@
from os import stat
from ..util.data_loader import LabelConverter from ..util.data_loader import LabelConverter
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import matplotlib.colors as colors import matplotlib.colors as colors

View File

@ -4,6 +4,7 @@ import re
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from scipy.sparse import data from scipy.sparse import data
import torch
import threading import threading
@ -24,7 +25,13 @@ class LabelConverter:
vec[self.class_labels.index(label)] = 1.0 vec[self.class_labels.index(label)] = 1.0
return vec return vec
def get_label_index(self, one_hot: torch.Tensor) -> int:
    """Return the index of the label encoded by the given one-hot vector (position of its largest entry)."""
    return int(torch.argmax(one_hot).item())
def __getitem__(self, index): def __getitem__(self, index):
if type(index) == torch.Tensor:
return self.class_labels[self.get_label_index(index)]
return self.class_labels[index] return self.class_labels[index]
def __contains__(self, value): def __contains__(self, value):
@ -84,8 +91,11 @@ class Dataset:
if split_function is None: if split_function is None:
self.data.append((data, sample.label_vec)) self.data.append((data, sample.label_vec))
else: else:
for data_split in split_function(data): try:
self.data.append((data_split, sample.label_vec)) for data_split in split_function(data):
self.data.append((data_split, sample.label_vec))
except ValueError as e:
raise ValueError(f"Exception occured during splitting of sample '{sample.datapath}': {e}")
def apply_transforms(self, data): def apply_transforms(self, data):
if type(self.transforms) == list: if type(self.transforms) == list:
@ -111,7 +121,9 @@ def get_datafiles(datadir, labels: LabelConverter, exclude_n_object=None, filter
files.sort() files.sort()
for file in files: for file in files:
match = re.fullmatch(re_filename, file) match = re.fullmatch(re_filename, file)
if not match: continue if not match:
print(f"get_datafiles: dropping non matching file '{file}'")
continue
label = match.groups()[1] label = match.groups()[1]
if label not in labels: continue if label not in labels: continue
@ -125,16 +137,16 @@ def get_datafiles(datadir, labels: LabelConverter, exclude_n_object=None, filter
return datafiles return datafiles
def load_datasets(datadir, labels: LabelConverter, transforms=None, split_function=None, voltage=None, train_to_test_ratio=0.7, random_state=None, num_workers=None): def load_datasets(datadir, labels: LabelConverter, transforms=None, split_function=None, exclude_n_object=None, voltage=None, train_to_test_ratio=0.7, random_state=None, num_workers=None):
""" """
load all data from datadir that are in the format: yyyy-mm-dd_label_x.xV_xxxmm.csv load all data from datadir that are in the format: yyyy-mm-dd_label_x.xV_xxxmm.csv
""" """
datasamples = [] datasamples = []
if num_workers == None: if num_workers == None:
for file, match, label in get_datafiles(datadir, labels, voltage): for file, match, label in get_datafiles(datadir, labels, exclude_n_object=exclude_n_object, filter_voltage=voltage):
datasamples.append(Datasample(*match.groups(), labels.get_one_hot(label), file)) datasamples.append(Datasample(*match.groups(), labels.get_one_hot(label), file))
else: else:
files = get_datafiles(datadir, labels, voltage) files = get_datafiles(datadir, labels, exclude_n_object=exclude_n_object, filter_voltage=voltage)
def worker(): def worker():
while True: while True:
try: try:
@ -155,3 +167,30 @@ def load_datasets(datadir, labels: LabelConverter, transforms=None, split_functi
train_dataset = Dataset(train_samples, transforms=transforms, split_function=split_function) train_dataset = Dataset(train_samples, transforms=transforms, split_function=split_function)
test_dataset = Dataset(test_samples, transforms=transforms, split_function=split_function) test_dataset = Dataset(test_samples, transforms=transforms, split_function=split_function)
return train_dataset, test_dataset return train_dataset, test_dataset
def count_data(data_loader, label_converter: "LabelConverter", print_summary=False):
    """
    Count the sequences and datapoints per label in a data loader.

    @param data_loader: unbatched data loader yielding (data, one_hot_label) pairs
    @param label_converter: used to map each label vector to its index and each index to its name
    @param print_summary: if truthy, print a summary; if it is a string, it is used as the name of the dataset in the header
    @returns (n_sequences, sequences_per_label, datapoints_per_label)
    """
    n_labels = len(label_converter)
    n_sequences = 0  # total number of sequences
    labels = [0] * n_labels  # number of sequences per label
    len_data = [0] * n_labels  # number of datapoints per label
    for data, y in data_loader:
        label_i = label_converter.get_label_index(y)
        labels[label_i] += 1
        len_data[label_i] += data.shape[0]
        # count explicitly: the last enumerate index would be off by one
        n_sequences += 1

    if print_summary:
        # build the header in one piece so that the trailing ':' does not swallow the title
        target = f" for {print_summary}" if isinstance(print_summary, str) else ""
        print("=" * 50)
        print(f"Dataset summary{target}:")
        print(f"Number of sequences: {n_sequences}")
        for i in range(n_labels):
            print(f"- {label_converter[i]:15}: {labels[i]:3} sequences, {len_data[i]:5} datapoints")
    return n_sequences, labels, len_data

14
teng_ml/util/pad.py Normal file
View File

@ -0,0 +1,14 @@
import torch
import torch.nn.utils.rnn as rnn
import numpy as np
class PadSequences:
    """
    Collate function for a DataLoader: pads sequences of different lengths to a common length.
    """

    def __call__(self, batch):
        """
        @param batch: list of (data, label) samples, where data has the sequence length as first dimension
        @returns (sequences_padded, lengths, labels), all sorted by sequence length in descending order
            sequences_padded: Tensor (batch_size, max_seq_length, ...), zero padded at the end
            lengths: IntTensor (batch_size) with the unpadded length of each sequence
            labels: Tensor (batch_size, ...) with the label of each sequence
        """
        # longest first, which is the order pack_padded_sequence requires by default
        sorted_batch = sorted(batch, key=lambda sample: sample[0].shape[0], reverse=True)
        sequences = [torch.Tensor(sample[0]) for sample in sorted_batch]
        labels = torch.Tensor(np.array([sample[1] for sample in sorted_batch]))
        lengths = torch.IntTensor([seq.shape[0] for seq in sequences])
        sequences_padded = rnn.pad_sequence(sequences, batch_first=True)
        return sequences_padded, lengths, labels

View File

@ -5,8 +5,12 @@ class DataSplitter:
Split a numpy array into smaller arrays of size datapoints_per_split Split a numpy array into smaller arrays of size datapoints_per_split
If data.shape(0) % datapoints_per_split != 0, the remaining datapoints are dropped If data.shape(0) % datapoints_per_split != 0, the remaining datapoints are dropped
""" """
def __init__(self, datapoints_per_split): def __init__(self, datapoints_per_split, drop_if_smaller_than=-1):
"""
@param drop_if_smaller_than: drop the remaining datapoints if the sequence would be smaller than this value. -1 means drop_if_smaller_than=datapoints_per_split
"""
self.split_size = datapoints_per_split self.split_size = datapoints_per_split
self.drop_threshhold = datapoints_per_split if drop_if_smaller_than == -1 else drop_if_smaller_than
def __call__(self, data: np.ndarray): def __call__(self, data: np.ndarray):
""" """
@ -15,6 +19,11 @@ class DataSplitter:
ret_data = [] ret_data = []
for i in range(self.split_size, data.shape[0], self.split_size): for i in range(self.split_size, data.shape[0], self.split_size):
ret_data.append(data[i-self.split_size:i, :]) ret_data.append(data[i-self.split_size:i, :])
rest_start = len(ret_data) * self.split_size
if len(data) - rest_start >= self.drop_threshhold:
ret_data.append(data[rest_start:,:])
if len(ret_data) == 0: if len(ret_data) == 0:
raise ValueError(f"data has only {data.shape[0]}, but datapoints_per_split is set to {self.split_size}") raise ValueError(f"data has only {data.shape[0]}, but datapoints_per_split is set to {self.split_size}")
return ret_data return ret_data

View File

@ -13,25 +13,50 @@ def fill_and_center(s: str, fill_char="=", length=100):
else: else:
return s return s
def _constructor_str(x, attributes):
    """
    Format x as 'ClassName(param=value, ...)' using the entries of attributes that
    are constructor parameters of type(x) and differ from the parameter's default.
    """
    name = type(x).__name__
    params = []
    try:
        signature = inspect.signature(type(x))
        for param_name, param_value in attributes.items():
            if param_name not in signature.parameters:
                continue
            default_value = signature.parameters[param_name].default
            if param_value != default_value:
                params.append(f"{param_name}={param_value!r}")
    except ValueError:
        # no signature available (eg. some builtin types): fall back to the parameters found so far
        pass
    if params:
        return f"{name}({', '.join(params)})"
    return name


def class_str(x):
    """
    Return the constructor of the class of x with arguments.
    Only attributes of x that are named like a constructor parameter and differ from its default are listed.
    Objects without a __dict__ (eg. None) yield just the class name.
    """
    return _constructor_str(x, getattr(x, "__dict__", {}))


def optimizer_str(x):
    """
    Like class_str, but for optimizers:
    an optimizer stores everything in its 'defaults' dict and is thus not compatible with class_str.
    Objects without a 'defaults' attribute yield just the class name.
    """
    return _constructor_str(x, getattr(x, "defaults", {}))
def cleanup_str(s): def cleanup_str(s):
""" """
convert to string if necessary and convert to string if necessary and