Compare commits

...

12 Commits

Author         SHA1        Message                                                                             Date
matthias@arch  5895f39874  gen_12                                                                              2023-08-14 18:44:39 +02:00
matthias@arch  33d1945de2  added padding function                                                              2023-08-14 18:43:53 +02:00
matthias@arch  37bb1f444e  removed debug                                                                       2023-08-14 18:43:27 +02:00
matthias@arch  61321e3919  fixed epoch for cancel pts                                                          2023-08-14 18:43:07 +02:00
matthias@arch  1d05da3abf  fixed fc missing                                                                    2023-08-10 17:29:09 +02:00
Matthias@Dell  ad2e3468f7  add n_object to regex                                                               2023-08-04 13:37:45 +02:00
Matthias@Dell  577e47d03f  changes for teng2                                                                   2023-08-03 18:43:40 +02:00
matthias@arch  ddeec83e31  added file history                                                                  2023-08-02 13:12:39 +02:00
matthias@arch  9aa1ffd7e0  implemented file saving                                                             2023-08-02 10:58:15 +02:00
matthias@arch  7ef99b5811  tested other colorscheme                                                            2023-08-02 10:57:01 +02:00
Matthias@Dell  c4f90ff281  Merge branch 'main' of https://git.quintern.xyz/Matthiasquintern/teng-ml into dev  2023-07-01 16:57:07 +02:00
Matthias@Dell  a479cdeda4  added interactive data selector                                                     2023-07-01 16:55:59 +02:00
10 changed files with 495 additions and 127 deletions

263
teng_ml/data_preprocess.py Normal file
View File

@@ -0,0 +1,263 @@
import pandas as pd
import numpy as np
import scipy.signal as signal
import matplotlib as mpl
mpl.use("TkAgg")  # fixes focus issues for me
import matplotlib.pyplot as plt
from time import sleep
from random import choice as r_choice
from sys import exit
import os
import re

if __name__ == "__main__":
    if __package__ is None:
        # make relative imports work as described here: https://peps.python.org/pep-0366/#proposed-change
        __package__ = "teng_ml"
        import sys
        from os import path
        filepath = path.realpath(path.abspath(__file__))
        sys.path.insert(0, path.dirname(path.dirname(filepath)))

from .util.transform import Normalize
from .util.data_loader import get_datafiles
from .util.file_io import get_next_digits

file = "/home/matth/Uni/TENG/teng_2/data/2023-06-28_foam_black_1_188mm_06V001.csv"
class InteractiveDataSelector:
    re_file = r'\d{4}-\d{2}-\d{2}_([a-zA-Z0-9_]+)_([a-zA-Z0-9]+)_(\d+(?:\.\d+)?mm)_(\d+V)(\d+)\.csv'
    re_index_group_nr = 5  # group number of the index part of the filename
    """
    Go through all .csv files in a directory, split the data and exclude sections with the mouse, then write the sections as single files into a new directory
    """
    def __init__(self, in_dir, out_dir, keep_index=True, split_at_exclude=True):
        """
        @param keep_index:
            If True: append the split number as triple digits to the existing filename (file001.csv -> file001001.csv, file001002.csv ...)
            Else: remove the indices from the filename before adding the split number (file001.csv -> file001.csv, file002.csv ...)
        @param split_at_exclude:
            If True: When excluding an area, split the data before and after the excluded zone
            Else: remove the excluded zone and join the previous and later part
        """
        if os.path.isdir(out_dir):
            if os.listdir(out_dir):
                raise ValueError(f"'out_dir' = '{out_dir}' is not empty")
        else:
            os.makedirs(out_dir)
        self._out_dir = out_dir
        self._in_dir = in_dir
        self._in_files = os.listdir(in_dir)
        self._in_files.sort()
        for i in reversed(range(len(self._in_files))):
            if not re.fullmatch(InteractiveDataSelector.re_file, self._in_files[i]):
                print(f"Dropping non-matching file '{self._in_files[i]}'")
                self._in_files.pop(i)
        if not self._in_files:
            raise ValueError(f"No matching files in 'in_dir' = '{in_dir}'")
        self._history: list[tuple[str, list]] = []  # (in_file, [out_files...])
        self._keep_index = keep_index
        self.split_at_exclude = split_at_exclude

        plt.ion()
        self._fig, self._ax = plt.subplots()
        mpl.rcParams['keymap.save'].remove('s')  # s is used for split
        mpl.rcParams['keymap.quit'].remove('q')
        self._fig.canvas.mpl_connect("button_press_event", lambda ev: self._fig_on_button_press(ev))
        self._fig.canvas.mpl_connect("key_press_event", lambda ev: self._fig_on_key_press(ev))

    def run(self):
        self._next_file()
        while plt.fignum_exists(self._fig.number):
            plt.pause(0.01)

    def _set_titles(self):
        help_str = "[(e)xclude, (s)plit, (w)rite+next, (U)ndo last file, (Q)uit]"
        self._fig.suptitle(f"{help_str}\nuse left click to select, right click to undo\ncurrent mode: {self._mode}")

    def _undo_file(self):
        if len(self._history) == 0:
            print("Nothing to undo")
            return
        # delete written files
        for outfile in self._history[-1][1]:
            print(f"Deleting '{outfile}'")
            os.remove(outfile)
        self._in_files.insert(0, self._history[-1][0])
        self._history.pop()
        self._next_file()

    def _next_file(self):
        # runtime stuff
        if len(self._in_files) == 0:
            raise IndexError("No more files to process")
        self._current_file = self._in_files.pop(0)
        self._current_dataframe = pd.read_csv(os.path.join(self._in_dir, self._current_file))
        self._current_array = self._current_dataframe.to_numpy()
        self._current_array = np.loadtxt(os.path.join(self._in_dir, self._current_file), skiprows=1, delimiter=",")
        # plot stuff
        self._splits_lines = None  # vlines
        self._excludes_lines = None
        self._excludes_areas = []  # list of areas
        self._fig.clear()
        self._ax = self._fig.subplots()
        self._ax.plot(self._current_array[:,0], self._current_array[:,2])
        self._ax.set_xlabel(self._current_file)
        self._splits: list[int] = []
        self._excludes: list[int] = []
        self._mode = "exclude"  # split or exclude
        self._set_titles()

    def _fig_on_button_press(self, event):
        """
        left click: set split / exclude section (depends on mode)
        right click: undo last action of selected mode
        """
        if event.xdata is None: return
        if event.xdata in self._excludes or event.xdata in self._splits: return
        if event.button == 1:  # left click, add position
            if self._mode == "split":
                self._splits.append(event.xdata)
            else:
                self._excludes.append(event.xdata)
        elif event.button == 3:  # right click, undo
            if self._mode == "split":
                if len(self._splits) > 0:
                    self._splits.pop()
            else:
                if len(self._excludes) > 0:
                    self._excludes.pop()
        self._update_lines()

    def _fig_on_key_press(self, event):
        """
        s: set split mode
        e: set exclude mode
        w: write and go to next file
        Q: quit all
        """
        if event.key == 's':
            self._mode = "split"
        elif event.key == 'e':
            self._mode = "exclude"
        elif event.key == 'w':
            self._save_as_new_files()
            try:
                self._next_file()
            except IndexError:
                print(f"All files processed.")
                exit(0)
        elif event.key == 'U':
            self._undo_file()
        elif event.key == 'Q':
            print(f"Quitting before all files have been processed!")
            exit(1)
        self._set_titles()

    def _update_lines(self):
        # print(self._splits, self._excludes)
        ymin, ymax = self._ax.get_ylim()
        if self._splits_lines is not None: self._splits_lines.remove()
        self._splits_lines = self._ax.vlines(self._splits, ymin, ymax, color="b")
        if self._excludes_lines is not None: self._excludes_lines.remove()
        self._excludes_lines = self._ax.vlines(self._excludes, ymin, ymax, color="r")
        for area in self._excludes_areas:
            area.remove()
        self._excludes_areas.clear()
        excludes = self._excludes.copy()
        if len(excludes) % 2 == 1: excludes.pop()  # only draw pairs
        excludes.sort()
        for i in range(1, len(excludes), 2):
            self._excludes_areas.append(self._ax.axvspan(excludes[i-1], excludes[i], facecolor='r', alpha=0.3))
        self._ax.set_ylim(ymin, ymax)  # reset, since margins are added to lines
        self._fig.canvas.draw()

    def _get_next_filename(self):
        if self._keep_index:
            # 5th group is index
            match = re.fullmatch(InteractiveDataSelector.re_file, self._current_file)
            assert match is not None
            basename = self._current_file[:match.start(InteractiveDataSelector.re_index_group_nr)]
        else:
            basename = self._current_file[:-4]  # extension
        index = get_next_digits(basename, self._out_dir, digits=3)
        return f"{basename}{index}.csv"

    def _save_as_new_files(self):
        # convert timestamps to their closest index
        excludes_idx = [np.abs(self._current_array[:,0] - t).argmin() for t in self._excludes]
        splits_idx = [np.abs(self._current_array[:,0] - t).argmin() for t in self._splits]
        if self.split_at_exclude:
            # split before the start of the excluded range
            splits_idx += [ excludes_idx[i]-1 for i in range(0, len(excludes_idx), 2) ]
            # split after the end of the excluded range
            splits_idx += [ excludes_idx[i]+1 for i in range(1, len(excludes_idx), 2) ]
        splits_idx = list(set(splits_idx))  # remove duplicates
        splits_idx.sort()
        df = self._current_dataframe.copy()
        # 1) remove excluded parts
        for i in range(1, len(excludes_idx), 2):
            df = df.drop(index=range(excludes_idx[i-1], excludes_idx[i]+1))
        # 2) splits
        new_frames = []
        start_i = df.index[0]
        for i in range(0, len(splits_idx)):
            end_i = splits_idx[i]
            # print(start_i, end_i)
            # check if valid start and end index
            if start_i in df.index and end_i in df.index:
                new_frames.append(df.loc[start_i:end_i])
            start_i = end_i + 1
        # append rest
        if start_i in df.index:
            new_frames.append(df.loc[start_i:])
        # 3) remove empty
        for i in reversed(range(len(new_frames))):
            if len(new_frames[i]) == 0:
                new_frames.pop(i)
        self._history.append((self._current_file, []))
        for frame in new_frames:
            filename = self._get_next_filename()
            pathname = os.path.join(self._out_dir, filename)
            # until now, frame is a copy of a slice
            frame = frame.copy()
            # transform timestamps so that first value is 0
            t_column_name = frame.columns[0]
            frame[t_column_name] -= frame.iloc[0][t_column_name]
            frame.to_csv(pathname, index=False)
            self._history[-1][1].append(pathname)
            print(f"Saved range of length {len(frame.index):04} to {pathname}")

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser("data_preprocess")
    parser.add_argument("in_dir")
    parser.add_argument("out_dir")
    parser.add_argument("-i", "--keep_index", action="store_true")
    parser.add_argument("-e", "--split_at_exclude", action="store_true")
    ns = parser.parse_args()

    selector = InteractiveDataSelector(ns.in_dir, ns.out_dir, keep_index=ns.keep_index, split_at_exclude=ns.split_at_exclude)
    selector.run()
    exit(2)  # selector should exit in _fig_on_key_press
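
As a quick sanity check of the filename convention: the match below is simply what Python's re module returns for the example path hard-coded above (a sketch, not data stored in the repo).

import re
re_file = r'\d{4}-\d{2}-\d{2}_([a-zA-Z0-9_]+)_([a-zA-Z0-9]+)_(\d+(?:\.\d+)?mm)_(\d+V)(\d+)\.csv'
m = re.fullmatch(re_file, "2023-06-28_foam_black_1_188mm_06V001.csv")
print(m.groups())  # ('foam_black', '1', '188mm', '06V', '001') -> group 5, '001', is the index used by _get_next_filename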

View File

@@ -18,8 +18,9 @@ import time
 from os import makedirs, path
 from .util.transform import ConstantInterval, Normalize
-from .util.data_loader import load_datasets, LabelConverter
+from .util.data_loader import load_datasets, LabelConverter, count_data
 from .util.split import DataSplitter
+from .util.pad import PadSequences
 from .util.settings import MLSettings
 from .rnn.rnn import RNN
 from .rnn.training import train_validate_save, select_device
@@ -41,34 +42,30 @@ def test_interpol():
 if __name__ == "__main__":
-    labels = LabelConverter(["white_foam", "glass", "Kapton", "bubble_wrap", "cloth", "black_foam"])
-    models_dir = "/home/matth/Uni/TENG/models"  # where to save models, settings and results
+    labels = LabelConverter(["foam_PDMS_white", "foam_PDMS_black", "foam_PDMS_TX100", "foam_PE", "antistatic_foil", "cardboard", "glass", "kapton", "bubble_wrap_PE", "fabric_PP", ])
+    # labels = LabelConverter(["foam_PDMS_white", "foam_PDMS_black", "foam_PDMS_TX100", "foam_PE", "kapton", "bubble_wrap_PE", "fabric_PP", ])
+    models_dir = "/home/matth/Uni/TENG/teng_2/models_gen_12"  # where to save models, settings and results
     if not path.isdir(models_dir):
         makedirs(models_dir)
-    data_dir = "/home/matth/Uni/TENG/data"
+    data_dir = "/home/matth/Uni/TENG/teng_2/sorted_data"
+    # gen_5 best options: datasplitter, not bidirectional, lr=0.001, no scheduler
+    # gen_6 best options: no glass, cardboard and antistatic_foil, not bidirectional, lr=0.0007, no datasplitter, 2 layers n_hidden = 10

     # Test with
-    num_layers = [ 3 ]
-    hidden_size = [ 8 ]
-    bidirectional = [ True ]
-    t_const_int = ConstantInterval(0.01)
-    t_norm = Normalize(0, 1)
-    transforms = [[ t_const_int ]] #, [ t_const_int, t_norm ]]
-    batch_sizes = [ 64 ] # , 16]
-    splitters = [ DataSplitter(100) ]
+    num_layers = [ 2, 3 ]
+    hidden_size = [ 21, 28 ]
+    bidirectional = [ False, True ]
+    t_const_int = ConstantInterval(0.01)  # TODO check if needed: data was taken at equal rate, but it isnt perfect -> maybe just ignore?
+    t_norm = Normalize(-1, 1)
+    transforms = [[ t_norm ]] #, [ t_norm, t_const_int ]]
+    batch_sizes = [ 4 ]
+    splitters = [ DataSplitter(50, drop_if_smaller_than=30) ]  # smallest file has length 68 TODO: try with 0.5-1second snippets
     num_epochs = [ 80 ]
+    # (epoch, min_accuracy)
+    training_cancel_points = [(15, 20), (40, 25)]
+    # training_cancel_points = []
-    # num_layers=1,
-    # hidden_size=1,
-    # bidirectional=True,
-    # optimizer=None,
-    # scheduler=None,
-    # loss_func=None,
-    # transforms=[],
-    # splitter=None,
-    # num_epochs=10,
-    # batch_size=5,
     args = [num_layers, hidden_size, bidirectional, [None], [None], [None], transforms, splitters, num_epochs, batch_sizes]
     # create settings for every possible combination
@@ -78,23 +75,26 @@ if __name__ == "__main__":
     loss_func = nn.CrossEntropyLoss()
     optimizers = [
-        lambda model: torch.optim.Adam(model.parameters(), lr=0.03),
-        # lambda model: torch.optim.Adam(model.parameters(), lr=0.25),
-        # lambda model: torch.optim.Adam(model.parameters(), lr=0.50),
+        lambda model: torch.optim.Adam(model.parameters(), lr=0.0007),
     ]
     schedulers = [
+        None,
         # lambda optimizer, st: torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9),
-        lambda optimizer, st: torch.optim.lr_scheduler.StepLR(optimizer, step_size=st.num_epochs // 10, gamma=0.40, verbose=False),
+        # lambda optimizer, st: torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.5),
+        lambda optimizer, st: torch.optim.lr_scheduler.StepLR(optimizer, step_size=st.num_epochs // 8, gamma=0.60, verbose=False),
         # lambda optimizer, st: torch.optim.lr_scheduler.StepLR(optimizer, step_size=st.num_epochs // 10, gamma=0.75, verbose=False),
     ]

+    device = select_device(force_device="cpu")  # TODO cuda is not supported because something throws NotImplementedError with my gpu
     n_total = len(settings) * len(optimizers) * len(schedulers)
-    print(f"Testing {n_total} possible configurations")
+    print(f"Testing {n_total} possible configurations, device='{device}'")

     # scheduler2 =
     def create_model(st, optimizer_f, scheduler_f):
         model=RNN(input_size=st.num_features, hidden_size=st.hidden_size, num_layers=st.num_layers, num_classes=len(labels), bidirectional=st.bidirectional)
         optimizer = optimizer_f(model)
-        scheduler = scheduler_f(optimizer, st)
+        if scheduler_f is not None:
+            scheduler = scheduler_f(optimizer, st)
+        else: scheduler = None
         return model, optimizer, scheduler

     t_begin = time.time()
@@ -103,19 +103,21 @@ if __name__ == "__main__":
         for s in range(len(schedulers)):
             for i in range(len(settings)):
                 st = settings[i]
-                # print(st.get_name())
-                train_set, test_set = load_datasets(data_dir, labels, voltage=8.2, transforms=st.transforms, split_function=st.splitter, train_to_test_ratio=0.7, random_state=42, num_workers=4)
+                train_set, test_set = load_datasets(data_dir, labels, exclude_n_object=None, voltage=None, transforms=st.transforms, split_function=st.splitter, train_to_test_ratio=0.7, random_state=123, num_workers=4)

                 generator = torch.manual_seed(42)
-                # train_loader = iter(DataLoader(train_set))
-                # test_loader = iter(DataLoader(test_set))
-                train_loader = DataLoader(train_set, batch_size=st.batch_size, shuffle=True, generator=generator)
-                test_loader = DataLoader(test_set, batch_size=st.batch_size, shuffle=True, generator=generator)
+                train_loader = DataLoader(train_set, batch_size=st.batch_size, shuffle=True, generator=generator, collate_fn=PadSequences())
+                test_loader = DataLoader(test_set, batch_size=None, shuffle=True, generator=generator)
+                # set batch_size to None and remove collate_fn for this to work
+                # count_data(train_loader, st.labels, print_summary="training data")
+                # count_data(test_loader, st.labels, print_summary="validation data")

-                print(f"Testing {n}/{n_total}: (o={o}, s={s}, i={i})")
                 model, optimizer, scheduler = create_model(st, optimizers[o], schedulers[s])
-                device = select_device(force_device="cpu")
+                print(f"Testing {n}/{n_total}: (o={o}, s={s}, i={i})")
                 try:
-                    train_validate_save(model, optimizer, scheduler, loss_func, train_loader, test_loader, st, models_dir, print_interval=1)
+                    train_validate_save(model, optimizer, scheduler, loss_func, train_loader, test_loader, st, models_dir, print_interval=1, print_continuous=True, training_cancel_points=training_cancel_points)
                 except KeyboardInterrupt:
                     if input("Cancelled current training. Quit? (q/*): ") == "q":
                         t_end = time.time()

View File

@@ -10,7 +10,7 @@ from sys import exit
 if __name__ == "__main__":
     if __package__ is None:
         # make relative imports work as described here: https://peps.python.org/pep-0366/#proposed-change
-        __package__ = "teng-ml"
+        __package__ = "teng_ml"
         import sys
         from os import path
         filepath = path.realpath(path.abspath(__file__))

View File

@@ -19,52 +19,59 @@ class RNN(nn.Module):
         self.softmax = nn.Softmax(dim=1)
         self.D = 2 if self.is_bidirectional == True else 1

-    def forward(self, x):
-        device = x.device
+    def forward(self, x, unpadded_lengths=None):
+        """
+        @param x:
+            Tensor (seq_length, features) for unbatched inputs
+            Tensor (batch_size, seq_length, features) for batch inputs
+            PackedSequence for padded batched inputs
+        @param unpadded_lengths: Tensor(batch_size) with lengths of the unpadded sequences, when using padding but without PackedSequence
+        @returns (batch_size, num_classes) with batch_size == 1 for unbatched inputs
+        """
+        # if type(x) == torch.Tensor:
+        #     device = x.device
+        #     # h0: initial hidden states
+        #     # c0: initial cell states
+        #     if len(x.shape) == 2:  # x: (seq_length, features)
+        #         h0 = torch.zeros(self.D * self.num_layers, self.hidden_size).to(device)
+        #         c0 = torch.zeros(self.D * self.num_layers, self.hidden_size).to(device)
+        #     elif len(x.shape) == 3:  # x: (batch, seq_length, features)
+        #         batch_size = x.shape[0]
+        #         h0 = torch.zeros(self.D * self.num_layers, batch_size, self.hidden_size).to(device)
+        #         c0 = torch.zeros(self.D * self.num_layers, batch_size, self.hidden_size).to(device)
+        #     else:
+        #         raise ValueError(f"RNN.forward: invalid input shape: {x.shape}. Must be (batch, seq_length, features) or (seq_length, features)")
+        # elif type(x) == nn.utils.rnn.PackedSequence:
+        #     device = x.data.device
+        #     h0 = torch.zeros(self.D * self.num_layers, self.hidden_size).to(device)
+        #     c0 = torch.zeros(self.D * self.num_layers, self.hidden_size).to(device)
+        # else:
+        #     raise ValueError(f"RNN.forward: invalid input type: {type(x)}. Must be Tensor or PackedSequence")
-        # h0: initial hidden states
-        # c0: initial cell states
-        if len(x.shape) == 2: # x: (seq_length, features)
-            h0 = torch.zeros(self.D * self.num_layers, self.hidden_size).to(device)
-            c0 = torch.zeros(self.D * self.num_layers, self.hidden_size).to(device)
-        elif len(x.shape) == 3: # x: (batch, seq_length, features)
-            batch_size = x.shape[0]
-            h0 = torch.zeros(self.D * self.num_layers, batch_size, self.hidden_size).to(device)
-            c0 = torch.zeros(self.D * self.num_layers, batch_size, self.hidden_size).to(device)
-        else:
-            raise ValueError(f"RNN.forward: invalid iput shape: {x.shape}. Must be (batch, seq_length, features) or (seq_length, features)")

         # lstm: (batch_size, seq_length, features) -> (batch_size, hidden_size)
-        out, (h_n, c_n) = self.lstm(x, (h0, c0))
-        print(f"forward: out.shape={out.shape} TODO verify comment")
-        # out: (N, L, D * hidden_size)
-        # h_n: (D * num_layers, hidden_size)
-        # c_n: (D * num_layers, hidden_size)
-        # print(f"out({out.shape})={out}")
-        # print(f"h_n({h_n.shape})={h_n}")
-        # print(f"c_n({c_n.shape})={c_n}")
+        # or: packed_sequence -> packed_sequence
+        # out, (h_n, c_n) = self.lstm(x, (h0, c0))
+        out, (h_n, c_n) = self.lstm(x)  # (h0, c0) defaults to zeros
+        # print(f"out({out.shape})=...")
+        # print(f"h_n({h_n.shape})=...")
+        # print(f"c_n({c_n.shape})=...")

-        """
-        # select only last layer [-1] -> last layer,
-        last_layer_state = h_n.view(self.num_layers, D, batch_size, self.hidden_size)[-1]
-        if D == 1:
-            # [1, batch_size, hidden_size] -> [batch_size, hidden_size]
-            X = last_layer_state.squeeze() # TODO what if batch_size == 1
-        elif D == 2:
-            h_1, h_2 = last_layer_state[0], last_layer_state[1] # states of both directions
-            # concatenate both states, X-size: (Batch, hidden_size * 2
-            X = torch.cat((h_1, h_2), dim=1)
-        else:
-            raise ValueError("D must be 1 or 2")
-        """ # all this is quivalent to line below
-        out = out[:,-1,:] # select last time step
+        # select the last state of lstm's neurons
+        if type(out) == nn.utils.rnn.PackedSequence:
+            # padding has to be considered
+            out, lengths = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
+            # the unpadded length of batch i is lengths[i], so that is the last non-zero state
+            out = torch.stack([out[i,lengths[i].item()-1,:] for i in range(len(lengths))])
+        elif unpadded_lengths is not None:
+            out = torch.stack([out[i,unpadded_lengths[i].item()-1,:] for i in range(len(unpadded_lengths))])
+        else:
+            if out.shape[0] == 3: # batched
+                out = out[:,-1,:]
+            else: # unbatched
+                # softmax requires (batch_size, *)
+                out = torch.stack([out[-1,:]])
         # fc fully connected layer: (*, hidden_size) -> (*, num_classes)
         out = self.fc(out)
-        # softmax: (*) -> (*)
+        # softmax: (batch_size, *) -> (batch_size, *)
         out = self.softmax(out)
         return out
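
Since forward() now accepts either a PackedSequence, a padded batch together with unpadded_lengths, or a plain tensor, here is a minimal standalone sketch of the padded-batch path (toy shapes and plain torch only; an illustration, not code from the repo):

import torch
import torch.nn.utils.rnn as rnn_utils

seqs = [torch.randn(length, 1) for length in (68, 50, 42)]   # 3 sequences, 1 feature each
lengths = torch.IntTensor([s.shape[0] for s in seqs])
x = rnn_utils.pad_sequence(seqs, batch_first=True)           # (3, 68, 1), zero-padded
lstm = torch.nn.LSTM(input_size=1, hidden_size=8, num_layers=2, batch_first=True)
out, _ = lstm(x)                                             # (3, 68, 8)
# take the last *unpadded* step of every sequence, like the unpadded_lengths branch above
last = torch.stack([out[i, lengths[i] - 1, :] for i in range(len(lengths))])   # (3, 8)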

View File

@@ -7,7 +7,7 @@ from torch.utils.data import DataLoader
 from ..util.settings import MLSettings
 from ..tracker.epoch_tracker import EpochTracker
 from ..util.file_io import get_next_digits
-from ..util.string import class_str
+from ..util.string import class_str, optimizer_str
 from ..util import model_io as mio
@@ -30,29 +30,20 @@
     return device

-def train(model, optimizer, scheduler, loss_func, train_loader: DataLoader, st: MLSettings, print_interval=1) -> EpochTracker:
+def train(model, optimizer, scheduler, loss_func, train_loader: DataLoader, st: MLSettings, print_interval=1, print_continuous=False, training_cancel_points=[]) -> EpochTracker:
     epoch_tracker = EpochTracker(st.labels)
     epoch_tracker.begin()
     for ep in range(st.num_epochs):
         loss = -1
-        for i, (data, y) in enumerate(train_loader):
-            # print(data, y)
+        for i, (data, lengths, y) in enumerate(train_loader):
             # data = batch, seq, features
-            # print(f"data({data.shape})={data}")
             x = data[:,:,[2]].float()  # select voltage data
             # print(f"x({x.shape}, {x.dtype})=...")
             # print(f"y({y.shape}, {y.dtype})=...")
-            # length = torch.tensor([x.shape[1] for _ in range(x.shape[0])], dtype=torch.int64)
-            # print(f"length({length.shape})={length}")
-            # batch_size = x.shape[0]
-            # print(f"batch_size={batch_size}")
-            # v = x.view(batch_size, -1, feature_count)
-            # data = rnn_utils.pack_padded_sequence(v.type(torch.FloatTensor), length, batch_first=True).to(device)[0]
-            # print(f"data({data.shape})={data}")
-            out = model(x)
-            # print(f"out({out.shape}={out})")
-            # print(f" y({y.shape}={y})")
+            # pack = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)
+            # out = model(pack) # really slow
+            out = model(x, lengths)
             with torch.no_grad():
                 predicted = torch.argmax(out, dim=1, keepdim=False)  # -> [ label_indices ]
                 correct = torch.argmax(y, dim=1, keepdim=False)  # -> [ label_indices ]
@@ -72,9 +63,19 @@ def train(model, optimizer, scheduler, loss_func, train_loader: DataLoader, st:
             # predicted = torch.max(torch.nn.functional.softmax(out), 1)[1]

         epoch_tracker.end_epoch(loss, optimizer.param_groups[0]["lr"])
-        if ep+1 % print_interval == 0:
-            print(f"Training:", epoch_tracker.get_epoch_summary_str())
-        scheduler.step()
+        if (ep+1) % print_interval == 0:
+            if print_continuous: end='\r'
+            else: end='\n'
+            print(f"Training:", epoch_tracker.get_epoch_summary_str(), end=end)
+
+        # cancel training if model is not good enough
+        if len(training_cancel_points) > 0 and ep+1 == training_cancel_points[0][0]:
+            if epoch_tracker.accuracies[-1] < training_cancel_points[0][1]:
+                print(f"Training cancelled because the models accuracy={epoch_tracker.accuracies[-1]:.2f} < {training_cancel_points[0][1]} after {ep+1} epochs.")
+                break;
+            training_cancel_points.pop(0)
+
+        if scheduler is not None:
+            scheduler.step()
     print("Training:", epoch_tracker.end())
     return epoch_tracker
@@ -85,24 +86,27 @@ def validate(model, test_loader: DataLoader, st: MLSettings) -> EpochTracker:
     with torch.no_grad():
         for i, (data, y) in enumerate(test_loader):
             # print(ep, "Test")
-            x = data[:,:,[2]].float()
+            x = data[:,[2]].float()
             out = model(x)

             predicted = torch.argmax(out, dim=1, keepdim=False)  # -> [ label_indices ]
-            correct = torch.argmax(y, dim=1, keepdim=False)  # -> [ label_indices ]
+            if y.shape[0] == 2: # batched
+                correct = torch.argmax(y, dim=1, keepdim=False)  # -> [ label_indices ]
+            else: # unbatched
+                correct = torch.argmax(y, dim=0, keepdim=True)  # -> [ label_indices ]
             epoch_tracker.add_prediction(correct, predicted)

     print("Validation:", epoch_tracker.end())
     return epoch_tracker


-def train_validate_save(model, optimizer, scheduler, loss_func, train_loader: DataLoader, test_loader: DataLoader, st: MLSettings, models_dir, print_interval=1, show_plots=False):
+def train_validate_save(model, optimizer, scheduler, loss_func, train_loader: DataLoader, test_loader: DataLoader, st: MLSettings, models_dir, print_interval=1, print_continuous=False, show_plots=False, training_cancel_points=[]):
     # assumes model and data is already on correct device
     # train_loader.to(device)
     # test_loader.to(device)

     # store optimizer, scheduler and loss_func in settings
-    st.optimizer = class_str(optimizer)
+    st.optimizer = optimizer_str(optimizer)
     st.scheduler = class_str(scheduler)
     st.loss_func = class_str(loss_func)
@@ -111,15 +115,15 @@ def train_validate_save(model, optimizer, scheduler, loss_func, train_loader: Da
     def add_tab(s):
         return "\t" + str(s).replace("\n", "\n\t")
     print(100 * '=')
-    print("Model Name:", model_name)
+    print("model name:", model_name)
     print(f"model:\n", add_tab(model))
-    # print(f"loss_func:\n", add_tab(class_str(loss_func)))
-    # print(f"optimizer:\n", add_tab(class_str(optimizer)))
-    # print(f"scheduler:\n", add_tab(class_str(scheduler)))
+    print(f"loss_func: {st.loss_func}")
+    print(f"optimizer: {st.optimizer}")
+    print(f"scheduler: {st.scheduler}")
     print(100 * '-')

-    training_tracker = train(model, optimizer, scheduler, loss_func, train_loader, st, print_interval=print_interval)
+    training_tracker = train(model, optimizer, scheduler, loss_func, train_loader, st, print_interval=print_interval, print_continuous=print_continuous, training_cancel_points=training_cancel_points)
     # print("Training: Count per label:", training_tracker.get_count_per_label())
     # print("Training: Predictions per label:", training_tracker.get_predictions_per_label())

View File

@@ -1,5 +1,7 @@
+from os import stat
 from ..util.data_loader import LabelConverter

 import matplotlib.pyplot as plt
+import matplotlib.colors as colors
 import time
 import torch
 import numpy as np
@@ -141,8 +143,7 @@
         label_names = self.labels.get_labels()

         fig, ax = plt.subplots(layout="tight")
-        im = ax.imshow(normalized_predictions, cmap='Blues') # cmap='BuPu'
+        im = ax.imshow(normalized_predictions, cmap='Blues') # cmap='BuPu', , norm=colors.PowerNorm(1./2.)
         ax.set_xticks(np.arange(N))
         ax.set_yticks(np.arange(N))
         ax.set_xticklabels(label_names)

View File

@@ -4,13 +4,15 @@ import re
 import numpy as np
 import pandas as pd
 from scipy.sparse import data
+import torch
 import threading
 from sklearn.model_selection import train_test_split

-# groups: date, name, voltage, distance, index
-re_filename = r"(\d{4}-\d{2}-\d{2})_([a-zA-Z_]+)_(\d{1,2}(?:\.\d*)?)V_(\d+(?:\.\d*)?)mm(\d+).csv"
+# groups: date, name, n_object, voltage, distance, index
+# re_filename = r"(\d{4}-\d{2}-\d{2})_([a-zA-Z_]+)_(\d{1,2}(?:\.\d*)?)V_(\d+(?:\.\d*)?)mm(\d+).csv"
+re_filename = r"(\d{4}-\d{2}-\d{2})_([a-zA-Z0-9_]+)_(\d+)_(\d{1,2}(?:\.\d*)?)V_(\d+(?:\.\d*)?)mm(\d+).csv"

 class LabelConverter:
     def __init__(self, class_labels: list[str]):
@@ -23,7 +25,13 @@ class LabelConverter:
         vec[self.class_labels.index(label)] = 1.0
         return vec

+    def get_label_index(self, one_hot: torch.Tensor):
+        """return one hot vector for given label"""
+        return int(torch.argmax(one_hot).item())
+
     def __getitem__(self, index):
+        if type(index) == torch.Tensor:
+            return self.class_labels[self.get_label_index(index)]
         return self.class_labels[index]

     def __contains__(self, value):
@@ -40,9 +48,10 @@ class LabelConverter:
 class Datasample:
-    def __init__(self, date: str, label: str, voltage: str, distance: str, index: str, label_vec, datapath: str, init_data=False):
+    def __init__(self, date: str, label: str, n_object: str, voltage: str, distance: str, index: str, label_vec, datapath: str, init_data=False):
         self.date = date
         self.label = label
+        self.n_object = int(n_object)
         self.voltage = float(voltage)
         self.distance = float(distance)
         self.index = int(index)
@@ -82,8 +91,11 @@ class Dataset:
             if split_function is None:
                 self.data.append((data, sample.label_vec))
             else:
-                for data_split in split_function(data):
-                    self.data.append((data_split, sample.label_vec))
+                try:
+                    for data_split in split_function(data):
+                        self.data.append((data_split, sample.label_vec))
+                except ValueError as e:
+                    raise ValueError(f"Exception occured during splitting of sample '{sample.datapath}': {e}")

     def apply_transforms(self, data):
         if type(self.transforms) == list:
@@ -100,36 +112,41 @@ class Dataset:
         return len(self.data)


-def get_datafiles(datadir, labels: LabelConverter, voltage=None):
+def get_datafiles(datadir, labels: LabelConverter, exclude_n_object=None, filter_voltage=None):
     """
-    get a list of all matching datafiles from datadir that are in the format: yyyy-mm-dd_label_x.xV_xxxmm.csv
+    get a list of all matching datafiles from datadir that are in the format: yyyy-mm-dd_label__n_object_x.xV_xxxmm.csv
     """
     datafiles = []
     files = listdir(datadir)
     files.sort()
     for file in files:
         match = re.fullmatch(re_filename, file)
-        if not match: continue
+        if not match:
+            print(f"get_datafiles: dropping non matching file '{file}'")
+            continue

         label = match.groups()[1]
         if label not in labels: continue

-        sample_voltage = float(match.groups()[2])
-        if voltage and voltage != sample_voltage: continue
+        sample_n_object = float(match.groups()[2])
+        if exclude_n_object and exclude_n_object == sample_n_object: continue
+        sample_voltage = float(match.groups()[3])
+        if filter_voltage and filter_voltage != sample_voltage: continue

         datafiles.append((datadir + "/" + file, match, label))
     return datafiles


-def load_datasets(datadir, labels: LabelConverter, transforms=None, split_function=None, voltage=None, train_to_test_ratio=0.7, random_state=None, num_workers=None):
+def load_datasets(datadir, labels: LabelConverter, transforms=None, split_function=None, exclude_n_object=None, voltage=None, train_to_test_ratio=0.7, random_state=None, num_workers=None):
     """
     load all data from datadir that are in the format: yyyy-mm-dd_label_x.xV_xxxmm.csv
     """
     datasamples = []

     if num_workers == None:
-        for file, match, label in get_datafiles(datadir, labels, voltage):
+        for file, match, label in get_datafiles(datadir, labels, exclude_n_object=exclude_n_object, filter_voltage=voltage):
             datasamples.append(Datasample(*match.groups(), labels.get_one_hot(label), file))
     else:
-        files = get_datafiles(datadir, labels, voltage)
+        files = get_datafiles(datadir, labels, exclude_n_object=exclude_n_object, filter_voltage=voltage)
         def worker():
             while True:
                 try:
@@ -144,10 +161,36 @@ def load_datasets(datadir, labels: LabelConverter, transforms=None, split_functi
         for t in threads:
             t.join()

     # TODO do the train_test_split after the Dataset split
     # problem: needs to be after transforms
     train_samples, test_samples = train_test_split(datasamples, train_size=train_to_test_ratio, shuffle=True, random_state=random_state)
     train_dataset = Dataset(train_samples, transforms=transforms, split_function=split_function)
     test_dataset = Dataset(test_samples, transforms=transforms, split_function=split_function)
     return train_dataset, test_dataset
+
+
+def count_data(data_loader, label_converter: LabelConverter, print_summary=False):
+    """
+    @param data_loader: unbatched data loader
+    """
+    n_sequences = 0  # count number of sequences
+    labels = [ 0 for _ in range(len(label_converter)) ]  # count number of sequences per label
+    len_data = [ 0 for _ in range(len(label_converter)) ]  # count number of datapoints per label
+    for i, (data, y) in enumerate(data_loader):
+        n_sequences = i
+        label_i = label_converter.get_label_index(y)
+        len_data[label_i] += data.shape[0]
+        labels[label_i] += 1
+    if print_summary:
+        print("=" * 50)
+        print("Dataset summary" + f" for {print_summary}:" if type(print_summary) == str else ":")
+        print(f"Number of sequences: {n_sequences}")
+        for i in range(len(label_converter)):
+            print(f"- {label_converter[i]:15}: {labels[i]:3} sequences, {len_data[i]:5} datapoints")
+    return n_sequences, labels, len_data
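
The commented-out count_data calls in the training script above hint at the intended use. A minimal self-contained sketch (the label names and the hand-built loader below are placeholders; count_data only needs an iterable of (data, one_hot) pairs):

import torch
labels = LabelConverter(["foam_PDMS_white", "glass"])   # placeholder label set
fake_loader = [(torch.zeros(50, 3), torch.tensor(labels.get_one_hot("glass"))),
               (torch.zeros(68, 3), torch.tensor(labels.get_one_hot("foam_PDMS_white")))]
n_seq, seqs_per_label, points_per_label = count_data(fake_loader, labels, print_summary="sanity check")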

14
teng_ml/util/pad.py Normal file
View File

@@ -0,0 +1,14 @@
import torch
import torch.nn.utils.rnn as rnn
import numpy as np

class PadSequences:
    def __call__(self, batch):
        # batch = [(data, label)]
        # sort by length
        sorted_batch = sorted(batch, key=lambda sample: sample[0].shape[0], reverse=True)
        sequences = [torch.Tensor(sample[0]) for sample in sorted_batch]
        labels = torch.Tensor(np.array([sample[1] for sample in sorted_batch]))
        lengths = torch.IntTensor(np.array([seq.shape[0] for seq in sequences]))
        sequences_padded = rnn.pad_sequence(sequences, batch_first=True)
        return sequences_padded, lengths, labels
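
PadSequences is wired in as the collate_fn of the training DataLoader in the script above; a small self-contained sketch with dummy numpy sequences (lengths and feature counts are made up):

import numpy as np
import torch
from torch.utils.data import DataLoader

dummy = [(np.random.rand(length, 3), np.eye(4)[i % 4]) for i, length in enumerate([50, 68, 42, 55])]
loader = DataLoader(dummy, batch_size=2, collate_fn=PadSequences())
for padded, lengths, one_hot_labels in loader:
    print(padded.shape, lengths, one_hot_labels.shape)   # first batch: torch.Size([2, 68, 3]) tensor([68, 50], dtype=torch.int32) torch.Size([2, 4])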

View File

@@ -5,8 +5,12 @@ class DataSplitter:
     Split a numpy array into smaller arrays of size datapoints_per_split
     If data.shape(0) % datapoints_per_split != 0, the remaining datapoints are dropped
     """
-    def __init__(self, datapoints_per_split):
+    def __init__(self, datapoints_per_split, drop_if_smaller_than=-1):
+        """
+        @param drop_if_smaller_than: drop the remaining datapoints if the sequence would be smaller than this value. -1 means drop_if_smaller_than=datapoints_per_split
+        """
         self.split_size = datapoints_per_split
+        self.drop_threshhold = datapoints_per_split if drop_if_smaller_than == -1 else drop_if_smaller_than

     def __call__(self, data: np.ndarray):
         """
@@ -15,6 +19,11 @@ class DataSplitter:
         ret_data = []
         for i in range(self.split_size, data.shape[0], self.split_size):
             ret_data.append(data[i-self.split_size:i, :])
+        rest_start = len(ret_data) * self.split_size
+        if len(data) - rest_start >= self.drop_threshhold:
+            ret_data.append(data[rest_start:,:])
+
         if len(ret_data) == 0:
             raise ValueError(f"data has only {data.shape[0]}, but datapoints_per_split is set to {self.split_size}")
         return ret_data
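
A small sketch of what the new remainder handling does, using the splitter settings from the training script above (array contents are dummies):

import numpy as np
splitter = DataSplitter(50, drop_if_smaller_than=30)
print([c.shape[0] for c in splitter(np.zeros((130, 3)))])   # [50, 50, 30] -- the 30-row remainder is kept
print([c.shape[0] for c in splitter(np.zeros((68, 3)))])    # [50] -- the 18-row rest is dropped (18 < 30)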

View File

@@ -13,25 +13,50 @@ def fill_and_center(s: str, fill_char="=", length=100):
     else:
         return s


 def class_str(x):
     """
     Return the constructor of the class of x with arguemnts
     """
     name = type(x).__name__
-    signature = inspect.signature(type(x))
     params = []
-    for param_name, param_value in x.__dict__.items():
-        if param_name not in signature.parameters:
-            continue
-        default_value = signature.parameters[param_name].default
-        if param_value != default_value:
-            params.append(f"{param_name}={param_value!r}")
+    try:
+        signature = inspect.signature(type(x))
+        for param_name, param_value in x.__dict__.items():
+            if param_name not in signature.parameters:
+                continue
+            default_value = signature.parameters[param_name].default
+            if param_value != default_value:
+                params.append(f"{param_name}={param_value!r}")
+    except ValueError:
+        pass
     if params:
         return f"{name}({', '.join(params)})"
     else:
         return name


+def optimizer_str(x):
+    # optimizer stores everything in 'defaults' dict and is thus not compatible with class_str
+    name = type(x).__name__
+    params = []
+    try:
+        signature = inspect.signature(type(x))
+        for param_name, param_value in x.__dict__["defaults"].items():
+            if param_name not in signature.parameters:
+                continue
+            default_value = signature.parameters[param_name].default
+            if param_value != default_value:
+                params.append(f"{param_name}={param_value!r}")
+    except ValueError:
+        pass
+    if params:
+        return f"{name}({', '.join(params)})"
+    else:
+        return name
+
+
 def cleanup_str(s):
     """
     convert to string if necessary and