Compare commits
12 Commits
99ba4e390e
...
5895f39874
Author | SHA1 | Date | |
---|---|---|---|
|
5895f39874 | ||
|
33d1945de2 | ||
|
37bb1f444e | ||
|
61321e3919 | ||
|
1d05da3abf | ||
|
ad2e3468f7 | ||
|
577e47d03f | ||
|
ddeec83e31 | ||
|
9aa1ffd7e0 | ||
|
7ef99b5811 | ||
|
c4f90ff281 | ||
|
a479cdeda4 |
263
teng_ml/data_preprocess.py
Normal file
263
teng_ml/data_preprocess.py
Normal file
@ -0,0 +1,263 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import scipy.signal as signal
|
||||||
|
import matplotlib as mpl
|
||||||
|
mpl.use("TkAgg") # fixes focus issues for me
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from time import sleep
|
||||||
|
from random import choice as r_choice
|
||||||
|
from sys import exit
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
# Script bootstrap: when this file is executed directly (not via `python -m`),
# declare the package by hand and put the package's parent directory on
# sys.path so the relative imports that follow can be resolved.
# See https://peps.python.org/pep-0366/#proposed-change
if __name__ == "__main__" and __package__ is None:
    import sys
    from os import path

    __package__ = "teng_ml"
    filepath = path.realpath(path.abspath(__file__))
    # two levels up: <parent>/teng_ml/data_preprocess.py -> <parent>
    sys.path.insert(0, path.dirname(path.dirname(filepath)))
|
||||||
|
|
||||||
|
from .util.transform import Normalize
|
||||||
|
from .util.data_loader import get_datafiles
|
||||||
|
from .util.file_io import get_next_digits
|
||||||
|
|
||||||
|
# NOTE(review): hard-coded absolute path to a single sample recording; nothing else
# in the visible part of this module reads `file` - TODO confirm it can be removed
file = "/home/matth/Uni/TENG/teng_2/data/2023-06-28_foam_black_1_188mm_06V001.csv"
|
||||||
|
|
||||||
|
|
||||||
|
class InteractiveDataSelector:
    """
    Go through all .csv files in a directory, split the data and exclude sections with the mouse,
    then write the sections as single files into a new directory.

    Keys (handled in _fig_on_key_press): s = split mode, e = exclude mode, w = write + next file,
    U = undo last written file, Q = quit.
    Mouse (handled in _fig_on_button_press): left click adds a mark for the current mode,
    right click removes the last mark of the current mode.
    """
    re_file = r'\d{4}-\d{2}-\d{2}_([a-zA-Z0-9_]+)_([a-zA-Z0-9]+)_(\d+(?:\.\d+)?mm)_(\d+V)(\d+)\.csv'
    re_index_group_nr = 5  # group number of the index part of the filename

    def __init__(self, in_dir, out_dir, keep_index=True, split_at_exclude=True):
        """
        @param in_dir: directory that is scanned for files whose name matches `re_file`
        @param out_dir: directory the sections are written to; is created if missing, must be empty otherwise
        @param keep_index:
            If True: append the split number as triple digits to the existing filename (file001.csv -> file001001.csv, file001002.csv ...)
            Else: remove the indices from the filename before adding the split number (file001.csv -> file001.csv, file002.csv ...)
        @param split_at_exclude:
            If True: When excluding an area, split the data before and after the excluded zone
            Else: remove the excluded zone and join the previous and later part
        @raises ValueError: if out_dir is not empty or in_dir contains no matching file
        """
        if os.path.isdir(out_dir):
            if os.listdir(out_dir):
                raise ValueError(f"'out_dir' = '{out_dir}' is not empty")
        else:
            os.makedirs(out_dir)
        self._out_dir = out_dir

        self._in_dir = in_dir
        self._in_files = []
        for name in sorted(os.listdir(in_dir)):
            if re.fullmatch(InteractiveDataSelector.re_file, name):
                self._in_files.append(name)
            else:
                print(f"Dropping non-matching file '{name}'")
        if not self._in_files:
            raise ValueError(f"No matching files in 'in_dir' = '{in_dir}'")

        self._history: list[tuple[str, list]] = []  # (in_file, [out_files...])

        self._keep_index = keep_index
        self.split_at_exclude = split_at_exclude

        plt.ion()
        self._fig, self._ax = plt.subplots()

        # free the keys that are used by this tool (s is used for split, q is too close to Q).
        # rcParams is global state, so the key may already be gone if a selector was created before
        for rc_name, key in (("keymap.save", "s"), ("keymap.quit", "q")):
            if key in mpl.rcParams[rc_name]:
                mpl.rcParams[rc_name].remove(key)
        self._fig.canvas.mpl_connect("button_press_event", lambda ev: self._fig_on_button_press(ev))
        self._fig.canvas.mpl_connect("key_press_event", lambda ev: self._fig_on_key_press(ev))

    def run(self):
        """Show the first file and keep the GUI event loop running until the figure is closed."""
        self._next_file()
        while plt.fignum_exists(self._fig.number):
            plt.pause(0.01)

    def _set_titles(self):
        """Write the key help and the current mode into the figure title."""
        help_str = "[(e)xclude, (s)plit, (w)rite+next, (U)ndo last file, (Q)uit]"
        self._fig.suptitle(f"{help_str}\nuse left click to select, right click to undo\ncurret mode: {self._mode}")

    def _undo_file(self):
        """Delete the files written for the previously saved input file and show that file again."""
        if not self._history:
            print("Nothing to undo")
            return
        previous_file, written_files = self._history[-1]
        # delete written files
        for outfile in written_files:
            print(f"Deleting '{outfile}'")
            os.remove(outfile)
        # the file that is shown right now was already taken out of the queue:
        # put it back first, otherwise it would be skipped after the undo
        self._in_files.insert(0, self._current_file)
        self._in_files.insert(0, previous_file)
        self._history.pop()
        self._next_file()

    def _next_file(self):
        """
        Load the next queued input file, reset all marks and plot column 2 over column 0.
        @raises IndexError: if no input file is left
        """
        # runtime stuff
        if not self._in_files:
            raise IndexError("No more files to process")
        self._current_file = self._in_files.pop(0)
        current_path = os.path.join(self._in_dir, self._current_file)
        # the dataframe (with header) is what gets written, the plain array is used for plotting and index lookup
        self._current_dataframe = pd.read_csv(current_path)
        self._current_array = np.loadtxt(current_path, skiprows=1, delimiter=",")

        # plot stuff
        self._splits_lines = None  # vlines
        self._excludes_lines = None
        self._excludes_areas = []  # list of areas
        self._fig.clear()
        self._ax = self._fig.subplots()
        self._ax.plot(self._current_array[:, 0], self._current_array[:, 2])
        self._ax.set_xlabel(self._current_file)

        # marks are x positions (values of column 0) as reported by the mouse events
        self._splits: list[float] = []
        self._excludes: list[float] = []
        self._mode = "exclude"  # split or exclude
        self._set_titles()

    def _fig_on_button_press(self, event):
        """
        left click: set split / exclude section (depends on mode)
        right click: undo last action of selected mode
        """
        if event.xdata is None: return  # click outside of the axes
        if event.xdata in self._excludes or event.xdata in self._splits: return
        marks = self._splits if self._mode == "split" else self._excludes
        if event.button == 1:  # left click, add position
            marks.append(event.xdata)
        elif event.button == 3:  # right click, undo
            if marks:
                marks.pop()
        self._update_lines()

    def _fig_on_key_press(self, event):
        """
        s: set split mode
        e: set exclude mode
        w: write and go to next file
        U: undo the last written file
        Q: quit all
        """
        if event.key == 's':
            self._mode = "split"
        elif event.key == 'e':
            self._mode = "exclude"
        elif event.key == 'w':
            self._save_as_new_files()
            try:
                self._next_file()
            except IndexError:
                print("All files processed.")
                exit(0)
        elif event.key == 'U':
            self._undo_file()
        elif event.key == 'Q':
            print("Quitting before all files have been processed!")
            exit(1)
        self._set_titles()

    @staticmethod
    def _exclude_pairs(marks):
        """
        Turn the exclude marks into sorted (start, end) pairs.
        A trailing mark without partner (odd number of marks) is ignored.
        Used for drawing and for saving, so that exactly the shaded areas are removed.
        """
        marks = list(marks)
        if len(marks) % 2 == 1:
            marks.pop()  # last click has no partner yet
        marks.sort()
        return list(zip(marks[0::2], marks[1::2]))

    def _update_lines(self):
        """Redraw the split lines (blue), the exclude lines (red) and the shaded excluded areas."""
        ymin, ymax = self._ax.get_ylim()

        if self._splits_lines is not None: self._splits_lines.remove()
        self._splits_lines = self._ax.vlines(self._splits, ymin, ymax, color="b")

        if self._excludes_lines is not None: self._excludes_lines.remove()
        self._excludes_lines = self._ax.vlines(self._excludes, ymin, ymax, color="r")

        for area in self._excludes_areas:
            area.remove()
        self._excludes_areas.clear()
        for start, end in self._exclude_pairs(self._excludes):  # only draw pairs
            self._excludes_areas.append(self._ax.axvspan(start, end, facecolor='r', alpha=0.3))

        self._ax.set_ylim(ymin, ymax)  # reset, since margins are added to lines
        self._fig.canvas.draw()

    def _get_basename(self):
        """
        Return the part of the current filename that precedes the split number.
        keep_index=True keeps the original index, keep_index=False cuts the name in front of the index group.
        @raises ValueError: if the current filename does not match `re_file`
        """
        if self._keep_index:
            return self._current_file[:-4]  # only remove the extension
        match = re.fullmatch(InteractiveDataSelector.re_file, self._current_file)
        if match is None:
            raise ValueError(f"'{self._current_file}' does not match the expected filename pattern")
        return self._current_file[:match.start(InteractiveDataSelector.re_index_group_nr)]

    def _get_next_filename(self):
        """Return the next unused output filename (basename + three digit number + .csv)."""
        basename = self._get_basename()
        index = get_next_digits(basename, self._out_dir, digits=3)
        return f"{basename}{index}.csv"

    @staticmethod
    def _split_rows(n_rows, splits_idx, exclude_pairs, split_at_exclude):
        """
        Compute which rows end up in which output section.
        @param n_rows: number of rows of the data
        @param splits_idx: row indices after which a new section starts (the row itself stays in the old section)
        @param exclude_pairs: (first, last) row indices of the ranges to remove, both inclusive
        @param split_at_exclude: if True, the rows before and after a removed range go to different sections
        @returns list of non-empty integer arrays with the row indices of each section, in order
        """
        keep = np.ones(n_rows, dtype=bool)
        starts_section = np.zeros(n_rows + 1, dtype=bool)  # +1: a split on the last row is valid
        for split in splits_idx:
            starts_section[split + 1] = True
        for first, last in exclude_pairs:
            keep[first:last + 1] = False
            if split_at_exclude:
                starts_section[first] = True
        rows = np.flatnonzero(keep)
        if rows.size == 0:
            return []
        section_id = np.cumsum(starts_section[:n_rows])
        # cut wherever two neighbouring kept rows belong to different sections
        cuts = np.flatnonzero(np.diff(section_id[rows])) + 1
        return np.split(rows, cuts)

    def _save_as_new_files(self):
        """
        Remove the excluded ranges, split the remaining data and write every section
        as its own csv file into the output directory. The first column of every
        section is shifted so that it starts at 0. The written files are recorded in the history.
        """
        timestamps = self._current_array[:, 0]

        def closest_index(t):
            # convert timestamps to their closest index
            return int(np.abs(timestamps - t).argmin())

        splits_idx = [closest_index(t) for t in self._splits]
        exclude_pairs = [(closest_index(start), closest_index(end))
                         for start, end in self._exclude_pairs(self._excludes)]

        df = self._current_dataframe
        sections = self._split_rows(len(df), splits_idx, exclude_pairs, self.split_at_exclude)

        self._history.append((self._current_file, []))
        t_column_name = df.columns[0]
        for rows in sections:
            pathname = os.path.join(self._out_dir, self._get_next_filename())
            frame = df.iloc[rows].copy()  # copy, the loaded data must stay untouched
            # transform timestamps so that first value is 0
            frame[t_column_name] -= frame[t_column_name].iloc[0]
            frame.to_csv(pathname, index=False)
            self._history[-1][1].append(pathname)
            print(f"Saved range of length {len(frame.index):04} to {pathname}")
|
||||||
|
|
||||||
|
|
||||||
|
# Command line entry point: data_preprocess in_dir out_dir [-i] [-e]
if __name__ == "__main__":
    import argparse

    arg_parser = argparse.ArgumentParser("data_preprocess")
    for positional in ("in_dir", "out_dir"):
        arg_parser.add_argument(positional)
    # both switches are plain on/off flags that default to False
    for short_flag, long_flag in (("-i", "--keep_index"), ("-e", "--split_at_exclude")):
        arg_parser.add_argument(short_flag, long_flag, action="store_true")
    cli_args = arg_parser.parse_args()

    data_selector = InteractiveDataSelector(
        cli_args.in_dir,
        cli_args.out_dir,
        keep_index=cli_args.keep_index,
        split_at_exclude=cli_args.split_at_exclude,
    )
    data_selector.run()
    # run() only returns when the figure no longer exists;
    # the selector should exit in _fig_on_key_press
    exit(2)
|
||||||
|
|
@ -18,8 +18,9 @@ import time
|
|||||||
from os import makedirs, path
|
from os import makedirs, path
|
||||||
|
|
||||||
from .util.transform import ConstantInterval, Normalize
|
from .util.transform import ConstantInterval, Normalize
|
||||||
from .util.data_loader import load_datasets, LabelConverter
|
from .util.data_loader import load_datasets, LabelConverter, count_data
|
||||||
from .util.split import DataSplitter
|
from .util.split import DataSplitter
|
||||||
|
from .util.pad import PadSequences
|
||||||
from .util.settings import MLSettings
|
from .util.settings import MLSettings
|
||||||
from .rnn.rnn import RNN
|
from .rnn.rnn import RNN
|
||||||
from .rnn.training import train_validate_save, select_device
|
from .rnn.training import train_validate_save, select_device
|
||||||
@ -41,34 +42,30 @@ def test_interpol():
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
labels = LabelConverter(["white_foam", "glass", "Kapton", "bubble_wrap", "cloth", "black_foam"])
|
labels = LabelConverter(["foam_PDMS_white", "foam_PDMS_black", "foam_PDMS_TX100", "foam_PE", "antistatic_foil", "cardboard", "glass", "kapton", "bubble_wrap_PE", "fabric_PP", ])
|
||||||
models_dir = "/home/matth/Uni/TENG/models" # where to save models, settings and results
|
# labels = LabelConverter(["foam_PDMS_white", "foam_PDMS_black", "foam_PDMS_TX100", "foam_PE", "kapton", "bubble_wrap_PE", "fabric_PP", ])
|
||||||
|
models_dir = "/home/matth/Uni/TENG/teng_2/models_gen_12" # where to save models, settings and results
|
||||||
if not path.isdir(models_dir):
|
if not path.isdir(models_dir):
|
||||||
makedirs(models_dir)
|
makedirs(models_dir)
|
||||||
data_dir = "/home/matth/Uni/TENG/data"
|
data_dir = "/home/matth/Uni/TENG/teng_2/sorted_data"
|
||||||
|
|
||||||
|
# gen_5 best options: datasplitter, not bidirectional, lr=0.001, no scheduler
|
||||||
|
# gen_6 best options: no glass, cardboard and antistatic_foil, not bidirectional, lr=0.0007, no datasplitter, 2 layers n_hidden = 10
|
||||||
|
|
||||||
# Test with
|
# Test with
|
||||||
num_layers = [ 3 ]
|
num_layers = [ 2, 3 ]
|
||||||
hidden_size = [ 8 ]
|
hidden_size = [ 21, 28 ]
|
||||||
bidirectional = [ True ]
|
bidirectional = [ False, True ]
|
||||||
t_const_int = ConstantInterval(0.01)
|
t_const_int = ConstantInterval(0.01) # TODO check if needed: data was taken at equal rate, but it isnt perfect -> maybe just ignore?
|
||||||
t_norm = Normalize(0, 1)
|
t_norm = Normalize(-1, 1)
|
||||||
transforms = [[ t_const_int ]] #, [ t_const_int, t_norm ]]
|
transforms = [[ t_norm ]] #, [ t_norm, t_const_int ]]
|
||||||
batch_sizes = [ 64 ] # , 16]
|
batch_sizes = [ 4 ]
|
||||||
splitters = [ DataSplitter(100) ]
|
splitters = [ DataSplitter(50, drop_if_smaller_than=30) ] # smallest file has length 68 TODO: try with 0.5-1second snippets
|
||||||
num_epochs = [ 80 ]
|
num_epochs = [ 80 ]
|
||||||
|
# (epoch, min_accuracy)
|
||||||
|
training_cancel_points = [(15, 20), (40, 25)]
|
||||||
|
# training_cancel_points = []
|
||||||
|
|
||||||
# num_layers=1,
|
|
||||||
# hidden_size=1,
|
|
||||||
# bidirectional=True,
|
|
||||||
# optimizer=None,
|
|
||||||
# scheduler=None,
|
|
||||||
# loss_func=None,
|
|
||||||
# transforms=[],
|
|
||||||
# splitter=None,
|
|
||||||
# num_epochs=10,
|
|
||||||
# batch_size=5,
|
|
||||||
args = [num_layers, hidden_size, bidirectional, [None], [None], [None], transforms, splitters, num_epochs, batch_sizes]
|
args = [num_layers, hidden_size, bidirectional, [None], [None], [None], transforms, splitters, num_epochs, batch_sizes]
|
||||||
|
|
||||||
# create settings for every possible combination
|
# create settings for every possible combination
|
||||||
@ -78,23 +75,26 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
loss_func = nn.CrossEntropyLoss()
|
loss_func = nn.CrossEntropyLoss()
|
||||||
optimizers = [
|
optimizers = [
|
||||||
lambda model: torch.optim.Adam(model.parameters(), lr=0.03),
|
lambda model: torch.optim.Adam(model.parameters(), lr=0.0007),
|
||||||
# lambda model: torch.optim.Adam(model.parameters(), lr=0.25),
|
|
||||||
# lambda model: torch.optim.Adam(model.parameters(), lr=0.50),
|
|
||||||
]
|
]
|
||||||
schedulers = [
|
schedulers = [
|
||||||
|
None,
|
||||||
# lambda optimizer, st: torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9),
|
# lambda optimizer, st: torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9),
|
||||||
lambda optimizer, st: torch.optim.lr_scheduler.StepLR(optimizer, step_size=st.num_epochs // 10, gamma=0.40, verbose=False),
|
# lambda optimizer, st: torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.5),
|
||||||
|
lambda optimizer, st: torch.optim.lr_scheduler.StepLR(optimizer, step_size=st.num_epochs // 8, gamma=0.60, verbose=False),
|
||||||
# lambda optimizer, st: torch.optim.lr_scheduler.StepLR(optimizer, step_size=st.num_epochs // 10, gamma=0.75, verbose=False),
|
# lambda optimizer, st: torch.optim.lr_scheduler.StepLR(optimizer, step_size=st.num_epochs // 10, gamma=0.75, verbose=False),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
device = select_device(force_device="cpu") # TODO cuda is not supported because something throws NotImplementedError with my gpu
|
||||||
n_total = len(settings) * len(optimizers) * len(schedulers)
|
n_total = len(settings) * len(optimizers) * len(schedulers)
|
||||||
print(f"Testing {n_total} possible configurations")
|
print(f"Testing {n_total} possible configurations, device='{device}'")
|
||||||
# scheduler2 =
|
# scheduler2 =
|
||||||
def create_model(st, optimizer_f, scheduler_f):
    """
    Create a fresh RNN for the settings `st` together with its optimizer and scheduler.
    @param st: settings object, read for num_features, hidden_size, num_layers and bidirectional
    @param optimizer_f: callable (model) -> optimizer
    @param scheduler_f: callable (optimizer, st) -> scheduler, or None for no scheduler
    @returns (model, optimizer, scheduler), scheduler is None if scheduler_f is None
    """
    net = RNN(
        input_size=st.num_features,
        hidden_size=st.hidden_size,
        num_layers=st.num_layers,
        num_classes=len(labels),
        bidirectional=st.bidirectional,
    )
    opt = optimizer_f(net)
    sched = None if scheduler_f is None else scheduler_f(opt, st)
    return net, opt, sched
|
||||||
|
|
||||||
t_begin = time.time()
|
t_begin = time.time()
|
||||||
@ -103,19 +103,21 @@ if __name__ == "__main__":
|
|||||||
for s in range(len(schedulers)):
|
for s in range(len(schedulers)):
|
||||||
for i in range(len(settings)):
|
for i in range(len(settings)):
|
||||||
st = settings[i]
|
st = settings[i]
|
||||||
# print(st.get_name())
|
train_set, test_set = load_datasets(data_dir, labels, exclude_n_object=None, voltage=None, transforms=st.transforms, split_function=st.splitter, train_to_test_ratio=0.7, random_state=123, num_workers=4)
|
||||||
train_set, test_set = load_datasets(data_dir, labels, voltage=8.2, transforms=st.transforms, split_function=st.splitter, train_to_test_ratio=0.7, random_state=42, num_workers=4)
|
|
||||||
|
|
||||||
generator = torch.manual_seed(42)
|
generator = torch.manual_seed(42)
|
||||||
# train_loader = iter(DataLoader(train_set))
|
train_loader = DataLoader(train_set, batch_size=st.batch_size, shuffle=True, generator=generator, collate_fn=PadSequences())
|
||||||
# test_loader = iter(DataLoader(test_set))
|
test_loader = DataLoader(test_set, batch_size=None, shuffle=True, generator=generator)
|
||||||
train_loader = DataLoader(train_set, batch_size=st.batch_size, shuffle=True, generator=generator)
|
|
||||||
test_loader = DataLoader(test_set, batch_size=st.batch_size, shuffle=True, generator=generator)
|
# set batch_size to None and remove collate_fn for this to work
|
||||||
print(f"Testing {n}/{n_total}: (o={o}, s={s}, i={i})")
|
# count_data(train_loader, st.labels, print_summary="training data")
|
||||||
|
# count_data(test_loader, st.labels, print_summary="validation data")
|
||||||
|
|
||||||
|
|
||||||
model, optimizer, scheduler = create_model(st, optimizers[o], schedulers[s])
|
model, optimizer, scheduler = create_model(st, optimizers[o], schedulers[s])
|
||||||
device = select_device(force_device="cpu")
|
print(f"Testing {n}/{n_total}: (o={o}, s={s}, i={i})")
|
||||||
try:
|
try:
|
||||||
train_validate_save(model, optimizer, scheduler, loss_func, train_loader, test_loader, st, models_dir, print_interval=1)
|
train_validate_save(model, optimizer, scheduler, loss_func, train_loader, test_loader, st, models_dir, print_interval=1, print_continuous=True, training_cancel_points=training_cancel_points)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
if input("Cancelled current training. Quit? (q/*): ") == "q":
|
if input("Cancelled current training. Quit? (q/*): ") == "q":
|
||||||
t_end = time.time()
|
t_end = time.time()
|
||||||
|
@ -10,7 +10,7 @@ from sys import exit
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
if __package__ is None:
|
if __package__ is None:
|
||||||
# make relative imports work as described here: https://peps.python.org/pep-0366/#proposed-change
|
# make relative imports work as described here: https://peps.python.org/pep-0366/#proposed-change
|
||||||
__package__ = "teng-ml"
|
__package__ = "teng_ml"
|
||||||
import sys
|
import sys
|
||||||
from os import path
|
from os import path
|
||||||
filepath = path.realpath(path.abspath(__file__))
|
filepath = path.realpath(path.abspath(__file__))
|
||||||
|
@ -19,52 +19,59 @@ class RNN(nn.Module):
|
|||||||
self.softmax = nn.Softmax(dim=1)
|
self.softmax = nn.Softmax(dim=1)
|
||||||
self.D = 2 if self.is_bidirectional == True else 1
|
self.D = 2 if self.is_bidirectional == True else 1
|
||||||
|
|
||||||
def forward(self, x, unpadded_lengths=None):
    """
    Run the LSTM over x, take its output at the last (unpadded) time step of every sequence
    and map it to class probabilities.

    @param x:
        Tensor (seq_length, features) for unbatched inputs
        Tensor (batch_size, seq_length, features) for batch inputs
        PackedSequence for padded batched inputs
    @param unpadded_lengths: Tensor(batch_size) with lengths of the unpadded sequences, when using padding but without PackedSequence
    @returns (batch_size, num_classes) with batch_size == 1 for unbatched inputs
    """
    # lstm: (batch_size, seq_length, features) -> (batch_size, seq_length, D * hidden_size)
    # or: packed_sequence -> packed_sequence
    out, _ = self.lstm(x)  # (h0, c0) defaults to zeros

    # select the last state of lstm's neurons
    if isinstance(out, nn.utils.rnn.PackedSequence):
        # padding has to be considered
        out, lengths = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        # the unpadded length of batch i is lengths[i], so lengths[i]-1 is the last non-padded step
        last_step = torch.as_tensor(lengths, device=out.device) - 1
        out = out[torch.arange(last_step.shape[0], device=out.device), last_step]
    elif unpadded_lengths is not None:
        last_step = torch.as_tensor(unpadded_lengths, device=out.device) - 1
        out = out[torch.arange(last_step.shape[0], device=out.device), last_step]
    elif out.dim() == 3:  # batched: decide by rank, the batch size itself can be anything
        out = out[:, -1, :]
    else:  # unbatched
        # softmax requires (batch_size, *)
        out = out[-1, :].unsqueeze(0)

    # fc fully connected layer: (*, hidden_size) -> (*, num_classes)
    out = self.fc(out)

    # softmax: (batch_size, *) -> (batch_size, *)
    # NOTE(review): the training script in this changeset uses nn.CrossEntropyLoss, which is
    # documented to expect unnormalized logits - TODO confirm that applying softmax here is intended
    out = self.softmax(out)
    return out
|
||||||
|
@ -7,7 +7,7 @@ from torch.utils.data import DataLoader
|
|||||||
from ..util.settings import MLSettings
|
from ..util.settings import MLSettings
|
||||||
from ..tracker.epoch_tracker import EpochTracker
|
from ..tracker.epoch_tracker import EpochTracker
|
||||||
from ..util.file_io import get_next_digits
|
from ..util.file_io import get_next_digits
|
||||||
from ..util.string import class_str
|
from ..util.string import class_str, optimizer_str
|
||||||
|
|
||||||
from ..util import model_io as mio
|
from ..util import model_io as mio
|
||||||
|
|
||||||
@ -30,29 +30,20 @@ def select_device(force_device=None):
|
|||||||
return device
|
return device
|
||||||
|
|
||||||
|
|
||||||
def train(model, optimizer, scheduler, loss_func, train_loader: DataLoader, st: MLSettings, print_interval=1) -> EpochTracker:
|
def train(model, optimizer, scheduler, loss_func, train_loader: DataLoader, st: MLSettings, print_interval=1, print_continuous=False, training_cancel_points=[]) -> EpochTracker:
|
||||||
epoch_tracker = EpochTracker(st.labels)
|
epoch_tracker = EpochTracker(st.labels)
|
||||||
epoch_tracker.begin()
|
epoch_tracker.begin()
|
||||||
for ep in range(st.num_epochs):
|
for ep in range(st.num_epochs):
|
||||||
loss = -1
|
loss = -1
|
||||||
for i, (data, y) in enumerate(train_loader):
|
for i, (data, lengths, y) in enumerate(train_loader):
|
||||||
# print(data, y)
|
|
||||||
# data = batch, seq, features
|
# data = batch, seq, features
|
||||||
# print(f"data({data.shape})={data}")
|
|
||||||
x = data[:,:,[2]].float() # select voltage data
|
x = data[:,:,[2]].float() # select voltage data
|
||||||
# print(f"x({x.shape}, {x.dtype})=...")
|
# print(f"x({x.shape}, {x.dtype})=...")
|
||||||
# print(f"y({y.shape}, {y.dtype})=...")
|
# print(f"y({y.shape}, {y.dtype})=...")
|
||||||
# length = torch.tensor([x.shape[1] for _ in range(x.shape[0])], dtype=torch.int64)
|
# pack = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)
|
||||||
# print(f"length({length.shape})={length}")
|
# out = model(pack) # really slow
|
||||||
# batch_size = x.shape[0]
|
out = model(x, lengths)
|
||||||
# print(f"batch_size={batch_size}")
|
|
||||||
# v = x.view(batch_size, -1, feature_count)
|
|
||||||
# data = rnn_utils.pack_padded_sequence(v.type(torch.FloatTensor), length, batch_first=True).to(device)[0]
|
|
||||||
# print(f"data({data.shape})={data}")
|
|
||||||
out = model(x)
|
|
||||||
|
|
||||||
# print(f"out({out.shape}={out})")
|
|
||||||
# print(f" y({y.shape}={y})")
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
predicted = torch.argmax(out, dim=1, keepdim=False) # -> [ label_indices ]
|
predicted = torch.argmax(out, dim=1, keepdim=False) # -> [ label_indices ]
|
||||||
correct = torch.argmax(y, dim=1, keepdim=False) # -> [ label_indices ]
|
correct = torch.argmax(y, dim=1, keepdim=False) # -> [ label_indices ]
|
||||||
@ -72,8 +63,18 @@ def train(model, optimizer, scheduler, loss_func, train_loader: DataLoader, st:
|
|||||||
|
|
||||||
# predicted = torch.max(torch.nn.functional.softmax(out), 1)[1]
|
# predicted = torch.max(torch.nn.functional.softmax(out), 1)[1]
|
||||||
epoch_tracker.end_epoch(loss, optimizer.param_groups[0]["lr"])
|
epoch_tracker.end_epoch(loss, optimizer.param_groups[0]["lr"])
|
||||||
if ep+1 % print_interval == 0:
|
if (ep+1) % print_interval == 0:
|
||||||
print(f"Training:", epoch_tracker.get_epoch_summary_str())
|
if print_continuous: end='\r'
|
||||||
|
else: end='\n'
|
||||||
|
print(f"Training:", epoch_tracker.get_epoch_summary_str(), end=end)
|
||||||
|
# cancel training if model is not good enough
|
||||||
|
if len(training_cancel_points) > 0 and ep+1 == training_cancel_points[0][0]:
|
||||||
|
if epoch_tracker.accuracies[-1] < training_cancel_points[0][1]:
|
||||||
|
print(f"Training cancelled because the models accuracy={epoch_tracker.accuracies[-1]:.2f} < {training_cancel_points[0][1]} after {ep+1} epochs.")
|
||||||
|
break;
|
||||||
|
training_cancel_points.pop(0)
|
||||||
|
|
||||||
|
if scheduler is not None:
|
||||||
scheduler.step()
|
scheduler.step()
|
||||||
print("Training:", epoch_tracker.end())
|
print("Training:", epoch_tracker.end())
|
||||||
return epoch_tracker
|
return epoch_tracker
|
||||||
@ -85,24 +86,27 @@ def validate(model, test_loader: DataLoader, st: MLSettings) -> EpochTracker:
|
|||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
for i, (data, y) in enumerate(test_loader):
|
for i, (data, y) in enumerate(test_loader):
|
||||||
# print(ep, "Test")
|
# print(ep, "Test")
|
||||||
x = data[:,:,[2]].float()
|
x = data[:,[2]].float()
|
||||||
out = model(x)
|
out = model(x)
|
||||||
|
|
||||||
predicted = torch.argmax(out, dim=1, keepdim=False) # -> [ label_indices ]
|
predicted = torch.argmax(out, dim=1, keepdim=False) # -> [ label_indices ]
|
||||||
|
if y.shape[0] == 2: # batched
|
||||||
correct = torch.argmax(y, dim=1, keepdim=False) # -> [ label_indices ]
|
correct = torch.argmax(y, dim=1, keepdim=False) # -> [ label_indices ]
|
||||||
|
else: # unbatched
|
||||||
|
correct = torch.argmax(y, dim=0, keepdim=True) # -> [ label_indices ]
|
||||||
|
|
||||||
epoch_tracker.add_prediction(correct, predicted)
|
epoch_tracker.add_prediction(correct, predicted)
|
||||||
print("Validation:", epoch_tracker.end())
|
print("Validation:", epoch_tracker.end())
|
||||||
return epoch_tracker
|
return epoch_tracker
|
||||||
|
|
||||||
|
|
||||||
def train_validate_save(model, optimizer, scheduler, loss_func, train_loader: DataLoader, test_loader: DataLoader, st: MLSettings, models_dir, print_interval=1, show_plots=False):
|
def train_validate_save(model, optimizer, scheduler, loss_func, train_loader: DataLoader, test_loader: DataLoader, st: MLSettings, models_dir, print_interval=1, print_continuous=False, show_plots=False, training_cancel_points=[]):
|
||||||
# assumes model and data is already on correct device
|
# assumes model and data is already on correct device
|
||||||
# train_loader.to(device)
|
# train_loader.to(device)
|
||||||
# test_loader.to(device)
|
# test_loader.to(device)
|
||||||
|
|
||||||
# store optimizer, scheduler and loss_func in settings
|
# store optimizer, scheduler and loss_func in settings
|
||||||
st.optimizer = class_str(optimizer)
|
st.optimizer = optimizer_str(optimizer)
|
||||||
st.scheduler = class_str(scheduler)
|
st.scheduler = class_str(scheduler)
|
||||||
st.loss_func = class_str(loss_func)
|
st.loss_func = class_str(loss_func)
|
||||||
|
|
||||||
@ -111,15 +115,15 @@ def train_validate_save(model, optimizer, scheduler, loss_func, train_loader: Da
|
|||||||
def add_tab(s):
|
def add_tab(s):
|
||||||
return "\t" + str(s).replace("\n", "\n\t")
|
return "\t" + str(s).replace("\n", "\n\t")
|
||||||
print(100 * '=')
|
print(100 * '=')
|
||||||
print("Model Name:", model_name)
|
print("model name:", model_name)
|
||||||
print(f"model:\n", add_tab(model))
|
print(f"model:\n", add_tab(model))
|
||||||
# print(f"loss_func:\n", add_tab(class_str(loss_func)))
|
print(f"loss_func: {st.loss_func}")
|
||||||
# print(f"optimizer:\n", add_tab(class_str(optimizer)))
|
print(f"optimizer: {st.optimizer}")
|
||||||
# print(f"scheduler:\n", add_tab(class_str(scheduler)))
|
print(f"scheduler: {st.scheduler}")
|
||||||
|
|
||||||
|
|
||||||
print(100 * '-')
|
print(100 * '-')
|
||||||
training_tracker = train(model, optimizer, scheduler, loss_func, train_loader, st, print_interval=print_interval)
|
training_tracker = train(model, optimizer, scheduler, loss_func, train_loader, st, print_interval=print_interval, print_continuous=print_continuous, training_cancel_points=training_cancel_points)
|
||||||
# print("Training: Count per label:", training_tracker.get_count_per_label())
|
# print("Training: Count per label:", training_tracker.get_count_per_label())
|
||||||
# print("Training: Predictions per label:", training_tracker.get_predictions_per_label())
|
# print("Training: Predictions per label:", training_tracker.get_predictions_per_label())
|
||||||
|
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
|
from os import stat
|
||||||
from ..util.data_loader import LabelConverter
|
from ..util.data_loader import LabelConverter
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
import matplotlib.colors as colors
|
||||||
import time
|
import time
|
||||||
import torch
|
import torch
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@ -141,8 +143,7 @@ class EpochTracker:
|
|||||||
label_names = self.labels.get_labels()
|
label_names = self.labels.get_labels()
|
||||||
|
|
||||||
fig, ax = plt.subplots(layout="tight")
|
fig, ax = plt.subplots(layout="tight")
|
||||||
|
im = ax.imshow(normalized_predictions, cmap='Blues') # cmap='BuPu', , norm=colors.PowerNorm(1./2.)
|
||||||
im = ax.imshow(normalized_predictions, cmap='Blues') # cmap='BuPu'
|
|
||||||
ax.set_xticks(np.arange(N))
|
ax.set_xticks(np.arange(N))
|
||||||
ax.set_yticks(np.arange(N))
|
ax.set_yticks(np.arange(N))
|
||||||
ax.set_xticklabels(label_names)
|
ax.set_xticklabels(label_names)
|
||||||
|
@ -4,13 +4,15 @@ import re
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from scipy.sparse import data
|
from scipy.sparse import data
|
||||||
|
import torch
|
||||||
|
|
||||||
import threading
|
import threading
|
||||||
|
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
# groups: date, name, voltage, distance, index
|
# groups: date, name, n_object, voltage, distance, index
|
||||||
re_filename = r"(\d{4}-\d{2}-\d{2})_([a-zA-Z_]+)_(\d{1,2}(?:\.\d*)?)V_(\d+(?:\.\d*)?)mm(\d+).csv"
|
# re_filename = r"(\d{4}-\d{2}-\d{2})_([a-zA-Z_]+)_(\d{1,2}(?:\.\d*)?)V_(\d+(?:\.\d*)?)mm(\d+).csv"
|
||||||
|
re_filename = r"(\d{4}-\d{2}-\d{2})_([a-zA-Z0-9_]+)_(\d+)_(\d{1,2}(?:\.\d*)?)V_(\d+(?:\.\d*)?)mm(\d+).csv"
|
||||||
|
|
||||||
class LabelConverter:
|
class LabelConverter:
|
||||||
def __init__(self, class_labels: list[str]):
|
def __init__(self, class_labels: list[str]):
|
||||||
@ -23,7 +25,13 @@ class LabelConverter:
|
|||||||
vec[self.class_labels.index(label)] = 1.0
|
vec[self.class_labels.index(label)] = 1.0
|
||||||
return vec
|
return vec
|
||||||
|
|
||||||
|
def get_label_index(self, one_hot: torch.Tensor):
    """return the index of the label that is encoded by the given one hot vector"""
    # argmax picks the position of the largest entry, i.e. the position of the 1.0 in a one hot vector
    return int(torch.argmax(one_hot).item())
|
||||||
|
|
||||||
def __getitem__(self, index):
|
def __getitem__(self, index):
|
||||||
|
if type(index) == torch.Tensor:
|
||||||
|
return self.class_labels[self.get_label_index(index)]
|
||||||
return self.class_labels[index]
|
return self.class_labels[index]
|
||||||
|
|
||||||
def __contains__(self, value):
|
def __contains__(self, value):
|
||||||
@ -40,9 +48,10 @@ class LabelConverter:
|
|||||||
|
|
||||||
|
|
||||||
class Datasample:
|
class Datasample:
|
||||||
def __init__(self, date: str, label: str, voltage: str, distance: str, index: str, label_vec, datapath: str, init_data=False):
|
def __init__(self, date: str, label: str, n_object: str, voltage: str, distance: str, index: str, label_vec, datapath: str, init_data=False):
|
||||||
self.date = date
|
self.date = date
|
||||||
self.label = label
|
self.label = label
|
||||||
|
self.n_object = int(n_object)
|
||||||
self.voltage = float(voltage)
|
self.voltage = float(voltage)
|
||||||
self.distance = float(distance)
|
self.distance = float(distance)
|
||||||
self.index = int(index)
|
self.index = int(index)
|
||||||
@ -82,8 +91,11 @@ class Dataset:
|
|||||||
if split_function is None:
|
if split_function is None:
|
||||||
self.data.append((data, sample.label_vec))
|
self.data.append((data, sample.label_vec))
|
||||||
else:
|
else:
|
||||||
|
try:
|
||||||
for data_split in split_function(data):
|
for data_split in split_function(data):
|
||||||
self.data.append((data_split, sample.label_vec))
|
self.data.append((data_split, sample.label_vec))
|
||||||
|
except ValueError as e:
|
||||||
|
raise ValueError(f"Exception occured during splitting of sample '{sample.datapath}': {e}")
|
||||||
|
|
||||||
def apply_transforms(self, data):
|
def apply_transforms(self, data):
|
||||||
if type(self.transforms) == list:
|
if type(self.transforms) == list:
|
||||||
@ -100,36 +112,41 @@ class Dataset:
|
|||||||
return len(self.data)
|
return len(self.data)
|
||||||
|
|
||||||
|
|
||||||
def get_datafiles(datadir, labels: LabelConverter, voltage=None):
|
def get_datafiles(datadir, labels: LabelConverter, exclude_n_object=None, filter_voltage=None):
|
||||||
"""
|
"""
|
||||||
get a list of all matching datafiles from datadir that are in the format: yyyy-mm-dd_label_x.xV_xxxmm.csv
|
get a list of all matching datafiles from datadir that are in the format: yyyy-mm-dd_label__n_object_x.xV_xxxmm.csv
|
||||||
"""
|
"""
|
||||||
datafiles = []
|
datafiles = []
|
||||||
files = listdir(datadir)
|
files = listdir(datadir)
|
||||||
files.sort()
|
files.sort()
|
||||||
for file in files:
|
for file in files:
|
||||||
match = re.fullmatch(re_filename, file)
|
match = re.fullmatch(re_filename, file)
|
||||||
if not match: continue
|
if not match:
|
||||||
|
print(f"get_datafiles: dropping non matching file '{file}'")
|
||||||
|
continue
|
||||||
|
|
||||||
label = match.groups()[1]
|
label = match.groups()[1]
|
||||||
if label not in labels: continue
|
if label not in labels: continue
|
||||||
|
|
||||||
sample_voltage = float(match.groups()[2])
|
sample_n_object = float(match.groups()[2])
|
||||||
if voltage and voltage != sample_voltage: continue
|
if exclude_n_object and exclude_n_object == sample_n_object: continue
|
||||||
|
sample_voltage = float(match.groups()[3])
|
||||||
|
if filter_voltage and filter_voltage != sample_voltage: continue
|
||||||
|
|
||||||
datafiles.append((datadir + "/" + file, match, label))
|
datafiles.append((datadir + "/" + file, match, label))
|
||||||
return datafiles
|
return datafiles
|
||||||
|
|
||||||
|
|
||||||
def load_datasets(datadir, labels: LabelConverter, transforms=None, split_function=None, voltage=None, train_to_test_ratio=0.7, random_state=None, num_workers=None):
|
def load_datasets(datadir, labels: LabelConverter, transforms=None, split_function=None, exclude_n_object=None, voltage=None, train_to_test_ratio=0.7, random_state=None, num_workers=None):
|
||||||
"""
|
"""
|
||||||
load all data from datadir that are in the format: yyyy-mm-dd_label_x.xV_xxxmm.csv
|
load all data from datadir that are in the format: yyyy-mm-dd_label_x.xV_xxxmm.csv
|
||||||
"""
|
"""
|
||||||
datasamples = []
|
datasamples = []
|
||||||
if num_workers == None:
|
if num_workers == None:
|
||||||
for file, match, label in get_datafiles(datadir, labels, voltage):
|
for file, match, label in get_datafiles(datadir, labels, exclude_n_object=exclude_n_object, filter_voltage=voltage):
|
||||||
datasamples.append(Datasample(*match.groups(), labels.get_one_hot(label), file))
|
datasamples.append(Datasample(*match.groups(), labels.get_one_hot(label), file))
|
||||||
else:
|
else:
|
||||||
files = get_datafiles(datadir, labels, voltage)
|
files = get_datafiles(datadir, labels, exclude_n_object=exclude_n_object, filter_voltage=voltage)
|
||||||
def worker():
|
def worker():
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
@ -144,10 +161,36 @@ def load_datasets(datadir, labels: LabelConverter, transforms=None, split_functi
|
|||||||
for t in threads:
|
for t in threads:
|
||||||
t.join()
|
t.join()
|
||||||
|
|
||||||
|
|
||||||
# TODO do the train_test_split after the Dataset split
|
# TODO do the train_test_split after the Dataset split
|
||||||
# problem: needs to be after transforms
|
# problem: needs to be after transforms
|
||||||
train_samples, test_samples = train_test_split(datasamples, train_size=train_to_test_ratio, shuffle=True, random_state=random_state)
|
train_samples, test_samples = train_test_split(datasamples, train_size=train_to_test_ratio, shuffle=True, random_state=random_state)
|
||||||
train_dataset = Dataset(train_samples, transforms=transforms, split_function=split_function)
|
train_dataset = Dataset(train_samples, transforms=transforms, split_function=split_function)
|
||||||
test_dataset = Dataset(test_samples, transforms=transforms, split_function=split_function)
|
test_dataset = Dataset(test_samples, transforms=transforms, split_function=split_function)
|
||||||
return train_dataset, test_dataset
|
return train_dataset, test_dataset
|
||||||
|
|
||||||
|
|
||||||
|
def count_data(data_loader, label_converter: "LabelConverter", print_summary=False):
    """
    Count the sequences and datapoints per label that a data loader yields

    @param data_loader: unbatched data loader, iterating it must yield (data, y) pairs
    @param label_converter: used to turn y into a label index (get_label_index) and an index into a label name
    @param print_summary: if truthy, print a summary. If it is a string, it is used as name of the dataset in the headline
    @return (n_sequences, labels, len_data): total number of sequences, number of sequences per label index, sum of data.shape[0] per label index
    """
    n_labels = len(label_converter)
    n_sequences = 0               # count number of sequences
    labels = [0] * n_labels       # count number of sequences per label
    len_data = [0] * n_labels     # count number of datapoints per label
    for data, y in data_loader:
        # count the items themselves: using the last enumerate index would be off by one
        n_sequences += 1
        label_i = label_converter.get_label_index(y)
        len_data[label_i] += data.shape[0]
        labels[label_i] += 1

    if print_summary:
        print("=" * 50)
        # build the headline step by step: a conditional expression binds weaker than '+',
        # so a one-liner would print only ":" when print_summary is not a string
        headline = "Dataset summary"
        if isinstance(print_summary, str):
            headline += f" for {print_summary}"
        print(headline + ":")
        print(f"Number of sequences: {n_sequences}")
        for i in range(n_labels):
            print(f"- {label_converter[i]:15}: {labels[i]:3} sequences, {len_data[i]:5} datapoints")

    return n_sequences, labels, len_data
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
14
teng_ml/util/pad.py
Normal file
14
teng_ml/util/pad.py
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn.utils.rnn as rnn
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
class PadSequences:
    """
    Callable that turns a list of (data, label) samples into padded tensors
    (presumably meant as collate function for a DataLoader - TODO confirm against the caller)
    """
    def __call__(self, batch):
        """
        @param batch: [(data, label)], where data has its sequence length in shape[0]
        @return (sequences_padded, lengths, labels), all ordered from longest to shortest sequence
        """
        # longest sequence first
        ordered = list(batch)
        ordered.sort(key=lambda item: item[0].shape[0], reverse=True)

        sequences = []
        label_rows = []
        for item in ordered:
            sequences.append(torch.Tensor(item[0]))
            label_rows.append(item[1])

        labels = torch.Tensor(np.array(label_rows))
        # original (unpadded) length of every sequence
        lengths = torch.IntTensor(np.array([seq.shape[0] for seq in sequences]))
        # shorter sequences are filled up to the length of the longest one, batch dimension first
        sequences_padded = rnn.pad_sequence(sequences, batch_first=True)
        return sequences_padded, lengths, labels
|
@ -5,8 +5,12 @@ class DataSplitter:
|
|||||||
Split a numpy array into smaller arrays of size datapoints_per_split
|
Split a numpy array into smaller arrays of size datapoints_per_split
|
||||||
If data.shape(0) % datapoints_per_split != 0, the remaining datapoints are dropped
|
If data.shape(0) % datapoints_per_split != 0, the remaining datapoints are dropped
|
||||||
"""
|
"""
|
||||||
def __init__(self, datapoints_per_split):
|
def __init__(self, datapoints_per_split, drop_if_smaller_than=-1):
|
||||||
|
"""
|
||||||
|
@param drop_if_smaller_than: drop the remaining datapoints if the sequence would be smaller than this value. -1 means drop_if_smaller_than=datapoints_per_split
|
||||||
|
"""
|
||||||
self.split_size = datapoints_per_split
|
self.split_size = datapoints_per_split
|
||||||
|
self.drop_threshhold = datapoints_per_split if drop_if_smaller_than == -1 else drop_if_smaller_than
|
||||||
|
|
||||||
def __call__(self, data: np.ndarray):
|
def __call__(self, data: np.ndarray):
|
||||||
"""
|
"""
|
||||||
@ -15,6 +19,11 @@ class DataSplitter:
|
|||||||
ret_data = []
|
ret_data = []
|
||||||
for i in range(self.split_size, data.shape[0], self.split_size):
|
for i in range(self.split_size, data.shape[0], self.split_size):
|
||||||
ret_data.append(data[i-self.split_size:i, :])
|
ret_data.append(data[i-self.split_size:i, :])
|
||||||
|
|
||||||
|
rest_start = len(ret_data) * self.split_size
|
||||||
|
if len(data) - rest_start >= self.drop_threshhold:
|
||||||
|
ret_data.append(data[rest_start:,:])
|
||||||
|
|
||||||
if len(ret_data) == 0:
|
if len(ret_data) == 0:
|
||||||
raise ValueError(f"data has only {data.shape[0]}, but datapoints_per_split is set to {self.split_size}")
|
raise ValueError(f"data has only {data.shape[0]}, but datapoints_per_split is set to {self.split_size}")
|
||||||
return ret_data
|
return ret_data
|
||||||
|
@ -13,25 +13,50 @@ def fill_and_center(s: str, fill_char="=", length=100):
|
|||||||
else:
|
else:
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
||||||
def class_str(x):
|
def class_str(x):
|
||||||
"""
|
"""
|
||||||
Return the constructor of the class of x with arguemnts
|
Return the constructor of the class of x with arguemnts
|
||||||
"""
|
"""
|
||||||
name = type(x).__name__
|
name = type(x).__name__
|
||||||
signature = inspect.signature(type(x))
|
|
||||||
params = []
|
params = []
|
||||||
|
try:
|
||||||
|
signature = inspect.signature(type(x))
|
||||||
for param_name, param_value in x.__dict__.items():
|
for param_name, param_value in x.__dict__.items():
|
||||||
if param_name not in signature.parameters:
|
if param_name not in signature.parameters:
|
||||||
continue
|
continue
|
||||||
default_value = signature.parameters[param_name].default
|
default_value = signature.parameters[param_name].default
|
||||||
if param_value != default_value:
|
if param_value != default_value:
|
||||||
params.append(f"{param_name}={param_value!r}")
|
params.append(f"{param_name}={param_value!r}")
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
if params:
|
if params:
|
||||||
return f"{name}({', '.join(params)})"
|
return f"{name}({', '.join(params)})"
|
||||||
else:
|
else:
|
||||||
return name
|
return name
|
||||||
|
|
||||||
|
|
||||||
|
def optimizer_str(x):
    """
    Return the constructor of the class of optimizer x with its non-default arguments

    An optimizer stores everything in its 'defaults' dict and is thus not compatible with class_str.
    Only entries of 'defaults' that are parameters of the constructor and differ from
    the default value in the signature are listed.
    If x has no 'defaults' dict or its class has no inspectable signature, only the class name is returned.
    """
    name = type(x).__name__

    # objects without a 'defaults' dict (e.g. None) are described by their class name only
    defaults = getattr(x, "defaults", None)
    if not isinstance(defaults, dict):
        return name

    try:
        parameters = inspect.signature(type(x)).parameters
    except (ValueError, TypeError):
        # inspect can not provide a signature for this class
        return name

    params = []
    for param_name, param_value in defaults.items():
        if param_name not in parameters:
            continue
        default_value = parameters[param_name].default
        try:
            differs = bool(param_value != default_value)
        except ValueError:
            # comparison has no single truth value (e.g. array-like value): skip only this parameter
            continue
        if differs:
            params.append(f"{param_name}={param_value!r}")

    if params:
        return f"{name}({', '.join(params)})"
    return name
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def cleanup_str(s):
|
def cleanup_str(s):
|
||||||
"""
|
"""
|
||||||
convert to string if necessary and
|
convert to string if necessary and
|
||||||
|
Loading…
Reference in New Issue
Block a user