4 changed files with 24 additions and 239 deletions
--- a/teng-ml/main.py
+++ b/teng-ml/main.py
@ -1,3 +1,6 @@
 import matplotlib.pyplot as plt
 import pandas as pd
 if __name__ == "__main__":
    if __package__ is None:
        # make relative imports work as described here: https://peps.python.org/pep-0366/#proposed-change
@ -7,18 +10,10 @@ if __name__ == "__main__":
        filepath = path.realpath(path.abspath(__file__))
        sys.path.insert(0, path.dirname(path.dirname(filepath)))
 import matplotlib.pyplot as plt
 import pandas as pd
 import torch
 import torch.nn as nn
 from torch.utils.data import DataLoader
 from .util.transform import ConstantInterval
 from .util.data_loader import load_datasets, LabelConverter
-def test_interpol():
+if __name__ == "__main__":
-    file = "/home/matth/data/2023-04-27_glass_8.2V_179mm000.csv"
+    file = "/home/matth/data/2023-04-25_kapton_8.2V_179mm002.csv"
    # file = "/home/matth/data/test001.csv"
    df = pd.read_csv(file)
    array = df.to_numpy()
@ -32,104 +27,3 @@ def test_interpol():
    ax1.legend()
    plt.show()
 if __name__ == "__main__":
    # Backend preference: CUDA first, then Apple MPS, otherwise CPU.
    device = (
        "cuda"
        if torch.cuda.is_available()
        else "mps"
        if torch.backends.mps.is_available()
        else "cpu"
    )
    print(f"Using device: {device}")

    labels = LabelConverter(["foam", "glass", "kapton", "foil"])
    train_set, test_set = load_datasets("/home/matth/data", labels, voltage=8.2)

    # Keep the DataLoader objects themselves instead of a one-shot
    # iter(...): an iterator is exhausted after a single pass, so every
    # epoch after the first would see no batches at all. A DataLoader
    # hands out a fresh iterator each time it is looped over.
    train_loader = DataLoader(train_set)
    test_loader = DataLoader(test_set)
    class RNN(nn.Module):
        """(Bi)LSTM sequence classifier.

        A stacked LSTM reads the sequence; the final hidden state of the
        last layer (both directions concatenated when bidirectional) is fed
        through one linear layer that produces ``num_classes`` scores.
        """

        def __init__(self, input_size, hidden_size, num_layers, num_classes, if_bidirectional):
            super().__init__()
            self.num_layers = num_layers
            self.hidden_size = hidden_size
            self.if_bidirectional = if_bidirectional
            self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=if_bidirectional)
            # A bidirectional LSTM yields one hidden state per direction.
            num_directions = 2 if if_bidirectional else 1
            self.fc = nn.Linear(hidden_size * num_directions, num_classes)

        def forward(self, x):
            """Return scores of shape (batch, num_classes) for the batch ``x``.

            ``x`` is a PackedSequence (as built by pack_padded_sequence) or a
            batch-first padded tensor.
            """
            num_directions = 2 if self.if_bidirectional else 1
            # .to() returns a new object, so the result has to be kept.
            # Follow the model's own parameters instead of a global name.
            x = x.to(self.fc.weight.device)
            # No explicit (h0, c0): nn.LSTM starts from zeros by default.
            _, (h_n, _) = self.lstm(x)
            # h_n is (num_layers * num_directions, batch, hidden_size);
            # split the first axis and keep only the last layer.
            last_layer = h_n.view(self.num_layers, num_directions, -1, self.hidden_size)[-1]
            # Join the per-direction states along the feature axis. Unlike a
            # bare squeeze() this keeps the batch axis when batch == 1.
            features = torch.cat(tuple(last_layer), dim=1)
            return self.fc(features)  # fully-connected layer
    model = RNN(input_size=1, hidden_size=8, num_layers=3, num_classes=18, if_bidirectional=True).to(device)
    loss_func = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.02)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
    print(model)

    # pack_padded_sequence lives in torch.nn.utils.rnn. No binding of
    # `rnn_utils` is visible in this script, so import it here.
    import torch.nn.utils.rnn as rnn_utils

    # The reshaped batch is fed straight into the LSTM, so its last axis
    # has to equal the LSTM's input size.
    n_features = model.lstm.input_size

    # NOTE(review): the loops unpack each batch as ((x, y), length) and
    # compare argmax indices with y - confirm the loaders really yield
    # sequence lengths and class-index targets.

    # training
    for ep in range(40):
        train_correct = 0
        train_total = 0
        val_correct = 0
        val_total = 0

        for (x, y), length in train_loader:
            batch_size = x.shape[0]
            v = x.view(batch_size, -1, n_features)
            data = rnn_utils.pack_padded_sequence(v.type(torch.FloatTensor), length, batch_first=True).to(device)
            y = y.to(device)  # targets must sit on the same device as the scores
            out = model(data)
            loss = loss_func(out, y)

            optimizer.zero_grad()   # clear gradients for next train
            loss.backward()         # backpropagation, compute gradients
            optimizer.step()        # apply gradients

            # softmax is monotonic, so the arg-max of the raw scores is the
            # predicted class; this also avoids softmax's implicit dim.
            predicted = out.argmax(dim=1)
            train_total += y.size(0)
            train_correct += (predicted == y).sum().item()
        scheduler.step()

        # Evaluation only counts hits; no gradients are needed.
        with torch.no_grad():
            for (x, y), length in test_loader:
                batch_size = x.shape[0]
                v = x.view(batch_size, -1, n_features)
                data = rnn_utils.pack_padded_sequence(v.type(torch.FloatTensor), length, batch_first=True).to(device)
                y = y.to(device)
                out = model(data)
                predicted = out.argmax(dim=1)
                val_total += y.size(0)
                val_correct += (predicted == y).sum().item()

        print("epoch: ", ep + 1, 'Accuracy of the Train: %.2f %%' % (100 * train_correct / train_total), 'Accuracy of the Test: %.2f %%' % (100 * val_correct / val_total))
--- a/teng-ml/prepare.py
+++ b/teng-ml/prepare.py
@ -6,6 +6,7 @@ from time import sleep
 from random import choice as r_choice
 from sys import exit
 from .util.transform import Normalize
 if __name__ == "__main__":
    if __package__ is None:
@ -15,8 +16,7 @@ if __name__ == "__main__":
        from os import path
        filepath = path.realpath(path.abspath(__file__))
        sys.path.insert(0, path.dirname(path.dirname(filepath)))
-
+from .utility.data import load_dataframe
 from .util.transform import Normalize
 file = "/home/matth/data/2023-04-25_kapton_8.2V_179mm002.csv"
@ -101,7 +101,7 @@ if __name__ == "__main__":
    Peak identification:
    plot, let user choose first, second, last and lowest peak for identification
    """
-    df = pd.read_csv(file)
+    df = load_dataframe(file)
    a = df.to_numpy()
    # a2 = interpolate_to_linear_time()
--- a/teng-ml/rnn.py
+++ b/teng-ml/rnn.py
@ -1,39 +0,0 @@
 import torch
 import torch.nn as nn
 # BiLSTM Model
 class RNN(nn.Module):
  """(Bi)LSTM sequence classifier.

  A stacked LSTM reads the sequence; the final hidden state of the last
  layer (both directions concatenated when bidirectional) is fed through
  one linear layer that produces ``num_classes`` scores.
  """

  def __init__(self, input_size, hidden_size, num_layers, num_classes, if_bidirectional):
    super().__init__()
    self.num_layers = num_layers
    self.hidden_size = hidden_size
    self.if_bidirectional = if_bidirectional
    self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=if_bidirectional)
    # A bidirectional LSTM yields one hidden state per direction.
    num_directions = 2 if if_bidirectional else 1
    self.fc = nn.Linear(hidden_size * num_directions, num_classes)

  def forward(self, x):
    """Return scores of shape (batch, num_classes) for the batch ``x``.

    ``x`` is a PackedSequence (as built by pack_padded_sequence) or a
    batch-first padded tensor.
    """
    num_directions = 2 if self.if_bidirectional else 1
    # This module defines no `device` name, so follow the model's own
    # parameters. .to() returns a new object, so the result must be kept.
    x = x.to(self.fc.weight.device)
    # No explicit (h0, c0): nn.LSTM starts from zeros by default.
    _, (h_n, _) = self.lstm(x)
    # h_n is (num_layers * num_directions, batch, hidden_size);
    # split the first axis and keep only the last layer.
    last_layer = h_n.view(self.num_layers, num_directions, -1, self.hidden_size)[-1]
    # Join the per-direction states along the feature axis. Unlike a bare
    # squeeze() this keeps the batch axis when batch == 1.
    features = torch.cat(tuple(last_layer), dim=1)
    return self.fc(features)  # fully-connected layer
--- a/teng-ml/util/data_loader.py
+++ b/teng-ml/util/data_loader.py
@ -1,95 +1,25 @@
-from os import path, listdir
+def load_data():
-import re
+    # Build the category_lines dictionary, a list of names per language
-import numpy as np
+    category_lines = {}
-import pandas as pd
+    all_categories = []
-from sklearn.model_selection import train_test_split
+    def find_files(path):
        return glob.glob(path)
-# groups: date, name, voltage, distance, index
+    # Read a file and split into lines
-re_filename = r"(\d{4}-\d{2}-\d{2})_([a-zA-Z]+)_(\d{1,2}(?:\.\d*)?)V_(\d+(?:\.\d*)?)mm(\d+).csv"
+    def read_lines(filename):
        lines = io.open(filename, encoding='utf-8').read().strip().split('\n')
        return [unicode_to_ascii(line) for line in lines]
-class LabelConverter:
+    for filename in find_files('data/names/*.txt'):
-    def __init__(self, class_labels):
+        category = os.path.splitext(os.path.basename(filename))[0]
-        self.class_labels = class_labels.copy()
+        all_categories.append(category)
        self.class_labels.sort()
-    def get_one_hot(self, label):
+        lines = read_lines(filename)
-        """return one hot vector for given label"""
+        category_lines[category] = lines
        vec = np.zeros(len(self.class_labels), dtype=np.float32)
        vec[self.class_labels.index(label)] = 1.0
        return vec
-    def __getitem__(self, index):
+    return category_lines, all_categories
        return self.class_labels[index]
    def __contains__(self, value):
        return value in self.class_labels
    def get_labels(self):
        return self.class_labels.copy()
 class Datasample:
    """One recorded sample: metadata plus the path of its CSV file.

    The CSV is read on the first call to get_data() and then kept in
    ``self.data``.
    """
    def __init__(self, date: str, label: str, voltage: str, distance: str, index: str, label_vec, datapath: str):
        self.date = date
        self.label = label
        # voltage, distance and index arrive as strings and are converted here
        self.voltage = float(voltage)
        self.distance = float(distance)
        self.index = int(index)
        self.label_vec = label_vec
        self.datapath = datapath
        self.data = None  # filled by _load_data()

    def __repr__(self):
        # Compare against None explicitly: the truth value of a loaded
        # numpy array with more than one element raises ValueError.
        size = self.data.size if self.data is not None else "Unknown"
        return f"{self.label}-{self.index}: dimension={size}, recorded at {self.date} with U={self.voltage}V, d={self.distance}mm"

    def _load_data(self):
        """Read the CSV at ``self.datapath`` into ``self.data`` as a numpy array."""
        df = pd.read_csv(self.datapath)
        self.data = df.to_numpy()

    def get_data(self):
        """[[timestamps, idata, vdata]]"""
        # `is None` rather than `not self.data`: once the array is loaded,
        # its truth value is ambiguous and a second call would raise.
        if self.data is None:
            self._load_data()
        return self.data
 class Dataset:
    """
    Indexable collection of data samples, compatible with torch.data.Dataloader.

    Item ``i`` is the pair ``(data, label_vec)`` of the i-th sample.
    """
    def __init__(self, datasamples):
        # Only the sample objects are stored; their data is requested
        # through get_data() each time an item is accessed.
        self.datasamples = datasamples

    def __getitem__(self, index):
        sample = self.datasamples[index]
        data = sample.get_data()
        return data, sample.label_vec

    def __len__(self):
        return len(self.datasamples)
 def load_datasets(datadir, labels: LabelConverter, voltage=None, train_to_test_ratio=0.7, random_state=None):
    """
    Load all samples from datadir whose file name matches re_filename,
    i.e. yyyy-mm-dd_label_x.xV_xxxmmNNN.csv (date, label, voltage,
    distance, index), and split them into a train and a test dataset.

    datadir: directory that is scanned (not recursively)
    labels: only files whose label is contained in `labels` are used
    voltage: if not None, only files recorded at exactly this voltage are used
    train_to_test_ratio: passed to train_test_split as train_size
    random_state: passed to train_test_split for a reproducible shuffle

    Returns (train_dataset, test_dataset).
    """
    datasamples = []
    # sorted so the sample order does not depend on the listing order
    for file in sorted(listdir(datadir)):
        match = re.fullmatch(re_filename, file)
        if not match:
            continue

        label = match.group(2)
        if label not in labels:
            continue

        # `is not None` so that an explicitly requested 0 still filters
        if voltage is not None and voltage != float(match.group(3)):
            continue

        datasamples.append(Datasample(*match.groups(), labels.get_one_hot(label), path.join(datadir, file)))

    train_samples, test_samples = train_test_split(datasamples, train_size=train_to_test_ratio, shuffle=True, random_state=random_state)
    return Dataset(train_samples), Dataset(test_samples)