From 9aa1ffd7e03fca55dd10bc56cef9c2f7698a5d82 Mon Sep 17 00:00:00 2001
From: "matthias@arch" <matthiasqui@protonmail.com>
Date: Wed, 2 Aug 2023 10:58:15 +0200
Subject: [PATCH] implemented file saving

---
 teng_ml/data_preprocess.py | 186 +++++++++++++++++++++++++++++--------
 1 file changed, 145 insertions(+), 41 deletions(-)

diff --git a/teng_ml/data_preprocess.py b/teng_ml/data_preprocess.py
index fdd7922..d15f11f 100644
--- a/teng_ml/data_preprocess.py
+++ b/teng_ml/data_preprocess.py
@@ -5,6 +5,8 @@ import matplotlib.pyplot as plt
 from time import sleep
 from random import choice as r_choice
 from sys import exit
+import os
+import re
 
 
 if __name__ == "__main__":
@@ -18,36 +20,93 @@ if __name__ == "__main__":
 
 from .util.transform import Normalize
 from .util.data_loader import get_datafiles
+from .util.file_io import get_next_digits
 
 file = "/home/matth/Uni/TENG/teng_2/data/2023-06-28_foam_black_1_188mm_06V001.csv"
 
+
 class InteractiveDataSelector:
     """
-    Helper class for "iterating" through selected peaks.
+    Go through all .csv files in a directory, split the data and exclude sections with the mouse, then write the sections as single files into a new directory
     """
+    re_file = r'\d{4}-\d{2}-\d{2}_([a-zA-Z_]+)_([a-zA-Z0-9]+)_(\d+(?:\.\d+)?mm)_(\d+V)(\d+)\.csv'
+    re_index_group_nr = 5  # group number of the index part of the filename
-    def __init__(self, out_name, out_dir, fig, ax):
+    def __init__(self, in_dir, out_dir, keep_index=True, split_at_exclude=True):
+        """
+        @param in_dir: directory to read from; only files whose name matches `re_file` are kept, ValueError if there are none
+        @param out_dir: directory to write to; created if it does not exist, ValueError if it exists and is not empty
+        @param keep_index:
+            If True: append the split number as triple digits to the existing filename (file001.csv -> file001001.csv, file001002.csv ...)
+            Else: remove the indices from the filename before adding the split number (file001.csv -> file001.csv, file002.csv ...)
+        @param split_at_exclude:
+            If True: When excluding an area, split the data before and after the excluded zone
+            Else: remove the excluded zone and join the previous and later part
+        """
+        if os.path.isdir(out_dir):
+            if os.listdir(out_dir):
+                raise ValueError(f"'out_dir' = '{out_dir}' is not empty")
+        else:
+            os.makedirs(out_dir)
         self._out_dir = out_dir
-        self._out_name = out_name
-        self._fig = fig
-        self._ax = ax
+        self._in_dir = in_dir
+        self._in_files = os.listdir(in_dir)
+        self._in_files.sort()
+        for i in reversed(range(len(self._in_files))):  # backwards, so pop(i) does not shift the indices still to be visited
+            if not re.fullmatch(InteractiveDataSelector.re_file, self._in_files[i]):
+                print(f"Dropping non-matching file '{self._in_files[i]}'")
+                self._in_files.pop(i)
+        if not self._in_files:
+            raise ValueError(f"No matching files in 'in_dir' = '{in_dir}'")
+        self._keep_index = keep_index  # read by _get_next_filename
+        self.split_at_exclude = split_at_exclude  # read by _save_as_new_files
+
+        plt.ion()
+        self._fig, self._ax = plt.subplots()
 
         self._fig.canvas.mpl_connect("button_press_event", lambda ev: self._fig_on_button_press(ev))
         self._fig.canvas.mpl_connect("key_press_event", lambda ev: self._fig_on_key_press(ev))
 
-        self._splits_lines = None  # vlines
-        self._excludes_lines = None
-        self._excludes_areas = [] # list of areas
-
-        self._splits: list[int] = []
-        self._excludes: list[int] = []
-        self._mode = None  # split or exclude
-        self._set_mode("split")
-
     def run(self):
+        self._next_file()  # load and plot the first input file; this also initialises the selection state
         while plt.fignum_exists(self._fig.number):
             plt.pause(0.01)
 
+    def _set_titles(self):  # show the key help and the active mode as the figure title
+        help_str = "[(e)xclude, (S)plit, (w)rite]"
+        self._fig.suptitle(f"{help_str}\ncurrent mode: {self._mode}")
+
+    def _next_file(self):
+        """
+        Load the next input file, reset the selection state and plot the data.
+        Raises IndexError when there are no files left.
+        """
+        if not self._in_files:
+            raise IndexError("No more files to process")
+        self._current_file = self._in_files.pop(0)
+        self._current_dataframe = pd.read_csv(os.path.join(self._in_dir, self._current_file))
+        self._current_array = self._current_dataframe.to_numpy()
+
+        # selection state, reset before anything is drawn
+        self._splits: list[int] = []
+        self._excludes: list[int] = []
+        self._mode = "exclude"  # split or exclude
+        self._splits_lines = None  # vlines
+        self._excludes_lines = None
+        self._excludes_areas = []  # list of areas
+
+        # fresh axes: column 0 on the x-axis, column 2 on the y-axis
+        self._fig.clear()
+        self._ax = self._fig.subplots()
+        self._ax.plot(self._current_array[:,0], self._current_array[:,2])
+        self._ax.set_xlabel(self._current_file)
+        self._set_titles()
+
     def _fig_on_button_press(self, event):
+        """
+        left click: set split / exclude section (depends on mode)
+        right click: undo last action of selected mode
+        """
+        if event.xdata is None: return
         if event.xdata in self._excludes or event.xdata in self._splits: return
         if event.button == 1:  # left click, add position
             if self._mode == "split":
@@ -64,22 +123,21 @@ class InteractiveDataSelector:
         self._update_lines()
 
     def _fig_on_key_press(self, event):
+        """
+        S: split mode, e: exclude mode, w: write the sections and go to the next file
+        """
         if event.key == 'S':
-            self._set_mode("split")
-        elif event.key == 'e':
-            self._set_mode("exclude")
-
-    def _set_mode(self, mode):
-        help_str = "[(e)xclude - (S)plit]"
-        if mode == "split":
             self._mode = "split"
-            fig.suptitle(f"-> split mode {help_str}")
-        else:
+        elif event.key == 'e':
             self._mode = "exclude"
-            fig.suptitle(f"-> exclude mode {help_str}")
+        elif event.key == 'w':
+            self._save_as_new_files()
+            if self._in_files: self._next_file()  # more input left: load and plot it
+            else: plt.close(self._fig)  # all files written: closing the figure ends run()
+        self._set_titles()
 
     def _update_lines(self):
-        print(self._splits, self._excludes)
+        # print(self._splits, self._excludes)
         ymin, ymax = self._ax.get_ylim()
 
         if self._splits_lines is not None: self._splits_lines.remove()
@@ -100,28 +158,74 @@ class InteractiveDataSelector:
         self._ax.set_ylim(ymin, ymax)  # reset, since margins are added to lines
         self._fig.canvas.draw()
 
+    def _get_next_filename(self):
+        """Return '<basename><index>.csv', with the 3-digit index from get_next_digits (see `keep_index` in __init__)."""
+        if self._keep_index:
+            basename = self._current_file[:-4]  # only strip the ".csv" extension, the original index stays
+        else:
+            match = re.fullmatch(InteractiveDataSelector.re_file, self._current_file)
+            if match is None: raise ValueError(f"Unexpected filename '{self._current_file}'")
+            basename = self._current_file[:match.start(InteractiveDataSelector.re_index_group_nr)]  # cut off the index
+        index = get_next_digits(basename, self._out_dir, digits=3)
+        return f"{basename}{index}.csv"
+
     def _save_as_new_files(self):
+        """Drop the excluded ranges of the current file, cut the rest at the splits and write every section to 'out_dir'."""
+        excludes_idx = [np.abs(self._current_array[:,0] - t).argmin() for t in self._excludes]  # timestamp -> closest row index
+        splits_idx = [np.abs(self._current_array[:,0] - t).argmin() for t in self._splits]
+        if self.split_at_exclude:
+            # split before the start of the excluded range (even positions in excludes_idx are treated as range starts)
+            splits_idx += [ excludes_idx[i]-1 for i in range(0, len(excludes_idx), 2) ]
+            # split after the end of the excluded range (odd positions are treated as range ends)
+            splits_idx += [ excludes_idx[i]+1 for i in range(1, len(excludes_idx), 2) ]
+        splits_idx = list(set(splits_idx))  # remove duplicates
+        splits_idx.sort()
 
+        df = self._current_dataframe.copy()
 
+        # 1) remove excluded parts
+        for i in range(1, len(excludes_idx), 2):  # consumed as (start, end) pairs; an unpaired last entry is not dropped
+            df = df.drop(index=range(excludes_idx[i-1], excludes_idx[i]+1))  # start and end row are removed as well
+
+        # 2) splits
+        new_frames = []
+        start_i = df.index[0]
+        for i in range(0, len(splits_idx)):
+            end_i = splits_idx[i]
+            # a section runs from start_i to end_i (both included), the next one starts one row later
+            # NOTE(review): if start_i or end_i was dropped in step 1, the whole section is skipped - confirm this is intended
+            if start_i in df.index and end_i in df.index:
+                new_frames.append(df.loc[start_i:end_i])
+            start_i = end_i + 1
+        # append rest
+        if start_i in df.index:
+            new_frames.append(df.loc[start_i:])
+
+        # 3) remove empty
+        for i in reversed(range(len(new_frames))):
+            if len(new_frames[i]) == 0:
+                new_frames.pop(i)
+
+        for frame in new_frames:
+            filename = self._get_next_filename()
+            pathname = os.path.join(self._out_dir, filename)
+            # df.loc[...] returned a slice of df, copy it before modifying
+            frame = frame.copy()
+            # transform timestamps so that first value is 0 (first column is used as the time column)
+            t_column_name = frame.columns[0]
+            frame[t_column_name] -= frame.iloc[0][t_column_name]
+            frame.to_csv(pathname, index=False)
+            print(f"Saved range of length {len(frame.index):04} to {pathname}")
 
 
 if __name__ == "__main__":
-    """
-    Peak identification:
-    plot, let user choose first, second, last and lowest peak for identification
-    """
-    df = pd.read_csv(file)
-    a = df.to_numpy()
+    import argparse  # only needed when run as a script
+    parser = argparse.ArgumentParser("data_preprocess")
+    parser.add_argument("in_dir")  # directory with the input .csv files
+    parser.add_argument("out_dir")  # directory for the written sections, must be empty or not exist yet
+    parser.add_argument("--keep_index", action="store_true")  # NOTE(review): CLI default is False, the constructor default is True
+    ns = parser.parse_args()
 
-    # a2 = interpolate_to_linear_time()
-    # print(a2)
-    # exit()
-
-    vdata = Normalize(0, 1)(a[:,2])
-    plt.ion()
-    fig, ax = plt.subplots()
-    ax.plot(vdata)
-    ax.grid(True)
-    selector = InteractiveDataSelector("bla", "test", fig, ax)
+    selector = InteractiveDataSelector(ns.in_dir, ns.out_dir, ns.keep_index)  # split_at_exclude keeps its default (True)
     selector.run()