# Snapshot metadata (paste header): 2025-05-02 15:48:36 +02:00 — 281 lines, 9.7 KiB, Python

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pickle
import logging
log = logging.getLogger(__name__)
from cpdctrl.utility.file_io import get_next_filename, sanitize_filename
FLUSH_TYPE = "pickle-ndarray"
FLUSH_PREFIX = "PART_"
METADATA_FILENAME = FLUSH_PREFIX + "measurement_metadata.pkl"
class CpdData:
"""
Class managing data and metadata.
Can be initialized from data directly, or a file or directory path.
"""
columns = ["idx", "t [s]", "V [V]", "LED [%]"]
def __init__(self, path:str|None=None, data:np.ndarray|None=None, metadata:dict|None=None, verbose=False):
self.data = data
if type(metadata) == dict:
self.metadata = metadata
else:
self.metadata = {}
if data is None and path is None:
raise ValueError("Either path or data must be defined.")
if data is not None and path is not None:
raise ValueError("Either path or data must be defined, but not both.")
if path is not None: # load from file
if os.path.isdir(path):
self.data, md = CpdData.load_data_from_dir(path, verbose=verbose)
self.metadata |= md
elif os.path.isfile(path):
if path.endswith(".csv"):
self.data, md = CpdData.load_data_from_csv(path)
self.metadata |= md
elif path.endswith(".pkl"):
self.data, md = CpdData.load_data_from_pkl(path)
self.metadata |= md
else:
raise NotImplementedError(f"Only .csv and .pkl files are supported")
else:
raise FileNotFoundError(f"Path '{path}' is neither a file nor a directory.")
else:
self.data = data
# Convert data
def to_dataframe(self):
df = pd.DataFrame(self.data, columns=CpdData.columns)
df.meta = str(self.metadata)
return df
def to_csv(self, sep=","):
# self.to_dataframe().to_csv(os.path.join(self.path, self.name + ".csv"), index=False, metadata=True)
return CpdData.get_csv(self.data, self.metadata, sep=sep)
def save_csv_at(self, filepath, sep=",", verbose=False):
if verbose: print(f"Writing csv to {filepath}")
log.info(f"Writing csv to {filepath}")
with open(filepath, "w") as file:
file.write(self.to_csv(sep=sep))
def save_csv(self, sep=",", verbose=False):
"""Save the csv inside the data directory"""
filepath = os.path.join(self.path, self.dirname + ".csv")
self.save_csv_at(filepath, sep, verbose)
# STATIC CONVERTER
@staticmethod
def get_csv(data, metadata, sep=","):
csv = ""
for k, v in metadata.items():
csv += f"# {k}: {v}\n"
csv += "".join(f"{colname}{sep}" for colname in CpdData.columns).strip(sep) + "\n"
for i in range(data.shape[0]):
csv += f"{i}{sep}{data[i,1]}{sep}{data[i,2]}{sep}{data[i,3]}\n"
return csv.strip("\n")
# STATIC LOADERS
@staticmethod
def load_data_from_csv(filepath:str, sep: str=",") -> tuple[np.ndarray, dict]:
"""
Loads data from a single csv file.
Lines with this format are interpreted as metadata:
# key: value
Lines with this format are interpreted as data:
index, timestamp [s], CPD [V], LED [%]
Parameters
----------
filepath
Path to the csv file.
sep
csv separator
Returns
-------
data
2D numpy array with shape (n, 4) where n is the number of data points.
metadata
Dictionary with metadata.
"""
metadata = {}
with open(filepath, "r") as f:
# this loop will read the metadata at the beginning and skip also the header row
for line in f:
if line.startswith("#"):
colon = line.find(":")
if colon == -1: # normal comment
continue
key = line[1:colon].strip()
value = line[colon+1:].strip()
metadata[key] = value
else:
break
# here, the generator has only data lines
data = np.loadtxt(f, delimiter=sep)
return data, metadata
@staticmethod
def load_data_from_pkl(filepath:str) -> tuple[np.ndarray, dict]:
"""
Loads data from a single csv file.
Lines with this format are interpreted as metadata:
# key: value
Lines with this format are interpreted as data:
index, timestamp [s], CPD [V], LED [%]
Parameters
----------
filepath
Path to the csv file.
sep
csv separator
Returns
-------
data
2D numpy array with shape (n, 4) where n is the number of data points.
metadata
Dictionary with metadata.
"""
data = None
metadata = {}
with open(filepath, "rb") as f:
obj = pickle.load(f)
if isinstance(obj, tuple):
if not len(obj) == 2:
raise ValueError(f"Pickle file is a tuple with length {len(obj)}, however it must be 2: (data, metadata)")
data = obj[0]
metadata = obj[1]
if not isinstance(data, np.ndarray):
raise ValueError(f"First object in tuple is not a numpy.ndarray")
elif isinstance(obj, np.ndarray):
data = obj
else:
raise ValueError(f"Pickled object must be either numpy.ndarray or (numpy.ndarray, dict), but is of type {type(obj)}")
# must be loaded by now
if not len(data.shape) == 2 and data.shape[1] == 4:
raise ValueError(f"numpy.ndarray has invalid shape: {data.shape}, however the shape must be (N, 4)")
if not isinstance(metadata, dict):
raise ValueError(f"Metadata is not a of type dict")
return data, metadata
@staticmethod
def load_data_from_dir(dirpath:str, verbose:bool=False) -> tuple[np.ndarray, dict]:
"""
Combines all data files with the FLUSH_PREFIX from a directory into a numpy array
Parameters
----------
dirpath : str
Path to the data directory
verbose : bool, optional
If True, print a message for every file that is opened. The default is False.
Raises
------
NotImplementedError
DESCRIPTION.
Returns
-------
data : ndarray
First index: Measurement
Second index: (index, timestamp [s], CPD [V], LED [%])
"""
files = os.listdir(dirpath)
files.sort()
data = np.empty((0, 4))
metadata = {}
for filename in files:
filepath = os.path.join(dirpath, filename)
if filename.startswith(FLUSH_PREFIX):
if filename.endswith(".csv"):
if verbose: print(f"Opening {filepath} as csv")
df = pd.read_csv(filepath)
arr = df.to_numpy()
data = np.concatenate((data, arr))
elif filename.endswith(".ndarray.pkl"):
with open(filepath, "rb") as file:
arr = pickle.load(file)
if len(arr.shape) != 2 or arr.shape[1] != 4:
print(f"Skipping file '{filepath}' with invalid array shape: {arr.shape}")
continue
data = np.concatenate((data, arr))
elif filename == METADATA_FILENAME: # Metadata filename must also start with FLUSH_PREFIX
with open(filepath, "rb") as file:
metadata = pickle.load(file)
else:
raise NotImplementedError(f"Unknown file extension for file '{filepath}'")
else:
log.info(f"Skipping unknown file: '{filepath}'")
return data, metadata
def plot_cpd_data(data: str | pd.DataFrame | np.ndarray, t: str = "seconds", title: str = "", CPD: bool = True, LED: bool = True):
    """
    Plot recorded data.

    Parameters
    ----------
    data : str or pd.DataFrame or np.ndarray
        Path to the data directory or
        numpy array with columns (idx, t [s], V [V], LED [%])
    t : str, optional
        Which timescale to use for the x axis:
        Must be one of "seconds", "minutes", "hours".
        The default is "seconds".
    title : str, optional
        Title for the plot. The default is "".
    CPD : bool, optional
        Whether to plot the voltage (CPD) line. The default is True.
    LED : bool, optional
        Whether to plot the LED state line. The default is True.

    Returns
    -------
    fig : matplotlib.figure.Figure
        Matplotlib figure object.
    """
    # Bugfix (annotation): the original `str or pd.DataFrame or np.ndarray`
    # evaluates to plain `str`; a PEP 604 union expresses the intent.
    if isinstance(data, str):
        _data, _ = CpdData.load_data_from_dir(data)
    else:
        # NOTE(review): a DataFrame input would fail at the `_data[:, 1]`
        # indexing below — presumably callers pass ndarrays; verify.
        _data = data
    fig, ax = plt.subplots()
    xdata = _data[:, 1].copy()
    xlabel = "t [s]"
    if t == "minutes":
        xdata /= 60
        xlabel = "t [minutes]"
    elif t == "hours":
        xdata /= 3600
        xlabel = "t [hours]"
    ax.set_xlabel(xlabel)
    # When both curves share the figure, the LED line gets its own y axis
    # so the percentage scale does not squash the CPD voltage scale.
    ax_cpd = ax
    ax_led = ax.twinx() if (CPD and LED) else ax
    if CPD:
        ax_cpd.set_ylabel("CPD [V]")
        ax_cpd.plot(xdata, _data[:, 2], color="blue", label="CPD")
    if LED:
        ax_led.set_ylabel("LED [%]")
        ax_led.plot(xdata, _data[:, 3], color="orange", label="LED")
        ax_led.set_ylim(-2, 102)
        ax_led.set_yticks([0, 20, 40, 60, 80, 100])
    if title:
        ax.set_title(title)
    fig.tight_layout()
    return fig