photoreflectance/prsctrl/utility/data_collector.py

import pandas as pd
import numpy as np
import os
import datetime
import pickle
import logging
from abc import abstractmethod

log = logging.getLogger(__name__)

from ..utility.file_io import get_next_filename, sanitize_filename
from ..utility.prsdata import PrsData, FLUSH_TYPE, FLUSH_PREFIX, METADATA_FILENAME

"""
Goals:
- Add data during the measurement and write it to disk in snippets
- Load data after the measurement, either from raw data (directory) or from computed data (csv)

"""

class DataCollector:
    """
    Class managing data collection and partial storage
    """
    def __init__(self,
                 data_path: str,
                 data_name: str="PRS",
                 metadata: dict[str, str]={},
                 dirname: str|None=None,
                 add_number_if_dir_exists=True,
                 data_container=list,
                 ):
        self.data_type = data_container
        self.data = data_container()
        self.full_data = None  # if loaded, this contains the final numpy array
        self.name = data_name
        self.metadata = metadata
        self.path = os.path.abspath(os.path.expanduser(data_path))
        if dirname is None:
            self.dirname = sanitize_filename(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M") + "_" + self.name)
        else:
            self.dirname = sanitize_filename(dirname)
        self.dirpath = os.path.join(self.path, self.dirname)

        if os.path.exists(self.dirpath):
            if not add_number_if_dir_exists:
                raise Exception(f"Directory '{self.dirname}' already exists. Provide a different directory or pass `add_number_if_dir_exists=True` to ignore this")
            else:
                i = 1
                dirpath = f"{self.dirpath}-{i}"
                while os.path.exists(dirpath):
                    i += 1
                    dirpath = f"{self.dirpath}-{i}"
                    print(f"Directory '{self.dirname}' already exists. Trying '{dirpath}' instead")
                self.dirpath = dirpath
        self.assert_directory_exists()
        self.flushed = False


    # OPERATION
    def clear(self):
        self.data = []
        self.full_data = None


    def assert_directory_exists(self):
        if not os.path.isdir(self.dirpath):
            os.makedirs(self.dirpath)

    def get_data(self) -> PrsData:
        """
        Load the full data and return it together with the metadata
        Returns
        -------
        tuple[np.ndarray, dict]
           The full data and the metadata
        """
        if self.full_data is None:
            self.full_data = PrsData(path=self.dirpath, metadata=self.metadata)
        return self.full_data

    def save_csv_in_dir(self, sep=",", verbose=False):
        """Save full data as csv inside the directory with temporary data"""
        self.get_data()
        filepath = os.path.join(self.dirpath, self.dirname + ".csv")
        self.full_data.save_csv_at(filepath, sep, verbose)

    def write_metadata(self):
        f"""
        Write the metadata to the disk as '{METADATA_FILENAME}'

        Returns
        -------
        None.
        """
        filepath = os.path.join(self.dirpath, METADATA_FILENAME)
        log.debug(f"Writing metadata to {filepath}")
        with open(filepath, "wb") as file:
            pickle.dump(self.metadata, file)

class PrsDataCollector(DataCollector):
    """
    Data collector that buffers raw data in a dict keyed by wavelength
    and flushes every entry into its own file
    """
    def __init__(self,
                 data_path: str,
                 data_name: str="PRS",
                 metadata: dict[str, str]|None=None,
                 dirname: str|None=None,
                 add_number_if_dir_exists=True,
        ):
        """
        Same parameters as `DataCollector.__init__`, with the data container fixed to `dict`.

        Parameters
        ----------
        data_path : str
            Parent directory in which the data directory is created.
        data_name : str, optional
            Name of the data set, also used as prefix of the flushed files. The default is "PRS".
        metadata : dict[str, str] | None, optional
            Metadata of the data set. If None, a new empty dict is used.
        dirname : str | None, optional
            Name of the data directory.
        add_number_if_dir_exists : bool, optional
            Passed on to `DataCollector.__init__`. The default is True.
        """
        # a `{}` default would be one dict shared by every instance created without metadata
        if metadata is None:
            metadata = {}
        super().__init__(data_path, data_name, metadata, dirname, add_number_if_dir_exists, dict)

    def clear(self):
        """Discard the buffered data and the cached full data"""
        # add_data() and flush() need a mapping, so recreate the configured
        # container (dict) instead of falling back to a list
        self.data = self.data_type()
        self.full_data = None

    def add_data(self, wavelength, raw):
        """
        Store the raw data of one wavelength in the buffer

        An existing entry for the same wavelength is overwritten.

        Parameters
        ----------
        wavelength
            Key under which the data is stored; also part of the flush filename.
        raw
            The raw data belonging to `wavelength`.
        """
        self.data[wavelength] = raw
        self.full_data = None  # no longer up to date

    def flush(self, verbose: bool = False):
        """
        Write the current data to a file and clear the internal data

        One file per buffered key is written into the data directory.
        The metadata is not part of these files; it is stored by `write_metadata()`.

        Parameters
        ----------
        verbose : bool, optional
            If True, print a message when flushing data. The default is False.

        Raises
        ------
        ValueError
            If the FLUSH_TYPE is invalid.

        Returns
        -------
        None.

        """
        # dont flush empty data
        if not self.data:
            return
        # validate the format before touching the disk
        if FLUSH_TYPE == "csv":
            extension = ".csv"
        elif FLUSH_TYPE == "pickle-ndarray":
            extension = ".ndarray.pkl"
        else:
            raise ValueError(f"Invalid FLUSH_TYPE: '{FLUSH_TYPE}'")
        self.assert_directory_exists()
        for key, key_data in self.data.items():
            filepath = os.path.join(self.dirpath, self._get_flush_filename(key) + extension)
            log.info(f"Flushing data to {filepath}")
            if verbose:
                print(f"Flushing data to {filepath}")
            if FLUSH_TYPE == "csv":
                df = pd.DataFrame(key_data, columns=PrsData.columns)
                # DataFrame.to_csv has no `metadata` keyword; passing one raises a TypeError
                df.to_csv(filepath, sep=",", index=False)
            else:
                with open(filepath, "wb") as file:
                    pickle.dump(np.array(key_data), file)
        self.clear()

    # File IO
    def _get_flush_filename(self, key):
        """Return the sanitized filename (without extension) for the data of `key`"""
        return sanitize_filename(self.name + "_" + str(key))