# photoreflectance/prsctl/utility/data_collector.py

import pandas as pd
import numpy as np
import os
import datetime
import pickle
import logging

from cpdctrl.utility.file_io import get_next_filename, sanitize_filename
from cpdctrl.utility.data import CpdData, FLUSH_TYPE, FLUSH_PREFIX, METADATA_FILENAME

log = logging.getLogger(__name__)


class DataCollector:
    """
    Manages data collection and partial storage:
    measurement points are buffered in memory and periodically
    flushed to files in a dedicated data directory.
    """
    def __init__(self,
                 data_path: str,
                 data_name: str = "CPData",
                 metadata: dict[str, str] | None = None,
                 dirname: str | None = None,
                 add_number_if_dir_exists=True,
                 ):
        self.data = []
        self.cpd_data = None  # cached CpdData object, built on demand by get_data()
        self.name = data_name
        self.metadata = metadata if metadata is not None else {}
        self.path = os.path.abspath(os.path.expanduser(data_path))
        if dirname is None:
            self.dirname = sanitize_filename(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M") + "_" + self.name)
        else:
            self.dirname = sanitize_filename(dirname)
        self.dirpath = os.path.join(self.path, self.dirname)
        if os.path.exists(self.dirpath):
            if not add_number_if_dir_exists:
                raise Exception(f"Directory '{self.dirname}' already exists. Provide a different directory or pass `add_number_if_dir_exists=True` to ignore this")
            else:
                # append an increasing number until a free directory name is found
                i = 1
                dirpath = f"{self.dirpath}-{i}"
                while os.path.exists(dirpath):
                    i += 1
                    dirpath = f"{self.dirpath}-{i}"
                print(f"Directory '{self.dirname}' already exists. Trying '{dirpath}' instead")
                self.dirpath = dirpath
        self.assert_directory_exists()
        self.flushed = False
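
    # On disk, self.dirpath ends up holding:
    #   - the numbered partial files written by flush()
    #   - the pickled metadata dict written by write_metadata() (METADATA_FILENAME)
    #   - optionally one combined CSV written by save_csv_in_dir()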

    # OPERATION
    def clear(self):
        """Discard all in-memory data and the cached CpdData object."""
        self.data = []
        self.cpd_data = None

    def add_data(self, i, t, v, l):
        """Append a single sample (stored as the tuple (i, t, v, l)) and invalidate the cached CpdData."""
        self.data.append((i, t, v, l))
        self.cpd_data = None  # no longer up to date

    def flush(self, verbose: bool = False):
        """
        Write the current data to a file and clear the internal data.

        Parameters
        ----------
        verbose : bool, optional
            If True, print a message when flushing data. The default is False.

        Raises
        ------
        ValueError
            If the FLUSH_TYPE is invalid.

        Returns
        -------
        None.
        """
        # don't flush empty data
        if len(self.data) == 0:
            return
        self.assert_directory_exists()
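        # FLUSH_TYPE comes from cpdctrl.utility.data; the two formats handled
        # below are "csv" (one CSV table per flush) and "pickle-ndarray"
        # (the buffered tuples pickled as a numpy array).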
        if FLUSH_TYPE == "csv":
            filename = self._get_flush_filename() + ".csv"
            filepath = os.path.join(self.dirpath, filename)
            log.info(f"Flushing data to {filepath}")
            if verbose: print(f"Flushing data to {filepath}")
            df = pd.DataFrame(self.data, columns=CpdData.columns)
            # to_csv() cannot embed the metadata dict in the file;
            # the metadata is stored separately by write_metadata()
            df.to_csv(filepath, sep=",", index=False)
        elif FLUSH_TYPE == "pickle-ndarray":
            filename = self._get_flush_filename() + ".ndarray.pkl"
            filepath = os.path.join(self.dirpath, filename)
            log.info(f"Flushing data to {filepath}")
            if verbose: print(f"Flushing data to {filepath}")
            with open(filepath, "wb") as file:
                pickle.dump(np.array(self.data), file)
        else:
            raise ValueError(f"Invalid FLUSH_TYPE: '{FLUSH_TYPE}'")
        self.clear()

    # File IO
    def _get_flush_filename(self):
        """Get the filename of the next partial file, incrementing the number every time"""
        return sanitize_filename(get_next_filename(FLUSH_PREFIX + self.name, self.dirpath, digits=5))

    def assert_directory_exists(self):
        """Create the data directory `self.dirpath` if it does not exist yet."""
        if not os.path.isdir(self.dirpath):
            os.makedirs(self.dirpath)

    def get_data(self) -> CpdData:
        """
        Load the full data from the data directory and cache it.

        Returns
        -------
        CpdData
            The full data together with the metadata.
        """
        if self.cpd_data is None:
            self.cpd_data = CpdData(path=self.dirpath, metadata=self.metadata)
        return self.cpd_data

    def save_csv_in_dir(self, sep=",", verbose=False):
        """Save the full data as a single csv file inside the data directory, next to the partial files."""
        data = self.get_data()
        filepath = os.path.join(self.dirpath, self.dirname + ".csv")
        data.save_csv_at(filepath, sep, verbose)

    def write_metadata(self):
        """
        Pickle the metadata dict to the data directory as METADATA_FILENAME.

        Returns
        -------
        None.
        """
        filepath = os.path.join(self.dirpath, METADATA_FILENAME)
        log.debug(f"Writing metadata to {filepath}")
        with open(filepath, "wb") as file:
            pickle.dump(self.metadata, file)
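

if __name__ == "__main__":
    # Minimal usage sketch: the directory, name, metadata and sample values
    # below are made up, and the meaning of the add_data() arguments is
    # defined by CpdData.columns.
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        dc = DataCollector(tmp, data_name="TestRun", metadata={"sample": "A1"})
        dc.add_data(0, 0.0, 1.23e-3, 50)  # one buffered sample
        dc.flush(verbose=True)            # writes a partial file and clears the buffer
        dc.write_metadata()               # pickles the metadata dict
        # dc.get_data() would combine everything in dc.dirpath into a CpdData object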