BREAKING: Change flush filenames
Improve load_csv performance using numpy builtin

parent 51dd329bfc
commit c955d7a8f9
@@ -9,7 +9,8 @@ log = logging.getLogger(__name__)
 
 from cpdctrl.utility.file_io import get_next_filename, sanitize_filename
 FLUSH_TYPE = "pickle-ndarray"
-METADATA_FILENAME = "_measurement_metadata.pkl"
+FLUSH_PREFIX = "PART_"
+METADATA_FILENAME = FLUSH_PREFIX + "measurement_metadata.pkl"
 
 
 class DataCollector:
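Note on the breaking change: flushed part files and the metadata pickle now share FLUSH_PREFIX, so directories written before this commit (unprefixed part files, "_measurement_metadata.pkl") are no longer recognized by the directory loader further down. A minimal sketch of the naming this implies; the exact counter format of get_next_filename is an assumption here:

# Sketch only: assumed filenames for a measurement called "sample1",
# with zero-padding from digits=5.
FLUSH_PREFIX = "PART_"
name = "sample1"
print(FLUSH_PREFIX + "measurement_metadata.pkl")    # PART_measurement_metadata.pkl
print(f"{FLUSH_PREFIX}{name}_{0:05d}.ndarray.pkl")  # PART_sample1_00000.ndarray.pkl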
@@ -47,28 +48,16 @@ class DataCollector:
         self.flushed = False
 
 
-    def _get_filename(self):
-        return sanitize_filename(get_next_filename(self.name, self.dirpath, digits=5))
-
-    def write_metadata(self):
-        f"""
-        Write the metadata to the disk as '{METADATA_FILENAME}'
-
-        Returns
-        -------
-        None.
-        """
-        filepath = os.path.join(self.dirpath, METADATA_FILENAME)
-        log.debug(f"Writing metadata to {filepath}")
-        with open(filepath, "wb") as file:
-            pickle.dump(self.metadata, file)
-
-    def assert_directory_exists(self):
-        if not os.path.isdir(self.dirpath):
-            os.makedirs(self.dirpath)
-
-
-    def flush(self, verbose:bool=False):
+    # OPERATION
+    def clear(self):
+        self.data = []
+        self.fulldata = None
+
+    def add_data(self, i, t, v, l):
+        self.data.append((i, t, v, l))
+        self.fulldata = None  # no longer up to date
+
+    def flush(self, verbose: bool = False):
         """
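The methods grouped under # OPERATION buffer rows in memory; flush() then serializes the buffer to the next part file and clear() drops it. A standalone sketch of that flush-to-pickle pattern, with all paths and names assumed rather than taken from the diff:

import os, pickle
import numpy as np

buffer = [(0, 0.0, 0.42, 1.0), (1, 0.1, 0.43, 1.0)]  # (i, t, v, l) rows
dirpath = "/tmp/cpd_demo"
os.makedirs(dirpath, exist_ok=True)                   # assert_directory_exists
with open(os.path.join(dirpath, "PART_demo_00000.ndarray.pkl"), "wb") as file:
    pickle.dump(np.array(buffer), file)               # flush as an (N, 4) ndarray
buffer.clear()                                        # clear() after flushing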
@@ -92,13 +81,13 @@ class DataCollector:
             return
         self.assert_directory_exists()
         if FLUSH_TYPE == "csv":
-            filename = self._get_filename() + ".csv"
+            filename = self._get_flush_filename() + ".csv"
             filepath = os.path.join(self.dirpath, filename)
             log.info(f"Flushing data to {filepath}")
             if verbose: print(f"Flushing data to {filepath}")
             self.to_dataframe().to_csv(filepath, sep=",", index=False, metadata=True)
         elif FLUSH_TYPE == "pickle-ndarray":
-            filename = self._get_filename() + ".ndarray.pkl"
+            filename = self._get_flush_filename() + ".ndarray.pkl"
             filepath = os.path.join(self.dirpath, filename)
             log.info(f"Flushing data to {filepath}")
             if verbose: print(f"Flushing data to {filepath}")
@@ -108,14 +97,7 @@ class DataCollector:
             raise ValueError(f"Invalid FLUSH_TYPE: '{FLUSH_TYPE}'")
         self.clear()
 
-    def clear(self):
-        self.data = []
-        self.fulldata = None
-
-    def add_data(self, i, t, v, l):
-        self.data.append((i, t, v, l))
-        self.fulldata = None  # no longer up to date
-
+    # Convert data
     def to_dataframe(self):
         df = pd.DataFrame(self.data, columns=DataCollector.columns)
         df.meta = str(self.metadata)
@@ -126,13 +108,6 @@ class DataCollector:
         data, metadata = self.get_data()
         return DataCollector.get_csv(data, self.metadata, sep=sep)
 
-    def save_csv(self, sep=",", verbose=False):
-        filepath = os.path.join(self.path, self.dirname + ".csv")
-        if verbose: print(f"Writing csv to {filepath}")
-        log.info(f"Writing csv to {filepath}")
-        with open(filepath, "w") as file:
-            file.write(self.to_csv(sep=sep))
-
     def get_data(self) -> tuple[np.ndarray, dict]:
         """
         Load the full data and return it together with the metadata
@@ -146,6 +121,46 @@ class DataCollector:
             self.metadata |= new_mdata
         return self.fulldata, self.metadata
 
+    # File IO
+    def _get_flush_filename(self):
+        """Get the filename of the next partial file, incrementing the number every time"""
+        return sanitize_filename(get_next_filename(FLUSH_PREFIX + self.name, self.dirpath, digits=5))
+
+    def assert_directory_exists(self):
+        if not os.path.isdir(self.dirpath):
+            os.makedirs(self.dirpath)
+
+    def save_csv_at(self, filepath, sep=",", verbose=False):
+        if verbose: print(f"Writing csv to {filepath}")
+        log.info(f"Writing csv to {filepath}")
+        with open(filepath, "w") as file:
+            file.write(self.to_csv(sep=sep))
+
+    def save_csv(self, sep=",", verbose=False):
+        """Save the csv inside the data directory"""
+        filepath = os.path.join(self.path, self.dirname + ".csv")
+        self.save_csv_at(filepath, sep, verbose)
+
+    def save_csv_in_dir(self, sep=",", verbose=False):
+        """Save the csv inside the directory with temporary data"""
+        filepath = os.path.join(self.dirpath, self.dirname + ".csv")
+        self.save_csv_at(filepath, sep, verbose)
+
+    def write_metadata(self):
+        f"""
+        Write the metadata to the disk as '{METADATA_FILENAME}'
+
+        Returns
+        -------
+        None.
+        """
+        filepath = os.path.join(self.dirpath, METADATA_FILENAME)
+        log.debug(f"Writing metadata to {filepath}")
+        with open(filepath, "wb") as file:
+            pickle.dump(self.metadata, file)
+
+    # STATIC LOADERS
     @staticmethod
     def get_csv(data, metadata, sep=","):
         csv = ""
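The three csv savers added here differ only in where the file lands; save_csv_at is the shared worker. A standalone sketch of that pattern, with free functions standing in for the methods (names here are illustrative, not the class API):

import os

def save_csv_at(filepath, csv_text, verbose=False):
    # single worker that writes wherever it is told
    if verbose: print(f"Writing csv to {filepath}")
    with open(filepath, "w") as file:
        file.write(csv_text)

def save_csv(path, dirname, csv_text):
    # final data directory, mirrors DataCollector.save_csv
    save_csv_at(os.path.join(path, dirname + ".csv"), csv_text)

def save_csv_in_dir(dirpath, dirname, csv_text):
    # temporary flush directory, mirrors DataCollector.save_csv_in_dir
    save_csv_at(os.path.join(dirpath, dirname + ".csv"), csv_text)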
@@ -157,7 +172,7 @@ class DataCollector:
         return csv.strip("\n")
 
     @staticmethod
-    def load_data_from_csv(filepath:str) -> tuple[np.ndarray, dict]:
+    def load_data_from_csv(filepath:str, sep: str=",") -> tuple[np.ndarray, dict]:
         """
         Loads data from a single csv file.
         Lines with this format are interpreted as metadata:
@@ -168,6 +183,8 @@ class DataCollector:
         ----------
         filepath
             Path to the csv file.
+        sep
+            csv separator
 
         Returns
         -------
@@ -177,9 +194,9 @@ class DataCollector:
             Dictionary with metadata.
         """
         metadata = {}
-        data = np.empty((0, 4))
         with open(filepath, "r") as f:
-            for j, line in enumerate(f):
+            # this loop will read the metadata at the beginning and skip also the header row
+            for line in f:
                 if line.startswith("#"):
                     colon = line.find(":")
                     if colon == -1: # normal comment
@@ -187,20 +204,10 @@ class DataCollector:
                     key = line[1:colon].strip()
                     value = line[colon+1:].strip()
                     metadata[key] = value
-                    continue
-                if line.startswith("idx"): # header line
-                    continue
-                vals = line.split(",")
-                if len(vals) != 4:
-                    raise ValueError(f"Line {j+1}: Line must have 4 values, but has {len(vals)}: '{line}'")
-                try:
-                    i = int(vals[0])
-                    t = float(vals[1])
-                    cpd = float(vals[2])
-                    led = float(vals[3])
-                except ValueError:
-                    raise ValueError(f"Line {j+1}: Failed to convert values to numbers: '{line}'")
-                data = np.append(data, [[i, t, cpd, led]], axis=0)
+                else:
+                    break
+            # here, the generator has only data lines
+            data = np.loadtxt(f, delimiter=sep)
         return data, metadata
 
     @staticmethod
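This is the performance change from the commit message: instead of splitting and converting every row in Python and growing the array with np.append, the loop now only consumes the leading #key: value metadata lines plus the header row, and np.loadtxt parses all remaining numeric lines from the same file handle in one vectorized call. A self-contained sketch of the technique:

import io
import numpy as np

text = "#name: demo\n#a comment\nidx,t,v,l\n0,0.0,0.42,1.0\n1,0.1,0.43,1.0\n"
f = io.StringIO(text)
metadata = {}
for line in f:
    if line.startswith("#"):
        colon = line.find(":")
        if colon != -1:
            metadata[line[1:colon].strip()] = line[colon+1:].strip()
        continue
    break                             # header row consumed, only data lines remain
data = np.loadtxt(f, delimiter=",")   # parses the rest of the handle at once
print(metadata, data.shape)           # {'name': 'demo'} (2, 4)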
@@ -232,23 +239,26 @@ class DataCollector:
         metadata = {}
         for filename in files:
             filepath = os.path.join(dirpath, filename)
-            if filename.endswith(".csv"):
-                if verbose: print(f"Opening {filepath} as csv")
-                df = pd.read_csv(filepath)
-                arr = df.to_numpy()
-                data = np.concatenate((data, arr))
-            elif filename.endswith(".ndarray.pkl"):
-                with open(filepath, "rb") as file:
-                    arr = pickle.load(file)
-                if len(arr.shape) != 2 or arr.shape[1] != 4:
-                    print(f"Skipping file '{filepath}' with invalid array shape: {arr.shape}")
-                    continue
-                data = np.concatenate((data, arr))
-            elif filename == METADATA_FILENAME:
-                with open(filepath, "rb") as file:
-                    metadata = pickle.load(file)
+            if filename.startswith(FLUSH_PREFIX):
+                if filename.endswith(".csv"):
+                    if verbose: print(f"Opening {filepath} as csv")
+                    df = pd.read_csv(filepath)
+                    arr = df.to_numpy()
+                    data = np.concatenate((data, arr))
+                elif filename.endswith(".ndarray.pkl"):
+                    with open(filepath, "rb") as file:
+                        arr = pickle.load(file)
+                    if len(arr.shape) != 2 or arr.shape[1] != 4:
+                        print(f"Skipping file '{filepath}' with invalid array shape: {arr.shape}")
+                        continue
+                    data = np.concatenate((data, arr))
+                elif filename == METADATA_FILENAME: # Metadata filename must also start with FLUSH_PREFIX
+                    with open(filepath, "rb") as file:
+                        metadata = pickle.load(file)
+                else:
+                    raise NotImplementedError(f"Unknown file extension for file '{filepath}'")
             else:
-                raise NotImplementedError()
+                log.info(f"Skipping unknown file: '{filepath}'")
         return data, metadata
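The loader now keys off FLUSH_PREFIX rather than the file extension alone, so foreign files in the directory are logged and skipped instead of raising. A standalone sketch of the reassembly logic, assuming a directory written by the flush sketch above:

import os, pickle
import numpy as np

dirpath = "/tmp/cpd_demo"
data = np.empty((0, 4))
metadata = {}
for filename in sorted(os.listdir(dirpath)):
    filepath = os.path.join(dirpath, filename)
    if not filename.startswith("PART_"):
        continue                                  # skip files from other sources
    if filename.endswith(".ndarray.pkl"):
        with open(filepath, "rb") as file:
            data = np.concatenate((data, pickle.load(file)))
    elif filename == "PART_measurement_metadata.pkl":
        with open(filepath, "rb") as file:
            metadata = pickle.load(file)
print(data.shape, metadata)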