Newer
Older
Rafael Gort
committed
import os
import logging
import h5py
Rafael Gort
committed
import xarray as xr
from ..util.exceptions import ToolBoxFileError
log = logging.getLogger(__name__)
def _to_netcdf(fname, data, group, mode):
    """
    Write data into a group of a netCDF/HDF5 file using the h5netcdf engine.

    Parameters
    ----------
    fname: str
        path of the target file
    data: xarray.Dataset
        the object to be stored; only its ``to_netcdf`` method is used here
    group: str
        name of the group inside the file that receives the data
    mode: str
        'a' adds the group to an already existing file. In every other case
        (file does not exist yet, or any other mode) the file is written
        with mode 'w', i.e. an existing file is replaced.

    Raises
    ------
    ToolBoxFileError: Exception
        Appending to an existing file failed with a ValueError or TypeError
        (reported as: group exists and has incompatible dimensions).
    """
    f_exists = os.path.isfile(fname)

    # Strings must be compared by value. An identity test ("is") only
    # succeeds when both strings happen to be the same interned object.
    if f_exists and mode == 'a':
        try:
            data.to_netcdf(fname, group=group, mode='a', engine='h5netcdf')
        except (ValueError, TypeError) as err:
            msg = f"Group {group} exists and has incompatible dimensions."
            log.warning(f"Could not store data: {msg}")
            # keep the original error attached as the cause
            raise ToolBoxFileError(msg, fname) from err
        log.info(f"Created group {group} in file {fname}")
        return

    data.to_netcdf(fname, group=group, mode='w', engine='h5netcdf')
    if f_exists and mode == 'w':
        log.warning(f"File {fname} existed: overwritten")
    log.info(f"Stored data in file {fname}")
def save_xarray(fname, data, group='data', mode='a'):
    """
    Store xarray Dataset in the specified location

    Parameters
    ----------
    fname: str
        filename
    data: xarray.Dataset
        The data to be stored
    group: str
        name of the group inside the file in which the data is stored.
        Defaults to 'data'.
    mode: str
        write mode handed on to the internal writer, e.g. 'a' (default) or
        'w'.

    Raises
    ------
    ToolBoxFileError: Exception
        Propagated unchanged from the internal writer when the data could
        not be stored.
    """
    # A ToolBoxFileError raised by the writer propagates to the caller as
    # is, so catching it only to re-raise the same object is unnecessary.
    _to_netcdf(fname, data, group, mode)
def save_attributes_h5(fname, data=None):
    """
    Adding attributes to a hdf5 file. This function is intended to be used to
    attach metadata to a processed run.

    Parameters
    ----------
    fname: str
        filename as string
    data: dictionary, optional
        the data that should be added to the file in form of a dictionary.
        Each key becomes the name of a file-level attribute. When omitted,
        the file is opened in append mode but no attribute is written.
    """
    attributes = {} if data is None else data

    # The context manager closes the file even if assigning an attribute
    # raises (e.g. because a value cannot be stored as an attribute).
    with h5py.File(fname, mode='a') as f:
        for key, value in attributes.items():
            f.attrs[key] = value
    log.info(f"added attributes to file {fname}")
Rafael Gort
committed
def load_xarray(fname, group='data', form='dataset'):
    """
    Load stored xarray Dataset.
    Comment: This function exists because of a problem with the standard
    netcdf engine that is malfunctioning due to related software installed
    in the exfel_anaconda3 environment. May be dropped at some point.

    Parameters
    ----------
    fname: str
        filename as string
    group: str
        the name of the xarray dataset (group in h5 file).
    form: str
        specify whether the data to be loaded is a 'dataset' or a 'array'.

    Returns
    -------
    data: xarray.Dataset or xarray.DataArray
        the result of ``xr.load_dataset`` for form 'dataset' and of
        ``xr.load_dataarray`` for form 'array'.

    Raises
    ------
    ValueError
        form is neither 'dataset' nor 'array'.
    ToolBoxFileError: Exception
        The file does not exist.
    """
    # Reject an unsupported form up front instead of silently loading the
    # wrong kind of object.
    if form not in ('dataset', 'array'):
        raise ValueError(
            f"Unknown form {form!r}: expected 'dataset' or 'array'.")

    if not os.path.isfile(fname):
        msg = "File does not exists."
        raise ToolBoxFileError(msg, fname)

    if form == 'array':
        log.debug(f'open xarray dataarray {fname}')
        return xr.load_dataarray(fname, group=group, engine='h5netcdf')

    log.debug(f'open xarray dataset {fname}')
    return xr.load_dataset(fname, group=group, engine='h5netcdf')
def _data_from_list(filenames):
    """
    Helper function for data formatting routines. Loads every file named in
    the given list, always reading the group called 'data'.

    Parameters
    ----------
    filenames: list of str
        names of the files to be loaded

    Returns
    -------
    data: list
        the loaded data, in the order of the given names

    Raises
    ------
    ToolBoxFileError
        raised for the first name that does not point to an existing file.
    """
    loaded = []
    for path in filenames:
        # stop at the first missing file
        if not os.path.isfile(path):
            msg = "File does not exists."
            raise ToolBoxFileError(msg, path)
        loaded.append(load_xarray(path, group='data'))
    return loaded
def get_data_formatted(filenames=None, data_list=None):
    """
    Combines the given data into one dataset. For any of extra_data's data
    types, an xarray.Dataset is returned. The data is sorted along the 'module'
    dimension. The array dimension have the order 'trainId', 'pulse', 'module',
    'x', 'y'. This order is required by the extra_geometry package.

    Parameters
    ----------
    filenames: list of str, optional
        files to be combined as a list of names. Calls '_data_from_list' to
        actually load the data. Takes precedence over data_list.
    data_list: list, optional
        list containing the already loaded data

    Returns
    -------
    data: xarray.Dataset
        A xarray.Dataset containing the combined data.

    Raises
    ------
    ValueError
        Neither filenames nor data_list contains any entry.
    """
    # Check for emptiness only. Evaluating the truth value of each element
    # (as any() does) is ambiguous for array-like and DataFrame entries.
    if filenames:
        data = _data_from_list(filenames)
    elif data_list:
        data = data_list
    else:
        raise ValueError(
            "Either 'filenames' or 'data_list' must contain data.")

    if isinstance(data[0], xr.Dataset):
        data = xr.concat(data, dim='module')
    # NOTE(review): pandas and dask DataFrames were recognised by the
    # previous implementation but never combined; they still pass through
    # unchanged and will presumably fail in the calls below - confirm
    # whether support for them is still planned.

    data = data.sortby("module")
    return data.transpose('trainId', 'pulse', 'module', 'x', 'y')
def search_files(run_folder):
    """
    Collect the .h5 files located directly in the given folder.

    Parameters
    ----------
    run_folder: str
        the path to a folder containing h5 files.

    Returns
    -------
    filenames: list of str
        a sorted list of the filenames (folder path joined with the file
        name) of all .h5 files in the given folder.

    Raises
    ------
    ToolBoxFileError: Exception
        raises ToolBoxFileError in case there are no .h5 files in the folder,
        or the folder does not exist.
    """
    msg = "No files in folder"
    try:
        # os.path.join inserts the separator when run_folder lacks a
        # trailing one; plain concatenation produced invalid paths then.
        h5_files = sorted(
            os.path.join(run_folder, name)
            for name in os.listdir(run_folder)
            if name.endswith(".h5")
        )
    except (OSError, TypeError) as err:
        # folder missing / not readable, or run_folder is not a valid path
        raise ToolBoxFileError(msg, run_folder) from err

    if not h5_files:
        raise ToolBoxFileError(msg, run_folder)
    return h5_files