# dssc_data.py
import logging
import os

import h5py
import xarray as xr

from ..util.exceptions import ToolBoxFileError

# Module-level logger named after this module; used by all helpers below.
log = logging.getLogger(__name__)


def _to_netcdf(fname, data, group, mode):
    """
    Write an xarray object to a file using the h5netcdf engine.

    Parameters
    ----------
    fname: str
        filename
    data: xarray.Dataset
        the data to be stored (any object providing `to_netcdf`)
    group: str
        name of the group inside the file the data is written to
    mode: str
        'w' overwrites an existing file, 'a' adds the group to an existing
        file. A file that does not exist yet is always created.

    Raises
    ------
    ToolBoxFileError
        in mode 'a', when writing the group into the existing file fails
        with a ValueError or TypeError.
    ValueError
        when the file exists and mode is neither 'a' nor 'w'.
    """
    f_exists = os.path.isfile(fname)

    if not f_exists:
        # Nothing to preserve: create the file regardless of the mode.
        data.to_netcdf(fname, group=group, mode='w', engine='h5netcdf')
        log.info(f"Stored data in file {fname}")
    elif mode == 'w':
        data.to_netcdf(fname, group=group, mode='w', engine='h5netcdf')
        log.warning(f"File {fname} existed: overwritten")
        log.info(f"Stored data in file {fname}")
    elif mode == 'a':
        try:
            data.to_netcdf(fname, group=group, mode='a', engine='h5netcdf')
            log.info(f"Created group {group} in file {fname}")
        except (ValueError, TypeError) as err:
            msg = f"Group {group} exists and has incompatible dimensions."
            log.warning(f"Could not store data: {msg}")
            raise ToolBoxFileError(msg, fname) from err
    else:
        # Refuse to touch an existing file with an unrecognised mode instead
        # of silently overwriting it.
        raise ValueError(
            f"Wrong mode. Expected 'a' or 'w' but mode is '{mode}'")

def save_xarray(fname, data, group='data', mode='a'):
    """
    Store xarray Dataset in the specified location

    Parameters
    ----------
    fname: str
        filename
    data: xarray.Dataset
        The data to be stored
    group: str
        name of the group inside the file the data is written to
    mode: str
        file mode forwarded to `_to_netcdf`: 'a' to append the group to an
        existing file, 'w' to overwrite an existing file.

    Raises
    ------
    ToolBoxFileError: Exception
        raised by `_to_netcdf` when the group can not be stored in the
        existing file.
    """
    _to_netcdf(fname, data, group, mode)


def save_attributes_h5(fname, data=None):
    """
    Adding attributes to a hdf5 file. This function is intended to be used to
    attach metadata to a processed run.

    Parameters
    ----------
    fname: str
        filename as string
    data: dictionary, optional
        the data that should be added to the file in form of a dictionary.
        Each key becomes the name of a file-level attribute. If omitted, no
        attributes are written.
    """
    attributes = {} if data is None else data
    # The context manager closes the file even if writing an attribute fails.
    with h5py.File(fname, mode='a') as f:
        for key, value in attributes.items():
            f.attrs[key] = value
    log.info(f"added attributes to file {fname}")


def load_xarray(fname, group='data', form='dataset'):
    """
    Load stored xarray Dataset.
    Comment: This function exists because of a problem with the standard
    netcdf engine that is malfunctioning due to related software installed
    in the exfel_anaconda3 environment. May be dropped at some point.

    Parameters
    ----------
    fname: str
        filename as string
    group: str
        the name of the xarray dataset (group in h5 file).
    form: str
        specify whether the data to be loaded is a 'dataset' or a 'array'.

    Returns
    -------
    data: xarray.Dataset or xarray.DataArray
        the loaded data, depending on `form`.

    Raises
    ------
    ToolBoxFileError
        in case the file does not exist.
    ValueError
        in case `form` is neither 'dataset' nor 'array'.
    """
    if not os.path.isfile(fname):
        msg = "File does not exists."
        raise ToolBoxFileError(msg, fname)

    if form == 'dataset':
        log.debug(f'open xarray dataset {fname}')
        return xr.load_dataset(fname, group=group, engine='h5netcdf')
    if form == 'array':
        log.debug(f'open xarray dataarray {fname}')
        return xr.load_dataarray(fname, group=group, engine='h5netcdf')

    # An unknown form used to fall through and return None silently.
    raise ValueError(
        f"Unknown form '{form}'. Expected 'dataset' or 'array'.")


def _data_from_list(filenames):
    """
    Helper function for data formatting routines. Loads every file named in
    `filenames` via `load_xarray`, always reading the group called 'data'.

    Parameters
    ----------
    filenames: list
        list of valid xarray filenames

    Returns
    -------
    data: list
        a list containing the loaded data, in the order of `filenames`

    Raises
    ------
    ToolBoxFileError
        raises ToolBoxFileError in case file does not exist.
    """
    loaded = []
    for path in filenames:
        # Stop at the first missing file; nothing is returned in that case.
        if not os.path.isfile(path):
            raise ToolBoxFileError("File does not exists.", path)
        loaded.append(load_xarray(path, group='data'))
    return loaded


def get_data_formatted(filenames=None, data_list=None):
    """
    Combines the given data into one dataset. The data is concatenated and
    sorted along the 'module' dimension. The array dimension have the order
    'trainId', 'pulse', 'module', 'x', 'y'. This order is required by the
    extra_geometry package.

    Parameters
    ----------
    filenames: list of str, optional
        files to be combined as a list of names. Calls '_data_from_list' to
        actually load the data. Takes precedence over `data_list`.
    data_list: list, optional
        list containing the already loaded data (xarray.Dataset objects).

    Returns
    -------
    data: xarray.Dataset
        A xarray.Dataset containing the combined data.

    Raises
    ------
    ValueError
        in case neither `filenames` nor `data_list` contains anything.
    TypeError
        in case the data are not xarray.Dataset objects. Combining pandas
        or dask dataframes is not implemented.
    """
    if filenames:
        data = _data_from_list(filenames)
    elif data_list:
        data = data_list
    else:
        raise ValueError(
            "Either 'filenames' or 'data_list' must be a non-empty list.")

    # Only xarray Datasets can be concatenated, sorted and transposed below.
    if not isinstance(data[0], xr.Dataset):
        raise TypeError(
            "Can only combine xarray.Dataset objects, got "
            f"{type(data[0]).__name__}.")

    combined = xr.concat(data, dim='module').sortby("module")
    return combined.transpose('trainId', 'pulse', 'module', 'x', 'y')


def search_files(run_folder):
    """
    Search folder for h5 files.

    Parameters
    ----------
    run_folder: str
        the path to a folder containing h5 files.

    Returns
    -------
    a list of the filenames (joined with `run_folder`) of all files in the
    given folder whose name ends with '.h5'. The list is empty if the folder
    holds no such file.

    Raises
    ------
    ToolBoxFileError: Exception
        raises ToolBoxFileError in case the folder can not be listed, e.g.
        because it does not exist.
    """
    try:
        names = os.listdir(run_folder)
    except OSError as err:
        msg = "No files in folder"
        raise ToolBoxFileError(msg, run_folder) from err
    # os.path.join works with or without a trailing separator on run_folder;
    # endswith avoids matching names such as 'x.h5.bak'.
    return [os.path.join(run_folder, name)
            for name in names if name.endswith(".h5")]