Commit afeee965 authored by Laurent Mercadier

Merge branch 'fix-get-array' into 'master'

fix `get_array`, add wrappers to some of `extra_data` basic functions

See merge request !116
parents 03569509 c471aaa4
@@ -10,7 +10,7 @@
 - contributions should comply with pep8 code structure guidelines.
 - Plot routines don't fit into objects since they are rather fluent.
   They have been outsourced to dssc_plot.py. They can now be accessed
-  as tbdet member functions.
+  as toolbox_scs member functions.
 """
 import os
 import logging
@@ -56,7 +56,7 @@ class DSSCBinner:
         run number
     binners: dictionary
         dictionary containing binners constructed using the
-        'create_dssc_bins' tbdet-method.
+        'create_dssc_bins' toolbox_scs.detectors-method.
     xgm_name: str
         a valid mnemonic key of the XGM data to be used to mask the dssc
         frames. Since the xgm is used in several methods its name can be
@@ -76,8 +76,8 @@
     Example
     -------
     1.) quick -> generic bins, no xgm,
-        >>> import toolbox_scs.detectors as tbdet
-        >>> run235 = tbdet.DSSCBinner(proposal_nb=2212, run_nb=235)
+        >>> import toolbox_scs as tb
+        >>> run235 = tb.DSSCBinner(proposal_nb=2212, run_nb=235)
     2.) detailed -> docs
     """
@@ -75,19 +75,18 @@ def create_dssc_bins(name, coordinates, bins):
     Examples
     --------
     >>> import toolbox_scs as tb
-    >>> import toolbox_scs.detectors as tbdet
-    >>> run = tb.load_run(2212, 235, include='*DA*')
+    >>> run = tb.open_run(2212, 235, include='*DA*')
     1.) binner along 'pulse' dimension. Group data into two bins.
     >>> bins_pulse = ['pumped', 'unpumped'] * 10
-    >>> binner_pulse = tbdet.create_dssc_bins("pulse",
+    >>> binner_pulse = tb.create_dssc_bins("pulse",
                                               np.linspace(0,19,20, dtype=int),
                                               bins_pulse)
     2.) binner along 'train' dimension. Group data into bins corresponding
         to the positions of a delay stage for instance.
     >>> bins_trainId = tb.get_array(run, 'PP800_PhaseShifter', 0.04)
-    >>> binner_train = tbdet.create_dssc_bins("trainId",
+    >>> binner_train = tb.create_dssc_bins("trainId",
                                               run.trainIds,
                                               bins_trainId.values)
     """
@@ -165,7 +165,7 @@ def process_dssc_data(proposal, run_nr, module, chunksize, info, dssc_binners,
         dictionary containing keys 'dims', 'frames_per_train', 'total_frames',
         'trainIds', 'number_of_trains'.
     dssc_binners: dictionary
-        a dictionary containing binner objects created by the tbdet member
+        a dictionary containing binner objects created by the ToolBox member
         function "create_binner()"
     path : str
         location in which the .h5 files, containing the binned data, should
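The `dssc_binners` dictionary is keyed by the binned dimension, matching the binners built in the `create_dssc_bins` examples above; a hedged sketch of its expected shape:

    # illustrative shape only; binner_pulse/binner_train as created above
    dssc_binners = {'pulse': binner_pulse,
                    'trainId': binner_train}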
@@ -23,8 +23,10 @@ import toolbox_scs.detectors as tbdet
 __all__ = [
     'concatenateRuns',
+    'find_run_path',
     'get_array',
     'load',
+    'open_run',
     'run_by_path',
 ]
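With the two names added to `__all__`, both wrappers become part of the public namespace; a minimal sketch of the resulting import:

    # both new wrappers are now importable from the package namespace
    from toolbox_scs import find_run_path, open_run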
@@ -52,9 +54,9 @@ def load(proposalNB=None, runNB=None,
     Parameters
     ----------
-    proposalNB: (str, int)
+    proposalNB: str, int
         proposal number e.g. 'p002252' or 2252
-    runNB: (str, int)
+    runNB: str, int
         run number as integer
     fields: str, list of str, list of dict
         list of mnemonics to load specific data such as "fastccd",
@@ -62,17 +64,18 @@ def load(proposalNB=None, runNB=None,
                {"extra": {'source: 'SCS_CDIFFT_MAG/SUPPLY/CURRENT',
                           'key': 'actual_current.value',
                           'dim': None}}
-    subFolder: (str)
-        sub-folder from which to load the data. Use 'raw' for raw data
-        or 'proc' for processed data.
-    display: (bool)
+    subFolder: str
+        'raw', 'proc' (processed) or 'all' (both 'raw' and 'proc') to access
+        data from either or both of those folders. If 'all' is used, sources
+        present in 'proc' overwrite those in 'raw'. The default is 'raw'.
+    display: bool
         whether to show the run.info or not
-    validate: (bool)
+    validate: bool
         whether to run extra-data-validate or not
     subset:
         a subset of train that can be load with by_index[:5] for the first 5
         trains
-    rois: dictionary
+    rois: dict
         a dictionnary of mnemonics with a list of rois definition and
         the desired names, for example:
         {'fastccd': {'ref': {'roi': by_index[730:890, 535:720],
@@ -112,11 +115,7 @@ def load(proposalNB=None, runNB=None,
     >>> run, data = tb.load(2212, 208, ['SCS_SA3', 'MCP2apd', 'nrj'])
     """
-    if isinstance(runNB, int):
-        runNB = 'r{:04d}'.format(runNB)
-    if isinstance(proposalNB, int):
-        proposalNB = 'p{:06d}'.format(proposalNB)
-    runFolder = os.path.join(find_proposal(proposalNB), subFolder, runNB)
+    runFolder = find_run_path(proposalNB, runNB, subFolder)
     run = ed.RunDirectory(runFolder).select_trains(subset)
     if fields is None:
         return run, xr.Dataset()
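Both changes surface in ordinary use: the docstring now documents subFolder='all', and the path handling is delegated to the new helper. A hedged usage sketch:

    import toolbox_scs as tb

    # 'all' merges 'raw' and 'proc'; sources in 'proc' take precedence
    run, ds = tb.load(2212, 208, ['SCS_SA3'], subFolder='all')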
@@ -241,47 +240,95 @@ def run_by_path(path):
     return ed.RunDirectory(path)
 
 
-def concatenateRuns(runs):
-    """ Sorts and concatenate a list of runs with identical data variables
-        along the trainId dimension.
-        Input:
-            runs: (list) the xarray Datasets to concatenate
-        Output:
-            a concatenated xarray Dataset
-    """
-    firstTid = {i: int(run.trainId[0].values) for i, run in enumerate(runs)}
-    orderedDict = dict(sorted(firstTid.items(), key=lambda t: t[1]))
-    orderedRuns = [runs[i] for i in orderedDict]
-    keys = orderedRuns[0].keys()
-    for run in orderedRuns[1:]:
-        if run.keys() != keys:
-            print('data fields between different runs are not identical. '
-                  'Cannot combine runs.')
-            return
-    result = xr.concat(orderedRuns, dim='trainId')
-    for k in orderedRuns[0].attrs.keys():
-        result.attrs[k] = [run.attrs[k] for run in orderedRuns]
-    return result
+def find_run_path(proposalNB, runNB, data='raw'):
+    """
+    Return the run path given the specified proposal and run numbers.
+
+    Parameters
+    ----------
+    proposalNB: (str, int)
+        proposal number e.g. 'p002252' or 2252
+    runNB: (str, int)
+        run number as integer
+    data: str
+        'raw', 'proc' (processed) or 'all' (both 'raw' and 'proc') to access
+        data from either or both of those folders. If 'all' is used, sources
+        present in 'proc' overwrite those in 'raw'. The default is 'raw'.
+
+    Returns
+    -------
+    path: str
+        The run path.
+    """
+    if isinstance(runNB, int):
+        runNB = 'r{:04d}'.format(runNB)
+    if isinstance(proposalNB, int):
+        proposalNB = 'p{:06d}'.format(proposalNB)
+    return os.path.join(find_proposal(proposalNB), data, runNB)
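The two `isinstance` branches only normalise integer inputs to the zero-padded directory names; that formatting can be previewed on its own:

    # the zero-padding applied by find_run_path to integer inputs
    print('r{:04d}'.format(235))      # -> r0235
    print('p{:06d}'.format(2252))     # -> p002252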
+
+
+def open_run(proposalNB, runNB, subset=ed.by_index[:], **kwargs):
+    """
+    Get extra_data.DataCollection in a given proposal.
+    Wraps the extra_data open_run routine and adds subset selection, out of
+    convenience for the toolbox user. More information can be found in the
+    extra_data documentation.
+
+    Parameters
+    ----------
+    proposalNB: (str, int)
+        proposal number e.g. 'p002252' or 2252
+    runNB: (str, int)
+        run number e.g. 17 or 'r0017'
+    subset:
+        a subset of train that can be load with by_index[:5] for the first 5
+        trains
+    **kwargs
+    --------
+    data: str
+        default -> 'raw'
+    include: str
+        default -> '*'
+
+    Returns
+    -------
+    run : extra_data.DataCollection
+        DataCollection object containing information about the specified
+        run. Data can be loaded using built-in class methods.
+    """
+    return ed.open_run(proposalNB, runNB, **kwargs).select_trains(subset)
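Since the wrapper forwards everything to extra_data and then applies the train selection, a hedged usage sketch (keyword arguments taken from the docstring above):

    import extra_data as ed
    import toolbox_scs as tb

    # first five trains of the raw data, restricted to aggregator sources
    run = tb.open_run(2212, 235, subset=ed.by_index[:5],
                      data='raw', include='*DA*')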
-def get_array(run, mnemonic_key=None, stepsize=None):
+def get_array(run=None, mnemonic=None, stepsize=None,
+              subset=ed.by_index[:], subFolder='raw',
+              proposalNB=None, runNB=None):
     """
-    Loads the required 1D-data and rounds its values to integer multiples of
-    stepsize for consistent grouping (except for stepsize=None).
-    Returns a dummy array if mnemonic is set to None.
+    Loads one data array for the specified mnemonic and rounds its values to
+    integer multiples of stepsize for consistent grouping (except for
+    stepsize=None).
+    Returns a 1D array of ones if mnemonic is set to None.
 
     Parameters
     ----------
-    run: karabo_data.DataCollection
-        path to the run directory
-    mnemonic_key: str
+    run: extra_data.DataCollection
+        DataCollection containing the data.
+        Used if proposalNB and runNB are None.
+    mnemonic: str
         Identifier of a single item in the mnemonic collection. None creates a
-        dummy file to average over all trains in the run
+        dummy 1D array of ones with length equal to the number of trains.
     stepsize : float
         nominal stepsize of the array data - values will be rounded to integer
-        multiples of this value
+        multiples of this value.
+    subset:
+        a subset of train that can be load with by_index[:5] for the first 5
+        trains
+    subFolder: (str)
+        sub-folder from which to load the data. Use 'raw' for raw data
+        or 'proc' for processed data.
+    proposalNB: (str, int)
+        proposal number e.g. 'p002252' or 2252.
+    runNB: (str, int)
+        run number e.g. 17 or 'r0017'.
 
     Returns
     -------
@@ -297,26 +344,32 @@ def get_array(run, mnemonic_key=None, stepsize=None):
     Example
     -------
     >>> import toolbox_scs as tb
-    >>> run = tb.load_run(2212, 235)
+    >>> run = tb.open_run(2212, 235)
     >>> mnemonic = 'PP800_PhaseShifter'
     >>> data_PhaseShifter = tb.get_array(run, mnemonic, 0.5)
     """
+    if run is None:
+        run = open_run(proposalNB, runNB, subset, data=subFolder)
+    if not isinstance(run, ed.DataCollection):
+        raise TypeError(f'run argument has type {type(run)} but '
+                        'expected type is extra_data.DataCollection')
+    run = run.select_trains(subset)
+    run_mnemonics = mnemonics_for_run(run)
     try:
-        if mnemonic_key is None:
+        if mnemonic is None:
             data = xr.DataArray(
                 np.ones(len(run.train_ids), dtype=np.int16),
                 dims=['trainId'], coords={'trainId': run.train_ids})
-        elif mnemonic_key in _mnemonics:
-            mnem = _mnemonics[mnemonic_key]
-            data = run.get_array(*mnem.values())
+        elif mnemonic in run_mnemonics:
+            mnem = run_mnemonics[mnemonic]
+            data = run.get_array(*mnem.values(), name=mnemonic)
         else:
-            raise ToolBoxValueError("Invalid mnemonic", mnemonic_key)
+            raise ToolBoxValueError("Invalid mnemonic", mnemonic)
         if stepsize is not None:
             data = stepsize * np.round(data / stepsize)
-        data.name = 'data'
-        log.debug(f"Got data for {mnemonic_key}")
+        log.debug(f"Got data for {mnemonic}")
     except ToolBoxValueError as err:
         log.error(f"{err.message}")
         raise
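The rounding step is plain numpy, so its grouping effect on noisy positions can be previewed without any run data:

    import numpy as np

    raw = np.array([0.49, 0.51, 1.02, 0.98])    # e.g. noisy stage positions
    stepsize = 0.5
    print(stepsize * np.round(raw / stepsize))  # -> [0.5 0.5 1.  1. ]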
@@ -324,6 +377,31 @@ def get_array(run, mnemonic_key=None, stepsize=None):
     return data
 
 
+def concatenateRuns(runs):
+    """ Sorts and concatenate a list of runs with identical data variables
+        along the trainId dimension.
+        Input:
+            runs: (list) the xarray Datasets to concatenate
+        Output:
+            a concatenated xarray Dataset
+    """
+    firstTid = {i: int(run.trainId[0].values) for i, run in enumerate(runs)}
+    orderedDict = dict(sorted(firstTid.items(), key=lambda t: t[1]))
+    orderedRuns = [runs[i] for i in orderedDict]
+    keys = orderedRuns[0].keys()
+    for run in orderedRuns[1:]:
+        if run.keys() != keys:
+            print('data fields between different runs are not identical. '
+                  'Cannot combine runs.')
+            return
+    result = xr.concat(orderedRuns, dim='trainId')
+    for k in orderedRuns[0].attrs.keys():
+        result.attrs[k] = [run.attrs[k] for run in orderedRuns]
+    return result
+
+
 def load_bpt(run, merge_with=None, run_mnemonics=None):
     if run_mnemonics is None:
         run_mnemonics = mnemonics_for_run(run)
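A hedged sketch of what `concatenateRuns` expects, using two toy xarray Datasets in place of real toolbox output (variable names are illustrative):

    import numpy as np
    import xarray as xr
    import toolbox_scs as tb

    a = xr.Dataset({'nrj': ('trainId', np.ones(3))},
                   coords={'trainId': [10, 11, 12]}, attrs={'runNB': 1})
    b = xr.Dataset({'nrj': ('trainId', np.zeros(3))},
                   coords={'trainId': [13, 14, 15]}, attrs={'runNB': 2})
    # runs are sorted by their first trainId before concatenation
    combined = tb.concatenateRuns([b, a])
    print(combined.trainId.values)    # -> [10 11 12 13 14 15]
    print(combined.attrs['runNB'])    # -> [1, 2]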