BSD 3-Clause License
Copyright (c) 2019, Michael Schneider
Copyright (c) 2020, SCS-team
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from .azimuthal_integrator import *
from .bam_detectors import *
from .digitizers import *
from .dssc import *
from .dssc_data import *
from .dssc_misc import *
from .dssc_processing import *
from .gotthard2 import *
from .hrixs import *
from .pes import *
from .viking import *
from .xgm import *
__all__ = (
azimuthal_integrator.__all__
+ bam_detectors.__all__
+ digitizers.__all__
+ dssc.__all__
+ dssc_data.__all__
+ dssc_misc.__all__
+ dssc_processing.__all__
+ gotthard2.__all__
+ hrixs.__all__
+ pes.__all__
+ viking.__all__
+ xgm.__all__
)
import logging
import numpy as np
__all__ = [
'AzimuthalIntegrator',
'AzimuthalIntegratorDSSC'
]
log = logging.getLogger(__name__)
class AzimuthalIntegrator(object):
def __init__(self, imageshape, center, polar_range,
aspect=204/236, **kwargs):
'''
Create a reusable integrator for repeated azimuthal integration of
similar images. Calculates array indices for a given parameter set that
allows fast recalculation.
Parameters
==========
imageshape : tuple of ints
The shape of the images to be integrated over.
center : tuple of ints
center coordinates in pixels
polar_range : tuple of ints
start and stop polar angle (in degrees) to restrict integration to
wedges
dr : int, optional
radial width of the integration slices. Takes non-square DSSC
pixels into account.
nrings : int, optional
Number of integration rings. Can be given as an alternative to dr
aspect: float, default 204/236 for DSSC
aspect ratio of the pixel pitch
Returns
=======
ai : AzimuthalIntegrator instance
Instance can directly be called with image data:
> az_intensity = ai(image)
radial distances and the polar mask are accessible as attributes:
> ai.distance
> ai.polar_mask
'''
self.xcoord = None
self.ycoord = None
self._calc_dist_array(imageshape, center, aspect)
self._calc_polar_mask(polar_range)
self._calc_indices(**kwargs)
def _calc_dist_array(self, shape, center, aspect):
'''Calculate pixel coordinates for the given shape.'''
self.center = center
self.shape = shape
self.aspect = aspect
cx, cy = center
log.info(f'azimuthal center: {center}')
sx, sy = shape
xcoord, ycoord = np.ogrid[:sx, :sy]
self.xcoord = xcoord - cx
self.ycoord = ycoord - cy
# distance from center, hexagonal pixel shape taken into account
self.dist_array = np.hypot(self.xcoord * aspect, self.ycoord)
def _calc_indices(self, **kwargs):
'''Calculates the list of indices for the flattened image array.'''
maxdist = self.dist_array.max()
mindist = self.dist_array.min()
dr = kwargs.get('dr', None)
nrings = kwargs.get('nrings', None)
if (dr is None) and (nrings is None):
raise AssertionError('Either <dr> or <nrings> needs to be given.')
if (dr is not None) and (nrings is not None):
log.warning('Both <dr> and <nrings> given. <dr> takes precedence.')
if (dr is None):
dr = maxdist / nrings
idx = np.indices(dimensions=self.shape)
self.index_array = np.ravel_multi_index(idx, self.shape)
self.distance = np.array([])
self.flat_indices = []
for dist in np.arange(mindist, maxdist + dr, dr):
ring_mask = (self.polar_mask
* (self.dist_array >= (dist - dr))
* (self.dist_array < dist))
self.flat_indices.append(self.index_array[ring_mask])
self.distance = np.append(self.distance, dist)
def _calc_polar_mask(self, polar_range):
self.polar_range = polar_range
prange = np.abs(polar_range[1] - polar_range[0])
if prange > 180:
raise ValueError('Integration angle too wide, should be within 180'
' degrees')
if prange < 1e-6:
raise ValueError('Integration angle too narrow')
if prange == 180:
self.polar_mask = np.ones(self.shape, dtype=bool)
else:
tmin, tmax = np.deg2rad(np.sort(polar_range)) % np.pi
polar_array = np.arctan2(self.xcoord, self.ycoord)
polar_array = np.mod(polar_array, np.pi)
self.polar_mask = (polar_array > tmin) * (polar_array < tmax)
def calc_q(self, distance, wavelength):
'''Calculate momentum transfer coordinate.
Parameters
==========
distance : float
Sample-detector distance in meters
wavelength : float
wavelength of scattered light in meters
Returns
=======
deltaq : np.ndarray
Momentum transfer coordinate in 1/m
'''
res = 4 * np.pi \
* np.sin(np.arctan(self.distance / distance) / 2) / wavelength
return res
def __call__(self, image):
assert self.shape == image.shape, 'image shape does not match'
image_flat = np.ravel(image)
return np.array([np.nansum(image_flat[indices])
for indices in self.flat_indices])
class AzimuthalIntegratorDSSC(AzimuthalIntegrator):
def __init__(self, geom, polar_range, dxdy=(0, 0), **kwargs):
'''
Create a reusable integrator for repeated azimuthal integration of
similar images. Calculates array indices for a given parameter set that
allows fast recalculation. Directly uses an
extra_geom.detectors.DSSC_1MGeometry instance for correct pixel
positions.
Parameters
==========
geom : extra_geom.detectors.DSSC_1MGeometry
loaded geometry instance
polar_range : tuple of ints
start and stop polar angle (in degrees) to restrict integration to
wedges
dr : int, optional
radial width of the integration slices. Takes non-square DSSC
pixels into account.
nrings : int, optional
Number of integration rings. Can be given as an alternative to dr
dxdy : tuple of floats, default (0, 0)
global coordinate shift to adjust center outside of geom object
(meter)
Returns
=======
ai : AzimuthalIntegratorDSSC instance
Instance can directly be called with image data:
> az_intensity = ai(image)
radial distances and the polar mask are accessible as attributes:
> ai.distance
> ai.polar_mask
'''
self.xcoord = None
self.ycoord = None
self._calc_dist_array(geom, dxdy)
self._calc_polar_mask(polar_range)
self._calc_indices(**kwargs)
def _calc_dist_array(self, geom, dxdy):
self.dxdy = dxdy
pos = geom.get_pixel_positions()
self.shape = pos.shape[:-1]
self.xcoord = pos[..., 0] + dxdy[0]
self.ycoord = pos[..., 1] + dxdy[1]
self.dist_array = np.hypot(self.xcoord, self.ycoord)
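# A minimal usage sketch of the integrator; the image shape, center and
# ring count below are illustrative values, not settings from a real
# experiment:
#
#   import numpy as np
#   ai = AzimuthalIntegrator(imageshape=(256, 256), center=(128, 128),
#                            polar_range=(0, 180), nrings=100)
#   az_intensity = ai(np.random.rand(256, 256))      # radial profile
#   q = ai.calc_q(distance=3.0, wavelength=1.5e-9)   # momentum transfer in 1/m
#
# For DSSC data, AzimuthalIntegratorDSSC takes a loaded
# extra_geom.detectors.DSSC_1MGeometry instance instead of
# imageshape/center.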
""" Beam Arrival Monitor related sub-routines
Copyright (2021) SCS Team.
(contributions preferably comply with pep8 code structure
guidelines.)
"""
import logging
import xarray as xr
import numpy as np
from ..misc.bunch_pattern_external import is_pulse_at
from ..mnemonics_machinery import mnemonics_for_run
from ..constants import mnemonics as _mnemonics
from ..misc.bunch_pattern import (npulses_has_changed,
get_unique_sase_pId, load_bpt)
from toolbox_scs.load import get_array
__all__ = [
'get_bam',
'get_bam_params',
]
log = logging.getLogger(__name__)
def get_bam(run, mnemonics=None, merge_with=None, bunchPattern='sase3',
pulseIds=None):
"""
Load beam arrival monitor (BAM) data and align their pulse ID
according to the bunch pattern. Sources can be loaded on the fly
via the mnemonics argument, or processed from an existing data set
(merge_with).
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the bam data.
mnemonics: str or list of str
mnemonics for BAM, e.g. "BAM1932M" or ["BAM414", "BAM1932M"].
The arrays are either taken from merge_with or loaded from
the DataCollection run.
merge_with: xarray Dataset
If provided, the resulting Dataset will be merged with this
one. If merge_with contains variables in mnemonics, they will
be selected, aligned and merged.
bunchPattern: str
'sase1' or 'sase3' or 'scs_ppl', bunch pattern
used to extract peaks. The pulse ID dimension will be named
'sa1_pId', 'sa3_pId' or 'ol_pId', respectively.
pulseIds: list, 1D array
Pulse Ids. If None, they are automatically loaded.
Returns
-------
xarray Dataset with pulse-resolved BAM variables aligned,
merged with Dataset *merge_with* if provided.
Example
-------
>>> import toolbox_scs as tb
>>> run = tb.open_run(2711, 303)
>>> bam = tb.get_bam(run, 'BAM1932S')
"""
bam_mnemos = ['BAM4', 'BAM1', 'BAM2']
mnemonics = [mnemonics] if isinstance(mnemonics, str) else mnemonics
m2 = []
for m in mnemonics:
if any([(k in m) for k in bam_mnemos]):
m2.append(m)
mnemonics = list(set(m2))
if len(mnemonics) == 0:
log.info('no BAM mnemonics to process. Skipping.')
return merge_with
# Prepare the dataset of non-BAM data to merge with
if bool(merge_with):
ds_mw = merge_with.drop(mnemonics, errors='ignore')
else:
ds_mw = xr.Dataset()
dim_names = {'sase3': 'sa3_pId', 'sase1': 'sa1_pId',
'scs_ppl': 'ol_pId'}
bpt = load_bpt(run, ds_mw)
if bpt is not None:
mask = is_pulse_at(bpt, bunchPattern)
mask = mask.rename({'pulse_slot': dim_names[bunchPattern]})
ds = xr.Dataset()
run_mnemonics = mnemonics_for_run(run)
for m in mnemonics:
if merge_with is not None and m in merge_with:
da_bam = merge_with[m]
else:
da_bam = get_array(run, m)
if len(da_bam.dims) == 2:
if 'BAMbunchId' not in da_bam.dims:
continue
da_bam = da_bam.sel(BAMbunchId=slice(0, None, 2))
# align the pulse Id
if bpt is not None:
n = mask.sizes[dim_names[bunchPattern]]
da_bam = da_bam.isel(BAMbunchId=slice(0, n))
da_bam = da_bam.assign_coords(BAMbunchId=np.arange(0, n))
da_bam = da_bam.rename(BAMbunchId=dim_names[bunchPattern])
da_bam = da_bam.where(mask, drop=True)
# make sure unit is picosecond
if run_mnemonics[m]['key'] != 'data.lowChargeArrivalTime':
da_bam *= 1e-3
else:
# The 1D values (mean, std dev...) are in fs, need to convert to ps
mnemo = run_mnemonics[m]
first_val = run[mnemo['source']][mnemo['key']].train_from_index(0)[1]
if first_val == da_bam[0]:
da_bam *= 1e-3
ds = ds.merge(da_bam, join='inner')
# merge with non-BAM dataset
ds = ds_mw.merge(ds, join='inner')
return ds
'''
def get_bam_old(run, mnemonics=None, merge_with=None, bunchPattern='sase3'):
"""
Load beam arrival monitor (BAM) data and align their pulse ID
according to the bunch pattern. Sources can be loaded on the fly
via the mnemonics argument, or processed from an existing data set
(merge_with).
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the bam data.
mnemonics: str or list of str
mnemonics for BAM, e.g. "BAM1932M" or ["BAM414", "BAM1932M"].
If None, defaults to "BAM1932M" in case no merge_with dataset
is provided.
merge_with: xarray Dataset
If provided, the resulting Dataset will be merged with this
one. The BAM variables of merge_with (if any) will also be
selected, aligned and merged.
bunchPattern: str
'sase1' or 'sase3' or 'scs_ppl', bunch pattern
used to extract peaks. The pulse ID dimension will be named
'sa1_pId', 'sa3_pId' or 'ol_pId', respectively.
Returns
-------
xarray Dataset with pulse-resolved BAM variables aligned,
merged with Dataset *merge_with* if provided.
Example
-------
>>> import toolbox_scs as tb
>>> import toolbox_scs.detectors as tbdet
>>> run, _ = tb.load(2711, 303)
>>> bam = tbdet.get_bam(run)
"""
# get the list of mnemonics to process
mnemonics = mnemonics_to_process(mnemonics, merge_with, 'BAM')
if len(mnemonics) == 0:
log.info('No array with unaligned BAM data to extract. Skipping.')
return merge_with
else:
log.info(f'Extracting BAM data from {mnemonics}.')
# Prepare the dataset of non-BAM data to merge with
if bool(merge_with):
mw_ds = merge_with.drop(mnemonics, errors='ignore')
else:
mw_ds = xr.Dataset()
run_mnemonics = mnemonics_for_run(run)
# extract the XFEL pulses: slice(0,5400,2)
roi = np.s_[:5400:2]
ds = xr.Dataset()
for m in mnemonics:
if bool(merge_with) and m in merge_with:
val = merge_with[m].isel({run_mnemonics[m]['dim'][0]: roi})
log.debug(f'Using {m} from merge_with dataset.')
else:
val = run.get_array(*run_mnemonics[m].values(), roi=roi, name=m)
log.debug(f'Loading {m} from DataCollection.')
val[run_mnemonics[m]['dim'][0]] = np.arange(2700)
# Since winter shutdown 2021-2022, units changed from ps to fs
# Here we convert to ps
if run_mnemonics[m]['key'] != 'data.lowChargeArrivalTime':
val *= 1e-3
ds = ds.merge(val, join='inner')
# check if bunch pattern table exists
if bool(merge_with) and 'bunchPatternTable' in merge_with:
bpt = merge_with['bunchPatternTable']
log.debug('Using bpt from merge_with dataset.')
elif 'bunchPatternTable' in run_mnemonics:
bpt = run.get_array(*run_mnemonics['bunchPatternTable'].values())
log.debug('Loaded bpt from extra_data run.')
else:
bpt = None
# align the pulse Id
if bpt is not None and len(ds.variables) > 0:
dim_names = {'sase3': 'sa3_pId', 'sase1': 'sa1_pId',
'scs_ppl': 'ol_pId'}
mask = is_pulse_at(bpt, bunchPattern)
mask = mask.rename({'pulse_slot': dim_names[bunchPattern]})
ds = ds.rename({run_mnemonics['BAM1932M']['dim'][0]:
dim_names[bunchPattern]})
ds = ds.where(mask, drop=True)
# merge with non-BAM dataset
ds = mw_ds.merge(ds, join='inner')
return ds
'''
def get_bam_params(run, mnemo_or_source='BAM1932S'):
"""
Extract the run values of bamStatus[1-3] and bamError.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the bam data.
mnemo_or_source: str
mnemonic of the BAM, e.g. 'BAM414', or source name,
e.g. 'SCS_ILH_LAS/DOOCS/BAM_414_B2'.
Returns
-------
params: dict
dictionary containing the extracted parameters.
Note
----
The extracted parameters are run values; they do not reflect any
possible change during the run.
"""
if mnemo_or_source in _mnemonics:
run_mnemonics = mnemonics_for_run(run)
source = run_mnemonics[mnemo_or_source]['source'].split(':')[0]
else:
source = mnemo_or_source
res = {}
res['status1'] = run.get_run_value(source, 'bamStatus1.value')
res['status2'] = run.get_run_value(source, 'bamStatus2.value')
res['status3'] = run.get_run_value(source, 'bamStatus3.value')
res['error'] = run.get_run_value(source, 'bamError.value')
return res
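# A short usage sketch of get_bam_params; the proposal/run numbers and BAM
# mnemonic are illustrative (reused from the get_bam example above):
#
#   import toolbox_scs as tb
#   run = tb.open_run(2711, 303)
#   params = tb.get_bam_params(run, 'BAM1932S')
#   # params is a dict with keys 'status1', 'status2', 'status3', 'error'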
""" Digitizers related sub-routines
Copyright (2021) SCS Team.
(contributions preferably comply with pep8 code structure
guidelines.)
"""
import logging
import os
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
from scipy.signal import find_peaks
from scipy.signal import fftconvolve
from ..misc.bunch_pattern_external import is_pulse_at
from ..util.exceptions import ToolBoxValueError
from ..mnemonics_machinery import (mnemonics_to_process,
mnemonics_for_run)
from extra_data import open_run
from extra_data.read_machinery import find_proposal
from extra.components import XrayPulses, OpticalLaserPulses
__all__ = [
'check_peak_params',
'get_digitizer_peaks',
'get_laser_peaks',
'get_peaks',
'get_tim_peaks',
'digitizer_signal_description',
'get_dig_avg_trace',
'extract_digitizer_peaks',
'load_processed_peaks',
'check_processed_peak_params'
]
log = logging.getLogger(__name__)
def peaks_from_raw_trace(traces, pulseStart, pulseStop, baseStart, baseStop,
period=None, npulses=None, extra_dim=None):
"""
Computes peaks from raw digitizer traces by trapezoidal integration.
Parameters
----------
traces: xarray DataArray or numpy array containing raw traces. If
numpy array is provided, the second dimension is that of the samples.
pulseStart: int or list or 1D-numpy array
trace index of integration start. If 1d array, each value is the start
of one peak. The period and npulses parameters are ignored.
pulseStop: int
trace index of integration stop.
baseStart: int
trace index of background start.
baseStop: int
trace index of background stop.
period: int
number of samples between two peaks. Ignored if pulseStart is a 1D-array.
npulses: int
number of pulses. Ignored if pulseStart is a 1D-array.
extra_dim: str
Name given to the dimension along the peaks. Defaults to 'pulseId'.
Returns
-------
xarray DataArray
"""
assert len(traces.shape) == 2
if isinstance(traces, xr.DataArray):
ntid = traces.sizes['trainId']
coords = traces.coords
traces = traces.values
if traces.shape[0] != ntid:
traces = traces.T
else:
coords = None
if hasattr(pulseStart, '__len__'):
pulseStart = np.array(pulseStart)
pulses = pulseStart - pulseStart[0]
pulseStart = pulseStart[0]
else:
pulses = range(0, npulses*period, period)
if extra_dim is None:
extra_dim = 'pulseId'
results = xr.DataArray(np.empty((traces.shape[0], len(pulses))),
coords=coords,
dims=['trainId', extra_dim])
for i, p in enumerate(pulses):
a = pulseStart + p
b = pulseStop + p
bkga = baseStart + p
bkgb = baseStop + p
if b > traces.shape[1]:
break
bg = np.outer(np.median(traces[:, bkga:bkgb], axis=1),
np.ones(b-a))
results[:, i] = np.trapz(traces[:, a:b] - bg, axis=1)
return results
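# Minimal sketch of peaks_from_raw_trace on synthetic traces (all numbers
# are illustrative): integrate 10 pulses of 20 samples each, repeated every
# 96 samples, with the baseline taken from the 10 samples preceding each
# pulse.
#
#   traces = np.random.rand(50, 2000)        # (trains, samples)
#   peaks = peaks_from_raw_trace(traces, pulseStart=50, pulseStop=70,
#                                baseStart=40, baseStop=50,
#                                period=96, npulses=10)
#   # peaks is an xarray DataArray of shape (50, 10) with
#   # dims ('trainId', 'pulseId')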
def peaks_from_apd(array, params, digitizer, bpt, bunchPattern):
"""
Extract peak-integrated data according to the bunch pattern.
Parameters
----------
array: xarray DataArray
2D array containing peak-integrated data
params: dict
peak-integration parameters of the digitizer
digitizer: str
digitizer type, one of {'FastADC', 'ADQ412'}
bpt: xarray DataArray
bunch pattern table
bunchPattern: str
one of {'sase1', 'sase3', 'scs_ppl'}, used to select pulses and
assign name of the pulse id dimension.
Returns
-------
xarray DataArray with pulse id coordinates.
"""
if params['enable'] == 0 or (array == 1.).all():
raise ValueError('The digitizer did not record integrated peaks. '
'Consider using raw traces from the same channel '
'for peak integration.')
if digitizer == 'FastADC':
min_distance = 24
if digitizer == 'ADQ412':
min_distance = 440
period = params['period']
if period % min_distance != 0:
log.warning(f'Warning: the pulse period ({period} samples) of '
'digitizer is not a multiple of the pulse separation at '
f'4.5 MHz ({min_distance} samples). Pulse id assignment '
'is likely to fail.')
stride = int(period/min_distance)
npulses_apd = params['npulses']
dim_names = {'sase3': 'sa3_pId', 'sase1': 'sa1_pId', 'scs_ppl': 'ol_pId'}
pulse_dim = dim_names[bunchPattern]
arr_dim = [dim for dim in array.dims if dim != 'trainId'][0]
if npulses_apd > array.sizes[arr_dim]:
log.warning(f'The digitizer was set to record {npulses_apd} pulses '
f'but the array length is only {array.sizes[arr_dim]}.')
npulses_apd = array.sizes[arr_dim]
mask = is_pulse_at(bpt, bunchPattern).rename({'pulse_slot': pulse_dim})
mask = mask.where(mask.trainId.isin(array.trainId), drop=True)
mask = mask.assign_coords({pulse_dim: np.arange(bpt.sizes['pulse_slot'])})
pid = np.sort(np.unique(np.where(mask)[1]))
npulses_bpt = len(pid)
apd_coords = np.arange(pid[0], pid[0] + stride*npulses_apd, stride)
noalign = False
if len(np.intersect1d(apd_coords, pid, assume_unique=True)) < npulses_bpt:
log.warning('Not all pulses were recorded. The digitizer '
'was set to record pulse ids '
f'{apd_coords[apd_coords<bpt.sizes["pulse_slot"]]} but the '
'bunch pattern for'
f' {bunchPattern} is {pid}. Skipping pulse ID alignment.')
noalign = True
array = array.isel({arr_dim: slice(0, npulses_apd)})
array = array.where(array != 1.)
if noalign:
return array
array = array.rename(
{arr_dim: pulse_dim}).assign_coords({pulse_dim: apd_coords})
array, mask = xr.align(array, mask, join='inner')
array = array.where(mask, drop=True)
return array
def get_peaks(run,
data,
mnemonic,
useRaw=True,
autoFind=True,
integParams=None,
bunchPattern='sase3',
bpt=None,
extra_dim=None,
indices=None,
):
"""
Extract peaks from one source (channel) of a digitizer.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the digitizer data
data: xarray DataArray or str
array containing the raw traces or peak-integrated values from the
digitizer. If str, must be one of the ToolBox mnemonics. If None,
the data is loaded via the source and key arguments.
mnemonic: str or dict
ToolBox mnemonic or dict with source and key as in
{'source': 'SCS_UTC1_ADQ/ADC/1:network',
'key': 'digitizers.channel_1_A.raw.samples'}
useRaw: bool
If True, extract peaks from raw traces. If False, uses the APD (or
peaks) data from the digitizer.
autoFind: bool
If True, finds integration parameters by inspecting the average raw
trace. Only valid if useRaw is True.
integParams: dict
dictionary containing the integration parameters for raw trace
integration: 'pulseStart', 'pulseStop', 'baseStart', 'baseStop',
'period', 'npulses'. Not used if autoFind is True. All keys are
required when bunch pattern is missing.
bunchPattern: str
match the peaks to the bunch pattern: 'sase1', 'sase3', 'scs_ppl'.
This will dictate the name of the pulse ID coordinates: 'sa1_pId',
'sa3_pId' or 'ol_pId'.
bpt: xarray DataArray
bunch pattern table
extra_dim: str
Name given to the dimension along the peaks. If None, the name is given
according to the bunchPattern.
indices: array, slice
indices from the peak-integrated data to retrieve. Only required
when bunch pattern is missing and useRaw is False.
Returns
-------
xarray.DataArray containing digitizer peaks with pulse coordinates
"""
arr = data
dim = [d for d in arr.dims if d != 'trainId'][0]
# Load bunch pattern table
run_mnemonics = mnemonics_for_run(run)
if bpt is None and bunchPattern != 'None':
if 'bunchPatternTable' in run_mnemonics:
m = run_mnemonics['bunchPatternTable']
bpt = run.get_array(m['source'], m['key'], m['dim'])
pattern = bunchPattern
else:
pattern = bunchPattern
if bunchPattern == 'None':
bpt = None
# Find digitizer type
m = mnemonic if isinstance(mnemonic, dict) else run_mnemonics[mnemonic]
digitizer = digitizer_type(run, m.get('source'))
# 1. Peak-integrated data from digitizer
if useRaw is False:
# 1.1 No bunch pattern provided
if bpt is None:
log.info('Missing bunch pattern info.')
if indices is None:
raise TypeError('indices argument must be provided '
'when bunch pattern info is missing.')
if extra_dim is None:
extra_dim = 'pulseId'
return arr.isel({dim: indices}).rename({dim: extra_dim})
# 1.2 Bunch pattern is provided
if isinstance(mnemonic, dict):
peak_params = channel_peak_params(run, mnemonic.get('source'),
mnemonic.get('key'))
else:
peak_params = channel_peak_params(run, mnemonic)
log.debug(f'Digitizer peak integration parameters: {peak_params}')
return peaks_from_apd(arr, peak_params, digitizer, bpt, bunchPattern)
# 2. Use raw data from digitizer
# minimum pulse period @ 4.5MHz, according to digitizer type
min_distance = 1
if digitizer == 'FastADC':
min_distance = 24
if digitizer == 'ADQ412':
min_distance = 440
if autoFind:
stride = int(np.max([1, np.floor(arr.sizes['trainId']/200)]))
trace = arr.isel(trainId=slice(0, None, stride)).mean(dim='trainId')
try:
integParams = find_integ_params(trace)
except ValueError as err:
log.warning(f'{err}, trying with averaged trace over all trains.')
trace = arr.mean(dim='trainId')
integParams = find_integ_params(trace)
log.debug(f'Auto find peaks result: {integParams}')
required_keys = ['pulseStart', 'pulseStop', 'baseStart',
'baseStop', 'period', 'npulses']
if integParams is None or not all(name in integParams
for name in required_keys):
raise TypeError('All keys of integParams argument '
f'{required_keys} are required when '
'bunch pattern info is missing.')
# 2.1. No bunch pattern provided
if bpt is None:
log.info('Missing bunch pattern info.')
log.debug(f'Retrieving {integParams["npulses"]} pulses.')
if extra_dim is None:
extra_dim = 'pulseId'
return peaks_from_raw_trace(arr, integParams['pulseStart'],
integParams['pulseStop'],
integParams['baseStart'],
integParams['baseStop'],
integParams['period'],
integParams['npulses'],
extra_dim=extra_dim)
# 2.2 Bunch pattern is provided
# load mask and extract pulse Id:
dim_names = {'sase3': 'sa3_pId', 'sase1': 'sa1_pId',
'scs_ppl': 'ol_pId', 'None': 'pulseId'}
extra_dim = dim_names[pattern]
valid_tid = np.intersect1d(arr.trainId, bpt.trainId, assume_unique=True)
mask = is_pulse_at(bpt, pattern)
pattern_changed = ~(mask == mask[0]).all().values
if not pattern_changed:
pid = np.nonzero(mask[0].values)[0]
valid_arr = arr
else:
mask = is_pulse_at(bpt.sel(trainId=valid_tid), pattern)
mask = mask.rename({'pulse_slot': extra_dim})
mask = mask.assign_coords(
{extra_dim: np.arange(bpt.sizes['pulse_slot'])})
mask_on = mask.where(mask, drop=True).fillna(False).astype(bool)
log.info(f'Bunch pattern of {pattern} changed during the run.')
pid = mask_on.coords[extra_dim]
# select trains containing pulses
valid_arr = arr.sel(trainId=mask_on.trainId)
npulses = len(pid)
log.debug(f'Bunch pattern: {npulses} pulses for {pattern}.')
if npulses == 1:
period_bpt = 0
else:
period_bpt = np.min(np.diff(pid))
if autoFind and period_bpt*min_distance != integParams['period']:
log.warning('The period from the bunch pattern is different than '
'that found by the peak-finding algorithm. Either '
'the algorithm failed or the bunch pattern source '
f'({bunchPattern}) is not correct.')
# create array of sample indices for peak integration
sample_id = (pid-pid[0])*min_distance
# override auto find parameters
if isinstance(integParams['pulseStart'], (int, np.integer)):
integParams['pulseStart'] = integParams['pulseStart'] + sample_id
peaks = peaks_from_raw_trace(valid_arr, integParams['pulseStart'],
integParams['pulseStop'],
integParams['baseStart'],
integParams['baseStop'],
integParams['period'],
integParams['npulses'],
extra_dim)
if pattern_changed:
peaks = peaks.where(mask_on, drop=True)
return peaks.assign_coords({extra_dim: pid})
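# A hedged sketch of a direct get_peaks call; the 'MCP2raw' mnemonic is
# illustrative. This mirrors what get_digitizer_peaks does internally:
#
#   run_mnemonics = mnemonics_for_run(run)
#   m = run_mnemonics['MCP2raw']
#   data = run.get_array(m['source'], m['key'], m['dim'])
#   peaks = get_peaks(run, data, 'MCP2raw', useRaw=True,
#                     autoFind=True, bunchPattern='sase3')
#   # peaks carries a 'sa3_pId' pulse dimension aligned to the bunch pattern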
def get_dig_avg_trace(run, mnemonic, ntrains=None):
"""
Compute the average over ntrains evenly spaced across all trains
of a digitizer trace.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the digitizer data.
mnemonic: str
ToolBox mnemonic of the digitizer data, e.g. 'MCP2apd'.
ntrains: int
Number of trains used to calculate the average raw trace.
If None, all trains are used.
Returns
-------
trace: DataArray
The average digitizer trace
"""
run_mnemonics = mnemonics_for_run(run)
if ntrains is None:
sel = run
else:
total_tid = len(run.train_ids)
stride = int(np.max([1, np.floor(total_tid/ntrains)]))
sel = run.select_trains(np.s_[0:None:stride])
m = run_mnemonics[mnemonic]
raw_trace = sel.get_array(m['source'], m['key'], m['dim'])
raw_trace = raw_trace.mean(dim='trainId')
return raw_trace
def channel_peak_params(run, source, key=None, channel=None, board=None):
"""
Extract peak-integration parameters used by a channel of the digitizer.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the digitizer data.
source: str
ToolBox mnemonic of a digitizer data, e.g. "MCP2apd" or
"FastADC4peaks", or name of digitizer source, e.g.
'SCS_UTC1_ADQ/ADC/1:network'.
key: str
optional, used in combination with source (if source is not a ToolBox
mnemonic) instead of digitizer, channel and board.
channel: int or str
The digitizer channel for which to retrieve the parameters. If None,
inferred from the source mnemonic or source + key arguments.
board: int
Board of the ADQ412. If None, inferred from the source mnemonic or
source + key arguments.
Returns
-------
dict with peak integration parameters.
"""
run_mnemonics = mnemonics_for_run(run)
if source in run_mnemonics:
m = run_mnemonics[source]
source = m['source']
key = m['key']
if 'network' in source:
return adq412_channel_peak_params(run, source, key, channel, board)
else:
return fastADC_channel_peak_params(run, source, channel)
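# A hedged sketch; the 'MCP2apd' mnemonic is illustrative. Reads the
# peak-integration settings that the digitizer itself used for a channel:
#
#   params = channel_peak_params(run, 'MCP2apd')
#   # e.g. {'baseStart': ..., 'baseStop': ..., 'enable': ...,
#   #       'pulseStart': ..., 'pulseStop': ..., 'npulses': ..., 'period': ...}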
def fastADC_channel_peak_params(run, source, channel=None):
"""
Extract peak-integration parameters used by a channel of the Fast ADC.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the digitizer data.
source: str
Name of Fast ADC source, e.g. 'SCS_UTC1_MCP/ADC/1:channel_5.output'.
channel: int
The Fast ADC channel for which to retrieve the parameters. If None,
inferred from the source.
Returns
-------
dict with peak integration parameters.
"""
fastADC_keys = {
'baseStart': ('baselineSettings.baseStart.value',
'baseStart.value'),
'baseStop': ('baseStop.value',),
'baseLength': ('baselineSettings.length.value',),
'enable': ('enablePeakComputation.value',),
'pulseStart': ('initialDelay.value',),
'pulseLength': ('peakSamples.value',),
'npulses': ('numPulses.value',),
'period': ('pulsePeriod.value',)
}
if channel is None:
channel = int(source.split(':')[1].split('.')[0].split('_')[1])
baseName = f'channel_{channel}.'
source = source.split(':')[0]
data = run.select(source).train_from_index(0)[1][source]
result = {}
for mnemo, versions in fastADC_keys.items():
for v in versions:
key = baseName + v
if key in data:
result[mnemo] = data[key]
if 'baseLength' in result:
result['baseStop'] = result['baseStart'] + \
result.pop('baseLength')
if 'pulseLength' in result:
result['pulseStop'] = result['pulseStart'] + \
result.pop('pulseLength')
return result
def adq412_channel_peak_params(run, source, key=None,
channel=None, board=None):
"""
Extract peak-integration parameters used by a channel of the ADQ412.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the digitizer data.
source: str
Name of ADQ412 source, e.g. 'SCS_UTC1_ADQ/ADC/1:network'.
key: str
optional, e.g. 'digitizers.channel_1_D.apd.pulseIntegral', used in
combination with source instead of channel and board.
channel: int or str
The ADQ412 channel for which to retrieve the parameters. If None,
inferred from the source mnemonic or source + key arguments.
board: int
Board of the ADQ412. If None, inferred from the source mnemonic or
source + key arguments.
Returns
-------
dict with peak integration parameters.
"""
if key is None:
if channel is None or board is None:
raise ValueError('key or channel + board arguments are '
'missing.')
else:
k = key.split('.')[1].split('_')
ch_to_int = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
channel = ch_to_int[k[2]]
board = k[1]
baseName = f'board{board}.apd.channel_{channel}.'
source = source.split(':')[0]
adq412_keys = {
'baseStart': (baseName + 'baseStart.value',),
'baseStop': (baseName + 'baseStop.value',),
'enable': (baseName + 'enable.value',),
'pulseStart': (baseName + 'pulseStart.value',),
'pulseStop': (baseName + 'pulseStop.value',),
'initialDelay': (baseName + 'initialDelay.value',),
'upperLimit': (baseName + 'upperLimit.value',),
'npulses': (f'board{board}.apd.numberOfPulses.value',)
}
data = run.select(source).train_from_index(0)[1][source]
result = {}
for mnemo, versions in adq412_keys.items():
for key in versions:
if key in data:
result[mnemo] = data[key]
result['period'] = result.pop('upperLimit') - \
result.pop('initialDelay')
return result
def find_integ_params(trace, height=None, width=1):
"""
Find integration parameters for peak integration of a raw
digitizer trace. Based on scipy find_peaks().
Parameters
----------
trace: numpy array or xarray DataArray
The digitizer raw trace used to find peaks
height: int
minimum threshold for peak determination
width: int
minimum width of peak
Returns
-------
dict with keys 'pulseStart', 'pulseStop', 'baseStart', 'baseStop',
'period', 'npulses' and values in number of samples.
"""
if isinstance(trace, xr.DataArray):
trace = trace.values
trace_norm = trace - np.median(trace)
trace_norm = -trace_norm if np.mean(trace_norm) < 0 else trace_norm
SNR = np.max(np.abs(trace_norm)) / np.std(trace_norm[:100])
if SNR < 10:
log.warning('signal-to-noise ratio too low: cannot '
'automatically find peaks.')
return {'pulseStart': 2, 'pulseStop': 3,
'baseStart': 0, 'baseStop': 1,
'period': 0, 'npulses': 1}
# Compute autocorrelation using fftconvolve
# from "https://stackoverflow.com/questions/12323959/"
# "fast-cross-correlation-method-in-python"
if len(trace_norm) % 2 == 0:
n = len(trace_norm)
else:
n = len(trace_norm) - 1
hl = int(n/2)
a = trace_norm[:n]
b = np.zeros(2*n)
b[hl: hl + n] = a
# Do an array flipped convolution, which is a correlation.
ac_trace = fftconvolve(b, a[::-1], mode='valid')
# slower approach:
# ac_trace = np.correlate(trace_norm, trace_norm, mode='same')
n = int(len(ac_trace)/2)
# estimate quality of ac_trace and define min height to detect peaks
factor = np.abs(ac_trace.max() / np.median(ac_trace))
factor = 3 if factor > 20 else 1.5
ac_peaks = find_peaks(ac_trace[n:], height=ac_trace[n:].max()/factor)
if len(ac_peaks[0]) == 0:
period = 0
distance = 1
elif len(ac_peaks[0]) == 1:
period = ac_peaks[0][0]
distance = max(1, period-6)
else:
period = int(np.median(ac_peaks[0][1:] - ac_peaks[0][:-1]))
distance = max(1, period-6) # smaller than period to account for all peaks
# define min height to detect peaks depending on signal quality
f = np.max([3, np.min([(SNR/10), 4])])
height = trace_norm.max() / f
peaks, dic = find_peaks(trace_norm, distance=distance,
height=height, width=1)
# filter out peaks that are not periodically spaced
idx = np.ones(len(peaks), dtype=bool)
idx[1:] = np.isclose(peaks[1:] - peaks[:-1],
np.ones(len(peaks[1:]))*period, atol=6)
peaks = peaks[idx]
for d in dic:
dic[d] = dic[d][idx]
npulses = len(peaks)
if npulses == 0:
raise ValueError('Could not automatically find peaks.')
pulseStart = np.mean(dic['left_ips'] - peaks + peaks[0], dtype=int)
pulseStop = np.mean(dic['right_ips'] - peaks + peaks[0], dtype=int)
baseStop = pulseStart - np.mean(peaks - dic['left_ips'])/2 - 1
baseStop = np.min([baseStop, pulseStart]).astype(int)
baseStop = np.max([baseStop, 0]).astype(int)
baseStart = baseStop - np.mean(dic['widths'])/2
baseStart = np.max([baseStart, 0]).astype(int)
result = {'pulseStart': pulseStart, 'pulseStop': pulseStop,
'baseStart': baseStart, 'baseStop': baseStop,
'period': period, 'npulses': npulses}
return result
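# A hedged sketch of the automatic peak finding on an averaged raw trace;
# the 'FastADC2raw' mnemonic and ntrains value are illustrative:
#
#   trace = get_dig_avg_trace(run, 'FastADC2raw', ntrains=200)
#   params = find_integ_params(trace)
#   # params holds 'pulseStart', 'pulseStop', 'baseStart', 'baseStop',
#   # 'period' and 'npulses', all in digitizer samples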
def get_peak_params(run, mnemonic, raw_trace=None, ntrains=None):
"""
Get the peak region and baseline region of a raw digitizer trace used
to compute the peak integration. These regions are either those of the
digitizer peak-integration settings or those determined in get_tim_peaks
or get_fast_adc_peaks when the inputs are raw traces.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the digitizer data.
mnemonic: str
ToolBox mnemonic of the digitizer data, e.g. 'MCP2apd'.
raw_trace: optional, 1D numpy array or xarray DataArray
Raw trace to display. If None, the average raw trace over
ntrains of the corresponding channel is loaded (this can
be time-consuming).
ntrains: optional, int
Only used if raw_trace is None. Number of trains used to
calculate the average raw trace of the corresponding channel.
"""
run_mnemonics = mnemonics_for_run(run)
if mnemonic not in run_mnemonics:
raise ToolBoxValueError("mnemonic must be a ToolBox mnemonic")
if "raw" not in mnemonic:
params = channel_peak_params(run, mnemonic)
if 'enable' in params and params['enable'] == 0:
log.warning('The digitizer did not record peak-integrated data.')
else:
if raw_trace is None:
raw_trace = get_dig_avg_trace(run, mnemonic, ntrains)
params = find_integ_params(raw_trace)
return params
def find_peak_integration_parameters(run, mnemonic, raw_trace=None,
integParams=None, pattern=None):
'''
Finds peak integration parameters.
'''
run_mnemonics = mnemonics_for_run(run)
digitizer = digitizer_type(run, run_mnemonics[mnemonic]['source'])
pulse_period = 24 if digitizer == "FastADC" else 440
if integParams is None: # load raw trace and find peaks
autoFind = True
if raw_trace is None:
raw_trace = get_dig_avg_trace(run, mnemonic)
params = get_peak_params(run, mnemonic, raw_trace)
else: # inspect provided parameters
autoFind = False
required_keys = ['pulseStart', 'pulseStop', 'baseStart',
'baseStop']
add_text = ''
if pattern is None and not hasattr(integParams['pulseStart'],
'__len__'):
required_keys += ['period', 'npulses']
add_text = 'Bunch pattern not provided. '
if not all(name in integParams for name in required_keys):
raise ValueError(add_text + 'All keys of integParams argument '
f'{required_keys} are required.')
params = integParams.copy()
# extract the pulse ids from the parameters (starting at 0)
if hasattr(params['pulseStart'], '__len__'):
if params.get('npulses') is not None and (
params.get('npulses') != len(params['pulseStart']) ):
log.warning('The number of pulses does not match the length '
'of pulseStart. Using length of pulseStart as '
'the number of pulses.')
params['npulses'] = len(params['pulseStart'])
pulse_ids_params = ((np.array(params['pulseStart'])
- params['pulseStart'][0]) / pulse_period).astype(int)
else:
pulse_ids_params = np.arange(0, params['npulses'] * params['period'] / pulse_period,
params['period'] / pulse_period).astype(int)
# Extract pulse_ids and npulses from bunch pattern
pulse_ids, npulses, period = None, None, None
regular = True
if pattern is not None:
bunchPattern = 'sase3' if hasattr(pattern, 'sase') else 'scs_ppl'
if pattern.is_constant_pattern() is False:
log.info('The number of pulses changed during the run.')
pulse_ids = np.unique(pattern.pulse_ids(labelled=False, copy=False))
npulses, period = None, None
regular = False
else:
pulse_ids = pattern.peek_pulse_ids(labelled=False)
npulses = len(pulse_ids)
if npulses > 1:
periods = np.diff(pulse_ids)
if len(np.unique(periods)) > 1:
regular = False
else:
period = np.unique(periods)[0] * pulse_period
# Compare parameters with bunch pattern
if len(pulse_ids_params) == len(pulse_ids):
if not (pulse_ids_params == pulse_ids - pulse_ids[0]).all():
log.warning('The provided pulseStart parameters do not match the '
f'{bunchPattern} bunch pattern pulse ids. Using '
'bunch pattern parameters.')
pulse_ids_params = pulse_ids
if (npulses != params.get('npulses') or
period != params.get('period')):
if autoFind:
add_text = 'Automatically found '
else:
add_text = 'Provided '
log.warning(add_text + 'integration parameters '
f'(npulses={params.get("npulses")}, '
f'period={params.get("period")}) do not match the '
f'{bunchPattern} bunch pattern (npulses={npulses}, '
f'period={period}). Using bunch pattern parameters.')
pulse_ids_params = pulse_ids
params['npulses'] = npulses
params['period'] = period
if not hasattr(params['pulseStart'], '__len__'):
start = params['pulseStart']
else:
start = params['pulseStart'][0]
params['pulseStart'] = np.array([int(start + (pid - pulse_ids_params[0]) * pulse_period)
for pid in pulse_ids_params])
return params, pulse_ids_params, regular, raw_trace
def check_peak_params(run, mnemonic, raw_trace=None, ntrains=200, params=None,
plot=True, show_all=False, bunchPattern='sase3'):
"""
Checks and plots the peak parameters (pulse window and baseline window
of a raw digitizer trace) used to compute the peak integration. These
parameters are either set by the digitizer peak-integration settings,
or are determined by a peak finding algorithm (used in get_tim_peaks
or get_fast_adc_peaks) when the inputs are raw traces. The parameters
can also be provided manually for visual inspection. The plot either
shows the first and last pulse of the trace or the entire trace.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the digitizer data.
mnemonic: str
ToolBox mnemonic of the digitizer data, e.g. 'MCP2apd'.
raw_trace: optional, 1D numpy array or xarray DataArray
Raw trace to display. If None, the average raw trace over
ntrains of the corresponding channel is loaded (this can
be time-consuming).
ntrains: optional, int
Only used if raw_trace is None. Number of trains used to
calculate the average raw trace of the corresponding channel.
plot: bool
If True, displays the raw trace and peak integration regions.
show_all: bool
If True, displays the entire raw trace and all peak integration
regions (this can be time-consuming).
If False, shows the first and last pulse according to the bunchPattern.
bunchPattern: optional, str
Only used if plot is True. Checks the bunch pattern against
the digitizer peak parameters and shows potential mismatch.
Returns
-------
dictionary of peak integration parameters
"""
'''
run_mnemonics = mnemonics_for_run(run)
if raw_trace is None:
raw_trace = get_dig_avg_trace(run, mnemonic, ntrains)
if params is None:
integParams = get_peak_params(run, mnemonic, raw_trace)
title = 'Auto-find peak params'
else:
title = 'Provided peak params'
integParams = params.copy()
if 'enable' in integParams and integParams['enable'] == 0:
log.warning('The digitizer did not record peak-integrated data.')
if not plot:
return integParams
digitizer = digitizer_type(run, run_mnemonics[mnemonic]['source'])
pulse_period = 24 if digitizer == "FastADC" else 440
pattern = None
try:
if bunchPattern == 'sase3':
pattern = XrayPulses(run)
if bunchPattern == 'scs_ppl':
pattern = OpticalLaserPulses(run)
except Exception as e:
log.warning(e)
bunchPattern = None
# Extract pulse_ids, npulses and period from bunch pattern
pulse_ids, npulses, period = None, None, None
regular = True
if pattern is not None:
if pattern.is_constant_pattern() is False:
log.info('The number of pulses changed during the run.')
pulse_ids = np.unique(pattern.pulse_ids(labelled=False, copy=False))
npulses, period = None, None
regular = False
else:
pulse_ids = pattern.peek_pulse_ids(labelled=False)
npulses = len(pulse_ids)
if npulses > 1:
periods = np.diff(pulse_ids)
if len(np.unique(periods)) > 1:
regular = False
else:
period = np.unique(periods)[0] * pulse_period
bp_params = {}
bp_params['npulses'] = npulses
bp_params['period'] = period
bp_params['pulse_ids'] = pulse_ids
bp_params['regular'] = regular
if regular:
print(f'bunch pattern {bunchPattern}: {bp_params["npulses"]} pulses,'
f' {bp_params["period"]} samples between two pulses')
else:
print(f'bunch pattern {bunchPattern}: Not a regular pattern. '
f'{bp_params["npulses"]} pulses, pulse_ids='
f'{bp_params["pulse_ids"]}.')
if (npulses != integParams.get('npulses') or
period != integParams.get('period')):
log.warning(f'Integration parameters '
f'(npulses={integParams.get("npulses")}, '
f'period={integParams.get("period")}) do not match the '
f'the bunch pattern (npulses={npulses}, '
f'period={period}). Using bunch pattern parameters.')
integParams['npulses'] = npulses
integParams['period'] = period
start = integParams['pulseStart']
integParams['pulseStart'] = [start + (pid - pulse_ids[0]) * pulse_period
for pid in pulse_ids]
else:
bp_params = None
print(f'{title}: {integParams["npulses"]} pulses, {integParams["period"]}'
' samples between two pulses')
fig, ax = plotPeakIntegrationWindow(raw_trace, integParams, bp_params, show_all)
ax[0].set_ylabel(mnemonic.replace('peaks', 'raw').replace('apd', 'raw'))
fig.suptitle(title, size=12)
'''
run_mnemonics = mnemonics_for_run(run)
digitizer = digitizer_type(run, run_mnemonics[mnemonic]['source'])
pulse_period = 24 if digitizer == "FastADC" else 440
if params is None:
title = 'Auto-find peak parameters'
else:
title = 'Provided peak parameters'
pattern = None
try:
if bunchPattern == 'sase3':
pattern = XrayPulses(run)
if bunchPattern == 'scs_ppl':
pattern = OpticalLaserPulses(run)
except Exception as e:
log.warning(e)
bunchPattern = None
integParams, pulse_ids, regular, raw_trace = find_peak_integration_parameters(
run, mnemonic, raw_trace, params, pattern)
if bunchPattern:
if regular:
print(f'bunch pattern {bunchPattern}: {len(pulse_ids)} pulses,'
f' {(pulse_ids[1] - pulse_ids[0]) * pulse_period} samples between two pulses')
print(title + ': ' + f' {integParams["npulses"]} pulses, '
f'{integParams["period"]} samples between two pulses')
else:
print(f'bunch pattern {bunchPattern}: Not a regular pattern. '
f'{len(pulse_ids)} pulses, pulse_ids='
f'{pulse_ids}.')
if plot:
if raw_trace is None:
raw_trace = get_dig_avg_trace(run, mnemonic)
fig, ax = plotPeakIntegrationWindow(raw_trace, integParams,
show_all=show_all)
fig.suptitle(title, size=12)
return integParams
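# A hedged sketch; the 'MCP2apd' mnemonic is illustrative. Visually check
# the integration windows against the sase3 bunch pattern:
#
#   params = check_peak_params(run, 'MCP2apd', bunchPattern='sase3')
#   # prints the bunch-pattern and integration parameters and, with
#   # plot=True (default), shows the first and last integration windows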
def plotPeakIntegrationWindow(raw_trace, params, show_all=False):
if hasattr(params['pulseStart'], '__len__'):
pulseStarts = np.array(params['pulseStart'])
pulseStops = params['pulseStop'] + pulseStarts - params['pulseStart'][0]
baseStarts = params['baseStart'] + pulseStarts - params['pulseStart'][0]
baseStops = params['baseStop'] + pulseStarts - params['pulseStart'][0]
else:
n = params['npulses']
p = params['period']
pulseStarts = [params['pulseStart'] + i*p for i in range(n)]
pulseStops = [params['pulseStop'] + i*p for i in range(n)]
baseStarts = [params['baseStart'] + i*p for i in range(n)]
baseStops = [params['baseStop'] + i*p for i in range(n)]
if show_all:
fig, ax = plt.subplots(figsize=(6, 3), constrained_layout=True)
for i in range(len(pulseStarts)):
lbl = 'baseline' if i == 0 else None
lp = 'peak' if i == 0 else None
ax.axvline(baseStarts[i], ls='--', color='k')
ax.axvline(baseStops[i], ls='--', color='k')
ax.axvspan(baseStarts[i], baseStops[i],
alpha=0.5, color='grey', label=lbl)
ax.axvline(pulseStarts[i], ls='--', color='r')
ax.axvline(pulseStops[i], ls='--', color='r')
ax.axvspan(pulseStarts[i], pulseStops[i],
alpha=0.2, color='r', label=lp)
ax.plot(raw_trace, color='C0', label='raw trace')
ax.legend(fontsize=8)
return fig, [ax]
fig, ax = plt.subplots(1, 2, figsize=(6, 3), constrained_layout=True)
for plot in range(2):
title = 'First pulse' if plot == 0 else 'Last pulse'
i = 0 if plot == 0 else -1
ax[plot].axvline(baseStarts[i], ls='--', color='k')
ax[plot].axvline(baseStops[i], ls='--', color='k')
ax[plot].axvspan(baseStarts[i], baseStops[i],
alpha=0.5, color='grey', label='baseline')
ax[plot].axvline(pulseStarts[i], ls='--', color='r')
ax[plot].axvline(pulseStops[i], ls='--', color='r')
ax[plot].axvspan(pulseStarts[i], pulseStops[i],
alpha=0.2, color='r', label='peak')
xmin = np.max([0, baseStarts[i]-200])
xmax = np.min([pulseStops[i]+200, raw_trace.size])
ax[plot].plot(np.arange(xmin, xmax), raw_trace[xmin:xmax], color='C0',
label=title)
ax[plot].legend(fontsize=8)
ax[plot].set_xlim(xmin, xmax)
ax[plot].set_xlabel('digitizer samples')
ax[plot].set_title(title, size=10)
return fig, ax
def digitizer_type(run, source):
"""
Finds the digitizer type based on the class Id / name of the source.
Example source: 'SCS_UTC1_MCP/ADC/1'. Defaults to ADQ412 if not found.
"""
ret = None
if '_MCP/ADC/1' in source:
ret = 'FastADC'
if '_ADQ/ADC/1' in source:
ret = 'ADQ412'
if ret is None:
digi_dict = {'FastAdc': 'FastADC',
'FastAdcLegacy': 'FastADC',
'AdqDigitizer': 'ADQ412',
'PyADCChannel': 'FastADC',
'PyADCChannelLegacy': 'FastADC'}
try:
source = source.split(':')[0]
classId = run.get_run_value(source, 'classId.value')
ret = digi_dict.get(classId)
except Exception as e:
log.warning(str(e))
log.warning(f'Could not find digitizer type from source {source}.')
ret = 'ADQ412'
return ret
def get_tim_peaks(run, mnemonic=None, merge_with=None,
bunchPattern='sase3', integParams=None,
keepAllSase=False):
"""
Automatically computes TIM peaks. Sources can be loaded on the
fly via the mnemonics argument, or processed from an existing data
set (merge_with). The bunch pattern table is used to assign the
pulse id coordinates.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the digitizer data.
mnemonic: str
mnemonics for TIM, e.g. "MCP2apd".
merge_with: xarray Dataset
If provided, the resulting Dataset will be merged with this
one. The TIM variables of merge_with (if any) will also be
computed and merged.
bunchPattern: str
'sase1' or 'sase3' or 'scs_ppl', bunch pattern
used to extract peaks. The pulse ID dimension will be named
'sa1_pId', 'sa3_pId' or 'ol_pId', respectively.
integParams: dict
dictionary for raw trace integration, e.g.
{'pulseStart':100, 'pulseStop':200, 'baseStart':50,
'baseStop':99, 'period':24, 'npulses':500}.
If None, integration parameters are computed automatically.
keepAllSase: bool
Only relevant in case of sase-dedicated trains. If True, all
trains are kept, else only those of the bunchPattern are kept.
Returns
-------
xarray Dataset with TIM variables substituted by
the calculated peak values (e.g. "MCP2raw" becomes
"MCP2peaks"), merged with Dataset *merge_with* if provided.
"""
return get_digitizer_peaks(run, mnemonic, merge_with,
bunchPattern, integParams,
keepAllSase)
def get_laser_peaks(run, mnemonic=None, merge_with=None,
bunchPattern='scs_ppl', integParams=None):
"""
Extracts laser photodiode signal (peak intensity) from Fast ADC
digitizer. Sources can be loaded on the fly via the mnemonics
argument, and/or processed from an existing data set (merge_with).
The PP laser bunch pattern is used to assign the pulse id coordinates.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the digitizer data.
mnemonic: str
mnemonic for FastADC corresponding to laser signal, e.g.
"FastADC2peaks" or 'I0_ILHraw'.
merge_with: xarray Dataset
If provided, the resulting Dataset will be merged with this
one. The FastADC variables of merge_with (if any) will also be
computed and merged.
bunchPattern: str
'sase1' or 'sase3' or 'scs_ppl', bunch pattern
used to extract peaks.
integParams: dict
dictionary for raw trace integration, e.g.
{'pulseStart':100, 'pulseStop':200, 'baseStart':50,
'baseStop':99, 'period':24, 'npulses':500}.
If None, integration parameters are computed
automatically.
Returns
-------
xarray Dataset with all Fast ADC variables substituted by
the calculated peak values (e.g. "FastADC2raw" becomes
"FastADC2peaks").
"""
return get_digitizer_peaks(run, mnemonic, merge_with,
bunchPattern, integParams, False)
def get_digitizer_peaks(run, mnemonic, merge_with=None,
bunchPattern='sase3', integParams=None,
keepAllSase=False):
"""
Automatically computes digitizer peaks. A source can be loaded on the
fly via the mnemonic argument, or processed from an existing data set
(merge_with). The bunch pattern table is used to assign the pulse
id coordinates.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the digitizer data.
mnemonic: str
mnemonic for FastADC or ADQ412, e.g. "I0_ILHraw" or "MCP3apd".
The data is either loaded from the DataCollection or taken from
merge_with.
merge_with: xarray Dataset
If provided, the resulting Dataset will be merged with this one.
bunchPattern: str or dict
'sase1', 'sase3', 'scs_ppl' or 'None': bunch pattern used to extract peaks.
integParams: dict
dictionary for raw trace integration, e.g.
{'pulseStart':100, 'pulseStop':200, 'baseStart':50,
'baseStop':99, 'period':24, 'npulses':500}.
If None, integration parameters are computed automatically.
keepAllSase: bool
Only relevant in case of sase-dedicated trains. If True, all
trains are kept, else only those of the bunchPattern are kept.
Returns
-------
xarray Dataset with digitizer peak variables. Raw variables are
substituted by the calculated peak values (e.g. "FastADC2raw" becomes
"FastADC2peaks").
"""
run_mnemonics = mnemonics_for_run(run)
if mnemonic not in run_mnemonics:
log.warning('Mnemonic not found in run. Skipping.')
return merge_with
if bool(merge_with) and mnemonic in merge_with:
for d in merge_with[mnemonic].dims:
if d in ['sa3_pId', 'ol_pId']:
log.warning(f'{mnemonic} already extracted. '
'Skipping.')
return merge_with
# check if bunch pattern table exists
bpt = None
if bool(merge_with) and 'bunchPatternTable' in merge_with:
bpt = merge_with['bunchPatternTable']
log.debug('Using bpt from merge_with dataset.')
elif 'bunchPatternTable' in run_mnemonics:
m = run_mnemonics['bunchPatternTable']
bpt = run.get_array(m['source'], m['key'], m['dim'])
log.debug('Loaded bpt from DataCollection.')
elif 'bunchPatternTable_SA3' in run_mnemonics:
m = run_mnemonics['bunchPatternTable_SA3']
bpt = run.get_array(m['source'], m['key'], m['dim'])
log.debug('Loaded bpt from DataCollection.')
else:
log.warning('Could not load bunch pattern table.')
# prepare resulting dataset
if bool(merge_with):
mw_ds = merge_with.drop(mnemonic, errors='ignore')
else:
mw_ds = xr.Dataset()
# iterate over mnemonics and merge arrays in dataset
autoFind = True if integParams is None else False
m = run_mnemonics[mnemonic]
useRaw = True if 'raw' in mnemonic else False
if bool(merge_with) and mnemonic in merge_with:
data = merge_with[mnemonic]
else:
data = run.get_array(m['source'], m['key'], m['dim'])
peaks = get_peaks(run, data, mnemonic,
useRaw=useRaw,
autoFind=autoFind,
integParams=integParams,
bunchPattern=bunchPattern,
bpt=bpt)
name = mnemonic.replace('raw', 'peaks').replace('apd', 'peaks')
join = 'outer' if keepAllSase else 'inner'
ds = mw_ds.merge(peaks.rename(name), join=join)
return ds
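# A hedged usage sketch; the proposal/run numbers and mnemonic are
# illustrative:
#
#   import toolbox_scs as tb
#   run = tb.open_run(2711, 303)
#   ds = tb.get_digitizer_peaks(run, 'MCP2apd', bunchPattern='sase3')
#   # ds contains 'MCP2peaks' with a 'sa3_pId' pulse-id dimension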
def digitizer_signal_description(run, digitizer=None):
"""
Check for the existence of signal description and return all corresponding
channels in a dictionary.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the digitizer data.
digitizer: str or list of str (default=None)
Name of the digitizer: one in ['FastADC', 'FastADC2',
'ADQ412', 'ADQ412_2'].
If None, all digitizers are used.
Returns
-------
signal_description: dictionary containing the signal description of
the digitizer channels.
Example
-------
import toolbox_scs as tb
run = tb.open_run(3481, 100)
signals = tb.digitizer_signal_description(run)
signals_fadc2 = tb.digitizer_signal_description(run, 'FastADC2')
"""
if digitizer not in [None, 'FastADC', 'FastADC2', 'ADQ412', 'ADQ412_2']:
raise ValueError('digitizer must be one in '
'["FastADC", "FastADC2", "ADQ412", "ADQ412_2"]')
if digitizer is None:
digitizer = ['FastADC', 'FastADC2', 'ADQ412', 'ADQ412_2']
else:
digitizer = [digitizer]
def key_fadc(i):
if i > 9:
return None
return f'channel_{i}.signalDescription.value'
def key_adq412(i):
if i > 3:
return None
return f'board1.channel_{i}.description.value'
sources = {'FastADC': ['SCS_UTC1_MCP/ADC/1', key_fadc],
'FastADC2': ['SCS_UTC2_FADC/ADC/1', key_fadc],
'ADQ412': ['SCS_UTC1_ADQ/ADC/1', key_adq412],
'ADQ412_2': ['SCS_UTC2_ADQ/ADC/1', key_adq412]}
res = {}
for d in digitizer:
if sources[d][0] not in run.all_sources:
continue
if sources[d][1](0) not in run.get_run_values(sources[d][0]):
raise ValueError('No signal description available for '
f'{d}: {sources[d][0]}')
for ch in range(10):
val = sources[d][1](ch)
if val is None:
break
desc = run.get_run_value(sources[d][0], val)
res[f'{d}_Ch{ch}'] = desc
return res
def calibrateTIM(data, rollingWindow=200, mcp=1, plot=False, use_apd=True, intstart=None,
intstop=None, bkgstart=None, bkgstop=None, t_offset=None, npulses_apd=None):
''' Calibrate TIM signal (Peak-integrated signal) to the slow ion signal of SCS_XGM
(photocurrent read by Keithley, channel 'pulseEnergy.photonFlux.value').
The aim is to find F so that E_tim_peak[uJ] = F x TIM_peak. For this, we want to
match the SASE3-only average TIM pulse peak per train (TIM_avg) to the slow XGM
signal E_slow.
Since E_slow is the average energy per pulse over all SASE1 and SASE3
pulses (N1 and N3), we first extract the relative contribution C of the SASE3 pulses
by looking at the pulse-resolved signals of the SA3_XGM in the tunnel.
There, the signal of SASE1 is usually strong enough to be above noise level.
Let TIM_avg be the average of the TIM pulses (SASE3 only).
The calibration factor is then defined as: F = E_slow * C * (N1+N3) / ( N3 * TIM_avg ).
If N3 changes during the run, we locate the indices for which N3 is maximum and define
a window in which to apply the calibration (indices start/stop).
Warning: the calibration does not include the transmission by the KB mirrors!
Inputs:
data: xarray Dataset
rollingWindow: length of running average to calculate TIM_avg
mcp: MCP channel
plot: boolean. If True, plot calibration results.
use_apd: boolean. If False, the TIM pulse peaks are extracted from raw traces using
getTIMapd
intstart: trace index of integration start
intstop: trace index of integration stop
bkgstart: trace index of background start
bkgstop: trace index of background stop
t_offset: index separation between two pulses
npulses_apd: number of pulses
Output:
F: float, TIM calibration factor.
'''
start = 0
stop = None
npulses = data['npulses_sase3']
ntrains = npulses.shape[0]
if not np.all(npulses == npulses[0]):
start = np.argmax(npulses.values)
stop = ntrains + np.argmax(npulses.values[::-1]) - 1
if stop - start < rollingWindow:
print('not enough consecutive data points with the largest number of pulses per train')
start += rollingWindow
stop = np.min((ntrains, stop+rollingWindow))
filteredTIM = getTIMapd(data, mcp, use_apd, intstart, intstop, bkgstart, bkgstop, t_offset, npulses_apd)
sa3contrib = saseContribution(data, 'sase3', 'XTD10_XGM')
avgFast = filteredTIM.mean(axis=1).rolling(trainId=rollingWindow).mean()
ratio = ((data['npulses_sase3']+data['npulses_sase1']) *
data['SCS_photonFlux'] * sa3contrib) / (avgFast*data['npulses_sase3'])
F = float(ratio[start:stop].median().values)
if plot:
fig = plt.figure(figsize=(8,5))
ax = plt.subplot(211)
ax.set_title('E[uJ] = {:2e} x TIM (MCP{})'.format(F, mcp))
ax.plot(data['SCS_photonFlux'], label='SCS XGM slow (all SASE)', color='C0')
slow_avg_sase3 = data['SCS_photonFlux']*(data['npulses_sase1']
+data['npulses_sase3'])*sa3contrib/data['npulses_sase3']
ax.plot(slow_avg_sase3, label='SCS XGM slow (SASE3 only)', color='C1')
ax.plot(avgFast*F, label='Calibrated TIM rolling avg', color='C2')
ax.legend(loc='upper left', fontsize=8)
ax.set_ylabel('Energy [$\mu$J]', size=10)
ax.plot(filteredTIM.mean(axis=1)*F, label='Calibrated TIM train avg', alpha=0.2, color='C9')
ax.legend(loc='best', fontsize=8, ncol=2)
plt.xlabel('train in run')
ax = plt.subplot(234)
xgm_fast = selectSASEinXGM(data)
ax.scatter(filteredTIM, xgm_fast, s=5, alpha=0.1, rasterized=True)
fit, cov = np.polyfit(filteredTIM.values.flatten(),xgm_fast.values.flatten(),1, cov=True)
y=np.poly1d(fit)
x=np.linspace(filteredTIM.min(), filteredTIM.max(), 10)
ax.plot(x, y(x), lw=2, color='r')
ax.set_ylabel('Raw HAMP [$\mu$J]', size=10)
ax.set_xlabel('TIM (MCP{}) signal'.format(mcp), size=10)
ax.annotate(s='y(x) = F x + A\n'+
'F = %.3e\n$\Delta$F/F = %.2e\n'%(fit[0],np.abs(np.sqrt(cov[0,0])/fit[0]))+
'A = %.3e'%fit[1],
xy=(0.5,0.6), xycoords='axes fraction', fontsize=10, color='r')
print('TIM calibration factor: %e'%(F))
ax = plt.subplot(235)
ax.hist(filteredTIM.values.flatten()*F, bins=50, rwidth=0.8)
ax.set_ylabel('number of pulses', size=10)
ax.set_xlabel('Pulse energy MCP{} [uJ]'.format(mcp), size=10)
ax.set_yscale('log')
ax = plt.subplot(236)
if not use_apd:
pulseStart = intstart
pulseStop = intstop
else:
pulseStart = data.attrs['run'].get_array(
'SCS_UTC1_ADQ/ADC/1', 'board1.apd.channel_0.pulseStart.value')[0].values
pulseStop = data.attrs['run'].get_array(
'SCS_UTC1_ADQ/ADC/1', 'board1.apd.channel_0.pulseStop.value')[0].values
if 'MCP{}raw'.format(mcp) not in data:
tid, data = data.attrs['run'].train_from_index(0)
trace = data['SCS_UTC1_ADQ/ADC/1:network']['digitizers.channel_1_D.raw.samples']
print('no raw data for MCP{}. Loading trace from MCP1'.format(mcp))
label_trace='MCP1 Voltage [V]'
else:
trace = data['MCP{}raw'.format(mcp)][0]
label_trace='MCP{} Voltage [V]'.format(mcp)
ax.plot(trace[:pulseStop+25], 'o-', ms=2, label='trace')
ax.axvspan(pulseStart, pulseStop, color='C2', alpha=0.2, label='APD region')
ax.axvline(pulseStart, color='gray', ls='--')
ax.axvline(pulseStop, color='gray', ls='--')
ax.set_xlim(pulseStart - 25, pulseStop + 25)
ax.set_ylabel(label_trace, size=10)
ax.set_xlabel('sample #', size=10)
ax.legend(fontsize=8)
return F
''' TIM calibration table
Dict with key= photon energy and value= array of polynomial coefficients for each MCP (1,2,3).
The polynomials correspond to a fit of the logarithm of the calibration factor as a function
of MCP voltage. If P is a polynomial and V the MCP voltage, the calibration factor (in microjoule
per APD signal) is given by -exp(P(V)).
This table was generated from the calibration of March 2019, proposal 900074, semester 201930,
runs 69 - 111 (Ni edge): https://in.xfel.eu/elog/SCS+Beamline/2323
runs 113 - 153 (Co edge): https://in.xfel.eu/elog/SCS+Beamline/2334
runs 163 - 208 (Fe edge): https://in.xfel.eu/elog/SCS+Beamline/2349
'''
tim_calibration_table = {
705.5: np.array([
[-6.85344690e-12, 5.00931986e-08, -1.27206912e-04, 1.15596821e-01, -3.15215367e+01],
[ 1.25613942e-11, -5.41566381e-08, 8.28161004e-05, -7.27230153e-02, 3.10984925e+01],
[ 1.14094964e-12, 7.72658935e-09, -4.27504907e-05, 4.07253378e-02, -7.00773062e+00]]),
779: np.array([
[ 4.57610777e-12, -2.33282497e-08, 4.65978738e-05, -6.43305156e-02, 3.73958623e+01],
[ 2.96325102e-11, -1.61393276e-07, 3.32600044e-04, -3.28468195e-01, 1.28328844e+02],
[ 1.14521506e-11, -5.81980336e-08, 1.12518434e-04, -1.19072484e-01, 5.37601559e+01]]),
851: np.array([
[ 3.15774215e-11, -1.71452934e-07, 3.50316512e-04, -3.40098861e-01, 1.31064501e+02],
[5.36341958e-11, -2.92533156e-07, 6.00574534e-04, -5.71083140e-01, 2.10547161e+02],
[ 3.69445588e-11, -1.97731342e-07, 3.98203522e-04, -3.78338599e-01, 1.41894119e+02]])
}
def timFactorFromTable(voltage, photonEnergy, mcp=1):
''' Returns an energy calibration factor for TIM integrated peak signal (APD)
according to calibration from March 2019, proposal 900074, semester 201930,
runs 69 - 111 (Ni edge): https://in.xfel.eu/elog/SCS+Beamline/2323
runs 113 - 153 (Co edge): https://in.xfel.eu/elog/SCS+Beamline/2334
runs 163 - 208 (Fe edge): https://in.xfel.eu/elog/SCS+Beamline/2349
Uses the tim_calibration_table declared above.
Inputs:
voltage: MCP voltage in volts.
photonEnergy: FEL photon energy in eV. Calibration factor is linearly
interpolated between the known values from the calibration table.
mcp: MCP channel (1, 2, or 3).
Output:
f: calibration factor in microjoule per APD signal
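Example:
# illustrative values only; 780 eV falls between the tabulated 705.5 and
# 851 eV entries, so the factor is linearly interpolated
f = timFactorFromTable(voltage=2000, photonEnergy=780, mcp=1)
# pulse energy [uJ] = f * APD signal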
'''
energies = np.sort([key for key in tim_calibration_table])
if photonEnergy not in energies:
if photonEnergy > energies.max():
photonEnergy = energies.max()
elif photonEnergy < energies.min():
photonEnergy = energies.min()
else:
idx = np.searchsorted(energies, photonEnergy) - 1
polyA = np.poly1d(tim_calibration_table[energies[idx]][mcp-1])
polyB = np.poly1d(tim_calibration_table[energies[idx+1]][mcp-1])
fA = -np.exp(polyA(voltage))
fB = -np.exp(polyB(voltage))
f = fA + (fB-fA)/(energies[idx+1]-energies[idx])*(photonEnergy - energies[idx])
return f
poly = np.poly1d(tim_calibration_table[photonEnergy][mcp-1])
f = -np.exp(poly(voltage))
return f
#############################################################################################
#############################################################################################
#############################################################################################
def extract_digitizer_peaks(proposal, runNB, mnemonic, bunchPattern=None,
integParams=None, save=False,
subdir='usr/processed_runs'):
"""
Extract the peaks from digitizer raw traces and save them into a file.
The calculation is a trapezoidal integration between 'pulseStart' and
'pulseStop' with subtraction of a baseline defined as the median between
'baseStart' and 'baseStop'.
The integration parameters can either be provided using integParams, or
determined automatically by a peak-finding algorithm. If the bunchPattern
argument is provided, the pulse ids are aligned to it.
Parameters
----------
proposal: int
the proposal number
runNB: int
the run number
mnemonic: str
the mnemonic containing raw traces. Example: 'XRD_MCP_BIGraw'
bunchPattern: str
'sase3' or 'scs_ppl'. If provided, checks the bunch pattern table using
Extra XrayPulses or OpticalPulses and aligns the pulse ids.
If None, the pulse ids are not aligned and indexed between 0 and the
number of pulses per train.
integParams: dict
dictionary with keys ['pulseStart', 'pulseStop', 'baseStart',
'baseStop', 'period', 'npulses']. If provided, automatic peak finding is skipped.
If bunchPattern is not None, 'period' and 'npulses' are adjusted to match
the bunch pattern and align the pulse ids.
save: bool
whether to save the result to a file or not.
subdir: str
subdirectory. The data is stored in
<proposal path>/<subdir>/r{runNB:04d}/f'r{runNB:04d}-digitizers-data.h5'
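Example
-------
# hedged usage sketch; the proposal, run number and mnemonic are placeholders
peaks = extract_digitizer_peaks(1234, 56, 'XRD_MCP_BIGraw',
bunchPattern='scs_ppl', save=False)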
"""
# if integParams is None, the integration parameters are determined
# automatically by find_peak_integration_parameters() below
run = open_run(proposal, runNB)
run_mnemonics = mnemonics_for_run(run)
mnemo = run_mnemonics.get(mnemonic)
if mnemo is None:
log.warning('Mnemonic not found. Skipping.')
return xr.DataArray()
source, key = mnemo['source'], mnemo['key']
extra_dim = {'sase3': 'sa3_pId', 'scs_ppl': 'ol_pId'}.get(bunchPattern)
if extra_dim is None:
extra_dim = 'pulseId'
digitizer = digitizer_type(run, source)
if digitizer == 'FastADC':
pulse_period = 24
if digitizer == 'ADQ412':
pulse_period = 440
pattern = None
try:
if bunchPattern == 'sase3':
pattern = XrayPulses(run)
if bunchPattern == 'scs_ppl':
pattern = OpticalLaserPulses(run)
except Exception as e:
log.warning(e)
'''
pattern = None
try:
if bunchPattern == 'sase3':
pattern = XrayPulses(run)
if bunchPattern == 'scs_ppl':
pattern = OpticalLaserPulses(run)
except Exception as e:
log.warning(e)
bunchPattern = None
# Extract pulse_ids, npulses and period from bunch pattern
pulse_ids, npulses, period = None, None, None
regular = True
if pattern is not None:
if pattern.is_constant_pattern() is False:
log.info('The number of pulses changed during the run.')
pulse_ids = np.unique(pattern.pulse_ids(labelled=False, copy=False))
npulses, period = None, None
regular = False
else:
pulse_ids = pattern.peek_pulse_ids(labelled=False)
npulses = len(pulse_ids)
if npulses > 1:
periods = np.diff(pulse_ids)
if len(np.unique(periods)) > 1:
regular = False
else:
period = np.unique(periods)[0] * pulse_period
# Use integration parameters, adjust them to match bunch pattern if necessary
if integParams is not None:
required_keys = ['pulseStart', 'pulseStop', 'baseStart',
'baseStop']
if bunchPattern is None:
required_keys += ['period', 'npulses']
if not all(name in integParams for name in required_keys):
raise TypeError('All keys of integParams argument '
f'{required_keys} are required.')
params = integParams.copy()
if pattern is not None:
if (npulses != params.get('npulses') or
period != params.get('period')):
log.warning(f'Integration parameters '
f'(npulses={params.get("npulses")}, '
f'period={params.get("period")}) do not match the '
f'the bunch pattern (npulses={npulses}, '
f'period={period}). Using bunch pattern parameters.')
params['npulses'] = npulses
params['period'] = period
else:
pulse_ids = np.arange(0, params['npulses'], params['period'],
dtype=np.uint64)
start = params['pulseStart']
params['pulseStart'] = [start + (pid - pulse_ids[0]) * pulse_period
for pid in pulse_ids]
'''
# load traces and calculate the average
traces = run[source, key].xarray(name=mnemonic, extra_dims=mnemo['dim'])
trace = traces.mean('trainId').rename(mnemonic.replace('raw', 'avg'))
params, pulse_ids, regular, trace = find_peak_integration_parameters(
run, mnemonic, trace, integParams, pattern)
'''
# find peak integration parameters
if integParams is None:
params = find_integ_params(trace)
if (period is not None and params['period'] != period
or npulses is not None and params['npulses'] != npulses):
log.warning(f'Bunch pattern (npulses={npulses}, period={period}) and '
f'found integration parameters (npulses={params["npulses"]}, '
f'period={params["period"]}) do not match. Using bunch '
'pattern parameters.')
params["npulses"] = npulses
params["period"] = period
if pulse_ids is None:
pulse_ids = np.arange(params['npulses'], dtype=np.uint64)
start = params['pulseStart']
params['pulseStart'] = [start + (pid - pulse_ids[0]) * pulse_period
for pid in pulse_ids]
'''
# extract peaks
data = peaks_from_raw_trace(traces, **params, extra_dim=extra_dim)
data = data.rename(mnemonic.replace('raw', 'peaks'))
if regular is True:
data = data.assign_coords({extra_dim: pulse_ids})
else:
mask = pattern.pulse_mask(labelled=False)
mask = xr.DataArray(mask, dims=['trainId', extra_dim],
coords={'trainId': run[source, key].train_ids,
extra_dim: np.arange(mask.shape[1])})
mask = mask.sel({extra_dim: pulse_ids})
data = data.where(mask, drop=True)
data.attrs['params_keys'] = list(params.keys())
if regular:
params['pulseStart'] = params['pulseStart'][0]
for p in params:
if params[p] is None:
params[p] = 'None'
data.attrs[f'params_{data.name}'] = list(params.values())
if save:
save_peaks(proposal, runNB, data, trace, subdir)
return data
def save_peaks(proposal, runNB, peaks, avg_trace, subdir):
'''
Save the peaks extracted with extract_digitizer_peaks() as well as the
average raw trace in a dataset at the proposal + subdir location.
If a dataset already exists, the new data is merged with it.
Parameters
----------
proposal: int
the proposal number
runNB: int
the run number
peaks: xarray DataArray
the 2D-array obtained by extract_digitizer_peaks()
avg_trace: xarray DataArray
the average raw trace over the trains
subdir: str
subdirectory. The data is stored in
<proposal path>/<subdir>/r{runNB:04d}/f'r{runNB:04d}-digitizers-data.h5'
Returns
-------
None
'''
root = find_proposal(f'p{proposal:06d}')
path = os.path.join(root, subdir + f'/r{runNB:04d}/')
os.makedirs(path, exist_ok=True)
fname = path + f'r{runNB:04d}-digitizers-data.h5'
ds_peaks = peaks.to_dataset(promote_attrs=True)
if os.path.isfile(fname):
ds = xr.load_dataset(fname)
ds = ds.drop_vars([peaks.name, avg_trace.name], errors='ignore')
for dim in ds.dims:
if all(dim not in ds[v].dims for v in ds):
ds = ds.drop_dims(dim)
dim_name = 'sampleId'
if 'sampleId' in ds.dims and ds.sizes['sampleId'] != len(avg_trace):
dim_name = 'sampleId2'
avg_trace = avg_trace.rename({avg_trace.dims[0]: dim_name})
if f'params_{peaks.name}' in ds.attrs:
del ds.attrs[f'params_{peaks.name}']
ds = xr.merge([ds, ds_peaks, avg_trace],
combine_attrs='drop_conflicts', join='inner')
else:
ds = ds_peaks.merge(avg_trace.rename({avg_trace.dims[0]: 'sampleId'}))
ds.to_netcdf(fname, format='NETCDF4')
print(f'saved data into {fname}.')
return None
def load_processed_peaks(proposal, runNB, mnemonic=None,
data='usr/processed_runs', merge_with=None):
"""
Load processed digitizer peaks data.
Parameters
----------
proposal: int
the proposal number
runNB: int
the run number
mnemonic: str
the mnemonic containing peaks. Example: 'XRD_MCP_BIGpeaks'.
If None, the entire dataset is loaded
data: str
subdirectory. The data is stored in
<proposal path>/<subdir>/r{runNB:04d}/f'r{runNB:04d}-digitizers-data.h5'
merge_with: xarray Dataset
A dataset to merge the data with.
Returns
-------
xarray DataArray if mnemonic is not None and merge_with is None
xarray Dataset if mnemonic is None or merge_with is not None.
Example
-------
# load the mono energy and the MCP_BIG peaks
run, ds = tb.load(proposal, runNB, 'nrj')
ds = tb.load_processed_peaks(proposal, runNB,'XRD_MCP_BIGpeaks', merge_with=ds)
"""
if mnemonic is None:
return load_all_processed_peaks(proposal, runNB, data, merge_with)
root = find_proposal(f'p{proposal:06d}')
path = os.path.join(root, data + f'/r{runNB:04d}/')
fname = path + f'r{runNB:04d}-digitizers-data.h5'
if os.path.isfile(fname):
ds = xr.load_dataset(fname)
if mnemonic not in ds:
print(f'Mnemonic {mnemonic} not found in {fname}')
return {}
da = ds[mnemonic]
da.attrs['params_keys'] = ds.attrs['params_keys']
da.attrs[f'params_{mnemonic}'] = ds.attrs[f'params_{mnemonic}']
if merge_with is not None:
return merge_with.merge(da.to_dataset(promote_attrs=True),
combine_attrs='drop_conflicts', join='inner')
else:
return da
else:
print(f'{fname} not found.')
return merge_with
def load_all_processed_peaks(proposal, runNB, data='usr/processed_runs',
merge_with=None):
"""
Load processed digitizer peaks dataset. The data contains the peaks,
the average raw trace and the integration parameters (attribute) of
each processed digitizer source.
Parameters
----------
proposal: int
the proposal number
runNB: int
the run number
data: str
subdirectory. The data is stored in
<proposal path>/<subdir>/r{runNB:04d}/f'r{runNB:04d}-digitizers-data.h5'
merge_with: xarray Dataset
A dataset to merge the data with.
Returns
-------
xarray Dataset
"""
root = find_proposal(f'p{proposal:06d}')
path = os.path.join(root, data + f'/r{runNB:04d}/')
fname = path + f'r{runNB:04d}-digitizers-data.h5'
if os.path.isfile(fname):
if merge_with is not None:
return merge_with.merge(xr.load_dataset(fname),
combine_attrs='drop_conflicts', join='inner')
return xr.load_dataset(fname)
else:
print(f'{fname} not found.')
return merge_with
def check_processed_peak_params(proposal, runNB, mnemonic, data='usr/processed_runs',
plot=True, show_all=False):
"""
Check the integration parameters used to generate the processed peak values.
Parameters
----------
proposal: int
the proposal number
runNB: int
the run number
mnemonic: str
the mnemonic containing peaks, e.g. 'XRD_MCP_BIGpeaks'. The corresponding
'raw' mnemonic is also accepted.
data: str
subdirectory. The data is stored in
<proposal path>/<subdir>/r{runNB:04d}/f'r{runNB:04d}-digitizers-data.h5'
plot: bool
If True, displays the raw trace and peak integration regions.
show_all: bool
If True, displays the entire raw trace and all peak integration
regions (this can be time-consuming).
If False, shows the first and last pulses.
Returns
-------
params: dict
the integration parameters with keys ['pulseStart', 'pulseStop',
'baseStart', 'baseStop', 'period', 'npulses'].
See `extract_digitizer_peaks()`.
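Example
-------
# hedged usage sketch; the proposal and run number are placeholders
params = check_processed_peak_params(1234, 56, 'XRD_MCP_BIGpeaks')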
"""
root = find_proposal(f'p{proposal:06d}')
path = os.path.join(root, data + f'/r{runNB:04d}/')
fname = path + f'r{runNB:04d}-digitizers-data.h5'
if os.path.isfile(fname):
ds = xr.load_dataset(fname)
peaks_mnemo = mnemonic.replace('raw', 'peaks')
if peaks_mnemo not in ds:
print(f'Mnemonic {mnemonic} not found in {fname}')
return {}
params = dict(zip(ds.attrs['params_keys'], ds.attrs[f'params_{peaks_mnemo}']))
if plot:
plotPeakIntegrationWindow(ds[peaks_mnemo.replace('peaks', 'avg')],
params, show_all=show_all)
return params
else:
print(f'{fname} not found.')
return {}
from joblib import Parallel, delayed, parallel_backend
from time import strftime
import tempfile
import shutil
from tqdm.auto import tqdm
import os
import warnings
import psutil
import extra_data as ed
from extra_data.read_machinery import find_proposal
from extra_geom import DSSC_1MGeometry
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
import numpy as np
import xarray as xr
import h5py
from imageio import imread
from ..constants import mnemonics as _mnemonics
from .azimuthal_integrator import AzimuthalIntegrator
class DSSC:
def __init__(self, proposal, distance=1):
""" Create a DSSC object to process DSSC data.
inputs:
proposal: (int, str) proposal number or string
distance: (float) distance from sample to DSSC detector in meters
"""
if isinstance(proposal,int):
proposal = 'p{:06d}'.format(proposal)
runFolder = find_proposal(proposal)
self.semester = runFolder.split('/')[-2]
self.proposal = proposal
self.topic = runFolder.split('/')[-3]
self.tempdir = None
self.save_folder = os.path.join(runFolder, 'usr/condensed_runs/')
self.distance = distance
self.px_pitch_h = 236 # horizontal pitch in microns
self.px_pitch_v = 204 # vertical pitch in microns
self.aspect = self.px_pitch_v/self.px_pitch_h # aspect ratio of the DSSC images
self.geom = None
self.mask = None
self.max_fraction_memory = 0.4
self.filter_mask = None
self.Nworker = 16
print('DSSC configuration')
print(f'Topic: {self.topic}')
print(f'Semester: {self.semester}')
print(f'Proposal: {self.proposal}')
print(f'Default save folder: {self.save_folder}')
print(f'Sample to DSSC distance: {self.distance} m')
if not os.path.exists(self.save_folder):
warnings.warn(f'Default save folder does not exist: {self.save_folder}')
def __del__(self):
# delete the temporary folder
if self.tempdir:
shutil.rmtree(self.tempdir)
def open_run(self, run_nr, isDark=False):
""" Open a run with extra-data and prepare the virtual dataset for multiprocessing
inputs:
run_nr: the run number
isDark: a boolean to specify if the run is a dark run or not
"""
print('Opening run data with extra-data')
self.run_nr = run_nr
self.xgm = None
self.filter_mask = None
self.run = ed.open_run(self.proposal, self.run_nr)
self.isDark = isDark
self.plot_title = f'{self.proposal} run: {self.run_nr}'
self.fpt = self.run.detector_info('SCS_DET_DSSC1M-1/DET/0CH0:xtdf')['frames_per_train']
self.nbunches = self.run.get_array('SCS_RR_UTC/MDL/BUNCH_DECODER', 'sase3.nPulses.value')
self.nbunches = np.unique(self.nbunches)
if len(self.nbunches) == 1:
self.nbunches = self.nbunches[0]
else:
warnings.warn('not all trains have same length DSSC data')
print(f'nbunches: {self.nbunches}')
self.nbunches = self.nbunches[-1]
print(f'DSSC frames per train: {self.fpt}')
print(f'SA3 bunches per train: {self.nbunches}')
if self.tempdir is not None:
shutil.rmtree(self.tempdir)
self.tempdir = tempfile.mkdtemp()
print(f'Temporary directory: {self.tempdir}')
print('Creating virtual dataset')
self.vds_filenames = self.create_virtual_dssc_datasets(self.run, path=self.tempdir)
# create a dummy scan variable for dark runs
# for other types of runs, use the DSSC.define_scan function to overwrite it
self.scan = xr.DataArray(np.ones_like(self.run.train_ids), dims=['trainId'],
coords={'trainId': self.run.train_ids}).to_dataset(
name='scan_variable')
self.scan_vname = 'dummy'
def define_scan(self, vname, bins):
"""
Prepare the binning of the DSSC data.
inputs:
vname: variable name for the scan, can be a mnemonic string from ToolBox
or a dictionary with ['source', 'key'] fields
bins: step size (bin edges are not yet implemented)
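Example (hedged sketch; the mnemonic and bin size are only illustrative):
dssc.load_xgm()  # XGM data is used to compute the per-bin pumped/unpumped energies
dssc.define_scan('PP800_DelayLine', bins=0.05)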
"""
if type(vname) is dict:
scan = self.run.get_array(vname['source'], vname['key'])
elif type(vname) is str:
if vname not in _mnemonics:
raise ValueError(f'{vname} not found in the ToolBox mnemonics table')
scan = self.run.get_array(_mnemonics[vname]['source'], _mnemonics[vname]['key'])
else:
raise ValueError(f'vname should be a string or a dict. We got {type(vname)}')
if (type(bins) is int) or (type(bins) is float):
scan = bins * np.round(scan / bins)
else:
# TODO: digitize the data
raise ValueError('Binning with bin edges is not yet implemented')
self.scan_vname = vname
self.scan = scan.to_dataset(name='scan_variable')
self.scan['xgm_pumped'] = self.xgm[:, :self.nbunches:2].mean('dim_0')
self.scan['xgm_unpumped'] = self.xgm[:, 1:self.nbunches:2].mean('dim_0')
self.scan_counts = xr.DataArray(np.ones(len(self.scan['scan_variable'])),
dims=['scan_variable'],
coords={'scan_variable': self.scan['scan_variable'].values},
name='counts')
self.scan_points = self.scan.groupby('scan_variable').mean('trainId').coords['scan_variable'].values
self.scan_points_counts = self.scan_counts.groupby('scan_variable').sum()
self.plot_scan()
def plot_scan(self):
""" Plot a previously defined scan to see the scan range and the statistics.
"""
if self.scan:
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=[5, 5])
else:
fig, ax1 = plt.subplots(nrows=1, figsize=[5, 2.5])
ax1.plot(self.scan_points, self.scan_points_counts, 'o-', ms=2)
ax1.set_xlabel(f'{self.scan_vname}')
ax1.set_ylabel('# trains')
ax1.set_title(self.plot_title)
if self.scan:
ax2.plot(self.scan['scan_variable'])
ax2.set_xlabel('train #')
ax2.set_ylabel(f'{self.scan_vname}')
def load_xgm(self):
""" Loads pulse resolved dedicated SAS3 data from the SCS XGM.
"""
if self.xgm is None:
self.xgm = self.run.get_array(_mnemonics['SCS_SA3']['source'],
_mnemonics['SCS_SA3']['key'],
roi=ed.by_index[:self.nbunches])
def plot_xgm_hist(self, nbins=100):
""" Plots an histogram of the SCS XGM dedicated SAS3 data.
inputs:
nbins: number of the bins for the histogram.
"""
if self.xgm is None:
self.load_xgm()
hist, bins_edges = np.histogram(self.xgm, nbins, density=True)
width = 1.0 * (bins_edges[1] - bins_edges[0])
bins_center = 0.5*(bins_edges[:-1] + bins_edges[1:])
plt.figure(figsize=(5,3))
plt.bar(bins_center, hist, align='center', width=width)
plt.xlabel(f"{_mnemonics['SCS_SA3']['source']}{_mnemonics['SCS_SA3']['key']}")
plt.ylabel('density')
plt.title(self.plot_title)
def xgm_filter(self, xgm_low=-np.inf, xgm_high=np.inf):
""" Filters the data by train. If one pulse within a train has an SASE3 SCS XGM value below
xgm_low or above xgm_high, that train will be dropped from the dataset.
inputs:
xgm_low: low threshold value
xgm_high: high threshold value
"""
if self.xgm is None:
self.load_xgm()
if self.isDark:
warnings.warn(f'This run was loaded as dark. Filtering on xgm makes no sense. Aborting')
return
self.xgm_low = xgm_low
self.xgm_high = xgm_high
filter_mask = (self.xgm > self.xgm_low) * (self.xgm < self.xgm_high)
if self.filter_mask is not None:
self.filter_mask = self.filter_mask*filter_mask
else:
self.filter_mask = filter_mask
valid = filter_mask.prod('dim_0').astype(bool)
xgm_valid = self.xgm.where(valid)
xgm_valid = xgm_valid.dropna('trainId')
self.scan = self.scan.sel({'trainId': xgm_valid.trainId})
nrejected = len(self.run.train_ids) - len(self.scan)
print((f'Rejecting {nrejected} out of {len(self.run.train_ids)} trains due to xgm '
f'thresholds: [{self.xgm_low}, {self.xgm_high}]'))
def load_geom(self, geopath=None, quad_pos=None):
""" Loads and return the DSSC geometry.
inputs:
geopath: path to the h5 geometry file. If None uses a default file.
quad_pos: list of quadrants tuple position. If None uses a default position.
output:
return the loaded geometry
"""
if quad_pos is None:
quad_pos = [(-124.100, 3.112), # TR
(-133.068, -110.604), # BR
( 0.988, -125.236), # BL
( 4.528, -4.912) # TL
]
if geopath is None:
geopath = '/gpfs/exfel/sw/software/git/EXtra-geom/docs/dssc_geo_june19.h5'
self.geom = DSSC_1MGeometry.from_h5_file_and_quad_positions(geopath, quad_pos)
return self.geom
def load_mask(self, fname, plot=True):
""" Load a DSSC mask file.
input:
fname: string of the filename of the mask file
plot: if True, the loaded mask is plotted
"""
dssc_mask = imread(fname)
dssc_mask = dssc_mask.astype(float)[..., 0] // 255
dssc_mask[dssc_mask==0] = np.nan
self.mask = dssc_mask
if plot:
plt.figure()
plt.imshow(self.mask)
def create_virtual_dssc_datasets(self, run, path=''):
""" Create virtual datasets for each 16 DSSC modules used for the multiprocessing.
input:
run: extra-data run
path: string where the virtual files are created
output:
dictionary of key: module number, value: virtual dataset filename
"""
vds_filenames = {}
for module in tqdm(range(16)):
fname = os.path.join(path, f'dssc{module}_vds.h5')
if os.path.isfile(fname):
os.remove(fname)
vds = run.get_virtual_dataset(f'SCS_DET_DSSC1M-1/DET/{module}CH0:xtdf',
'image.data', filename=fname)
vds.file.close() # keep h5 file closed outside 'with' context
vds_filenames[module] = fname
return vds_filenames
def binning(self, do_pulse_mean=True):
""" Bin the DSSC data by the predifined scan type (DSSC.define()) using multiprocessing
"""
# get available memory in GB; we will use at most max_fraction_memory of it
max_GB = psutil.virtual_memory().available/1024**3
print(f'max available memory: {max_GB} GB')
# max_GB / (8byte * Nworker * 128px * 512px * N_pulses)
self.chunksize = int(self.max_fraction_memory*max_GB * 1024**3 // (8 * self.Nworker * 128 * 512 * self.fpt))
print('processing', self.chunksize, 'trains per chunk')
jobs = []
for m in range(16):
jobs.append(dict(
module=m,
fpt=self.fpt,
vds=self.vds_filenames[m],
chunksize=self.chunksize,
scan=self.scan['scan_variable'],
nbunches=self.nbunches,
run_nr=self.run_nr,
do_pulse_mean=do_pulse_mean
))
if self.Nworker != 16:
with warnings.catch_warnings():
warnings.simplefilter("default")
warnings.warn(('Nworker other than 16 known to cause issue' +
'(https://in.xfel.eu/gitlab/SCS/ToolBox/merge_requests/76)'),
RuntimeWarning)
timestamp = strftime('%X')
print(f'start time: {timestamp}')
with parallel_backend('loky', n_jobs=self.Nworker):
module_data = Parallel(verbose=20)(
delayed(process_one_module)(job) for job in tqdm(jobs)
)
print('finished:', strftime('%X'))
# rearrange the multiprocessed data
self.module_data = xr.concat(module_data, dim='module')
self.module_data['run'] = self.run_nr
self.module_data = self.module_data.transpose('scan_variable', 'module', 'x', 'y')
if do_pulse_mean:
self.module_data = xr.merge([self.module_data, self.scan.groupby('scan_variable').mean('trainId')])
elif self.xgm is not None:
xgm_pumped = self.xgm[:, :self.nbunches:2].mean('trainId').to_dataset(name='xgm_pumped').rename({'dim_0':'scan_variable'})
xgm_unpumped = self.xgm[:, 1:self.nbunches:2].mean('trainId').to_dataset(name='xgm_unpumped').rename({'dim_0':'scan_variable'})
self.module_data = xr.merge([self.module_data, xgm_pumped, xgm_unpumped])
self.module_data = self.module_data.squeeze()
if do_pulse_mean:
self.module_data.attrs['scan_variable'] = self.scan_vname
else:
self.module_data.attrs['scan_variable'] = 'pulse id'
def save(self, save_folder=None, overwrite=False):
""" Save the crunched data.
inputs:
save_folder: string of the folder where to save the data.
overwrite: boolean whether or not to overwrite existing files.
"""
if save_folder is None:
save_folder = self.save_folder
if self.isDark:
fname = f'run{self.run_nr}_dark.nc' # no scan
else:
fname = f'run{self.run_nr}.nc' # run with delay scan (change for other scan types!)
save_path = os.path.join(save_folder, fname)
file_exists = os.path.isfile(save_path)
if not file_exists or (file_exists and overwrite):
if file_exists:
warnings.warn(f'Overwriting file: {save_path}')
os.remove(save_path)
self.module_data.to_netcdf(save_path, group='data')
self.module_data.close()
os.chmod(save_path, 0o664)
print('saving: ', save_path)
else:
print('file', save_path, 'exists and overwrite is False')
def load_binned(self, runNB, dark_runNB, xgm_norm = True, save_folder=None):
""" load previously binned (crunched) DSSC data by DSSC.crunch() and DSSC.save()
inputs:
runNB: run number to load
dark_runNB: run number of the corresponding dark
xgm_norm: normalize by XGM data if True
save_folder: path string where the crunched data are saved
"""
if save_folder is None:
save_folder = self.save_folder
self.plot_title = f'{self.proposal} run: {runNB} dark: {dark_runNB}'
dark = xr.load_dataset(os.path.join(save_folder, f'run{dark_runNB}_dark.nc'), group='data',
engine='netcdf4')
binned = xr.load_dataset(os.path.join(save_folder, f'run{runNB}.nc'), group='data',
engine='netcdf4')
binned['pumped'] = (binned['pumped'] - dark['pumped'].values)
binned['unpumped'] = (binned['unpumped'] - dark['unpumped'].values)
if xgm_norm:
binned['pumped'] = binned['pumped'] / binned['xgm_pumped']
binned['unpumped'] = binned['unpumped'] / binned['xgm_unpumped']
self.scan_points = binned['scan_variable']
self.scan_points_counts = binned['sum_count'][:, 0]
self.scan_vname = binned.attrs['scan_variable']
self.scan = None
self.binned = binned
def plot_DSSC(self, use_mask = True, p_low = 1, p_high = 98, vmin = None, vmax = None):
""" Plot pumped and unpumped DSSC images.
inputs:
use_mask: if True, a mask is applied on the DSSC.
p_low: low percentile value to adjust the contrast scale on the unpumped and pumped image
p_high: high percentile value to adjust the contrast scale on the unpumped and pumped image
vmin: low value of the image scale
vmax: high value of the image scale
"""
if use_mask:
if self.mask is None:
raise ValueError('No mask was loaded !')
mask = self.mask
mask_txt = ' masked'
else:
mask = 1
mask_txt = ''
if self.geom is None:
self.load_geom()
im_pump_mean, _ = self.geom.position_modules_fast(self.binned['pumped'].mean('scan_variable'))
im_unpump_mean, _ = self.geom.position_modules_fast(self.binned['unpumped'].mean('scan_variable'))
self.im_pump_mean = mask*im_pump_mean
self.im_unpump_mean = mask*im_unpump_mean
fig = plt.figure(figsize=(9, 4))
grid = ImageGrid(fig, 111,
nrows_ncols=(1,2),
axes_pad=0.15,
share_all=True,
cbar_location="right",
cbar_mode="single",
cbar_size="7%",
cbar_pad=0.15,
)
_vmin, _vmax = np.percentile(self.im_pump_mean[~np.isnan(self.im_pump_mean)], [p_low, p_high])
if vmin is None:
vmin = _vmin
if vmax is None:
vmax = _vmax
im = grid[0].imshow(self.im_pump_mean, vmin=vmin, vmax=vmax, aspect=self.aspect)
grid[0].set_title('pumped' + mask_txt)
im = grid[1].imshow(self.im_unpump_mean, vmin=vmin, vmax=vmax, aspect=self.aspect)
grid[1].set_title('unpumped' + mask_txt)
grid[-1].cax.colorbar(im)
grid[-1].cax.toggle_label(True)
fig.suptitle(self.plot_title)
def azimuthal_int(self, wl, center=None, angle_range=[0, 180-1e-6], dr=1, use_mask=True):
""" Perform azimuthal integration of 1D binned DSSC run.
inputs:
wl: photon wavelength (in meters, so that delta_q is returned in 1/nm)
center: center of integration (defaults to the center returned by the geometry)
angle_range: polar angle range of integration, in degrees
dr: radial width of the integration rings, in pixels
use_mask: if True, use the loaded mask
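Example (hedged sketch; the wavelength is only illustrative, ~1 keV photons):
dssc.azimuthal_int(wl=1.24e-9, use_mask=True)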
"""
if self.geom is None:
self.load_geom()
if use_mask:
if self.mask is None:
raise ValueError('No mask was loaded !')
mask = self.mask
mask_txt = ' masked'
else:
mask = 1
mask_txt = ''
im_pumped_arranged, c_geom = self.geom.position_modules_fast(self.binned['pumped'].values)
im_unpumped_arranged, c_geom = self.geom.position_modules_fast(self.binned['unpumped'].values)
im_pumped_arranged *= mask
im_unpumped_arranged *= mask
im_pumped_mean = im_pumped_arranged.mean(axis=0)
im_unpumped_mean = im_unpumped_arranged.mean(axis=0)
if center is None:
center = c_geom
ai = AzimuthalIntegrator(im_pumped_mean.shape, center, angle_range, dr=dr)
norm = ai(~np.isnan(im_pumped_mean))
az_pump = []
az_unpump = []
for i in tqdm(range(len(self.binned['scan_variable']))):
az_pump.append(ai(im_pumped_arranged[i]) / norm)
az_unpump.append(ai(im_unpumped_arranged[i]) / norm)
az_pump = np.stack(az_pump)
az_unpump = np.stack(az_unpump)
coords = {'scan_variable': self.binned['scan_variable'], 'distance': ai.distance}
azimuthal = xr.DataArray(az_pump, dims=['scan_variable', 'distance'], coords=coords)
azimuthal = azimuthal.to_dataset(name='pumped')
azimuthal['unpumped'] = xr.DataArray(az_unpump, dims=['scan_variable', 'distance'], coords=coords)
azimuthal = azimuthal.transpose('distance', 'scan_variable')
#t0 = 225.5
#azimuthal['delay'] = (t0 - azimuthal.delay)*6.6
#azimuthal['delay'] = azimuthal.delay
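# momentum transfer q = 2*pi*sin(2theta)/wl in 1/nm, with the scattering angle
# 2theta = arctan(r/distance) and r the radial distance on the detector;
# the 1e-9 factor converts 1/m to 1/nm (wl is assumed to be given in meters)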
azimuthal['delta_q (1/nm)'] = 2e-9 * np.pi * np.sin(
np.arctan(azimuthal.distance * self.px_pitch_v*1e-6 / self.distance)) / wl
azimuthal.attrs = self.binned.attrs
self.azimuthal = azimuthal.swap_dims({'distance': 'delta_q (1/nm)'})
def plot_azimuthal_int(self, kind='difference', lim=None):
""" Plot a computed azimuthal integration.
inputs:
kind: (str) either 'difference' or 'relative' to change the type of plot.
lim: optional [vmin, vmax] color scale limits for the difference/relative panel.
"""
fig, [ax1, ax2, ax3] = plt.subplots(nrows=3, sharex=True, sharey=True)
xr.plot.imshow(self.azimuthal.pumped, ax=ax1, vmin=0, robust=True)
ax1.set_title('pumped')
xr.plot.imshow(self.azimuthal.unpumped, ax=ax2, vmin=0, robust=True)
ax2.set_title('unpumped')
if kind == 'difference':
val = self.azimuthal.pumped - self.azimuthal.unpumped
ax3.set_title('pumped - unpumped')
elif kind == 'relative':
val = (self.azimuthal.pumped - self.azimuthal.unpumped)/self.azimuthal.unpumped
ax3.set_title('(pumped - unpumped)/unpumped')
else:
raise ValueError('kind should be either difference or relative')
if lim is None:
xr.plot.imshow(val, ax=ax3, robust=True)
else:
xr.plot.imshow(val, ax=ax3, vmin=lim[0], vmax=lim[1])
ax3.set_xlabel(self.scan_vname)
fig.suptitle(f'{self.plot_title}')
def plot_azimuthal_line_cut(self, data, qranges, qwidths):
""" Plot line scans on top of the data.
inputs:
data: an azimuthal integrated xarray DataArray with 'delta_q (1/nm)' as one of its dimension.
qranges: a list of q-range centers
qwidths: a list of q-widths, same length as qranges
"""
fig, [ax1, ax2] = plt.subplots(nrows=2, sharex=True, figsize=[8, 7])
xr.plot.imshow(data, ax=ax1, robust=True)
# attributes are not propagated during xarray mathematical operations (https://github.com/pydata/xarray/issues/988)
# so the scan variable name might no longer be present in data
ax1.set_xlabel(self.scan_vname)
fig.suptitle(f'{self.plot_title}')
for i, (qr, qw) in enumerate(zip(qranges, qwidths)):
sel = (data['delta_q (1/nm)'] > (qr - qw/2)) * (data['delta_q (1/nm)'] < (qr + qw/2))
val = data.where(sel).mean('delta_q (1/nm)')
ax2.plot(data.scan_variable, val, c=f'C{i}', label=f'q = {qr:.2f}')
ax1.axhline(qr - qw/2, c=f'C{i}', lw=1)
ax1.axhline(qr + qw/2, c=f'C{i}', lw=1)
ax2.legend()
ax2.set_xlabel(self.scan_vname)
# since 'self' is not picklable, this function has to be outside the DSSC class so that it can be used
# by the multiprocessing pool.map function
def process_one_module(job):
module = job['module']
fpt = job['fpt']
vds = job['vds']
scan = job['scan']
chunksize = job['chunksize']
nbunches = job['nbunches']
do_pulse_mean = job['do_pulse_mean']
image_path = f'INSTRUMENT/SCS_DET_DSSC1M-1/DET/{module}CH0:xtdf/image/data'
npulse_path = f'INDEX/SCS_DET_DSSC1M-1/DET/{module}CH0:xtdf/image/count'
with h5py.File(vds, 'r') as m:
all_trainIds = m['INDEX/trainId'][()]
frames_per_train = m[npulse_path][()]
trains_with_data = all_trainIds[frames_per_train == fpt]
len_scan = len(scan.groupby(scan))
if do_pulse_mean:
# create empty dataset to add actual data to
module_data = xr.DataArray(np.empty([len_scan, 128, 512], dtype=np.float64),
dims=['scan_variable', 'x', 'y'],
coords={'scan_variable': np.unique(scan)})
module_data = module_data.to_dataset(name='pumped')
module_data['unpumped'] = xr.full_like(module_data['pumped'], 0)
module_data['sum_count'] = xr.DataArray(np.zeros_like(np.unique(scan)), dims=['scan_variable'])
module_data['module'] = module
else:
scan = xr.full_like(scan, 1)
len_scan = len(scan.groupby(scan))
module_data = xr.DataArray(np.empty([len_scan, int(nbunches/2), 128, 512], dtype=np.float64),
dims=['scan_variable', 'pulse', 'x', 'y'],
coords={'scan_variable': np.unique(scan)})
module_data = module_data.to_dataset(name='pumped')
module_data['unpumped'] = xr.full_like(module_data['pumped'], 0)
module_data['sum_count'] = xr.full_like(module_data['pumped'][..., 0, 0], 0)
module_data['module'] = module
# crunching
with h5py.File(vds, 'r') as m:
chunk_start = np.arange(len(all_trainIds), step=chunksize, dtype=int)
trains_start = 0
# This line is the strange hack from https://github.com/tqdm/tqdm/issues/485
print(' ', end='', flush=True)
for c0 in tqdm(chunk_start, desc=f'pool.map#{module:02d}', position=module):
chunk_dssc = np.s_[int(c0 * fpt):int((c0 + chunksize) * fpt)] # for dssc data
data = m[image_path][chunk_dssc].squeeze()
data = data.astype(np.float64)
n_trains = int(data.shape[0] // fpt)
trainIds_chunk = np.unique(trains_with_data[trains_start:trains_start + n_trains])
trains_start += n_trains
n_trains_actual = len(trainIds_chunk)
coords = {'trainId': trainIds_chunk}
data = np.reshape(data, [n_trains_actual, fpt, 128, 512])[:, :int(2 * nbunches)]
data = xr.DataArray(data, dims=['trainId', 'pulse', 'x', 'y'], coords=coords)
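# split frames into pumped and unpumped images: with the assumed standard SCS
# intra-dark pattern, pumped frames are every 4th DSSC frame starting at 0 and
# unpumped frames every 4th frame starting at 2 (frames 1, 3, ... are intra-darks)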
if do_pulse_mean:
data_pumped = (data[:, ::4]).mean('pulse')
data_unpumped = (data[:, 2::4]).mean('pulse')
else:
data_pumped = (data[:, ::4])
data_unpumped = (data[:, 2::4])
data = data_pumped.to_dataset(name='pumped')
data['unpumped'] = data_unpumped
data['sum_count'] = xr.full_like(data['unpumped'][..., 0, 0], fill_value=1)
# grouping and summing
data['scan_variable'] = scan # this only adds scan data for matching trainIds
data = data.dropna('trainId')
data = data.groupby('scan_variable').sum('trainId')
where = {'scan_variable': data.scan_variable}
for var in ['pumped', 'unpumped', 'sum_count']:
module_data[var].loc[where] = module_data[var].loc[where] + data[var]
for var in ['pumped', 'unpumped']:
module_data[var] = module_data[var] / module_data.sum_count
#module_data = module_data.drop('sum_count')
if not do_pulse_mean:
module_data = module_data.sum('scan_variable')
module_data = module_data.rename({'pulse':'scan_variable'})
return module_data
import multiprocessing
from time import strftime
from tqdm.auto import tqdm
import os
import warnings
import psutil
import extra_data as ed
from extra_data.read_machinery import find_proposal
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
import matplotlib.patches as patches
import numpy as np
import xarray as xr
import h5py
from glob import glob
from imageio import imread
from ..constants import mnemonics as _mnemonics
from ..misc.laser_utils import positionToDelay
class DSSC1module:
def __init__(self, module, proposal):
""" Create a DSSC object to process 1 module of DSSC data.
inputs:
module: module number to process
proposal: (int, str) proposal number or string
"""
self.module = module
if isinstance(proposal,int):
proposal = 'p{:06d}'.format(proposal)
self.runFolder = find_proposal(proposal)
self.semester = self.runFolder.split('/')[-2]
self.proposal = proposal
self.topic = self.runFolder.split('/')[-3]
self.save_folder = os.path.join(self.runFolder, 'usr/condensed_runs/')
self.px_pitch_h = 236 # horizontal pitch in microns
self.px_pitch_v = 204 # vertical pitch in microns
self.aspect = self.px_pitch_v/self.px_pitch_h # aspect ratio of the DSSC images
print('DSSC configuration')
print(f'DSSC module: {self.module}')
print(f'Topic: {self.topic}')
print(f'Semester: {self.semester}')
print(f'Proposal: {self.proposal}')
print(f'Default save folder: {self.save_folder}')
if not os.path.exists(self.save_folder):
warnings.warn(f'Default save folder does not exist: {self.save_folder}')
self.dark_data = 0
self.max_fraction_memory = 0.8
self.Nworker = 10
self.rois = None
self.maxSaturatedPixel = 1
def open_run(self, run_nr, t0=0.0):
""" Open a run with extra-data and prepare the virtual dataset for multiprocessing
inputs:
run_nr: the run number
t0: optional t0 in mm
"""
print('Opening run data with extra-data')
self.run_nr = run_nr
self.xgm = None
self.run = ed.open_run(self.proposal, self.run_nr)
self.plot_title = f'{self.proposal} run: {self.run_nr}'
self.fpt = self.run.detector_info(f'SCS_DET_DSSC1M-1/DET/{self.module}CH0:xtdf')['frames_per_train']
self.nbunches = self.run.get_array('SCS_RR_UTC/MDL/BUNCH_DECODER', 'sase3.nPulses.value')
self.nbunches = np.unique(self.nbunches)
if len(self.nbunches) == 1:
self.nbunches = self.nbunches[0]
else:
warnings.warn('not all trains have same length DSSC data')
print(f'nbunches: {self.nbunches}')
self.nbunches = self.nbunches[-1]
print(f'DSSC frames per train: {self.fpt}')
print(f'SA3 bunches per train: {self.nbunches}')
print('Collecting DSSC module files')
self.collect_dssc_module_file()
print(f'Loading XGM data')
self.xgm = self.run.get_array(_mnemonics['SCS_SA3']['source'],
_mnemonics['SCS_SA3']['key'],
roi=ed.by_index[:self.nbunches])
self.xgm = self.xgm.rename({'dim_0':'pulseId'})
self.xgm['pulseId'] = np.arange(0, 2*self.nbunches, 2)
print(f'Loading mono nrj data')
self.nrj = self.run.get_array(_mnemonics['nrj']['source'],
_mnemonics['nrj']['key'])
print(f'Loading delay line data')
try:
self.delay_mm = self.run.get_array(_mnemonics['PP800_DelayLine']['source'],
_mnemonics['PP800_DelayLine']['key'])
except Exception:
self.delay_mm = 0*self.nrj
self.t0 = t0
self.delay_ps = positionToDelay(self.delay_mm, origin=self.t0, invert=True)
def collect_dssc_module_file(self):
""" Collect the raw DSSC module h5 files.
"""
pattern = self.runFolder + f'/raw/r{self.run_nr:04d}/RAW-R{self.run_nr:04d}-DSSC{self.module:02d}-S*.h5'
self.h5list = glob(pattern)
def process(self, dark_pass=None):
""" Process DSSC data from one module using multiprocessing
dark_pass: if None, process the data; if 'mean', compute the dark mean; if 'std', compute the dark std
"""
# get available memory in GB, we will try to use 80 % of it
max_GB = psutil.virtual_memory().available/1024**3
print(f'max available memory: {max_GB} GB')
# max_GB / (8byte * Nworker * 128px * 512px * N_pulses)
self.chunksize = int(self.max_fraction_memory*max_GB * 1024**3 // (8 * self.Nworker * 128 * 512 * self.fpt))
print('processing', self.chunksize, 'trains per chunk')
if dark_pass == 'mean':
rois = None
dark = 0
mask = 1
elif dark_pass == 'std':
dark = self.dark_data['dark_mean']
rois = None
mask = 1
elif dark_pass is None:
dark = self.dark_data['dark_mean']
rois = self.rois
mask = self.dark_data['mask']
else:
raise ValueError(f"dark_pass should be either None or 'mean' or 'std' but not {dark_pass}")
jobs = []
for m,h5fname in enumerate(self.h5list):
jobs.append(dict(
fpt=self.fpt,
module=self.module,
h5fname=h5fname,
chunksize=self.chunksize,
nbunches=self.nbunches,
workerId=m,
Nworker=self.Nworker,
dark_data=dark,
rois=rois,
mask=mask,
maxSaturatedPixel=self.maxSaturatedPixel
))
timestamp = strftime('%X')
print(f'start time: {timestamp}')
with multiprocessing.Pool(self.Nworker) as pool:
res = pool.map(process_one_module, jobs)
print('finished:', strftime('%X'))
# rearrange the multiprocessed data
# the sum over 'worker' only merges the partial, non-overlapping results from each worker
self.module_data = xr.concat(res, dim='worker').sum(dim='worker')
# reorder the dimension
if 'trainId' in self.module_data.dims:
self.module_data = self.module_data.transpose('trainId', 'pulseId', 'x', 'y')
else:
self.module_data = self.module_data.transpose('pulseId', 'x', 'y')
# fix some computation now that we have everything
self.module_data['std_data'] = np.sqrt(self.module_data['std_data']/(self.module_data['counts'] - 1))
self.module_data['dark_corrected_data'] = self.module_data['dark_corrected_data']/self.module_data['counts']
self.module_data['run'] = self.run_nr
if dark_pass == 'mean':
self.dark_data = self.module_data['dark_corrected_data'].to_dataset(name='dark_mean')
self.dark_data['run'] = self.run_nr
elif dark_pass == 'std':
self.dark_data['dark_std'] = self.module_data['std_data']
assert self.dark_data['run'] == self.run_nr, "noise map computed from different darks"
else:
self.module_data['xgm'] = self.xgm
self.module_data['nrj'] = self.nrj
self.module_data['delay_mm'] = self.delay_mm
self.module_data['delay_ps'] = self.delay_ps
self.module_data['t0'] = self.t0
self.plot_title = f"{self.proposal} run: {self.module_data['run'].values} dark: {self.dark_data['run'].values}"
self.module_data.attrs['plot_title'] = self.plot_title
def compute_mask(self, low=0.01, high=0.8):
""" Compute a DSSC module mask from the noise map of a dark run.
"""
if self.dark_data['dark_std'] is None:
raise ValueError('Cannot compute a mask from a missing dark noise map')
self.dark_data['mask_low'] = low
self.dark_data['mask_high'] = high
m_std = self.dark_data['dark_std'].mean('pulseId')
self.dark_data['mask'] = 1 - ((m_std > self.dark_data['mask_high']) + (m_std < self.dark_data['mask_low']))
def plot_module(self, plot_dark=False, low=1, high=98, vmin=None, vmax=None):
""" Plot a module.
inputs:
plot_dark: if true, plot dark instead of run data.
low: low percentile fraction of the display scale
high: high percentile fraction of the display scale
vmin: low value of the display scale, overwrites vmin computed from low
vmax: max value of the display scale, overwrites vmax computed from high
"""
if plot_dark:
mean = self.dark_data['dark_mean'].mean('pulseId')
std = self.dark_data['dark_std']
title = f"{self.proposal} dark: {self.dark_data['run'].values}"
else:
mean = self.module_data['dark_corrected_data'].mean('pulseId')
std = self.module_data['std_data']
title = self.plot_title
fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, figsize=[5, 4*2.5])
_vmin, _vmax = np.percentile((mean.values[~self.dark_data['mask']]).flatten(), [low, high])
if vmin is None:
vmin = _vmin
if vmax is None:
vmax = _vmax
im = ax1.imshow(mean, vmin=vmin, vmax=vmax)
fig.colorbar(im, ax=ax1)
ax1.set_title('mean')
fig.suptitle(title)
im = ax2.imshow(std.mean('pulseId'), vmin=0, vmax=2)
fig.colorbar(im, ax=ax2)
ax2.set_title('std')
ax3.hist(std.values.flatten(), bins=200, range=[0, 2], density=True)
ax3.axvline(self.dark_data['mask_low'], ls='--', c='k')
ax3.axvline(self.dark_data['mask_high'], ls='--', c='k')
ax3.set_yscale('log')
ax3.set_ylabel('density')
ax3.set_xlabel('std values')
im = ax4.imshow(self.dark_data['mask'])
fig.colorbar(im, ax=ax4)
def save(self, save_folder=None, overwrite=False, isDark=False):
""" Save the crunched data.
inputs:
save_folder: string of the folder where to save the data.
overwrite: boolean whether or not to overwrite existing files.
isDark: save the dark or the process data
"""
if save_folder is None:
save_folder = self.save_folder
if isDark:
fname = f'run{self.run_nr}_dark.h5' # no scan
data = self.dark_data
else:
fname = f'run{self.run_nr}.h5' # run with delay scan (change for other scan types!)
data = self.module_data
save_path = os.path.join(save_folder, fname)
file_exists = os.path.isfile(save_path)
if not file_exists or (file_exists and overwrite):
if file_exists:
warnings.warn(f'Overwriting file: {save_path}')
os.remove(save_path)
data.to_netcdf(save_path, group='data')
data.close()
os.chmod(save_path, 0o664)
print('saving: ', save_path)
else:
print('file', save_path, 'exists and overwrite is False')
def load_dark(self, dark_runNB, save_folder=None):
""" Load dark data.
inputs:
save_folder: string of the folder where the data were saved.
"""
if save_folder is None:
save_folder = self.save_folder
self.run_nr = dark_runNB
self.dark_data = xr.load_dataset(os.path.join(save_folder, f'run{dark_runNB}_dark.h5'), group='data')
self.plot_title = f"{self.proposal} dark: {self.dark_data['run'].values}"
def show_rois(self):
fig, ax1 = plt.subplots(nrows=1, figsize=[5, 2.5])
try:
ax1.imshow(self.module_data['dark_corrected_data'].mean('pulseId') * self.dark_data['mask'])
except Exception:
ax1.imshow(self.dark_data['dark_mean'].mean('pulseId') * self.dark_data['mask'])
for r,v in self.rois.items():
rect = patches.Rectangle((v['y'][0], v['x'][0]),
v['y'][1] - v['y'][0],
v['x'][1] - v['x'][0],
linewidth=1, edgecolor='r', facecolor='none')
ax1.add_patch(rect)
fig.suptitle(self.plot_title)
# since 'self' is not picklable, this function has to be outside the DSSC class so that it can be used
# by the multiprocessing pool.map function
def process_one_module(job):
chunksize = job['chunksize']
Nworker = job['Nworker']
workerId = job['workerId']
dark_data = job['dark_data']
fpt = job['fpt']
module = job['module']
rois = job['rois']
mask = job['mask']
h5fname = job['h5fname']
maxSaturatedPixel = job['maxSaturatedPixel']
image_path = f"INSTRUMENT/SCS_DET_DSSC1M-1/DET/{module}CH0:xtdf/image/data"
npulse_path = f"INDEX/SCS_DET_DSSC1M-1/DET/{module}CH0:xtdf/image/count"
with h5py.File(h5fname, 'r') as m:
all_trainIds = m['INDEX/trainId'][()]
n_trains = len(all_trainIds)
n_chunk = int(np.ceil(n_trains/chunksize)) + 1
chunks = np.linspace(0, n_trains, n_chunk, endpoint=True, dtype=int)
# create empty dataset to add actual data to
module_data = xr.DataArray(np.zeros([fpt, 128, 512], dtype=np.float64),
dims=['pulseId', 'x', 'y'],
coords={'pulseId':np.arange(fpt)}).to_dataset(name='dark_corrected_data')
module_data['std_data'] = xr.DataArray(np.zeros([fpt, 128, 512], dtype=np.float64),
dims=['pulseId', 'x', 'y'])
if rois is not None:
for k in rois.keys():
module_data[k] = xr.DataArray(np.empty([n_trains], dtype=np.float64),
dims=['trainId'], coords = {'trainId': all_trainIds})
module_data['counts'] = 0
# crunching
with h5py.File(h5fname, 'r') as m:
#chunk_start = np.arange(len(all_trainIds), step=job['chunksize'], dtype=int)
trains_start = 0
# This line is the strange hack from https://github.com/tqdm/tqdm/issues/485
print(' ', end='', flush=True)
for k,v in enumerate(tqdm(chunks[:-1], desc=f"pool.map#{workerId:02d}")):
chunk_dssc = np.s_[int(chunks[k] * fpt):int(chunks[k+1] * fpt)] # for dssc data
data = m[image_path][chunk_dssc].squeeze()
trains = m['INDEX/trainId'][np.s_[int(chunks[k]):int(chunks[k+1])]]
n_trains = len(trains)
data = data.astype(np.float64)
data = xr.DataArray(np.reshape(data, [n_trains, fpt, 128, 512]),
dims=['trainId', 'pulseId', 'x', 'y'],
coords={'trainId': trains})
temp = data - dark_data
if rois is not None:
temp2 = temp.where(mask)
for k,v in rois.items():
bkg = dark_data.isel({'x':slice(v['x'][0], v['x'][1]),
'y':slice(v['y'][0], v['y'][1])})
im = data.isel({'x':slice(v['x'][0], v['x'][1]),
'y':slice(v['y'][0], v['y'][1])})
smask = mask.isel({'x':slice(v['x'][0], v['x'][1]),
'y':slice(v['y'][0], v['y'][1])})
im = im.where(smask)
cim = im - bkg.where(smask)
val = cim.sum(dim=['x','y'])
tokeep = (im>254).sum(dim=['x', 'y']) < maxSaturatedPixel
tokeep = tokeep.assign_coords(pulseId = val.coords['pulseId'].values)
todrop = 1-tokeep
Ndropped = todrop.sum().values
percent_dropped = 100*Ndropped/(n_trains * fpt)
print(f'Dropped: {Ndropped}, i.e. {percent_dropped:.2f}%')
gval = val.where(tokeep, np.nan)
module_data[k] = gval
module_data['dark_corrected_data'] += temp.sum(dim='trainId')
module_data['std_data'] += (temp**2).sum(dim='trainId')
module_data['counts'] += n_trains
return module_data
""" Digitizers related sub-routines
Copyright (2021) SCS Team.
(contributions preferably comply with pep8 code structure
guidelines.)
"""
import logging
import os
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
from scipy.signal import find_peaks
from ..misc.bunch_pattern_external import is_pulse_at
from ..mnemonics_machinery import mnemonics_for_run
from extra_data import open_run
from extra_data.read_machinery import find_proposal
from extra.components import XrayPulses, OpticalLaserPulses
__all__ = [
'check_peak_params',
'get_digitizer_peaks',
'get_dig_avg_trace',
'load_processed_peaks',
'check_processed_peak_params'
]
log = logging.getLogger(__name__)
def peaks_from_raw_trace(traces, pulseStart, pulseStop, baseStart, baseStop,
period=None, npulses=None, extra_dim=None,
mode='trapz'):
"""
Computes peaks from raw digitizer traces by trapezoidal integration.
Parameters
----------
traces: xarray DataArray or numpy array containing raw traces. If
numpy array is provided, the second dimension is that of the samples.
pulseStart: int or list or 1D-numpy array
trace index of integration start. If 1d array, each value is the start
of one peak. The period and npulses parameters are ignored.
pulseStop: int
trace index of integration stop.
baseStart: int
trace index of background start.
baseStop: int
trace index of background stop.
period: int
number of samples between two peaks. Ignored if pulseStart is a 1D array.
npulses: int
number of pulses. Ignored if pulseStart is a 1D array.
extra_dim: str
Name given to the dimension along the peaks. Defaults to 'pulseId'.
mode: str in ['trapz', 'sum', 'mean', 'amplitude']
The operation performed over the integration region. `amplitude` is
the difference between the baseline and the peak height.
Returns
-------
xarray DataArray
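Example
-------
# hedged sketch: the sample indices below are placeholders, not real settings
peaks = peaks_from_raw_trace(traces, pulseStart=1000, pulseStop=1010,
baseStart=980, baseStop=995, period=440, npulses=20)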
"""
assert traces.ndim == 2
if isinstance(traces, xr.DataArray):
ntid = traces.sizes['trainId']
coords = traces.coords
traces = traces.values
if traces.shape[0] != ntid:
traces = traces.T
else:
coords = None
if hasattr(pulseStart, '__len__'):
pulseStart = np.array(pulseStart)
pulses = pulseStart - pulseStart[0]
pulseStart = pulseStart[0]
else:
if period is None or period == 0:
period = 1
pulses = range(0, npulses*period, period)
if extra_dim is None:
extra_dim = 'pulseId'
results = xr.DataArray(np.empty((traces.shape[0], len(pulses))),
coords=coords,
dims=['trainId', extra_dim])
func = {'trapz': np.trapz, 'sum': np.sum,
'mean': np.mean, 'amplitude': 'amplitude'}.get(mode)
if func is None:
raise ValueError('mode must be a string in ["trapz", "sum",'
'"mean", "amplitude"]')
for i, p in enumerate(pulses):
a = pulseStart + p
b = pulseStop + p
bkga = baseStart + p
bkgb = baseStop + p
if b > traces.shape[1]:
break
bg = np.outer(np.median(traces[:, bkga:bkgb], axis=1),
np.ones(b-a))
if mode == 'amplitude':
results[:, i] = np.max(traces[:, bkga:b], axis=1) - \
np.min(traces[:, bkga:b], axis=1)
else:
results[:, i] = func(traces[:, a:b] - bg, axis=1)
return results
def peaks_from_apd(array, params, digitizer, bpt, bunchPattern):
"""
Extract peak-integrated data according to the bunch pattern.
Parameters
----------
array: xarray DataArray
2D array containing peak-integrated data
params: dict
peak-integration parameters of the digitizer
digitizer: str
digitizer type, one of {'FastADC', 'ADQ412'}
bpt: xarray DataArray
bunch pattern table
bunchPattern: str
one of {'sase1', 'sase3', 'scs_ppl'}, used to select pulses and
assign name of the pulse id dimension.
Returns
-------
xarray DataArray with pulse id coordinates.
"""
if params['enable'] == 0 or (array == 1.).all():
raise ValueError('The digitizer did not record integrated peaks. '
'Consider using raw traces from the same channel '
'for peak integration.')
if digitizer == 'FastADC':
min_distance = 24
if digitizer == 'ADQ412':
min_distance = 440
period = params['period']
if period % min_distance != 0:
log.warning(f'Warning: the pulse period ({period} samples) of '
'digitizer is not a multiple of the pulse separation at '
f'4.5 MHz ({min_distance} samples). Pulse id assignment '
'is likely to fail.')
stride = int(period/min_distance)
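# stride = pulse period expressed in number of 4.5 MHz bunch slots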
npulses_apd = params['npulses']
dim_names = {'sase3': 'sa3_pId', 'sase1': 'sa1_pId', 'scs_ppl': 'ol_pId'}
pulse_dim = dim_names[bunchPattern]
arr_dim = [dim for dim in array.dims if dim != 'trainId'][0]
if npulses_apd > array.sizes[arr_dim]:
log.warning(f'The digitizer was set to record {npulses_apd} pulses '
f'but the array length is only {array.sizes[arr_dim]}.')
npulses_apd = array.sizes[arr_dim]
mask = is_pulse_at(bpt, bunchPattern).rename({'pulse_slot': pulse_dim})
mask = mask.where(mask.trainId.isin(array.trainId), drop=True)
mask = mask.assign_coords({pulse_dim: np.arange(bpt.sizes['pulse_slot'])})
pid = np.sort(np.unique(np.where(mask)[1]))
npulses_bpt = len(pid)
apd_coords = np.arange(pid[0], pid[0] + stride*npulses_apd, stride)
noalign = False
if len(np.intersect1d(apd_coords, pid, assume_unique=True)) < npulses_bpt:
log.warning('Not all pulses were recorded. The digitizer '
'was set to record pulse ids '
f'{apd_coords[apd_coords<bpt.sizes["pulse_slot"]]} but the'
'bunch pattern for'
f' {bunchPattern} is {pid}. Skipping pulse ID alignment.')
noalign = True
array = array.isel({arr_dim: slice(0, npulses_apd)})
array = array.where(array != 1.)
if noalign:
return array
array = array.rename(
{arr_dim: pulse_dim}).assign_coords({pulse_dim: apd_coords})
array, mask = xr.align(array, mask, join='inner')
array = array.where(mask, drop=True)
return array
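# Hypothetical sketch of how peaks_from_apd is typically driven (names such
# as 'run', 'apd' and 'bpt' are placeholders defined elsewhere):
# >>> params = channel_peak_params(run, 'MCP2apd')
# >>> peaks = peaks_from_apd(apd, params, 'ADQ412', bpt, 'sase3')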
def get_dig_avg_trace(run, mnemonic, ntrains=None):
"""
Compute the average of a digitizer trace over ntrains evenly spaced
trains.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the digitizer data.
mnemonic: str
ToolBox mnemonic of the digitizer data, e.g. 'MCP2apd'.
ntrains: int
Number of trains used to calculate the average raw trace.
If None, all trains are used.
Returns
-------
trace: DataArray
The average digitizer trace
"""
run_mnemonics = mnemonics_for_run(run)
if ntrains is None:
sel = run
else:
total_tid = len(run.train_ids)
stride = int(np.max([1, np.floor(total_tid/ntrains)]))
sel = run.select_trains(np.s_[0:None:stride])
m = run_mnemonics[mnemonic]
raw_trace = sel.get_array(m['source'], m['key'], m['dim'])
raw_trace = raw_trace.mean(dim='trainId')
return raw_trace
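# Example (illustrative; assumes an opened DataCollection named 'run'):
# >>> avg_trace = get_dig_avg_trace(run, 'FastADC2raw', ntrains=200)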
def channel_peak_params(run, source, key=None, channel=None, board=None):
"""
Extract peak-integration parameters used by a channel of the digitizer.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the digitizer data.
source: str
ToolBox mnemonic of a digitizer data, e.g. "MCP2apd" or
"FastADC4peaks", or name of digitizer source, e.g.
'SCS_UTC1_ADQ/ADC/1:network'.
key: str
optional, used in combination with source (if source is not a ToolBox
mnemonic) instead of channel and board.
channel: int or str
The digitizer channel for which to retrieve the parameters. If None,
inferred from the source mnemonic or source + key arguments.
board: int
Board of the ADQ412. If None, inferred from the source mnemonic or
source + key arguments.
Returns
-------
dict with peak integration parameters.
"""
run_mnemonics = mnemonics_for_run(run)
if source in run_mnemonics:
m = run_mnemonics[source]
source = m['source']
key = m['key']
if 'network' in source:
return adq412_channel_peak_params(run, source, key, channel, board)
else:
return fastADC_channel_peak_params(run, source, channel)
def fastADC_channel_peak_params(run, source, channel=None):
"""
Extract peak-integration parameters used by a channel of the Fast ADC.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the digitizer data.
source: str
Name of Fast ADC source, e.g. 'SCS_UTC1_MCP/ADC/1:channel_5.output'.
channel: int
The Fast ADC channel for which to retrieve the parameters. If None,
inferred from the source.
Returns
-------
dict with peak integration parameters.
"""
fastADC_keys = {
'baseStart': ('baselineSettings.baseStart.value',
'baseStart.value'),
'baseStop': ('baseStop.value',),
'baseLength': ('baselineSettings.length.value',),
'enable': ('enablePeakComputation.value',),
'pulseStart': ('initialDelay.value',),
'pulseLength': ('peakSamples.value',),
'npulses': ('numPulses.value',),
'period': ('pulsePeriod.value',)
}
if channel is None:
channel = int(source.split(':')[1].split('.')[0].split('_')[1])
baseName = f'channel_{channel}.'
source = source.split(':')[0]
data = run.select(source).train_from_index(0)[1][source]
result = {}
for mnemo, versions in fastADC_keys.items():
for v in versions:
key = baseName + v
if key in data:
result[mnemo] = data[key]
if 'baseLength' in result:
result['baseStop'] = result['baseStart'] + \
result.pop('baseLength')
if 'pulseLength' in result:
result['pulseStop'] = result['pulseStart'] + \
result.pop('pulseLength')
return result
def adq412_channel_peak_params(run, source, key=None,
channel=None, board=None):
"""
Extract peak-integration parameters used by a channel of the ADQ412.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the digitizer data.
source: str
Name of ADQ412 source, e.g. 'SCS_UTC1_ADQ/ADC/1:network'.
key: str
optional, e.g. 'digitizers.channel_1_D.apd.pulseIntegral', used in
combination of source instead of channel and board.
channel: int or str
The ADQ412 channel for which to retrieve the parameters. If None,
inferred from the source mnemonic or source + key arguments.
board: int
Board of the ADQ412. If None, inferred from the source mnemonic or
source + key arguments.
Returns
-------
dict with peak integration parameters.
"""
if key is None:
if channel is None or board is None:
raise ValueError('key or channel + board arguments are '
'missing.')
else:
k = key.split('.')[1].split('_')
ch_to_int = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
channel = ch_to_int[k[2]]
board = k[1]
baseName = f'board{board}.apd.channel_{channel}.'
source = source.split(':')[0]
adq412_keys = {
'baseStart': (baseName + 'baseStart.value',),
'baseStop': (baseName + 'baseStop.value',),
'enable': (baseName + 'enable.value',),
'pulseStart': (baseName + 'pulseStart.value',),
'pulseStop': (baseName + 'pulseStop.value',),
'initialDelay': (baseName + 'initialDelay.value',),
'upperLimit': (baseName + 'upperLimit.value',),
'npulses': (f'board{board}.apd.numberOfPulses.value',)
}
data = run.select(source).train_from_index(0)[1][source]
result = {}
for mnemo, versions in adq412_keys.items():
for key in versions:
if key in data:
result[mnemo] = data[key]
result['period'] = result.pop('upperLimit') - \
result.pop('initialDelay')
return result
def find_peaks_in_raw_trace(trace, height=None, width=1, distance=24):
"""
Find integration parameters for peak integration of a raw
digitizer trace. Based on scipy find_peaks().
Parameters
----------
trace: numpy array or xarray DataArray
The digitizer raw trace used to find peaks
height: int
minimum height for peak determination
width: int
minimum width of peak
distance: int
minimum distance between two peaks
Returns
-------
dict with keys 'pulseStart', 'pulseStop', 'baseStart', 'baseStop',
'period', 'npulses' and values in number of samples.
"""
if isinstance(trace, xr.DataArray):
trace = trace.values
trace_norm = trace - np.median(trace)
trace_norm = -trace_norm if np.mean(trace_norm) < 0 else trace_norm
SNR = np.max(np.abs(trace_norm)) / np.std(trace_norm[:100])
if SNR < 10:
log.warning('signal-to-noise ratio too low: cannot '
'automatically find peaks.')
return {'pulseStart': 2, 'pulseStop': 3,
'baseStart': 0, 'baseStop': 1,
'period': 0, 'npulses': 1}
min_height = min(3 * SNR, np.max(np.abs(trace_norm)) / 3)
peaks, prop = find_peaks(trace_norm, distance=distance,
height=min_height,
width=2)
params = {}
start = int(prop['left_ips'][0])
if len(peaks) == 1:
params['pulseStart'] = start
params['period'] = 0
else:
pulse_period = int(np.median(np.diff(peaks)))
pulse_ids = np.digitize(peaks,
np.arange(peaks[0] - pulse_period/2,
peaks[-1] + pulse_period/2,
pulse_period)) - 1
if len(np.unique(np.diff(pulse_ids))) == 1:
# Regular pattern
params['pulseStart'] = start
params['period'] = pulse_period
else:
# Irregular pattern
params['pulseStart'] = start + pulse_ids * pulse_period
params['period'] = 0
params['pulseStop'] = int(prop['right_ips'][0])
params['baseStop'] = (3 * start - peaks[0]) / 2
params['baseStop'] = np.min([params['baseStop'],
int(prop['right_ips'][0])]).astype(int)
params['baseStart'] = params['baseStop'] - np.mean(prop['widths'])/2
params['baseStart'] = np.max([params['baseStart'], 0]).astype(int)
params['npulses'] = len(peaks)
return params
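# Illustrative sketch: find integration windows on an averaged trace and
# reuse them for peak extraction ('run' and 'traces' are placeholders;
# distance=24 corresponds to the FastADC pulse separation used above):
# >>> trace = get_dig_avg_trace(run, 'FastADC2raw', ntrains=200)
# >>> params = find_peaks_in_raw_trace(trace, distance=24)
# >>> peaks = peaks_from_raw_trace(traces, **params, mode='trapz')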
def find_peak_integration_parameters(run, mnemonic, raw_trace=None,
integParams=None, pattern=None,
ntrains=None):
'''
Find the peak integration parameters, either from the provided
integParams or by peak finding on the raw trace, and reconcile them
with the bunch pattern if one is given.
'''
run_mnemonics = mnemonics_for_run(run)
digitizer = digitizer_type(run, run_mnemonics[mnemonic]['source'])
pulse_period = 24 if digitizer == "FastADC" else 440
if integParams is None:
# load raw trace and find peaks
autoFind = True
if raw_trace is None:
raw_trace = get_dig_avg_trace(run, mnemonic, ntrains)
params = find_peaks_in_raw_trace(raw_trace, distance=pulse_period)
else:
# inspect provided parameters
autoFind = False
required_keys = ['pulseStart', 'pulseStop', 'baseStart',
'baseStop']
add_text = ''
if pattern is None and not hasattr(integParams['pulseStart'],
'__len__'):
required_keys += ['period', 'npulses']
add_text = 'Bunch pattern not provided. '
if not all(name in integParams for name in required_keys):
raise ValueError(add_text + 'All keys of integParams argument '
f'{required_keys} are required.')
params = integParams.copy()
# extract pulse ids from the parameters (starting at 0)
pulse_ids_params = None
if hasattr(params['pulseStart'], '__len__'):
if params.get('npulses') is not None and (
params.get('npulses') != len(params['pulseStart'])):
log.warning('The number of pulses does not match the length '
'of pulseStart. Using length of pulseStart as '
'the number of pulses.')
params['npulses'] = len(params['pulseStart'])
pulse_ids_params = ((np.array(params['pulseStart']) -
params['pulseStart'][0]) / pulse_period).astype(int)
elif 'npulses' in params and 'period' in params:
if params['npulses'] == 1:
pulse_ids_params = np.array([0])
else:
pulse_ids_params = np.arange(0,
params['npulses'] * params['period'] / pulse_period,
params['period'] / pulse_period).astype(int)
# Extract pulse_ids, period and npulses from bunch pattern
pulse_ids_bp, npulses_bp, period_bp = None, None, 0
regular = True
if pattern is not None:
bunchPattern = 'sase3' if hasattr(pattern, 'sase') else 'scs_ppl'
if pattern.is_constant_pattern() is False:
log.warning('The number of pulses changed during the run.')
pulse_ids_bp = np.unique(pattern.pulse_ids(labelled=False,
copy=False))
npulses_bp, period_bp = None, 0
regular = False
else:
pulse_ids_bp = pattern.peek_pulse_ids(labelled=False)
npulses_bp = len(pulse_ids_bp)
if npulses_bp > 1:
periods = np.diff(pulse_ids_bp)
if len(np.unique(periods)) > 1:
regular = False
else:
period_bp = np.unique(periods)[0] * pulse_period
# Compare parameters with bunch pattern
if pulse_ids_params is None:
params['npulses'] = npulses_bp
params['period'] = period_bp
pulse_ids_params = np.arange(0,
params['npulses'] * params['period'] / pulse_period,
params['period'] / pulse_period).astype(int)
if len(pulse_ids_params) == len(pulse_ids_bp):
if not (pulse_ids_params == pulse_ids_bp - pulse_ids_bp[0]).all():
log.warning('The provided pulseStart parameters do not match '
f'the {bunchPattern} bunch pattern pulse ids. '
'Using bunch pattern parameters.')
pulse_ids_params = pulse_ids_bp
if (npulses_bp != params.get('npulses') or
period_bp != params.get('period')):
if autoFind:
add_text = 'Automatically found '
else:
add_text = 'Provided '
log.warning(add_text + 'integration parameters '
f'(npulses={params.get("npulses")}, ' +
f'period={params.get("period")}) do not match the '
f'{bunchPattern} bunch pattern (npulses='
f'{npulses_bp}, period={period_bp}). Using bunch '
'pattern parameters.')
pulse_ids_params = pulse_ids_bp
params['npulses'] = npulses_bp
params['period'] = period_bp
if not regular:
# Irregular pattern
if hasattr(params['pulseStart'], '__len__'):
start = params['pulseStart'][0]
else:
start = params['pulseStart']
params['pulseStart'] = np.array(
[int(start + (pid - pulse_ids_params[0]) * pulse_period)
for pid in pulse_ids_params])
params['period'] = 0
else:
# Regular pattern
if hasattr(params['pulseStart'], '__len__'):
params['pulseStart'] = params['pulseStart'][0]
if len(pulse_ids_params) == 1:
params['period'] = 0
return params, pulse_ids_params, regular, raw_trace
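# Illustrative sketch (assumed workflow): reconcile automatically found
# parameters with the SASE 3 bunch pattern. 'run' is a placeholder for an
# opened DataCollection.
# >>> pattern = XrayPulses(run)
# >>> params, pulse_ids, regular, trace = find_peak_integration_parameters(
# ...     run, 'FastADC2raw', pattern=pattern)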
def check_peak_params(proposal, runNB, mnemonic, raw_trace=None,
ntrains=200, integParams=None, bunchPattern='sase3',
plot=True, show_all=False):
"""
Checks and plots the peak parameters (pulse window and baseline window
of a raw digitizer trace) used to compute the peak integration. These
parameters are either set by the digitizer peak-integration settings,
or are determined by a peak-finding algorithm. The parameters
can also be provided manually for visual inspection. The plot either
shows the first and last pulse of the trace or the entire trace.
Parameters
----------
proposal: int
the proposal number
runNB: int
the run number
mnemonic: str
ToolBox mnemonic of the digitizer data, e.g. 'MCP2apd'.
raw_trace: optional, 1D numpy array or xarray DataArray
Raw trace to display. If None, the average raw trace over
ntrains of the corresponding channel is loaded (this can
be time-consuming).
ntrains: optional, int
Only used if raw_trace is None. Number of trains used to
calculate the average raw trace of the corresponding channel.
plot: bool
If True, displays the raw trace and peak integration regions.
show_all: bool
If True, displays the entire raw trace and all peak integration
regions (this can be time-consuming).
If False, shows the first and last pulse according to the bunchPattern.
bunchPattern: optional, str
Only used if plot is True. Checks the bunch pattern against
the digitizer peak parameters and shows potential mismatch.
Returns
-------
dictionary of peak integration parameters
"""
run = open_run(proposal, runNB)
run_mnemonics = mnemonics_for_run(run)
digitizer = digitizer_type(run, run_mnemonics[mnemonic]['source'])
pulse_period = 24 if digitizer == "FastADC" else 440
pattern = None
try:
if bunchPattern == 'sase3':
pattern = XrayPulses(run)
if bunchPattern == 'scs_ppl':
pattern = OpticalLaserPulses(run)
except Exception as e:
log.warning(e)
bunchPattern = None
params, pulse_ids, regular, raw_trace = find_peak_integration_parameters(
run, mnemonic, raw_trace, integParams, pattern)
if bunchPattern:
if regular:
add_text = ''
if len(pulse_ids) > 1:
add_text = f's, {(pulse_ids[1]-pulse_ids[0]) * pulse_period}'\
+ ' samples between two pulses'
print(f'Bunch pattern {bunchPattern}: {len(pulse_ids)} pulse'
+ add_text)
else:
print(f'Bunch pattern {bunchPattern}: Not a regular pattern. '
f'{len(pulse_ids)} pulses, pulse_ids='
f'{pulse_ids}.')
if integParams is None:
title = 'Auto-find peak parameters'
else:
title = 'Provided peak parameters'
no_change = True
for k, v in integParams.items():
no_change = no_change & (v == params[k])
if hasattr(no_change, '__len__'):
no_change = no_change.all()
if not no_change:
print('The provided parameters did not match the bunch '
'pattern and were adjusted.')
title += ' (adjusted)'
if regular:
add_text = ''
if params['npulses'] > 1:
add_text = f's, {params["period"]} samples between two pulses'
print(title + ': ' + f' {params["npulses"]} pulse' + add_text)
else:
print(f'{title}: Not a regular pattern. '
f'{len(pulse_ids)} pulses, pulse_ids='
f'{pulse_ids}.')
if plot:
if raw_trace is None:
raw_trace = get_dig_avg_trace(run, mnemonic)
fig, ax = plotPeakIntegrationWindow(raw_trace, params,
show_all=show_all)
fig.suptitle(f'p{proposal} r{runNB} ' + title, size=12)
return params
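# Example (illustrative; proposal and run numbers are placeholders):
# >>> params = check_peak_params(2212, 235, 'FastADC2raw',
# ...                            bunchPattern='scs_ppl', plot=True)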
def plotPeakIntegrationWindow(raw_trace, params, show_all=False):
if hasattr(params['pulseStart'], '__len__'):
starts = np.array(params['pulseStart'])
stops = params['pulseStop'] + starts - params['pulseStart'][0]
baseStarts = params['baseStart'] + starts - params['pulseStart'][0]
baseStops = params['baseStop'] + starts - params['pulseStart'][0]
else:
n = params['npulses']
p = params['period']
starts = [params['pulseStart'] + i*p for i in range(n)]
stops = [params['pulseStop'] + i*p for i in range(n)]
baseStarts = [params['baseStart'] + i*p for i in range(n)]
baseStops = [params['baseStop'] + i*p for i in range(n)]
if show_all:
fig, ax = plt.subplots(figsize=(6, 3), constrained_layout=True)
for i in range(len(starts)):
lbl = 'baseline' if i == 0 else None
lp = 'peak' if i == 0 else None
ax.axvline(baseStarts[i], ls='--', color='k')
ax.axvline(baseStops[i], ls='--', color='k')
ax.axvspan(baseStarts[i], baseStops[i],
alpha=0.5, color='grey', label=lbl)
ax.axvline(starts[i], ls='--', color='r')
ax.axvline(stops[i], ls='--', color='r')
ax.axvspan(starts[i], stops[i],
alpha=0.2, color='r', label=lp)
ax.plot(raw_trace, color='C0', label='raw trace')
ax.legend(fontsize=8)
return fig, [ax]
fig, ax = plt.subplots(1, 2, figsize=(6, 3), constrained_layout=True)
for plot in range(2):
title = 'First pulse' if plot == 0 else 'Last pulse'
i = 0 if plot == 0 else - 1
for j in range(min(2, len(starts))):
if plot == 1:
j = -j
label = 'baseline' if j == 0 else ''
ax[plot].axvline(baseStarts[i+j], ls='--', color='k')
ax[plot].axvline(baseStops[i+j], ls='--', color='k')
ax[plot].axvspan(baseStarts[i+j], baseStops[i+j],
alpha=0.5, color='grey', label=label)
label = 'peak' if j == 0 else ''
ax[plot].axvline(starts[i+j], ls='--', color='r')
ax[plot].axvline(stops[i+j], ls='--', color='r')
ax[plot].axvspan(starts[i+j], stops[i+j],
alpha=0.2, color='r', label=label)
if len(starts) > 1:
period = starts[1] - starts[0]
xmin = np.max([0, baseStarts[i] - int(1.5*period)])
xmax = np.min([stops[i] + int(1.5*period), raw_trace.size])
else:
xmin = np.max([0, baseStarts[i] - 200])
xmax = np.min([stops[i] + 200, raw_trace.size])
ax[plot].plot(np.arange(xmin, xmax),
raw_trace[xmin:xmax], color='C0', label='raw trace')
ax[plot].legend(fontsize=8)
ax[plot].set_xlim(xmin, xmax)
ax[plot].set_xlabel('digitizer samples')
ax[plot].set_title(title, size=10)
return fig, ax
def digitizer_type(run, source):
"""
Finds the digitizer type based on the class Id / name of the source.
Example source: 'SCS_UTC1_MCP/ADC/1'. Defaults to ADQ412 if not found.
"""
ret = None
if '_MCP/ADC/1' in source:
ret = 'FastADC'
if '_ADQ/ADC/1' in source:
ret = 'ADQ412'
if ret is None:
digi_dict = {'FastAdc': 'FastADC',
'FastAdcLegacy': 'FastADC',
'AdqDigitizer': 'ADQ412',
'PyADCChannel': 'FastADC',
'PyADCChannelLegacy': 'FastADC'}
try:
source = source.split(':')[0]
classId = run.get_run_value(source, 'classId.value')
ret = digi_dict.get(classId)
except Exception as e:
log.warning(str(e))
log.warning(f'Could not find digitizer type from source {source}.')
ret = 'ADQ412'
return ret
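# Example (illustrative): the returned type selects the minimum pulse
# separation used elsewhere in this module (24 samples for 'FastADC',
# 440 samples for 'ADQ412').
# >>> digitizer_type(run, 'SCS_UTC1_ADQ/ADC/1:network')  # -> 'ADQ412'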
def get_digitizer_peaks(proposal, runNB, mnemonic, merge_with=None,
bunchPattern='sase3', integParams=None, mode='trapz',
save=False, subdir='usr/processed_runs'):
"""
Extract the peaks from digitizer raw traces. The result can be merged
to an existing `merge_with` dataset and/or saved into an HDF file.
The calculation is a trapezoidal integration between 'pulseStart' and
'pulseStop' with subtraction of a baseline defined as the median between
'baseStart' and 'baseStop'.
The integration parameters can either be provided using integParams, or
computed by a peak finding algorithm if integParams is None.
If the bunchPattern argument is provided, the pulse ids are aligned to it.
If there is a mismatch between the provided parameters or the computed
parameters and the bunch pattern, the bunch pattern parameters prevail.
Parameters
----------
proposal: int
the proposal number
runNB: int
the run number
mnemonic: str
mnemonic for FastADC or ADQ412, e.g. "I0_ILHraw" or "MCP3apd".
The data is either loaded from the DataCollection or taken from
merge_with.
merge_with: xarray Dataset
If provided, the resulting Dataset will be merged with this one.
bunchPattern: str or dict
'sase1' or 'sase3' or 'scs_ppl', 'None': bunch pattern
integParams: dict
dictionary for raw trace integration, e.g.
{'pulseStart': 100, 'pulseStop': 200, 'baseStart': 50,
'baseStop': 99, 'period': 24, 'npulses': 500}.
If None, integration parameters are computed automatically.
mode: str in ['trapz', 'sum', 'mean', 'amplitude']
The operation performed over the integration region. `amplitude` is
the difference between the baseline and the peak height.
save: bool
If True, save the computed peaks and the average raw trace in a h5
file in the subdir + f'/r{runNB:04d}/ folder.
subdir: str
subdirectory to save the output in.
Returns
-------
xarray Dataset with digitizer peak variables. Raw variables are
substituted by the calculated peak values (e.g. "FastADC2raw" becomes
"FastADC2peaks").
"""
# prepare resulting dataset
if bool(merge_with):
mw_ds = merge_with.drop(mnemonic, errors='ignore')
else:
mw_ds = xr.Dataset()
run = open_run(proposal, runNB)
run_mnemonics = mnemonics_for_run(run)
mnemo = run_mnemonics.get(mnemonic)
if mnemo is None:
log.warning('Mnemonic not found. Skipping.')
return xr.DataArray()
source, key = mnemo['source'], mnemo['key']
extra_dim = {'sase3': 'sa3_pId', 'scs_ppl': 'ol_pId'}.get(bunchPattern)
if extra_dim is None:
extra_dim = 'pulseId'
pattern = None
try:
if bunchPattern == 'sase3':
pattern = XrayPulses(run)
if bunchPattern == 'scs_ppl':
pattern = OpticalLaserPulses(run)
except Exception as e:
log.warning(e)
# load traces and calculate the average
traces = run[source, key].xarray(name=mnemonic, extra_dims=mnemo['dim'])
trace = traces.mean('trainId').rename(mnemonic.replace('raw', 'avg'))
# find integration parameters
params, pulse_ids, regular, trace = find_peak_integration_parameters(
run, mnemonic, trace, integParams, pattern)
# extract peaks
peaks = peaks_from_raw_trace(traces, **params,
extra_dim=extra_dim, mode=mode)
peaks = peaks.rename(mnemonic.replace('raw', 'peaks'))
# Align pulse id
if regular is True:
peaks = peaks.assign_coords({extra_dim: pulse_ids})
elif pattern:
mask = pattern.pulse_mask(labelled=False)
mask = xr.DataArray(mask, dims=['trainId', extra_dim],
coords={'trainId': run[source, key].train_ids,
extra_dim: np.arange(mask.shape[1])})
mask = mask.sel({extra_dim: pulse_ids})
peaks = peaks.where(mask, drop=True)
# add peak integration parameters attributes
for p in params:
if params[p] is None:
params[p] = 'None'
peaks.attrs[f'{peaks.name}_{p}'] = params[p]
peaks.attrs[f'{peaks.name}_mode'] = mode
# save peaks
if save:
save_peaks(proposal, runNB, peaks, trace, subdir)
# merge with existing dataset
ds = mw_ds.merge(peaks.to_dataset(promote_attrs=True), join='inner',
combine_attrs='drop_conflicts')
return ds
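# Example (illustrative; numbers and the existing dataset 'ds' are
# placeholders):
# >>> ds = get_digitizer_peaks(2212, 235, 'FastADC2raw', merge_with=ds,
# ...                          bunchPattern='sase3', mode='trapz')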
def save_peaks(proposal, runNB, peaks, avg_trace, subdir):
'''
Save the peaks extracted with get_digitizer_peaks() as well as the
average raw trace in a dataset at the proposal + subdir location.
If a dataset already exists, the new data is merged with it.
Parameters
----------
proposal: int
the proposal number
runNB: int
the run number
peaks: xarray DataArray
the 2D-array obtained by get_digitizer_peaks()
avg_trace: xarray DataArray
the average raw trace over the trains
subdir: str
subdirectory. The data is stored in
<proposal path>/<subdir>/r{runNB:04d}/r{runNB:04d}-digitizers-data.h5
Returns
-------
None
'''
root = find_proposal(f'p{proposal:06d}')
path = os.path.join(root, subdir + f'/r{runNB:04d}/')
os.makedirs(path, exist_ok=True)
fname = path + f'r{runNB:04d}-digitizers-data.h5'
ds_peaks = peaks.to_dataset(promote_attrs=True)
if os.path.isfile(fname):
ds = xr.load_dataset(fname)
ds = ds.drop_vars([peaks.name, avg_trace.name], errors='ignore')
for dim in ds.dims:
if all(dim not in ds[v].dims for v in ds):
ds = ds.drop_dims(dim)
dim_name = 'sampleId'
if 'sampleId' in ds.dims and ds.sizes['sampleId'] != len(avg_trace):
dim_name = 'sampleId2'
avg_trace = avg_trace.rename({avg_trace.dims[0]: dim_name})
if f'params_{peaks.name}' in ds.attrs:
del ds.attrs[f'params_{peaks.name}']
ds = xr.merge([ds, ds_peaks, avg_trace],
combine_attrs='drop_conflicts', join='inner')
else:
ds = ds_peaks.merge(avg_trace.rename({avg_trace.dims[0]: 'sampleId'}))
ds.to_netcdf(fname, format='NETCDF4')
print(f'saved data into {fname}.')
return None
def load_processed_peaks(proposal, runNB, mnemonic=None,
data='usr/processed_runs', merge_with=None):
"""
Load processed digitizer peaks data.
Parameters
----------
proposal: int
the proposal number
runNB: int
the run number
mnemonic: str
the mnemonic containing peaks. Example: 'XRD_MCP_BIGpeaks'.
If None, the entire dataset is loaded
data: str
subdirectory. The data is stored in
<proposal path>/<subdir>/r{runNB:04d}/r{runNB:04d}-digitizers-data.h5
merge_with: xarray Dataset
A dataset to merge the data with.
Returns
-------
xarray DataArray if mnemonic is not None and merge_with is None
xarray Dataset if mnemonic is None or merge_with is not None.
Example
-------
# load the mono energy and the MCP_BIG peaks
run, ds = tb.load(proposal, runNB, 'nrj')
ds = tb.load_processed_peaks(proposal, runNB,'XRD_MCP_BIGpeaks',
merge_with=ds)
"""
if mnemonic is None:
return load_all_processed_peaks(proposal, runNB, data, merge_with)
root = find_proposal(f'p{proposal:06d}')
path = os.path.join(root, data + f'/r{runNB:04d}/')
fname = path + f'r{runNB:04d}-digitizers-data.h5'
if os.path.isfile(fname):
ds = xr.load_dataset(fname)
if mnemonic not in ds:
log.warning(f'Mnemonic {mnemonic} not found in {fname}')
return {}
da = ds[mnemonic]
da.attrs = {k: ds.attrs[k] for k in ds.attrs if f'{mnemonic}_' in k}
if merge_with is not None:
return merge_with.merge(da.to_dataset(promote_attrs=True),
combine_attrs='drop_conflicts',
join='inner')
else:
return da
else:
log.warning(f'{fname} not found.')
return merge_with
def load_all_processed_peaks(proposal, runNB, data='usr/processed_runs',
merge_with=None):
"""
Load processed digitizer peaks dataset. The data contains the peaks,
the average raw trace and the integration parameters (attribute) of
each processed digitizer source.
Parameters
----------
proposal: int
the proposal number
runNB: int
the run number
data: str
subdirectory. The data is stored in
<proposal path>/<subdir>/r{runNB:04d}/r{runNB:04d}-digitizers-data.h5
merge_with: xarray Dataset
A dataset to merge the data with.
Returns
-------
xarray Dataset
"""
root = find_proposal(f'p{proposal:06d}')
path = os.path.join(root, data + f'/r{runNB:04d}/')
fname = path + f'r{runNB:04d}-digitizers-data.h5'
if os.path.isfile(fname):
if merge_with is not None:
return merge_with.merge(xr.load_dataset(fname),
combine_attrs='drop_conflicts',
join='inner')
return xr.load_dataset(fname)
else:
log.warning(f'{fname} not found.')
return merge_with
def check_processed_peak_params(proposal, runNB, mnemonic,
data='usr/processed_runs',
plot=True, show_all=False):
"""
Check the integration parameters used to generate the processed
peak values.
Parameters
----------
proposal: int
the proposal number
runNB: int
the run number
mnemonic: str
the mnemonic containing peaks. Example: 'XRD_MCP_BIGpeaks'.
If None, the entire dataset is loaded
data: str
subdirectory. The data is stored in
<proposal path>/<subdir>/r{runNB:04d}/r{runNB:04d}-digitizers-data.h5
plot: bool
If True, displays the raw trace and peak integration regions.
show_all: bool
If True, displays the entire raw trace and all peak integration
regions (this can be time-consuming).
If False, shows the first and last pulses.
Returns
-------
params: dict
the integration parameters with keys ['pulseStart', 'pulseStop',
'baseStart', 'baseStop', 'period', 'npulses'].
See `get_digitizer_peaks()`.
"""
root = find_proposal(f'p{proposal:06d}')
path = os.path.join(root, data + f'/r{runNB:04d}/')
fname = path + f'r{runNB:04d}-digitizers-data.h5'
if os.path.isfile(fname):
ds = xr.load_dataset(fname)
if mnemonic.replace('raw', 'peaks') not in ds:
log.warning(f'Mnemonic {mnemonic} not found in {fname}')
return {}
params = {k.replace(f'{mnemonic}_', ''): ds.attrs[k] for
k in ds.attrs if f'{mnemonic}_' in k}
if plot:
title = 'Processed peak parameters'
fig, ax = plotPeakIntegrationWindow(
ds[mnemonic.replace('peaks', 'avg')],
params,
show_all=show_all)
fig.suptitle(f'p{proposal} r{runNB} ' + title, size=12)
return params
else:
log.warning(f'{fname} not found.')
return {}
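# Example (illustrative): inspect the windows that were used to produce a
# processed peaks variable (proposal and run numbers are placeholders):
# >>> params = check_processed_peak_params(2212, 235, 'XRD_MCP_BIGpeaks')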
"""
DSSC-detector class module
The dssc detector class. It represents a namespace for frequent evaluation
while implicitly applying/requiring certain structure/naming conventions to
its objects.
comments:
- contributions should comply with pep8 code structure guidelines.
- Plot routines don't fit into objects since they are rather fluent.
They have been outsourced to dssc_plot.py. They can now be accessed
as toolbox_scs member functions.
"""
import os
import logging
import joblib
import numpy as np
import xarray as xr
import toolbox_scs as tb
from ..util.exceptions import ToolBoxValueError, ToolBoxFileError
from .dssc_data import (
save_xarray, load_xarray, save_attributes_h5,
search_files, get_data_formatted)
from .dssc_misc import (
load_dssc_info, get_xgm_formatted, get_tim_formatted)
from .dssc_processing import (
process_dssc_data, create_empty_dataset)
__all__ = [
"DSSCBinner",
"DSSCFormatter"]
log = logging.getLogger(__name__)
class DSSCBinner:
def __init__(self, proposal_nr, run_nr,
binners={},
xgm_name='SCS_SA3',
tim_names=['MCP1apd', 'MCP2apd', 'MCP3apd'],
dssc_coords_stride=2,
):
"""
A dssc binner object. Loads and bins the dssc data according to the
bins specified in 'binners'. The data can be reduced further through
masking using XGM or TIM data.
Parameters
----------
proposal_nr: int, str
proposal number containing run folders
run_nr: int, str
run number
binners: dictionary
dictionary containing binners constructed using the
'create_dssc_bins' toolbox_scs.detectors-method.
xgm_name: str
a valid mnemonic key of the XGM data to be used to mask the dssc
frames. Since the xgm is used in several methods its name can be
set here globally.
tim_names: list of strings
a list of valid mnemonic keys for an MCP in the TIM. Once the
corresponding data is loaded, the different sources are averaged.
dssc_coords_stride: int, list
defines which dssc frames should be normalized using data from the
xgm. The parameter may be an integer (stride parameter) or a list,
that assigns each pulse to its corresponding dssc frame number.
Returns
-------
DSSCBinner: object
Example
-------
1.) quick -> generic bins, no xgm,
>>> import toolbox_scs as tb
>>> run235 = tb.DSSCBinner(proposal_nr=2212, run_nr=235)
2.) detailed -> docs
"""
# ---------------------------------------------------------------------
# object (run) properties
# ---------------------------------------------------------------------
self.proposal = proposal_nr
self.runnr = run_nr
self.info = load_dssc_info(proposal_nr, run_nr)
self.run, _ = tb.load(proposal_nr, run_nr)
self.binners = {}
for b in binners:
self.add_binner(b, binners[b])
self.xgm_name = xgm_name
self.tim_names = tim_names
self.dssc_coords_stride = dssc_coords_stride
self.xgm = None
self.tim = None
self.pulsemask = None
log.debug("Constructed DSSC object")
def __del__(self):
pass
def add_binner(self, name, binner):
"""
Add additional binner to internal dictionary
Parameters
----------
name: str
name of binner to be created
binner: xarray.DataArray
An array that defines how the respective coordinate should be binned.
Raises
------
ToolBoxValueError: Exception
Raises exception in case the name does not correspond to a valid
binner name. To be generalized.
"""
if name in ['trainId', 'pulse', 'x', 'y']:
self.binners[name] = binner
else:
msg = "Invalid binner name"
log.info(msg+", no binner created")
raise ToolBoxValueError(msg, name)
def load_xgm(self):
"""
load xgm data and construct coordinate array according to corresponding
dssc frame number.
"""
self.xgm = get_xgm_formatted(self.run,
self.xgm_name,
self.dssc_coords_stride)
def load_tim(self):
"""
load tim data and construct coordinate array according to corresponding
dssc frame number.
"""
self.tim = get_tim_formatted(self.proposal,
self.runnr,
self.tim_names,
self.dssc_coords_stride)
def create_pulsemask(self, use_data='xgm', threshold=(0, np.inf)):
"""
creates a mask for dssc frames according to measured xgm intensity.
Once such a mask has been constructed, it will be used in the data
reduction process to drop out-of-bounds pulses.
"""
fpt = self.info['frames_per_train']
n_trains = self.info['number_of_trains']
trainIds = self.info['trainIds']
data = np.ones([n_trains, fpt], dtype=bool)
self.pulsemask = xr.DataArray(data,
dims=['trainId', 'pulse'],
coords={'trainId': trainIds,
'pulse': range(fpt)})
if use_data == 'xgm':
if self.xgm is None:
self.load_xgm()
valid = (self.xgm > threshold[0]) * \
(self.xgm < threshold[1])
if use_data == 'tim':
if self.tim is None:
self.load_tim()
valid = (self.tim > threshold[0]) * \
(self.tim < threshold[1])
self.pulsemask = \
(valid.combine_first(self.pulsemask).astype(bool))[:, 0:fpt]
log.info(f'created pulse mask used during processing')
def get_info(self):
"""
Returns the expected shape of the binned dataset, in case binners have
been defined.
"""
if any(self.binners):
empty = create_empty_dataset(self.info, self.binners)
return empty.dims
else:
log.info("no binner defined yet.")
pass
def _bin_metadata(self, data):
if self.pulsemask is not None:
data = data.where(self.pulsemask)
for b in self.binners:
if b in ['trainId', 'pulse']:
data[b+"_binned"] = self.binners[b]
data = data.groupby(b+"_binned").mean(b)
data = data.rename(name_dict={b+"_binned": b})
log.debug('binned metadata data according to dssc binners.')
return data.transpose('trainId', 'pulse')
def get_xgm_binned(self):
"""
Bin the xgm data according to the binners of the dssc data. The result
can eventually be merged into the final dataset by the DSSCFormatter.
Returns
-------
xgm_data: xarray.DataSet
xarray dataset containing the binned xgm data
"""
if self.xgm is not None:
xgm_data = self.xgm.to_dataset(name='xgm')
xgm_binned = self._bin_metadata(xgm_data)
log.info('binned xgm data according to dssc binners.')
return xgm_binned
else:
log.warning("no xgm data. Use load_xgm() to load the xgm data.")
pass
def get_tim_binned(self):
"""
Bin the tim data according to the binners of the dssc data. The result
can eventually be merged into the final dataset by the DSSCFormatter.
Returns
-------
tim_data: xarray.DataSet
xarray dataset containing the binned tim data
"""
if self.tim is not None:
tim_data = self.tim.to_dataset(name='tim')
tim_binned = self._bin_metadata(tim_data)
log.info('binned tim data according to dssc binners.')
return tim_binned
else:
log.warning("no data. Use load_tim() to load the tim data.")
pass
# -------------------------------------------------------------------------
# Data processing
# -------------------------------------------------------------------------
def process_data(self, modules=[], filepath='./',
chunksize=512, backend='loky', n_jobs=None,
dark_image=None,
xgm_normalization=False, normevery=1
):
"""
Load and bin dssc data according to self.bins. No data is returned by
this method. The condensed data is written to file by the worker
processes directly.
Parameters
----------
modules: list of ints
a list containing the module numbers that should be processed. If
empty, all modules are processed.
filepath: str
the path where the files containing the reduced data should be
stored.
chunksize: int
The number of trains that should be read in one iterative step.
backend: str
joblib multiprocessing backend to be used. At the moment it can be
any of joblibs standard backends: 'loky' (default),
'multiprocessing', 'threading'. Anything else than the default is
experimental and not appropriately implemented in the DSSCBinner member
function 'process_data'.
n_jobs: int
inversely proportional to the number of CPUs available per job.
Tasks within one job can grab a maximum of n_CPU_tot/n_jobs CPUs.
Note that when using the default backend there is no need to adjust
this parameter with the current implementation.
dark_image: xarray.DataArray
DataArray with dimensions compatible with the loaded dssc data. If
given, it will be subtracted from the dssc data before the binning.
The dark image needs to be of dimension module, trainId, pulse, x
and y.
xgm_normalization: boolean
if true, the dssc data is normalized by the xgm data before the
binning.
normevery: int
integer indicating which out of normevery frame will be normalized.
"""
log.info("Bin data according to binners")
log.info(f'Process {chunksize} trains per chunk')
mod_list = modules
if len(mod_list) == 0:
mod_list = [i for i in range(16)]
log.info(f'Process modules {mod_list}')
njobs = n_jobs
if njobs is None:
njobs = len(mod_list)
module_jobs = []
for m in mod_list:
dark = dark_image
if dark_image is not None:
dark = dark_image.sel(module=m)
module_jobs.append(dict(
proposal=self.proposal,
run_nr=self.runnr,
module=m,
chunksize=chunksize,
path=filepath,
info=self.info,
dssc_binners=self.binners,
pulsemask=self.pulsemask,
dark_image=dark,
xgm_normalization=xgm_normalization,
xgm_mnemonic=self.xgm_name,
normevery=normevery,
))
log.info(f'using parallelization backend {backend}')
joblib.Parallel(n_jobs=njobs, backend=backend)\
(joblib.delayed(process_dssc_data)(**module_jobs[i])
for i in range(len(mod_list)))
log.info(f'Binning done')
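# Illustrative end-to-end sketch of the binning workflow (all values are
# placeholders; see the docstrings above and in dssc_misc.py):
# >>> import numpy as np
# >>> import toolbox_scs as tb
# >>> bins_pulse = ['pumped', 'unpumped'] * 10
# >>> binner_pulse = tb.create_dssc_bins('pulse',
# ...                                    np.linspace(0, 19, 20, dtype=int),
# ...                                    bins_pulse)
# >>> bnr = DSSCBinner(2212, 235, binners={'pulse': binner_pulse})
# >>> bnr.create_pulsemask('xgm', threshold=(300, 8000))
# >>> bnr.process_data(filepath='./processed/', chunksize=248)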
class DSSCFormatter:
def __init__(self, filepath):
"""
Class that handles formatting-related aspects before handing the data
over for analysis.
Parameters
----------
filepath: str
location of processed files.
Raises
------
ToolBoxFileError: Exception
Throws an error in case the given path does not exist.
"""
self._filenames = []
self._filename = ''
if os.path.exists(filepath):
try:
self._filenames = search_files(filepath)
except ToolBoxFileError:
log.info("path did not contain any files")
pass
else:
log.info("path did not exist")
self.data = None
self.data_xarray = {}
self.attributes = {}
def combine_files(self, filenames=[]):
"""
Read the files given in filenames, and store the data in the class
variable 'data'. If no filenames are given, it tries to read the files
stored in the class-internal variable '_filenames'.
Parameters
----------
filenames: list
list of strings containing the names of the files to be combined.
"""
if any(filenames) is True:
self._filenames = filenames
if self._filenames is not None:
self.data = get_data_formatted(self._filenames)
else:
log.info("No matching data found.")
def add_dataArray(self, groups=[]):
"""
Reads additional xarray data from the first file given in the list of
filenames. This assumes that all the files in the folder contain the
same additional data. To be generalized.
Parameters
----------
groups: list
list of strings with the names of the groups in the h5 file,
containing additional xarray data.
"""
if any(self._filenames) is True:
for group in groups:
self.data_xarray[group] = load_xarray(self._filenames[0],
group=group,
form='array')
else:
log.info("No files found in specified folder.")
def add_attributes(self, attributes={}):
"""
Add additional information, such as run-type, as attributes to the
formatted .h5 file.
Parameters
----------
attributes: dictionary
a dictionary, containing information or data of any kind, that
will be added to the formatted .h5 file as attributes.
"""
for key in attributes.keys():
self.attributes[key] = attributes[key]
def save_formatted_data(self, filename):
"""
Create a .h5 file containing the main dataset in the group called
'data'. Additional groups will be created for the content of the
variable 'data_array'. Metadata about the file is added in the form of
attributes.
Parameters
----------
filename: str
the name of the file to be created
"""
save_xarray(filename, self.data, group='data', mode='w')
for arr in self.data_xarray.keys():
save_xarray(filename, self.data_xarray[arr], group=arr, mode='a')
save_attributes_h5(filename, self.attributes)
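# Illustrative sketch of the formatting step after processing (paths and
# attribute values are placeholders):
# >>> f = DSSCFormatter('./processed/')
# >>> f.combine_files()
# >>> f.add_attributes({'run_type': 'delay scan'})
# >>> f.save_formatted_data('./processed/run_235_formatted.h5')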
import os
import logging
import h5py
import xarray as xr
from ..util.exceptions import ToolBoxFileError
__all__ = [
'get_data_formatted',
'load_xarray',
'save_attributes_h5',
'save_xarray',
]
log = logging.getLogger(__name__)
def _to_netcdf(fname, data, group, mode):
f_exists = os.path.isfile(fname)
if (f_exists and mode == 'w'):
data.to_netcdf(fname, group=group, mode='w', engine='h5netcdf')
log.warning(f"File {fname} existed: overwritten")
log.info(f"Stored data in file {fname}")
elif f_exists and mode == 'a':
try:
data.to_netcdf(fname, group=group, mode='a', engine='h5netcdf')
log.info(f"Created group {group} in file {fname}")
except (ValueError, TypeError):
msg = f"Group {group} exists and has incompatible dimensions."
log.warning(f"Could not store data: {msg}")
raise ToolBoxFileError(msg, fname)
else:
data.to_netcdf(fname, group=group, mode='w', engine='h5netcdf')
log.info(f"Stored data in file {fname}")
def save_xarray(fname, data, group='data', mode='a'):
"""
Store xarray Dataset in the specified location
Parameters
----------
data: xarray.Dataset
The data to be stored
fname: str
filename
group: str
group in the h5 file under which the data is stored
mode: str
write mode: 'w' overwrites an existing file, 'a' appends a new group to it
Raises
------
ToolBoxFileError: Exception
The group already exists in the file and has incompatible dimensions.
"""
try:
_to_netcdf(fname, data, group, mode)
except ToolBoxFileError as err:
raise err
def save_attributes_h5(fname, data={}):
"""
Adding attributes to a hdf5 file. This function is intended to be used to
attach metadata to a processed run.
Parameters
----------
fname: str
filename as string
data: dictionary
the data that should be added to the file in form of a dictionary.
"""
f = h5py.File(fname, mode='a')
for d in data.keys():
f.attrs[d] = data[d]
f.close()
log.info(f"added attributes to file {fname}")
def load_xarray(fname, group='data', form='dataset'):
"""
Load stored xarray Dataset.
Comment: This function exists because of a problem with the standard
netcdf engine that is malfunctioning due to related software installed
in the exfel-python environment. May be dropped at some point.
Parameters
----------
fname: str
filename as string
group: str
the name of the xarray dataset (group in h5 file).
form: str
specify whether the data to be loaded is a 'dataset' or an 'array'.
"""
f_exists = os.path.isfile(fname)
if f_exists:
if form == 'dataset':
log.debug(f'open xarray dataset {fname}')
return xr.load_dataset(fname, group=group, engine='h5netcdf')
elif form == 'array':
log.debug(f'open xarray dataarray {fname}')
return xr.load_dataarray(fname, group=group, engine='h5netcdf')
else:
msg = "File does not exists."
raise ToolBoxFileError(msg, fname)
def _data_from_list(filenames):
"""
Helper function for data formatting routines. Loads the specified files
given by their names. This subroutine expects the name of the group to be
'data'.
Parameters
----------
filenames: list
list of valid xarray filenames
Returns
-------
data: list
a list containing the loaded data
Raises
------
ToolBoxFileError
raises ToolBoxFileError in case file does not exist.
"""
data = []
for name in filenames:
f_exists = os.path.isfile(name)
if f_exists:
data.append(load_xarray(name, group='data'))
else:
msg = "File does not exists."
raise ToolBoxFileError(msg, name)
return data
def get_data_formatted(filenames=[], data_list=[]):
"""
Combines the given data into one dataset. For any of extra_data's data
types, an xarray.Dataset is returned. The data is sorted along the 'module'
dimension. The array dimensions have the order 'trainId', 'pulse', 'module',
'x', 'y'. This order is required by the extra_geometry package.
Parameters
----------
filenames: list of str
files to be combined as a list of names. Calls '_data_from_list' to
actually load the data.
data_list: list
list containing the already loaded data
Returns
-------
data: xarray.Dataset
A xarray.Dataset containing the combined data.
"""
if any(filenames) is True:
data = _data_from_list(filenames)
elif any(data_list) is True:
data = data_list
mod_list = []
for d in data:
if 'module' in d.attrs.keys():
mod_list.append(d.attrs['module'])
if type(data[0]).__module__ == 'xarray.core.dataset':
data = xr.concat(data, dim='module')
elif type(data[0]).__module__ == 'pandas.core.frame':
pass
elif type(data[0]).__module__ == 'dask.dataframe.core':
pass
if len(mod_list) > 0:
data = data.assign_coords(module=("module", mod_list))
data = data.sortby("module")
data.attrs.clear()
return data.transpose('trainId', 'pulse', 'module', 'x', 'y')
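# Illustrative sketch (folder path is a placeholder): combine the per-module
# files written by process_dssc_data into one dataset sorted by module.
# >>> files = search_files('./processed/')
# >>> data = get_data_formatted(files)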
def search_files(run_folder):
"""
Search folder for h5 files.
Parameters
----------
run_folder: str
the path to a folder containing h5 files.
Returns
-------
a list of the filenames of all .h5 files in the given folder.
Raises
------
ToolBoxFileError: Exception
raises ToolBoxFileError in case there are no .h5 files in the folder,
or the folder does not exist.
"""
try:
filenames = os.listdir(run_folder)
return [run_folder+name for name in filenames if ".h5" in name]
except OSError:
msg = "No files in folder"
raise ToolBoxFileError(msg, run_folder)
"""
DSSC-related sub-routines.
comment: contributions should comply with pep8 code structure guidelines.
"""
import logging
import numpy as np
import xarray as xr
from imageio import imread
import extra_data as ed
from .xgm import get_xgm
from .digitizers import get_digitizer_peaks
__all__ = [
'create_dssc_bins',
'get_xgm_formatted',
'load_dssc_info',
'load_mask',
'quickmask_DSSC_ASIC',
]
log = logging.getLogger(__name__)
def load_dssc_info(proposal, run_nr):
"""
Loads the first data file for DSSC module 1 (this is hardcoded)
and returns the detector_info dictionary
Parameters
----------
proposal: str, int
number of proposal
run_nr: str, int
number of run
Returns
-------
info : dictionary
{'dims': tuple, 'frames_per_train': int, 'total_frames': int}
"""
module = ed.open_run(proposal, run_nr, include='*DSSC01*')
info = module.detector_info('SCS_DET_DSSC1M-1/DET/1CH0:xtdf')
info["number_of_trains"] = len(module.train_ids)
info["trainIds"] = module.train_ids
log.debug("Fetched information for DSSC module nr. 1.")
return info
def create_dssc_bins(name, coordinates, bins):
"""
Creates a single entry for the dssc binner dictionary. The produced xarray
data-array will later be used to perform grouping operations according to
the given bins.
Parameters
----------
name: str
name of the coordinate to be binned.
coordinates: numpy.ndarray
the original coordinate values (1D)
bins: numpy.ndarray
the bins according to which the corresponding dimension should
be grouped.
Returns
-------
da: xarray.DataArray
A pre-formatted xarray.DataArray relating the specified dimension with
its bins.
Examples
--------
>>> import toolbox_scs as tb
>>> run = tb.open_run(2212, 235, include='*DA*')
1.) binner along 'pulse' dimension. Group data into two bins.
>>> bins_pulse = ['pumped', 'unpumped'] * 10
>>> binner_pulse = tb.create_dssc_bins("pulse",
np.linspace(0,19,20, dtype=int),
bins_pulse)
2.) binner along 'train' dimension. Group data into bins corresponding
to the positions of a delay stage for instance.
>>> bins_trainId = tb.get_array(run, 'PP800_PhaseShifter', 0.04)
>>> binner_train = tb.create_dssc_bins("trainId",
run.trainIds,
bins_trainId.values)
"""
if name in ['trainId', 'pulse', 'x', 'y']:
da = xr.DataArray(bins,
dims=[name],
coords={name: coordinates})
log.debug(f'created dssc bin array for dimension {name}')
return da
log.debug(f'could not construct dssc bin array for dimension {name}')
raise ValueError(f'Invalid name {str(name)}: value should be '
'trainId, pulse, x, or y')
def get_xgm_formatted(run_obj, xgm_name, dssc_frame_coords):
"""
Load the xgm data and define coordinates along the pulse dimension.
Parameters
----------
run_obj: extra_data.DataCollection
DataCollection object providing access to the xgm data to be loaded
xgm_name: str
valid mnemonic of a xgm source
dssc_frame_coords: int, list
defines which dssc frames should be normalized using data from the xgm.
Returns
-------
xgm: xarray.DataArray
xgm data with coordinate 'pulse'.
"""
log.debug('load raw xgm data')
xgm = get_xgm(run_obj, xgm_name)[xgm_name]
pulse_dim = [d for d in xgm.dims if d != 'trainId'][0]
xgm = xgm.rename({pulse_dim: 'pulse'})
if isinstance(dssc_frame_coords, int):
dssc_frame_coords = np.arange(xgm.sizes['pulse']*dssc_frame_coords,
step=dssc_frame_coords, dtype=np.uint64)
xgm['pulse'] = dssc_frame_coords
log.info('loaded formatted xgm data.')
return xgm
def get_tim_formatted(proposal, runNB, tim_names, dssc_frame_coords):
"""
Load the tim data and define coordinates along the pulse dimension.
Parameters
----------
proposal: int
the proposal number
runNB: int
the run number
tim_names: list of str
a list of valid mnemonics for tim data sources
dssc_frame_coords: int
defines which dssc frames should be normalized using data from the tim.
Returns
-------
tim: xarray.DataArray
tim data with coordinate 'pulse'.
"""
log.debug('load tim data')
# get_digitizer_peaks expects a single mnemonic, so the tim sources are
# loaded individually and merged before averaging.
tim = xr.merge([get_digitizer_peaks(proposal, runNB, name)
for name in tim_names], join='inner')
# average over all tim sources
tim = -tim.to_array().mean(dim='variable')
pulse_dim = [d for d in tim.dims if d != 'trainId'][0]
tim = tim.rename({pulse_dim: 'pulse'})
if isinstance(dssc_frame_coords, int):
dssc_frame_coords = np.arange(tim.sizes['pulse']*dssc_frame_coords,
step=dssc_frame_coords, dtype=np.uint64)
tim['pulse'] = dssc_frame_coords
log.info('loaded formatted tim data.')
return tim
def quickmask_DSSC_ASIC(poslist):
'''
Returns a mask for the given DSSC geometry with ASICs given in poslist
blanked. poslist is a list of (module, row, column) tuples. Each module
consists of 2 rows and 8 columns of individual ASICS.
Copyright (c) 2019, Michael Schneider
Copyright (c) 2020, SCS-team
license: BSD 3-Clause License (see LICENSE_BSD for more info)
'''
mask = np.ones([16, 128, 512], dtype=float) # need floats to use NaN
for (module, row, col) in poslist:
mask[module, 64 * row:64 * (row + 1), 64 * col:64 * (col + 1)] = \
np.nan
return mask
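# Example (illustrative): blank two ASICs of module 3 (row 0, columns 0 and 1).
# The resulting (16, 128, 512) float mask can be multiplied onto DSSC data.
# >>> mask = quickmask_DSSC_ASIC([(3, 0, 0), (3, 0, 1)])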
def load_mask(fname, dssc_mask):
"""
Load a DSSC mask file.
Copyright (c) 2019, Michael Schneider
Copyright (c) 2020, SCS-team
license: BSD 3-Clause License (see LICENSE_BSD for more info)
Parameters
----------
fname: str
string of the filename of the mask file
Returns
-------
mask: numpy.ndarray
2D array with value 1 for valid pixels and np.nan for masked pixels.
"""
mask = imread(fname)
mask = mask.astype(float)[..., 0] // 255
mask[mask == 0] = np.nan
return mask
""" DSSC visualization routines
Plotting sub-routines frequently done in combination with dssc analysis.
The initial code is based on: https://github.com/dscran/dssc_process
/blob/master/example_image_process_pulsemask.ipynb
Todo: For visualization of statistical information we could eventually
switch to seaborn: https://seaborn.pydata.org/
"""
from time import strftime
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
def plot_xgm_threshold(xgm,
xgm_min = None, xgm_max = None,
run_nr = '', safe_fig = False):
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(xgm.trainId, xgm, 'o', c='C0', ms=1)
if xgm_min:
ax.axhline(xgm_min, c='r')
if xgm_max:
ax.axhline(xgm_max, c='r')
ax.set_xlabel('trainId')
ax.set_ylabel('xgm')
ax.set_title(f'run: {run_nr}')
if safe_fig == True:
tstamp = strftime('%y%m%d_%H%M')
fig.savefig(f'images/run{run_nr}_scan_{tstamp}.png', dpi=200)
def plot_binner(binner,
yname = 'data', xname='data',
run_nr = ''):
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(binner.values)
ax.set_ylabel(yname)
ax.set_xlabel(xname)
ax.set_title(f'run: {run_nr}')
def plot_binner_hist(binner,
dname = 'data', run_nr = ''):
counts = xr.DataArray(np.ones(len(binner.values)),
dims=[dname],
coords={dname: binner.values},
name='counts')
counts = counts.groupby(dname).sum()
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(counts[dname], counts, 'o', ms=4)
ax.set_xlabel(dname)
ax.set_ylabel('counts')
ax.set_title(f'run {run_nr}')
ax.grid(True)
#if safe_fig == True:
# tstamp = strftime('%y%m%d_%H%M')
# fig.savefig(f'images/run{run_nr}_scan_{tstamp}.png', dpi=200)
def plot_hist_processed(hist_data):
pass
"""
DSSC-related sub-routines.
comment: contributions should comply with pep8 code structure guidelines.
"""
import logging
import os
import numpy as np
import xarray as xr
import pandas as pd
import extra_data as ed
from ..mnemonics_machinery import mnemonics_for_run
from .dssc_data import save_xarray
__all__ = [
'process_dssc_data'
]
log = logging.getLogger(__name__)
def create_empty_dataset(run_info, binners={}):
"""
Create empty (zero-valued) DataSet for a single DSSC module
to iteratively add data to.
Copyright (c) 2020, SCS-team
Parameters
----------
run_info: dictionary
info dictionary as returned by load_dssc_info().
binners: dictionary
dictionary of binner arrays created via create_dssc_bins().
Returns
-------
ds: xarray.Dataset
zero-valued dataset with variables 'data' and 'hist', shaped according
to the given binners.
"""
fpt = run_info['frames_per_train']
len_x = run_info['dims'][0]
len_y = run_info['dims'][1]
defaults = {"trainId": run_info["trainIds"],
"pulse": np.linspace(0, fpt-1, fpt, dtype=int),
"x": np.linspace(1, len_x, len_x, dtype=int),
"y": np.linspace(1, len_y, len_y, dtype=int)}
dims = []
coords = {}
shape = []
for key in defaults:
dims.append(key)
df = pd.DataFrame({'data': defaults[key]})
if key in binners:
df = pd.DataFrame({'data': binners[key].values})
grouped = df.groupby(['data'])
groups = list(grouped.groups.keys())
coords[key] = groups
shape.append(len(groups))
da_data = xr.DataArray(np.zeros(shape, dtype=np.float64),
coords=coords,
dims=dims)
ds = da_data.to_dataset(name="data")
ds['hist'] = xr.full_like(ds.data[:, :, 0, 0], fill_value=0)
log.debug("Prepared empty dataset for single dssc module")
return ds
def load_chunk_data(sel, sourcename, maxframes=None):
"""
Load selected DSSC data. The flattened multi-index (trains+pulses) is
unraveled before returning the data.
Copyright (c) 2019, Michael Schneider
Copyright (c) 2020, SCS-team
license: BSD 3-Clause License (see LICENSE_BSD for more info)
Parameters
----------
sel: extra_data.DataCollection
a DataCollection or a subset of a DataCollection obtained by its
select_trains() method
sourcename: str
Returns
-------
xarray.DataArray
"""
info = sel.detector_info(sourcename)
fpt = info['frames_per_train']
data = sel.get_array(sourcename, 'image.data',
extra_dims=['_empty_', 'x', 'y']
).squeeze()
tids = np.unique(data.trainId)
data = data.rename(dict(trainId='trainId_pulse'))
midx = pd.MultiIndex.from_product([sorted(tids), range(fpt)],
names=('trainId', 'pulse'))
data = xr.DataArray(data,
dict(trainId_pulse=midx)
).unstack('trainId_pulse')
data = data.transpose('trainId', 'pulse', 'x', 'y')
log.debug(f"loaded and formatted chunk of dssc data")
return data.loc[{'pulse': np.s_[:maxframes]}]
def _load_chunk_xgm(sel, xgm_mnemonic, stride):
"""
Load selected xgm data.
Copyright (c) 2020, SCS-team
Parameters
----------
sel: extra_data.DataCollection
a DataCollection or a subset of a DataCollection obtained by its
select_trains() method
xgm_mnemonic: str
a valid mnemonic for an xgm source from the tb.mnemonic dictionary
stride: int
indicating which dssc frames should be normalized using the xgm data.
Returns
-------
xarray.DataArray
"""
mnemonics = mnemonics_for_run(sel)
d = sel.get_array(*mnemonics[xgm_mnemonic].values())
d = d.rename(new_name_or_name_dict={'XGMbunchId': 'pulse'})
frame_coords = np.linspace(0, (d.shape[1]-1)*stride, d.shape[1], dtype=int)
d = d.assign_coords({'pulse': frame_coords})
log.debug(f"loaded and formatted chunk of xgm data")
return d
def process_dssc_data(proposal, run_nr, module, chunksize, info, dssc_binners,
path='./',
pulsemask=None,
dark_image=None,
xgm_mnemonic='SCS_SA3',
xgm_normalization=False,
normevery=1
):
"""
Collects and reduces DSSC data for a single module.
Copyright (c) 2020, SCS-team
Parameters
----------
proposal : int
proposal number
run_nr : int
run number
module : int
DSSC module to process
chunksize : int
number of trains to load simultaneously
info: dictionary
dictionary containing keys 'dims', 'frames_per_train', 'total_frames',
'trainIds', 'number_of_trains'.
dssc_binners: dictionary
a dictionary containing binner objects created by the ToolBox member
function "create_binner()"
path : str
location in which the .h5 files, containing the binned data, should
be stored.
pulsemask : numpy.ndarray
array of booleans to be used to mask dssc data according to xgm data.
dark_image: xarray.DataArray
an xarray DataArray with coordinates matching the loaded data. If
dark_image is not None, it will be subtracted from each individual dssc
frame.
xgm_normalization: bool
true if the data should be divided by the corresponding xgm value.
xgm_mnemonic: str
Mnemonic of the xgm data to be used for normalization.
normevery: int
One out of normevery dssc frames will be normalized.
Returns
-------
module_data: xarray.Dataset
xarray datastructure containing data binned according to bins.
"""
# metadata definition
ntrains = info['number_of_trains']
chunks = np.arange(ntrains, step=chunksize)
sourcename_dssc = f'SCS_DET_DSSC1M-1/DET/{module}CH0:xtdf'
# open extra_data run objects
collection_DSSC = ed.open_run(proposal, run_nr,
include=f'*DSSC{module:02d}*')
collection_DA1 = ed.open_run(proposal, run_nr, include='*DA01*')
log.info(f"Processing dssc module {module}: start")
# create empty dataset for dssc data to be filled iteratively
module_data = create_empty_dataset(info, dssc_binners)
# load data chunk by chunk and merge result into prepared empty dataset
for chunk in chunks:
log.debug(f"Module {module}: "
f"load trains {chunk}:{chunk + chunksize}")
sel = collection_DSSC.select_trains(
ed.by_index[chunk:chunk + chunksize])
chunk_data = load_chunk_data(sel, sourcename_dssc)
chunk_hist = xr.full_like(chunk_data[:, :, 0, 0], fill_value=1)
# ---------------------------------------------------------------------
# optional blocks -> ToDo: see merge request !87
# ---------------------------------------------------------------------
# option 1: prefiltering -> xgm pulse masking
if pulsemask is not None:
log.debug(f'Module {module}: drop out-of-bounds frames')
# relatively slow. ToDo -> wrap using np.ufunc
chunk_data = chunk_data.where(pulsemask)
chunk_hist = chunk_hist.where(pulsemask)
# option 2: subtraction of dark image/s
if dark_image is not None:
log.debug(f'Module {module}: subtract dark')
chunk_data.values = chunk_data.values - dark_image.values
# slower: using xarray directly
# chunk_data = chunk_data - dark_image
# option 3: normalize dssc frames by their xgm values
if xgm_normalization:
log.debug(f'Module {module}: load and format xgm data')
sel_DA1 = collection_DA1.select_trains(
ed.by_index[chunk:chunk + chunksize])
chunk_xgm = _load_chunk_xgm(sel_DA1, xgm_mnemonic, normevery)
log.debug(f'Module {module}: normalize chunk_data using xgm')
# the following line uses numpy's fast vectorized array operations;
# there is some overhead from rewriting the values of the xarray DataArray
chunk_data.values[:, 0::normevery, :, :] = \
np.divide(chunk_data[:, 0::normevery, :, :], chunk_xgm)
# slow code using xarray directly
# chunk_data = chunk_data / chunk_xgm
# ---------------------------------------------------------------------
# end optional blocks: xarray operations from here on.
# ---------------------------------------------------------------------
# data reduction -> apply binners
log.debug(f'Module {module}: apply binning to chunk_data')
chunk_data = chunk_data.to_dataset(name='data')
chunk_data['hist'] = chunk_hist
for b in dssc_binners:
chunk_data[b+"_binned"] = dssc_binners[b]
chunk_data = chunk_data.groupby(b+"_binned").sum(b)
chunk_data = chunk_data.rename(name_dict={b+"_binned": b})
# add chunk data to loaded data
log.debug(f'Module {module}: merge chunk data')
for var in ['data', 'hist']:
module_data[var] = xr.concat([module_data[var],
chunk_data[var]],
dim='tmp').sum('tmp')
log.debug(f"Module {module}: "
f"processed trains {chunk}:{chunk + chunksize}")
log.debug(f'Module {module}: calculate mean')
module_data['data'] = module_data['data'] / module_data['hist']
module_data = module_data.transpose('trainId', 'pulse', 'x', 'y')
module_data.attrs['module'] = module
log.debug(f'saving module {module}')
if not os.path.isdir(path):
os.mkdir(path)
fname = f'run_{run_nr}_module{module}.h5'
save_xarray(path+fname, module_data, mode='a')
log.info(f"processing module {module}: done")
from joblib import Parallel, delayed, parallel_backend
from time import strftime
import shutil
from tqdm.auto import tqdm
import os
import warnings
import psutil
import extra_data as ed
from extra_data.read_machinery import find_proposal
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
import numpy as np
import xarray as xr
import h5py
from glob import glob
from imageio import imread
from ..constants import mnemonics as _mnemonics
from .azimuthal_integrator import AzimuthalIntegrator
from ..misc.laser_utils import positionToDelay
class FastCCD:
def __init__(self, proposal, distance=1, raw=False):
""" Create a FastCCD object to process FaStCCD data.
inputs:
proposal: (int,str) proposal number string
distance: (float) distance sample to FastCCD detector in meter
raw: use processed data from the calibration pipeline or raw files
"""
if isinstance(proposal,int):
proposal = 'p{:06d}'.format(proposal)
self.runFolder = find_proposal(proposal)
self.semester = self.runFolder.split('/')[-2]
self.proposal = proposal
self.topic = self.runFolder.split('/')[-3]
self.tempdir = None
self.save_folder = os.path.join(self.runFolder, 'usr/condensed_runs/')
self.distance = distance
self.px_pitch_h = 30 # pitch in microns
self.px_pitch_v = 30 # pitch in microns
self.aspect = 1 # aspect ratio of the FastCCD images
self.mask = None
self.max_fraction_memory = 0.8
self.filter_mask = None
self.raw = raw
self.gain = 1
print('FastCCD configuration')
print(f'Topic: {self.topic}')
print(f'Semester: {self.semester}')
print(f'Proposal: {self.proposal}')
print(f'Default save folder: {self.save_folder}')
print(f'Sample to FastCCD distance: {self.distance} m')
print(f'Using raw files: {self.raw}')
if not os.path.exists(self.save_folder):
warnings.warn(f'Default save folder does not exist: {self.save_folder}')
self.max_fraction_memory = 0.8
self.Nworker = 10
self.maxSaturatedPixel = 1
def __del__(self):
# delete the temporary folder
if self.tempdir:
shutil.rmtree(self.tempdir)
def open_run(self, run_nr, isDark=False, t0=0.0):
""" Open a run with extra-data and prepare the virtual dataset for multiprocessing
inputs:
run_nr: the run number
isDark: True if the run is a dark run
t0: optional t0 in mm
"""
print('Opening run data with extra-data')
self.run_nr = run_nr
self.xgm = None
self.run = ed.open_run(self.proposal, self.run_nr)
self.plot_title = f'{self.proposal} run: {self.run_nr}'
self.isDark = isDark
self.fpt = 1
#self.nbunches = self.run.get_array('SCS_RR_UTC/MDL/BUNCH_DECODER', 'sase3.nPulses.value')
#self.nbunches = np.unique(self.nbunches)
self.nbunches = 1
#if len(self.nbunches) == 1:
# self.nbunches = self.nbunches[0]
#else:
# warnings.warn('not all trains have same length FastCCD data')
# print(f'nbunches: {self.nbunches}')
# self.nbunches = self.nbunches[-1]
print(f'FastCCD frames per train: {self.fpt}')
print(f'SA3 bunches per train: {self.nbunches}')
print('Collecting FastCCD module files')
self.collect_fastccd_file()
print(f'Loading XGM data')
try:
self.xgm = self.run.get_array(_mnemonics['SCS_SA3']['source'],
_mnemonics['SCS_SA3']['key'],
roi=ed.by_index[:self.nbunches])
self.xgm = self.xgm.squeeze() # remove the pulseId dimension since XGM should have only 1 value per train
except:
self.xgm = xr.DataArray(np.zeros_like(self.run.train_ids),dims = 'trainId', coords = {"trainId":self.run.train_ids})
print(f'Loading mono nrj data')
try:
self.nrj = self.run.get_array(_mnemonics['nrj']['source'],
_mnemonics['nrj']['key'])
except:
self.nrj = xr.DataArray(np.zeros_like(self.run.train_ids),dims = 'trainId', coords = {"trainId":self.run.train_ids})
print(f'Loading delay line data')
try:
self.delay_mm = self.run.get_array(_mnemonics['PP800_DelayLine']['source'],
_mnemonics['PP800_DelayLine']['key'])
except:
self.delay_mm = xr.DataArray(np.zeros_like(self.run.train_ids),dims = 'trainId', coords = {"trainId":self.run.train_ids})
self.t0 = t0
self.delay_ps = positionToDelay(self.delay_mm, origin=self.t0, invert=True)
print(f'Loading Fast ADC5 data')
try:
self.FastADC5 = self.run.get_array(_mnemonics['FastADC5raw']['source'], _mnemonics['FastADC5raw']['key']).max('dim_0')
self.FastADC5[self.FastADC5<35000] = 0
self.FastADC5[self.FastADC5>=35000] = 1
except:
self.FastADC5 = xr.DataArray(np.zeros_like(self.run.train_ids),dims = 'trainId', coords = {"trainId":self.run.train_ids})
# create a dummy scan variable for dark run
# for other type or run, use FastCCD.define_run function to overwrite it
self.scan = xr.DataArray(np.ones_like(self.run.train_ids), dims=['trainId'],coords={'trainId': self.run.train_ids})
self.scan = self.scan.to_dataset(name='scan_variable')
self.scan_vname = 'dummy'
def load_gain(self, fname):
""" Load a gain map by giving the filename
"""
self.gain = np.load(fname)['arr_0'][:,:,0]
def collect_fastccd_file(self):
""" Collect the raw fastCCD h5 files.
"""
if self.raw:
folder = 'raw'
else:
folder = 'proc'
pattern = self.runFolder + f'/{folder}/r{self.run_nr:04d}/RAW-R{self.run_nr:04d}-DA05-S*.h5'
self.h5list = glob(pattern)
def define_scan(self, vname, bins):
"""
Prepare the binning of the FastCCD data.
inputs:
vname: variable name for the scan, can be a mnemonic string from ToolBox
or a dictionary with ['source', 'key'] fields
bins: step size (binning by bin edges is not yet implemented)
"""
if vname == 'delay_ps':
self.scan = self.delay_ps
else:
if type(vname) is dict:
self.scan = self.run.get_array(vname['source'], vname['key'])
elif type(vname) is str:
if vname not in _mnemonics:
raise ValueError(f'{vname} not found in the ToolBox mnemonics table')
self.scan = self.run.get_array(_mnemonics[vname]['source'], _mnemonics[vname]['key'])
else:
raise ValueError(f'vname should be a string or a dict. We got {type(vname)}')
if (type(bins) is int) or (type(bins) is float):
self.scan = bins * np.round(self.scan / bins)
else:
# TODO: digitize the data
raise ValueError('Binning by bin edges is not yet implemented')
self.scan_vname = vname
self.scan = self.scan.to_dataset(name='scan_variable')
#self.scan['xgm_pumped'] = self.xgm[:, :self.nbunches:2].mean('dim_0')
#self.scan['xgm_unpumped'] = self.xgm[:, 1:self.nbunches:2].mean('dim_0')
#self.scan.to_netcdf(self.vds_scan, group='data')
self.scan_counts = xr.DataArray(np.ones(len(self.scan['scan_variable'])),
dims=['scan_variable'],
coords={'scan_variable': self.scan['scan_variable'].values},
name='counts')
self.scan_points = self.scan.groupby('scan_variable').mean('trainId').coords['scan_variable'].values
self.scan_points_counts = self.scan_counts.groupby('scan_variable').sum()
self.plot_scan()
def plot_scan(self):
""" Plot a previously defined scan to see the scan range and the statistics.
"""
if self.scan:
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=[5, 5])
else:
fig, ax1 = plt.subplots(nrows=1, figsize=[5, 2.5])
ax1.plot(self.scan_points, self.scan_points_counts, 'o-', ms=2)
ax1.set_xlabel(f'{self.scan_vname}')
ax1.set_ylabel('# trains')
ax1.set_title(self.plot_title)
if self.scan:
ax2.plot(self.scan['scan_variable'])
ax2.set_xlabel('train #')
ax2.set_ylabel(f'{self.scan_vname}')
def plot_xgm_hist(self, nbins=100):
""" Plots an histogram of the SCS XGM dedicated SAS3 data.
inputs:
nbins: number of the bins for the histogram.
"""
hist, bins_edges = np.histogram(self.xgm, nbins, density=True)
width = 1.0 * (bins_edges[1] - bins_edges[0])
bins_center = 0.5*(bins_edges[:-1] + bins_edges[1:])
plt.figure(figsize=(5,3))
plt.bar(bins_center, hist, align='center', width=width)
plt.xlabel(f"{_mnemonics['SCS_SA3']['source']}{_mnemonics['SCS_SA3']['key']}")
plt.ylabel('density')
plt.title(self.plot_title)
def xgm_filter(self, xgm_low=-np.inf, xgm_high=np.inf):
""" Filters the data by train. If one pulse within a train has an SASE3 SCS XGM value below
xgm_low or above xgm_high, that train will be dropped from the dataset.
inputs:
xgm_low: low threshold value
xgm_high: high threshold value
"""
if self.isDark:
warnings.warn(f'This run was loaded as dark. Filtering on xgm makes no sense. Aborting')
return
self.xgm_low = xgm_low
self.xgm_high = xgm_high
filter_mask = (self.xgm > self.xgm_low) * (self.xgm < self.xgm_high)
if self.filter_mask is None:
self.filter_mask = filter_mask
else:
self.filter_mask = self.filter_mask*filter_mask
valid = filter_mask.astype(bool)
self.xgm = self.xgm.where(valid).dropna('trainId')
self.scan = self.scan.sel({'trainId': self.xgm.trainId})
nrejected = len(self.run.train_ids) - len(self.xgm.trainId)
print((f'Rejecting {nrejected} out of {len(self.run.train_ids)} trains due to xgm '
f'thresholds: [{self.xgm_low}, {self.xgm_high}]'))
def load_mask(self, fname, plot=True):
""" Load a FastCCD mask file.
input:
fname: string of the filename of the mask file
plot: if True, the loaded mask is plotted
"""
fccd_mask = imread(fname)
fccd_mask = fccd_mask.astype(float)[..., 0] // 255
fccd_mask[fccd_mask==0] = np.nan
self.mask = fccd_mask
if plot:
plt.figure()
plt.imshow(self.mask)
def binning(self):
""" Bin the FastCCD data by the predifined scan type (FastCCD.define()) using multiprocessing
"""
# get available memory in GB, we will try to use 80 % of it
max_GB = psutil.virtual_memory().available/1024**3
print(f'max available memory: {max_GB} GB')
# chunksize = usable bytes / (Nworker * 16 * 1934 px * 960 px * frames per train)
self.chunksize = int(self.max_fraction_memory*max_GB * 1024**3 // (self.Nworker * 16 * 1934 * 960 * self.fpt))
print('processing', self.chunksize, 'trains per chunk')
assert self.chunksize > 500, "cannot load FastCCD h5 files in memory"
jobs = []
for m,h5fname in enumerate(self.h5list):
jobs.append(dict(
fpt=self.fpt,
h5fname=h5fname,
chunksize=self.chunksize,
nbunches=self.nbunches,
workerId=m,
Nworker=self.Nworker,
scan = self.scan,
xgm = self.xgm,
FastADC5 = self.FastADC5
#maxSaturatedPixel=self.maxSaturatedPixel
))
timestamp = strftime('%X')
print(f'start time: {timestamp}')
with parallel_backend('threading', n_jobs=self.Nworker):
res = Parallel( verbose=20)(
delayed(process_one_module)(job) for job in tqdm(jobs)
)
print('finished:', strftime('%X'))
# rearrange the multiprocessed data
# this gets rid of the worker dimension; no actual sum over workers is involved
self.module_data = xr.concat(res, dim='workerId').sum(dim='workerId')
self.module_data['pumped'] = self.module_data['pumped'] / self.module_data['sum_count_pumped']
self.module_data['unpumped'] = self.module_data['unpumped'] / self.module_data['sum_count_unpumped']
self.module_data['xgm_pumped'] = self.module_data['xgm_pumped'] / self.module_data['sum_count_pumped']
self.module_data['xgm_unpumped'] = self.module_data['xgm_unpumped'] / self.module_data['sum_count_unpumped']
self.module_data['run'] = self.run_nr
self.module_data['t0'] = self.t0
self.plot_title = f"{self.proposal} run: {self.module_data['run'].values}"
self.module_data.attrs['plot_title'] = self.plot_title
self.module_data.attrs['scan_variable'] = self.scan_vname
def save(self, save_folder=None, overwrite=False):
""" Save the crunched data.
inputs:
save_folder: string of the folder where the data should be saved.
overwrite: boolean whether or not to overwrite existing files.
"""
if save_folder is None:
save_folder = self.save_folder
if self.isDark:
fname = f'run{self.run_nr}_dark.h5' # no scan
else:
fname = f'run{self.run_nr}.h5' # run with delay scan (change for other scan types!)
save_path = os.path.join(save_folder, fname)
file_exists = os.path.isfile(save_path)
if not file_exists or (file_exists and overwrite):
if file_exists:
warnings.warn(f'Overwriting file: {save_path}')
os.remove(save_path)
self.module_data.to_netcdf(save_path, group='data')
self.module_data.close()
os.chmod(save_path, 0o664)
print('saving: ', save_path)
else:
print('file', save_path, 'exists and overwrite is False')
def load_binned(self, runNB, dark_runNB=None, xgm_norm = True, save_folder=None):
""" load previously binned (crunched) FastCCD data by FastCCD.crunch() and FastCCD.save()
inputs:
runNB: run number to load
dark_runNB: run number of the corresponding dark
xgm_norm: normalize by XGM data if True
save_folder: path string where the crunched data are saved
"""
if save_folder is None:
save_folder = self.save_folder
self.plot_title = f'{self.proposal} run: {runNB} dark: {dark_runNB}'
binned = xr.load_dataset(os.path.join(save_folder, f'run{runNB}.h5'), group='data')
if dark_runNB is not None:
dark = xr.load_dataset(os.path.join(save_folder, f'run{dark_runNB}_dark.h5'), group='data')
binned['pumped'] = self.gain*(binned['pumped'] - dark['pumped'].squeeze(drop=True))
binned['unpumped'] = self.gain*(binned['unpumped'] - dark['unpumped'].squeeze(drop=True))
if xgm_norm:
binned['pumped'] = binned['pumped']/binned['xgm_pumped']
binned['unpumped'] = binned['unpumped']/binned['xgm_unpumped']
self.scan_points = binned['scan_variable']
self.scan_points_counts = binned['sum_count_pumped'] + binned['sum_count_unpumped']
self.scan_vname = binned.attrs['scan_variable']
self.scan = None
self.binned = binned
def plot_FastCCD(self, use_mask = True, p_low = 1, p_high = 98, vmin = None, vmax = None):
""" Plot pumped and unpumped FastCCD images.
inputs:
use_mask: if True, a mask is applied on the FastCCD.
p_low: low percentile value to adjust the contrast scale on the unpumped and pumped image
p_high: high percentile value to adjust the contrast scale on the unpumped and pumped image
vmin: low value of the image scale
vmax: high value of the image scale
"""
if use_mask:
if self.mask is None:
raise ValueError('No mask was loaded !')
mask = self.mask
mask_txt = ' masked'
else:
mask = 1
mask_txt = ''
im_pump_mean = self.binned['pumped'].mean('scan_variable')
im_unpump_mean = self.binned['unpumped'].mean('scan_variable')
self.im_pump_mean = mask*im_pump_mean
self.im_unpump_mean = mask*im_unpump_mean
fig = plt.figure(figsize=(9, 4))
grid = ImageGrid(fig, 111,
nrows_ncols=(1,2),
axes_pad=0.15,
share_all=True,
cbar_location="right",
cbar_mode="single",
cbar_size="7%",
cbar_pad=0.15,
)
tmp = self.im_pump_mean.values.flatten()
try:
_vmin, _vmax = np.percentile(tmp[~np.isnan(tmp)], [p_low, p_high])
except:
_vmin, _vmax = (None, None)
if vmin is None:
vmin = _vmin
if vmax is None:
vmax = _vmax
im = grid[0].imshow(self.im_pump_mean, vmin=vmin, vmax=vmax, aspect=self.aspect)
grid[0].set_title('pumped' + mask_txt)
im = grid[1].imshow(self.im_unpump_mean, vmin=vmin, vmax=vmax, aspect=self.aspect)
grid[1].set_title('unpumped' + mask_txt)
grid[-1].cax.colorbar(im)
grid[-1].cax.toggle_label(True)
fig.suptitle(self.plot_title)
def azimuthal_int(self, wl, center=None, angle_range=[0, 180-1e-6], dr=1, use_mask=True):
""" Perform azimuthal integration of 1D binned FastCCD run.
inputs:
wl: photon wavelength
center: center of integration
angle_range: angles of integration
dr: radial width of the integration slices, in pixels
use_mask: if True, use the loaded mask
"""
if use_mask:
if self.mask is None:
raise ValueError('No mask was loaded !')
mask = self.mask
mask_txt = ' masked'
else:
mask = 1
mask_txt = ''
im_pumped_arranged = self.binned['pumped'].values
im_unpumped_arranged = self.binned['unpumped'].values
im_pumped_arranged *= mask
im_unpumped_arranged *= mask
im_pumped_mean = im_pumped_arranged.mean(axis=0)
im_unpumped_mean = im_unpumped_arranged.mean(axis=0)
ai = AzimuthalIntegrator(im_pumped_mean.shape, center, angle_range, dr=dr, aspect=1)
norm = ai(~np.isnan(im_pumped_mean))
az_pump = []
az_unpump = []
for i in tqdm(range(len(self.binned['scan_variable']))):
az_pump.append(ai(im_pumped_arranged[i]) / norm)
az_unpump.append(ai(im_unpumped_arranged[i]) / norm)
az_pump = np.stack(az_pump)
az_unpump = np.stack(az_unpump)
coords = {'scan_variable': self.binned['scan_variable'], 'distance': ai.distance}
azimuthal = xr.DataArray(az_pump, dims=['scan_variable', 'distance'], coords=coords)
azimuthal = azimuthal.to_dataset(name='pumped')
azimuthal['unpumped'] = xr.DataArray(az_unpump, dims=['scan_variable', 'distance'], coords=coords)
azimuthal = azimuthal.transpose('distance', 'scan_variable')
#t0 = 225.5
#azimuthal['delay'] = (t0 - azimuthal.delay)*6.6
#azimuthal['delay'] = azimuthal.delay
azimuthal['delta_q (1/nm)'] = 2e-9 * np.pi * np.sin(
np.arctan(azimuthal.distance * self.px_pitch_v*1e-6 / self.distance)) / wl
azimuthal.attrs = self.binned.attrs
self.azimuthal = azimuthal.swap_dims({'distance': 'delta_q (1/nm)'})
def plot_azimuthal_int(self, kind='difference', lim=None):
""" Plot a computed azimuthal integration.
inputs:
kind: (str) either 'difference' or 'relative' to change the type of plot.
lim: (list of two floats) optional color scale limits for the bottom panel.
"""
fig, [ax1, ax2, ax3] = plt.subplots(nrows=3, sharex=True, sharey=True)
xr.plot.imshow(self.azimuthal.pumped, ax=ax1, vmin=0, robust=True)
ax1.set_title('pumped')
xr.plot.imshow(self.azimuthal.unpumped, ax=ax2, vmin=0, robust=True)
ax2.set_title('unpumped')
if kind == 'difference':
val = self.azimuthal.pumped - self.azimuthal.unpumped
ax3.set_title('pumped - unpumped')
elif kind == 'relative':
val = (self.azimuthal.pumped - self.azimuthal.unpumped)/self.azimuthal.unpumped
ax3.set_title('(pumped - unpumped)/unpumped')
else:
raise ValueError('kind should be either difference or relative')
if lim is None:
xr.plot.imshow(val, ax=ax3, robust=True)
else:
xr.plot.imshow(val, ax=ax3, vmin=lim[0], vmax=lim[1])
ax3.set_xlabel(self.scan_vname)
fig.suptitle(f'{self.plot_title}')
def plot_azimuthal_line_cut(self, data, qranges, qwidths):
""" Plot line scans on top of the data.
inputs:
data: an azimuthal integrated xarray DataArray with 'delta_q (1/nm)' as one of its dimension.
qranges: a list of q-range
qwidths: a list of q-widths, same length as qranges
"""
fig, [ax1, ax2] = plt.subplots(nrows=2, sharex=True, figsize=[8, 7])
xr.plot.imshow(data, ax=ax1, robust=True)
# attributes are not propagated during xarray mathematical operations
# (https://github.com/pydata/xarray/issues/988), so the scan variable name
# might no longer be present in data
ax1.set_xlabel(self.scan_vname)
fig.suptitle(f'{self.plot_title}')
for i, (qr, qw) in enumerate(zip(qranges, qwidths)):
sel = (data['delta_q (1/nm)'] > (qr - qw/2)) * (data['delta_q (1/nm)'] < (qr + qw/2))
val = data.where(sel).mean('delta_q (1/nm)')
ax2.plot(data.scan_variable, val, c=f'C{i}', label=f'q = {qr:.2f}')
ax1.axhline(qr - qw/2, c=f'C{i}', lw=1)
ax1.axhline(qr + qw/2, c=f'C{i}', lw=1)
ax2.legend()
ax2.set_xlabel(self.scan_vname)
# since 'self' is not picklable, this function has to be outside the FastCCD class so that it can be used
# by the multiprocessing pool.map function
def process_one_module(job):
fpt = job['fpt']
Nworker = job['Nworker']
workerId = job['workerId']
scan = job['scan']
chunksize = job['chunksize']
nbunches = job['nbunches']
h5fname = job['h5fname']
xgm = job['xgm']
FastADC5 = job['FastADC5']
#maxSaturatedPixel = job['maxSaturatedPixel']
image_path = f'/INSTRUMENT/SCS_CDIDET_FCCD2M/DAQ/FCCD:daqOutput/data/image/pixels'
# crunching
with h5py.File(h5fname, 'r') as m:
fastccd_trains = m['/INSTRUMENT/SCS_CDIDET_FCCD2M/DAQ/FCCD:daqOutput/data/trainId'][()]
data = m[image_path][()].squeeze().astype(np.float64)
unique_trainIds, unique_list = np.unique(fastccd_trains, return_index = True)
unique_nz_list = np.nonzero(unique_trainIds)[0]
fastccd_trains = unique_trainIds[unique_nz_list]
coords = {'trainId': fastccd_trains}
fastccd = xr.DataArray(data[unique_nz_list, :, :], dims=['trainId', 'x', 'y'], coords=coords)
fastccd = fastccd.where(fastccd.sum(('x','y'), skipna=True) > 0)
aligned_vals = xr.align(*[fastccd, xgm, FastADC5], join='inner')
ds = xr.Dataset(dict(zip(['fastccd', 'xgm', 'FastADC5'], aligned_vals)))
ds['sum_count'] = xr.full_like(ds['fastccd'][..., 0, 0], fill_value=1)
# grouping and summing
ds['scan_variable'] = scan['scan_variable'] # this only adds scan data for matching trainIds
ds = ds.dropna('trainId')
#print(ds)
data_pumped = ds.where(ds['FastADC5'] > 0, drop=True).groupby('scan_variable').sum('trainId')
data_unpumped = ds.where(ds['FastADC5'] < 1, drop=True).groupby('scan_variable').sum('trainId')
module_data = data_pumped['fastccd'].to_dataset(name='pumped')
module_data['unpumped'] = data_unpumped['fastccd']
module_data['sum_count_pumped'] = data_pumped['sum_count']
module_data['sum_count_unpumped'] = data_unpumped['sum_count']
module_data['xgm_pumped'] = data_pumped['xgm']
module_data['xgm_unpumped'] = data_unpumped['xgm']
module_data['workerId'] = workerId
return module_data
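# Example (editor's sketch): a typical reduction workflow with the FastCCD
# class defined above. Run numbers, thresholds and t0 are hypothetical.
#     fccd = FastCCD(2599, distance=1.0)
#     fccd.open_run(95, isDark=True)
#     fccd.binning()
#     fccd.save()                          # writes run95_dark.h5
#     fccd.open_run(96, t0=225.5)
#     fccd.define_scan('delay_ps', bins=0.05)
#     fccd.xgm_filter(xgm_low=500)
#     fccd.binning()
#     fccd.save()
#     fccd.load_binned(96, dark_runNB=95)
#     fccd.plot_FastCCD(use_mask=False)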
""" Gotthard-II detector related sub-routines
Copyright (2024) SCS Team.
(contributions preferably comply with pep8 code structure
guidelines.)
"""
from extra.components import OpticalLaserPulses, XrayPulses
import numpy as np
import xarray as xr
import logging
__all__ = [
'extract_GH2',
]
log = logging.getLogger(__name__)
def extract_GH2(ds, run, firstFrame=0, bunchPattern='scs_ppl',
gh2_dim='gh2_pId', keep_darks=False):
'''
Select and align the frames of the Gotthard-II that have been exposed
to light. `keep_darks` gives the option to keep the non-exposed
frames.
Parameters
----------
ds: xarray.Dataset
The dataset containing GH2 data
run: extra_data.DataCollection
The run containing the bunch pattern source
firstFrame: int
The GH2 frame number corresponding to the first pulse of the train.
bunchPattern: str in ['scs_ppl', 'sase3']
the bunch pattern used to align data. For 'scs_ppl', the gh2_pId
dimension is renamed 'ol_pId', and for 'sase3' gh2_pId is renamed
'sa3_pId'.
gh2_dim: str
The name of the dimension that corresponds to the Gotthard-II frames.
keep_darks: bool
If True, the frames that do not correspond to the bunchPattern are
kept.
Returns
-------
nds: xarray Dataset
The aligned and reduced dataset, containing only the GH2 variables
that hold data.
'''
if gh2_dim not in ds.dims:
log.warning(f'gh2_dim "{gh2_dim}" not in dataset. Skipping.')
return ds
if bunchPattern == 'scs_ppl':
pattern = OpticalLaserPulses(run)
dim = 'ol_pId'
else:
pattern = XrayPulses(run)
dim = 'sa3_pId'
others = [var for var in ds if
gh2_dim not in ds[var].dims or dim in ds[var].coords]
nds = ds.drop_dims(dim, errors='ignore')
nds = nds.drop_vars(others, errors='ignore')
dark_ds = xr.Dataset()
if pattern.is_constant_pattern():
pulse_ids = pattern.peek_pulse_ids(labelled=False)
signal_on_gh_ids = pulse_ids + firstFrame
if keep_darks:
no_signal_ids = [i for i in nds[gh2_dim].values
if i not in signal_on_gh_ids]
dark_ds = nds.isel({gh2_dim: no_signal_ids})
dark_ds = dark_ds.assign_coords({gh2_dim: no_signal_ids})
for n in dark_ds:
dark_ds = dark_ds.rename({n: f'{n}_dark'})
nds = nds.isel({gh2_dim: pulse_ids + firstFrame})
nds = nds.assign_coords({gh2_dim: pulse_ids})
nds = nds.rename({gh2_dim: dim})
else:
log.warning('The number of pulses has changed during the run.')
pulse_ids = np.unique(pattern.pulse_ids(labelled=False, copy=False))
nds = nds.isel({gh2_dim: pulse_ids + firstFrame})
nds = nds.assign_coords({gh2_dim: pulse_ids})
nds = nds.rename({gh2_dim: dim})
mask = pattern.pulse_mask(labelled=False)
mask = xr.DataArray(mask, dims=['trainId', dim],
coords={'trainId': run.train_ids,
dim: np.arange(mask.shape[1])})
mask = mask.sel({dim: pulse_ids})
nds = nds.where(mask, drop=True)
nds = nds.merge(dark_ds, join='inner')
ret = ds[others].merge(nds, join='inner')
return ret
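# Example (editor's sketch): aligning Gotthard-II frames to the optical laser
# pulses. The proposal/run numbers and the 'Gotthard2' mnemonic are
# hypothetical; `ds` must contain variables with a 'gh2_pId' dimension.
#     import toolbox_scs as tb
#     run, ds = tb.load(900447, runNB=12, fields=['Gotthard2'])
#     nds = extract_GH2(ds, run, firstFrame=1, bunchPattern='scs_ppl',
#                       keep_darks=True)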
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import leastsq
from scipy.optimize import curve_fit
from scipy.signal import fftconvolve
import xarray as xr
from extra_data import open_run
import pasha as psh
import toolbox_scs as tb
__all__ = [
'hRIXS',
'MaranaX',
]
# -----------------------------------------------------------------------------
# Curvature
def find_curvature(image, frangex=None, frangey=None,
deg=2, axis=1, offset=100):
# Resolve arguments
x_range = (0, image.shape[1])
if frangex is not None:
x_range = (max(frangex[0], x_range[0]), min(frangex[1], x_range[1]))
y_range = (0, image.shape[0])
if frangey is not None:
y_range = (max(frangey[0], y_range[0]), min(frangey[1], y_range[1]))
axis_range = y_range if axis == 1 else x_range
axis_dim = image.shape[axis - 1]
# Get kernel
integral = image[slice(*y_range), slice(*x_range)].mean(axis=axis)
roi = np.ones([axis_range[1] - axis_range[0], axis_dim])
ref = roi * integral[:, np.newaxis]
# Get sliced image
slice_ = [slice(None), slice(None)]
slice_[axis - 1] = slice(max(axis_range[0] - offset, 0),
min(axis_range[1] + offset, axis_dim))
sliced = image[tuple(slice_)]
if axis == 0:
sliced = sliced.T
# Get curvature factor from cross correlation
crosscorr = fftconvolve(sliced,
ref[::-1, :],
axes=0, )
shifts = np.argmax(crosscorr, axis=0)
curv = np.polyfit(np.arange(axis_dim), shifts, deg=deg)
return curv[:-1][::-1]
def find_curvature(img, args, plot=False, **kwargs):
def parabola(x, a, b, c, s=0, h=0, o=0):
return (a*x + b)*x + c
def gauss(y, x, a, b, c, s, h, o=0):
return h * np.exp(-((y - parabola(x, a, b, c)) / (2 * s))**2) + o
x = np.arange(img.shape[1])[None, :]
y = np.arange(img.shape[0])[:, None]
if plot:
plt.figure(figsize=(10,10))
plt.imshow(img, cmap='gray', aspect='auto', interpolation='nearest', **kwargs)
plt.plot(x[0, :], parabola(x[0, :], *args))
args, _ = leastsq(lambda args: (gauss(y, x, *args) - img).ravel(), args)
if plot:
plt.plot(x[0, :], parabola(x[0, :], *args))
return args
def correct_curvature(image, factor=None, axis=1):
if factor is None:
return
if axis == 1:
image = image.T
ydim, xdim = image.shape
x = np.arange(xdim + 1)
y = np.arange(ydim + 1)
xx, yy = np.meshgrid(x[:-1] + 0.5, y[:-1] + 0.5)
xxn = xx - factor[0] * yy - factor[1] * yy ** 2
ret = np.histogramdd((xxn.flatten(), yy.flatten()),
bins=[x, y],
weights=image.flatten())[0]
return ret if axis == 1 else ret.T
def get_spectrum(image, factor=None, axis=0,
pixel_range=None, energy_range=None, ):
start, stop = (0, image.shape[axis - 1])
if pixel_range is not None:
start = max(pixel_range[0] or start, start)
stop = min(pixel_range[1] or stop, stop)
edge = image.sum(axis=axis)[start:stop]
bins = np.arange(start, stop + 1)
centers = (bins[1:] + bins[:-1]) * 0.5
if factor is not None:
centers, edge = calibrate(centers, edge,
factor=factor,
range_=energy_range)
return centers, edge
# -----------------------------------------------------------------------------
# Energy calibration
def energy_calibration(channels, energies):
return np.polyfit(channels, energies, deg=1)
def calibrate(x, y=None, factor=None, range_=None):
if factor is not None:
x = np.polyval(factor, x)
if y is not None and range_ is not None:
start = np.argmin(np.abs((x - range_[0])))
stop = np.argmin(np.abs((x - range_[1])))
# Calibrated energies have a different direction
x, y = x[stop:start], y[stop:start]
return x, y
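# Example (editor's sketch): converting pixel positions to photon energies
# with the helpers above. The channel/energy pairs and `image` (a 2D detector
# array) are hypothetical; the energy axis is assumed to decrease with pixel
# number, which matches the [stop:start] slicing direction in `calibrate`.
#     factor = energy_calibration(channels=[300, 1500],
#                                 energies=[935.0, 920.0])
#     energies, spectrum = get_spectrum(image, factor=factor, axis=0,
#                                       energy_range=(925.0, 933.0))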
# -----------------------------------------------------------------------------
# Gaussian-related functions
FWHM_COEFF = 2 * np.sqrt(2 * np.log(2))
def gaussian_fit(x_data, y_data, offset=0):
"""
Centre-of-mass and width. Lifted from image_processing.imageCentreofMass()
"""
x0 = np.average(x_data, weights=y_data)
sx = np.sqrt(np.average((x_data - x0) ** 2, weights=y_data))
# Gaussian fit
baseline = y_data.min()
p_0 = (y_data.max(), x0 + offset, sx, baseline)
try:
p_f, _ = curve_fit(gauss1d, x_data, y_data, p_0, maxfev=10000)
return p_f
except (RuntimeError, TypeError) as e:
print(e)
return None
def gauss1d(x, height, x0, sigma, offset):
return height * np.exp(-0.5 * ((x - x0) / sigma) ** 2) + offset
def to_fwhm(sigma):
return abs(sigma * FWHM_COEFF)
def decentroid(res):
res = np.array(res)
ret = np.zeros(shape=(res.max(axis=0) + 1).astype(int))
for cy, cx in res:
if cx > 0 and cy > 0:
ret[int(cy), int(cx)] += 1
return ret
class hRIXS:
"""The hRIXS analysis, especially curvature correction
The objects of this class contain the meta-information about the settings
of the spectrometer, not the actual data, except possibly a dark image
for background subtraction.
The actual data is loaded into `xarray`s, and stays there.
Attributes
----------
PROPOSAL: int
the number of the proposal
DETECTOR: str
the detector to be used. Can be ['hRIXS_det', 'MaranaX'];
defaults to 'MaranaX'.
X_RANGE: slice
the slice to take in the dispersive direction, in pixels. Defaults
to the entire width.
Y_RANGE: slice
the slice to take in the energy direction
THRESHOLD: float
pixel counts above which a hit candidate is assumed, for centroiding.
use None if you want to give it in standard deviations instead.
STD_THRESHOLD:
same as THRESHOLD, in standard deviations.
DBL_THRESHOLD:
threshold controlling whether a detected hit is considered to be a
double hit.
BINS: int
the number of bins used in centroiding
CURVE_A, CURVE_B: float
the coefficients of the parabola for the curvature correction
USE_DARK: bool
whether to do dark subtraction. Is initially `False`, magically
switches to `True` if a dark has been loaded, but may be reset.
ENERGY_INTERCEPT, ENERGY_SLOPE:
The calibration from pixel to energy
FIELDS:
the fields to be loaded from the data. Add additional fields if so
desired.
Example
-------
proposal = 3145
h = hRIXS(proposal)
h.Y_RANGE = slice(700, 900)
h.CURVE_B = -3.695346575286939e-07
h.CURVE_A = 0.024084479232443695
h.ENERGY_SLOPE = 0.018387
h.ENERGY_INTERCEPT = 498.27
h.STD_THRESHOLD = 3.5
"""
DETECTOR_FIELDS = {
'hRIXS_det': ['hRIXS_det', 'hRIXS_index', 'hRIXS_delay', 'hRIXS_norm', 'nrj'],
'MaranaX': ['MaranaX', 'nrj'],
}
def __init__(self, proposalNB, detector='MaranaX'):
self.PROPOSAL = proposalNB
self.DETECTOR = detector
assert detector in self.DETECTOR_FIELDS
# image range
self.X_RANGE = np.s_[:]
self.Y_RANGE = np.s_[:]
# centroid
# centroid_one threshold
self.THRESHOLD = None # pixel counts above which a hit candidate is assumed
self.STD_THRESHOLD = 3.5 # same as THRESHOLD, in standard deviations
self.DBL_THRESHOLD = 0.1 # factor used for double hits in centroid_one
# centroid_two threshold
self.CENTROID_THRESHOLD = [0.2, 1]
self.CURVE_A = 0 # curvature parameters as determined elsewhere
self.CURVE_B = 0
# integral
self.BINS = 100
self.METHOD = 'centroid' # ['centroid', 'integral']
self.USE_DARK = False
# Ignore double hits
self.AVOID_DBL = False
self.ENERGY_INTERCEPT = 0
self.ENERGY_SLOPE = 1
self.FIELDS = self.DETECTOR_FIELDS[detector]
def set_params(self, **params):
for key, value in params.items():
setattr(self, key.upper(), value)
def get_params(self, *params):
if not params:
params = ('proposal', 'x_range', 'y_range',
'threshold', 'std_threshold', 'dbl_threshold',
'curve_a', 'curve_b',
'bins', 'method', 'fields')
return {param: getattr(self, param.upper()) for param in params}
def from_run(self, runNB, proposal=None, extra_fields=(), drop_first=False, subset=None):
"""load a run
Load the run `runNB`. A thin wrapper around `toolbox.load`.
Parameters
----------
drop_first: bool
if True, the first image in the run is removed from the dataset.
Example
-------
data = h.from_run(145) # load run 145
data1 = h.from_run(145) # load run 145
data2 = h.from_run(155) # load run 155
data = xarray.concat([data1, data2], 'trainId') # combine both
"""
if proposal is None:
proposal = self.PROPOSAL
if drop_first:
subset = slice(1, None)
run, data = tb.load(proposal, runNB=runNB, subset=subset,
fields=self.FIELDS + list(extra_fields))
return data
def load_dark(self, runNB, proposal=None):
"""load a dark run
Load the dark run `runNB` from `proposal`. The latter defaults to the
current proposal. The dark is stored in this `hRIXS` object, and
subsequent analyses use it for background subtraction.
Example
-------
h.load_dark(166) # load dark run 166
"""
data = self.from_run(runNB, proposal)
self.dark_image = data[self.DETECTOR].mean(dim='trainId')
self.USE_DARK = True
def find_curvature(self, runNB, proposal=None, plot=True, args=None,
**kwargs):
"""find the curvature correction coefficients
The hRIXS has some aberrations which lead to the spectroscopic lines
being curved on the detector. We approximate these aberrations with
a parabola for later correction.
Load a run and determine the curvature. The curvature is set in `self`,
and returned as a pair of floats.
Parameters
----------
runNB: int
the run number to use
proposal: int
the proposal to use, default to the current proposal
plot: bool
whether to plot the found curvature onto the data
args: pair of float, optional
a starting value to prime the fitting routine
Example
-------
h.find_curvature(155) # use run 155 to fit the curvature
"""
data = self.from_run(runNB, proposal)
image = data[self.DETECTOR].sum(dim='trainId') \
.values[self.X_RANGE, self.Y_RANGE].T
if args is None:
spec = (image - image[:10, :].mean()).mean(axis=1)
mean = np.average(np.arange(len(spec)), weights=spec)
args = (-2e-7, 0.02, mean - 0.02 * image.shape[1] / 2, 3,
spec.max(), image.mean())
args = find_curvature(image, args, plot=plot, **kwargs)
self.CURVE_B, self.CURVE_A, *_ = args
return self.CURVE_A, self.CURVE_B
def centroid_one(self, image):
"""find the position of photons with sub-pixel precision
A photon is supposed to have hit the detector if the intensity within a
2-by-2 square exceeds a threshold. In this case the position of the photon
is calculated as the center-of-mass in a 4-by-4 square.
Return the list of x, y coordinate pairs, corrected by the curvature.
"""
base = image.mean()
corners = image[1:, 1:] + image[:-1, 1:] \
+ image[1:, :-1] + image[:-1, :-1]
if self.THRESHOLD is None:
threshold = corners.mean() + self.STD_THRESHOLD * corners.std()
else:
threshold = self.THRESHOLD
# Threshold for double photons chosen to be the same ratio to single
# photons as found in the ESRF method
SpotHIGH=self.DBL_THRESHOLD*threshold
if self.AVOID_DBL:
SpotHIGH = 100000
middle = corners[1:-1, 1:-1]
candidates = (
(middle > threshold)
* (middle >= corners[:-2, 1:-1]) * (middle > corners[2:, 1:-1])
* (middle >= corners[1:-1, :-2]) * (middle > corners[1:-1, 2:])
* (middle >= corners[:-2, :-2]) * (middle > corners[2:, :-2])
* (middle >= corners[:-2, 2:]) * (middle > corners[2:, 2:]))
cp = np.argwhere(candidates)
if len(cp) > 10000:
raise RuntimeError(
"too many peaks, threshold low or acquisition time too high")
res = []
dres = []
for cy, cx in cp:
spot = image[cy: cy + 4, cx: cx + 4] - base
mx = np.average(np.arange(cx, cx + 4), weights=spot.sum(axis=0))
my = np.average(np.arange(cy, cy + 4), weights=spot.sum(axis=1))
if spot.sum() < SpotHIGH:
res.append((mx, my))
else:
res.append((mx, my))
res.append((mx, my))
dres.append((mx, my))
return res, dres
def centroid_two(self, image, energy):
""" determine position of photon hits on detector
The algorithm is taken from the ESRF RIXS toolbox. The thresholds for
determining photon hits are derived from the incident photon energy.
The function returns arrays containing the single and double hits
as x and y coordinates.
"""
# Prepare
threshold = self.CENTROID_THRESHOLD
# Multiplication factor * ADU/photon
photons = energy/3.6/1.06
SpotLOW = threshold[0] * photons
SpotHIGH = threshold[1] * photons
low_th_px = threshold[0] * photons
high_th_px = threshold[1] * photons
if self.AVOID_DBL:
SpotHIGH = 100000
img = image-image.mean()
gs = 2
# Find potential hits on a clipped image where 2 rows/columns are excluded
# from the edges because centroiding can't be done at the edge
cp = (np.argwhere((img[gs//2 : -gs//2, gs//2 : -gs//2] > low_th_px)*
(img[gs//2 : -gs//2, gs//2 : -gs//2] < high_th_px))+
np.array([gs//2, gs//2]))
if len(cp) > 100000:
raise RuntimeError('Threshold too low or acquisition time too long')
res = []
dres = []
for cy, cx in cp:
spot = img[cy - gs//2 : cy + gs//2 + 1, cx - gs//2 : cx+gs//2 +1]
#spot[spot < 0] = 0
if (spot > img[cy, cx]).sum() == 0:
mx = np.average(np.arange(cx - gs//2, cx + gs//2 + 1),
weights=spot.sum(axis=0))
my = np.average(np.arange(cy - gs//2, cy + gs//2 + 1),
weights=spot.sum(axis=1))
if (spot.sum()>=SpotLOW) and (spot.sum()<SpotHIGH):
res.append((mx, my))
elif (spot.sum()>SpotHIGH):
res.append((mx, my))
res.append((mx, my))
dres.append((mx, my))
return res, dres
def centroid(self, data, bins=None, method='auto'):
"""calculate a spectrum by finding the centroid of individual photons
This takes the `xarray.Dataset` `data` and returns a copy of it, with
a new `xarray.DataArray` named `spectrum` added, which contains the
energy spectrum calculated for each hRIXS image.
The `method` keyword switches between algorithms for deciding whether
a photon hit occurred: "manual" uses the fixed thresholds of
`centroid_one`, while "auto" uses the energy-dependent thresholds of
`centroid_two`.
Example
-------
h.centroid(data) # find photons in all images of the run
data.spectrum[0, :].plot() # plot the spectrum of the first image
"""
if bins is None:
bins = self.BINS
ret = np.zeros((len(data[self.DETECTOR]), bins))
retd = np.zeros((len(data[self.DETECTOR]), bins))
total_hits = np.zeros((len(data[self.DETECTOR])))
dbl_hits = np.zeros((len(data[self.DETECTOR])))
for i, (image, r, rd) in enumerate(zip(data[self.DETECTOR], ret, retd)):
if method=='manual':
c, d = self.centroid_one(
image.values[self.X_RANGE, self.Y_RANGE])
elif method=='auto':
energy = data['nrj'][i].data
c, d = self.centroid_two(
image.values[self.X_RANGE, self.Y_RANGE], energy)
if not len(c):
continue
rc = np.array(c)
r[:], _ = np.histogram(
rc[:, 0] - self.parabola(rc[:, 1]),
bins=bins, range=(0, self.Y_RANGE.stop - self.Y_RANGE.start))
total_hits[i] = rc.shape[0]
rcd = np.array(d)
dbl_hits[i] = rcd.shape[0]
# Account for case where no double hits are found
if rcd.shape[0] == 0:
continue
else:
rd[:], _ = np.histogram(
rcd[:, 0] - self.parabola(rcd[:, 1]),
bins=bins, range=(0, self.Y_RANGE.stop - self.Y_RANGE.start))
data.coords["energy"] = (
np.linspace(self.Y_RANGE.start, self.Y_RANGE.stop, bins)
* self.ENERGY_SLOPE + self.ENERGY_INTERCEPT)
data['spectrum'] = (("trainId", "energy"), ret)
data['dbl_spectrum'] = (("trainId", "energy"), retd)
data['total_hits'] = ("trainId", total_hits)
data['double_hits'] = ("trainId", dbl_hits)
return data
def parabola(self, x):
return (self.CURVE_B * x + self.CURVE_A) * x
def integrate(self, data):
"""calculate a spectrum by integration
This takes the `xarray` `data` and returns a copy of it, with a new
dataarray named `spectrum` added, which contains the energy spectrum
calculated for each hRIXS image.
First the energy that corresponds to each pixel is calculated.
Then all pixels within an energy range are summed, where the intensity
of one pixel is distributed among the two energy ranges the pixel
spans, proportionally to the overlap between the pixel and bin energy
ranges.
The resulting data is normalized to one pixel, i.e. it gives the
average intensity that arrived on one pixel.
Example
-------
h.integrate(data) # create spectrum by summing pixels
data.spectrum[0, :].plot() # plot the spectrum of the first image
"""
bins = self.Y_RANGE.stop - self.Y_RANGE.start
margin = 10
ret = np.zeros((len(data[self.DETECTOR]), bins - 2 * margin))
if self.USE_DARK:
dark_image = self.dark_image.values[self.X_RANGE, self.Y_RANGE]
images = data[self.DETECTOR].values[:, self.X_RANGE, self.Y_RANGE]
x, y = np.ogrid[:images.shape[1], :images.shape[2]]
quo, rem = divmod(y - self.parabola(x), 1)
quo = np.array([quo, quo + 1])
rem = np.array([rem, 1 - rem])
wrong = (quo < margin) | (quo >= bins - margin)
quo[wrong] = margin
rem[wrong] = 0
quo = (quo - margin).astype(int).ravel()
for image, r in zip(images, ret):
if self.USE_DARK:
image = image - dark_image
r[:] = np.bincount(quo, weights=(rem * image).ravel())
ret /= np.bincount(quo, weights=rem.ravel())
data.coords["energy"] = (
np.arange(self.Y_RANGE.start + margin, self.Y_RANGE.stop - margin)
* self.ENERGY_SLOPE + self.ENERGY_INTERCEPT)
data['spectrum'] = (("trainId", "energy"), ret)
return data
aggregators = dict(
hRIXS_det=lambda x, dim: x.sum(dim=dim),
MaranaX=lambda x, dim: x.sum(dim=dim),
Delay=lambda x, dim: x.mean(dim=dim),
hRIXS_delay=lambda x, dim: x.mean(dim=dim),
hRIXS_norm=lambda x, dim: x.sum(dim=dim),
spectrum=lambda x, dim: x.sum(dim=dim),
dbl_spectrum=lambda x, dim: x.sum(dim=dim),
total_hits=lambda x, dim: x.sum(dim=dim),
dbl_hits=lambda x, dim: x.sum(dim=dim),
counts=lambda x, dim: x.sum(dim=dim)
)
def aggregator(self, da, dim):
agg = self.aggregators.get(da.name)
if agg is None:
return None
return agg(da, dim=dim)
def aggregate(self, ds, var=None, dim="trainId"):
"""aggregate (i.e. mostly sum) all data within one dataset
take all images in a dataset and aggregate them and their metadata.
For images, spectra and normalizations that means adding them, for
others (e.g. delays) adding would not make sense, so we treat them
properly. The aggregation functions of each variable are defined
in the aggregators attribute of the class.
If var is specified, group the dataset by var prior to aggregation.
A new variable "counts" gives the number of frames aggregated in
each group.
Parameters
----------
ds: xarray Dataset
the dataset containing RIXS data
var: string
One of the variables in the dataset. If var is specified, the
dataset is grouped by var prior to aggregation. This is useful
for sorting e.g. a dataset that contains multiple delays.
dim: string
the dimension over which to aggregate the data
Example
-------
h.centroid(data) # create spectra from finding photons
agg = h.aggregate(data) # sum all spectra
agg.spectrum.plot() # plot the resulting spectrum
agg2 = h.aggregate(data, 'hRIXS_delay') # group data by delay
agg2.spectrum[0, :].plot() # plot the spectrum for first value
"""
ds["counts"] = xr.ones_like(ds[dim])
if var is not None:
groups = ds.groupby(var)
return groups.map(self.aggregate_ds, dim=dim)
return self.aggregate_ds(ds, dim)
def aggregate_ds(self, ds, dim='trainId'):
ret = ds.map(self.aggregator, dim=dim)
ret = ret.drop_vars([n for n in ret if n not in self.aggregators])
return ret
def normalize(self, data, which="hRIXS_norm"):
""" Adds a 'normalized' variable to the dataset defined as the
ration between 'spectrum' and 'which'
Parameters
----------
data: xarray Dataset
the dataset containing hRIXS data
which: string, default="hRIXS_norm"
one of the variables of the dataset, usually "hRIXS_norm"
or "counts"
"""
return data.assign(normalized=data["spectrum"] / data[which])
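# Example (editor's sketch): a minimal photon-counting chain with the hRIXS
# class above. The proposal/run numbers and Y_RANGE are hypothetical.
#     h = hRIXS(2769, detector='MaranaX')
#     h.Y_RANGE = slice(700, 900)
#     data = h.from_run(145)
#     data = h.centroid(data)                 # spectra from photon hits
#     agg = h.aggregate(data)                 # sum over all trains
#     agg = h.normalize(agg, which='counts')
#     agg.normalized.plot()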
class MaranaX(hRIXS):
"""
A spin-off of the hRIXS class: with parallelized centroiding
"""
NUM_MAX_HITS = 30
psh.set_default_context('processes', num_workers=20)
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._centroid_result = None
def centroid(self, data, bins=None, **kwargs):
# 0.a Setup constants that we might use
if bins is None:
bins = self.BINS
images = data[self.DETECTOR]
# 0.b. Allocate output arrays (the naming is a bit messy)
num_images, _, Nx = images.shape
self._centroid_result = {
'total_hist': psh.alloc(shape=(num_images, bins)),
'double_hist': psh.alloc(shape=(num_images, bins)),
'total_hits': psh.alloc(shape=(num_images,)),
'double_hits': psh.alloc(shape=(num_images,)),
}
# 0.c Use a xr.DataArray that pasha understands
data_array = images.assign_coords(nrj=data['nrj'])
# 1. Calculate with parallelization
psh.map(self._centroid_tb_map, data_array)
# 2. Finalize: set the results back to the dataset
data.coords["energy"] = (
np.linspace(self.Y_RANGE.start or 0, self.Y_RANGE.stop or Nx, bins)
* self.ENERGY_SLOPE + self.ENERGY_INTERCEPT)
data['spectrum'] = (("trainId", "energy"),
self._centroid_result['total_hist'])
data['dbl_spectrum'] = (("trainId", "energy"),
self._centroid_result['double_hist'])
data['total_hits'] = ("trainId",
self._centroid_result['total_hits'])
data['double_hits'] = ("trainId",
self._centroid_result['double_hits'])
return data
def _centroid_tb_map(self, _, index, data):
self._centroid_map(index,
image=data.values,
energy=data.coords['nrj'].item())
def _centroid_map(self, index, *, image, energy):
total, double = self._centroid_task(index, image, energy)
# Check if there are results: don't do anything otherwise
if not len(total):
return
self._histogram_task(index, total, double,
default_range=(0, image.shape[1]))
def _centroid_task(self, index, image, energy):
# Calculate the centroid and store it on the allocated output array
total, double = self.centroid_two(image[self.X_RANGE, self.Y_RANGE],
energy)
total, double = np.array(total), np.array(double)
total_hits_list = self._centroid_result.get('total_hits_list')
if total_hits_list is not None and len(total):
total_hits_list[index][:min(len(total), self.NUM_MAX_HITS)] \
= total[:self.NUM_MAX_HITS]
double_hits_list = self._centroid_result.get('double_hits_list')
if double_hits_list is not None and len(double):
double_hits_list[index][:min(len(double), self.NUM_MAX_HITS)] \
= double[:self.NUM_MAX_HITS]
return total, double
def _histogram_task(self, index, total, double, default_range):
# Prepare
_, bins = self._centroid_result['total_hist'].shape
hist_range = (self.Y_RANGE.start or default_range[0],
self.Y_RANGE.stop or default_range[1])
# Calculate total spectrum
self._centroid_result['total_hist'][index], _ = np.histogram(
total[:, 0] - self.parabola(total[:, 1]),
bins=bins, range=hist_range)
self._centroid_result['total_hits'][index] = len(total)
# Calculate double spectrum
double_hits = len(double)
if double_hits:
self._centroid_result['double_hist'][index], _ = np.histogram(
double[:, 0] - self.parabola(double[:, 1]),
bins=bins, range=hist_range)
self._centroid_result['double_hits'][index] = double_hits
def centroid_from_run(self, runNB, proposal=None, extra_fields=(),
drop_first=False, subset=None, bins=None,
return_hits=False):
"""
A combined function of `from_run()` and `centroid()`, which
uses `extra_data` and `pasha` to avoid bulk loading of files.
"""
# 0.a Setup constants that we might use
if proposal is None:
proposal = self.PROPOSAL
if drop_first:
subset = slice(1, None)
if bins is None:
bins = self.BINS
run_mnemo = set(self.DETECTOR_FIELDS['MaranaX']) | set(extra_fields)
# 0.b. Open the run with extra-data and select the relevant fields
run = open_run(proposal, runNB)
if subset is not None:
run = run.select_trains(subset)
# Filter out mnemonics that do not exist in the run files
run_mnemo = set([mnemo for mnemo in run_mnemo
if self._is_mnemo_in_run(mnemo, run)])
assert set(self.DETECTOR_FIELDS['MaranaX']).issubset(run_mnemo)
sources = [self._mnemo_to_prop(mnemo)[0] for mnemo in run_mnemo]
selection = run.select([source for source in sources
if source in run.all_sources],
require_all=True)
# 0.c. Allocate output arrays (the naming is a bit messy)
mara_source, mara_key = self._mnemo_to_prop('MaranaX')
num_images, _, Nx = selection[mara_source, mara_key].shape
self._centroid_result = {
'total_hist': psh.alloc(shape=(num_images, bins)),
'double_hist': psh.alloc(shape=(num_images, bins)),
'total_hits': psh.alloc(shape=(num_images,)),
'double_hits': psh.alloc(shape=(num_images,)),
}
if return_hits:
self._centroid_result['total_hits_list'] = psh.alloc(
shape=(num_images, self.NUM_MAX_HITS, 2), fill=np.nan)
self._centroid_result['double_hits_list'] = psh.alloc(
shape=(num_images, self.NUM_MAX_HITS, 2), fill=np.nan)
# 1. Calculate with parallelization
psh.map(self._centroid_ed_map, selection)
# 2. Finalize: generate the rest of the mnemonics to the dataset
# and set the results also.
data = xr.merge(
[selection.get_array(*self._mnemo_to_prop(mnemo), name=mnemo)
for mnemo in run_mnemo - {'MaranaX'}], join='inner')
data.coords["energy"] = (
np.linspace(self.Y_RANGE.start or 0, self.Y_RANGE.stop or Nx, bins)
* self.ENERGY_SLOPE + self.ENERGY_INTERCEPT)
data['spectrum'] = (("trainId", "energy"),
self._centroid_result['total_hist'])
data['dbl_spectrum'] = (("trainId", "energy"),
self._centroid_result['double_hist'])
data['total_hits'] = ("trainId",
self._centroid_result['total_hits'])
data['double_hits'] = ("trainId",
self._centroid_result['double_hits'])
if return_hits:
data['total_hits_list'] = (
('trainId', 'hits', 'coord'),
self._centroid_result['total_hits_list'])
data['double_hits_list'] = (
('trainId', 'hits', 'coord'),
self._centroid_result['double_hits_list'])
# Add attributes
data.attrs.update(
CENTROID_THRESHOLD=self.CENTROID_THRESHOLD,
NUM_BINS=bins,
CURVATURE_CORRECTION=[self.CURVE_A, self.CURVE_B],
ENERGY_CALIBRATION=[self.ENERGY_SLOPE, self.ENERGY_INTERCEPT],
Y_RANGE=[self.Y_RANGE.start or 0,
self.Y_RANGE.stop or Nx]
)
return data
def _centroid_ed_map(self, _, index, trainId, data):
mara_source, mara_key = self._mnemo_to_prop('MaranaX')
nrj_source, nrj_key = self._mnemo_to_prop('nrj')
self._centroid_map(
index,
image=data[mara_source][mara_key],
energy=data[nrj_source][nrj_key])
@staticmethod
def _mnemo_to_prop(mnemo):
prop = tb.constants.mnemonics[mnemo][0]
return prop['source'], prop['key']
def _is_mnemo_in_run(self, mnemo, run):
source, key = self._mnemo_to_prop(mnemo)
if source not in run.all_sources:
return False
return key in run.keys_for_source(source)
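# Example (editor's sketch): parallelized centroiding without bulk-loading
# the detector images, using MaranaX.centroid_from_run(). Numbers are
# hypothetical.
#     m = MaranaX(2769)
#     m.Y_RANGE = slice(700, 900)
#     m.CENTROID_THRESHOLD = [0.2, 1]
#     data = m.centroid_from_run(145, drop_first=True)
#     data.spectrum.sum('trainId').plot()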
""" Beam Arrival Monitor related sub-routines
Copyright (2021) SCS Team.
(contributions preferrably comply with pep8 code structure
guidelines.)
"""
import logging
import numpy as np
import xarray as xr
import extra_data as ed
import re
import os
from pathlib import Path
from multiprocessing import Pool
from extra.components import XrayPulses, AdqRawChannel
from ..misc.bunch_pattern_external import is_sase_3
from ..mnemonics_machinery import (mnemonics_to_process,
mnemonics_for_run)
from extra_data.read_machinery import find_proposal
from ..constants import mnemonics as _mnemonics
__all__ = [
'get_pes_params',
'get_pes_tof',
'save_pes_avg_traces',
'load_pes_avg_traces'
]
log = logging.getLogger(__name__)
def get_pes_tof(proposal, runNB, mnemonic, start=0, origin=None,
width=None, subtract_baseline=False,
baseStart=None, baseWidth=40,merge_with=None):
"""
Extracts time-of-flight spectra from raw digitizer traces. The spectra
are aligned by pulse Id using the SASE 3 bunch pattern. If origin is
not None, a time coordinate in nanoseconds 'time_ns' is computed and
added to the DataArray.
Parameters
----------
proposal:int
The proposal number.
runNB: int
The run number.
mnemonic: str
mnemonic for PES, e.g. "PES_2Araw".
start: int
starting sample of the first spectrum in the raw trace.
origin: int
sample of the trace that corresponds to time-of-flight origin,
also called prompt. Used to compute the 'time_ns'
coordinates. If None, computation of 'time_ns' is skipped.
width: int
number of samples per spectrum. If None, the number of samples
for 4.5 MHz repetition rate is used.
subtract_baseline: bool
If True, subtract the baseline defined by baseStart and baseWidth from each
spectrum.
baseStart: int
starting sample of the baseline.
baseWidth: int
number of samples to average (starting from baseStart) for baseline
calculation.
merge_with: xarray Dataset
If provided, the resulting Dataset will be merged with this
one.
Returns
-------
pes: xarray DataArray
DataArray containing the PES time-of-flight spectra.
Example
-------
>>> import toolbox_scs as tb
>>> import toolbox_scs.detectors as tbdet
>>> proposal, runNB = 900447, 12
>>> pes = tbdet.get_pes_tof(proposal, runNB, 'PES_2Araw',
>>> start=2557, origin=76)
"""
run = ed.open_run(proposal, runNB)
all_mnemonics = mnemonics_for_run(run)
mnemo = all_mnemonics.get(mnemonic)
if mnemo is None:
print(f'Mnemonic {mnemonic} not found in run. Skipping.')
return None
PULSE_PERIOD = 440
pattern = XrayPulses(run)
regular = True
if pattern.is_constant_pattern() is False:
print('The number of pulses changed during the run.')
pulse_ids = np.unique(pattern.pulse_ids(labelled=False, copy=False))
regular = False
else:
pulse_ids = pattern.peek_pulse_ids(labelled=False)
npulses = len(pulse_ids)
period = 1
if npulses > 1:
periods = np.diff(pulse_ids)
if len(np.unique(periods)) > 1:
regular = False
period = min(periods)
npulses = int((max(pulse_ids) - min(pulse_ids)) / period) + 1
period *= PULSE_PERIOD
kd = run[mnemo['source'], mnemo['key']]
nsamples = kd.shape[1]
npulses_trace = int(np.floor((nsamples - start) / period))
if npulses_trace < npulses:
log.warning(f'The digitizer only recorded {npulses_trace} pulses '
f'out of the {npulses} pulses in the train.')
else:
npulses_trace = npulses
indices = np.array([start + i*period for i in range(npulses_trace + 1)])
outp = kd.ndarray()
outp = outp[:, indices[0]:indices[-1]].reshape([outp.shape[0],
npulses_trace, period])
spectra = xr.DataArray(outp, dims=['trainId', 'sa3_pId', 'sampleId'],
coords={'trainId': kd.train_id_coordinates(),
'sa3_pId': pulse_ids[:npulses_trace],
'sampleId': np.arange(period)})
if subtract_baseline:
if baseStart is None:
baseStart = 0
spectra = spectra - spectra.isel(
sampleId=slice(baseStart, baseStart + baseWidth)).mean('sampleId')
if width is None:
width = PULSE_PERIOD
spectra = spectra.isel(sampleId=slice(0, width), drop=True)
spectra.attrs['start'] = start
if origin is not None:
try:
adq = AdqRawChannel(run, mnemonic.split('_')[1].split('raw')[0])
sample_rate = adq.sampling_rate
except Exception as e:
log.warning(e)
sample_rate = 1986111111.1111112
origin -= start
time_ns = (np.arange(spectra.sizes['sampleId'])
- origin)/sample_rate * 1e9
spectra = spectra.assign_coords(time_ns=('sampleId', time_ns))
spectra.attrs['origin'] = origin
spectra = spectra.rename(mnemonic.replace('raw', 'spectrum'))
if merge_with is not None:
return merge_with.merge(spectra.to_dataset(promote_attrs=True),
join='inner')
return spectra.rename(mnemonic.replace('raw', 'spectrum'))
def get_pes_params(run, channel=None):
"""
Extract PES parameters for a given extra_data DataCollection.
Parameters are gas, binding energy, retardation voltages or all
voltages of the MPOD.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the digitizer data
channel: str
Channel name or PES mnemonic, e.g. '2A' or 'PES_1Craw'.
If None, or if the channel is not found in the data,
the retardation voltage for all channels is retrieved.
Returns
-------
params: dict
dictionary of PES parameters
"""
params = {}
sel = run.select_trains(ed.by_index[:20])
gas_dict = {'N2': 409.9, 'Ne': 870.2, 'Kr': 1921, 'Xe': 1148.7}
for gas in gas_dict.keys():
mnemo = _mnemonics[f'PES_{gas}'][0]
arr = sel.get_run_value(mnemo['source'], mnemo['key'])
if arr == 'ON':
params['gas'] = gas
params['binding_energy'] = gas_dict[gas]
break
if 'gas' not in params:
params['gas'] = 'unknown'
log.warning('Could not find which PES gas was used.')
mpod_mapper = 'SA3_XTD10_PES/MDL/MPOD_MAPPER'
if mpod_mapper in run.all_sources:
if channel is None:
channel = [f'{a//4 + 1}{b}' for a, b in enumerate(
['A', 'B', 'C', 'D']*4)]
else:
if 'raw' in channel:
channel = channel.split('raw')[0].split('_')[1]
channel = [channel]
for c in channel:
rv = get_pes_rv(run, c, mpod_mapper)
if rv is not None:
params[f'{c}_RV'] = rv
elif 'SA3_XTD10_PES/MDL/DAQ_MPOD' in run.all_sources:
voltages = get_pes_voltages(run)
if voltages is not None:
params.update(voltages)
return params
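# Example (editor's sketch): retrieving the PES settings of a run
# (proposal/run numbers are hypothetical):
#     run = ed.open_run(900447, 12)
#     params = get_pes_params(run, channel='2A')
#     print(params.get('gas'), params.get('2A_RV'))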
def get_pes_rv(run, channel, mpod_mapper='SA3_XTD10_PES/MDL/MPOD_MAPPER'):
channel_to_group = {'4D': 1, '4B': 1, '4C': 1, '4A': 1,
'2A': 2, '1C': 2, '2C': 2, '2D': 2,
'3D': 3, '3B': 3, '2B': 3, '3A': 3,
'1A': 4, '3C': 4, '1B': 4, '1D': 4}
group = channel_to_group.get(channel)
if group is None:
log.warning(f'Could not find group for channel {channel}.')
return None
key = f'Group{group}D.channelMeasurementSenseVoltage.value'
voltage = run[mpod_mapper, key].ndarray().mean()
return voltage
def get_pes_voltages(run, device='SA3_XTD10_PES/MDL/DAQ_MPOD'):
"""
Extract PES voltages read by the MDL watchdog of the MPOD device.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing PES data.
device: string
Name of the device containing the voltage data.
Returns
-------
voltages: dict
        dictionary of voltages, or None if the device is not found.
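    Example
    -------
    (Illustrative sketch; proposal and run numbers are placeholders and
    access via the toolbox_scs.detectors namespace is assumed.)
    >>> import toolbox_scs as tb
    >>> import toolbox_scs.detectors as tbdet
    >>> run = tb.open_run(900447, 12)
    >>> voltages = tbdet.get_pes_voltages(run)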
"""
if device not in run.all_sources:
log.warning(f'Could not find {device} in run. Skipping.')
return None
    a = re.compile(r"u\d{3}\.value")
tid, da = run.train_from_index(0, devices=device)
voltages = {}
for k in da[device]:
if len(a.findall(k)) == 1:
voltages[k.split('.')[0]] = da[device][k]
return voltages
def calculate_average(run, name, mnemo):
return run[mnemo['source'], mnemo['key']].xarray(
name=name, extra_dims=mnemo['dim']).mean('trainId')
def save_pes_avg_traces(proposal, runNB, channels=None,
subdir='usr/processed_runs'):
'''
Save average traces of PES into an h5 file.
Parameters
----------
proposal:int
The proposal number.
runNB: int
The run number.
channels: str or list
The PES channels or mnemonics, e.g. '2A', ['2A', '3C'],
['PES_1Araw', 'PES_4Draw', '3B']
subdir: str
subdirectory. The data is stored in
<proposal path>/<subdir>/r{runNB:04d}/f'r{runNB:04d}-pes-data.h5'
Output
------
    xarray Dataset saved in an h5 file containing the PES average traces.
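    Example
    -------
    (Illustrative sketch; proposal and run numbers are placeholders and
    access via the toolbox_scs.detectors namespace is assumed.)
    >>> import toolbox_scs.detectors as tbdet
    >>> tbdet.save_pes_avg_traces(900447, 12, channels=['2A', '3C'])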
'''
root = find_proposal(f'p{proposal:06d}')
path = os.path.join(root, subdir + f'/r{runNB:04d}/')
Path(path).mkdir(parents=True, exist_ok=True)
fname = path + f'r{runNB:04d}-pes-data.h5'
if channels is None:
channels = [f'{a//4 + 1}{b}' for a, b in enumerate(
['A', 'B', 'C', 'D']*4)]
if isinstance(channels, str):
channels = [channels]
run = ed.open_run(proposal, runNB, parallelize=False)
all_mnemos = mnemonics_for_run(run)
# use multiprocessing.Pool
args = []
for c in channels:
m = c
if 'raw' not in c:
m = f'PES_{c}raw'
if m not in all_mnemos:
continue
args.append([run, m.replace('raw', 'avg'), all_mnemos[m]])
if len(args) == 0:
        log.warning('No PES average trace to save. Skipping.')
return
with Pool(len(args)) as pool:
avg_traces = pool.starmap(calculate_average, args)
avg_traces = xr.merge(avg_traces)
ds = xr.Dataset()
if os.path.isfile(fname):
ds = xr.load_dataset(fname)
        ds = ds.drop_vars(list(avg_traces.data_vars), errors='ignore')
ds = ds.merge(avg_traces)
ds.to_netcdf(fname, format='NETCDF4')
return
def load_pes_avg_traces(proposal, runNB, channels=None,
subdir='usr/processed_runs'):
'''
Load existing PES average traces.
Parameters
----------
proposal:int
The proposal number.
runNB: int
The run number.
channels: str or list
The PES channels or mnemonics, e.g. '2A', ['2A', '3C'],
['PES_1Araw', 'PES_4Draw', '3B']
subdir: str
subdirectory. The data is stored in
<proposal path>/<subdir>/r{runNB:04d}/f'r{runNB:04d}-pes-data.h5'
Output
------
ds: xarray Dataset
dataset containing the PES average traces.
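    Example
    -------
    (Illustrative sketch; proposal and run numbers are placeholders and
    access via the toolbox_scs.detectors namespace is assumed.)
    >>> import toolbox_scs.detectors as tbdet
    >>> ds = tbdet.load_pes_avg_traces(900447, 12, channels=['2A', '3C'])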
'''
root = find_proposal(f'p{proposal:06d}')
path = os.path.join(root, subdir + f'/r{runNB:04d}/')
fname = path + f'r{runNB:04d}-pes-data.h5'
if channels is None:
channels = [f'PES_{a//4 + 1}{b}avg' for a, b in
enumerate(['A', 'B', 'C', 'D']*4)]
if isinstance(channels, str):
channels = [channels]
    for i, c in enumerate(channels):
        if 'raw' in c:
            channels[i] = c.replace('raw', 'avg')
        elif 'PES_' not in c and 'avg' not in c:
            channels[i] = f'PES_{c}avg'
if os.path.isfile(fname):
ds = xr.load_dataset(fname)
channels = [c for c in channels if c in ds]
ds = ds[channels]
return ds
else:
log.warning(f'{fname} is not a valid file.')
import numpy as np
import xarray as xr
import toolbox_scs as tb
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from .hrixs import gauss1d
__all__ = ['Viking']
def plot_viking_xas(xas, plot_errors=True, xas_ylim=(-1, 3)):
fig, ax = plt.subplots(3, 1, figsize=(7, 7), sharex=True)
ax[0].plot(xas.newt_x, xas['I0'])
ax[0].grid()
ax[0].set_title('I0 spectra')
ax[1].plot(xas.newt_x, xas['It'])
ax[1].grid()
ax[1].set_title('It spectra')
ax[2].plot(xas.newt_x, xas['absorptionCoef'])
ax[2].set_ylim(*xas_ylim)
ax[2].grid()
ax[2].set_title('XAS')
if plot_errors:
ax[0].fill_between(xas.newt_x,
xas['I0'] - 1.96*xas['I0_stderr'],
xas['I0'] + 1.96*xas['I0_stderr'],
alpha=0.2)
ax[1].fill_between(xas.newt_x,
xas['It'] - 1.96*xas['It_stderr'],
xas['It'] + 1.96*xas['It_stderr'],
alpha=0.2)
ax[2].fill_between(xas.newt_x,
xas['absorptionCoef'] - 1.96*xas['absorptionCoef_stderr'],
xas['absorptionCoef'] + 1.96*xas['absorptionCoef_stderr'],
alpha=0.2)
def plot_viking_calibration(spectra, x_pos, nrj, energy_calib,
gaussian_fits, runNBs):
fig, ax = plt.subplots(2, 1, figsize=(7, 6), sharex=True)
for i in range(len(x_pos)):
x = spectra[i].newt_x.values
ax[0].plot(spectra[i].newt_x, spectra[i], color=f'C{i}',
label=f'run {runNBs[i]}')
ax[0].plot(x, gauss1d(x, *gaussian_fits[i]),
ls='--', color=f'C{i}')
ax[0].legend()
ax[0].set_ylabel('intensity [arb. un.]')
ax[1].scatter(x_pos, nrj, label='data')
ax[1].plot(x_pos, np.polyval(energy_calib, x_pos), color='r',
label=f"{energy_calib[2]:.4f}+{energy_calib[1]:.4f}"
f"*pixel+{energy_calib[0]:.4e}*pixel^2")
ax[1].legend()
ax[1].set_xlabel('pixel')
ax[1].set_ylabel('energy [eV]')
ax[1].grid()
fig.suptitle(f'Viking calibration, runs {runNBs}')
# -----------------------------------------------------------------------------
# Viking class
class Viking:
"""The Viking analysis (spectrometer used in combination with Andor Newton
camera)
The objects of this class contain the meta-information about the settings
of the spectrometer, not the actual data, except possibly a dark image
for background subtraction.
The actual data is loaded into `xarray`s via the method `from_run()`,
and stays there.
Attributes
----------
PROPOSAL: int
the number of the proposal
X_RANGE: slice
the slice to take in the non-dispersive direction, in pixels.
Defaults to the entire width.
Y_RANGE: slice
the slice to take in the energy dispersive direction
USE_DARK: bool
        whether to do dark subtraction. Initially `False`; automatically
        set to `True` once a dark image is loaded via `load_dark()`, but
        may be reset.
ENERGY_CALIB: 1D array (len=3)
The 2nd degree polynomial coefficients for calibration from pixel
to energy. Defaults to [0, 1, 0] (no calibration applied).
BL_POLY_DEG: int
        the degree of the polynomial used for baseline subtraction.
Defaults to 1.
BL_SIGNAL_RANGE: list
the dispersive-axis range, defined by an interval [min, max],
to avoid when fitting a polynomial for baseline subtraction.
Multiple ranges can be provided in the form
[[min1, max1], [min2, max2], ...].
FIELDS: list of str
the fields to be loaded from the data. Add additional fields if so
desired.
Example
-------
proposal = 2953
v = Viking(proposal)
v.X_RANGE = slice(0, 1900)
v.Y_RANGE = slice(38, 80)
v.ENERGY_CALIB = [1.47802667e-06, 2.30600328e-02, 5.15884589e+02]
v.BL_SIGNAL_RANGE = [500, 545]
"""
def __init__(self, proposalNB):
self.PROPOSAL = proposalNB
self.FIELDS = ['newton']
# 2 deg polynomial energy calibration
self.ENERGY_CALIB = [0, 1, 0]
# dark
self.USE_DARK = False
self.dark_image = None
# image range
self.X_RANGE = slice(None, None)
self.Y_RANGE = slice(None, None)
# polynomial degree for background subtraction
self.BL_POLY_DEG = 1
self.BL_SIGNAL_RANGE = []
def set_params(self, **params):
for key, value in params.items():
setattr(self, key.upper(), value)
def get_params(self, *params):
if not params:
params = ('proposal', 'x_range', 'y_range', 'energy_calib',
'use_dark', 'bl_poly_deg', 'bl_signal_range', 'fields')
return {param: getattr(self, param.upper()) for param in params}
def from_run(self, runNB, add_attrs=True, tid_offset=-1):
"""load a run
Load the run `runNB`. A thin wrapper around `toolbox_scs.load`.
Parameters
----------
runNB: int
the run number
add_attrs: bool
if True, adds the camera parameters as attributes to the dataset
(see get_camera_params())
tid_offset: int
train Id offset of Newton camera
Output
------
ds: xarray Dataset
the dataset containing the camera images
Example
-------
data = v.from_run(145) # load run 145
data1 = v.from_run(145) # load run 145
data2 = v.from_run(155) # load run 155
data = xarray.concat([data1, data2], 'trainId') # combine both
"""
#separately load newton image and deal with trainId mismatch
roi = {'newton': {'newton': {'roi': (self.Y_RANGE, self.X_RANGE),
'dim': ['newt_y', 'newt_x']}}}
run, newton = tb.load(self.PROPOSAL, runNB, 'newton', rois=roi)
newton = newton.shift(trainId=tid_offset).astype(float)
#load the rest
fields = [f for f in self.FIELDS if f != 'newton']
if len(fields) == 0:
data = newton
else:
run, data = tb.load(self.PROPOSAL, runNB,
fields=fields)
data = data.merge(newton, join='inner')
data = data.assign_coords(newt_x=np.polyval(self.ENERGY_CALIB,
data['newt_x']))
if add_attrs:
params = self.get_camera_params(run)
for k, v in params.items():
data.attrs[k] = v
return data
def load_dark(self, runNB=None):
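        """
        Load a dark run and store the average of its images in
        `self.dark_image` for subsequent dark subtraction; sets
        USE_DARK to True. If runNB is None, dark subtraction is
        disabled instead.
        """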
if runNB is None:
self.USE_DARK = False
return
data = self.from_run(runNB, add_attrs=False)
self.dark_image = data['newton'].mean(dim='trainId')
self.dark_image.attrs['runNB'] = runNB
self.USE_DARK = True
def integrate(self, data):
'''
This function calculates the mean over the non-dispersive dimension
to create a spectrum. If the camera parameters are known, the
spectrum is multiplied by the number of photoelectrons per ADC count.
A new variable "spectrum" is added to the data.
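        Example
        -------
        (Sketch; the run number is a placeholder and `v` is a configured
        Viking instance, see the class docstring.)
        >>> data = v.from_run(145)
        >>> data = v.integrate(data)
        >>> avg_spectrum = data['spectrum'].mean(dim='trainId')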
'''
imgs = data['newton']
if self.USE_DARK:
imgs = imgs - self.dark_image
spectrum = imgs.mean(dim='newt_y')
if 'photoelectrons_per_count' in data.attrs:
spectrum *= data.attrs['photoelectrons_per_count']
data['spectrum'] = spectrum
return data
def get_camera_gain(self, run):
"""
Get the preamp gain of the camera in the Viking spectrometer for
a specified run.
Parameters
----------
run: extra_data DataCollection
information on the run
Output
------
gain: int
"""
gain = run.get_run_value('SCS_EXP_NEWTON/CAM/CAMERA',
'preampGain.value')
gain_dict = {0: 1, 1: 2, 2: 4}
return gain_dict[gain]
def e_per_counts(self, run, gain=None):
"""
Conversion factor from camera digital counts to photoelectrons
per count. The values can be found in the camera datasheet
(Andor Newton) but they have been slightly corrected for High
Sensitivity mode after analysis of runs 1204, 1207 and 1208,
proposal 2937.
Parameters
----------
run: extra_data DataCollection
information on the run
gain: int
the camera preamp gain
Output
------
ret: float
photoelectrons per count
"""
if gain is None:
gain = self.get_camera_gain(run)
hc = run.get_run_value('SCS_EXP_NEWTON/CAM/CAMERA',
'HighCapacity.value')
if hc == 0: # High Sensitivity
pe_dict = {1: 4., 2: 2.05, 4: 0.97}
elif hc == 1: # High Capacity
pe_dict = {1: 17.9, 2: 9., 4: 4.5}
return pe_dict[gain]
def get_camera_params(self, run):
        dic = {'vbin': 'imageSpecifications.verticalBinning.value',
'hbin': 'imageSpecifications.horizontalBinning.value',
'startX': 'imageSpecifications.startX.value',
'endX': 'imageSpecifications.endX.value',
'startY': 'imageSpecifications.startY.value',
'endY': 'imageSpecifications.endY.value',
'temperature': 'coolerActual.temperature.value',
'high_capacity': 'HighCapacity.value',
'exposure_s': 'exposureTime.value'
}
ret = {}
try:
for k, v in dic.items():
ret[k] = run.get_run_value('SCS_EXP_NEWTON/CAM/CAMERA', v)
except Exception as e:
print(e)
dic['temperature'] = 'CoolerActual.temperature.value'
for k, v in dic.items():
ret[k] = run.get_run_value('SCS_EXP_NEWTON/CAM/CAMERA', v)
ret['gain'] = self.get_camera_gain(run)
ret['photoelectrons_per_count'] = self.e_per_counts(run, ret['gain'])
return ret
def removePolyBaseline(self, data, key='spectrum'):
"""
        Removes a polynomial baseline from a spectrum, assuming a fixed
        position for the signal.
Parameters
----------
        data: xarray Dataset
            The Viking data containing the variable "spectrum"
            or the variable given by argument `key`
        key: str
            name of the variable to baseline-subtract.
            Defaults to "spectrum".
Output
------
data: xarray Dataset
the original dataset with the added variable "spectrum_nobl"
containing the baseline subtracted spectra.
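        Example
        -------
        (Sketch; assumes `v.BL_SIGNAL_RANGE` has been set and `data`
        contains the variable "spectrum", e.g. after calling
        `integrate()`.)
        >>> data = v.integrate(v.from_run(145))
        >>> data = v.removePolyBaseline(data)
        >>> data['spectrum_nobl']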
"""
        if key not in data:
            return data
x = data.newt_x
spectra = data[key]
mask = xr.ones_like(x, dtype=bool)
if len(self.BL_SIGNAL_RANGE) > 0:
if not hasattr(self.BL_SIGNAL_RANGE[0], '__len__'):
ranges = [self.BL_SIGNAL_RANGE]
else:
ranges = self.BL_SIGNAL_RANGE
for xrange in ranges:
mask = mask & ((x < xrange[0]) | (x > xrange[1]))
x_bl = x.where(mask, drop=True).astype(x.dtype)
bl = spectra.sel(newt_x=x_bl)
fit = np.polyfit(x_bl, bl.T, self.BL_POLY_DEG)
if len(spectra.shape) == 1:
return spectra - np.polyval(fit, x)
final_bl = np.empty(spectra.shape)
for t in range(spectra.shape[0]):
final_bl[t] = np.polyval(fit[:, t], x)
data[key+'_nobl'] = spectra - final_bl
return data
def xas(self, sam, ref, thickness=1, dim='newt_x', plot=False,
plot_errors=True, xas_ylim=(-1, 3)):
"""
Given two independent datasets (one with sample and one reference),
this calculates the average XAS spectrum (absorption coefficient),
associated standard deviation and standard error. The absorption
coefficient is defined as -log(It/I0)/thickness.
Parameters
----------
sam: xarray DataArray
the data array containing the spectra with sample
ref: xarray DataArray
the data array containing the spectra without sample
thickness: float
the thickness used for the calculation of the absorption
coefficient
dim: string
the name of the dimension along the dispersion axis
plot: bool
If True, plot the resulting average spectra.
plot_errors: bool
If True, adds the 95% confidence interval on the spectra.
xas_ylim: tuple or list of float
the y limits for the XAS plot.
Output
------
xas: xarray Dataset
the dataset containing the computed XAS quantities:
I0, It, absorptionCoef and their associated errors.
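        Example
        -------
        (Sketch; run numbers are placeholders, `v` is a configured
        Viking instance and both runs were integrated and
        baseline-subtracted beforehand.)
        >>> sam = v.removePolyBaseline(v.integrate(v.from_run(145)))
        >>> ref = v.removePolyBaseline(v.integrate(v.from_run(146)))
        >>> xas = v.xas(sam['spectrum_nobl'], ref['spectrum_nobl'],
        ...             plot=True)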
"""
        if not sam[dim].equals(ref[dim]):
            print(f'The {dim} coordinates of sam and ref differ. Aborting.')
            return
spectrum = sam.mean(dim='trainId')
std = sam.std(dim='trainId')
std_err = std / np.sqrt(sam.sizes['trainId'])
spectrum_ref = ref.mean(dim='trainId')
std_ref = ref.std(dim='trainId')
std_err_ref = std_ref / np.sqrt(ref.sizes['trainId'])
ds = xr.Dataset()
ds['It'] = spectrum
ds['It_std'] = std
ds['It_stderr'] = std_err
ds['I0'] = spectrum_ref
        ds['I0_std'] = std_ref
        ds['I0_stderr'] = std_err_ref
absorption = spectrum_ref / spectrum
# assume that It and I0 are independent variables:
absorption_std = np.abs(absorption) * np.sqrt(
std_ref**2 / spectrum_ref**2 + std**2 / spectrum**2)
absorption_stderr = np.abs(absorption) * np.sqrt(
(std_err_ref / spectrum_ref)**2 + (std_err / spectrum)**2)
ds['absorptionCoef'] = np.log(absorption) / thickness
ds['absorptionCoef_std'] = absorption_std / (thickness *
np.abs(absorption))
ds['absorptionCoef_stderr'] = absorption_stderr / (thickness *
np.abs(absorption))
ds.attrs['n_It'] = sam.sizes['trainId']
ds.attrs['n_I0'] = ref.sizes['trainId']
if plot:
plot_viking_xas(ds, plot_errors, xas_ylim)
return ds
    def calibrate(self, runList, source='nrj', order=2,
                  xrange=slice(None, None), plot=True):
"""
This routine determines the calibration coefficients to translate the
camera pixels into energy in eV. The Viking spectrometer is calibrated
using the beamline monochromator: runs with various monochromatized
photon energy are recorded and their peak position on the detector are
determined by Gaussian fitting. The energy vs. position data is then
fitted to a second degree polynomial.
Parameters
----------
        runList: list of int
            the list of runs containing the monochromatized spectra
        source: str
            the mnemonic of the photon energy readout used for the
            calibration, e.g. 'nrj'. If the mnemonic contains 'UND',
            the values are multiplied by 1e3 (keV to eV conversion).
        order: int
            the degree of the polynomial fit of energy vs. peak position
        xrange: slice
            the X_RANGE slice temporarily applied when loading the
            calibration runs
        plot: bool
            if True, the spectra, their Gaussian fits and the calibration
            curve are plotted.
        Output
        ------
        energy_calib: np.array
            the calibration coefficients (polynomial of degree `order`,
            highest power first)
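        Example
        -------
        (Sketch; the run numbers are placeholders for runs recorded at
        different monochromator photon energies.)
        >>> energy_calib = v.calibrate([195, 196, 197], source='nrj')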
"""
old_calib = self.ENERGY_CALIB
self.ENERGY_CALIB = [0, 1, 0]
x_pos = []
nrj = []
spectra = []
gaussian_fits = []
runNBs = []
for i, runNB in enumerate(runList):
if plot:
print(runNB)
xrange0 = self.X_RANGE
            self.X_RANGE = xrange
ds = self.from_run(runNB)
self.integrate(ds)
ds = self.removePolyBaseline(ds)
avg_spectrum = ds['spectrum_nobl'].mean(dim='trainId')
p0 = [np.max(avg_spectrum)-np.min(avg_spectrum),
avg_spectrum['newt_x'][avg_spectrum.argmax()].values,
10, np.min(avg_spectrum)]
eV_unit = 1e3 if 'UND' in source else 1
try:
popt, pcov = curve_fit(gauss1d, avg_spectrum['newt_x'].values,
avg_spectrum.values, p0=p0)
x_pos.append(popt[1])
gaussian_fits.append(popt)
spectra.append(avg_spectrum)
runNBs.append(runNB)
nrj.append(tb.load_run_values(self.PROPOSAL, runNB)[source]*eV_unit)
except Exception as e:
print(f'error with run {runNB}:', e)
self.ENERGY_CALIB = old_calib
finally:
self.X_RANGE = xrange0
x_pos = np.array(x_pos)
nrj = np.array(nrj)
idx = np.argsort(x_pos)
x_pos = x_pos[idx]
nrj = nrj[idx]
energy_calib = np.polyfit(x_pos, nrj, order)
if plot:
plot_viking_calibration(spectra, x_pos, nrj, energy_calib,
gaussian_fits, runNBs)
self.ENERGY_CALIB = energy_calib
self.GAUSSIAN_FITS = gaussian_fits
return energy_calib
""" XGM related sub-routines
Copyright (2019) SCS Team.
    (contributions preferably comply with pep8 code structure
guidelines.)
"""
import logging
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
from ..misc.bunch_pattern_external import is_sase_1, is_sase_3
from ..misc.bunch_pattern import (npulses_has_changed,
get_unique_sase_pId, load_bpt)
from ..mnemonics_machinery import mnemonics_for_run
from toolbox_scs.load import get_array
__all__ = [
'calibrate_xgm',
'get_xgm',
]
log = logging.getLogger(__name__)
def get_xgm(run, mnemonics=None, merge_with=None,
indices=slice(0, None)):
"""
    Loads and/or computes XGM data. Sources can be loaded on the
fly via the mnemonics argument, or processed from an existing dataset
(merge_with). The bunch pattern table is used to assign the pulse
id coordinates if the number of pulses has changed during the run.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the xgm data.
mnemonics: str or list of str
mnemonics for XGM, e.g. "SCS_SA3" or ["XTD10_XGM", "SCS_XGM"].
If None, defaults to "SCS_SA3" in case no merge_with dataset
is provided.
merge_with: xarray Dataset
If provided, the resulting Dataset will be merged with this
one. The XGM variables of merge_with (if any) will also be
computed and merged.
indices: slice, list, 1D array
Pulse indices of the XGM array in case bunch pattern is missing.
Returns
-------
xarray Dataset with pulse-resolved XGM variables aligned,
merged with Dataset *merge_with* if provided.
Example
-------
>>> import toolbox_scs as tb
>>> run, ds = tb.load(2212, 213, 'SCS_SA3')
>>> ds['SCS_SA3']
"""
xgm_mnemos = ['XTD10_SA', 'XTD10_XGM', 'SCS_SA', 'SCS_XGM']
m2 = []
if mnemonics is not None:
mnemonics = [mnemonics] if isinstance(mnemonics, str) else mnemonics
for m in mnemonics:
if any([(k in m) for k in xgm_mnemos]):
if merge_with is not None and m in merge_with:
continue
m2.append(m)
if merge_with is not None:
in_mw = []
for m, da in merge_with.items():
if any([(k in m) for k in xgm_mnemos]) and 'XGMbunchId' in da.dims:
in_mw.append(m)
m2 += in_mw
if len(m2) == 0:
log.info('no XGM mnemonics to process. Skipping.')
return merge_with
mnemonics = list(set(m2))
# Prepare the dataset of non-XGM data to merge with
if bool(merge_with):
ds_mw = merge_with.drop(mnemonics, errors='ignore')
else:
ds_mw = xr.Dataset()
# Load bunch pattern table and check pattern changes
bpt = load_bpt(run, ds_mw)
if bpt is not None:
sase1 = sase3 = None
mask_sa1 = mask_sa3 = None
if any([("SA1" in m) for m in mnemonics]) or any(
[("XGM" in m) for m in mnemonics]):
mask = mask_sa1 = is_sase_1(bpt)
sase1 = np.nonzero(mask_sa1[0].values)[0]
if any([("SA3" in m) for m in mnemonics]) or any(
[("XGM" in m) for m in mnemonics]):
mask = mask_sa3 = is_sase_3(bpt)
sase3 = np.nonzero(mask_sa3[0].values)[0]
if any([("XGM" in m) for m in mnemonics]):
mask = np.logical_or(mask_sa1, mask_sa3)
pattern_changed = ~(mask == mask[0]).all().values
ds = xr.Dataset()
for m in mnemonics:
if merge_with is not None and m in merge_with:
da_xgm = merge_with[m]
else:
da_xgm = get_array(run, m)
if bpt is not None:
if not pattern_changed:
ds_xgm = load_xgm_array(run, da_xgm, sase1, sase3)
else:
ds_xgm = align_xgm_array(da_xgm, bpt, mask_sa1, mask_sa3)
else:
xgm_val = da_xgm.values
xgm_val[xgm_val == 1] = np.nan
xgm_val[xgm_val == 0] = np.nan
da_xgm.values = xgm_val
da_xgm = da_xgm.dropna(dim='XGMbunchId', how='all')
ds_xgm = da_xgm.fillna(0).sel(XGMbunchId=indices).to_dataset()
ds = ds.merge(ds_xgm, join='inner')
# merge with non-XGM dataset
ds = ds_mw.merge(ds, join='inner')
return ds
def load_xgm_array(run, xgm, sase1, sase3):
"""
    From a raw XGM array, extracts and assigns pulse Id coordinates
    when the number of pulses does not change during the run.
    If 'XGM' is in the mnemonic, the data is split into two variables
    'SA1' and 'SA3'.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the xgm data.
xgm: xarray.DataArray
the raw XGM array
sase1: list or 1D array
the sase1 pulse ids
sase3: list or 1D array
the sase3 pulse ids
Returns
-------
ds_xgm: xarray.Dataset
the dataset containing the aligned XGM variable(s).
"""
xgm_val = xgm.values
xgm_val[xgm_val == 1] = np.nan
xgm_val[xgm_val == 0] = np.nan
xgm.values = xgm_val
xgm = xgm.dropna(dim='XGMbunchId', how='all')
xgm = xgm.fillna(0)
if 'XGM' in xgm.name:
sase1_3 = np.sort(np.concatenate([sase1, sase3]))
sase1_idx = [np.argwhere(sase1_3 == i)[0][0] for i in sase1]
sase3_idx = [np.argwhere(sase1_3 == i)[0][0] for i in sase3]
xgm_sa1 = xgm.isel(XGMbunchId=sase1_idx).rename(XGMbunchId='sa1_pId')
xgm_sa1 = xgm_sa1.assign_coords(sa1_pId=sase1)
xgm_sa1 = xgm_sa1.rename(xgm.name.replace('XGM', 'SA1'))
xgm_sa3 = xgm.isel(XGMbunchId=sase3_idx).rename(XGMbunchId='sa3_pId')
xgm_sa3 = xgm_sa3.assign_coords(sa3_pId=sase3)
xgm_sa3 = xgm_sa3.rename(xgm.name.replace('XGM', 'SA3'))
xgm = xr.merge([xgm_sa1, xgm_sa3])
elif 'SA1' in xgm.name:
xgm = xgm.rename(XGMbunchId='sa1_pId')
xgm = xgm.assign_coords(sa1_pId=sase1).rename(xgm.name)
xgm = xgm.to_dataset()
elif 'SA3' in xgm.name:
xgm = xgm.rename(XGMbunchId='sa3_pId')
xgm = xgm.assign_coords(sa3_pId=sase3).rename(xgm.name)
xgm = xgm.to_dataset()
return xgm
'''
def get_xgm_old(run, mnemonics=None, merge_with=None, keepAllSase=False,
indices=slice(0, None)):
"""
Load and/or computes XGM data. Sources can be loaded on the
fly via the key argument, or processed from an existing data set
(merge_with). The bunch pattern table is used to assign the pulse
id coordinates.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the digitizer data.
mnemonics: str or list of str
mnemonics for XGM, e.g. "SCS_SA3" or ["XTD10_XGM", "SCS_XGM"].
If None, defaults to "SCS_SA3" in case no merge_with dataset
is provided.
merge_with: xarray Dataset
If provided, the resulting Dataset will be merged with this
one. The XGM variables of merge_with (if any) will also be
computed and merged.
keepAllSase: bool
Only relevant in case of sase-dedicated trains. If True, all
trains are kept, else only those of the bunchPattern are kept.
indices: slice, list, 1D array
Pulse indices of the XGM array in case bunch pattern is missing.
Returns
-------
xarray Dataset with pulse-resolved XGM variables aligned,
merged with Dataset *merge_with* if provided.
Example
-------
>>> import toolbox_scs as tb
>>> import toolbox_scs.detectors as tbdet
>>> run, _ = tb.load(2212, 213)
>>> xgm = tbdet.get_xgm(run)
"""
# get the list of mnemonics to process
mnemonics = mnemonics_to_process(mnemonics, merge_with, 'XGM')
if len(mnemonics) == 0:
log.info('No array with unaligned XGM peaks to extract. Skipping.')
return merge_with
else:
log.info(f'Extracting XGM data from {mnemonics}.')
# Prepare the dataset of non-XGM data to merge with
if bool(merge_with):
ds = merge_with.drop(mnemonics, errors='ignore')
else:
ds = xr.Dataset()
run_mnemonics = mnemonics_for_run(run)
# check if bunch pattern table exists
if bool(merge_with) and 'bunchPatternTable' in merge_with:
bpt = merge_with['bunchPatternTable']
log.debug('Using bpt from merge_with dataset.')
elif 'bunchPatternTable' in run_mnemonics:
bpt = run.get_array(*run_mnemonics['bunchPatternTable'].values())
log.debug('Loaded bpt from DataCollection.')
else:
bpt = None
# Load the arrays, assign pulse ID and merge
for m in mnemonics:
if bool(merge_with) and m in merge_with:
arr = merge_with[m]
log.debug(f'Using {m} from merge_with dataset.')
else:
arr = run.get_array(*run_mnemonics[m].values(), name=m)
log.debug(f'Loading {m} from DataCollection.')
if bpt is not None:
arr = align_xgm_array(arr, bpt)
else:
arr = arr.where(arr != 1., drop=True).sel(XGMbunchId=indices)
ds = ds.merge(arr, join='inner')
return ds
'''
def align_xgm_array(xgm_arr, bpt, mask_sa1, mask_sa3):
"""
Assigns pulse ID coordinates to a pulse-resolved XGM array, according to
    the bunch pattern table. If the array contains both SASE 1 and SASE 3
    data, it is split into two arrays.
Parameters
----------
xgm_arr: xarray DataArray
array containing pulse-resolved XGM data, with dims ['trainId',
'XGMbunchId']
bpt: xarray DataArray
bunch pattern table
mask_sa1: xarray DataArray
boolean 2D array (trainId x pulseId) of sase 1 pulses
mask_sa3: xarray DataArray
boolean 2D array (trainId x pulseId) of sase 3 pulses
Returns
-------
xgm: xarray Dataset
dataset with pulse ID coordinates. For SASE 1 data, the coordinates
name is sa1_pId, for SASE 3 data, the coordinates name is sa3_pId.
"""
key = xgm_arr.name
compute_sa1 = False
compute_sa3 = False
valid_tid = np.intersect1d(xgm_arr.trainId, bpt.trainId,
assume_unique=True)
# get the relevant masks for SASE 1 and/or SASE3
if "SA1" in key or "SA3" in key:
if "SA1" in key:
mask = mask_sa1.sel(trainId=valid_tid)
compute_sa1 = True
else:
mask = mask_sa3.sel(trainId=valid_tid)
compute_sa3 = True
tid = mask.where(mask.sum(dim='pulse_slot') > 0, drop=True).trainId
mask = mask.sel(trainId=tid)
mask_sa1 = mask.rename({'pulse_slot': 'sa1_pId'})
mask_sa3 = mask.rename({'pulse_slot': 'sa3_pId'})
if "XGM" in key:
compute_sa1 = True
compute_sa3 = True
mask_sa1 = mask_sa1.sel(trainId=valid_tid)
mask_sa3 = mask_sa3.sel(trainId=valid_tid)
mask = np.logical_or(mask_sa1, mask_sa3)
tid = mask.where(mask.sum(dim='pulse_slot') > 0,
drop=True).trainId
mask_sa1 = mask_sa1.sel(trainId=tid).rename({'pulse_slot': 'sa1_pId'})
mask_sa3 = mask_sa3.sel(trainId=tid).rename({'pulse_slot': 'sa3_pId'})
mask = mask.sel(trainId=tid)
npulses_max = mask.sum(dim='pulse_slot').max().values
bpt_npulses = bpt.sizes['pulse_slot']
xgm_arr = xgm_arr.sel(trainId=tid).isel(
XGMbunchId=slice(0, npulses_max))
# In rare cases, some xgm data is corrupted: trainId is valid but values
# are inf / NaN. We set them to -1 to avoid size mismatch between xgm and
# bpt. Before returning we will drop them.
xgm_arr = xgm_arr.where(np.isfinite(xgm_arr)).fillna(-1.)
# pad the xgm array to match the bpt dims, flatten and
# reorder xgm array to match the indices of the mask
xgm_flat = np.hstack((xgm_arr.fillna(1.),
np.ones((xgm_arr.sizes['trainId'],
bpt_npulses-npulses_max)))).flatten()
xgm_flat_arg = np.argwhere(xgm_flat != 1.)
mask_flat = mask.values.flatten()
mask_flat_arg = np.argwhere(mask_flat)
    if xgm_flat_arg.shape != mask_flat_arg.shape:
log.warning(f'{key}: XGM data and bunch pattern do not match.')
new_xgm_flat = np.ones(xgm_flat.shape)
new_xgm_flat[mask_flat_arg] = xgm_flat[xgm_flat_arg]
new_xgm = new_xgm_flat.reshape((xgm_arr.sizes['trainId'], bpt_npulses))
# create a dataset with new_xgm array masked by SASE 1 or SASE 3
xgm_dict = {}
if compute_sa1:
sa1_xgm = xr.DataArray(new_xgm, dims=['trainId', 'sa1_pId'],
coords={'trainId': xgm_arr.trainId,
'sa1_pId': np.arange(bpt_npulses)},
name=key.replace('XGM', 'SA1'))
sa1_xgm = sa1_xgm.where(mask_sa1, drop=True)
sa1_xgm = sa1_xgm.where(sa1_xgm != -1., drop=True)
# remove potential corrupted data:
xgm_dict[sa1_xgm.name] = sa1_xgm
if compute_sa3:
sa3_xgm = xr.DataArray(new_xgm, dims=['trainId', 'sa3_pId'],
coords={'trainId': xgm_arr.trainId,
'sa3_pId': np.arange(bpt_npulses)},
name=key.replace('XGM', 'SA3'))
sa3_xgm = sa3_xgm.where(mask_sa3, drop=True)
# remove potential corrupted data:
sa3_xgm = sa3_xgm.where(sa3_xgm != -1., drop=True)
xgm_dict[sa3_xgm.name] = sa3_xgm
ds = xr.Dataset(xgm_dict)
return ds
def calibrate_xgm(run, data, xgm='SCS', plot=False):
"""
Calculates the calibration factor F between the photon flux (slow signal)
and the fast signal (pulse-resolved) of the sase 3 pulses. The calibrated
fast signal is equal to the uncalibrated one multiplied by F.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the digitizer data.
data: xarray Dataset
dataset containing the pulse-resolved sase 3 signal, e.g. 'SCS_SA3'
xgm: str
one in {'XTD10', 'SCS'}
plot: bool
If True, shows a plot of the photon flux, averaged fast signal and
calibrated fast signal.
Returns
-------
F: float
calibration factor F defined as:
calibrated XGM [microJ] = F * fast XGM array ('SCS_SA3' or 'XTD10_SA3')
Example
-------
>>> import toolbox_scs as tb
>>> import toolbox_scs.detectors as tbdet
>>> run, data = tb.load(900074, 69, ['SCS_XGM'])
>>> ds = tbdet.get_xgm(run, merge_with=data)
>>> F = tbdet.calibrate_xgm(run, ds, plot=True)
>>> # Add calibrated XGM to the dataset:
>>> ds['SCS_SA3_uJ'] = F * ds['SCS_SA3']
"""
run_mnemonics = mnemonics_for_run(run)
# check if bunch pattern table exists
if 'bunchPatternTable' in data:
bpt = data['bunchPatternTable']
elif 'bunchPatternTable' in run_mnemonics:
bpt = run.get_array(*run_mnemonics['bunchPatternTable'].values())
elif 'bunchPatternTable_SA3' in run_mnemonics:
bpt = run.get_array(*run_mnemonics['bunchPatternTable_SA3'].values())
else:
raise ValueError('Bunch pattern missing. Cannot calibrate XGM.')
mask_sa3 = is_sase_3(bpt.sel(trainId=data.trainId))
npulses_sa3 = np.unique(mask_sa3.sum(dim='pulse_slot'))
if len(npulses_sa3) == 1:
npulses_sa3 = npulses_sa3[0]
else:
log.warning('change of pulse pattern in sase3 during the run.')
npulses_sa3 = max(npulses_sa3)
mask_sa1 = is_sase_1(bpt.sel(trainId=data.trainId))
npulses_sa1 = np.unique(mask_sa1.sum(dim='pulse_slot'))
if len(npulses_sa1) == 1:
npulses_sa1 = npulses_sa1[0]
else:
log.warning('change of pulse pattern in sase1 during the run.')
npulses_sa1 = max(npulses_sa1)
pflux_key = f'{xgm}_photonFlux'
if pflux_key in data:
pflux = data[pflux_key]
else:
pflux = run.get_array(*run_mnemonics[pflux_key].values())
pflux = pflux.sel(trainId=data.trainId)
pflux_sa3 = (npulses_sa1 + npulses_sa3) * pflux / npulses_sa3
avg_fast = data[f'{xgm}_SA3'].rolling(trainId=200).mean().mean(axis=1)
calib = np.nanmean(pflux_sa3.values / avg_fast.values)
if plot:
plot_xgm_calibration(xgm, pflux, pflux_sa3, avg_fast, calib)
return calib
def plot_xgm_calibration(xgm, pflux, pflux_sa3, avg_fast, calib):
plt.figure(figsize=(8, 4))
plt.plot(pflux, label='photon flux all')
plt.plot(pflux_sa3, label='photon flux SA3')
plt.plot(avg_fast, label='avg pulsed XGM')
plt.plot(avg_fast*calib, label='calibrated avg pulsed XGM')
plt.title(f'calibrated XGM = {xgm}_SA3 * {calib:.3e}')
plt.xlabel('train number')
plt.ylabel(r'Pulse energy [$\mu$J]')
plt.legend()
return
# -*- coding: utf-8 -*-
"""
Toolbox for SCS.
Various utility functions to quickly process data measured at the SCS
instruments.
Copyright (2019) SCS Team.
"""
import logging
import os
import numpy as np
import xarray as xr
import extra_data as ed
from extra_data.read_machinery import find_proposal
from .constants import mnemonics as _mnemonics
from .mnemonics_machinery import mnemonics_for_run
from .util.exceptions import ToolBoxValueError
import toolbox_scs.detectors as tbdet
from .misc.bunch_pattern import (npulses_has_changed,
get_unique_sase_pId, load_bpt)
__all__ = [
'concatenateRuns',
'find_run_path',
'get_array',
'load',
'open_run',
'run_by_path',
'load_run_values',
'check_data_rate',
]
log = logging.getLogger(__name__)
def load(proposalNB=None, runNB=None,
fields=None,
data='all',
display=False,
validate=False,
subset=None,
rois={},
extract_digitizers=True,
extract_xgm=True,
extract_bam=True,
bunchPattern='sase3',
parallelize=True,
):
"""
Load a run and extract the data. Output is an xarray with aligned
trainIds.
Parameters
----------
proposalNB: str, int
proposal number e.g. 'p002252' or 2252
runNB: str, int
run number as integer
fields: str, list of str, list of dict
        list of mnemonics to load specific data such as "fastccd",
        "SCS_XGM", or dictionaries defining a custom mnemonic such as
{"extra": {'source': 'SCS_CDIFFT_MAG/SUPPLY/CURRENT',
'key': 'actual_current.value',
'dim': None}}
data: str or Sequence of str
'raw', 'proc' (processed), or any other location relative to the
        proposal path with data per run to access. May also be 'all'
        (both 'raw' and 'proc') or a sequence of strings to load data
        from several locations, with later locations overwriting sources
        present in earlier ones. The default is 'all'.
display: bool
whether to show the run.info or not
validate: bool
whether to run extra-data-validate or not
subset: slice or extra_data.by_index or numpy.s_
        a subset of trains that can be loaded with extra_data.by_index[:5] for
the first 5 trains. If None, all trains are retrieved.
rois: dict
        a dictionary of mnemonics with a list of rois definition and
the desired names, for example:
{'fastccd': {'ref': {'roi': by_index[730:890, 535:720],
'dim': ['ref_x', 'ref_y']},
'sam': {'roi':by_index[1050:1210, 535:720],
'dim': ['sam_x', 'sam_y']}}}
extract_digitizers: bool
If True, extracts the peaks from digitizer variables and aligns the
pulse Id according to the fadc_bp bunch pattern.
extract_xgm: bool
If True, extracts the values from XGM variables (e.g. 'SCS_SA3',
'XTD10_XGM') and aligns the pulse Id with the sase1 / sase3 bunch
pattern.
extract_bam: bool
If True, extracts the values from BAM variables (e.g. 'BAM1932M')
and aligns the pulse Id with the sase3 bunch pattern.
bunchPattern: str
bunch pattern used to extract the Fast ADC pulses.
A string or a dict as in::
{'FFT_PD2': 'sase3', 'ILH_I0': 'scs_ppl'}
Ignored if extract_digitizers=False.
parallelize: bool
from EXtra-Data: enable or disable opening files in parallel.
Particularly useful if creating child processes is not allowed
(e.g. in a daemonized multiprocessing.Process).
Returns
-------
run, ds: DataCollection, xarray.Dataset
extra_data DataCollection of the proposal and run number and an
xarray Dataset with aligned trainIds and pulseIds
Example
-------
>>> import toolbox_scs as tb
>>> run, data = tb.load(2212, 208, ['SCS_SA3', 'MCP2apd', 'nrj'])
"""
run = ed.open_run(proposalNB, runNB, data=data, parallelize=parallelize)
if subset is not None:
run = run.select_trains(subset)
if fields is None:
return run, xr.Dataset()
if isinstance(fields, str):
fields = [fields]
if validate:
# get_ipython().system('extra-data-validate ' + runFolder)
pass
if display:
run.info()
data_arrays = []
run_mnemonics = mnemonics_for_run(run)
for f in fields:
        if isinstance(f, dict):
            # extracting mnemonic defined on the spot
if len(f.keys()) > 1:
print('Loading only one "on-the-spot" mnemonic at a time, '
'skipping all others !')
k = list(f.keys())[0]
v = f[k]
else:
            # extracting mnemonic from the table
if f in run_mnemonics:
v = run_mnemonics[f]
k = f
else:
if f in _mnemonics:
log.warning(f'Mnemonic "{f}" not found in run. Skipping!')
print(f'Mnemonic "{f}" not found in run. Skipping!')
else:
                    log.warning(f'Unknown mnemonic "{f}". Skipping!')
                    print(f'Unknown mnemonic "{f}". Skipping!')
continue
if k in [d.name for d in data_arrays]:
continue # already loaded, skip
if display:
print(f'Loading {k}')
if v['source'] not in run.all_sources:
log.warning(f'Source {v["source"]} not found in run. Skipping!')
print(f'Source {v["source"]} not found in run. Skipping!')
continue
if k == 'MTE3':
arr = run.get_array(v['source'], v['key'],
extra_dims=v['dim'], name=k)
tpi = run.get_array('SCS_XTD10_TPI/DCTRL/SHUTTER',
'hardwareStatusBitField.value', name=k)
tpi_open = iter(tpi.trainId[tpi & (1 << 12) > 0])
mte3_tids = []
last = 0
current = next(tpi_open, None)
if current is None:
data_arrays.append(arr)
else:
for tid in arr.trainId:
while current < tid:
last = current
current = next(tpi_open, tid)
mte3_tids.append(last)
data_arrays.append(
arr.assign_coords(trainId=np.array(mte3_tids, dtype='u8')))
elif k not in rois:
# no ROIs selection, we read everything
arr = run.get_array(v['source'], v['key'],
extra_dims=v['dim'], name=k)
if len(arr) == 0:
log.warning(f'Empty array for {f}: {v["source"]}, {v["key"]}. '
'Skipping!')
print(f'Empty array for {f}: {v["source"]}, {v["key"]}. '
'Skipping!')
continue
data_arrays.append(arr)
else:
# ROIs selection, for each ROI we select a region of the data and
# save it with new name and dimensions
for nk, nv in rois[k].items():
arr = run.get_array(v['source'], v['key'],
extra_dims=nv['dim'],
roi=nv['roi'],
name=nk)
if len(arr) == 0:
log.warning(f'Empty array for {f}: {v["source"]}, '
f'{v["key"]}. Skipping!')
print(f'Empty array for {f}: {v["source"]}, {v["key"]}. '
'Skipping!')
continue
data_arrays.append(arr)
# Check missing trains
for arr in data_arrays:
if 'hRIXS' in arr.name:
continue
rate = arr.sizes["trainId"] / len(run.train_ids)
if rate < 0.95:
log.warning(f'{arr.name}: only {rate*100:.1f}% of trains '
f'({arr.sizes["trainId"]} out of '
f'{len(run.train_ids)}) contain data.')
ds = xr.merge(data_arrays, join='inner')
ds.attrs['runNB'] = runNB
if isinstance(proposalNB, int):
proposalNB = 'p{:06d}'.format(proposalNB)
ds.attrs['proposal'] = find_proposal(proposalNB)
ds.attrs['data'] = data
if extract_digitizers:
bp = bunchPattern
for k, v in run_mnemonics.items():
if k not in ds or v.get('extract') != 'peaks':
continue
if isinstance(bunchPattern, dict):
bp = bunchPattern.get(k)
if bp is None:
continue
ds = tbdet.get_digitizer_peaks(proposalNB, runNB,
mnemonic=k, merge_with=ds, bunchPattern=bp)
if extract_xgm:
for k, v in run_mnemonics.items():
if k not in ds or v.get('extract') != 'XGM':
continue
ds = tbdet.get_xgm(run, mnemonics=k, merge_with=ds)
if extract_bam:
for k, v in run_mnemonics.items():
if k not in ds or v.get('extract') != 'BAM':
continue
ds = tbdet.get_bam(run, mnemonics=k, merge_with=ds)
return run, ds
def run_by_path(path):
"""
    Return the run at the specified path.
Wraps the extra_data RunDirectory routine, to ease its use for the
scs-toolbox user.
Parameters
----------
path: str
path to the run directory
Returns
-------
run : extra_data.DataCollection
DataCollection object containing information about the specified
run. Data can be loaded using built-in class methods.
"""
return ed.RunDirectory(path)
def find_run_path(proposalNB, runNB, data='raw'):
"""
Return the run path given the specified proposal and run numbers.
Parameters
----------
proposalNB: (str, int)
proposal number e.g. 'p002252' or 2252
runNB: (str, int)
run number as integer
data: str
'raw', 'proc' (processed) or 'all' (both 'raw' and 'proc') to access
data from either or both of those folders. If 'all' is used, sources
present in 'proc' overwrite those in 'raw'. The default is 'raw'.
Returns
-------
path: str
The run path.
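    Example
    -------
    (Illustrative; the returned path depends on the facility file system
    and the proposal and run numbers are placeholders.)
    >>> import toolbox_scs as tb
    >>> path = tb.find_run_path(2212, 208, 'raw')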
"""
if isinstance(runNB, int):
runNB = 'r{:04d}'.format(runNB)
if isinstance(proposalNB, int):
proposalNB = 'p{:06d}'.format(proposalNB)
return os.path.join(find_proposal(proposalNB), data, runNB)
def open_run(proposalNB, runNB, subset=None, **kwargs):
"""
Get extra_data.DataCollection in a given proposal.
Wraps the extra_data open_run routine and adds subset selection, out of
convenience for the toolbox user. More information can be found in the
extra_data documentation.
Parameters
----------
proposalNB: (str, int)
proposal number e.g. 'p002252' or 2252
runNB: (str, int)
run number e.g. 17 or 'r0017'
subset: slice or extra_data.by_index or numpy.s_
        a subset of trains that can be loaded with extra_data.by_index[:5] for
the first 5 trains. If None, all trains are retrieved.
**kwargs
--------
data: str
default -> 'raw'
include: str
default -> '*'
Returns
-------
run : extra_data.DataCollection
DataCollection object containing information about the specified
run. Data can be loaded using built-in class methods.
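    Example
    -------
    (Illustrative sketch; proposal and run numbers are placeholders.)
    >>> import toolbox_scs as tb
    >>> import extra_data as ed
    >>> run = tb.open_run(2212, 208, subset=ed.by_index[:100])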
"""
run = ed.open_run(proposalNB, runNB, **kwargs)
if subset is not None:
run = run.select_trains(subset)
return run
def get_array(run=None, mnemonic=None, stepsize=None,
subset=None, data='raw',
proposalNB=None, runNB=None):
"""
Loads one data array for the specified mnemonic and rounds its values to
integer multiples of stepsize for consistent grouping (except for
stepsize=None).
Returns a 1D array of ones if mnemonic is set to None.
Parameters
----------
run: extra_data.DataCollection
DataCollection containing the data.
Used if proposalNB and runNB are None.
mnemonic: str
Identifier of a single item in the mnemonic collection. None creates a
dummy 1D array of ones with length equal to the number of trains.
stepsize : float
nominal stepsize of the array data - values will be rounded to integer
multiples of this value.
subset: slice or extra_data.by_index or numpy.s_
        a subset of trains that can be loaded with extra_data.by_index[:5] for
the first 5 trains. If None, all trains are retrieved.
data: str or Sequence of str
'raw', 'proc' (processed), or any other location relative to the
        proposal path with data per run to access. May also be 'all'
        (both 'raw' and 'proc') or a sequence of strings to load data
from several locations, with later locations overwriting sources
present in earlier ones. The default is 'raw'.
proposalNB: (str, int)
proposal number e.g. 'p002252' or 2252.
runNB: (str, int)
run number e.g. 17 or 'r0017'.
Returns
-------
data : xarray.DataArray
xarray DataArray containing rounded array values using the trainId as
coordinate.
Raises
------
ToolBoxValueError: Exception
Toolbox specific exception, indicating a non-valid mnemonic entry
Example
-------
>>> import toolbox_scs as tb
>>> run = tb.open_run(2212, 235)
>>> mnemonic = 'PP800_PhaseShifter'
>>> data_PhaseShifter = tb.get_array(run, mnemonic, 0.5)
"""
if run is None:
run = open_run(proposalNB, runNB, subset, data=data)
else:
if not isinstance(run, ed.DataCollection):
raise TypeError(f'run argument has type {type(run)} but '
'expected type is extra_data.DataCollection')
if subset is not None:
run = run.select_trains(subset)
run_mnemonics = mnemonics_for_run(run)
try:
if mnemonic is None:
da = xr.DataArray(
np.ones(len(run.train_ids), dtype=np.int16),
dims=['trainId'], coords={'trainId': run.train_ids})
elif mnemonic in run_mnemonics:
mnem = run_mnemonics[mnemonic]
da = run.get_array(mnem['source'], mnem['key'],
extra_dims=mnem['dim'], name=mnemonic)
else:
raise ToolBoxValueError("Invalid mnemonic", mnemonic)
if stepsize is not None:
da = stepsize * np.round(da / stepsize)
log.debug(f"Got data for {mnemonic}")
except ToolBoxValueError as err:
log.error(f"{err.message}")
raise
return da
def load_run_values(prop_or_run, runNB=None, which='mnemonics'):
"""
    Load the run value for each mnemonic whose source is a CONTROL
source (see extra-data DataCollection.get_run_value() for details)
Parameters
----------
prop_or_run: extra_data DataCollection or int
The run (DataCollection) to check for mnemonics.
Alternatively, the proposal number (int), for which the runNB
is also required.
runNB: int
The run number. Only used if the first argument is the proposal
number.
which: str
'mnemonics' or 'all'. If 'mnemonics', only the run values for the
        ToolBox mnemonics are retrieved. If 'all', a compiled dictionary
of all control sources run values is returned.
Output
------
    run_values: a dictionary containing the mnemonic or all run values.
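    Example
    -------
    (Illustrative sketch; proposal and run numbers are placeholders.)
    >>> import toolbox_scs as tb
    >>> run_values = tb.load_run_values(2212, 208)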
"""
if which not in ['mnemonics', 'all']:
raise ValueError('`which` should be either "mnemonics" or "all"')
run = prop_or_run
if runNB is not None:
run = open_run(prop_or_run, runNB)
if which == 'all':
run_values = {}
for c in run.control_sources:
v = run.get_run_values(c)
run_values[c] = v
return run_values
mnemos = mnemonics_for_run(run)
run_values = {}
for m in mnemos:
val = None
try:
if mnemos[m]['source'] in run.control_sources:
val = run.get_run_value(mnemos[m]['source'],
mnemos[m]['key'])
except Exception as e:
log.info(f'Error while retrieving {m} mnemonic: {e}')
continue
run_values[m] = val
return run_values
def concatenateRuns(runs):
""" Sorts and concatenate a list of runs with identical data variables
along the trainId dimension.
Input:
runs: (list) the xarray Datasets to concatenate
Output:
a concatenated xarray Dataset
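    Example:
        (Illustrative sketch; run numbers are placeholders.)
        >>> import toolbox_scs as tb
        >>> _, ds1 = tb.load(2212, 208, 'SCS_SA3')
        >>> _, ds2 = tb.load(2212, 209, 'SCS_SA3')
        >>> ds = tb.concatenateRuns([ds1, ds2])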
"""
firstTid = {i: int(run.trainId[0].values) for i, run in enumerate(runs)}
orderedDict = dict(sorted(firstTid.items(), key=lambda t: t[1]))
orderedRuns = [runs[i] for i in orderedDict]
keys = orderedRuns[0].keys()
for run in orderedRuns[1:]:
if run.keys() != keys:
print('data fields between different runs are not identical. '
'Cannot combine runs.')
return
result = xr.concat(orderedRuns, dim='trainId')
for k in orderedRuns[0].attrs.keys():
result.attrs[k] = [run.attrs[k] for run in orderedRuns]
return result
def check_data_rate(run, fields=None):
"""
Calculates the fraction of train ids that contain data in a run.
Parameters
----------
run: extra_data DataCollection
the DataCollection associated to the data.
fields: str, list of str or dict
mnemonics to check. If None, all mnemonics in the run are checked.
        A custom mnemonic can be defined with a dictionary: {'extra':
{'source': 'SCS_CDIFFT_MAG/SUPPLY/CURRENT', 'key':
'actual_current.value'}}
Output
------
    ret: dictionary
        dictionary with mnemonics as keys and fraction of train ids
that contain data as values.
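    Example
    -------
    (Illustrative sketch; proposal and run numbers are placeholders.)
    >>> import toolbox_scs as tb
    >>> run = tb.open_run(2212, 208)
    >>> rates = tb.check_data_rate(run, fields=['SCS_XGM', 'nrj'])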
"""
run_mnemonics = mnemonics_for_run(run)
if fields is None:
fields = run_mnemonics
fields = [fields] if isinstance(fields, str) else fields
ret = {}
for f in fields:
if isinstance(f, dict):
name = list(f.keys())[0]
val = f[name]
f = name
elif f not in run_mnemonics:
log.warning(f'mnemonic {f} not found. Skipping!')
continue
else:
val = run_mnemonics[f]
counts = run[val['source']][val['key']].data_counts(False)
npulses = counts.max()
if npulses == 0: # (only missing data)
rate = 0.
else:
counts = counts / npulses # to only count trains and not pulses
rate = counts.sum() / len(run.train_ids)
ret[f] = rate
return ret