%% Cell type:markdown id: tags:
# DSSC Characterize Dark Images #
Author: S. Hauf, Version: 0.1
The following code analyzes a set of dark images taken with the DSSC detector to deduce detector offsets and noise. Data for the detector are taken from a single run, as the DSSC does not acquire multiple gain stages.
The notebook explicitly does what pyDetLib provides in its offset calculation method for streaming data.
%% Cell type:code id: tags:
``` python
cluster_profile = "noDB" # The ipcluster profile to use
in_folder = "/gpfs/exfel/exp/SQS/202131/p900210/raw" # path to input data, required
out_folder = "/gpfs/exfel/data/scratch/samartse/data/DSSC" # path to output to, required
metadata_folder = "" # Directory containing calibration_metadata.yml when run by xfel-calibrate
sequences = [0] # sequence files to evaluate.
modules = [-1] # modules to run for
run = 20 # run number in which data was recorded, required
karabo_id = "SQS_DET_DSSC1M-1" # karabo ID of the detector
karabo_da = ['-1'] # a list of data aggregator names, default ['-1'] selects all data aggregators
receiver_id = "{}CH0" # inset for receiver devices
path_template = 'RAW-R{:04d}-{}-S{:05d}.h5' # the template to use to access data
h5path = '/INSTRUMENT/{}/DET/{}:xtdf/image' # path in the HDF5 file to images
h5path_idx = '/INDEX/{}/DET/{}:xtdf/image' # path in the HDF5 file to the image index data
slow_data_pattern = 'RAW-R{}-DA{}-S00000.h5'
use_dir_creation_date = True # use the dir creation date for determining the creation time
cal_db_interface = "tcp://max-exfl-cal001:8020" # the database interface to use
cal_db_timeout = 3000000 # timeout on caldb requests
local_output = True # output constants locally
db_output = False # output constants to database
mem_cells = 0 # number of memory cells used, set to 0 to automatically infer
bias_voltage = 100 # detector bias voltage
rawversion = 2 # RAW file format version
thresholds_offset_sigma = 3. # thresholds in terms of n sigma noise for offset deduced bad pixels
thresholds_offset_hard = [4, 125] # thresholds in absolute ADU terms for offset deduced bad pixels,
# minimal threshold at 4 is set at hardware level, DSSC full range 0-511
thresholds_noise_sigma = 3. # thresholds in terms of n sigma noise for noise deduced bad pixels
thresholds_noise_hard = [0.001, 3] # thresholds in absolute ADU terms for noise deduced bad pixels
offset_numpy_algorithm = "mean"
high_res_badpix_3d = False # set this to True if you need high-resolution 3d bad pixel plots. Runtime: ~ 1h
slow_data_aggregators = [1,1,1,1] # quadrant/aggregator
slow_data_path = 'SQS_NQS_DSSC/FPGA/PPT_Q'
operation_mode = '' # Detector operation mode, optional
```
%% Cell type:code id: tags:
``` python
import os
import warnings
# imports and things that do not usually need to be changed
from datetime import datetime
warnings.filterwarnings('ignore')
from collections import OrderedDict
import h5py
import matplotlib
from ipyparallel import Client
from IPython.display import Latex, Markdown, display
matplotlib.use('agg')
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import tabulate
import yaml
from iCalibrationDB import Conditions, Constants, Detectors, Versions
from cal_tools.dssclib import get_dssc_ctrl_data, get_pulseid_checksum
from cal_tools.enums import BadPixels
from cal_tools.plotting import (
create_constant_overview,
plot_badpix_3d,
show_overview,
show_processed_modules,
)
from cal_tools.tools import (
get_dir_creation_date,
get_from_db,
get_notebook_name,
get_pdu_from_db,
get_random_db_interface,
get_report,
map_gain_stages,
parse_runs,
run_prop_seq_from_path,
save_const_to_h5,
send_to_db,
)
view = Client(profile=cluster_profile)[:]
view.use_dill()
# make sure a cluster is running with ipcluster start --n=32, give it a while to start
h5path = h5path.format(karabo_id, receiver_id)
h5path_idx = h5path_idx.format(karabo_id, receiver_id)
gain_names = ['High', 'Medium', 'Low']
if karabo_da[0] == '-1':
if modules[0] == -1:
modules = list(range(16))
karabo_da = ["DSSC{:02d}".format(i) for i in modules]
else:
modules = [int(x[-2:]) for x in karabo_da]
max_cells = mem_cells
offset_runs = OrderedDict()
offset_runs["high"] = run
creation_time=None
if use_dir_creation_date:
creation_time = get_dir_creation_date(in_folder, run)
print(f"Using {creation_time} as creation time of constant.")
run, prop, seq = run_prop_seq_from_path(in_folder)
dinstance = "DSSC1M1"
print(f"Detector in use is {karabo_id}")
cal_db_interface = get_random_db_interface(cal_db_interface)
```
%% Cell type:code id: tags:
``` python
print("Parameters are:")
print(f"Proposal: {prop}")
print(f"Memory cells: {mem_cells}/{max_cells}")
print("Runs: {}".format([ v for v in offset_runs.values()]))
print(f"Sequences: {sequences}")
print(f"Using DB: {db_output}")
print(f"Input: {in_folder}")
print(f"Output: {out_folder}")
print(f"Bias voltage: {bias_voltage}V")
file_loc = f'proposal:{prop} runs:{[ v for v in offset_runs.values()][0]}'
report = get_report(metadata_folder)
```
%% Cell type:markdown id: tags:
The following lines create a queue of files which will then be processed module-parallel, distinguishing between different gains.
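For orientation, a minimal sketch of how such a queue is consumed (assuming the nested mapping of per-module file queues produced by `map_gain_stages` in the cell below):

``` python
# Hedged sketch: gain_mapped_files maps a gain stage to per-module queues ("QxMy" -> Queue of
# HDF5 sequence-file paths); draining a queue yields the files each module worker would process.
for gain, mapped_files in gain_mapped_files.items():
    for qm, file_queue in mapped_files.items():
        while not file_queue.empty():
            print(f"{gain}/{qm}: would process {file_queue.get()}")
```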
%% Cell type:code id: tags:
``` python
# set everything up filewise
os.makedirs(out_folder, exist_ok=True)
gmf = map_gain_stages(in_folder, offset_runs, path_template, karabo_da, sequences)
gain_mapped_files, total_sequences, total_file_size = gmf
print(f"Will process a total of {total_sequences} file.")
```
%% Cell type:markdown id: tags:
## Calculate Offsets, Noise and Thresholds ##
The calculation is performed per-pixel and per-memory-cell. Offsets are the mean (or median, depending on `offset_numpy_algorithm`) of a set of dark images taken at a given gain; noise is the corresponding standard deviation.
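As a minimal illustration of the per-cell statistics computed below (toy shapes, not the actual detector geometry):

``` python
import numpy as np

# Toy dark data for one module: (x, y, frames), with a repeating memory-cell pattern.
rng = np.random.default_rng(0)
im = rng.normal(40, 1.5, size=(64, 64, 400)).astype(np.float32)
cell_ids = np.tile(np.arange(100), 4)  # memory cell id of each frame
n_cells = 100

offset = np.zeros(im.shape[:2] + (n_cells,))
noise = np.zeros_like(offset)
for c in range(n_cells):
    sel = cell_ids == c
    offset[..., c] = np.mean(im[..., sel], axis=2)  # or np.median, cf. offset_numpy_algorithm
    noise[..., c] = np.std(im[..., sel], axis=2)
```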
%% Cell type:code id: tags:
``` python
import copy
from functools import partial
def characterize_module(cells, bp_thresh, rawversion, karabo_id, h5path, h5path_idx, inp):
import copy
import h5py
import numpy as np
from cal_tools.enums import BadPixels
from cal_tools.dssclib import get_num_cells
filename, channel = inp
h5path = h5path.format(channel)
h5path_idx = h5path_idx.format(channel)
if cells == 0:
cells = get_num_cells(filename, h5path)
if cells is None:
raise ValueError(f"ERROR! Empty image data file for channel {channel}")
print(f"Using {cells} memory cells")
pulseid_checksum = None
thresholds_offset_hard, thresholds_offset_sigma, thresholds_noise_hard, thresholds_noise_sigma = bp_thresh
infile = h5py.File(filename, "r")
if rawversion == 2:
count = np.squeeze(infile[f"{h5path_idx}/count"])
first = np.squeeze(infile[f"{h5path_idx}/first"])
last_index = int(first[count != 0][-1]+count[count != 0][-1])
first_index = int(first[count != 0][0])
else:
status = np.squeeze(infile[f"{h5path_idx}/status"])
if np.count_nonzero(status != 0) == 0:
return
last = np.squeeze(infile[f"{h5path_idx}/last"])
first = np.squeeze(infile[f"{h5path_idx}/first"])
last_index = int(last[status != 0][-1]) + 1
first_index = int(first[status != 0][0])
im = np.array(infile[f"{h5path}/data"][first_index:last_index,...])
cellIds = np.squeeze(infile[f"{h5path}/cellId"][first_index:last_index,...])
infile.close()
pulseid_checksum = get_pulseid_checksum(filename, h5path, h5path_idx)
im = im[:, 0, ...].astype(np.float32)
im = np.rollaxis(im, 2)
im = np.rollaxis(im, 2, 1)
mcells = cells
offset = np.zeros((im.shape[0], im.shape[1], mcells), dtype = np.float64)
noise = np.zeros((im.shape[0], im.shape[1], mcells), dtype = np.float64)
for cc in np.unique(cellIds[cellIds < mcells]):
cellidx = cellIds == cc
if offset_numpy_algorithm == "mean":
offset[...,cc] = np.mean(im[..., cellidx], axis=2)
else:
offset[...,cc] = np.median(im[..., cellidx], axis=2)
noise[...,cc] = np.std(im[..., cellidx], axis=2)
# bad pixels
bp = np.zeros(offset.shape, np.uint32)
# offset related bad pixels
offset_mn = np.nanmedian(offset, axis=(0,1))
offset_std = np.nanstd(offset, axis=(0,1))
bp[(offset < offset_mn-thresholds_offset_sigma*offset_std) |
(offset > offset_mn+thresholds_offset_sigma*offset_std)] |= BadPixels.OFFSET_OUT_OF_THRESHOLD.value
bp[(offset < thresholds_offset_hard[0]) | (offset > thresholds_offset_hard[1])] |= BadPixels.OFFSET_OUT_OF_THRESHOLD.value
bp[~np.isfinite(offset)] |= BadPixels.OFFSET_NOISE_EVAL_ERROR.value
# noise related bad pixels
noise_mn = np.nanmedian(noise, axis=(0,1))
noise_std = np.nanstd(noise, axis=(0,1))
bp[(noise < noise_mn-thresholds_noise_sigma*noise_std) |
(noise > noise_mn+thresholds_noise_sigma*noise_std)] |= BadPixels.NOISE_OUT_OF_THRESHOLD.value
bp[(noise < thresholds_noise_hard[0]) | (noise > thresholds_noise_hard[1])] |= BadPixels.NOISE_OUT_OF_THRESHOLD.value
bp[~np.isfinite(noise)] |= BadPixels.OFFSET_NOISE_EVAL_ERROR.value
return offset, noise, bp, cells, pulseid_checksum
offset_g = OrderedDict()
noise_g = OrderedDict()
gain_g = OrderedDict()
badpix_g = OrderedDict()
gg = 0
start = datetime.now()
all_cells = []
checksums = {}
try:
tGain, encodedGain, operatingFreq = get_dssc_ctrl_data(in_folder + "/r{:04d}/".format(offset_runs["high"]),
slow_data_pattern,
slow_data_aggregators,
offset_runs["high"], slow_data_path)
except IOError:
print("ERROR: Couldn't access slow data to read tGain, encodedGain, and operatingFreq \n")
for gain, mapped_files in gain_mapped_files.items():
inp = []
dones = []
for i in modules:
qm = "Q{}M{}".format(i//4 +1, i % 4 + 1)
if qm in mapped_files and not mapped_files[qm].empty():
fname_in = mapped_files[qm].get()
print("Process file: ", fname_in)
dones.append(mapped_files[qm].empty())
else:
continue
inp.append((fname_in, i))
p = partial(characterize_module, max_cells,
(thresholds_offset_hard, thresholds_offset_sigma,
thresholds_noise_hard, thresholds_noise_sigma), rawversion, karabo_id, h5path, h5path_idx)
results = list(map(p, inp))
for ii, r in enumerate(results):
i = modules[ii]
offset, noise, bp, thiscell, pulseid_checksum = r
all_cells.append(thiscell)
qm = "Q{}M{}".format(i//4 +1, i % 4 + 1)
if qm not in offset_g:
offset_g[qm] = np.zeros((offset.shape[0], offset.shape[1], offset.shape[2]))
noise_g[qm] = np.zeros_like(offset_g[qm])
badpix_g[qm] = np.zeros_like(offset_g[qm], np.uint32)
checksums[qm] = pulseid_checksum
offset_g[qm][...] = offset
noise_g[qm][...] = noise
badpix_g[qm][...] = bp
gg +=1
if len(all_cells) > 0:
max_cells = np.max(all_cells)
print(f"Using {max_cells} memory cells")
else:
raise ValueError("0 processed memory cells. No raw data available.")
```
%% Cell type:code id: tags:
``` python
# TODO: add db_module when received from myMDC
# Create the modules dict of karabo_das and PDUs
qm_dict = OrderedDict()
for i, k_da in zip(modules, karabo_da):
qm = f"Q{i//4+1}M{i%4+1}"
qm_dict[qm] = {"karabo_da": k_da,
"db_module": ""}
```
%% Cell type:code id: tags:
``` python
# Retrieve existing constants for comparison
clist = ["Offset", "Noise"]
old_const = {}
old_mdata = {}
print('Retrieve pre-existing constants for comparison.')
for qm in offset_g.keys():
old_const[qm] = {}
old_mdata[qm] = {}
qm_db = qm_dict[qm]
karabo_da = qm_db["karabo_da"]
    for const in clist:
        dconst = getattr(Constants.DSSC, const)()
condition = Conditions.Dark.DSSC(memory_cells=max_cells,
bias_voltage=bias_voltage,
pulseid_checksum=checksums[qm],
acquisition_rate=operatingFreq[qm],
target_gain=tGain[qm],
encoded_gain=encodedGain[qm])
# This should be used in case of running notebook
# by a different method other than myMDC which already
# sends CalCat info.
# TODO: Set db_module to "" by default in the first cell
if not qm_db["db_module"]:
qm_db["db_module"] = get_pdu_from_db(karabo_id, karabo_da, dconst,
condition, cal_db_interface,
snapshot_at=creation_time)[0]
data, mdata = get_from_db(karabo_id, karabo_da,
dconst,
condition,
None,
cal_db_interface, creation_time=creation_time,
verbosity=2, timeout=cal_db_timeout)
old_const[qm][const] = data
if mdata is None or data is None:
old_mdata[qm][const] = {
"timestamp": "Not found",
"filepath": None,
"h5path": None
}
else:
old_mdata[qm][const] = {
"timestamp": mdata.calibration_constant_version.begin_at.isoformat(),
"filepath": os.path.join(
mdata.calibration_constant_version.hdf5path,
mdata.calibration_constant_version.filename,
),
"h5path": mdata.calibration_constant_version.h5path,
}
with open(f"{out_folder}/module_metadata_{qm}.yml", "w") as fd:
yaml.safe_dump(
{"module": qm, "pdu": qm_db["db_module"], "old-constants": old_mdata[qm]},
fd,
)
```
%% Cell type:code id: tags:
``` python
res = OrderedDict()
for i in modules:
qm = f"Q{i//4+1}M{i%4+1}"
try:
res[qm] = {'Offset': offset_g[qm],
'Noise': noise_g[qm],
}
except Exception as e:
print(f"Error: No constants for {qm}: {e}")
```
%% Cell type:code id: tags:
``` python
# Push the same constant twice:
# once with the generated pulseID checksum setting for the offline calibration,
# and once without, for the online calibration, which doesn't have this pulseID checksum yet.
md = None
for dont_use_pulseIds in [True, False]:
for qm in res.keys():
karabo_da = qm_dict[qm]["karabo_da"]
db_module = qm_dict[qm]["db_module"]
for const in res[qm].keys():
dconst = getattr(Constants.DSSC, const)()
dconst.data = res[qm][const]
opfreq = None if dont_use_pulseIds else operatingFreq[qm]
targetgain = None if dont_use_pulseIds else tGain[qm]
encodedgain = None if dont_use_pulseIds else encodedGain[qm]
pidsum = None if dont_use_pulseIds else checksums[qm]
# set the operating condition
condition = Conditions.Dark.DSSC(memory_cells=max_cells,
bias_voltage=bias_voltage,
pulseid_checksum=pidsum,
acquisition_rate=opfreq,
target_gain=targetgain,
encoded_gain=encodedgain)
for parm in condition.parameters:
if parm.name == "Memory cells":
parm.lower_deviation = max_cells
parm.upper_deviation = 0
if db_output:
md = send_to_db(db_module, karabo_id, dconst, condition, file_loc, report,
cal_db_interface, creation_time=creation_time, timeout=cal_db_timeout)
            if local_output and dont_use_pulseIds: # Don't save the constant locally twice.
md = save_const_to_h5(db_module, karabo_id, dconst, condition,
dconst.data, file_loc, report,
creation_time, out_folder)
print(f"Calibration constant {const} is stored locally.\n")
if not dont_use_pulseIds:
print("Constants parameter conditions are:\n")
print(f"• memory_cells: {max_cells}\n• bias_voltage: {bias_voltage}\n"
f"• pulseid_checksum: {pidsum}\n• acquisition_rate: {opfreq}\n"
f"• target_gain: {targetgain}\n• encoded_gain: {encodedgain}\n"
f"• creation_time: {creation_time}\n")
```
%% Cell type:code id: tags:
``` python
mnames = []
for i in modules:
qm = f"Q{i//4+1}M{i % 4+1}"
display(Markdown(f'## Position of the module {qm} and its ASICs##'))
mnames.append(qm)
show_processed_modules(dinstance=dinstance, constants=None, mnames=mnames, mode="position")
```
%% Cell type:markdown id: tags:
## Single-Cell Overviews ##
Single-cell overviews allow identification of potential effects common to all memory cells, e.g. at the sensor level. Additionally, they serve as a first sanity check of expected behaviour, e.g. whether structuring at the ASIC level is visible in the offsets, while no other immediate artifacts should be visible.
%% Cell type:code id: tags:
``` python
cell = 9
gain = 0
out_folder = None
show_overview(res, cell, gain, out_folder=out_folder, infix="_{}".format(run))
```
%% Cell type:code id: tags:
``` python
cols = {BadPixels.NOISE_OUT_OF_THRESHOLD.value: (BadPixels.NOISE_OUT_OF_THRESHOLD.name, '#FF000080'),
BadPixels.OFFSET_NOISE_EVAL_ERROR.value: (BadPixels.OFFSET_NOISE_EVAL_ERROR.name, '#0000FF80'),
BadPixels.OFFSET_OUT_OF_THRESHOLD.value: (BadPixels.OFFSET_OUT_OF_THRESHOLD.name, '#00FF0080'),
BadPixels.OFFSET_OUT_OF_THRESHOLD.value | BadPixels.NOISE_OUT_OF_THRESHOLD.value: ('MIXED', '#DD00DD80')}
if high_res_badpix_3d:
display(Markdown("""
## Global Bad Pixel Behaviour ##
The following plots show the results of bad pixel evaluation for all evaluated memory cells.
Cells are stacked in the Z-dimension, while pixel values in x/y are rebinned by a factor of 2.
This suppresses isolated single bad pixels; hence, only bad pixels spanning at least 4 pixels in the x/y-plane, or across at least two memory cells, are indicated.
Colors encode the bad pixel type, or mixed type.
"""))
    # Set rebin_fac to 1 to avoid rebinning and losing the real bad pixel values (high resolution).
gain = 0
for mod, data in badpix_g.items():
plot_badpix_3d(data, cols, title=mod, rebin_fac=2)
plt.show()
```
%% Cell type:markdown id: tags:
## Aggregate values and per-cell behaviour ##
The following tables and plots give an overview of statistical aggregates for each constant, as well as the per-cell behaviour.
%% Cell type:code id: tags:
``` python
create_constant_overview(offset_g, "Offset (ADU)", max_cells, entries=1)
```
%% Cell type:code id: tags:
``` python
create_constant_overview(noise_g, "Noise (ADU)", max_cells, 0, 100, entries=1)
```
%% Cell type:code id: tags:
``` python
bad_pixel_aggregate_g = OrderedDict()
for m, d in badpix_g.items():
    bad_pixel_aggregate_g[m] = d.astype(bool).astype(np.float64)
create_constant_overview(bad_pixel_aggregate_g, "Bad pixel fraction", max_cells, entries=1)
```
%% Cell type:markdown id: tags:
## Summary tables ##
The following tables show summary information for the evaluated module. Values for currently evaluated constants are compared with values for pre-existing constants retrieved from the calibration database.
%% Cell type:code id: tags:
``` python
time_summary = []
for qm, qm_data in old_mdata.items():
time_summary.append(f"The following pre-existing constants are used for comparison for module {qm}:")
for const, const_data in qm_data.items():
time_summary.append(f"- {const} created at {const_data['timestamp']}")
display(Markdown("\n".join(time_summary)))
```
%% Cell type:code id: tags:
``` python
header = ['Parameter',
"New constant", "Old constant ",
"New constant", "Old constant ",
"New constant", "Old constant "]
for const in ['Offset', 'Noise']:
table = [['','High gain', 'High gain']]
for qm in res.keys():
data = np.copy(res[qm][const])
if old_const[qm][const] is not None:
dataold = np.copy(old_const[qm][const])
f_list = [np.nanmedian, np.nanmean, np.nanstd, np.nanmin, np.nanmax]
n_list = ['Median', 'Mean', 'Std', 'Min', 'Max']
for i, f in enumerate(f_list):
line = [n_list[i]]
line.append('{:6.1f}'.format(f(data[...,gain])))
if old_const[qm][const] is not None:
line.append('{:6.1f}'.format(f(dataold[...,gain])))
else:
line.append('-')
table.append(line)
display(Markdown('### {} [ADU], good and bad pixels ###'.format(const)))
md = display(Latex(tabulate.tabulate(table, tablefmt='latex', headers=header)))
```
......
%% Cell type:markdown id: tags:
# DSSC Offline Correction #
Author: European XFEL Detector Group, Version: 1.0
Offline Calibration for the DSSC Detector
%% Cell type:code id: tags:
``` python
cluster_profile = "noDB" # The ipcluster profile to use
in_folder = "/gpfs/exfel/exp/SQS/202131/p900210/raw" # path to input data, required
out_folder = "/gpfs/exfel/data/scratch/samartse/data/DSSC" # path to output to, required
sequences = [-1] # sequence files to evaluate.
modules = [-1] # modules to correct, set to -1 for all, range allowed
run = 20 # run to process, required
karabo_id = "SQS_DET_DSSC1M-1" # karabo ID of the detector
karabo_da = ['-1'] # a list of data aggregator names, default ['-1'] selects all data aggregators
receiver_id = "{}CH0" # inset for receiver devices
path_template = 'RAW-R{:04d}-{}-S{:05d}.h5' # the template to use to access data
h5path = 'INSTRUMENT/{}/DET/{}:xtdf/image' # path in the HDF5 file to images
h5path_idx = '/INDEX/{}/DET/{}:xtdf/image' # path in the HDF5 file to the image index data
slow_data_pattern = 'RAW-R{}-DA{}-S00000.h5'
use_dir_creation_date = True # use the creation date of the input dir for database queries
cal_db_interface = "tcp://max-exfl-cal001:8020#8025" # the database interface to use
cal_db_timeout = 300000 # in milliseconds
mem_cells = 0 # number of memory cells used, set to 0 to automatically infer
overwrite = True # set to True if existing data should be overwritten
max_pulses = 800 # maximum number of pulses per train
bias_voltage = 100 # detector bias voltage
sequences_per_node = 1 # number of sequence files per cluster node if run as slurm job, set to 0 to not run SLURM parallel
chunk_size_idim = 1 # chunking size of imaging dimension, adjust if user software is sensitive to this.
mask_noisy_asic = 0.25 # set to a value other than 0 and below 1 to mask entire ADC if fraction of noisy pixels is above
mask_cold_asic = 0.25 # mask cold ASICs if the fraction of pixels with negligible standard deviation is larger than this
noisy_pix_threshold = 1. # threshold above which a pixel is considered noisy.
geo_file = "/gpfs/exfel/data/scratch/xcal/dssc_geo_june19.h5" # detector geometry file
dinstance = "DSSC1M1"
slow_data_aggregators = [1,2,3,4] #quadrant/aggregator
slow_data_path = 'SQS_NQS_DSSC/FPGA/PPT_Q'
def balance_sequences(in_folder, run, sequences, sequences_per_node, karabo_da):
from xfel_calibrate.calibrate import balance_sequences as bs
return bs(in_folder, run, sequences, sequences_per_node, karabo_da)
```
%% Cell type:code id: tags:
``` python
# make sure a cluster is running with ipcluster start --n=32, give it a while to start
import os
import sys
from collections import OrderedDict
import h5py
import matplotlib
import numpy as np
matplotlib.use("agg")
import matplotlib.pyplot as plt
from ipyparallel import Client
from IPython.display import Latex, Markdown, display
print(f"Connecting to profile {cluster_profile}")
view = Client(profile=cluster_profile)[:]
view.use_dill()
from datetime import timedelta
from cal_tools.dssclib import get_dssc_ctrl_data, get_pulseid_checksum
from cal_tools.tools import (
get_constant_from_db,
get_dir_creation_date,
get_notebook_name,
map_modules_from_folder,
parse_runs,
run_prop_seq_from_path,
)
from dateutil import parser
from iCalibrationDB import Conditions, ConstantMetaData, Constants, Detectors, Versions
```
%% Cell type:code id: tags:
``` python
creation_time = None
if use_dir_creation_date:
creation_time = get_dir_creation_date(in_folder, run)
print(f"Using {creation_time} as creation time")
if sequences[0] == -1:
sequences = None
h5path = h5path.format(karabo_id, receiver_id)
h5path_idx = h5path_idx.format(karabo_id, receiver_id)
if karabo_da[0] == '-1':
if modules[0] == -1:
modules = list(range(16))
karabo_da = ["DSSC{:02d}".format(i) for i in modules]
else:
modules = [int(x[-2:]) for x in karabo_da]
print("Process modules: ",
', '.join([f"Q{x // 4 + 1}M{x % 4 + 1}" for x in modules]))
CHUNK_SIZE = 512
MAX_PAR = 32
if in_folder[-1] == "/":
in_folder = in_folder[:-1]
print(f"Outputting to {out_folder}")
if not os.path.exists(out_folder):
os.makedirs(out_folder)
elif not overwrite:
raise AttributeError("Output path exists! Exiting")
import warnings
warnings.filterwarnings('ignore')
print(f"Detector in use is {karabo_id}")
```
%% Cell type:code id: tags:
``` python
# set everything up filewise
mmf = map_modules_from_folder(in_folder, run, path_template, karabo_da, sequences)
mapped_files, mod_ids, total_sequences, sequences_qm, file_size = mmf
MAX_PAR = min(MAX_PAR, total_sequences)
```
%% Cell type:markdown id: tags:
## Processed Files ##
%% Cell type:code id: tags:
``` python
import copy
import tabulate
from IPython.display import HTML, Latex, Markdown, display
print(f"Processing a total of {total_sequences} sequence files in chunks of {MAX_PAR}")
table = []
mfc = copy.copy(mapped_files)
ti = 0
for k, files in mfc.items():
i = 0
while not files.empty():
f = files.get()
if i == 0:
table.append((ti, k, i, f))
else:
table.append((ti, "", i, f))
i += 1
ti += 1
if len(table):
md = display(Latex(tabulate.tabulate(table, tablefmt='latex', headers=["#", "module", "# module", "file"])))
# restore the queue
mmf = map_modules_from_folder(in_folder, run, path_template, karabo_da, sequences)
mapped_files, mod_ids, total_sequences, sequences_qm, file_size = mmf
```
%% Cell type:code id: tags:
``` python
import copy
from functools import partial
def correct_module(total_sequences, sequences_qm, karabo_id, dinstance, mask_noisy_asic,
mask_cold_asic, noisy_pix_threshold, chunksize, mem_cells, bias_voltage,
cal_db_timeout, creation_time, cal_db_interface, h5path, h5path_idx, inp):
import binascii
import copy
import struct
from hashlib import blake2b
import h5py
import numpy as np
from cal_tools.dssclib import (
get_dssc_ctrl_data,
get_num_cells,
get_pulseid_checksum,
)
from cal_tools.enums import BadPixels
from cal_tools.tools import get_constant_from_db_and_time
from iCalibrationDB import (
Conditions,
ConstantMetaData,
Constants,
Detectors,
Versions,
)
filename, filename_out, channel, karabo_da, qm, conditions = inp
# DSSC correction requires path without the leading "/"
if h5path[0] == '/':
h5path = h5path[1:]
if h5path_idx[0] == '/':
h5path_idx = h5path_idx[1:]
h5path = h5path.format(channel)
h5path_idx = h5path_idx.format(channel)
low_edges = None
hists_signal_low = None
high_edges = None
hists_signal_high = None
pulse_edges = None
err = None
offset_not_found = False
def get_num_cells(fname, h5path):
with h5py.File(fname, "r") as f:
cells = f[f"{h5path}/cellId"][()]
maxcell = np.max(cells)
options = [100, 200, 400, 500, 600, 700, 800]
dists = np.array([(o-maxcell) for o in options])
dists[dists<0] = 10000 # assure to always go higher
return options[np.argmin(dists)]
if mem_cells == 0:
mem_cells = get_num_cells(filename, h5path)
pulseid_checksum = get_pulseid_checksum(filename, h5path, h5path_idx)
print(f"Memcells: {mem_cells}")
condition = Conditions.Dark.DSSC(bias_voltage=bias_voltage, memory_cells=mem_cells,\
pulseid_checksum=pulseid_checksum,\
acquisition_rate=conditions['acquisition_rate'],\
target_gain=conditions['target_gain'],\
encoded_gain=conditions['encoded_gain'])
detinst = getattr(Detectors, dinstance)
device = getattr(detinst, qm)
with h5py.File(filename, "r") as infile:
y = infile[f"{h5path}/data"].shape[2]
x = infile[f"{h5path}/data"].shape[3]
offset, when = get_constant_from_db_and_time(karabo_id, karabo_da,
Constants.DSSC.Offset(),
condition,
None,
cal_db_interface,
creation_time=creation_time,
timeout=cal_db_timeout)
if offset is not None:
offset = np.moveaxis(np.moveaxis(offset[...], 2, 0), 2, 1)
else:
offset_not_found = True
print("No offset found in the database")
def copy_and_sanitize_non_cal_data(infile, outfile):
# these are touched in the correct function, do not copy them here
dont_copy = ["data"]
dont_copy = [h5path + "/{}".format(do)
for do in dont_copy]
# a visitor to copy everything else
def visitor(k, item):
if k not in dont_copy:
if isinstance(item, h5py.Group):
outfile.create_group(k)
elif isinstance(item, h5py.Dataset):
group = str(k).split("/")
group = "/".join(group[:-1])
infile.copy(k, outfile[group])
infile.visititems(visitor)
try:
with h5py.File(filename, "r") as infile:
with h5py.File(filename_out, "w") as outfile:
copy_and_sanitize_non_cal_data(infile, outfile)
# get indices of last images in each train
first_arr = np.squeeze(infile[f"{h5path_idx}/first"]).astype(np.int)
last_arr = np.concatenate((first_arr[1:], np.array([-1,]))).astype(np.int)
assert first_arr.size == last_arr.size
oshape = list(infile[f"{h5path}/data"].shape)
if len(oshape) == 4:
oshape = [oshape[0],]+oshape[2:]
chunks = (chunksize, oshape[1], oshape[2])
ddset = outfile.create_dataset(f"{h5path}/data",
oshape, chunks=chunks,
dtype=np.float32,
fletcher32=True)
mdset = outfile.create_dataset(f"{h5path}/mask",
oshape, chunks=chunks,
dtype=np.uint32,
compression="gzip",
compression_opts=1,
shuffle=True,
fletcher32=True)
for train in range(first_arr.size):
first = first_arr[train]
last = last_arr[train]
if first == last:
continue
data = np.squeeze(infile[f"{h5path}/data"][first:last, ...].astype(np.float32))
cellId = np.squeeze(infile[f"{h5path}/cellId"][first:last, ...])
pulseId = np.squeeze(infile[f"{h5path}/pulseId"][first:last, ...])
if not offset_not_found:
data[...] -= offset[cellId,...]
if hists_signal_low is None:
pulseId = np.repeat(pulseId[:, None], data.shape[1], axis=1)
pulseId = np.repeat(pulseId[:,:,None], data.shape[2], axis=2)
bins = (55, int(pulseId.max()))
rnge = [[-5, 50], [0, int(pulseId.max())]]
hists_signal_low, low_edges, pulse_edges = np.histogram2d(data.flatten(),
pulseId.flatten(),
bins=bins,
range=rnge)
rnge = [[-5, 300], [0, pulseId.max()]]
hists_signal_high, high_edges, _ = np.histogram2d(data.flatten(),
pulseId.flatten(),
bins=bins,
range=rnge)
ddset[first:last, ...] = data
# find static and noisy values in dark images
data = infile[f"{h5path}/data"][last, ...].astype(np.float32)
bpix = np.zeros(oshape[1:], np.uint32)
dark_std = np.std(data, axis=0)
bpix[dark_std > noisy_pix_threshold] = BadPixels.NOISE_OUT_OF_THRESHOLD.value
for i in range(8):
for j in range(2):
count_noise = np.count_nonzero(bpix[i*64:(i+1)*64, j*64:(j+1)*64])
asic_std = np.std(data[:, i*64:(i+1)*64, j*64:(j+1)*64])
if mask_noisy_asic:
if count_noise/(64*64) > mask_noisy_asic:
bpix[i*64:(i+1)*64, j*64:(j+1)*64] = BadPixels.NOISY_ADC.value
if mask_cold_asic:
count_cold = np.count_nonzero(asic_std < 0.5)
if count_cold/(64*64) > mask_cold_asic:
bpix[i*64:(i+1)*64, j*64:(j+1)*64] = BadPixels.ASIC_STD_BELOW_NOISE.value
except Exception as e:
print(e)
success = False
reason = "Error"
err = e
if err is None and offset_not_found:
err = "Offset not found in database!. No offset correction applied."
return (hists_signal_low, hists_signal_high, low_edges, high_edges, pulse_edges, when, qm, err)
done = False
first_files = {}
inp = []
left = total_sequences
hists_signal_low = 0
hists_signal_high = 0
low_edges, high_edges, pulse_edges = None, None, None
tGain, encodedGain, operatingFreq = get_dssc_ctrl_data(in_folder\
+ "/r{:04d}/".format(run),\
slow_data_pattern,slow_data_aggregators, run, slow_data_path)
whens = []
qms = []
Errors = []
while not done:
dones = []
for i, k_da in zip(modules, karabo_da):
qm = "Q{}M{}".format(i//4 +1, i % 4 + 1)
if qm in mapped_files:
if not mapped_files[qm].empty():
fname_in = str(mapped_files[qm].get())
dones.append(mapped_files[qm].empty())
else:
print(f"{qm} file is missing")
continue
else:
print(f"Skipping {qm}")
continue
fout = os.path.abspath("{}/{}".format(out_folder, (os.path.split(fname_in)[-1]).replace("RAW", "CORR")))
first_files[i] = (fname_in, fout)
conditions = {}
conditions['acquisition_rate'] = operatingFreq[qm]
conditions['target_gain'] = tGain[qm]
conditions['encoded_gain'] = encodedGain[qm]
inp.append((fname_in, fout, i, k_da, qm, conditions))
if len(inp) >= min(MAX_PAR, left):
print(f"Running {len(inp)} tasks parallel")
p = partial(correct_module, total_sequences, sequences_qm,
karabo_id, dinstance, mask_noisy_asic, mask_cold_asic,
noisy_pix_threshold, chunk_size_idim, mem_cells,
bias_voltage, cal_db_timeout, creation_time, cal_db_interface,
h5path, h5path_idx)
r = view.map_sync(p, inp)
#r = list(map(p, inp))
inp = []
left -= MAX_PAR
for rr in r:
if rr is not None:
hl, hh, low_edges, high_edges, pulse_edges, when, qm, err = rr
whens.append(when)
qms.append(qm)
Errors.append(err)
if hl is not None: # any one being None will also make the others None
hists_signal_low += hl.astype(np.float64)
hists_signal_high += hh.astype(np.float64)
done = all(dones)
whens = [x for _,x in sorted(zip(qms,whens))]
qms = sorted(qms)
for i, qm in enumerate(qms):
try:
when = whens[i].isoformat()
except:
when = whens[i]
if Errors[i] is not None:
# Avoid writing wrong injection date if cons. not found.
if "not found" in str(Errors[i]):
print(f"ERROR! {qm}: {Errors[i]}")
else:
print(f"Offset for {qm} was injected on {when}, ERROR!: {Errors[i]}")
else:
print(f"Offset for {qm} was injected on {when}")
```
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
from matplotlib.ticker import FormatStrFormatter, LinearLocator
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
def do_3d_plot(data, edges, x_axis, y_axis):
fig = plt.figure(figsize=(10,10))
ax = fig.gca(projection='3d')
# Make data.
X = edges[0][:-1]
Y = edges[1][:-1]
X, Y = np.meshgrid(X, Y)
Z = data.T
# Plot the surface.
surf = ax.plot_surface(X, Y, Z, cmap=cm.coolwarm,
linewidth=0, antialiased=False)
ax.set_xlabel(x_axis)
ax.set_ylabel(y_axis)
ax.set_zlabel("Counts")
```
%% Cell type:code id: tags:
``` python
def do_2d_plot(data, edges, y_axis, x_axis):
from matplotlib.colors import LogNorm
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
extent = [np.min(edges[1]), np.max(edges[1]),np.min(edges[0]), np.max(edges[0])]
im = ax.imshow(data[::-1,:], extent=extent, aspect="auto", norm=LogNorm(vmin=1, vmax=np.max(data)))
ax.set_xlabel(x_axis)
ax.set_ylabel(y_axis)
cb = fig.colorbar(im)
cb.set_label("Counts")
```
%% Cell type:markdown id: tags:
## Mean Intensity per Pulse ##
The following plots show the signal distribution for each pulse, both over a restricted (low) and an expanded (high) intensity range.
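For reference, the mean signal per pulse can be approximated directly from the accumulated histograms; a small sketch using the variables filled in the correction loop above:

``` python
import numpy as np

# Hedged sketch: hists_signal_low has shape (signal bins, pulse ids); a weighted average over
# the signal axis gives an approximate mean signal per pulse id.
bin_centres = 0.5 * (low_edges[:-1] + low_edges[1:])
counts_per_pulse = hists_signal_low.sum(axis=0)
mean_per_pulse = (hists_signal_low * bin_centres[:, None]).sum(axis=0) / np.maximum(counts_per_pulse, 1)
```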
%% Cell type:code id: tags:
``` python
do_3d_plot(hists_signal_low, [low_edges, pulse_edges], "Signal (ADU)", "Pulse id")
do_2d_plot(hists_signal_low, [low_edges, pulse_edges], "Signal (ADU)", "Pulse id")
do_3d_plot(hists_signal_high, [high_edges, pulse_edges], "Signal (ADU)", "Pulse id")
do_2d_plot(hists_signal_high, [high_edges, pulse_edges], "Signal (ADU)", "Pulse id")
```
%% Cell type:code id: tags:
``` python
corrected = []
raw = []
mask = []
pulse_ids = []
train_ids = []
for channel, ff in first_files.items():
try:
raw_file, corr_file = ff
data_path = h5path.format(channel)
index_path = h5path_idx.format(channel)
try:
infile = h5py.File(raw_file, "r")
first_idx = int(np.array(infile[f"{index_path}/first"])[0])
raw_d = np.array(infile[f"{data_path}/data"])
# Use first 128 images for plotting
if raw_d.shape[0] >= 128:
# random number for plotting
plt_im = 128
else:
                plt_im = raw_d.shape[0]
last_idx = first_idx + plt_im
raw.append((channel,raw_d[first_idx:last_idx,0,...]))
finally:
infile.close()
infile = h5py.File(corr_file, "r")
try:
corrected.append((channel, np.array(infile[f"{data_path}/data"][first_idx:last_idx,...])))
mask.append((channel, np.array(infile[f"{data_path}/mask"][first_idx:last_idx,...])))
pulse_ids.append((channel, np.squeeze(infile[f"{data_path}/pulseId"][first_idx:last_idx,...])))
train_ids.append((channel, np.squeeze(infile[f"{data_path}/trainId"][first_idx:last_idx,...])))
finally:
infile.close()
except Exception as e:
print(e)
```
%% Cell type:code id: tags:
``` python
def combine_stack(d, sdim):
combined = np.zeros((sdim, 1300,1300), np.float32)
combined[...] = 0
dy = 0
quad_pos = [
(0, 145),
(130, 140),
(125, 15),
(0, 15),
]
px = 0.236
py = 0.204
with h5py.File(geo_file, "r") as gf:
# TODO: refactor to -> for ch, f in d:
for i in range(len(d)):
ch = d[i][0]
mi = 3-(ch%4)
mp = gf["Q{}/M{}/Position".format(ch//4+1, mi%4+1)][()]
t1 = gf["Q{}/M{}/T01/Position".format(ch//4+1, ch%4+1)][()]
t2 = gf["Q{}/M{}/T02/Position".format(ch//4+1, ch%4+1)][()]
if ch//4 < 2:
t1, t2 = t2, t1
if ch // 4 == 0 or ch // 4 == 1:
td = d[i][1][:,::-1,:]
else:
td = d[i][1][:,:,::-1]
t1d = td[:,:,:256]
t2d = td[:,:,256:]
x0t1 = int((t1[0]+mp[0])/px)
y0t1 = int((t1[1]+mp[1])/py)
x0t2 = int((t2[0]+mp[0])/px)
y0t2 = int((t2[1]+mp[1])/py)
x0t1 += int(quad_pos[i//4][1]/px)
x0t2 += int(quad_pos[i//4][1]/px)
y0t1 += int(quad_pos[i//4][0]/py)+combined.shape[1]//16
y0t2 += int(quad_pos[i//4][0]/py)+combined.shape[1]//16
combined[:,y0t1:y0t1+128,x0t1:x0t1+256] = t1d
combined[:,y0t2:y0t2+128,x0t2:x0t2+256] = t2d
return combined
```
%% Cell type:code id: tags:
``` python
combined = combine_stack(corrected, last_idx-first_idx)
combined_raw = combine_stack(raw, last_idx-first_idx)
combined_mask = combine_stack(mask, last_idx-first_idx)
```
%% Cell type:markdown id: tags:
### Mean RAW Preview ###
%% Cell type:code id: tags:
``` python
display(Markdown("The per pixel mean of the first {} images of the RAW data".format(plt_im)))
```
%% Cell type:code id: tags:
``` python
%matplotlib inline
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111)
im = ax.imshow(np.mean(combined_raw[:,...],axis=0),
vmin=min(0.75*np.median(combined_raw[combined_raw > 0]), -5),
vmax=max(1.5*np.median(combined_raw[combined_raw > 0]), 50), cmap="jet")
cb = fig.colorbar(im, ax=ax)
```
%% Cell type:markdown id: tags:
### Single Shot Preview ###
A single shot image from cell 2 of the first train
%% Cell type:code id: tags:
``` python
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111)
dim = combined[2,...]
im = ax.imshow(dim, vmin=-0, vmax=max(1.5*np.median(dim[dim > 0]), 50), cmap="jet", interpolation="nearest")
cb = fig.colorbar(im, ax=ax)
```
%% Cell type:code id: tags:
``` python
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111)
h = ax.hist(dim.flatten(), bins=100, range=(0, 100))
```
%% Cell type:markdown id: tags:
### Mean CORRECTED Preview ###
%% Cell type:code id: tags:
``` python
display(Markdown("The per pixel mean of the first {} images of the CORRECTED data".format(plt_im)))
```
%% Cell type:code id: tags:
``` python
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111)
im = ax.imshow(np.mean(combined[:,...], axis=0), vmin=0,
vmax=max(1.5*np.median(combined[combined > 0]), 10), cmap="jet", interpolation="nearest")
cb = fig.colorbar(im, ax=ax)
```
%% Cell type:markdown id: tags:
### Max CORRECTED Preview ###
The per pixel maximum of the first 128 images of the CORRECTED data
%% Cell type:code id: tags:
``` python
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111)
im = ax.imshow(np.max(combined[:,...], axis=0), vmin=0,
vmax=max(100*np.median(combined[combined > 0]), 20), cmap="jet", interpolation="nearest")
cb = fig.colorbar(im, ax=ax)
```
%% Cell type:code id: tags:
``` python
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111)
combined[combined <= 0] = 0
h = ax.hist(combined.flatten(), bins=100, range=(-5, 100), log=True)
```
%% Cell type:markdown id: tags:
## Bad Pixels ##
The mask contains dedicated entries for all pixels and memory cells as well as all three gain stages. Each mask entry is encoded in 32 bits as:
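A mask value can be decoded bit-wise; a small sketch using the `BadPixels` flags listed in the table below:

``` python
import numpy as np
from cal_tools.enums import BadPixels

# Hypothetical mask value combining two flags.
value = np.uint32(BadPixels.OFFSET_OUT_OF_THRESHOLD.value | BadPixels.NOISE_OUT_OF_THRESHOLD.value)

# Individual contributions are checked with a bit-wise AND, as done for the plots below.
is_noisy = bool(value & BadPixels.NOISE_OUT_OF_THRESHOLD.value)
offenders = [item.name for item in BadPixels if value & item.value]
print(is_noisy, offenders)
```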
%% Cell type:code id: tags:
``` python
import tabulate
from cal_tools.enums import BadPixels
from IPython.display import HTML, Latex, Markdown, display
table = []
for item in BadPixels:
table.append((item.name, "{:016b}".format(item.value)))
md = display(Latex(tabulate.tabulate(table, tablefmt='latex', headers=["Bad pixel type", "Bit mask"])))
```
%% Cell type:markdown id: tags:
### Full Train Bad Pixels ###
%% Cell type:code id: tags:
``` python
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111)
im = ax.imshow(np.log2(np.max(combined_mask[:,...], axis=0)), vmin=0,
vmax=32, cmap="jet")
cb = fig.colorbar(im, ax=ax)
```
%% Cell type:markdown id: tags:
### Full Train Bad Pixels - Only Dark Char. Related ###
%% Cell type:code id: tags:
``` python
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111)
im = ax.imshow(np.max((combined_mask.astype(np.uint32)[:,...] & BadPixels.NOISY_ADC.value) != 0, axis=0), vmin=0,
vmax=1, cmap="jet")
cb = fig.colorbar(im, ax=ax)
```
......
%% Cell type:code id: tags:
``` python
# Data selection parameters.
run = 104 # Run ID.
in_folder = '/gpfs/exfel/exp/SQS/202101/p002535/raw' # Partial input path appended with run ID.
out_folder = '/gpfs/exfel/exp/SQS/202101/p002535/scratch/cal_test' # Full path to output folder.
calib_config_path = '/gpfs/exfel/exp/SQS/202101/p002535/usr/config_board2+4.yaml' # Path to correction and transform configuration
# These parameters are required by xfel-calibrate but ignored in this notebook.
cycle = '' # Proposal cycle, currently not used.
cal_db_timeout = 0 # Calibration DB timeout, currently not used.
cal_db_interface = 'foo' # Calibration DB interface, currently not used.
karabo_da = 'bar' # Karabo data aggregator name, currently not used
# Output parameters.
karabo_id = 'SQS_REMI_DLD6' # Karabo device ID root for virtual output device.
proposal = '' # Proposal, leave empty for auto detection based on in_folder
out_aggregator = 'REMI01' # Aggregator name for output files.
out_seq_len = 5000 # Number of trains per sequence file in output.
det_device_id = '{karabo_id}/DET/{det_name}' # Karabo device ID for virtual output device.
det_output_key = 'output' # Pipeline name for fast data output.
save_raw_triggers = True # Whether to save trigger position in files.
save_raw_edges = True # Whether to save digitized edge positions in files.
save_raw_amplitudes = True # Whether to save analog pulse amplitudes in files.
save_rec_signals = True # Whether to save reconstructed signals (u1-w2, mcp) in files.
save_rec_hits = True # Whether to save reconstructed hits (x,y,t,m) in files.
chunks_triggers = [500] # HDF chunk size for triggers.
chunks_edges = [500, 7, 50] # HDF chunk size for edges.
chunks_amplitudes = [500, 7, 50] # HDF chunk size for amplitudes.
chunks_hits = [50, 50] # HDF chunk size for hits.
chunks_signals = [50, 50] # HDF chunk size for signals.
dataset_compression = 'gzip' # HDF compression method.
dataset_compression_opts = 3 # HDF GZIP compression level.
# Detector parameters.
quad_anode = False # Reconstruction assumes a hex anode by default, change for quad anodes.
# Trigger parameters.
ppt_source = 'SQS_RR_UTC/TSYS/TIMESERVER:outputBunchPattern'
ignore_fel = False # Ignore any FEL entries in the PPT.
ignore_ppl = False # Ignore any PPL entries in the PPT.
trailing_trigger = False # Add a trigger after all regular pulses with the remaining trace.
ppl_offset = 0 # In units of the PPT.
laser_ppt_mask = -1 # Bit mask for used laser, negative to auto-detect from instrument.
instrument_sase = 3 # Which SASE we're running at for PPT decoding.
first_pulse_offset = 10000 # Sample position where the first pulse begins, ignored when PPT is reconstructed.
single_pulse_length = 25000 # How many samples if there's only one pulse.
pulse_start_offset = 0 # Signal offset at the start of each pulse.
pulse_end_offset = 0 # Signal offset at the end of each pulse.
# PPT reconstruction parameters.
reconstruct_ppt = False # Reconstruct PPT from some trigger edges.
trigger_edge_channel = '4_D' # Channel to use for triggering.
trigger_edge_offset = 0 # Offset to apply to the first trigger edge position to compute first pulse offset.
fake_ppt_offset = 0 # Offset in reconstructed PPT for pulses.
# Parallelization parameters.
mp_find_triggers = 0.5 # Parallelization for finding triggers.
mp_find_edges = 0.5 # Parallelization for digitizing analog signal.
mt_avg_trace = 2 # Parallelization for trace averaging.
mp_rec_hits = 1.0 # Parallelization for hit reconstruction.
```
%% Cell type:code id: tags:
``` python
from datetime import datetime
from logging import warning
from pathlib import Path
import re
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from threadpoolctl import threadpool_limits
import h5py
import pasha as psh
from euxfel_bunch_pattern import indices_at_sase, indices_at_laser
from extra_data import RunDirectory, by_id
from extra_remi import Analysis, trigger_dt
from extra_remi.util import timing
from extra_remi.rd_resort import signal_dt, hit_dt
from extra_remi.files import DataFile, sequence_pulses
if quad_anode:
from extra_remi.plots import plot_detector_diagnostics_quad as plot_detector_diagnostics
else:
from extra_remi.plots import plot_detector_diagnostics_hex as plot_detector_diagnostics
%matplotlib inline
```
%% Cell type:code id: tags:
``` python
def finite_flattened_slice(array, slice_=np.s_[:]):
"""Return flattened and finite values for a given slice."""
sliced_array = array[slice_]
return sliced_array[np.isfinite(sliced_array)]
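# Usage sketch (toy input): drop non-finite entries from a slice of the edge/amplitude arrays below,
# e.g. finite_flattened_slice(np.array([1.0, np.nan, 3.0])) -> array([1., 3.])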
```
%% Cell type:code id: tags:
``` python
calib_config_path = Path(calib_config_path)
if not calib_config_path.is_file():
# If the path cannot be resolved right now, try the same path relative to in_folder.
calib_config_path = Path(in_folder) / calib_config_path
if not calib_config_path.is_file():
# Disallow implicit config file creation.
raise ValueError('calib_config_path not found - neither absolute nor relative to in_folder')
remi = Analysis(calib_config_path, use_hex=not quad_anode)
with timing('open_run'):
dc = remi.prepare_dc(RunDirectory(Path(in_folder) / f'r{run:04d}', inc_suspect_trains=True),
require_ppt=not reconstruct_ppt)
```
%% Cell type:markdown id: tags:
# Transformation parameters
%% Cell type:code id: tags:
``` python
def print_leaf(leaf, indent=0):
for key, value in leaf.items():
if isinstance(value, dict):
print(indent * 4 * ' ' + key)
print_leaf(value, indent=indent+1)
else:
print(indent * 4 * ' ' + f'{key}: {value}')
print_leaf(remi.tree)
```
%% Cell type:markdown id: tags:
# Pulse and trigger information
%% Cell type:markdown id: tags:
### Read PPT from file or reconstruct PPT for older data
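For intuition, a small hedged sketch of what a PPT row looks like and how pulse positions are extracted from it (flags chosen to mimic SASE 3, as in the reconstruction below):

``` python
import numpy as np
from euxfel_bunch_pattern import DESTINATION_T4D, PHOTON_LINE_DEFLECTION, indices_at_sase

# A PPT row is one uint32 flag word per bunch slot (2700 slots per train).
ppt = np.zeros(2700, dtype=np.uint32)
ppt[100:200:10] = DESTINATION_T4D | PHOTON_LINE_DEFLECTION  # SASE 3 style flags
print(indices_at_sase(ppt, 3))  # -> [100 110 ... 190]
```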
%% Cell type:code id: tags:
``` python
if reconstruct_ppt:
# Take up to the first hundred trains for now.
# Could be done for each train individually, but likely not necessary for now.
trigger_trace = dc[remi['digitizer']['source'], remi['digitizer']['key_pattern'].format(trigger_edge_channel)] \
[:100].ndarray().mean(axis=0).astype(np.float64)
trigger_trace -= trigger_trace[0] # Use simple offset correction.
fake_ppt = np.zeros(2700, dtype=np.uint32)
discr_func, discr_params = remi.get_discriminator([trigger_edge_channel])
edges = np.zeros(1000, dtype=np.float64)
num_pulses = discr_func(trigger_trace, edges=edges, **discr_params[0])
edges = edges[:num_pulses]
first_edge = edges[0]
rel_edges = np.round(edges - first_edge)
edge_diff = rel_edges[1] - rel_edges[0]
if not np.allclose(rel_edges[1:] - rel_edges[:-1], edge_diff):
raise ValueError('PPT reconstruction for unstable edge intervals not supported')
pulse_spacing = edge_diff / (2 * remi['digitizer']['clock_factor']) # In units of PPT
if not float.is_integer(pulse_spacing):
raise ValueError('PPT reconstruction encountered non-integer pulse spacing')
pulse_spacing = int(pulse_spacing)
# Taken from euxfel_bunch_pattern/__init__.py
from euxfel_bunch_pattern import DESTINATION_T4D, DESTINATION_T5D, PHOTON_LINE_DEFLECTION
if instrument_sase == 1:
flag = DESTINATION_T4D
elif instrument_sase == 2:
flag = DESTINATION_T5D
elif instrument_sase == 3:
flag = DESTINATION_T4D | PHOTON_LINE_DEFLECTION
first_pulse_offset = int(first_edge + trigger_edge_offset) # Overwrite notebook argument.
fake_ppt[fake_ppt_offset:fake_ppt_offset + (pulse_spacing * num_pulses):pulse_spacing] = flag
from pasha.functor import Functor, gen_split_slices
class FakeKeyDataFunctor(Functor):
"""Functor appearing KeyData-like with constant data.
This functor serves a constant data row for a given number
of train IDs the same way a KeyData object would.
"""
def __init__(self, row, train_ids):
self.row = row
self.train_ids = train_ids
def split(self, num_workers):
return gen_split_slices(len(self.train_ids), n_parts=num_workers)
def iterate(self, share):
it = zip(range(*share.indices(len(self.train_ids))), self.train_ids)
for index, train_id in it:
yield index, train_id, self.row
ppt_data = FakeKeyDataFunctor(fake_ppt, dc.train_ids)
fig, ax = plt.subplots(num=99, figsize=(9, 6), clear=True, ncols=1, nrows=1)
ax.set_title('Edge trigger signal')
ax.plot(trigger_trace, lw=1, label=f'Mean {trigger_edge_channel} trace')
ax.vlines(edges, trigger_trace.min()*1.1, trigger_trace.max()*1.1,
color='red', linewidth=3, alpha=0.3, label='Edge positions')
ax.set_xlabel('Samples')
ax.set_ylabel('Intensity / ADU')
ax.legend()
else:
ppt_data = dc[ppt_source, 'data.bunchPatternTable']
```
%% Cell type:markdown id: tags:
### Count pulses per train and compute offsets
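The offsets follow from a cumulative sum of the per-train counts; a toy example (values assumed purely for illustration):

``` python
import numpy as np

pulse_counts = np.array([3, 2, 4], dtype=np.uint64)  # pulses in trains 0, 1, 2
pulse_offsets = np.zeros_like(pulse_counts)
np.cumsum(pulse_counts[:-1], out=pulse_offsets[1:])   # -> [0, 3, 5]
num_pulses = int(pulse_counts.sum())                  # -> 9
```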
%% Cell type:code id: tags:
``` python
# Based on the pulse pattern tables, three global variables are obtained:
#
# * `pulse_counts [int32: len(dc.train_ids)]` containing the number of pulses per train.
# * `pulse_offsets [int32: len(dc.train_ids)]` containing the global offset for the first pulse of each train.
# * `num_pulses = pulse_counts.sum(axis=0)`
def get_pulse_positions(ppt, sase, laser, ppl_offset):
# Combine FEL and PPL positions.
fel_pos = indices_at_sase(ppt, sase) if not ignore_fel else np.array([])
ppl_pos = indices_at_laser(ppt, laser) if not ignore_ppl else np.array([])
if len(fel_pos) > 0:
# Move PPL up to the FEL position.
ppl_pos += fel_pos[0] + ppl_offset
return np.union1d(fel_pos, ppl_pos), fel_pos, ppl_pos
if laser_ppt_mask < 0:
# If laser PPT mask is not specified, try to figure it out from device IDs.
from euxfel_bunch_pattern import PPL_BITS
instrument = karabo_id[:karabo_id.index('_')]
try:
laser_ppt_mask = PPL_BITS[f'LP_{instrument}']
except KeyError:
raise ValueError(f'Laser PPT mask unknown for instrument `{instrument}`')
with timing('pulse_info'):
psh.set_default_context('processes', num_workers=remi.get_num_workers(mp_find_triggers))
# Build the pulse index
pulse_counts = psh.alloc(shape=len(dc.train_ids), dtype=np.uint64)
has_ppt = psh.alloc(shape=len(dc.train_ids), dtype=bool, fill=False)
def count_pulses(wid, index, tid, ppt):
pulse_counts[index] = len(get_pulse_positions(ppt, instrument_sase, laser_ppt_mask, ppl_offset)[0])
has_ppt[index] = True
psh.map(count_pulses, ppt_data)
# Fill any missing values with the highest.
pulse_counts[has_ppt == False] = pulse_counts.max()
if trailing_trigger:
# Add a single count to every train for trailing trigger.
pulse_counts += 1
# Compute offsets based on pulse counts.
pulse_offsets = np.zeros_like(pulse_counts)
np.cumsum(pulse_counts[:-1], out=pulse_offsets[1:])
# Total number of pulses.
num_pulses = int(pulse_counts.sum())
```
%% Cell type:code id: tags:
``` python
fig, ax = plt.subplots(num=1, ncols=1, nrows=1, figsize=(9, 4), clear=True)
ax.set_title('Pulse count')
ax.plot(dc.train_ids, pulse_counts, lw=1)
ax.set_xlabel('Train ID')
ax.set_ylabel('Number of pulses')
ax.set_ylim(0, max(300, pulse_counts.max() + 10))
ax.ticklabel_format(style='plain')
pass
```
%% Cell type:markdown id: tags:
### Find triggers
The trigger defines the boundary of a pulse on the digitizer trace, which is stored per train.
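Conceptually, each trigger record later selects a window of the digitizer trace; a minimal sketch (field names follow the `trigger_dt` records filled below):

``` python
import numpy as np

trace = np.random.default_rng(0).normal(size=100_000)  # toy digitizer trace for one train

# One hypothetical trigger: samples [start, stop) belong to this pulse; 'offset' keeps the
# sub-sample fraction lost when rounding the start position.
trigger = dict(start=10_000, stop=12_500, offset=0.3)
pulse_trace = trace[trigger['start']:trigger['stop']]
```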
%% Cell type:code id: tags:
``` python
# A trigger defines the boundary of a pulse on the digitizer trace stored per train. This cell creates a
# global variable:
# * `triggers [(start: int32, stop: int32, offset: float64, pulse: int16, fel: bool, ppl: bool): num_pulses]`
# containing the triggers for each pulse.
#
# This uses the pulse pattern table to locate the pulse positions on the trace. Only the number of pulses and
# their spacing can be derived this way, leaving the absolute offset for the very first pulse to be
# configured via `trigger/ppt/first_pulse_offset`. If a PPL is used, it will be included in the trigger
# pattern. The ppt_offset parameter allows taking into account an offset between PPL and FEL.
psh.set_default_context('processes', num_workers=remi.get_num_workers(mp_find_triggers))
triggers = psh.alloc(shape=(num_pulses,), dtype=trigger_dt, fill=(-1, -1, np.nan, -1, 0, 0))
clock_factor = remi['digitizer']['clock_factor']
min_trace_len = min([
dc[src, key].entry_shape[0] for det_name in remi['detector'].keys()
for src, key in remi.get_detector_sourcekeys(det_name)
])
def trigger_by_ppt(worker_id, index, train_id, ppt):
all_pos, fel_pos, ppl_pos = get_pulse_positions(ppt, instrument_sase, laser_ppt_mask, ppl_offset)
num_pulses = len(all_pos)
if num_pulses > 0:
if len(ppl_pos) == 0 and ppl_offset < 0:
# No PPL pulses, but a negative offset is configured. This will cause
# first_pulse_offset to start early and most likely miss pulses at the
# end, so we correct by adding the ppl_offset to relative positions
# when computing trace positions.
pos_corr = abs(ppl_offset)
else:
pos_corr = 0
rel_pos = all_pos - all_pos[0]
if num_pulses > 1:
pulse_len = np.unique(rel_pos[1:] - rel_pos[:-1]).min()
elif num_pulses == 1:
pulse_len = single_pulse_length
start_frac = first_pulse_offset + (rel_pos + pos_corr) * 2 * clock_factor
start_int = start_frac.astype(int)
train_triggers = triggers[pulse_offsets[index]:int(pulse_offsets[index]+num_pulses)]
train_triggers['start'] = start_int + pulse_start_offset
train_triggers['stop'] = start_int + int(pulse_len * 2 * clock_factor) - 1 + pulse_end_offset
train_triggers['offset'] = start_frac - start_int
train_triggers['pulse'] = all_pos.astype(np.int16)
train_triggers['fel'] = [pos in fel_pos for pos in all_pos]
train_triggers['ppl'] = [pos in ppl_pos for pos in all_pos]
last_sample = train_triggers['stop'].max()
else:
last_sample = first_pulse_offset
if trailing_trigger:
# Add trailing trigger if required.
trigger = triggers[int(pulse_offsets[index]+pulse_counts[index]-1)]
trigger['start'] = last_sample
trigger['stop'] = min_trace_len
trigger['offset'] = 0.0
trigger['pulse'] = -1
trigger['fel'] = False
trigger['ppl'] = False
with timing('find_triggers'):
psh.map(trigger_by_ppt, ppt_data)
if (np.unique(triggers['pulse'][1:] - triggers['pulse'][:-1]) > 0).sum() > 1:
# There is more than one delta between pulse entries across all pulses. This is not
# necessarily a problem, as the pattern could simply have changed in between trains
# with each train being split properly.
# If there's more than one delta in a single train, this likely points to a mismatch
# of FEL and PPL repetition rate. This is most likely not intended.
one = np.uint64(1) # Because np.uint64 + int = np.float64
pulse_deltas = set()
for pulse_id, (offset, count) in enumerate(zip(
pulse_offsets, pulse_counts - one if trailing_trigger else pulse_counts
)):
deltas = triggers['pulse'][offset+one:offset+count] - triggers['pulse'][offset:offset+count-one]
if len(np.unique(deltas)) > 1:
for delta in deltas:
pulse_deltas.add(delta)
if len(pulse_deltas) > 1:
delta_str = ', '.join([str(x) for x in sorted(pulse_deltas)])
warning(f'Different pulse lengths (PPT: {delta_str}) encountered within single trains, '
f'separated pulse spectra may split up signals!')
```
%% Cell type:code id: tags:
``` python
fig, (lx, rx) = plt.subplots(num=2, ncols=2, nrows=1, figsize=(9, 4), clear=True,
gridspec_kw=dict(top=0.75))
# Display roughly the first 200 pulses, but at least 5 trains
n_trains = max(abs(pulse_offsets - 200).argmin(), 5)
visible_triggers = triggers[:pulse_offsets[n_trains]]
pulse_index = np.arange(len(visible_triggers))
pumped = visible_triggers['fel'] & visible_triggers['ppl']
fel_only = visible_triggers['fel'] & ~pumped
ppl_only = visible_triggers['ppl'] & ~pumped
lx.plot(pulse_index[pumped], visible_triggers[pumped]['start'], ' .', ms=3, c='C0', label='FEL+PPL')
lx.plot(pulse_index[fel_only], visible_triggers[fel_only]['start'], '.', ms=3, c='C1', label='FEL-only')
lx.plot(pulse_index[ppl_only], visible_triggers[ppl_only]['start'], '.', ms=2, c='C2', label='PPL-only')
max_start = visible_triggers['start'].max()
lx.vlines(pulse_offsets[:n_trains], 0, max_start, color='grey', linewidth=1, alpha=0.2)
lx.tick_params(right=True)
lx.set_xlabel('Pulse index')
lx.set_xlim(-15, pulse_offsets[n_trains]+15)
lx.set_ylabel('Trigger position')
lx.set_ylim(-max_start // 20, max_start + max_start // 20)
lx.legend(fontsize='small', loc='lower right')
train_lx = lx.twiny()
train_lx.set_xlabel('Train ID', labelpad=8)
train_lx.set_xlim(lx.get_xlim())
train_lx.set_xticks(pulse_offsets[:n_trains])
train_lx.set_xticklabels([str(int(x)) for x in dc.train_ids[:n_trains]],
rotation=-45, fontsize='x-small')
rx.plot(triggers['start'], lw=0.2)
rx.set_xlabel('Pulse index')
rx.tick_params(left=False, labelleft=False, right=True, labelright=True)
pass
```
%% Cell type:markdown id: tags:
# Analog signal to digital edges
%% Cell type:markdown id: tags:
### Find edges in analog signal
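The actual edge finding is done by the configured discriminator (obtained via `remi.get_discriminator`, e.g. a CFD); purely for intuition, a naive fixed-threshold sketch of what digitizing a trace into edge positions means:

``` python
import numpy as np

def naive_edges(trace, threshold):
    """Toy discriminator: sample indices where the trace crosses the threshold upwards."""
    above = trace > threshold
    return (np.flatnonzero(~above[:-1] & above[1:]) + 1).astype(np.float64)

trace = np.zeros(1000)
trace[100:110] = 50.0
trace[400:405] = 80.0
print(naive_edges(trace, threshold=20.0))  # -> [100. 400.]
```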
%% Cell type:code id: tags:
``` python
psh.set_default_context('processes', num_workers=remi.get_num_workers(mp_find_edges))
threadpool_limits(limits=remi.get_num_workers(mt_avg_trace))
det_data = {}
for i, (det_name, det) in enumerate(remi['detector'].items()):
det_sourcekeys = remi.get_detector_sourcekeys(det_name)
det_get_traces = remi.get_traces_getter(det_name)
trace_len = dc[next(iter(det_sourcekeys))].entry_shape[0]
edges = psh.alloc(shape=(num_pulses, 7, det['max_hits']),
dtype=np.float64, fill=np.nan)
amplitudes = psh.alloc(shape=(num_pulses, 7, det['max_hits']),
dtype=np.float64, fill=np.nan)
avg_traces = psh.alloc_per_worker(shape=(7, trace_len), dtype=np.float64)
def prepare_edge_worker(worker_id):
correct_func = remi.get_baseline_corrector()
discr_func, discr_params = remi.get_discriminator(det['channels'])
source_name = remi['digitizer']['source']
bl_start, bl_stop, _ = remi.get_baseline_limits(trace_len)
bl_sym = remi['digitizer']['baseline_symmetry']
time_cal = remi.get_time_calibration()
traces_corr = np.empty((7, trace_len), dtype=np.float64)
baselines = np.empty(bl_sym, dtype=np.float64)
yield
@psh.with_init(prepare_edge_worker)
def find_edges(worker_id, index, train_id, data):
try:
data = det_get_traces(data[source_name])
except KeyError:
return
for channel_idx in range(7):
correct_func(data[channel_idx], traces_corr[channel_idx],
baselines, bl_start, bl_stop)
avg_traces[worker_id] += traces_corr
pulses_slice = np.s_[pulse_offsets[index]:pulse_offsets[index]+pulse_counts[index]]
for trigger, pulse_edges, pulse_amplitudes in zip(
triggers[pulses_slice], edges[pulses_slice], amplitudes[pulses_slice]
):
trigger_slice = np.s_[trigger['start']:trigger['stop']]
for trace, channel_params, channel_edges, channel_amplitudes in zip(
traces_corr, discr_params, pulse_edges, pulse_amplitudes
):
discr_func(trace[trigger_slice], edges=channel_edges,
amplitudes=channel_amplitudes, **channel_params)
if np.isfinite(pulse_edges).sum(axis=1).max() == det['max_hits']:
warning(f'Maximum number of edges reached in train {train_id}, pulse: {trigger["pulse"]}')
with timing(f'find_edges, {det_name}'):
psh.map(find_edges, dc.select(det_sourcekeys))
if not np.isfinite(edges).any():
warning(f'No edges found for {det_name}')
fig, (ux, bx) = plt.subplots(num=110+i, ncols=1, nrows=2, figsize=(9.5, 8), clear=True,
gridspec_kw=dict(left=0.1, right=0.98, top=0.98, bottom=0.1, hspace=0.25))
fig.text(0.02, 0.98, det_name.upper(), rotation=90, ha='left', va='top', size='x-large')
max_num = 0
for edge_idx, edge_name in enumerate(['u1', 'u2', 'v1', 'v2', 'w1', 'w2', 'mcp']):
n, _, _ = ux.hist(finite_flattened_slice(amplitudes, np.s_[:, edge_idx, :]),
bins=1000, range=(0, 2048), histtype='step', lw=1,
color=f'C{edge_idx}' if edge_idx < 6 else 'k', label=edge_name)
max_num = max(max_num, n.max())
cur_edges = finite_flattened_slice(edges, np.s_[:, edge_idx, :])
bx.hist(cur_edges - np.floor(cur_edges), bins=500, range=(0, 1), histtype='step',
lw=1, color=f'C{edge_idx}' if edge_idx < 6 else 'k', label=edge_name)
ux.legend()
ux.set_title('Pulse height distributions')
ux.set_xlabel('Pulse height')
ux.set_yscale('log')
ux.set_xlim(0, 4096)
ux.set_ylim(10, 1.5*max(max_num, 10))
if remi['digitizer']['discriminator'] == 'cfd':
ux.text(1024, 12.5, 'No pulse height feedback for constant fraction discrimination',
ha='center', va='center')
bx.set_title('Fractional edge distributions')
bx.set_xlabel('Edge positions - ⌊edge positions⌋')
bx.set_yscale('log')
bx.set_xlim(-0.05, 1.2)
bx.legend()
# Properly offset edges to their trigger offset and convert to time.
# This is not done earlier to preserve the information for plotting.
edges += triggers['offset'][:, None, None]
edges *= remi.get_time_calibration()
det_data[det_name] = {
'edges': edges,
'amplitudes': amplitudes,
'avg_trace': avg_traces.sum(axis=0) / len(dc.train_ids)
}
```
%% Cell type:markdown id: tags:
### Global average of analog signals
%% Cell type:code id: tags:
``` python
for i, det_name in enumerate(remi['detector'].keys()):
fig, axs = plt.subplots(num=10+i, nrows=7, figsize=(9.5, 8), clear=True,
gridspec_kw=dict(left=0.1, right=0.98, top=0.98, bottom=0.1))
fig.text(0.02, 0.98, det_name.upper(), rotation=90, ha='left', va='top', size='x-large')
for edge_idx, edge_name in enumerate(['u1', 'u2', 'v1', 'v2', 'w1', 'w2', 'mcp']):
axs[edge_idx].plot(det_data[det_name]['avg_trace'][edge_idx], lw=1)
axs[edge_idx].tick_params(labelbottom=False)
axs[edge_idx].set_ylabel(edge_name)
axs[-1].tick_params(labelbottom=True)
pass
```
%% Cell type:markdown id: tags:
### Sample for found edges
%% Cell type:code id: tags:
``` python
for i, det_name in enumerate(remi['detector'].keys()):
edges = det_data[det_name]['edges']
fig = plt.figure(num=100+i, figsize=(9.5, 8))
grid = fig.add_gridspec(ncols=2, nrows=4, left=0.1, right=0.98, top=0.98, bottom=0.1)
fig.text(0.02, 0.98, det_name.upper(), rotation=90, ha='left', va='top', size='x-large')
for signal_idx, signal_name in enumerate(['u1', 'u2', 'v1', 'v2', 'w1', 'w2', 'mcp']):
row = (1 + signal_idx // 2) if signal_idx < 6 else 0
col = (signal_idx % 2) if signal_idx < 6 else np.s_[:]
ax = fig.add_subplot(grid[row, col])
finite_edges = np.isfinite(edges[:, signal_idx, 0])
if not finite_edges.any():
warning(f'No edges found for {det_name}/{signal_name}')
continue
pulse_idx = np.uint64(finite_edges.nonzero()[0][0]) # Is combined with other uint64 values below.
train_idx = (pulse_idx >= pulse_offsets).nonzero()[0][-1]
trigger = triggers[pulse_idx]
sourcekey = remi.get_channel_sourcekey(
remi['detector'][det_name]['channels'][signal_idx])
train_trace = dc[sourcekey].select_trains(np.s_[train_idx:train_idx+1]).ndarray()[0]
corr_trace = np.zeros_like(train_trace, dtype=np.float64)
remi.get_baseline_corrector()(
train_trace, corr_trace,
np.empty(remi['digitizer']['baseline_symmetry'], dtype=np.float64),
*remi.get_baseline_limits(len(train_trace))[:2])
pulse_trace = corr_trace[np.s_[trigger['start']:trigger['stop']]]
x_time = remi.get_time_calibration() * (np.arange(len(pulse_trace)) + trigger['offset'])
ax.plot(x_time, pulse_trace, lw=1)
ax.set_xlim(x_time[0], x_time[-1])
ax.set_ylim(-200, pulse_trace.max()*1.1)
ax.text(x_time[-1], pulse_trace.max(),
f'T{train_idx} P{pulse_idx - pulse_offsets[train_idx]} ',
va='top', ha='right')
ax.tick_params(labelbottom=False)
ax.set_ylabel(signal_name)
ax.vlines(edges[pulse_idx, signal_idx, :], *ax.get_ylim(), color='red', linewidth=1)
pass
```
%% Cell type:markdown id: tags:
### Digitized channel spectra
%% Cell type:code id: tags:
``` python
for i, det_name in enumerate(remi['detector'].keys()):
fig = plt.figure(num=20+i, figsize=(9.5, 6))
edges = det_data[det_name]['edges']
min_edge = np.nanmin(edges)
max_edge = np.nanmax(edges)
grid = fig.add_gridspec(ncols=3, nrows=3, left=0.08, right=0.98, top=0.95, hspace=0.4)
fig.text(0.02, 0.98, det_name.upper(), rotation=90, ha='left', va='top', size='x-large')
numx = fig.add_subplot(grid[0, 0])
numx.set_title('Edges per pulse')
agg_window = num_pulses // 60
max_num_edges = 0.0
max_spectral_intensity = 0
hist_axs = []
for edge_idx, edge_name in enumerate(['u1', 'u2', 'v1', 'v2', 'w1', 'w2', 'mcp']):
if edge_idx < 6:
row = 1 + edge_idx % 2
col = edge_idx // 2
else:
row = 0
col = np.s_[1:3]
ax = fig.add_subplot(grid[row, col])
ax.set_title(f'TOF spectrum: {edge_name}')
num_edges = np.isfinite(edges[:, edge_idx, :]).sum(axis=1)
num_edges = num_edges[:((len(num_edges) // agg_window) * agg_window)]
num_edges = num_edges.reshape(-1, agg_window).mean(axis=1)
if (num_edges == 0).all():
warning(f'No edges found for {det_name}/{edge_name}')
continue
if edge_idx < 6:
plot_kwargs = dict(c=f'C{edge_idx}', ls='solid', lw=1.0)
else:
plot_kwargs = dict(c='k', ls='dashed', lw=1.0)
numx.plot(np.arange(len(num_edges)) * agg_window, num_edges, label=edge_name, **plot_kwargs)
max_num_edges = max(max_num_edges, num_edges.max())
y, _, _ = ax.hist(finite_flattened_slice(edges, np.s_[:, edge_idx, :]),
bins=int((max_edge - min_edge) // 5), range=(min_edge, max_edge),
color=plot_kwargs['c'], histtype='step', linewidth=1)
hist_axs.append(ax)
max_spectral_intensity = max(max_spectral_intensity, y.max())
numx.tick_params(labelbottom=False)
numx.set_ylim(0, 1.2*max_num_edges)
for ax in hist_axs:
ax.set_ylim(0, max_spectral_intensity*1.1)
ax.ticklabel_format(axis='y', style='sci', scilimits=(0, 3))
pass
```
%% Cell type:markdown id: tags:
# Detector diagnostics
%% Cell type:code id: tags:
``` python
for i, det_name in enumerate(remi['detector'].keys()):
edges = det_data[det_name]['edges']
sort = remi.get_dld_sorter(det_name)
sum_shifts = sort.sum_shifts if sort.sum_shifts != (0.0, 0.0, 0.0) else None
is_valid = remi.get_presort_mask(edges, edge_idx=0, w=not quad_anode,
sum_limit=max(sort.uncorrected_time_sum_half_widths),
sum_shifts=sum_shifts)
if not is_valid.any():
warning(f'No valid preliminary edge combinations found for {det_name}')
signals, sums = remi.get_signals_and_sums(edges, indices=sort.channel_indices, sum_shifts=sum_shifts,
mask=is_valid)
fig = plot_detector_diagnostics(signals=signals, sums=sums, fig_num=30+i, im_scale=1.5,
sum_range=max(sort.uncorrected_time_sum_half_widths),
sorter=sort)
fig.text(0.02, 0.98, det_name.upper() + ' before corrections', rotation=90, ha='left', va='top', size='x-large')
if remi['detector'][det_name]['use_sum_correction'] or remi['detector'][det_name]['use_pos_correction']:
n_masked = is_valid.sum()
signals = np.full((n_masked, 3), np.nan, dtype=np.float64)
sums = np.full((n_masked, 3), np.nan, dtype=np.float64)
sort.correct(edges[is_valid], signals, sums)
fig = plot_detector_diagnostics(signals=signals, sums=sums, fig_num=40+i, im_scale=1.5,
sum_range=max(sort.uncorrected_time_sum_half_widths),
sorter=sort)
fig.text(0.02, 0.98, det_name.upper() + ' after corrections', rotation=90, ha='left', va='top', size='x-large')
pass
```
%% Cell type:markdown id: tags:
# Hit reconstruction
%% Cell type:code id: tags:
``` python
psh.set_default_context('processes', num_workers=remi.get_num_workers(mp_rec_hits))
for det_name, det in remi['detector'].items():
edges = det_data[det_name]['edges']
signals = psh.alloc(shape=(num_pulses, 50), dtype=signal_dt, fill=np.nan)
hits = psh.alloc(shape=(num_pulses, 50), dtype=hit_dt, fill=(np.nan, np.nan, np.nan, -1))
hit_counts = psh.alloc(shape=len(dc.train_ids), dtype=np.uint32)
def prepare_hit_worker(worker_id):
sort = remi.get_dld_sorter(det_name)
yield
@psh.with_init(prepare_hit_worker)
def reconstruct_hits(worker_id, index, train_id):
hit_counts[index] += sort.run_on_train(
edges, signals, hits, pulse_offsets[index], pulse_offsets[index] + pulse_counts[index])
with timing(f'rec_hits, {det_name}'):
psh.map(reconstruct_hits, dc.train_ids)
det_data[det_name].update(signals=signals, hits=hits, hit_counts=hit_counts)
```
%% Cell type:code id: tags:
``` python
fig, ax = plt.subplots(num=50+i, figsize=(9.5, 4), ncols=1, clear=True,
gridspec_kw=dict(top=0.92, right=0.98, left=0.05, bottom=0.12))
max_num_hits = 0.0
for det_name in remi['detector'].keys():
agg_window = num_pulses // min(1000, num_pulses)
num_hits = np.isfinite(det_data[det_name]['hits']['x']).sum(axis=1)
num_hits = num_hits[:(len(num_hits) // agg_window) * agg_window]
num_hits = num_hits.reshape(-1, agg_window).mean(axis=1)
max_num_hits = max(max_num_hits, num_hits.max())
ax.plot(np.arange(0, (num_pulses // agg_window) * agg_window, agg_window), num_hits,
lw=1, label=det_name.upper())
ax.set_title('Hits per pulse')
ax.set_xlabel('Pulse index')
ax.set_ylim(0, max_num_hits*1.1)
ax.legend()
pass
```
%% Cell type:markdown id: tags:
### Reconstruction methods
Each hit may be reconstructed by one of 20 different methods. These differ in the number of real signals across the channels that could be combined to form the hit. Each method is designated by a number between `0` and `19` (with empty hits using `-1`), which can be found in the `m` key of a hit, e.g.:
* `0`: All six anode signals and the corresponding MCP signal were found.
* `4`: One signal on layer `u` is missing, all other signals for this event were found.
* `18`: Only one anode signal on each layer was found and the MCP signal is missing. There is no way to check whether this combination of signals is actually valid.
| Method | Signals found (`u+v+w +mcp`) |
| - | - |
| 0 | `2+2+2 +1` |
| 1 | `0+2+2 +1` |
| 2 | `2+0+2 +1` |
| 3 | `2+2+0 +1` |
| 4 | `1+2+2 +1` (2 permutations) |
| 5 | `2+1+2 +1` (2 permutations) |
| 6 | `2+2+1 +1` (2 permutations) |
| 7 | `2+2+2 +0` |
| 8 | `0+2+2 +0` |
| 9 | `2+0+2 +0` |
| 10 | `2+2+0 +0` |
| 11 | `1+2+2 +0` (2 permutations) |
| 12 | `2+1+2 +0` (2 permutations) |
| 13 | `2+2+1 +0` (2 permutations) |
| 14 | `2+1+1 +1` `1+2+1 +1` `1+1+2 +1` (12 permutations) |
| 15 | `2+1+0 +1` `2+0+1 +1` `1+2+0 +1` `1+0+2 +1` `0+2+1 +1` `0+1+2 +1` (12 permutations) |
| 16 | `1+1+1 +1` (8 permutations) |
| 17 | `2+1+1 +0` `1+2+1 +0` `1+1+2 +0` (12 permutations) |
| 18 | `1+1+1 +0` (8 permutations) |
| 19 | `2+1+0 +0` `2+0+1 +0` `1+2+0 +0` `1+0+2 +0` `0+1+2 +0` `0+2+1 +0` (12 permutations) |
* For hits reconstructed with a method `> 10`, extra attention should be given to whether they actually add meaningful signal.
* Any method `> 14` has to be considered risky, because neither a time sum nor the position can be checked. If the scale factors and/or the `w` shift are not correct, the number of events reconstructed with these risky methods will increase. They are most likely *ghost hits*, which do not correspond to actual impacts on the detector.
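As a minimal sketch, the `m` field can also be queried directly to separate checkable from risky reconstructions in downstream analysis; the cell below performs the same aggregation graphically. This sketch assumes only the `det_data[...]['hits']` structured arrays filled in during hit reconstruction and the risky-method threshold (`> 14`) stated above.
%% Cell type:code id: tags:
``` python
# Sketch: tally hits per reconstruction-quality class for each detector.
# Assumes hits['m'] holds the method number, with -1 for empty entries,
# 0-14 for methods that allow a consistency check and 15-19 for risky ones.
for det_name in remi['detector'].keys():
    m = det_data[det_name]['hits']['m']
    real = m >= 0        # actual hits
    risky = m >= 15      # no time sum or position check possible
    n_real = int(real.sum())
    if n_real == 0:
        continue
    print(f'{det_name}: {n_real} hits, '
          f'{100 * (real & ~risky).sum() / n_real:.1f}% checkable, '
          f'{100 * risky.sum() / n_real:.1f}% risky')
```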
%% Cell type:code id: tags:
``` python
for i, det_name in enumerate(remi['detector'].keys()):
hits = det_data[det_name]['hits']
fig, ax = plt.subplots(num=60+i, figsize=(9.5, 5), ncols=1, clear=True,
gridspec_kw=dict(left=0.08, right=0.91, top=0.8))
fig.text(0.02, 0.98, det_name.upper(), rotation=90, ha='left', va='top', size='x-large')
if not (hits['m'] >= 0).any():
warning(f'No hits found for {det_name}')
continue
method_bins = np.bincount(hits['m'][hits['m'] >= 0], minlength=20)
ax.bar(np.arange(20), method_bins, width=0.5)
ax.set_xlabel('Reconstruction method')
ax.set_xlim(-0.5, 19.5)
ax.set_xticks(np.arange(20))
ax.set_ylabel('Number of hits')
ax.set_ylim(0, method_bins.max()*1.05)
ylims = ax.get_ylim()
ax.tick_params(which='both', right=True, labelright=True)
num_risky = method_bins[15:].sum()
num_total = method_bins.sum()
ax.text(14.2, method_bins.max(), f'{(100*(num_total-num_risky)/num_total):.2g}%',
va='top', ha='right', color='black')
ax.text(14.8, method_bins.max(), f'{(100*num_risky/num_total):.2g}%',
va='top', ha='left', color='red')
ax.fill([14.5, 19.5, 19.5, 14.5], [ylims[0], ylims[0], ylims[1], ylims[1]], c='r', alpha=0.2)
labelx = ax.twiny()
labelx.set_xlim(*ax.get_xlim())
labelx.set_xticks(ax.get_xticks())
labelx.set_xticklabels([
'2+2+2 +1',
'0+2+2 +1', '2+0+2 +1', '2+2+0 +1',
'1+2+2 +1', '2+1+2 +1', '2+2+1 +1',
'2+2+2 +0',
'0+2+2 +0', '2+0+2 +0', '2+2+0 +0', '1+2+2 +0', '2+1+2 +0', '2+2+1 +0',
'2+1+1 +1',
'2+1+0 +1',
'1+1+1 +1',
'2+1+1 +0',
'1+1+1 +0',
'2+1+0 +0',
], rotation=90)
min_rel_tick = np.ceil((ax.get_ylim()[0] / num_total) / 0.1) * 0.1
max_rel_tick = np.floor((method_bins.max() / num_total) / 0.1) * 0.1
rely = ax.twinx()
rely.set_ylim(*ax.get_ylim())
rely.set_yticks(np.arange(0.0, max_rel_tick+0.01, 0.1)*num_total)
rely.set_yticks(np.arange(0.0, ylims[1]/num_total, 0.02)*num_total, minor=True)
rely.set_yticklabels([f'{(y/num_total)*100:.0f}%' for y in rely.get_yticks()])
rely.set_ylabel('Percentage of total hits')
pass
```
%% Cell type:markdown id: tags:
### Detector image and fishes
%% Cell type:code id: tags:
``` python
for i, det_name in enumerate(remi['detector'].keys()):
flat_hits = det_data[det_name]['hits'].reshape(-1)
flat_hits = flat_hits[np.isfinite(flat_hits[:]['x'])]
flat_hits = flat_hits[flat_hits['m'] <= 10]
fig = plt.figure(num=70+i, figsize=(9, 13.5))
fig.text(0.02, 0.98, det_name.upper(), rotation=90, ha='left', va='top', size='x-large')
fig.text(0.02, 0.02, det_name.upper(), rotation=90, ha='left', va='bottom', size='x-large')
imp = fig.add_axes([0.1 + 0.25/2, 0.56, 0.6, 0.4])
txp = fig.add_axes([0.1, 0.28, 0.85, 0.22])
typ = fig.add_axes([0.1, 0.04, 0.85, 0.22])
if flat_hits.size == 0:
warning(f'No hits found for {det_name}')
continue
im_radius = remi['detector'][det_name]['mcp_radius']*1.1
imp.hist2d(flat_hits['x'], flat_hits['y'], bins=(256, 256),
range=[[-im_radius, im_radius], [-im_radius, im_radius]], norm=LogNorm())
imp.xaxis.set_label_position('top')
imp.set_xlabel('X / mm')
imp.set_ylabel('Y / mm')
imp.tick_params(right=True, labelright=True, top=True, labeltop=True)
imp.grid()
min_tof = flat_hits['t'].min()
max_tof = flat_hits['t'].max()
num_tof_bins = min(int((max_tof - min_tof) // 10), 500)
if num_tof_bins == 0:
warning(f'All TOFs limited to single bin for {det_name}')
continue
for ax, dim_label in zip([txp, typ], ['x', 'y']):
ax.hist2d(flat_hits['t'], flat_hits[dim_label], bins=(num_tof_bins, 256),
range=[[min_tof, max_tof], [-im_radius, im_radius]], norm=LogNorm())
ax.set_ylabel(f'{dim_label.upper()} / mm')
typ.set_xlabel('Time-of-flight / ns')
txp.tick_params(bottom=True, labelbottom=False, top=True, labeltop=True, right=True, labelright=True)
typ.tick_params(right=True, labelright=True, top=True)
pass
```
%% Cell type:markdown id: tags:
# Transformed data files
%% Cell type:code id: tags:
``` python
# Try to figure out proposal number from in_folder to work with older files.
m = re.match(r'p(\d{6})', Path(in_folder).parts[-2])
if not proposal and m is not None:
proposal = int(m[1])
seq_len = out_seq_len if out_seq_len > 0 else len(dc.files[0].train_ids)
dataset_kwargs = {k[8:]: v for k, v in locals().items() if k.startswith('dataset_compression')}
control_sources = [det_device_id.format(karabo_id=karabo_id, det_name=det_name.upper())
for det_name in remi['detector']]
channels = []
if save_raw_triggers or save_raw_edges:
channels.append('raw')
if save_rec_signals or save_rec_hits:
channels.append('rec')
instrument_channels = [
f'{device_id}:{det_output_key}/{channel}'
for device_id in control_sources
for channel in channels
]
```
%% Cell type:code id: tags:
``` python
Path(out_folder).mkdir(parents=True, exist_ok=True)
print('Writing sequence files', flush=True, end='')
t_write = timing('write_files')
t_write.__enter__()
for seq_id, train_mask, pulse_mask in sequence_pulses(dc.train_ids, pulse_counts, pulse_offsets, seq_len):
seq_train_ids = dc.train_ids[train_mask]
with DataFile.from_details(out_folder, out_aggregator, run, seq_id) as outp:
outp.create_metadata(like=dc, proposal=proposal, run=run, sequence=seq_id,
control_sources=control_sources, instrument_channels=instrument_channels)
outp.create_index(
seq_train_ids,
timestamps=dc.select_trains(by_id[seq_train_ids]).train_timestamps().astype(np.uint64)
)
for det_name in remi['detector']:
cur_device_id = det_device_id.format(karabo_id=karabo_id, det_name=det_name.upper())
cur_max_hits = remi['detector'][det_name]['max_hits']
cur_control_data = outp.create_control_source(cur_device_id)
# Manually manipulate the file here, still creates the index properly.
remi.attach_detector_config(det_name, cur_control_data.get_run_group())
cur_control_data.create_index(len(seq_train_ids))
cur_fast_data = outp.create_instrument_source(f'{cur_device_id}:{det_output_key}')
cur_data = det_data[det_name]
if save_raw_triggers:
cur_fast_data.create_key('raw.triggers', triggers[pulse_mask],
maxshape=(None,) + triggers.shape[1:],
chunks=tuple(chunks_triggers), **dataset_kwargs)
if save_raw_edges:
cur_fast_data.create_key('raw.edges', cur_data['edges'][pulse_mask],
maxshape=(None,) + cur_data['edges'].shape[1:],
chunks=tuple(chunks_edges if chunks_edges[-1] <= cur_max_hits
else chunks_edges[:-1] + [cur_max_hits]),
**dataset_kwargs)
if save_raw_amplitudes:
cur_fast_data.create_key('raw.amplitudes', cur_data['amplitudes'][pulse_mask],
maxshape=(None,) + cur_data['amplitudes'].shape[1:],
chunks=tuple(chunks_amplitudes if chunks_amplitudes[-1] <= cur_max_hits
else chunks_amplitudes[:-1] + [cur_max_hits]),
**dataset_kwargs)
if save_rec_signals:
cur_fast_data.create_key('rec.signals', cur_data['signals'][pulse_mask],
maxshape=(None,) + cur_data['signals'].shape[1:],
chunks=tuple(chunks_signals if chunks_signals[-1] <= cur_max_hits
else chunks_signals[:-1] + [cur_max_hits]),
**dataset_kwargs)
if save_rec_hits:
cur_fast_data.create_key('rec.hits', cur_data['hits'][pulse_mask],
maxshape=(None,) + cur_data['hits'].shape[1:],
chunks=tuple(chunks_hits if chunks_hits[-1] <= cur_max_hits
else chunks_hits[:-1] + [cur_max_hits]),
**dataset_kwargs)
cur_fast_data.create_index(raw=pulse_counts[train_mask], rec=pulse_counts[train_mask])
print('.', flush=True, end='')
print('')
t_write.__exit__()
```
......
......@@ -63,7 +63,7 @@ install_requires = [
"docutils==0.17.1",
"dynaconf==3.1.4",
"env_cache==0.1",
"extra_data==1.12.0",
"extra_data==1.15.1",
"extra_geom==1.10.0",
"gitpython==3.1.0",
"h5py==3.5.0",
......@@ -110,7 +110,7 @@ if "readthedocs.org" not in sys.executable:
install_requires += [
"iCalibrationDB @ git+ssh://git@git.xfel.eu:10022/detectors/cal_db_interactive.git@2.4.1", # noqa
"XFELDetectorAnalysis @ git+ssh://git@git.xfel.eu:10022/karaboDevices/pyDetLib.git@2.7.0", # noqa
"CalParrot @ git+ssh://git@git.xfel.eu:10022/calibration/calparrot.git@0.2", # noqa
"CalParrot @ git+ssh://git@git.xfel.eu:10022/calibration/calparrot.git@0.3", # noqa
]
setup(
......
......@@ -9,6 +9,16 @@ import h5py
import numpy as np
def get_num_cells(fname, h5path):
with h5py.File(fname, "r") as f:
cells = f[f"{h5path}/cellId"][()]
if cells.size == 0:
return
maxcell = np.max(cells)
return maxcell+1
def get_pulseid_checksum(fname, h5path, h5path_idx):
"""generates hash value from pulse pattern (veto defined)."""
with h5py.File(fname, "r") as infile:
......
......@@ -584,11 +584,7 @@ automated_test_config = {
"run": "9028", # Original run: "1723",
"karabo-id": "SCS_DET_DSSC1M-1",
"slow-data-path": "SCS_CDIDET_DSSC/FPGA/PPT_Q",
"slow-data-aggregators":
- 1
- 2
- 3
- 4
"slow-data-aggregators": [1, 2, 3, 4]
},
"reference-folder": "{}/{}/{}",
},
......@@ -602,6 +598,7 @@ automated_test_config = {
"run": "9028", # Original run: "1723",
"karabo-id": "SCS_DET_DSSC1M-1",
"slow-data-path": "SCS_CDIDET_DSSC/FPGA/PPT_Q",
"slow-data-aggregators": [1, 2, 3, 4]
},
"reference-folder": "{}/{}/{}",
},
......