%% Cell type:markdown id:0393ed18-b4e5-499b-bdd3-c9e5f24b9627 tags:
# Timepix3
Author: Björn Senfftleben / Philipp Schmidt, Version: 1.0
The following notebook provides centroiding for data acquired with the Timepix3 camera detector (ASI TPX3CAM).
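In essence, pixel hits are grouped with DBSCAN in the combined (x, y, scaled ToA) space and each cluster is reduced to a ToT-weighted centroid. The following is a minimal, self-contained sketch of that idea on made-up hit values; it is an illustration only and is not used by the cells below.
``` python
import numpy as np
from sklearn.cluster import DBSCAN

# Three synthetic hits that belong to the same particle impact.
x = np.array([10.0, 11.0, 10.0])
y = np.array([20.0, 20.0, 21.0])
toa = np.array([1.00e-7, 1.05e-7, 1.10e-7])  # time of arrival in seconds
tot = np.array([5.0, 9.0, 4.0])              # time over threshold (charge proxy)

# Scale ToA so that 100 ns counts like one pixel, then cluster in 3D.
coords = np.column_stack((x, y, toa * 1e7))
labels = DBSCAN(eps=2.0, min_samples=2).fit(coords).labels_

# Reduce the cluster to a single ToT-weighted centroid.
sel = labels == 0
print(np.average(x[sel], weights=tot[sel]),
      np.average(y[sel], weights=tot[sel]))
```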
%% Cell type:code id:9484ee10 tags:
``` python
# Data selection parameters.
run = 307 # required
in_folder = '/gpfs/exfel/exp/SQS/202430/p900421/raw' # required
out_folder = '/gpfs/exfel/exp/SQS/202430/p900421/scratch/cal_test' # required
proposal = '' # Proposal, leave empty for auto detection based on in_folder
# These parameters are required by xfel-calibrate but ignored in this notebook.
cal_db_timeout = 0 # Calibration DB timeout, currently not used.
cal_db_interface = 'foo' # Calibration DB interface, currently not used.
karabo_da = 'bar' # Karabo data aggregator name, currently not used
karabo_id = 'SQS_EXP_TIMEPIX'
in_fast_data = '{karabo_id}/DET/TIMEPIX3:daqOutput.chip0'
out_device_id = '{karabo_id}/CAL/TIMEPIX3'
out_fast_data = '{karabo_id}/CAL/TIMEPIX3:daqOutput.chip0'
out_aggregator = 'TIMEPIX01'
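# Number of trains per sequence file written to out_folder.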
out_seq_len = 2000
max_num_centroids = 10000 # Maximum number of centroids per train
chunks_centroids = [1, 5000] # Chunking of centroid data
dataset_compression = 'gzip' # HDF compression method.
dataset_compression_opts = 3 # HDF GZIP compression level.
clustering_epsilon = 2.0 # centroiding: The maximum distance between two samples for one to be considered as in the neighborhood of the other
clustering_tof_scale = 1e7 # centroiding: Scaling factor for the ToA axis so that the epsilon parameter in DBSCAN works in all 3 dimensions
clustering_min_samples = 2 # centroiding: minimum number of samples necessary for a cluster
clustering_n_jobs = 1 # centroiding: (DBSCAN) The number of parallel jobs to run.
threshold_tot = 0 # raw data: minimum ToT necessary for a pixel to contain valid data
raw_timewalk_lut_filepath = '' # fpath to look up table for timewalk correction relative to proposal path or empty string,
centroiding_timewalk_lut_filepath = '' # fpath to look up table for timewalk correction relative to proposal path or empty string.
```
%% Cell type:code id:524fe654-e112-4abe-813c-a0be9b3a3034 tags:
``` python
from datetime import datetime
from pathlib import Path
from time import monotonic
from os import cpu_count
from warnings import warn
import numpy as np
import scipy.ndimage as nd
import h5py
import pasha as psh
from sklearn.cluster import DBSCAN
from extra_data import RunDirectory
from extra_data.read_machinery import find_proposal
from cal_tools.files import DataFile, sequence_pulses
%matplotlib inline
```
%% Cell type:code id:e36f997c-4b66-4b11-99a8-5887e3572f56 tags:
``` python
# centroiding
error_msgs = {
-1: "tpx_data has an invalid structure - ignore provided data",
-2: "tpx_data arrays are of invalid lengths - ignore provided data",
-3: "tpx_data arrays are empty"
}
def check_data(tpx_data):
required_keys = ["x", "y", "toa", "tot"]
for key in required_keys:
if key not in tpx_data.keys():
warn("tpx data must contain the keys %s, but key %s not in tpx data keys (%s)" % (required_keys, key, list(tpx_data.keys())),
category=UserWarning)
return -1
reference_n_samples_key = "x"
n_samples = len(tpx_data[reference_n_samples_key])
for key in tpx_data.keys():
if n_samples != len(tpx_data[key]):
warn("arrays in tpx data must be of same length ( len(tpx_data[%s])=%i!=%i=(len(tpx_data[%s]) )" % (reference_n_samples_key, n_samples, len(tpx_data[key]), key),
category=UserWarning)
return -2
if n_samples == 0:
warn("no samples were provides with tpx data", category=UserWarning)
return -3
return 0
def apply_single_filter(tpx_data, _filter):
"""
Simple function to apply a selecting or sorting filter to a dictionary of equally sized arrays
Note: no copy of the dictionary is made; since it is mutable, the input data is modified in place!
Parameters
----------
tpx_data: dictionary with timepix data, all arrays behind each key must be of same length
_filter: 1d array or list of integers or booleans or np.s_ to select or sort data like a = a[_filter]
Returns
-------
tpx_data: like input tpx_data but with applied filter
"""
try:
for key in tpx_data.keys():
tpx_data[key] = np.array(tpx_data[key])[_filter]
except Exception as e:
print(_filter)
print(_filter.dtype)
print(_filter.shape)
print(tpx_data[key].shape)
raise e
return tpx_data
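# Example (illustration only, arbitrary threshold): keep only hits with ToT of at least 5
#   tpx_data = apply_single_filter(tpx_data, tpx_data["tot"] >= 5)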
def pre_clustering_filter(tpx_data, tot_threshold=0):
"""
Collection of filters directly applied before clustering.
Note: no copy of the dictionary is made; since it is mutable, the input data is modified in place!
Parameters
----------
tpx_data: Dictionary with timepix data, all arrays behind each key must be of same length
tot_threshold: minimum ToT required for a pixel to contain valid data
Returns
-------
tpx_data: like input tpx_data but with applied filters
"""
if tot_threshold > 0:
tpx_data = apply_single_filter(tpx_data, tpx_data["tot"] >= tot_threshold)
return tpx_data
def post_clustering_filter(tpx_data):
"""
Collection of filters directly applied after clustering.
Note: no copy of the dictionary is made; since it is mutable, the input data is modified in place!
Parameters
----------
tpx_data: Dictionary with timepix data, all arrays behind each key must be of same length, now with key labels
Returns
-------
tpx_data: like input tpx_data but with applied filters
"""
if tpx_data["labels"] is not None:
tpx_data = apply_single_filter(tpx_data, tpx_data["labels"] != 0)
return tpx_data
def clustering(tpx_data, epsilon=2, tof_scale=1e7, min_samples=3, n_jobs=1):
"""
Parameters
----------
tpx_data Dictionary with timepix data, all arrays behind each key must be of same length, now with key labels
epsilon The maximum distance between two samples for one to be considered as in the neighborhood of the other.
This is not a maximum bound on the distances of points within a cluster. This is the most important
DBSCAN parameter to choose appropriately for your data set and distance function.
tof_scale Scaling factor for the ToA data so that the epsilon parameter in DBSCAN works not only in the x/y
axes, but also in the ToA axis. It converts ToA in s into "ToA pixels" -> e.g. tof_scale=1e7 means
that 100 ns is considered comparable to 1 spatial pixel.
min_samples The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.
This includes the point itself.
n_jobs The number of parallel jobs to run. None means 1 unless in a joblib.parallel_backend context.
-1 means using all processors. See Glossary for more details.
Returns
-------
"""
coords = np.column_stack((tpx_data["x"], tpx_data["y"], tpx_data["toa"]*tof_scale))
dist = DBSCAN(eps=epsilon, min_samples=min_samples, metric="euclidean", n_jobs=n_jobs).fit(coords)
return dist.labels_
def empty_centroid_data():
return {
"x": np.array([]),
"y": np.array([]),
"toa": np.array([]),
"tot": np.array([]),
"tot_avg": np.array([]),
"tot_max": np.array([]),
"size": np.array([]),
}
def get_centroids(tpx_data, timewalk_lut=None):
centroid_data = empty_centroid_data()
cluster_labels, cluster_size = np.unique(tpx_data["labels"], return_counts=True)
cluster_tot_peaks = np.array(nd.maximum_position(tpx_data["tot"], labels=tpx_data["labels"], index=cluster_labels)).ravel()
cluster_tot_integrals = nd.sum(tpx_data["tot"], labels=tpx_data["labels"], index=cluster_labels)
# compute centroid center through weighted average
centroid_data["x"] = np.array(nd.sum(tpx_data["x"] * tpx_data["tot"], labels=tpx_data["labels"], index=cluster_labels) / cluster_tot_integrals).ravel()
centroid_data["y"] = np.array(nd.sum(tpx_data["y"] * tpx_data["tot"], labels=tpx_data["labels"], index=cluster_labels) / cluster_tot_integrals).ravel()
centroid_data["toa"] = np.array(nd.sum(tpx_data["toa"] * tpx_data["tot"], labels=tpx_data["labels"], index=cluster_labels) / cluster_tot_integrals).ravel()
# intensity & size information
centroid_data["tot_avg"] = np.array(nd.mean(tpx_data["tot"], labels=tpx_data["labels"], index=cluster_labels))
centroid_data["tot_max"] = tpx_data["tot"][cluster_tot_peaks]
centroid_data["tot"] = np.array(cluster_tot_integrals)
centroid_data["size"] = cluster_size
# train ID information
# ~ centroid_data["tid"] = tpx_data["tid"][cluster_tot_peaks]
# correct for timewalk if provided
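# Note: the LUT is assumed to be indexed by ToT in 25 ns steps; its values are
# scaled by 1e3 below to match the ToA units used here.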
if timewalk_lut is not None:
centroid_data["toa"] -= timewalk_lut[np.int_(centroid_data["tot_max"] // 25) - 1] * 1e3
return centroid_data
def compute_centroids(x, y, tof, tot,
threshold_tot=0,
clustering_epsilon=2,
clustering_tof_scale=1e7,
clustering_min_samples=3,
clustering_n_jobs=1,
centroiding_timewalk_lut=None):
# format input data
_tpx_data = {
"x": x,
"y": y,
"toa": tof,
"tot": tot
"x": x.astype(float),
"y": y.astype(float),
"toa": tof.astype(float),
"tot": tot.astype(float)
}
# ensure that valid data is available
data_validation = check_data(_tpx_data)
if data_validation < 0:
if data_validation in error_msgs.keys():
print("Data validation failed with message: %s" % error_msgs[data_validation])
else:
print("Data validation failed: unknown reason")
return np.array([]), empty_centroid_data()
# clustering (identify clusters in 2d data (x,y,tof) that belong to a single hit,
# each sample belonging to a cluster is labeled with an integer cluster id no)
if threshold_tot > 0:
_tpx_data = apply_single_filter(_tpx_data, _tpx_data["tot"] >= threshold_tot)
labels = clustering(_tpx_data, epsilon=clustering_epsilon, tof_scale=clustering_tof_scale, min_samples=clustering_min_samples)
_tpx_data["labels"] = labels
if labels is not None:
_tpx_data = apply_single_filter(_tpx_data, labels >= 0)
# compute centroid data (reduce cluster of samples to a single point with properties)
if _tpx_data["labels"] is None or _tpx_data["labels"].size == 0:
if labels is None or len(_tpx_data['x']) == 0:
# handle case of no identified clusters: return empty labels and an empty centroid dictionary with the expected keys
return np.array([]), empty_centroid_data()
return labels, get_centroids(_tpx_data, timewalk_lut=centroiding_timewalk_lut)
def process_train(worker_id, index, train_id, data):
events = data[in_fast_data]
sel = np.s_[:events['data.size']]
x = events['data.x'][sel]
y = events['data.y'][sel]
tot = events['data.tot'][sel]
toa = events['data.toa'][sel]
if raw_timewalk_lut is not None:
toa -= raw_timewalk_lut[np.int_(tot // 25) - 1] * 1e3
labels, centroids = compute_centroids(x, y, toa, tot, **centroiding_kwargs)
num_centroids = len(centroids['x'])
fraction_centroids = np.sum(centroids["size"])/events['data.size'] if events['data.size']>0 else np.nan
missing_centroids = num_centroids > max_num_centroids
if num_centroids > max_num_centroids:
warn('Number of centroids is larger than the defined maximum, some data cannot be written to disk')
for key in centroid_dt.names:
out_data[index, :num_centroids][key] = centroids[key][:max_num_centroids]
out_labels[index, :len(labels)] = labels
out_stats[index]["fraction_px_in_centroids"] = fraction_centroids
out_stats[index]["N_centroids"] = num_centroids
out_stats[index]["missing_centroids"] = missing_centroids
```
%% Cell type:code id:56306c15-513e-4e7f-9c47-c52ca61b27a8 tags:
``` python
dc = RunDirectory(Path(in_folder) / f'r{run:04d}', inc_suspect_trains=True)
base_path=find_proposal("p%06i" % int(dc.run_metadata()['proposalNumber']))
proposal = list(filter(None, in_folder.strip('/').split('/')))[-2]
base_path=find_proposal(proposal)
if raw_timewalk_lut_filepath:
raw_timewalk_lut_filepath_full = (Path(base_path) / Path(raw_timewalk_lut_filepath)).resolve()
raw_timewalk_lut = np.load(raw_timewalk_lut_filepath_full)
else:
raw_timewalk_lut = None
if centroiding_timewalk_lut_filepath:
centroiding_timewalk_lut_filepath_full = (Path(base_path) / Path(centroiding_timewalk_lut_filepath)).resolve()
centroiding_timewalk_lut = np.load(centroiding_timewalk_lut_filepath_full)
else:
centroiding_timewalk_lut = None
```
%% Cell type:code id:8c28a4c8-961c-496b-80da-7fd867e5b0d3 tags:
``` python
in_fast_data = in_fast_data.format(karabo_id=karabo_id)
out_device_id = out_device_id.format(karabo_id=karabo_id)
out_fast_data = out_fast_data.format(karabo_id=karabo_id)
Path(out_folder).mkdir(exist_ok=True, parents=True)
in_dc = dc.select(in_fast_data, require_all=True)
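# Collect the dataset_compression* parameters above into HDF5 dataset keyword
# arguments, e.g. {'compression': 'gzip', 'compression_opts': 3}.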
dataset_kwargs = {k[8:]: v for k, v in locals().items() if k.startswith('dataset_compression')}
centroid_dt = np.dtype([('x', np.float64),
('y', np.float64),
('toa', np.float64),
('tot', np.float64),
('tot_avg', np.float64),
('tot_max', np.uint16),
('size', np.int16)])
pixel_shape = in_dc[in_fast_data]['data.x'].entry_shape
centroid_settings_template = {
'timewalk_correction.raw_applied': (bool, bool(raw_timewalk_lut_filepath)),
'timewalk_correction.raw_file': ("S100", str(raw_timewalk_lut_filepath)[-100:]),
'timewalk_correction.centroiding_applied': (bool, bool(centroiding_timewalk_lut_filepath)),
'timewalk_correction.centroiding_file': ("S100", str(centroiding_timewalk_lut_filepath)[-100:]),
'clustering.epsilon': (np.float64, float(clustering_epsilon)),
'clustering.tof_scale': (np.float64, float(clustering_tof_scale)),
'clustering.min_samples': (np.int16, int(clustering_min_samples)),
'threshold_tot': (np.int16, int(threshold_tot)),
}
centroid_stats_template = {
'N_centroids': (int, -1),
'missing_centroids': (bool, False),
'fraction_px_in_centroids': (np.float64, np.nan),
}
centroid_settings_dt = np.dtype([(key, centroid_settings_template[key][0]) for key in centroid_settings_template])
centroid_stats_dt = np.dtype([(key, centroid_stats_template[key][0]) for key in centroid_stats_template])
centroiding_kwargs = dict(
threshold_tot=threshold_tot,
clustering_epsilon=clustering_epsilon,
clustering_tof_scale=clustering_tof_scale,
clustering_min_samples=clustering_min_samples,
clustering_n_jobs=clustering_n_jobs,
centroiding_timewalk_lut=centroiding_timewalk_lut)
psh.set_default_context('processes', num_workers=(num_workers := cpu_count() // 4))
print(f'Computing centroids with {num_workers} workers and writing to file', flush=True, end='')
start = monotonic()
for seq_id, seq_dc in enumerate(in_dc.split_trains(trains_per_part=out_seq_len)):
train_ids = seq_dc.train_ids
m_data_sources = []
with DataFile.from_details(out_folder, out_aggregator, run, seq_id) as seq_file:
# No support needed for old EXDF files.
seq_file.create_metadata(like=in_dc, sequence=seq_id,
control_sources=[out_device_id],
instrument_channels=[f'{out_fast_data}/data'])
seq_file.create_index(train_ids)
out_labels = psh.alloc(shape=(len(train_ids),) + pixel_shape, dtype=np.int32)
out_data = psh.alloc(shape=(len(train_ids), max_num_centroids), dtype=centroid_dt)
out_stats = psh.alloc(shape=(len(train_ids),), dtype=centroid_stats_dt)
out_labels[:] = -1
out_data[:] = (np.nan, np.nan, np.nan, np.nan, np.nan, 0, -1)
out_stats[:] = tuple([centroid_stats_template[key][1] for key in centroid_stats_template])
psh.map(process_train, seq_dc)
# Create sources.
cur_slow_data = seq_file.create_control_source(out_device_id)
cur_fast_data = seq_file.create_instrument_source(out_fast_data)
# Add source indices.
cur_slow_data.create_index(len(train_ids))
cur_fast_data.create_index(data=np.ones_like(train_ids))
for key, (type_, data) in centroid_settings_template.items():
cur_slow_data.create_run_key(f'settings.{key}', data)
cur_fast_data.create_key('data.labels', data=out_labels,
chunks=(1,) + pixel_shape, **dataset_kwargs)
cur_fast_data.create_key('data.centroids', out_data,
chunks=tuple(chunks_centroids),
**dataset_kwargs)
cur_fast_data.create_key('data.stats', out_stats)
print('.', flush=True, end='')
end = monotonic()
print('')
print(f'{end-start:.01f}s')
```
......
%% Cell type:markdown id: tags:
# ePix100 Data Correction
Author: European XFEL Detector Group, Version: 2.0
The following notebook provides data correction of images acquired with the ePix100 detector.
The sequence of corrections applied is:
Offset --> Common Mode Noise --> Relative Gain --> Charge Sharing --> Absolute Gain.
Offset, common mode and gain corrected data is saved to /data/image/pixels in the CORR files.
If pattern classification is applied (charge sharing correction), this data will be saved to /data/image/pixels_classified, while the corresponding patterns will be saved to /data/image/patterns in the CORR files.
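Schematically, each frame passes through the chain below. This is a simplified, self-contained sketch with dummy arrays and placeholder constants to illustrate the order of operations only; it is not the implementation used in the cells that follow.
``` python
import numpy as np

frame = np.zeros((708, 768), dtype=np.float32)   # raw frame in ADU
offset = np.zeros_like(frame)                    # OffsetEPix100 dark constant
rel_gain_map = np.ones_like(frame)               # derived from RelativeGainEPix100
gain_cnst = 1.0                                  # absolute gain scale (ADU -> keV)

d = frame - offset                 # 1. offset correction
# 2. common mode correction would subtract block/row/column baselines here
d = d * rel_gain_map               # 3. relative gain correction
# 4. charge sharing: clustered events go to /data/image/pixels_classified
d = d * gain_cnst                  # 5. absolute gain correction (ADU -> keV or photons)
```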
%% Cell type:code id: tags:
``` python
in_folder = "/gpfs/exfel/exp/HED/202102/p002739/raw" # input folder, required
out_folder = "" # output folder, required
metadata_folder = "" # Directory containing calibration_metadata.yml when run by xfel-calibrate
sequences = [-1] # sequences to correct, set to -1 for all, range allowed
sequences_per_node = 1 # number of sequence files per cluster node if run as slurm job, set to 0 to not run SLURM parallel
run = 38 # which run to read data from, required
# Parameters for accessing the raw data.
karabo_id = "HED_IA1_EPX100-1" # karabo karabo_id
karabo_da = "EPIX01" # data aggregators
db_module = "" # module id in the database
receiver_template = "RECEIVER" # detector receiver template for accessing raw data files
path_template = 'RAW-R{:04d}-{}-S{{:05d}}.h5' # the template to use to access data
instrument_source_template = '{}/DET/{}:daqOutput' # instrument detector data source in h5files
# Parameters affecting writing corrected data.
chunk_size_idim = 1 # H5 chunking size of output data
limit_trains = 0 # Process only first N images, 0 - process all.
# Parameters for the calibration database.
cal_db_interface = "tcp://max-exfl-cal001:8015#8025" # calibration DB interface to use
cal_db_timeout = 300000 # timeout on caldb requests
creation_time = "" # The timestamp to use with Calibration DBe. Required Format: "YYYY-MM-DD hh:mm:ss" e.g. 2019-07-04 11:02:41
# Conditions for retrieving calibration constants.
bias_voltage = 200 # bias voltage
in_vacuum = False # detector operated in vacuum
integration_time = -1 # Detector integration time, Default value -1 to use the value from the slow data.
fix_temperature = -1 # fixed temperature value in Kelvin, Default value -1 to use the value from files.
gain_photon_energy = 8.048 # Photon energy used for gain calibration
photon_energy = 0. # Photon energy to calibrate in number of photons, 0 for calibration in keV
# Flags to select type of applied corrections.
pattern_classification = True # do clustering.
relative_gain = True # Apply relative gain correction.
absolute_gain = True # Apply absolute gain correction (implies relative gain).
common_mode = True # Apply common mode correction.
# Parameters affecting applied correction.
cm_min_frac = 0.25 # No CM correction is performed if after masking the ratio of good pixels falls below this
cm_noise_sigma = 5. # CM correction noise standard deviation
split_evt_primary_threshold = 7. # primary threshold for split event correction
split_evt_secondary_threshold = 5. # secondary threshold for split event correction
split_evt_mip_threshold = 1000. # minimum ionizing particle threshold
def balance_sequences(in_folder, run, sequences, sequences_per_node, karabo_da):
from xfel_calibrate.calibrate import balance_sequences as bs
return bs(in_folder, run, sequences, sequences_per_node, karabo_da)
```
%% Cell type:code id: tags:
``` python
import tabulate
import warnings
from logging import warning
from sys import exit
import h5py
import pasha as psh
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Latex, Markdown, display
from extra_data import RunDirectory, H5File
from extra_geom import Epix100Geometry
from mpl_toolkits.axes_grid1 import make_axes_locatable
from pathlib import Path
import cal_tools.restful_config as rest_cfg
from XFELDetAna import xfelpycaltools as xcal
from cal_tools.calcat_interface import EPIX100_CalibrationData, CalCatError
from cal_tools.epix100 import epix100lib
from cal_tools.files import DataFile
from cal_tools.tools import (
calcat_creation_time,
write_constants_fragment,
)
from cal_tools.step_timing import StepTimer
warnings.filterwarnings('ignore')
prettyPlotting = True
%matplotlib inline
```
%% Cell type:code id: tags:
``` python
x = 708 # rows of the ePix100
y = 768 # columns of the ePix100
if absolute_gain:
relative_gain = True
plot_unit = 'ADU'
```
%% Cell type:code id: tags:
``` python
prop_str = in_folder[in_folder.find('/p')+1:in_folder.find('/p')+8]
in_folder = Path(in_folder)
out_folder = Path(out_folder)
out_folder.mkdir(parents=True, exist_ok=True)
run_folder = in_folder / f"r{run:04d}"
instrument_src = instrument_source_template.format(
karabo_id, receiver_template)
print(f"Correcting run: {run_folder}")
print(f"Instrument H5File source: {instrument_src}")
print(f"Data corrected files are stored at: {out_folder}")
```
%% Cell type:code id: tags:
``` python
creation_time = calcat_creation_time(in_folder, run, creation_time)
print(f"Using {creation_time.isoformat()} as creation time")
```
%% Cell type:code id: tags:
``` python
run_dc = RunDirectory(run_folder, _use_voview=False)
seq_files = [Path(f.filename) for f in run_dc.select(f"*{karabo_id}*").files]
# If a subset of sequences was requested,
# adapt the seq_files list accordingly.
if sequences != [-1]:
seq_files = [f for f in seq_files if any(f.match(f"*-S{s:05d}.h5") for s in sequences)]
if not len(seq_files):
raise IndexError("No sequence files available for the selected sequences.")
print(f"Processing a total of {len(seq_files)} sequence files")
```
%% Cell type:code id: tags:
``` python
step_timer = StepTimer()
```
%% Cell type:code id: tags:
``` python
step_timer.start()
sensorSize = [x, y]
# Sensor area will be analysed according to blocksize
blockSize = [sensorSize[0]//2, sensorSize[1]//2]
xcal.defaultBlockSize = blockSize
memoryCells = 1 # ePIX has no memory cells
run_parallel = False
# Read control data.
ctrl_data = epix100lib.epix100Ctrl(
run_dc=run_dc,
instrument_src=instrument_src,
ctrl_src=f"{karabo_id}/DET/CONTROL",
)
if integration_time < 0:
integration_time = ctrl_data.get_integration_time()
integration_time_str_add = ""
else:
integration_time_str_add = "(manual input)"
if fix_temperature < 0:
temperature = ctrl_data.get_temprature()
temperature_k = temperature + 273.15
temp_str_add = ""
else:
temperature_k = fix_temperature
temperature = fix_temperature - 273.15
temp_str_add = "(manual input)"
print(f"Bias voltage is {bias_voltage} V")
print(f"Detector integration time is set to {integration_time} \u03BCs {integration_time_str_add}")
print(f"Mean temperature: {temperature:0.2f}°C / {temperature_k:0.2f} K {temp_str_add}")
print(f"Operated in vacuum: {in_vacuum}")
```
%% Cell type:code id: tags:
``` python
# Table of sequence files to process
table = [(k, f) for k, f in enumerate(seq_files)]
if len(table):
md = display(Latex(tabulate.tabulate(
table,
tablefmt='latex',
headers=["#", "file"]
)))
```
%% Cell type:markdown id: tags:
## Retrieving calibration constants
As a first step, dark maps have to be loaded.
%% Cell type:code id: tags:
``` python
epix_cal = EPIX100_CalibrationData(
detector_name=karabo_id,
sensor_bias_voltage=bias_voltage,
integration_time=integration_time,
sensor_temperature=temperature_k,
in_vacuum=in_vacuum,
source_energy=gain_photon_energy,
event_at=creation_time,
client=rest_cfg.calibration_client(),
)
const_metadata = epix_cal.metadata(calibrations=epix_cal.dark_calibrations)
if relative_gain:
try:
metadata = epix_cal.metadata(epix_cal.illuminated_calibrations)
for key, value in metadata.items():
const_metadata.setdefault(key, {}).update(value)
except CalCatError as e:
warning(f"CalCatError: {e}")
# Display retrieved calibration constants timestamps
epix_cal.display_markdown_retrieved_constants(metadata=const_metadata)
# Load the constant data from files
const_data = epix_cal.ndarray_map(metadata=const_metadata)[karabo_da]
# Validate the constants availability and raise/warn correspondingly.
missing_dark_constants = {"OffsetEPix100", "NoiseEPix100"} - set(const_data)
if missing_dark_constants:
raise ValueError(
f"Dark constants {missing_dark_constants} are not available to correct {karabo_da}."
"No correction is performed!")
if relative_gain and "RelativeGainEPix100" not in const_data.keys():
warning("RelativeGainEPix100 is not found in the calibration database.")
relative_gain = False
absolute_gain = False
```
%% Cell type:code id: tags:
``` python
# Record constant details in YAML metadata
write_constants_fragment(
out_folder=(metadata_folder or out_folder),
det_metadata=const_metadata,
caldb_root=epix_cal.caldb_root,
)
```
%% Cell type:code id: tags:
``` python
# Initializing some parameters.
hscale = 1
stats = True
bins = np.arange(-50,1000)
hist = {'O': 0} # dictionary to store histograms
```
%% Cell type:code id: tags:
``` python
if common_mode:
commonModeBlockSize = [x//2, y//8]
cmCorrectionB = xcal.CommonModeCorrection(
shape=sensorSize,
blockSize=commonModeBlockSize,
orientation='block',
nCells=memoryCells,
noiseMap=const_data['NoiseEPix100'],
runParallel=run_parallel,
parallel=run_parallel,
stats=stats,
minFrac=cm_min_frac,
noiseSigma=cm_noise_sigma,
)
cmCorrectionR = xcal.CommonModeCorrection(
shape=sensorSize,
blockSize=commonModeBlockSize,
orientation='row',
nCells=memoryCells,
noiseMap=const_data['NoiseEPix100'],
runParallel=run_parallel,
parallel=run_parallel,
stats=stats,
minFrac=cm_min_frac,
noiseSigma=cm_noise_sigma,
)
cmCorrectionC = xcal.CommonModeCorrection(
shape=sensorSize,
blockSize=commonModeBlockSize,
orientation='col',
nCells=memoryCells,
noiseMap=const_data['NoiseEPix100'],
runParallel=run_parallel,
parallel=run_parallel,
stats=stats,
minFrac=cm_min_frac,
noiseSigma=cm_noise_sigma,
)
hist['CM'] = 0
```
%% Cell type:code id: tags:
``` python
if relative_gain:
# gain constant is given by the mode of the gain map
# because all bad pixels are masked using this value
_vals,_counts = np.unique(const_data["RelativeGainEPix100"], return_counts=True)
gain_cnst = _vals[np.argmax(_counts)]
gainCorrection = xcal.RelativeGainCorrection(
sensorSize,
gain_cnst/const_data["RelativeGainEPix100"][..., None],
nCells=memoryCells,
parallel=run_parallel,
blockSize=blockSize,
gains=None,
)
hist['RG'] = 0
if absolute_gain:
hscale = gain_cnst
plot_unit = 'keV'
if photon_energy > 0:
plot_unit = '$\gamma$'
hscale /= photon_energy
hist['AG'] = 0
```
%% Cell type:code id: tags:
``` python
if pattern_classification :
patternClassifier = xcal.PatternClassifier(
[x, y],
const_data["NoiseEPix100"],
split_evt_primary_threshold,
split_evt_secondary_threshold,
split_evt_mip_threshold,
tagFirstSingles=0,
nCells=memoryCells,
allowElongated=False,
blockSize=[x, y],
parallel=run_parallel,
)
hist['CS'] = 0
hist['S'] = 0
```
%% Cell type:markdown id: tags:
## Applying corrections
%% Cell type:code id: tags:
``` python
def correct_train(wid, index, tid, d):
d = d[..., np.newaxis].astype(np.float32)
d = np.compress(
np.any(d > 0, axis=(0, 1)), d, axis=2)
# Offset correction.
d -= const_data["OffsetEPix100"]
hist['O'] += np.histogram(d,bins=bins)[0]
# Common Mode correction.
if common_mode:
# Block CM
d = cmCorrectionB.correct(d)
# Row CM
d = cmCorrectionR.correct(d)
# COL CM
d = cmCorrectionC.correct(d)
hist['CM'] += np.histogram(d,bins=bins)[0]
# Relative gain correction.
if relative_gain:
d = gainCorrection.correct(d)
hist['RG'] += np.histogram(d,bins=bins)[0]
"""The gain correction is currently applying
an absolute correction (not a relative correction
as implied by the name);
it changes the scale (the unit of measurement)
of the data from ADU to either keV or n_of_photons.
But the pattern classification relies on comparing
data with the NoiseEPix100 map, which is still in ADU.
The best solution is to do a relative gain
correction first and apply the global absolute
gain to the data at the end, after clustering.
"""
if pattern_classification:
d_clu, patterns = patternClassifier.classify(d)
d_clu[d_clu < (split_evt_primary_threshold*const_data["NoiseEPix100"])] = 0
data_clu[index, ...] = np.squeeze(d_clu)
data_patterns[index, ...] = np.squeeze(patterns)
hist['CS'] += np.histogram(d_clu,bins=bins)[0]
d_sing = d_clu[patterns==100] # pattern 100 corresponds to single photon events
if len(d_sing):
hist['S'] += np.histogram(d_sing,bins=bins)[0]
# Absolute gain correction
# changes data from ADU to keV (or n. of photons)
if absolute_gain:
d = d * gain_cnst
if photon_energy > 0:
d /= photon_energy
hist['AG'] += np.histogram(d,bins=bins)[0]
if pattern_classification:
# Modify pattern classification.
d_clu = d_clu * gain_cnst
if photon_energy > 0:
d_clu /= photon_energy
data_clu[index, ...] = np.squeeze(d_clu)
data[index, ...] = np.squeeze(d)
```
%% Cell type:code id: tags:
``` python
# 10 is a number chosen after testing 1 ... 71 parallel threads
context = psh.context.ThreadContext(num_workers=10)
```
%% Cell type:code id: tags:
``` python
empty_seq = 0
for f in seq_files:
seq_dc = H5File(f)
# Save corrected data in an output file with name
# of corresponding raw sequence file.
out_file = out_folder / f.name.replace("RAW", "CORR")
# Data shape in seq_dc excluding trains with empty images.
ishape = seq_dc[instrument_src, "data.image.pixels"].shape
corr_ntrains = ishape[0]
all_train_ids = seq_dc.train_ids
# Raise a WARNING if this sequence has no trains to correct.
# Otherwise, print number of trains with no data.
if corr_ntrains == 0:
warning(f"No trains to correct for {f.name}: "
"Skipping the processing of this file.")
empty_seq += 1
continue
elif len(all_train_ids) != corr_ntrains:
print(f"{f.name} has {len(all_train_ids) - corr_ntrains} trains with missing data.")
# This parameter is only used for testing.
if limit_trains > 0:
print(f"\nCorrected trains are limited to: {limit_trains} trains")
corr_ntrains = min(corr_ntrains, limit_trains)
oshape = (corr_ntrains, *ishape[1:])
data = context.alloc(shape=oshape, dtype=np.float32)
if pattern_classification:
data_clu = context.alloc(shape=oshape, dtype=np.float32)
data_patterns = context.alloc(shape=oshape, dtype=np.int32)
step_timer.start() # Correct data.
# Overwrite seq_dc after eliminating empty trains and/or applying the train limit.
seq_dc = seq_dc.select(
instrument_src, "*", require_all=True).select_trains(np.s_[:corr_ntrains])
pixel_data = seq_dc[instrument_src, "data.image.pixels"]
context.map(correct_train, pixel_data)
step_timer.done_step(f'Correcting {corr_ntrains} trains.')
step_timer.start() # Write corrected data.
# Create CORR files and add corrected data sections.
image_counts = seq_dc[instrument_src, "data.image.pixels"].data_counts(labelled=False)
# Write corrected data.
with DataFile(out_file, "w") as ofile:
dataset_chunk = ((chunk_size_idim,) + oshape[1:]) # e.g. (1, pixels_x, pixels_y)
seq_file = seq_dc.files[0] # FileAccess
# Create INDEX datasets.
ofile.create_index(seq_dc.train_ids, from_file=seq_dc.files[0])
# Create METADATA datasets
ofile.create_metadata(
like=seq_dc,
sequence=seq_file.sequence,
instrument_channels=(f'{instrument_src}/data',)
)
# Create Instrument section to later add corrected datasets.
outp_source = ofile.create_instrument_source(instrument_src)
# Create count/first datasets at INDEX source.
outp_source.create_index(data=image_counts)
image_raw_fields = [ # /data/image/
"binning", "bitsPerPixel", "dimTypes", "dims",
"encoding", "flipX", "flipY", "roiOffsets", "rotation",
]
for field in image_raw_fields:
field_arr = seq_dc[instrument_src, f"data.image.{field}"].ndarray()
outp_source.create_key(
f"data.image.{field}", data=field_arr,
chunks=(chunk_size_idim, *field_arr.shape[1:]))
# Add main corrected `data.image.pixels` dataset and store corrected data.
outp_source.create_key(
"data.image.pixels", data=data, chunks=dataset_chunk)
outp_source.create_key(
"data.trainId", data=seq_dc.train_ids, chunks=min(50, len(seq_dc.train_ids)))
if np.isin('data.pulseId', list(seq_dc[instrument_src].keys())): # some runs are missing 'data.pulseId'
outp_source.create_key(
"data.pulseId", data=list(seq_dc[instrument_src]['data.pulseId'].ndarray().squeeze()), chunks=min(50, len(seq_dc.train_ids)))
"data.pulseId",
data=list(seq_dc[instrument_src]['data.pulseId'].ndarray()[:, 0]),
chunks=min(50, len(seq_dc.train_ids)),
)
if pattern_classification:
# Add the charge-sharing corrected `data.image.pixels_classified` and `data.image.patterns` datasets.
outp_source.create_key(
"data.image.pixels_classified", data=data_clu, chunks=dataset_chunk)
outp_source.create_key(
"data.image.patterns", data=data_patterns, chunks=dataset_chunk)
step_timer.done_step('Storing data.')
if empty_seq == len(seq_files):
warning("No valid trains for RAW data to correct.")
exit(0)
```
%% Cell type:markdown id: tags:
## Plot Histograms
%% Cell type:code id: tags:
``` python
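# Histogram bin centres in ADU and in absolute-gain units (keV or photons, via hscale).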
bins_ADU = bins[:-1]+np.diff(bins)[0]/2
bins_keV = bins_ADU*hscale
```
%% Cell type:code id: tags:
``` python
# Histogram in ADU
plt.figure(figsize=(12,8))
plt.plot(bins_ADU,hist['O'], label='Offset corr')
if common_mode:
plt.plot(bins_ADU,hist['CM'], label='CM corr')
if relative_gain:
plt.plot(bins_ADU,hist['RG'], label='Relative Gain corr')
if pattern_classification:
plt.plot(bins_ADU[bins_ADU>10],hist['CS'][bins_ADU>10], label='Charge Sharing corr')
if np.any(hist['S']):
plt.plot(bins_ADU,hist['S'], label='Singles')
xtick_step = 50
plt.xlim(bins[0], bins[-1]+1)
plt.xticks(np.arange(bins[0],bins[-1]+2,xtick_step))
plt.xlabel('ADU',fontsize=12)
plt.yscale('log')
plt.title(f'{karabo_id} | {prop_str}, r{run}', fontsize=14, fontweight='bold')
plt.legend(fontsize=12)
plt.grid(ls=':')
```
%% Cell type:code id: tags:
``` python
# Histogram in keV/number of photons
if absolute_gain:
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
plt.figure(figsize=(12,8))
if relative_gain:
plt.plot(bins_keV,hist['RG'], label='Absolute Gain corr', c=colors[2])
if pattern_classification:
plt.plot(bins_keV[bins_keV>.5],hist['CS'][bins_keV>.5], label='Charge Sharing corr', c=colors[3])
if np.any(hist['S']):
plt.plot(bins_keV[bins_keV>.5],hist['S'][bins_keV>.5], label='Singles', c=colors[4])
if photon_energy==0: # if keV instead of #photons
xtick_step = 5
plt.xlim(left=-2)
plt.xticks(np.arange(0,plt.gca().get_xlim()[1],xtick_step))
plt.xlabel(plot_unit,fontsize=12)
plt.yscale('log')
plt.title(f'{karabo_id} | {prop_str}, r{run}', fontsize=14, fontweight='bold')
plt.legend(fontsize=12)
plt.grid(ls=':')
```
%% Cell type:markdown id: tags:
## Mean Image of the corrected data
%% Cell type:code id: tags:
``` python
geom = Epix100Geometry.from_relative_positions(top=[386.5, 364.5, 0.], bottom=[386.5, -12.5, 0.])
if pattern_classification:
plt.subplots(1,2,figsize=(18,18))
ax = plt.subplot(1,2,1)
ax.set_title(f'Before CS correction',fontsize=12,fontweight='bold');
else:
plt.subplots(1,1,figsize=(9,9))
ax = plt.subplot(1,1,1)
ax.set_title(f'{karabo_id} | {prop_str}, r{run} | Average of {data.shape[0]} trains',fontsize=12,fontweight='bold');
# Average image before charge sharing correction
divider = make_axes_locatable(ax)
cax = divider.append_axes('bottom', size='5%', pad=0.5)
image = data.mean(axis=0)
vmin = max(image.mean()-2*image.std(),0)
vmax = image.mean()+3*image.std()
geom.plot_data(image,
ax=ax,
colorbar={'cax': cax, 'label': plot_unit, 'orientation': 'horizontal'},
origin='upper',
vmin=vmin,
vmax=vmax)
# Average image after charge sharing correction
if pattern_classification:
ax = plt.subplot(1,2,2)
divider = make_axes_locatable(ax)
cax = divider.append_axes('bottom', size='5%', pad=0.5)
image = data_clu.mean(axis=0)
geom.plot_data(image,
ax=ax,
colorbar={'cax': cax, 'label': plot_unit, 'orientation': 'horizontal'},
origin='upper',
vmin=vmin,
vmax=vmax)
ax.set_title(f'After CS correction',fontsize=12,fontweight='bold');
plt.suptitle(f'{karabo_id} | {prop_str}, r{run} | Average of {data.shape[0]} trains',fontsize=14,fontweight='bold',y=.72);
```
%% Cell type:markdown id: tags:
## Single Shot of the corrected data
%% Cell type:code id: tags:
``` python
train_idx = -1
if pattern_classification:
plt.subplots(1,2,figsize=(18,18))
ax = plt.subplot(1,2,1)
ax.set_title(f'Before CS correction',fontsize=12,fontweight='bold');
else:
plt.subplots(1,1,figsize=(9,9))
ax = plt.subplot(1,1,1)
ax.set_title(f'{karabo_id} | {prop_str}, r{run} | Single frame',fontsize=12,fontweight='bold');
# Single frame before charge sharing correction
divider = make_axes_locatable(ax)
cax = divider.append_axes('bottom', size='5%', pad=0.5)
image = data[train_idx]
vmin = max(image.mean()-2*image.std(),0)
vmax = image.mean()+3*image.std()
geom.plot_data(image,
ax=ax,
colorbar={'cax': cax, 'label': plot_unit, 'orientation': 'horizontal'},
origin='upper',
vmin=vmin,
vmax=vmax)
# Single frame after charge sharing correction
if pattern_classification:
ax = plt.subplot(1,2,2)
divider = make_axes_locatable(ax)
cax = divider.append_axes('bottom', size='5%', pad=0.5)
image = data_clu[train_idx]
geom.plot_data(image,
ax=ax,
colorbar={'cax': cax, 'label': plot_unit, 'orientation': 'horizontal'},
origin='upper',
vmin=vmin,
vmax=vmax)
ax.set_title(f'After CS correction',fontsize=12,fontweight='bold');
plt.suptitle(f'{karabo_id} | {prop_str}, r{run} | Single frame',fontsize=14,fontweight='bold',y=.72);
```
......
%% Cell type:code id: tags:
``` python
# Author: European XFEL Detector Group, Version: 1.0
# Summary of the processed dark calibration constants and a comparison with previously injected constants.
out_folder = "/gpfs/exfel/data/scratch/kluyvert/lpd-dark-p900320-r26_27_28" # path to output to, required
metadata_folder = "" # Directory containing calibration_metadata.yml when run by xfel-calibrate
karabo_id = "FXE_DET_LPD1M-1" # detector instance
gain_names = ['High gain', 'Medium gain', 'Low gain'] # a list of gain names to be used in plotting
threshold_names = ['HG-MG threshold', 'MG_LG threshold'] # a list of threshold names to be used in plotting
local_output = True # Boolean indicating that local constants were stored in the out_folder
# Skip the whole notebook if local_output is false in the preceding notebooks.
if not local_output:
print('No local constants saved. Skipping summary plots')
import sys
sys.exit(0)
```
%% Cell type:code id: tags:
``` python
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')
import h5py
import matplotlib
import numpy as np
import pasha as psh
import yaml
from IPython.display import Latex, Markdown, display
matplotlib.use("agg")
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
%matplotlib inline
import extra_geom
import tabulate
from cal_tools import step_timing
from cal_tools.ana_tools import get_range
from cal_tools.enums import BadPixels
from cal_tools.plotting import agipd_single_module_geometry, show_processed_modules
from cal_tools.tools import CalibrationMetadata, module_index_to_qm
from XFELDetAna.plotting.simpleplot import simplePlot
```
%% Cell type:code id: tags:
``` python
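# Format one BadPixels flag as a table row: padded name, 32-bit mask and integer value.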
def bp_entry(bp):
return [f"{bp.name:<30s}", f"{bp.value:032b}", f"{int(bp.value)}"]
```
%% Cell type:code id: tags:
``` python
if "AGIPD" in karabo_id:
if "SPB" in karabo_id:
dinstance = "AGIPD1M1"
nmods = 16
elif "MID" in karabo_id:
dinstance = "AGIPD1M2"
module_shape = (512, 128)
if "AGIPD1M" in karabo_id:
nmods = 16
elif "HED" in karabo_id:
dinstance = "AGIPD500K"
elif "AGIPD500K" in karabo_id:
nmods = 8
else:
nmods = 1
# This list needs to stay in this order, as adaptive or fixed gain is
# later decided based on the condition of the Offset constant.
expected_constants = ['Offset', 'Noise', 'ThresholdsDark', 'BadPixelsDark']
table = []
badpixels = [
BadPixels.OFFSET_OUT_OF_THRESHOLD,
BadPixels.NOISE_OUT_OF_THRESHOLD,
BadPixels.OFFSET_NOISE_EVAL_ERROR,
BadPixels.GAIN_THRESHOLDING_ERROR,
]
for bp in badpixels:
table.append(bp_entry(bp))
display(Markdown("""
# Summary of AGIPD dark characterization #
The following report shows a set of dark images taken with the AGIPD detector to deduce detector offsets,
noise, bad-pixel maps and thresholding. All four types of constants are evaluated per-pixel and per-memory cell.
**The offset** ($O$) is defined as the median ($M$) of the dark signal ($Ds$) over trains ($t$) for a given pixel
($x,y$) and memory cell ($c$).
**The noise** $N$ is the standard deviation $\sigma$ of the dark signal.
$$ O_{x,y,c} = M(Ds)_{t} ,\,\,\,\,\,\, N_{x,y,c} = \sigma(Ds)_{t}$$
**The bad pixel** mask is encoded as a bit mask."""))
display(Latex(tabulate.tabulate(table, tablefmt='latex', headers=["Name", "bit value", "integer value"])))
display(Markdown("""
**"OFFSET_OUT_OF_THRESHOLD":**
Offset outside of bounds:
$$M(O)_{x,y} - \sigma(O)_{x,y} * \mathrm{thresholds\_offset\_sigma} < O < M(O)_{x,y} + \sigma(O)_{x,y} * \mathrm{thresholds\_offset\_sigma} $$
or offset outside of hard limits
$$ \mathrm{thresholds\_offset\_hard}_\mathrm{low} < O < \mathrm{thresholds\_offset\_hard}_\mathrm{high} $$
**"NOISE_OUT_OF_THRESHOLD":**
Noise outside of bounds:
$$M(N)_{x,y} - \sigma(N)_{x,y} * \mathrm{thresholds\_noise\_sigma} < N < M(N)_{x,y} + \sigma(N)_{x,y} * \mathrm{thresholds\_noise\_sigma} $$
or noise outside of hard limits
$$\mathrm{thresholds\_noise\_hard}_\mathrm{low} < N < \mathrm{thresholds\_noise\_hard}_\mathrm{high} $$
**"OFFSET_NOISE_EVAL_ERROR":**
Offset or noise evaluation resulted in $nan$ values.
Values: $\mathrm{thresholds\_offset\_sigma}$, $\mathrm{thresholds\_offset\_hard}$, $\mathrm{thresholds\_noise\_sigma}$, $\mathrm{thresholds\_noise\_hard}$ are given as parameters.
**"GAIN_THRESHOLDING_ERROR":**
Bad gain separated pixels with sigma separation less than gain_separation_sigma_threshold
$$ sigma\_separation = \frac{\mathrm{gain\_offset} - \mathrm{previous\_gain\_offset}}{\sqrt{\mathrm{gain\_offset_{std}}^\mathrm{2} + \mathrm{previous\_gain\_offset_{std}}^\mathrm{2}}}$$
$$ Bad\_separation = sigma\_separation < \mathrm{gain\_separation\_sigma\_threshold} $$
"""))
elif "LPD" in karabo_id:
dinstance = "LPD1M1"
nmods = 16
module_shape = (256, 256)
expected_constants = ['Offset', 'Noise', 'BadPixelsDark']
table = []
badpixels = [
BadPixels.OFFSET_OUT_OF_THRESHOLD,
BadPixels.NOISE_OUT_OF_THRESHOLD,
BadPixels.OFFSET_NOISE_EVAL_ERROR,
]
for bp in badpixels:
table.append(bp_entry(bp))
display(Markdown("""
# Summary of LPD dark characterization #
The following report shows a set of dark images taken with the LPD detector to deduce detector offsets, noise, bad-pixel maps. All three types of constants are evaluated per-pixel and per-memory cell.
**The offset** ($O$) is defined as the median ($M$) of the dark signal ($Ds$) over trains ($t$) for a given pixel ($x,y$) and memory cell ($c$).
**The noise** $N$ is the standard deviation $\sigma$ of the dark signal.
$$ O_{x,y,c} = M(Ds)_{t} ,\,\,\,\,\,\, N_{x,y,c} = \sigma(Ds)_{t}$$
**The bad pixel** mask is encoded as a bit mask."""))
display(Latex(tabulate.tabulate(table, tablefmt='latex', headers=["Name", "bit value", "integer value"])))
display(Markdown("""
**"OFFSET_OUT_OF_THRESHOLD":**
Offset outside of bounds:
$$M(O)_{x,y} - \sigma(O)_{x,y} * \mathrm{thresholds\_offset\_sigma} < O < M(O)_{x,y} + \sigma(O)_{x,y} * \mathrm{thresholds\_offset\_sigma} $$
or offset outside of hard limits
$$ \mathrm{thresholds\_offset\_hard}_\mathrm{low} < O < \mathrm{thresholds\_offset\_hard}_\mathrm{high} $$
**"NOISE_OUT_OF_THRESHOLD":**
Noise outside of bounds:
$$M(N)_{x,y} - \sigma(N)_{x,y} * \mathrm{thresholds\_noise\_sigma} < N < M(N)_{x,y} + \sigma(N)_{x,y} * \mathrm{thresholds\_noise\_sigma} $$
or noise outside of hard limits
$$\mathrm{thresholds\_noise\_hard}_\mathrm{low} < N < \mathrm{thresholds\_noise\_hard}_\mathrm{high} $$
**"OFFSET_NOISE_EVAL_ERROR":**
Offset or noise evaluation resulted in $nan$ values.
Values: $\mathrm{thresholds\_offset\_sigma}$, $\mathrm{thresholds\_offset\_hard}$, $\mathrm{thresholds\_noise\_sigma}$, $\mathrm{thresholds\_noise\_hard}$ are given as parameters.
"""))
elif "DSSC" in karabo_id:
dinstance = "DSSC1M1"
nmods = 16
module_shape = (128, 512)
expected_constants = ['Offset', 'Noise']
display(Markdown("""
# Summary of DSSC dark characterization #
"""))
```
%% Cell type:code id: tags:
``` python
step_timer = step_timing.StepTimer()
out_folder = Path(out_folder)
metadata = CalibrationMetadata(metadata_folder or out_folder)
mod_mapping = metadata.setdefault("modules-mapping", {})
old_constant_metadata = {}
for fn in out_folder.glob("module_metadata_*.yml"):
with fn.open("r") as fd:
fdict = yaml.safe_load(fd)
module = fdict["module"]
mod_mapping[module] = fdict["pdu"]
old_constant_metadata[module] = fdict["old-constants"]
metadata.save()
```
%% Cell type:code id: tags:
``` python
# In AGIPD fixed gain mode, ThresholdsDark is not expected
if 'AGIPD' in karabo_id:
for i in range(nmods):
qm = module_index_to_qm(i)
if not mod_mapping.get(qm):
continue
mod_pdu = mod_mapping[qm]
fpath = out_folder / f"const_Offset_{mod_pdu}.h5"
if not fpath.exists():
continue
with h5py.File(fpath, 'r') as f:
if 'Gain mode' in f['condition']:
if f["condition"]["Gain mode"]["value"][()]:
expected_constants.remove("ThresholdsDark")
break
```
%% Cell type:markdown id: tags:
Prepare the newly injected and previous constants from the local files produced in out_folder.
%% Cell type:code id: tags:
``` python
step_timer.start()
# Get shape, dtype, and number of files for each constant.
# Also build lists of the files involved, to be loaded in parallel in a later cell.
const_shape_and_dtype = {}
found_module_nums = set()
pieces_to_load = []
pieces_to_load_prev = []
for cname in expected_constants:
for i in range(nmods):
qm = module_index_to_qm(i)
if not mod_mapping.get(qm):
continue
mod_pdu = mod_mapping[qm]
fpath = out_folder / f"const_{cname}_{mod_pdu}.h5"
if not fpath.exists():
continue
pieces_to_load.append((cname, i, fpath))
found_module_nums.add(i)
# try finding old constants using paths from CalCat store
if qm not in old_constant_metadata:
continue
qm_mdata = old_constant_metadata[qm]
if cname not in qm_mdata:
continue
fpath_prev = qm_mdata[cname]["filepath"]
h5path_prev = qm_mdata[cname]["h5path"]
if fpath_prev and h5path_prev:
pieces_to_load_prev.append((cname, i, fpath_prev, h5path_prev))
# Get the constant shape from one of the module files
with h5py.File(fpath, 'r') as f:
const_shape_and_dtype[cname] = (f['data'].shape, f['data'].dtype)
# Allocate arrays for these constants (without space for missing modules)
nmods_found = len(found_module_nums)
constants = {
cname: psh.alloc((nmods_found,) + module_const_shape, dtype=dt, fill=0)
for cname, (module_const_shape, dt) in const_shape_and_dtype.items()
}
prev_const = {
cname: psh.alloc((nmods_found,) + module_const_shape, dtype=dt, fill=0)
for cname, (module_const_shape, dt) in const_shape_and_dtype.items()
}
step_timer.done_step("Preparing arrays for old and new constants.")
```
%% Cell type:code id: tags:
``` python
step_timer.start()
# Load the constant data in parallel
found_module_nums = sorted(found_module_nums)
mod_names = [module_index_to_qm(n) for n in found_module_nums]
def load_piece(wid, ix, entry):
cname, mod_no, fpath = entry
mod_ix = found_module_nums.index(mod_no)
with h5py.File(fpath, 'r') as f:
f['data'].read_direct(constants[cname][mod_ix])
psh.map(load_piece, pieces_to_load)
print(f"Loaded constant data from {len(pieces_to_load)} files")
def load_piece_prev(wid, ix, entry):
cname, mod_no, fpath, h5path = entry
mod_ix = found_module_nums.index(mod_no)
with h5py.File(fpath, 'r') as f:
f[h5path]['data'].read_direct(prev_const[cname][mod_ix])
psh.map(load_piece_prev, pieces_to_load_prev)
print(f"Loaded previous constant data from {len(pieces_to_load_prev)} files")
step_timer.done_step("Loading constants data.")
```
%% Cell type:code id: tags:
``` python
display(Markdown('## Processed modules'))
show_processed_modules(karabo_id, constants, mod_names, mode="processed")
```
%% Cell type:markdown id: tags:
## Summary figures across Modules ##
The following plots give an overview of calibration constants averaged across pixels and memory cells. A bad pixel mask is applied.
%% Cell type:code id: tags:
``` python
if "LPD" in dinstance:
geom = extra_geom.LPD_1MGeometry.from_quad_positions(quad_pos=[(11.4, 299),
(-11.5, 8),
(254.5, -16),
(278.5, 275)])
module_shape = (256, 256)
elif dinstance in ('AGIPD1M1', 'AGIPD1M2'):
geom = extra_geom.AGIPD_1MGeometry.from_quad_positions(quad_pos=[(-525, 625),
(-550, -10),
(520, -160),
(542.5, 475)])
module_shape = (512, 128)
elif dinstance == "AGIPD500K":
if "LPD" in karabo_id:
geom = extra_geom.LPD_1MGeometry.from_quad_positions(
quad_pos=[
(11.4, 299),
(-11.5, 8),
(254.5, -16),
(278.5, 275)])
elif "AGIPD1M" in karabo_id:
geom = extra_geom.AGIPD_1MGeometry.from_quad_positions(
quad_pos=[
(-525, 625),
(-550, -10),
(520, -160),
(542.5, 475)])
elif "AGIPD500K" in karabo_id:
geom = extra_geom.AGIPD_500K2GGeometry.from_origin()
elif "AGIPD65K" in karabo_id: # single module detector
geom = agipd_single_module_geometry()
elif "DSSC" in karabo_id:
quadpos = [(-130, 5), (-130, -125), (5, -125), (5, 5)]
geom = extra_geom.DSSC_1MGeometry.from_quad_positions(quadpos)
```
%% Cell type:code id: tags:
``` python
def plot_const_and_delta(const, delta, const_name, gain_name):
gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(24, 32))
ax0 = fig.add_subplot(gs[0, :])
vmin, vmax = get_range(const, 2)
geom.plot_data_fast(
const, vmin=vmin, vmax=vmax, ax=ax0, colorbar={
'shrink': 0.9, 'pad': 0.05, 'label': 'ADUs'
})
ax0.set_title(f"{const_name} - {gain_name}", fontsize=15)
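# If every finite value of delta is zero (non-zero count equals the NaN count),
# there is no previous constant to compare against.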
if np.count_nonzero(delta) == np.count_nonzero(np.isnan(delta)):
fig.text(0.5, 0.4, "No difference from previous constant",
ha='center', va='center', fontsize=15)
return
# Plot delta from previous constant
ax1 = fig.add_subplot(gs[1, 0])
vmin, vmax = get_range(delta, 2)
vmax = max(vmax, abs(vmin)) # Center around zero
geom.plot_data_fast(
delta, vmin=-vmax, vmax=vmax, ax=ax1, cmap="RdBu", colorbar={
'shrink': 0.6, 'pad': 0.1, 'label': 'ADUs'
})
ax1.set_title(f"Difference with previous {const_name} - {gain_name}", fontsize=15)
# Plot % delta from previous constant
delta_pct = delta / const * 100
ax2 = fig.add_subplot(gs[1, 1])
vmin, vmax = get_range(delta_pct, 2)
vmax = max(vmax, abs(vmin)) # Center around zero
geom.plot_data_fast(
delta_pct, vmin=-vmax, vmax=vmax, ax=ax2, cmap="RdBu", colorbar={
'shrink': 0.6, 'pad': 0.1, 'label': '%'
})
ax2.set_title("Percentage difference", fontsize=15)
```
%% Cell type:code id: tags:
``` python
psh_ctx = psh.ProcessContext(nmods)
```
%% Cell type:code id: tags:
``` python
step_timer.start()
gainstages = 1
for const_name, const in constants.items():
if const_name == 'BadPixelsDark':
continue
# Check whether the constant has a gain dimension (e.g. AGIPD, LPD)
if len(const.shape) == 5:
gainstages = 3
else:
gainstages = 1
display(Markdown(f'##### {const_name}'))
print_once = True
for gain in range(gainstages):
if const_name == 'ThresholdsDark':
if gain > 1:
continue
glabel = threshold_names[gain]
else:
glabel = gain_names[gain]
stacked_const = psh_ctx.alloc((nmods,) + module_shape, dtype=np.float64, fill=0)
stacked_delta = psh_ctx.alloc((nmods,) + module_shape, dtype=np.float64, fill=0)
def average_module(wid, i, _):
qm = module_index_to_qm(i)
if qm in mod_names:
m_idx = mod_names.index(qm)
# Check whether the constant has 5 dimensions (e.g. AGIPD, LPD)
if const.ndim == 5:
values = np.nanmean(const[m_idx, :, :, :, gain], axis=2)
prev_val = np.nanmean(prev_const[const_name][m_idx, :, :, :, gain], axis=2)
else:
values = np.nanmean(const[m_idx, :, :, :], axis=2)
prev_val = np.nanmean(prev_const[const_name][m_idx, :, :, :], axis=2)
values[values == 0] = np.nan
prev_val[prev_val == 0] = np.nan
stacked_const[i] = np.moveaxis(values, 0, -1)
stacked_delta[i] = np.moveaxis(values - prev_val, 0, -1)
else:
# if module not available fill space with nan
stacked_const[i] = np.nan
psh_ctx.map(average_module, range(nmods))
# Plot the constant over all modules.
display(Markdown(f'###### {glabel} ######'))
plot_const_and_delta(stacked_const, stacked_delta, const_name, glabel)
plt.show()
step_timer.done_step("Plotting constants and relative differences.")
```
%% Cell type:code id: tags:
``` python
# Loop over modules and constants
step_timer.start()
for const_name, const in constants.items():
if const_name == 'BadPixelsDark':
continue # Displayed separately below
display(Markdown(f'### Summary across Modules - {const_name}'))
for gain in range(gainstages):
if const_name == 'ThresholdsDark':
if gain == 2:
continue
glabel = threshold_names[gain]
else:
glabel = gain_names[gain]
if const.ndim == 5:
data = const[:, :, :, :, gain]
else:
data = const
# Bad pixels are per gain stage, and you need thresholds to pick the gain
# stage, so we don't mask the thresholds.
if ('BadPixelsDark' in constants) and (const_name != 'ThresholdsDark'):
label = f'{const_name} value [ADU], good pixels only'
if const.ndim == 5:
goodpix = constants['BadPixelsDark'][:, :, :, :, gain] == 0
else:
goodpix = constants['BadPixelsDark'] == 0
else:
label = f'{const_name} value [ADU], good and bad pixels'
goodpix = [True] * data.shape[0]
# Reduce data in parallel (one worker per module):
datamean = psh_ctx.alloc(data.shape[:1] + data.shape[3:], data.dtype)
datastd = psh_ctx.alloc(data.shape[:1] + data.shape[3:], data.dtype)
def average_mem_cells(wid, i, _):
datamean[i] = np.mean(data[i], axis=(0, 1), where=goodpix[i])
datastd[i] = np.std(data[i], axis=(0, 1), where=goodpix[i])
psh_ctx.map(average_mem_cells, range(data.shape[0]))
fig = plt.figure(figsize=(15, 6), tight_layout={
'pad': 0.2, 'w_pad': 1.3, 'h_pad': 1.3})
ax = fig.add_subplot(121)
d = []
for im, mod in enumerate(datamean):
d.append({'x': np.arange(mod.shape[0]),
'y': mod,
'drawstyle': 'steps-pre',
'label': mod_names[im],
})
_ = simplePlot(d, figsize=(10, 10), xrange=(-12, 510),
x_label='Memory Cell ID',
y_label=label,
use_axis=ax,
title=glabel,
title_position=[0.5, 1.18],
legend='outside-top-ncol6-frame', legend_size='18%',
legend_pad=0.00)
# Plot standard deviation
ax = fig.add_subplot(122)
if "BadPixelsDark" in constants.keys():
label = f'$\sigma$ {const_name} [ADU], good pixels only'
else:
label = f'$\sigma$ {const_name} [ADU], good and bad pixels'
d = []
for im, mod in enumerate(datastd):
d.append({'x': np.arange(mod.shape[0]),
'y': mod,
'drawstyle': 'steps-pre',
'label': mod_names[im],
})
_ = simplePlot(d, figsize=(10, 10), xrange=(-12, 510),
x_label='Memory Cell ID',
y_label=label,
use_axis=ax,
title=f'{glabel} $\sigma$',
title_position=[0.5, 1.18],
legend='outside-top-ncol6-frame', legend_size='18%',
legend_pad=0.00)
plt.show()
step_timer.done_step("Plotting summary across modules.")
```
%% Cell type:code id: tags:
``` python
if 'BadPixelsDark' in constants:
step_timer.start()
display(Markdown(f'### Summary across Modules - BadPixelsDark'))
bad_px_dark = constants['BadPixelsDark']
for gain in range(gainstages):
glabel = gain_names[gain]
if bad_px_dark.ndim == 5:
data = bad_px_dark[:, :, :, :, gain]
else:
data = bad_px_dark[:, :, :, :]
bad_px_per_cell = np.count_nonzero(data, axis=(1, 2))
fraction_bad_per_cell = bad_px_per_cell / (data.shape[1] * data.shape[2])
fraction_bad_per_cell[fraction_bad_per_cell == 1.0] = np.nan
fig = plt.figure(figsize=(15, 6), tight_layout={
'pad': 0.2, 'w_pad': 1.3, 'h_pad': 1.3})
ax = fig.add_subplot(1, 1, 1)
d = []
for im, mod in enumerate(fraction_bad_per_cell):
d.append({'x': np.arange(mod.shape[0]),
'y': mod,
'drawstyle': 'steps-pre',
'label': mod_names[im],
})
_ = simplePlot(d, figsize=(10, 10), xrange=(-12, 510),
x_label='Memory Cell ID',
y_label='Fraction of bad pixels',
use_axis=ax,
title=glabel,
title_position=[0.5, 1.18],
legend='outside-top-ncol6-frame', legend_size='18%',
legend_pad=0.00)
plt.show()
step_timer.done_step("Summary across modules for BadPixels.")
```
%% Cell type:markdown id: tags:
## Summary tables across Modules ##
Tables show values averaged across all pixels and memory cells of a given detector module.
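%% Cell type:markdown id: tags:
The averaging below reduces each constant over its pixel and memory-cell axes using a good-pixel mask. The next cell is only an illustrative sketch of that reduction with dummy shapes and numpy as imported earlier in this notebook (it is not part of the original summary); the real computation on the retrieved constants follows afterwards.
%% Cell type:code id: tags:
``` python
# Illustrative sketch: per-gain mean/std of a dummy constant, masked by good pixels.
# Shapes are made up; the real constants have (pixels, pixels, cells, gains) per module.
demo_const = np.random.rand(16, 16, 4, 3)
demo_good = np.ones(demo_const.shape, dtype=bool)  # pretend every pixel is good
demo_mean = np.mean(demo_const, axis=(0, 1, 2), where=demo_good)
demo_std = np.std(demo_const, axis=(0, 1, 2), where=demo_good)
print(demo_mean.shape)  # one value per gain stage
```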
%% Cell type:code id: tags:
``` python
if u'$' in tabulate.LATEX_ESCAPE_RULES:
del(tabulate.LATEX_ESCAPE_RULES[u'$'])
if u'\\' in tabulate.LATEX_ESCAPE_RULES:
del(tabulate.LATEX_ESCAPE_RULES[u'\\'])
```
%% Cell type:code id: tags:
``` python
step_timer.start()
head = ['Module', 'High gain', 'Medium gain', 'Low gain']
head_th = ['Module', 'HG_MG threshold', 'MG_LG threshold']
for const_name, const in constants.items():
if const_name == 'BadPixelsDark':
continue # Handled below
if const.ndim == 4: # Add gain dimension if not present
const = const[:, :, : , :, np.newaxis]
if ('BadPixelsDark' in constants) and (const_name != 'ThresholdsDark'):
goodpix = constants['BadPixelsDark'] == 0
if goodpix.ndim == 4:
goodpix = goodpix[:, :, : , :, np.newaxis]
label = f'### Average {const_name} [ADU], good pixels only'
else:
goodpix = [True] * const.shape[0]
label = f'### Average {const_name} [ADU], good and bad pixels'
# Reduce data in parallel (one worker per module)
mean_by_module_gain = psh.alloc(const.shape[:1] + const.shape[4:], const.dtype)
std_by_module_gain = psh.alloc(const.shape[:1] + const.shape[4:], const.dtype)
def average_module(wid, i, _):
mean_by_module_gain[i] = np.mean(const[i], axis=(0, 1, 2), where=goodpix[i])
std_by_module_gain[i] = np.std (const[i], axis=(0, 1, 2), where=goodpix[i])
psh_ctx.map(average_module, range(const.shape[0]))
table = []
for i_mod, mod in enumerate(mod_names):
t_line = [mod]
for gain in range(gainstages):
if const_name == 'ThresholdsDark' and gain == 2:
continue
datamean = mean_by_module_gain[i_mod, gain]
datastd = std_by_module_gain[i_mod, gain]
t_line.append(f'{datamean:6.1f} $\\pm$ {datastd:6.1f}')
table.append(t_line)
display(Markdown(label))
header = head_th if const_name == 'ThresholdsDark' else head
md = display(Latex(tabulate.tabulate(
table, tablefmt='latex', headers=header)))
step_timer.done_step("Summary tables across modules.")
```
%% Cell type:code id: tags:
``` python
# Bad pixels summary table
if 'BadPixelsDark' in constants:
step_timer.start()
bad_px_dark = constants['BadPixelsDark']
table = []
for i_mod, mod in enumerate(mod_names):
t_line = [mod]
for gain in range(gainstages):
if bad_px_dark.ndim == 5:
data = bad_px_dark[i_mod, :, :, :, gain]
else:
data = bad_px_dark[i_mod]
datasum = np.count_nonzero(data)
datamean = datasum / data.size
t_line.append(f'{datasum:6.0f} ({datamean:6.3f}) ')
label = '## Number(fraction) of bad pixels'
table.append(t_line)
display(Markdown(label))
md = display(Latex(tabulate.tabulate(
table, tablefmt='latex', headers=head)))
step_timer.done_step("Summary table across modules for BadPixels.")
```
......
%% Cell type:markdown id: tags:
# pnCCD Data Correction #
Authors: DET Group, Modified by Kiana Setoodehnia - Version 5.0
The following notebook provides offset, common mode, relative gain, split events and pattern classification corrections of images acquired with the pnCCD. This notebook *does not* yet correct for charge transfer inefficiency.
%% Cell type:code id: tags:
``` python
in_folder = "/gpfs/exfel/exp/SQS/202031/p900166/raw" # input folder
out_folder = "/gpfs/exfel/data/scratch/ahmedk/test/remove/pnccd_correct" # output folder
metadata_folder = "" # Directory containing calibration_metadata.yml when run by xfel-calibrate
run = 347 # which run to read data from
sequences = [-1] # sequences to correct, set to -1 for all, range allowed
sequences_per_node = 1 # number of sequences running on the same slurm node.
karabo_da = 'PNCCD01' # data aggregators
karabo_id = "SQS_NQS_PNCCD1MP" # karabo prefix of PNCCD devices
receiver_id = "PNCCD_FMT-0" # inset for receiver devices
path_template = 'RAW-R{:04d}-{}-S{:05d}.h5' # the template to use to access data
instrument_source_template = '{}/CAL/{}:output' # template for data source name, will be filled with karabo_id and receiver_id.
# Parameters affecting data correction.
commonModeAxis = 0 # axis along which common mode will be calculated, 0 = row, and 1 = column
commonModeBlockSize = [512, 512] # size of the detector in pixels for common mode calculations
split_evt_primary_threshold = 4. # primary threshold for split event classification in terms of n sigma noise
split_evt_secondary_threshold = 3. # secondary threshold for split event classification in terms of n sigma noise
saturated_threshold = 32000. # full well capacity in ADU
# Conditions for retrieving calibration constants
fix_temperature_top = 0. # fix temperature for top sensor in K, set to 0. to use value from slow data.
fix_temperature_bot = 0. # fix temperature for bottom sensor in K, set to 0. to use value from slow data.
gain = -1 # the detector's gain setting. Set to -1 to use the value from the slow data.
bias_voltage = 0. # the detector's bias voltage. set to 0. to use value from slow data.
integration_time = 70 # detector's integration time
photon_energy = 1.6 # Al fluorescence in keV
# Parameters for the calibration database.
cal_db_interface = "tcp://max-exfl-cal001:8015" # calibration DB interface to use
cal_db_timeout = 300000 # timeout on caldb requests
creation_time = "" # The timestamp to use with Calibration DB. Required Format: "YYYY-MM-DD hh:mm:ss" e.g. 2019-07-04 11:02:41
remove_bias_voltage_if_zero = True # This flag enables removing bias voltage from the conditions if a 0 value is read from RAW data. This is useful when the corresponding constants for old RAW had no bias voltage because of a mistake in control data. e.g. p002857
# Booleans for selecting corrections to apply.
only_offset = False # Only, apply offset.
common_mode = True # Apply common mode correction
relgain = True # Apply relative gain correction
pattern_classification = True # classify split events
# parameters affecting stored output data.
chunk_size_idim = 1 # H5 chunking size of output data
limit_trains = 0 # this parameter is used for limiting number of images to correct from a sequence file.
def balance_sequences(in_folder, run, sequences, sequences_per_node, karabo_da):
from xfel_calibrate.calibrate import balance_sequences as bs
return bs(in_folder, run, sequences, sequences_per_node, karabo_da)
```
%% Cell type:code id: tags:
``` python
# Here the hierarchy and dependencies of the correction booleans are defined
corr_bools = {}
corr_bools["only_offset"] = only_offset
# Apply the remaining corrections only if we are not restricted to the offset correction.
if not only_offset:
corr_bools["relgain"] = relgain
corr_bools["common_mode"] = common_mode
corr_bools["pattern_class"] = pattern_classification
```
%% Cell type:code id: tags:
``` python
import os
import sys
import warnings
from logging import warning
from pathlib import Path
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import numpy as np
import pasha as psh
from IPython.display import Markdown, display
from extra_data import H5File, RunDirectory
from prettytable import PrettyTable
%matplotlib inline
import cal_tools.restful_config as rest_cfg
from XFELDetAna import xfelpyanatools as xana
from XFELDetAna import xfelpycaltools as xcal
from cal_tools import pnccdlib
from cal_tools.files import DataFile
from cal_tools.calcat_interface import CalCatError, PNCCD_CalibrationData
from cal_tools.tools import (
calcat_creation_time,
write_constants_fragment,
)
from cal_tools.step_timing import StepTimer
```
%% Cell type:code id: tags:
``` python
# Calibration Database Settings, and Some Initial Run Parameters & Paths:
display(Markdown('### Initial Settings and Paths'))
# Sensor size and block size definitions (important for common mode and other corrections):
pixels_x = 1024 # rows of pnCCD in pixels
pixels_y = 1024 # columns of pnCCD in pixels
in_folder = Path(in_folder)
sensorSize = [pixels_x, pixels_y]
# For xcal.HistogramCalculators.
blockSize = [pixels_x//2, pixels_y//2] # sensor area will be analysed according to blockSize.
print(f"pnCCD size is: {pixels_x}x{pixels_y} pixels.")
print(f'Calibration database interface selected: {cal_db_interface}')
# Paths to the data:
instrument_src = instrument_source_template.format(karabo_id, receiver_id)
print(f"Instrument H5File source: {instrument_src}\n")
# Run's creation time:
creation_time = calcat_creation_time(in_folder, run, creation_time)
print(f"Creation time: {creation_time}")
```
%% Cell type:code id: tags:
``` python
step_timer = StepTimer()
```
%% Cell type:code id: tags:
``` python
run_dc = RunDirectory(in_folder / f"r{run:04d}", _use_voview=False)
# Output Folder Creation:
os.makedirs(out_folder, exist_ok=True)
# extract control data
step_timer.start()
ctrl_data = pnccdlib.PnccdCtrl(run_dc, karabo_id)
if bias_voltage == 0.:
bias_voltage = ctrl_data.get_bias_voltage()
if gain == -1:
gain = ctrl_data.get_gain()
if fix_temperature_top == 0:
fix_temperature_top = ctrl_data.get_fix_temperature_top()
if fix_temperature_bot == 0:
fix_temperature_bot = ctrl_data.get_fix_temperature_bot()
step_timer.done_step("Reading control parameters.")
# Printing the Parameters Read from the Data File:
display(Markdown('### Detector Parameters'))
print(f"Bias voltage is {bias_voltage:0.1f} V.")
print(f"Detector gain is set to 1/{int(gain)}.")
print(f"Detector integration time is set to {integration_time} ms")
print(f"Top pnCCD sensor is at temperature of {fix_temperature_top:0.2f} K")
print(f"Bottom pnCCD sensor is at temperature of {fix_temperature_bot:0.2f} K")
```
%% Cell type:code id: tags:
``` python
seq_files = []
for f in run_dc.select(instrument_src).files:
fpath = Path(f.filename)
if fpath.match(f"*{karabo_da}*.h5"):
seq_files.append(fpath)
if sequences != [-1]:
seq_files = sorted([f for f in seq_files if any(f.match(f"*-S{s:05d}.h5") for s in sequences)])
print(f"Processing a total of {len(seq_files)} sequence files:")
print(*seq_files, sep='\n')
```
%% Cell type:code id: tags:
``` python
gain_k = [k for k, v in pnccdlib.VALID_GAINS.items() if v == gain][0]
if gain_k == 'a':
split_evt_mip_threshold = 1000. # MIP threshold in ADU for event classification (10 times average noise)
# Each xcal.HistogramCalculator requires a total number of bins and a binning range. We define these
# using a dictionary:
# For all xcal histograms:
Hist_Bin_Dict = {
"bins": 35000, # number of bins
"bin_range": [0, 35000]
}
# For the numpy histograms on the last cell of the notebook:
Event_Bin_Dict = {
"event_bins": 1000, # number of bins
"b_range": [0, 35000] # bin range
}
elif gain_k == 'b':
split_evt_mip_threshold = 270. # 10 times the average noise
Hist_Bin_Dict = {
"bins": 10000,
"bin_range": [0, 10000]
}
Event_Bin_Dict = {
"event_bins": 1000,
"b_range": [0, 10000]
}
elif gain_k == 'c':
split_evt_mip_threshold = 110. # 10 times the average noise
Hist_Bin_Dict = {
"bins": 3000,
"bin_range": [0, 3000]
}
Event_Bin_Dict = {
"event_bins": 1000,
"b_range": [0, 3000]
}
elif gain_k == 'd':
split_evt_mip_threshold = 90. # 10 times the average noise
Hist_Bin_Dict = {
"bins": 1000,
"bin_range": [0, 1000]
}
Event_Bin_Dict = {
"event_bins": 1000,
"b_range": [0, 1000]
}
elif gain_k == 'e':
split_evt_mip_threshold = 90. # 10 times the average noise
Hist_Bin_Dict = {
"bins": 500,
"bin_range": [0, 500]
}
Event_Bin_Dict = {
"event_bins": 500,
"b_range": [0, 500]
}
else:
split_evt_mip_threshold = 90. # 10 times the average noise
Hist_Bin_Dict = {
"bins": 220,
"bin_range": [0, 220]
}
Event_Bin_Dict = {
"event_bins": 220,
"b_range": [0, 220]
}
bins = Hist_Bin_Dict["bins"]
bin_range = Hist_Bin_Dict["bin_range"]
event_bins = Event_Bin_Dict["event_bins"]
b_range = Event_Bin_Dict["b_range"]
```
%% Cell type:markdown id: tags:
As a first step, the calibration constants have to be retrieved from the calibration database.
%% Cell type:code id: tags:
``` python
display(Markdown("### Constants retrieval"))
step_timer.start()
# In the case of an older proposal (e.g., proposal 002857),
# it is possible that the bias voltage was 0
# resulting in the absence of bias voltage values in
# the previously injected dark constants. This situation can be
# attributed to a feature that is currently not available in iCalibrationDB.
if bias_voltage == 0 and remove_bias_voltage_if_zero:
bias_voltage = None
pnccd_cal = PNCCD_CalibrationData(
detector_name=karabo_id,
sensor_bias_voltage=bias_voltage,
integration_time=integration_time,
sensor_temperature=fix_temperature_top,
gain_setting=gain,
event_at=creation_time,
source_energy=photon_energy,
client=rest_cfg.calibration_client(),
)
pnccd_metadata = pnccd_cal.metadata(calibrations=pnccd_cal.dark_calibrations)
if relgain:
try:
gain_metadata = pnccd_cal.metadata(calibrations=["RelativeGainCCD"])
for mod, md in gain_metadata.items():
pnccd_metadata[mod].update(md)
except CalCatError as e: # TODO: fix after getting new exceptions.
warning(f"{e} While asking for {pnccd_cal.illuminated_calibrations}")
warning("RelativeGainEPix100 is not retrieved from the calibration database. "
"Relative gain correction is disabled.")
corr_bools['relgain'] = False
# Display retrieved calibration constants timestamps
pnccd_cal.display_markdown_retrieved_constants(metadata=pnccd_metadata)
metadata = pnccd_metadata[karabo_da]
# Validate the constants availability and raise/warn correspondingly.
missing_dark_constants = set(
c for c in pnccd_cal.dark_calibrations if c not in metadata.keys())
if missing_dark_constants:
raise KeyError(
f"Dark constants {missing_dark_constants} are not available for correction.")
# Record constant details in YAML metadata
write_constants_fragment(
out_folder=(metadata_folder or out_folder),
det_metadata=pnccd_metadata,
caldb_root=pnccd_cal.caldb_root,
)
# load constants arrays after storing fragment YAML file
# and validating constants availability.
constants = pnccd_cal.ndarray_map(metadata=pnccd_metadata).get(karabo_da, {})
step_timer.done_step("Constants retrieval")
```
%% Cell type:code id: tags:
``` python
fig = xana.heatmapPlot(constants["OffsetCCD"][:,:,0], x_label='Columns', y_label='Rows', lut_label='Offset (ADU)',
aspect=1,
x_range=(0, pixels_y), y_range=(0, pixels_x), vmax=16000,
panel_x_label='Row Stat (ADU)', panel_y_label='Column Stat (ADU)',
title = 'Dark Offset Map')
fig = xana.heatmapPlot(constants["NoiseCCD"][:,:,0], x_label='Columns', y_label='Rows',
lut_label='Corrected Noise (ADU)',
aspect=1, x_range=(0, pixels_y), y_range=(0, pixels_x),
panel_x_label='Row Stat (ADU)', panel_y_label='Column Stat (ADU)',
title = 'Dark Noise Map')
fig = xana.heatmapPlot(np.log2(constants["BadPixelsDarkCCD"][:,:,0]), x_label='Columns', y_label='Rows',
lut_label='Bad Pixel Value (ADU)',
aspect=1, x_range=(0, pixels_y), y_range=(0, pixels_x),
panel_x_label='Row Stat (ADU)', panel_y_label='Column Stat (ADU)',
title = 'Dark Bad Pixels Map')
if corr_bools.get('relgain'):
fig = xana.heatmapPlot(constants["RelativeGainCCD"], figsize=(8, 8), x_label='Columns', y_label='Rows',
lut_label='Relative Gain',
aspect=1, x_range=(0, pixels_y), y_range=(0, pixels_x), vmin=0.8, vmax=1.2,
panel_x_label='Row Stat (ADU)', panel_y_label='Column Stat (ADU)',
panel_top_low_lim = 0.5, panel_top_high_lim = 1.5, panel_side_low_lim = 0.5,
panel_side_high_lim = 1.5,
title = f'Relative Gain Map for pnCCD (Gain = 1/{int(gain)})')
```
%% Cell type:code id: tags:
``` python
#************************ Calculators ************************#
if corr_bools.get('common_mode'):
# Common Mode Correction Calculator:
cmCorrection = xcal.CommonModeCorrection([pixels_x, pixels_y],
commonModeBlockSize,
commonModeAxis,
parallel=False, dType=np.float32, stride=1,
noiseMap=constants["NoiseCCD"].astype(np.float32), minFrac=0.25)
if corr_bools.get('pattern_class'):
# Pattern Classifier Calculator:
# Left Hemisphere:
patternClassifierLH = xcal.PatternClassifier([pixels_x, pixels_y//2],
constants["NoiseCCD"][:, :pixels_y//2],
split_evt_primary_threshold,
split_evt_secondary_threshold,
split_evt_mip_threshold,
tagFirstSingles=3, # track along y-axis, left to right (see
nCells=1, # split_event.py file in pydetlib/lib/src/
allowElongated=False, # XFELDetAna/algorithms)
blockSize=[pixels_x, pixels_y//2],
parallel=False)
# Right Hemisphere:
patternClassifierRH = xcal.PatternClassifier([pixels_x, pixels_y//2],
constants["NoiseCCD"][:, pixels_y//2:],
split_evt_primary_threshold,
split_evt_secondary_threshold,
split_evt_mip_threshold,
tagFirstSingles=4, # track along y-axis, right to left
nCells=1,
allowElongated=False,
blockSize=[pixels_x, pixels_y//2],
parallel=False)
patternClassifierLH._imagesPerChunk = 1
patternClassifierRH._imagesPerChunk = 1
patternClassifierLH._noisemap = constants["NoiseCCD"][:, :pixels_x//2]
patternClassifierRH._noisemap = constants["NoiseCCD"][:, pixels_x//2:]
# Setting bad pixels:
patternClassifierLH.setBadPixelMask(constants["BadPixelsDarkCCD"][:, :pixels_x//2] != 0)
patternClassifierRH.setBadPixelMask(constants["BadPixelsDarkCCD"][:, pixels_x//2:] != 0)
```
%% Cell type:code id: tags:
``` python
#***************** Histogram Calculators ******************#
# Will contain uncorrected data:
histCalRaw = xcal.HistogramCalculator(sensorSize,
bins=bins,
range=bin_range,
nCells=1,
parallel=False,
blockSize=blockSize)
# Will contain offset corrected data:
histCalOffsetCor = xcal.HistogramCalculator(sensorSize,
bins=bins,
range=bin_range,
nCells=1,
parallel=False,
blockSize=blockSize)
if corr_bools.get('common_mode'):
# Will contain common mode corrected data:
histCalCommonModeCor = xcal.HistogramCalculator(sensorSize,
bins=bins,
range=bin_range,
nCells=1,
parallel=False,
blockSize=blockSize)
if corr_bools.get('pattern_class'):
# Will contain split events pattern data:
histCalPcorr = xcal.HistogramCalculator(sensorSize,
bins=bins,
range=bin_range,
nCells=1,
parallel=False,
blockSize=blockSize)
# Will contain singles events data:
histCalPcorrS = xcal.HistogramCalculator(sensorSize,
bins=bins,
range=bin_range,
nCells=1,
parallel=False,
blockSize=blockSize)
if corr_bools.get('relgain'):
# Will contain gain corrected data:
histCalGainCor = xcal.HistogramCalculator(sensorSize,
bins=bins,
range=bin_range,
nCells=1,
parallel=False,
blockSize=blockSize)
```
%% Cell type:markdown id: tags:
## Applying corrections to the raw data
%% Cell type:code id: tags:
``` python
def offset_correction(wid, index, d):
"""offset correction.
Equating bad pixels' values to np.nan,
so that the pattern classifier ignores them:
"""
d = d.copy()
# TODO: To clear this up. Is it on purpose to save corrected data with nans?
d[bpix != 0] = np.nan
d -= offset # offset correction
# TODO: to clear this up. why save the badpixels map in the corrected data?
bpix_data[index, ...] = bpix
data[index, ...] = d
def common_mode(wid, index, d):
"""common-mode correction.
Discarding events caused by saturated pixels:
"""
d = np.squeeze(cmCorrection.correct(d, cellTable=np.zeros(pixels_y, np.int32)))
# we equate these values to np.nan so that the pattern classifier ignores them:
d[d >= saturated_threshold] = np.nan
data[index, ...] = d
def gain_correction(wid, index, d):
"""relative gain correction."""
d /= relativegain
data[index, ...] = d
def pattern_classification_correction(wid, index, d):
"""Pattern classification correction.
Divides the frame into left and right hemispheres, classifies split
events in each half, and stores the corrected image, the pattern map
and a singles-only (filtered) image.
"""
# pattern classification on corrected data
dataLH, patternsLH = patternClassifierLH.classify(d[:, :pixels_x//2])
dataRH, patternsRH = patternClassifierRH.classify(d[:, pixels_x//2:])
d[:, :pixels_x//2] = np.squeeze(dataLH)
d[:, pixels_x//2:] = np.squeeze(dataRH)
patterns = np.zeros(d.shape, patternsLH.dtype)
patterns[:, :pixels_x//2] = np.squeeze(patternsLH)
patterns[:, pixels_x//2:] = np.squeeze(patternsRH)
d[d < split_evt_primary_threshold*noise] = 0
data[index, ...] = d
ptrn_data[index, ...] = patterns
d[patterns != 100] = np.nan # Discard doubles, triples, quadruple, clusters, first singles
filtered_data[index, ...] = d
```
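%% Cell type:markdown id: tags:
The correction kernels above follow pasha's map convention: each call receives (worker_id, index, item) and writes its result into shared arrays allocated with context.alloc. The next cell is a minimal, self-contained sketch of that pattern using dummy data (not part of the original notebook); the actual context and shared arrays for the pnCCD data are set up in the following cells.
%% Cell type:code id: tags:
``` python
# Minimal sketch of the pasha kernel pattern used by the correction functions above.
# The frames here are random dummies standing in for pnCCD images.
demo_ctx = psh.context.ThreadContext(num_workers=2)
demo_frames = np.random.rand(4, 8, 8).astype(np.float32)  # (frames, rows, columns)
demo_out = demo_ctx.alloc(shape=demo_frames.shape, dtype=np.float32)

def demo_kernel(wid, index, frame):
    # wid: worker id, index: position along the mapped axis, frame: one item.
    demo_out[index] = frame - frame.mean()  # stand-in for a per-frame correction

demo_ctx.map(demo_kernel, demo_frames)
print(demo_out.shape)
```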
%% Cell type:code id: tags:
``` python
# 10 is a number chosen after testing 1 ... 71 parallel threads for a node with 72 cpus.
parallel_num_threads = 10
context = psh.context.ThreadContext(num_workers=parallel_num_threads)
data_path = "INSTRUMENT/"+instrument_src+"/data/"
offset = np.squeeze(constants["OffsetCCD"])
noise = np.squeeze(constants["NoiseCCD"])
bpix = np.squeeze(constants["BadPixelsDarkCCD"])
relativegain = constants.get("RelativeGainCCD")
```
%% Cell type:code id: tags:
``` python
def write_datasets(seq_dc, corr_arrays, out_file, instrument_src):
"""
Create the datasets first, then add the data.
This keeps the metadata together at the start of the file,
so it is quick to see what the file contains.
"""
# Create CORR files and add corrected data sections.
image_counts = seq_dc[instrument_src, "data.image"].data_counts(labelled=False)
dataset_chunk = ((chunk_size_idim,) + corr_arrays["pixels"].shape[1:]) # e.g. (1, pixels_x, pixels_y)
with DataFile(out_file, 'w') as ofile:
seq_file = seq_dc.files[0]
# Create INDEX datasets.
ofile.create_index(seq_dc.train_ids, from_file=seq_file)
# Create METADATA datasets
ofile.create_metadata(
like=seq_dc,
sequence=seq_file.sequence,
instrument_channels=(f"{instrument_src}/data",)
)
# Create Instrument section to later add corrected datasets.
outp_source = ofile.create_instrument_source(instrument_src)
# Create count/first datasets at INDEX source.
outp_source.create_index(data=image_counts)
# Store uncorrected trainId in the corrected file.
outp_source.create_key(
f"data.trainId", data=seq_dc.train_ids,
chunks=min(50, len(seq_dc.train_ids))
)
# TODO: gain dataset is just the RelativeGain constant
# and it doesn't make sense to write it into corrected data.
comp_fields = ["gain", "patterns", "pixels_classified"]
# TODO: to clear this up: why save corrected data
# in data/pixels rather than data/image.
for field, data in corr_arrays.items():
if field in comp_fields: # Write compressed corrected data.
outp_source.create_compressed_key(f"data.{field}", data=data)
else:
outp_source.create_key(
f"data.{field}", data=data,
chunks=dataset_chunk
)
```
%% Cell type:code id: tags:
``` python
# Data corrections and event classifications happen here.
# Also, the corrected data are written to datasets:
empty_seq = 0
for seq_n, seq_f in enumerate(seq_files):
seq_dc = H5File(seq_f)
out_file = f"{out_folder}/{seq_f.name}".replace("RAW", "CORR")
step_timer.start()
img_dc = seq_dc[instrument_src, "data.image"]
dshape = seq_dc[instrument_src, "data.image"].shape
n_trains = dshape[0]
corr_ntrains = dshape[0] # number of available trains to correct.
all_train_ids = img_dc.train_ids # All trains including trains with no data.
# Raise a WARNING if this sequence has no trains to correct.
# Otherwise, print number of trains with no data.
if corr_ntrains == 0:
warning(f"No trains to correct for {seq_f.name}: "
"Skipping the processing of this file.")
empty_seq += 1
continue
elif len(all_train_ids) != corr_ntrains:
print(
f"{seq_f.name} has {len(all_train_ids) - corr_ntrains} "
"trains with missing data."
)
# If you want to analyze only a certain number of frames
# instead of all available good frames.
if limit_trains > 0:
print(f"\nCorrected trains are limited to: {limit_trains} trains")
corr_ntrains = min(corr_ntrains, limit_trains)
data_shape = (corr_ntrains, dshape[1], dshape[2])
print(f"Correcting file {seq_f} of {corr_ntrains} trains.")
# Overwrite seq_dc after eliminating empty trains or/and applying limited images.
seq_dc = seq_dc.select(
instrument_src, "*", require_all=True).select_trains(np.s_[:corr_ntrains])
raw_data = seq_dc[instrument_src, "data.image"].ndarray().astype(np.float32)
to_store_arrays = {"image": raw_data}
# TODO: move the parts for reading data to plot to later cells.
if seq_n == 0:
raw_plt = raw_data.copy() # plot first sequence only
step_timer.start()
# Allocating shared arrays for data arrays for each correction stage.
data = context.alloc(shape=data_shape, dtype=np.float32)
bpix_data = context.alloc(shape=data_shape, dtype=np.uint32)
histCalRaw.fill(raw_data) # filling histogram with raw uncorrected data
# Applying offset correction
context.map(offset_correction, raw_data)
histCalOffsetCor.fill(data) # filling histogram with offset corrected data
if seq_n == 0:
off_data = data.copy() # plot first sequence only
to_store_arrays["pixels"] = data.copy()
to_store_arrays["mask"] = bpix_data
step_timer.done_step(f'offset correction.')
if corr_bools.get('common_mode'):
step_timer.start()
# Applying common mode correction
context.map(common_mode, data)
if seq_n == 0:
cm_data = data.copy() # plot first sequence only
to_store_arrays["pixels_cm"] = data.copy()
histCalCommonModeCor.fill(data) # filling histogram with common mode corrected data
step_timer.done_step(f'common-mode correction.')
if corr_bools.get('relgain'):
step_timer.start()
# Applying gain correction
context.map(gain_correction, data)
if seq_n == 0:
rg_data = data.copy() # plot first sequence only
# TODO: Why storing a repeated constant for each image in corrected files.
to_store_arrays["gain"] = np.repeat(relativegain[np.newaxis, ...], corr_ntrains, axis=0).astype(np.float32) # noqa
histCalGainCor.fill(data) # filling histogram with gain corrected data
step_timer.done_step(f'gain correction.')
if corr_bools.get('pattern_class'):
step_timer.start()
ptrn_data = context.alloc(shape=data_shape, dtype=np.int32)
filtered_data = context.alloc(shape=data_shape, dtype=np.int32)
# Applying pattern classification correction
# Even though data is indeed of dtype np.float32,
# not specifying this again screws up the data quality.
context.map(pattern_classification_correction, data.astype(np.float32))
if seq_n == 0:
cls_data = data.copy() # plot first sequence only
# split event corrected images plotted for first sequence only
# (also these events are only singles events):
to_store_arrays["pixels_classified"] = data.copy()
to_store_arrays["patterns"] = ptrn_data
histCalPcorr.fill(data) # filling histogram with split events corrected data
# filling histogram with corr data after discarding doubles, triples, quadruple, clusters, and first singles
histCalPcorrS.fill(filtered_data)
step_timer.done_step(f'pattern classification correction.')
step_timer.start()
# Storing corrected data sources.
write_datasets(
seq_dc=seq_dc,
corr_arrays=to_store_arrays,
out_file=out_file,
instrument_src=instrument_src,
)
step_timer.done_step(f'Storing data.')
# Exit and raise warning if there are no data to correct for all sequences.
if empty_seq == len(seq_files):
warning("No valid trains for RAW data to correct.")
sys.exit(0)
```
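%% Cell type:markdown id: tags:
As a usage sketch (not part of the original notebook), the corrected sequence files written above by write_datasets can be reopened with extra_data to check the stored keys. The glob pattern below assumes the CORR files in out_folder stem from this run.
%% Cell type:code id: tags:
``` python
# Hedged example: inspect the first corrected output file, if one was written.
corr_files = sorted(Path(out_folder).glob("CORR-*.h5"))
if corr_files:
    with H5File(corr_files[0]) as corr_dc:
        corr_pixels = corr_dc[instrument_src, "data.pixels"].ndarray()  # offset-corrected images
        corr_mask = corr_dc[instrument_src, "data.mask"].ndarray()      # bad-pixel map per frame
        print(corr_files[0].name, corr_pixels.shape, corr_mask.dtype)
else:
    print("No corrected files were written.")
```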
%% Cell type:code id: tags:
``` python
print("In addition to offset correction, the following corrections were performed:")
for k, v in corr_bools.items():
if v:
print(" -", k.upper())
print(f"Total processing time {step_timer.timespan():.01f} s")
step_timer.print_summary()
```
%% Cell type:code id: tags:
``` python
# Histogramming the resulting spectra:
# The first _ refers to the bin edges and the second _ to the statistics; we ignore both.
# If you use histCalRaw.get(cumulatative=True) and so on, cumulatative=True turns the counts arrays such as
# RawHistVals into 1D arrays instead of keeping the original shape:
RawHistVals, _, RawHistMids, _ = histCalRaw.get()
off_cor_HistVals, _, off_cor_HistMids, _ = histCalOffsetCor.get()
if corr_bools.get('common_mode'):
cm_cor_HistVals, _, cm_HistMids, _ = histCalCommonModeCor.get()
if corr_bools.get('relgain'):
gain_cor_HistVals, _, gain_cor_HistMids, _ = histCalGainCor.get()
if corr_bools.get('pattern_class'):
split_HistVals, _, split_HistMids, _ = histCalPcorr.get() # split events corrected
singles_HistVals, _, singles_HistMids, _ = histCalPcorrS.get() # last s in variable names: singles events
```
%% Cell type:code id: tags:
``` python
# Saving intermediate data to disk:
step_timer.start()
np.savez(os.path.join(out_folder, 'Raw_Events.npz'), RawHistMids, RawHistVals)
np.savez(os.path.join(out_folder, 'Offset_Corrected_Events.npz'), off_cor_HistMids, off_cor_HistVals)
if corr_bools.get('common_mode'):
np.savez(os.path.join(out_folder, 'Common_Mode_Corrected_Events.npz'), cm_HistMids, cm_cor_HistVals)
if corr_bools.get('relgain'):
np.savez(os.path.join(out_folder, 'Gain_Corrected_Events.npz'), gain_cor_HistMids, gain_cor_HistVals)
if corr_bools.get('pattern_class'):
np.savez(os.path.join(out_folder, 'Split_Events_Corrected_Events.npz'), split_HistMids, split_HistVals)
np.savez(os.path.join(out_folder, 'Singles_Events.npz'), singles_HistMids, singles_HistVals)
step_timer.done_step(f'Saving intermediate data to disk.')
print("Various spectra are saved to disk in the form of histograms. Please check {}".format(out_folder))
```
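%% Cell type:markdown id: tags:
Because the arrays are passed positionally to np.savez, they end up under the default keys arr_0 (bin centres) and arr_1 (counts). A short reading-back sketch (not part of the original notebook):
%% Cell type:code id: tags:
``` python
# Sketch: reload one of the intermediate histograms saved above.
with np.load(os.path.join(out_folder, 'Raw_Events.npz')) as npz:
    raw_mids, raw_vals = npz['arr_0'], npz['arr_1']
print(raw_mids.shape, raw_vals.shape)
```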
%% Cell type:code id: tags:
``` python
display(Markdown('### Raw vs. Corrected Spectra'))
step_timer.start()
figure = [{'x': RawHistMids,
'y': RawHistVals,
'y_err': np.sqrt(RawHistVals[:]),
'drawstyle': 'steps-post',
'errorstyle': 'bars',
'errorcoarsing': 2,
'label': 'Uncorrected'
},
{'x': off_cor_HistMids,
'y': off_cor_HistVals,
'y_err': np.sqrt(off_cor_HistVals[:]),
'drawstyle': 'steps-post',
'errorstyle': 'bars',
'errorcoarsing': 2,
'label': 'Offset Corrected'
}]
if corr_bools.get('common_mode'):
figure.append({'x': cm_HistMids,
'y': cm_cor_HistVals,
'y_err': np.sqrt(cm_cor_HistVals[:]),
'drawstyle': 'steps-post',
'errorstyle': 'bars',
'errorcoarsing': 2,
'label': 'Common Mode Corrected'})
if corr_bools.get('relgain'):
xrange = bin_range
figure.append({'x': gain_cor_HistMids,
'y': gain_cor_HistVals,
'y_err': np.sqrt(gain_cor_HistVals[:]),
'drawstyle': 'steps-post',
'errorstyle': 'bars',
'errorcoarsing': 2,
'label': 'Gain Corrected'})
if corr_bools.get('pattern_class'):
figure.extend([{'x': split_HistMids,
'y': split_HistVals,
'y_err': np.sqrt(split_HistVals[:]),
'drawstyle': 'steps-post',
'errorstyle': 'bars',
'errorcoarsing': 2,
'label': 'Split Events Corrected'
},
{'x': singles_HistMids,
'y': singles_HistVals,
'y_err': np.sqrt(singles_HistVals[:]),
'drawstyle': 'steps-post',
'errorstyle': 'bars',
'errorcoarsing': 2,
'label': 'Singles Events'
}])
fig = xana.simplePlot(figure, aspect=1, x_label='ADU', y_label='Number of Occurrences', figsize='2col',
y_log=True, x_range=bin_range, title = '1 ADU per bin is used.',
legend='top-right-frame-1col')
step_timer.done_step('Plotting')
```
%% Cell type:code id: tags:
``` python
# This function plots pattern statistics:
def classification_plot(patternStats, hemisphere):
print("****************** {} HEMISPHERE ******************\n"
.format(hemisphere))
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(4, 4, 1)
sfields = ["singles", "first singles", "clusters"]
mfields = ["doubles", "triples", "quads"]
relativeOccurances = []
labels = []
for i, f in enumerate(sfields):
relativeOccurances.append(patternStats[f])
labels.append(f)
for i, f in enumerate(mfields):
for k in range(len(patternStats[f])):
relativeOccurances.append(patternStats[f][k])
labels.append("{}({})".format(f, k))
relativeOccurances = np.array(relativeOccurances, float)  # np.float was removed in NumPy 1.24
relativeOccurances /= np.sum(relativeOccurances)
pie = ax.pie(relativeOccurances, labels=labels, autopct='%1.1f%%', shadow=True)
ax.set_title("Pattern Occurrence")
# Set aspect ratio to be equal so that pie is drawn as a circle.
a = ax.axis('equal')
smaps = ["singlemap", "firstsinglemap", "clustermap"]
for i, m in enumerate(smaps):
ax = fig.add_subplot(4, 4, 2+i)
pmap = ax.imshow(patternStats[m], interpolation="nearest", vmax=2*np.nanmedian(patternStats[m]))
ax.set_title(m)
cb = fig.colorbar(pmap)
mmaps = ["doublemap", "triplemap", "quadmap"]
k = 0
for i, m in enumerate(mmaps):
for j in range(4):
ax = fig.add_subplot(4, 4, 2+len(smaps)+k)
pmap = ax.imshow(patternStats[m][j], interpolation="nearest", vmax=2*np.median(patternStats[m][j]))
ax.set_title("{}({})".format(m,j))
cb = fig.colorbar(pmap)
k+=1
```
%% Cell type:code id: tags:
``` python
# The next two cells plot the classification results for left and right hemispheres, respectively:
display(Markdown('### Classification Results - Plots'))
if corr_bools.get('pattern_class'):
patternStatsLH = patternClassifierLH.getPatternStats()
classification_plot(patternStatsLH, 'Left')
```
%% Cell type:code id: tags:
``` python
if corr_bools.get('pattern_class'):
patternStatsRH = patternClassifierRH.getPatternStats()
classification_plot(patternStatsRH, 'Right')
```
%% Cell type:code id: tags:
``` python
display(Markdown('### Classification Results - Tabulated Statistics'))
if corr_bools.get('pattern_class'):
step_timer.start()
t0 = PrettyTable()
t0.title = "Total Number of Counts after All Corrections"
t0.field_names = ["Hemisphere", "Singles", "First-Singles", "Clusters"]
t0.add_row(["LH", patternStatsLH['singles'], patternStatsLH['first singles'], patternStatsLH['clusters']])
t0.add_row(["RH", patternStatsRH['singles'], patternStatsRH['first singles'], patternStatsRH['clusters']])
print(t0)
print("Abbreviations: D (Doubles), T (Triples), Q (Quadruples), L (Left), R (Right), and H (Hemisphere).")
t1 = PrettyTable()
t1.field_names = ["Index", "D-LH", "D-RH", "T-LH", "T-RH", "Q-LH", "Q-RH"]
t1.add_row([0, patternStatsLH['doubles'][0], patternStatsRH['doubles'][0], patternStatsLH['triples'][0],
patternStatsRH['triples'][0], patternStatsLH['quads'][0], patternStatsRH['quads'][0]])
t1.add_row([1, patternStatsLH['doubles'][1], patternStatsRH['doubles'][1], patternStatsLH['triples'][1],
patternStatsRH['triples'][1], patternStatsLH['quads'][1], patternStatsRH['quads'][1]])
t1.add_row([2, patternStatsLH['doubles'][2], patternStatsRH['doubles'][2], patternStatsLH['triples'][2],
patternStatsRH['triples'][2], patternStatsLH['quads'][2], patternStatsRH['quads'][2]])
t1.add_row([3, patternStatsLH['doubles'][3], patternStatsRH['doubles'][3], patternStatsLH['triples'][3],
patternStatsRH['triples'][3], patternStatsLH['quads'][3], patternStatsRH['quads'][3]])
print(t1)
step_timer.done_step('Classification Results - Tabulated Statistics')
```
%% Cell type:code id: tags:
``` python
if corr_bools.get('pattern_class'):
doublesLH = patternStatsLH['doubles'][0] + patternStatsLH['doubles'][1] + patternStatsLH['doubles'][2] + \
patternStatsLH['doubles'][3]
triplesLH = patternStatsLH['triples'][0] + patternStatsLH['triples'][1] + patternStatsLH['triples'][2] + \
patternStatsLH['triples'][3]
quadsLH = patternStatsLH['quads'][0] + patternStatsLH['quads'][1] + patternStatsLH['quads'][2] + \
patternStatsLH['quads'][3]
allsinglesLH = patternStatsLH['singles'] + patternStatsLH['first singles']
eventsLH = allsinglesLH + doublesLH + triplesLH + quadsLH
doublesRH = patternStatsRH['doubles'][0] + patternStatsRH['doubles'][1] + patternStatsRH['doubles'][2] + \
patternStatsRH['doubles'][3]
triplesRH = patternStatsRH['triples'][0] + patternStatsRH['triples'][1] + patternStatsRH['triples'][2] + \
patternStatsRH['triples'][3]
quadsRH = patternStatsRH['quads'][0] + patternStatsRH['quads'][1] + patternStatsRH['quads'][2] + \
patternStatsRH['quads'][3]
allsinglesRH = patternStatsRH['singles'] + patternStatsRH['first singles']
eventsRH = allsinglesRH + doublesRH + triplesRH + quadsRH
if eventsLH > 0.:
reloccurLH = np.array([allsinglesLH/eventsLH, doublesLH/eventsLH, triplesLH/eventsLH, quadsLH/eventsLH])
else:
reloccurLH = np.array([0]*4)
if eventsRH > 0.:
reloccurRH = np.array([allsinglesRH/eventsRH, doublesRH/eventsRH, triplesRH/eventsRH, quadsRH/eventsRH])
else:
reloccurRH = np.array([0]*4)
```
%% Cell type:code id: tags:
``` python
display(Markdown('### Classification Results - Pie Charts'))
if corr_bools.get('pattern_class'):
step_timer.start()
fig = plt.figure(figsize=(12, 7))
ax = fig.add_subplot(1, 2, 1)
labels = ['Singles', 'Doubles', 'Triples', 'Quads']
pie = ax.pie(reloccurLH, labels=labels, autopct='%1.1f%%', shadow=True)
ax.set_title("Pattern Occurrence in LH")
# Set aspect ratio to be equal so that pie is drawn as a circle.
a = ax.axis('equal')
ax = fig.add_subplot(1, 2, 2)
pie = ax.pie(reloccurRH, labels=labels, autopct='%1.1f%%', shadow=True)
ax.set_title("Pattern Occurrence in RH")
# Set aspect ratio to be equal so that pie is drawn as a circle.
a = ax.axis('equal')
step_timer.done_step('Classification Results - Pie Charts')
```
%% Cell type:markdown id: tags:
### Various Images Averaged Over All Frames of Only the First Sequence ###
%% Cell type:code id: tags:
``` python
step_timer.start()
uncor_mean_im = np.nanmean(raw_data, axis=0)
offset_mean_im = np.nanmean(off_data, axis=0)
if corr_bools.get('common_mode'):
cm_mean_im = np.nanmean(cm_data, axis=0)
if corr_bools.get('relgain'):
gain_mean_im = np.nanmean(rg_data, axis=0)
if corr_bools.get('pattern_class'):
mean_im_cc = np.nanmean(cls_data, axis=0)
fig = xana.heatmapPlot(uncor_mean_im, x_label='Columns', y_label='Rows', lut_label='Signal (ADU)', aspect=1,
x_range=(0, pixels_y), y_range=(0, pixels_x),
panel_x_label='Row Stat (ADU)', panel_y_label='Column Stat (ADU)',
title = 'Uncorrected Image Averaged over Frames in the First Sequence')
fig = xana.heatmapPlot(offset_mean_im, x_label='Columns', y_label='Rows', lut_label='Signal (ADU)', aspect=1,
x_range=(0, pixels_y), y_range=(0, pixels_x),
panel_x_label='Row Stat (ADU)', panel_y_label='Column Stat (ADU)',
title = 'Offset Corrected Image Averaged over Frames in the First Sequence')
if corr_bools.get('common_mode'):
fig = xana.heatmapPlot(cm_mean_im, x_label='Columns', y_label='Rows', lut_label='Signal (ADU)', aspect=1,
x_range=(0, pixels_y), y_range=(0, pixels_x),
panel_x_label='Row Stat (ADU)', panel_y_label='Column Stat (ADU)',
title = 'Common Mode Corrected Image Averaged over Frames in the First Sequence')
if corr_bools.get('relgain'):
fig = xana.heatmapPlot(gain_mean_im, x_label='Columns', y_label='Rows',
lut_label='Signal (ADU)', aspect=1,
x_range=(0, pixels_y), y_range=(0, pixels_x),
panel_x_label='Row Stat (ADU)', panel_y_label='Column Stat (ADU)',
title = 'Gain Corrected Image Averaged over Frames in the First Sequence')
if corr_bools.get('pattern_class'):
fig = xana.heatmapPlot(mean_im_cc, x_label='Columns', y_label='Rows', lut_label='Signal (ADU)', aspect=1,
x_range=(0, pixels_y), y_range=(0, pixels_x), vmin=0, vmax= 18000,
title = 'Image of Single Events Averaged over Frames in the First Sequence')
step_timer.done_step("Plotting")
```
%% Cell type:markdown id: tags:
### Images of the First Frame of the First Sequence ###
%% Cell type:code id: tags:
``` python
step_timer.start()
fig = xana.heatmapPlot(raw_data[0, :, :], x_label='Columns', y_label='Rows', lut_label='Signal (ADU)', aspect=1,
x_range=(0, pixels_y), y_range=(0, pixels_x),
panel_x_label='Row Stat (ADU)', panel_y_label='Column Stat (ADU)',
title = 'Uncorrected Image (First Frame of the First Sequence)')
fig = xana.heatmapPlot(off_data[0, :, :], x_label='Columns', y_label='Rows', lut_label='Signal (ADU)', aspect=1,
x_range=(0, pixels_y), y_range=(0, pixels_x),
panel_x_label='Row Stat (ADU)', panel_y_label='Column Stat (ADU)',
title = 'Offset Corrected Image (First Frame of the First Sequence)')
if corr_bools.get('common_mode'):
fig = xana.heatmapPlot(cm_data[0, :, :], x_label='Columns', y_label='Rows', lut_label='Signal (ADU)',
aspect=1,
x_range=(0, pixels_y), y_range=(0, pixels_x),
panel_x_label='Row Stat (ADU)', panel_y_label='Column Stat (ADU)',
title = 'Common Mode Corrected Image (First Frame of the First Sequence)')
if corr_bools.get('relgain'):
fig = xana.heatmapPlot(rg_data[0, :, :], x_label='Columns', y_label='Rows', lut_label='Signal (ADU)',
aspect=1,
x_range=(0, pixels_y), y_range=(0, pixels_x),
panel_x_label='Row Stat (ADU)', panel_y_label='Column Stat (ADU)',
title = 'Gain Corrected Image (First Frame of the First Sequence)')
if corr_bools.get('pattern_class'):
fig = xana.heatmapPlot(cls_data[0, :, :], x_label='Columns', y_label='Rows', lut_label='Signal (ADU)', aspect=1,
x_range=(0, pixels_y), y_range=(0, pixels_x),
panel_x_label='Row Stat (ADU)', panel_y_label='Column Stat (ADU)',
title = 'Image of Single Events (First Frame of the First Sequence)')
step_timer.done_step("Plotting")
```
%% Cell type:code id: tags:
``` python
# Resetting the histogram calculators:
histCalRaw.reset()
histCalOffsetCor.reset()
if corr_bools.get('common_mode'):
histCalCommonModeCor.reset()
if corr_bools.get('relgain'):
histCalGainCor.reset()
if corr_bools.get('pattern_class'):
histCalPcorr.reset()
histCalPcorrS.reset()
```
%% Cell type:markdown id: tags:
Next, the corrected event patterns are read from the patterns dataset created previously and separated into 4 different categories (singles, doubles, triples and quadruples) using the pattern indices. However, this is done only for the first sequence, as an example.
Note that the number of bins and the bin range for the following histograms may differ from those presented above (depending on gain) to make the counts more noticeable and the peaks better defined.
If you are interested in plotting the events from all sequences or the spectra of half of the sensor, execute the spectra_pnCCD_NBC.ipynb notebook.
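%% Cell type:markdown id: tags:
For reference, the pattern indices described above can be collected in a small lookup table. This is only an illustrative helper (not part of the original notebook); the actual separation is done in the next cell.
%% Cell type:code id: tags:
``` python
# Illustrative mapping of pattern indices to event categories, as described above.
pattern_categories = {100: "single", 101: "first single"}
pattern_categories.update({200 + i: "double" for i in range(4)})     # left/right/up/down variants
pattern_categories.update({300 + i: "triple" for i in range(4)})
pattern_categories.update({400 + i: "quadruple" for i in range(4)})
print(pattern_categories)
```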
%% Cell type:code id: tags:
``` python
if corr_bools.get('pattern_class'):
singles = []
doubles = []
triples = []
quads = []
with H5File(f"{out_folder}/{seq_files[0].name.replace('RAW', 'CORR')}") as dc: # noqa
data = dc[instrument_src, "data.pixels_classified"].ndarray()
patterns = dc[instrument_src, "data.patterns"].ndarray()
# events' patterns indices are as follows: 100 (singles), 101 (first singles), 200 - 203 (doubles),
# 300 - 303 (triples), and 400 - 403 (quadruples). Note that for the last three types of patterns,
# there are left, right, up, and down indices.
# Separating the events:
# Singles and First Singles:
for s in range(100, 102):
single = data.copy()
single[patterns != s] = np.nan
singles.append(single)
for d in range(200, 204):
double = data.copy()
double[patterns != d] = np.nan
doubles.append(double)
for t in range(300, 304):
triple = data.copy()
triple[patterns != t] = np.nan
triples.append(triple)
for q in range(400, 404):
quad = data.copy()
quad[patterns != q] = np.nan
quads.append(quad)
```
%% Cell type:code id: tags:
``` python
if corr_bools.get('pattern_class'):
step_timer.start()
hA = 0
h = 0
for single in singles:
hs, e = np.histogram(single.flatten(), bins=event_bins, range=b_range) # h: histogram counts, e: bin edges
h += hs
hA += hs # hA: counts all events (see below)
# bin edges array has one extra element => need to plot from 0 to the one before the last element to have the
# same size as h-array => in what follows, we use e[:-1] (-1 means one before the last element)
display(Markdown('### Histograms of Corrected Events for One Sequence Only'))
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
ax.step(e[:-1], h, color='blue', label='Events Involving Single Pixels Only')
ax.semilogy() # y-axis is log, x-axis is linear
ax.set_xlabel("Energy (ADU) [{} bins per {} ADU]".format(event_bins, b_range[1]-b_range[0]))
ax.set_ylabel("Corrected Events for One Sequence (counts)")
ax.set_xlim(b_range)
h = 0
for double in doubles:
hd, e = np.histogram(double.flatten(), bins=event_bins, range=b_range)
h += hd
hA += hd
ax.step(e[:-1], h, color='red', label='Events Splitting on Double Pixels')
h = 0
for triple in triples:
ht, e = np.histogram(triple.flatten(), bins=event_bins, range=b_range)
h += ht
hA += ht
ax.step(e[:-1], h, color='green', label='Events Splitting on Triple Pixels')
h = 0
for quad in quads:
hq, e = np.histogram(quad.flatten(), bins=event_bins, range=b_range)
h += hq
hA += hq
ax.step(e[:-1], h, color='purple', label='Events Splitting on Quadruple Pixels')
ax.step(e[:-1], hA, color='grey', label='All Valid Events')
l = ax.legend()
step_timer.done_step("Plotting")
```
%% Cell type:code id: tags:
``` python
print(f"Total processing time {step_timer.timespan():.01f} s")
step_timer.print_summary()
```
......
......@@ -58,7 +58,7 @@ install_requires = [
"markupsafe==2.0.1",
"astcheck==0.3.0",
"cfelpyutils==2.0.6",
"calibration_client==11.2.0",
"calibration_client==11.3.0",
"dill==0.3.0",
"docutils==0.17.1",
"dynaconf==3.1.4",
......@@ -79,7 +79,7 @@ install_requires = [
"lxml==4.5.0",
"markupsafe==2.0.1",
"matplotlib==3.4.2",
"metadata_client==3.0.8",
"metadata_client==4.0.0",
"nbclient==0.5.1",
"nbconvert==5.6.1",
"nbformat==5.0.7",
......@@ -153,16 +153,7 @@ setup(
],
"test": [
"coverage",
"nbval",
"pytest-asyncio",
"pytest-cov",
"pytest-subprocess",
"pytest>=5.4.0",
"testpath",
"unittest-xml-reporting==3.0.2",
],
"automated_test": [
"coverage",
"deepdiff==6.7.1",
"nbval",
"pytest-asyncio",
"pytest-cov",
......
......@@ -347,10 +347,14 @@ class AgipdCtrlRuns:
sort_values = []
if self.gain_modes == self.adaptive_gain_modes: # Adaptive gain # sort by patterns
# Patterns -> DarkHG: 1, DarkMG: 2, DarkLG: 3
if "AGIPD1M" in self.ctrl_src:
if "patternTypeIndex" in self.run_ctrls[0].run_dc[self.ctrl_src]:
sort_by = "patternTypeIndex"
elif "AGIPD500K" in self.ctrl_src:
elif "expTypeIndex" in self.run_ctrls[0].run_dc[self.ctrl_src]:
sort_by = "expTypeIndex"
else:
raise ValueError(
"Neither `patternTypeIndex` nor `expTypeIndex` "
"keys are available in CTRL data source.")
for c in self.run_ctrls:
sort_values.append(
......
......@@ -2,6 +2,7 @@
from datetime import date, datetime, time, timezone
from functools import lru_cache
from pathlib import Path
from typing import Optional, Sequence
from weakref import WeakKeyDictionary
import h5py
......@@ -113,8 +114,7 @@ class CalCatApi(metaclass=ClientWrapper):
"""Encode operating condition to CalCat API format.
Args:
caldata (CalibrationData): Calibration data instance used to
interface with database.
condition (dict): Mapping of parameter DB name to value
Returns:
(dict) Operating condition for use in CalCat API.
......@@ -192,6 +192,19 @@ class CalCatApi(metaclass=ClientWrapper):
return resp_calibration["data"]["id"]
@lru_cache()
def calibration_name(self, calibration_id):
"""Name for a calibration in CalCat."""
resp_calibration = Calibration.get_by_id(
self.client, calibration_id
)
if not resp_calibration["success"]:
raise CalCatError(resp_calibration)
return resp_calibration["data"]["name"]
@lru_cache()
def parameter_id(self, param_name):
"""ID for an operating condition parameter in CalCat."""
......@@ -203,6 +216,33 @@ class CalCatApi(metaclass=ClientWrapper):
return resp_parameter["data"]["id"]
def _closest_ccv_by_time_by_condition(
self,
detector_name: str,
calibration_ids: Sequence[int],
condition: dict,
karabo_da: Optional[str] = None,
event_at=None,
pdu_snapshot_at=None,
):
resp = CalibrationConstantVersion.get_closest_by_time_by_detector_conditions(
self.client,
detector_name,
calibration_ids,
self.format_cond(condition),
karabo_da=karabo_da or "",
event_at=self.format_time(event_at),
pdu_snapshot_at=self.format_time(pdu_snapshot_at),
)
if not resp["success"]:
if resp["status_code"] == 200:
# calibration_client turns empty response into an error
return []
raise CalCatError(resp)
return resp["data"]
def closest_ccv_by_time_by_condition(
self,
detector_name,
......@@ -284,20 +324,16 @@ class CalCatApi(metaclass=ClientWrapper):
# afterwards, if necessary.
karabo_da = next(iter(da_to_modname)) if len(da_to_modname) == 1 else '',
resp_versions = CalibrationConstantVersion.get_closest_by_time_by_detector_conditions( # noqa
self.client,
resp_data = self._closest_ccv_by_time_by_condition(
detector_name,
calibration_ids,
self.format_cond(condition),
condition,
karabo_da=karabo_da,
event_at=event_at,
pdu_snapshot_at=pdu_snapshot_at,
)
if not resp_versions["success"]:
raise CalCatError(resp_versions)
for ccv in resp_versions["data"]:
for ccv in resp_data:
try:
mod = da_to_modname[ccv['physical_detector_unit']['karabo_da']]
except KeyError:
......@@ -1361,7 +1397,8 @@ class GOTTHARD2_CalibrationData(CalibrationData):
"""Calibration data for the Gotthard II detector."""
calibrations = {
"LUTGotthard2" "OffsetGotthard2",
"LUTGotthard2",
"OffsetGotthard2",
"NoiseGotthard2",
"BadPixelsDarkGotthard2",
"RelativeGainGotthard2",
......
import json
import re
from collections.abc import Mapping
from dataclasses import dataclass, field, replace
from datetime import date, datetime, time, timezone
from functools import lru_cache
from pathlib import Path
from typing import Dict, List, Optional, Union
from urllib.parse import urljoin
from warnings import warn
import h5py
import pasha as psh
import requests
from oauth2_xfel_client import Oauth2ClientBackend
# Default address to connect to, only available internally
CALCAT_PROXY_URL = "http://exflcalproxy.desy.de:8080/"
class ModuleNameError(KeyError):
def __init__(self, name):
self.name = name
def __str__(self):
return f"No module named {self.name!r}"
class CalCatAPIError(requests.HTTPError):
"""Used when the response includes error details as JSON"""
class CalCatAPIClient:
def __init__(self, base_api_url, oauth_client=None, user_email=""):
if oauth_client is not None:
self.oauth_client = oauth_client
self.session = self.oauth_client.session
else:
# Oauth disabled - used with base_api_url pointing to an
# xfel-oauth-proxy instance
self.oauth_client = None
self.session = requests.Session()
self.user_email = user_email
# Ensure the base URL has a trailing slash
self.base_api_url = base_api_url.rstrip("/") + "/"
def default_headers(self):
return {
"content-type": "application/json",
"Accept": "application/json; version=2",
"X-User-Email": self.user_email,
}
@classmethod
def format_time(cls, dt):
"""Parse different ways to specify time to CalCat."""
if isinstance(dt, datetime):
return dt.astimezone(timezone.utc).isoformat()
elif isinstance(dt, date):
return cls.format_time(datetime.combine(dt, time()))
elif not isinstance(dt, str):
raise TypeError(
f"Timestamp parameter ({dt!r}) must be a string, datetime or "
f"date object"
)
return dt
def get_request(self, relative_url, params=None, headers=None, **kwargs):
"""Make a GET request, return the HTTP response object"""
# Base URL may include e.g. '/api/'. This is a prefix for all URLs,
# even if they look like an absolute path.
url = urljoin(self.base_api_url, relative_url.lstrip("/"))
_headers = self.default_headers()
if headers:
_headers.update(headers)
return self.session.get(url, params=params, headers=_headers, **kwargs)
@staticmethod
def _parse_response(resp: requests.Response):
if resp.status_code >= 400:
try:
d = json.loads(resp.content.decode("utf-8"))
except Exception:
resp.raise_for_status()
else:
raise CalCatAPIError(
f"Error {resp.status_code} from API: "
f"{d.get('info', 'missing details')}"
)
if resp.content == b"":
return None
else:
return json.loads(resp.content.decode("utf-8"))
def get(self, relative_url, params=None, **kwargs):
"""Make a GET request, return response content from JSON"""
resp = self.get_request(relative_url, params, **kwargs)
return self._parse_response(resp)
_pagination_headers = (
"X-Total-Pages",
"X-Count-Per-Page",
"X-Current-Page",
"X-Total-Count",
)
def get_paged(self, relative_url, params=None, **kwargs):
"""Make a GET request, return response content & pagination info"""
resp = self.get_request(relative_url, params, **kwargs)
content = self._parse_response(resp)
pagination_info = {
k[2:].lower().replace("-", "_"): int(resp.headers[k])
for k in self._pagination_headers
if k in resp.headers
}
return content, pagination_info
# ------------------
# Cached wrappers for simple ID lookups of fixed-ish info
#
# N.B. lru_cache behaves oddly with instance methods (it's a global cache,
# with the instance as part of the key), but in this case it should be OK.
@lru_cache()
def calibration_by_id(self, cal_id):
return self.get(f"calibrations/{cal_id}")
@lru_cache()
def detector_by_id(self, det_id):
return self.get(f"detectors/{det_id}")
# --------------------
# Shortcuts to find 1 of something by an ID-like field (e.g. name) other
# than CalCat's own integer IDs. Error on no match or >1 matches.
@lru_cache()
def detector_by_identifier(self, identifier):
# The "identifier", "name" & "karabo_name" fields seem to have the same names
res = self.get("detectors", {"identifier": identifier})
if not res:
raise KeyError(f"No detector with identifier {identifier}")
elif len(res) > 1:
raise ValueError(f"Multiple detectors found with identifier {identifier}")
return res[0]
@lru_cache()
def calibration_by_name(self, name):
res = self.get("calibrations", {"name": name})
if not res:
raise KeyError(f"No calibration with name {name}")
elif len(res) > 1:
raise ValueError(f"Multiple calibrations found with name {name}")
return res[0]
global_client = None
def get_client():
global global_client
if global_client is None:
setup_client(CALCAT_PROXY_URL, None, None, None)
return global_client
def setup_client(
base_url,
client_id,
client_secret,
user_email,
scope="",
session_token=None,
oauth_retries=3,
oauth_timeout=12,
ssl_verify=True,
):
global global_client
if client_id is not None:
oauth_client = Oauth2ClientBackend(
client_id=client_id,
client_secret=client_secret,
scope=scope,
token_url=f"{base_url}/oauth/token",
session_token=session_token,
max_retries=oauth_retries,
timeout=oauth_timeout,
ssl_verify=ssl_verify,
)
else:
oauth_client = None
global_client = CalCatAPIClient(
f"{base_url}/api/",
oauth_client=oauth_client,
user_email=user_email,
)
# Check we can connect to exflcalproxy
if oauth_client is None and base_url == CALCAT_PROXY_URL:
try:
# timeout=(connect_timeout, read_timeout)
global_client.get_request("me", timeout=(1, 5))
except requests.ConnectionError as e:
raise RuntimeError(
"Could not connect to calibration catalog proxy. This proxy allows "
"unauthenticated access inside the XFEL/DESY network. To look up "
"calibration constants from outside, you will need to create an Oauth "
"client ID & secret in the CalCat web interface. You will still not "
"be able to load constants without the constant store folder."
) from e
_default_caldb_root = None
def _get_default_caldb_root():
global _default_caldb_root
if _default_caldb_root is None:
onc_path = Path("/common/cal/caldb_store")
maxwell_path = Path("/gpfs/exfel/d/cal/caldb_store")
if onc_path.is_dir():
_default_caldb_root = onc_path
elif maxwell_path.is_dir():
_default_caldb_root = maxwell_path
else:
raise RuntimeError(
f"Neither {onc_path} nor {maxwell_path} was found. If the caldb_store "
"directory is at another location, pass its path as caldb_root."
)
return _default_caldb_root
@dataclass
class SingleConstant:
"""A calibration constant for one detector module
CalCat calls this a calibration constant version (CCV).
"""
path: Path
dataset: str
ccv_id: Optional[int]
pdu_name: Optional[str]
_metadata: dict = field(default_factory=dict)
_have_calcat_metadata: bool = False
@classmethod
def from_response(cls, ccv: dict) -> "SingleConstant":
return cls(
path=Path(ccv["path_to_file"]) / ccv["file_name"],
dataset=ccv["data_set_name"],
ccv_id=ccv["id"],
pdu_name=ccv["physical_detector_unit"]["physical_name"],
_metadata=ccv,
_have_calcat_metadata=True,
)
def dataset_obj(self, caldb_root=None) -> h5py.Dataset:
if caldb_root is not None:
caldb_root = Path(caldb_root)
else:
caldb_root = _get_default_caldb_root()
f = h5py.File(caldb_root / self.path, "r")
return f[self.dataset]["data"]
def ndarray(self, caldb_root=None):
return self.dataset_obj(caldb_root)[:]
def _load_calcat_metadata(self, client=None):
client = client or get_client()
calcat_meta = client.get(f"calibration_constant_versions/{self.ccv_id}")
# Any metadata we already have takes precedence over CalCat, so
# this can't change a value that was previously returned.
self._metadata = calcat_meta | self._metadata
self._have_calcat_metadata = True
def metadata(self, key, client=None):
"""Get a specific metadata field, e.g. 'begin_validity_at'
This may make a request to CalCat if the value is not already known.
"""
if key not in self._metadata and not self._have_calcat_metadata:
if self.ccv_id is None:
raise KeyError(f"{key!r} (no CCV ID to request data from CalCat")
self._load_calcat_metadata(client)
return self._metadata[key]
def metadata_dict(self, client=None):
"""Get a dict of available metadata
If this constant didn't come from CalCat but we have a CalCat CCV ID,
this will fetch metadata from CalCat.
"""
if (not self._have_calcat_metadata) and (self.ccv_id is not None):
self._load_calcat_metadata(client)
return self._metadata.copy()
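As a hedged illustration of how a SingleConstant is consumed, the sketch below constructs one by hand; the file path and dataset name are placeholders rather than real CCV entries, and reading the data assumes access to the caldb_store on GPFS or the ONC:

``` python
from pathlib import Path

# Placeholder CCV location inside the caldb_store (not a real constant).
const = SingleConstant(
    path=Path("agipd/hypothetical_ccv_file.h5"),
    dataset="/AGIPD_SIV1_AGIPDV11_M001/Offset/0",
    ccv_id=None,       # without a CCV ID, metadata() cannot query CalCat
    pdu_name=None,
)
offset = const.ndarray()   # reads <dataset>/data relative to the default caldb_root
print(offset.shape, offset.dtype)
```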
def prepare_selection(
module_details, module_nums=None, aggregator_names=None, qm_names=None
):
aggs = aggregator_names # Shorter name -> fewer multi-line statements
n_specified = sum([module_nums is not None, aggs is not None, qm_names is not None])
if n_specified > 1:
raise TypeError(
"select_modules() accepts only one of module_nums, aggregator_names "
"& qm_names"
)
if module_nums is not None:
by_mod_no = {m["module_number"]: m for m in module_details}
return [by_mod_no[n]["karabo_da"] for n in module_nums]
elif qm_names is not None:
by_qm = {m["virtual_device_name"]: m for m in module_details}
return [by_qm[s]["karabo_da"] for s in qm_names]
elif aggs is not None:
miss = set(aggs) - {m["karabo_da"] for m in module_details}
if miss:
raise KeyError("Aggregators not found: " + ", ".join(sorted(miss)))
return aggs
else:
raise TypeError("select_modules() requires an argument")
@dataclass
class MultiModuleConstant(Mapping):
"""A group of similar constants for several modules of one detector"""
constants: Dict[str, SingleConstant] # Keys e.g. 'LPD00'
module_details: List[Dict]
detector_name: str # e.g. 'HED_DET_AGIPD500K2G'
calibration_name: str
def __repr__(self):
return (
f"<MultiModuleConstant: {self.calibration_name} for "
f"{len(self.constants)} modules of {self.detector_name}>"
)
def __iter__(self):
return iter(self.constants)
def __len__(self):
return len(self.constants)
def __getitem__(self, key):
if key in (None, ""):
raise KeyError(key)
candidate_kdas = set()
if key in self.constants: # Karabo DA name, e.g. 'LPD00'
candidate_kdas.add(key)
for m in self.module_details:
names = (m["module_number"], m["virtual_device_name"], m["physical_name"])
if key in names and m["karabo_da"] in self.constants:
candidate_kdas.add(m["karabo_da"])
if not candidate_kdas:
raise KeyError(key)
elif len(candidate_kdas) > 1:
raise KeyError(f"Ambiguous key: {key} matched {candidate_kdas}")
return self.constants[candidate_kdas.pop()]
def select_modules(
self, module_nums=None, *, aggregator_names=None, qm_names=None
) -> "MultiModuleConstant":
aggs = prepare_selection(
self.module_details, module_nums, aggregator_names, qm_names
)
d = {aggr: scv for (aggr, scv) in self.constants.items() if aggr in aggs}
mods = [m for m in self.module_details if m["karabo_da"] in d]
return replace(self, constants=d, module_details=mods)
# These properties label only the modules we have constants for, which may
# be a subset of what's in module_details
@property
def aggregator_names(self):
return sorted(self.constants)
@property
def module_nums(self):
return [
m["module_number"]
for m in self.module_details
if m["karabo_da"] in self.constants
]
@property
def qm_names(self):
return [
m["virtual_device_name"]
for m in self.module_details
if m["karabo_da"] in self.constants
]
@property
def pdu_names(self):
return [
m["physical_name"]
for m in self.module_details
if m["karabo_da"] in self.constants
]
def ndarray(self, caldb_root=None, *, parallel=0):
eg_dset = self.constants[self.aggregator_names[0]].dataset_obj(caldb_root)
shape = (len(self.constants),) + eg_dset.shape
if parallel > 0:
load_ctx = psh.ProcessContext(num_workers=parallel)
else:
load_ctx = psh.SerialContext()
arr = psh.alloc(shape, eg_dset.dtype, fill=0)
def _load_constant_dataset(wid, index, mod):
dset = self.constants[mod].dataset_obj(caldb_root)
dset.read_direct(arr[index])
load_ctx.map(_load_constant_dataset, self.aggregator_names)
return arr
def xarray(self, module_naming="modnum", caldb_root=None, *, parallel=0):
import xarray
if module_naming == "aggregator":
modules = self.aggregator_names
elif module_naming == "modnum":
modules = self.module_nums
elif module_naming == "qm":
modules = self.qm_names
else:
raise ValueError(
f"{module_naming=} (must be 'aggregator', 'modnum' or 'qm'"
)
ndarr = self.ndarray(caldb_root, parallel=parallel)
# Dimension labels
dims = ["module"] + ["dim_%d" % i for i in range(ndarr.ndim - 1)]
coords = {"module": modules}
name = self.calibration_name
return xarray.DataArray(ndarr, dims=dims, coords=coords, name=name)
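The flexible lookup in __getitem__ is easiest to see with a tiny, hand-made example; the module description and the `const` object below are hypothetical (the latter as in the SingleConstant sketch further up):

``` python
mod = {
    "karabo_da": "AGIPD00",
    "module_number": 0,
    "virtual_device_name": "Q1M1",
    "physical_name": "AGIPD_SIV1_AGIPDV11_M001",
}
mmc = MultiModuleConstant(
    constants={"AGIPD00": const},   # `const` from the SingleConstant sketch above
    module_details=[mod],
    detector_name="SPB_DET_AGIPD1M-1",
    calibration_name="Offset",
)
# Aggregator name, module number, QM name and PDU name all resolve to the same constant.
assert mmc["AGIPD00"] is mmc[0] is mmc["Q1M1"] is mmc["AGIPD_SIV1_AGIPDV11_M001"]
```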
class CalibrationData(Mapping):
"""Collected constants for a given detector"""
def __init__(self, constant_groups, module_details, detector_name):
# {calibration: {karabo_da: SingleConstant}}
self.constant_groups = constant_groups
self.module_details = module_details
self.detector_name = detector_name
@staticmethod
def _format_cond(condition):
"""Encode operating condition to CalCat API format.
Args:
condition (dict): Mapping of parameter DB name to value
Returns:
(dict) Operating condition for use in CalCat API.
"""
return {
"parameters_conditions_attributes": [
{"parameter_name": k, "value": str(v)} for k, v in condition.items()
]
}
@classmethod
def from_condition(
cls,
condition: "ConditionsBase",
detector_name,
calibrations=None,
client=None,
event_at=None,
pdu_snapshot_at=None,
):
if calibrations is None:
calibrations = set(condition.calibration_types)
if pdu_snapshot_at is None:
pdu_snapshot_at = event_at
cal_types_by_params_used = {}
for cal_type, params in condition.calibration_types.items():
if cal_type in calibrations:
cal_types_by_params_used.setdefault(tuple(params), []).append(cal_type)
client = client or get_client()
detector_id = client.detector_by_identifier(detector_name)["id"]
pdus = client.get(
"physical_detector_units/get_all_by_detector",
{
"detector_id": detector_id,
"pdu_snapshot_at": client.format_time(pdu_snapshot_at),
},
)
module_details = sorted(pdus, key=lambda d: d["karabo_da"])
for mod in module_details:
if mod.get("module_number") is None:
mod["module_number"] = int(re.findall(r"\d+", mod["karabo_da"])[-1])
constant_groups = {}
for params, cal_types in cal_types_by_params_used.items():
condition_dict = condition.make_dict(params)
cal_id_map = {
client.calibration_by_name(name)["id"]: name for name in cal_types
}
calibration_ids = list(cal_id_map.keys())
query_res = client.get(
"calibration_constant_versions/get_by_detector_conditions",
{
"detector_identifier": detector_name,
"calibration_id": str(calibration_ids),
"karabo_da": "",
"event_at": client.format_time(event_at),
"pdu_snapshot_at": client.format_time(pdu_snapshot_at),
},
data=json.dumps(cls._format_cond(condition_dict)),
)
for ccv in query_res:
aggr = ccv["physical_detector_unit"]["karabo_da"]
cal_type = cal_id_map[ccv["calibration_constant"]["calibration_id"]]
const_group = constant_groups.setdefault(cal_type, {})
const_group[aggr] = SingleConstant.from_response(ccv)
return cls(constant_groups, module_details, detector_name)
@classmethod
def from_report(
cls,
report_id_or_path: Union[int, str],
client=None,
):
client = client or get_client()
# Use max page size, hopefully always enough for CCVs from 1 report
params = {"page_size": 500}
if isinstance(report_id_or_path, int):
params["report_id"] = report_id_or_path # Numeric ID
else:
params["report.file_path"] = str(report_id_or_path)
res = client.get("calibration_constant_versions", params)
constant_groups = {}
pdus = {} # keyed by karabo_da (e.g. 'AGIPD00')
det_ids = set() # Should only have one detector
for ccv in res:
pdu = ccv["physical_detector_unit"]
# We're only interested in the PDU mapping from the CCV start time
kda = pdu["karabo_da"] = pdu.pop("karabo_da_at_ccv_begin_at")
det_id = pdu["detector_id"] = pdu.pop("detector_id_at_ccv_begin_at")
pdu["virtual_device_name"] = pdu.pop("virtual_device_name_at_ccv_begin_at")
if pdu.get("module_number_at_ccv_begin_at") is not None:
pdu["module_number"] = pdu.pop("module_number_at_ccv_begin_at")
else:
pdu["module_number"] = int(re.findall(r"\d+", kda)[-1])
det_ids.add(det_id)
if kda in pdus:
if pdu["physical_name"] != pdus[kda]["physical_name"]:
raise Exception(
f"Mismatched PDU mapping from calibration report: {kda} is both"
f" {pdu['physical_name']} and {pdus[kda]['physical_name']}"
)
else:
pdus[kda] = pdu
cal_type = client.calibration_by_id(
ccv["calibration_constant"]["calibration_id"]
)["name"]
const_group = constant_groups.setdefault(cal_type, {})
const_group[kda] = SingleConstant.from_response(ccv)
if len(det_ids) > 1:
raise Exception(f"Found multiple detector IDs in report: {det_ids}")
# The "identifier", "name" & "karabo_name" fields seem to have the same names
det_name = client.detector_by_id(det_ids.pop())["identifier"]
module_details = sorted(pdus.values(), key=lambda d: d["karabo_da"])
return cls(constant_groups, module_details, det_name)
def __getitem__(self, key) -> MultiModuleConstant:
if isinstance(key, str):
return MultiModuleConstant(
self.constant_groups[key], self.module_details, self.detector_name, key
)
elif isinstance(key, tuple) and len(key) == 2:
cal_type, module = key
return self[cal_type][module]
else:
raise TypeError(f"Key should be string or 2-tuple (got {key!r})")
def __iter__(self):
return iter(self.constant_groups)
def __len__(self):
return len(self.constant_groups)
def __contains__(self, item):
return item in self.constant_groups
def __repr__(self):
return (
f"<CalibrationData: {', '.join(sorted(self.constant_groups))} "
f"constants for {len(self.module_details)} modules of {self.detector_name}>"
)
# These properties may include modules for which we have no constants -
# when created with .from_condition(), they represent all modules present in
# the detector (at the specified time).
@property
def module_nums(self):
return [m["module_number"] for m in self.module_details]
@property
def aggregator_names(self):
return [m["karabo_da"] for m in self.module_details]
@property
def qm_names(self):
return [m["virtual_device_name"] for m in self.module_details]
@property
def pdu_names(self):
return [m["physical_name"] for m in self.module_details]
def require_calibrations(self, calibrations):
"""Drop any modules missing the specified constant types"""
mods = set(self.aggregator_names)
for cal_type in calibrations:
mods.intersection_update(self[cal_type].constants)
return self.select_modules(aggregator_names=mods)
def select_modules(
self, module_nums=None, *, aggregator_names=None, qm_names=None
) -> "CalibrationData":
# Validate the specified modules against those we know about.
# Each specific constant type may have only a subset of these modules.
aggs = prepare_selection(
self.module_details, module_nums, aggregator_names, qm_names
)
constant_groups = {}
matched_aggregators = set()
for cal_type, const_group in self.constant_groups.items():
constant_groups[cal_type] = d = {
aggr: const for (aggr, const) in const_group.items() if aggr in aggs
}
matched_aggregators.update(d.keys())
module_details = [
m for m in self.module_details if m["karabo_da"] in matched_aggregators
]
return type(self)(constant_groups, module_details, self.detector_name)
def select_calibrations(self, calibrations) -> "CalibrationData":
const_groups = {c: self.constant_groups[c] for c in calibrations}
return type(self)(const_groups, self.module_details, self.detector_name)
def merge(self, *others: "CalibrationData") -> "CalibrationData":
det_names = set(cd.detector_name for cd in (self,) + others)
if len(det_names) > 1:
raise Exception(
"Cannot merge calibration data for different "
"detectors: " + ", ".join(sorted(det_names))
)
det_name = det_names.pop()
cal_types = set(self.constant_groups)
aggregators = set(self.aggregator_names)
pdus_d = {m["karabo_da"]: m for m in self.module_details}
for other in others:
cal_types.update(other.constant_groups)
aggregators.update(other.aggregator_names)
for md in other.module_details:
# Warn if constants don't refer to same modules
md_da = md["karabo_da"]
if md_da in pdus_d:
pdu_a = pdus_d[md_da]["physical_name"]
pdu_b = md["physical_name"]
if pdu_a != pdu_b:
warn(
f"Merging constants with different modules for "
f"{md_da}: {pdu_a!r} != {pdu_b!r}",
stacklevel=2,
)
else:
pdus_d[md_da] = md
module_details = sorted(pdus_d.values(), key=lambda d: d["karabo_da"])
constant_groups = {}
for cal_type in cal_types:
d = constant_groups[cal_type] = {}
for caldata in (self,) + others:
if cal_type in caldata:
d.update(caldata.constant_groups[cal_type])
return type(self)(constant_groups, module_details, det_name)
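A hedged sketch of how the selection and merging helpers above combine; the report IDs, calibration names and aggregator names are placeholders:

``` python
dark = CalibrationData.from_report(1234)   # hypothetical dark-run report ID
gain = CalibrationData.from_report(5678)   # hypothetical gain-characterisation report ID

caldata = dark.merge(gain)
# Keep only modules that have both Offset and Noise constants.
caldata = caldata.require_calibrations(["Offset", "Noise"])
# Narrow down to two modules by their aggregator names.
subset = caldata.select_modules(aggregator_names=["AGIPD00", "AGIPD01"])
print(sorted(subset), subset.aggregator_names)
```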
class ConditionsBase:
calibration_types = {} # For subclasses: {calibration: [parameter names]}
def make_dict(self, parameters) -> dict:
d = dict()
for db_name in parameters:
value = getattr(self, db_name.lower().replace(" ", "_"))
if value is not None:
d[db_name] = value
return d
@dataclass
class AGIPDConditions(ConditionsBase):
sensor_bias_voltage: float
memory_cells: int
acquisition_rate: float
gain_setting: Optional[int]
gain_mode: Optional[int]
source_energy: float
integration_time: int = 12
pixels_x: int = 512
pixels_y: int = 128
_gain_parameters = [
"Sensor Bias Voltage",
"Pixels X",
"Pixels Y",
"Memory cells",
"Acquisition rate",
"Gain setting",
"Integration time",
]
_other_dark_parameters = _gain_parameters + ["Gain mode"]
_illuminated_parameters = _gain_parameters + ["Source energy"]
calibration_types = {
"Offset": _other_dark_parameters,
"Noise": _other_dark_parameters,
"ThresholdsDark": _other_dark_parameters,
"BadPixelsDark": _other_dark_parameters,
"BadPixelsPC": _gain_parameters,
"SlopesPC": _gain_parameters,
"BadPixelsFF": _illuminated_parameters,
"SlopesFF": _illuminated_parameters,
}
def make_dict(self, parameters):
cond = super().make_dict(parameters)
# Fix-up some database quirks.
if int(cond.get("Gain mode", -1)) == 0:
del cond["Gain mode"]
if int(cond.get("Integration time", -1)) == 12:
del cond["Integration time"]
return cond
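The quirk fix-ups above are easiest to verify directly; the condition values below are placeholders:

``` python
cond = AGIPDConditions(
    sensor_bias_voltage=300.0,
    memory_cells=352,
    acquisition_rate=1.1,
    gain_setting=0,
    gain_mode=0,        # adaptive gain -> parameter is dropped from the query
    source_energy=9.2,  # integration_time keeps its default of 12 -> also dropped
)
d = cond.make_dict(AGIPDConditions._other_dark_parameters)
assert "Gain mode" not in d and "Integration time" not in d
print(d)
```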
@dataclass
class LPDConditions(ConditionsBase):
sensor_bias_voltage: float
memory_cells: int
memory_cell_order: Optional[str] = None
feedback_capacitor: float = 5.0
source_energy: float = 9.2
category: int = 1
pixels_x: int = 256
pixels_y: int = 256
_base_params = [
"Sensor Bias Voltage",
"Memory cells",
"Pixels X",
"Pixels Y",
"Feedback capacitor",
]
_dark_parameters = _base_params + [
"Memory cell order",
]
_illuminated_parameters = _base_params + ["Source Energy", "category"]
calibration_types = {
"Offset": _dark_parameters,
"Noise": _dark_parameters,
"BadPixelsDark": _dark_parameters,
"RelativeGain": _illuminated_parameters,
"GainAmpMap": _illuminated_parameters,
"FFMap": _illuminated_parameters,
"BadPixelsFF": _illuminated_parameters,
}
@dataclass
class DSSCConditions(ConditionsBase):
sensor_bias_voltage: float
memory_cells: int
pulse_id_checksum: Optional[float] = None
acquisition_rate: Optional[float] = None
target_gain: Optional[int] = None
encoded_gain: Optional[int] = None
pixels_x: int = 512
pixels_y: int = 128
_params = [
"Sensor Bias Voltage",
"Memory cells",
"Pixels X",
"Pixels Y",
"Pulse id checksum",
"Acquisition rate",
"Target gain",
"Encoded gain",
]
calibration_types = {
"Offset": _params,
"Noise": _params,
}
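Putting the pieces together, a hedged end-to-end sketch of the intended workflow; the detector name, condition values and timestamp are illustrative placeholders, and the query itself needs CalCat access plus the caldb_store:

``` python
cond = AGIPDConditions(
    sensor_bias_voltage=300.0, memory_cells=352, acquisition_rate=1.1,
    gain_setting=0, gain_mode=0, source_energy=9.2,
)
caldata = CalibrationData.from_condition(
    cond,
    "SPB_DET_AGIPD1M-1",                 # placeholder detector identifier
    calibrations=["Offset", "Noise"],
    event_at="2024-01-15T12:00:00",      # placeholder ISO timestamp for constant validity
)
offsets = caldata["Offset"].ndarray()    # stacked (n_modules, ...) array
noise_q1m1 = caldata["Noise", "Q1M1"]    # single module via a 2-tuple key
print(offsets.shape, noise_q1m1.ndarray().shape)
```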
......@@ -17,11 +17,16 @@ class epix100Ctrl():
self.ctrl_src = ctrl_src
self.instrument_src = instrument_src
def get_integration_time(self):
def get_integration_time(self) -> float:
"""Get Integration time for ePix100 from /CTRL/
Returns:
Integration time: integration time value.
"""
return self.run_dc[
self.ctrl_src, 'expTime.value'].as_single_value(reduce_by='first')
def get_temprature(self):
def get_temprature(self) -> float:
"""Get temperature value from CONTROL.
atol is degree variation tolerance.
"""
......@@ -43,4 +48,4 @@ class epix100Ctrl():
else:
return self.run_dc[
self.instrument_src.split(':daqOutput')[0], 'slowdata.backTemp.value'].as_single_value(
reduce_by='mean', atol=1)
\ No newline at end of file
reduce_by='mean', atol=1)
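A hedged usage sketch for the class above; the run path and source names are placeholders, and the constructor is assumed to take the run data collection plus the two source names, as the attributes used in these methods suggest:

``` python
from extra_data import RunDirectory

run_dc = RunDirectory("/gpfs/exfel/exp/MID/202301/p001234/raw/r0001")  # placeholder
ctrl = epix100Ctrl(
    run_dc=run_dc,                                            # assumed parameter name
    ctrl_src="MID_EXP_EPIX-1/DET/CONTROL",                    # placeholder source
    instrument_src="MID_EXP_EPIX-1/DET/RECEIVER:daqOutput",   # placeholder source
)
print("integration time:", ctrl.get_integration_time())
print("back temperature:", ctrl.get_temprature())
```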
from datetime import datetime
from datetime import datetime, timezone
from itertools import chain
from numbers import Integral
from pathlib import Path
......@@ -107,7 +107,7 @@ class DataFile(h5py.File):
filename_format = '{prefix}-R{run:04d}-{aggregator}-S{sequence:05d}.h5'
aggregator_pattern = re.compile(r'^[A-Z]{2,}\d{2}$')
instrument_source_pattern = re.compile(r'^[\w\/-]+:\w+$')
instrument_source_pattern = re.compile(r'^[\w\/-]+:[\w.]+$')
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
......@@ -311,10 +311,14 @@ class DataFile(h5py.File):
sequence = self.__sequence
if creation_date is None:
creation_date = datetime.now()
creation_date = datetime.fromtimestamp(0, tz=timezone.utc)
elif creation_date is True:
creation_date = datetime.now(timezone.utc)
if update_date is None:
update_date = creation_date
elif update_date is True:
update_date = datetime.now(timezone.utc)
md_group = self.require_group('METADATA')
md_group.create_dataset(
......
from logging import warning
from pathlib import Path
from typing import List, Union
import extra_data
......@@ -15,17 +19,21 @@ class Gotthard2Ctrl():
self.ctrl_src = ctrl_src
def get_bias_voltage(self):
"""Get bias voltage for gotthard2."""
return self.run_dc[self.ctrl_src, "highVoltageMax"].as_single_value()
def get_exposure_time(self):
"""Get exposure time for gotthard2."""
return round(
self.run_dc[self.ctrl_src, "exposureTime"].as_single_value(), 4)
def get_exposure_period(self):
"""Get exposure period for gotthard2."""
return round(
self.run_dc[self.ctrl_src, "exposurePeriod"].as_single_value(), 4)
def get_acquisition_rate(self):
"""Get acquisition rate for gotthard2."""
try:
return float(
self.run_dc.get_run_value(self.ctrl_src, "acquisitionRate"))
......@@ -33,6 +41,66 @@ class Gotthard2Ctrl():
pass
def get_single_photon(self):
"""Get single photon for gotthard2."""
if "singlePhoton.value" in self.run_dc.keys_for_source(self.ctrl_src):
return bool(
self.run_dc[self.ctrl_src, "singlePhoton"].as_single_value())
def get_det_type(self):
"""GH2 rxHostname is a vector of bytes objects.
GH2 25um has two host names, unlike 50um which has one.
Returns:
str: whether it is a 25um or a 50um detector.
"""
hostname = self.run_dc.get_run_value(self.ctrl_src, "rxHostname")
return "25um" if hostname[1].decode("utf-8") else "50um"
def second_module_reversed(self):
"""Check if reverseSlaveReadOutMode is True."""
return bool(
self.run_dc[
self.ctrl_src, "reverseSlaveReadOutMode"].as_single_value())
def sort_dark_runs_by_gain(
raw_folder: Union[str, Path],
runs: List[int],
ctrl_src: str,
):
"""Sort the three dark runs based /RUN/.../settings/.
The expected value options are [dynamicgain, fixgain1, fixgain2]
Args:
raw_folder (Union[str, Path]): The raw folder for the three runs to be sorted.
runs (list): A list of three dark runs.
ctrl_src: The CTRL source for checking `settings` dataset value.
Return:
List: Ordered list of runs.
"""
run_to_setting = dict()
expected_settings = ["dynamicgain", "fixgain1", "fixgain2"]
assert len(set(runs)) == 3, f"A list of {len(runs)} runs was provided, but three different dark runs are expected." # noqa
for r in runs:
run_to_setting[r] = extra_data.RunDirectory(
Path(raw_folder) / f"r{r:04d}").get_run_value(
ctrl_src, "settings")
if len(set(run_to_setting.values())) < 3:
raise ValueError(
f"Incorrect gain settings for the provided dark runs: {run_to_setting}."
f" The expected settings for these runs are {expected_settings}.")
sorted_run_to_setting = dict(sorted(
run_to_setting.items(),
key=lambda x: expected_settings.index(x[1])))
sorted_runs = list(sorted_run_to_setting.keys())
if list(run_to_setting.values()) != expected_settings:
warning(
"Dark runs were incorrectly sorted and "
f"have now been corrected to: {sorted_runs}.")
return sorted_runs
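The sorting step is the only non-I/O part of this helper, so it can be illustrated with an in-memory mapping of run number to gain setting (hypothetical values):

``` python
expected_settings = ["dynamicgain", "fixgain1", "fixgain2"]
run_to_setting = {12: "fixgain1", 11: "dynamicgain", 13: "fixgain2"}

sorted_runs = sorted(
    run_to_setting, key=lambda r: expected_settings.index(run_to_setting[r]))
print(sorted_runs)  # [11, 12, 13]
```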
import copy
from logging import warning
from pathlib import Path
from typing import List, Optional, Tuple
from warnings import warn
import h5py
import numpy as np
from extra_data import RunDirectory
from iCalibrationDB import Conditions, Constants, Detectors
from cal_tools.enums import BadPixels
......@@ -807,3 +810,41 @@ def make_cell_order_condition(use_param, cellid_pattern) -> Optional[str]:
use = (use_param == 'always')
return (",".join([str(c) for c in cellid_pattern]) + ",") if use else None
def sort_dark_runs_by_gain(raw_folder, runs, ctrl_src):
"""Check gain factors [100, 10, 1] to decide,
if provided dark runs are in the correct order.
Args:
raw_folder (Union[str, Path]): The raw data path.
runs (list): A list of 3 runs.
ctrl_src (str): The CTRL source to check `RUN/.../femAsicGain`.
Return:
(list): Sorted dark runs.
"""
run_to_gain = dict()
expected_gain_factors = [100, 10, 1]
assert len(set(runs)) == 3, f"A list of {len(runs)} runs was provided, but three different dark runs are expected." # noqa
for r in runs:
run_to_gain[r] = RunDirectory(
Path(raw_folder) / f"r{r:04d}").get_run_value(
ctrl_src, "femAsicGain")
if len(set(run_to_gain.values())) < 3:
raise ValueError(
f"Incorrect gain factors for the provided dark runs: {run_to_gain}."
f" The expected gain factors for these runs are {expected_gain_factors}.")
sorted_run_to_gain = dict(sorted(
run_to_gain.items(),
key=lambda x: expected_gain_factors.index(x[1])))
sorted_runs = list(sorted_run_to_gain.keys())
if list(run_to_gain.values()) != expected_gain_factors:
warning(
"Dark runs were incorrectly sorted and "
f"have now been corrected to: {sorted_runs}.")
return sorted_runs
......@@ -7,9 +7,9 @@ from extra_geom import (
AGIPD_1MGeometry,
AGIPD_500K2GGeometry,
DSSC_1MGeometry,
GenericGeometry,
LPD_1MGeometry,
JUNGFRAUGeometry,
)
from extra_geom import tests as eg_tests
from matplotlib import colors
......@@ -231,44 +231,54 @@ def create_constant_overview(constant, name, cells, vmin=None, vmax=None,
ax.set_ylim(vmin, vmax)
def show_processed_modules(dinstance: str, constants: Optional[Dict[str, Any]],
mnames: str, mode: str):
def show_processed_modules(
detector: str,
constants: Optional[Dict[str, Any]],
mnames: str,
mode: str
):
"""
Show the status of the processed modules.
Green: Processed. Gray: Not Processed. Red: No data available.
:param dinstance: The detector instance (e.g. AGIPD1M1 or LPD1M1)
:param constants: A dict of the plotted constants data.
{"const_name":constant_data}. Can be None in case of position mode.
:param mnames: A list of available module names.
:param mode: String selecting on of the two modes of operation.
"position": To just show the position of the processed modules.
"processed": To show the modules successfully processed.
:return
Args:
detector: The detector name (karabo_id) (e.g. MID_DET_AGIPD1M-1 or FXE_DET_LPD1M-1)
constants: A dict of the plotted constants data.
{"const_name":constant_data}. Can be None in case of position mode.
mnames: A list of available module names.
mode: String selecting one of the two modes of operation.
"position": To just show the position of the processed modules.
"processed": To show the modules successfully processed.
"""
# Create the geometry figure for each detector
if dinstance in ('AGIPD1M1', 'AGIPD1M2'):
if "AGIPD1M" in detector:
quadrants = 4
modules = 4
tiles = 8
quad_pos = [(-525, 625), (-550, -10), (520, -160), (542.5, 475)]
geom = AGIPD_1MGeometry.from_quad_positions(quad_pos)
elif dinstance == 'AGIPD500K':
elif "AGIPD500K" in detector:
quadrants = 2
modules = 4
tiles = 8
geom = AGIPD_500K2GGeometry.from_origin()
elif 'LPD' in dinstance:
elif "AGIPD65K" in detector:
quadrants = 1
modules = 1
tiles = 8
geom = agipd_single_module_geometry()
elif 'LPD' in detector:
quadrants = 4
modules = 4
tiles = 16
quad_pos = [(11.4, 299), (-11.5, 8), (254.5, -16), (278.5, 275)]
geom = LPD_1MGeometry.from_quad_positions(quad_pos)
elif 'DSSC' in dinstance:
elif 'DSSC' in detector:
quadrants = 4
modules = 4
tiles = 2
......@@ -279,7 +289,7 @@ def show_processed_modules(dinstance: str, constants: Optional[Dict[str, Any]],
quad_pos)
else:
raise ValueError(f'{dinstance} is not a real detector')
raise ValueError(f'{detector} detector is not available for plotting.')
# Create a dict that contains the range of tiles, in the figure,
# that belong to a module.
......@@ -462,3 +472,18 @@ def init_jungfrau_geom(
return expected_modules, JUNGFRAUGeometry.from_module_positions(
module_pos, orientations=orientations, asic_gap=6)
def agipd_single_module_geometry():
"""Initialize AGIPD single module detector geometry."""
# TODO: use the extra_geom implementation when ready. This geometry has no
# module name, and the tile labelling differs from other AGIPD detectors.
pixel_size = 2e-4
simple_config = {
'pixel_size': pixel_size,
'slow_pixels': 64,
'fast_pixels': 128,
'n_tiles_per_module': 8,
'corner_coordinates': [np.array([-256, -64, 0]) * pixel_size],
}
return GenericGeometry.from_simple_description(**simple_config)
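A small hedged sanity check of the geometry returned above, assuming extra_geom's usual inspect() helper and a working matplotlib installation:

``` python
import matplotlib
matplotlib.use("Agg")  # headless backend for batch use

geom = agipd_single_module_geometry()
ax = geom.inspect()                     # draws the single-module layout
ax.figure.savefig("agipd_single_module.png", dpi=150)
```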
......@@ -1044,3 +1044,24 @@ def reorder_axes(a, from_order, to_order):
from_order = list(from_order)
order = tuple([from_order.index(lbl) for lbl in to_order])
return a.transpose(order)
def raw_data_location_string(proposal: str, runs: List[int]):
"""Create RAW data location string to inject as a
metadata along with the calibration parameters.
Args:
proposal (string): The proposal number including the preceding `p`.
e.g. p900203
runs (list): A list of the run numbers
Returns:
str: The string for raw data_location.
e.g. `proposal:p900203 runs:9008 9009 9010`
"""
if not isinstance(proposal, str) or proposal[0] != "p":
raise ValueError(
"Invalid proposal format. The proposal should be a string with"
" a preceding 'p'. Example: 'p900203'")
return f"proposal:{proposal} runs:{' '.join(map(str, runs))}"
......@@ -11,7 +11,7 @@ import stat
import sys
import textwrap
import warnings
from datetime import datetime
from datetime import datetime, timezone
from pathlib import Path
from subprocess import DEVNULL, call, check_call, check_output
from typing import List, Union
......@@ -112,6 +112,11 @@ def balance_sequences(in_folder: str, run: int, sequences: List[int],
elif not isinstance(karabo_da, list):
raise TypeError("Balance sequences expects `karabo_da` as a string or list.")
# data-mapping for LPD mini and GH2 25um uses karabo-da names like
# LPDMINI00/2 or DA01/2 to identify individual modules. The /2 is not
# part of the file name
karabo_da = list({kda.split('/')[0] for kda in karabo_da})
in_path = Path(in_folder, f"r{run:04d}")
# TODO: remove ["-1"] after karabo_da refactor
......@@ -605,7 +610,7 @@ def run(argv=None):
# request_time is in local timezone
if args["request_time"] == "Now":
request_time = datetime.now()
request_time = datetime.now(tz=timezone.utc)
else:
request_time = datetime.fromisoformat(args["request_time"])
......@@ -814,7 +819,7 @@ def run(argv=None):
print("Calibration work directory (including Slurm .out files):")
print(" ", cal_work_dir)
submission_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
submission_time = datetime.now(tz=timezone.utc)
# Launch the calibration work
if args["no_cluster_job"]:
......@@ -840,8 +845,8 @@ def run(argv=None):
'author': author,
'report_to': report_to,
'in_folder': folder,
'request_time': request_time.strftime("%Y-%m-%dT%H:%M:%S"),
'submission_time': submission_time,
'request_time': request_time.isoformat(),
'submission_time': submission_time.isoformat(),
}
joblist.append(run_finalize(
......
import re
import sys
from datetime import datetime
from datetime import datetime, timezone
from importlib.machinery import SourceFileLoader
from os import chdir, listdir, path
from pathlib import Path
......@@ -145,7 +145,7 @@ def get_job_info(jobs: List[str], fmt: List[str]) -> List[List[str]]:
def make_timing_summary(cal_work_dir: Path, job_times: List[List[str]],
job_time_fmt: List[str], pipeline_times: Dict[str, str]):
job_time_fmt: List[str], pipeline_times: Dict[str, datetime]):
"""
Create an rst file with timing summary of executed notebooks
......@@ -160,6 +160,8 @@ def make_timing_summary(cal_work_dir: Path, job_times: List[List[str]],
Runtime summary
===============
All timestamps are shown in local (Hamburg) time.
.. math::
{% for line in time_table %}
{{ line }}
......@@ -176,10 +178,12 @@ def make_timing_summary(cal_work_dir: Path, job_times: List[List[str]],
''')
time_vals = [
["Time of Request", pipeline_times["request-time"]],
["Job submission", pipeline_times["submission-time"]],
["Report compilation", pipeline_times["report-compilation-time"]],
]
[title, pipeline_times[k].astimezone(None).strftime("%Y-%m-%d %H:%M:%S")]
for (title, k) in [
["Time of Request", "request-time"],
["Job submission", "submission-time"],
["Report compilation", "report-compilation-time"],
]]
with (cal_work_dir / "timing_summary.rst").open("w+") as fd:
time_table = tabulate.tabulate(time_vals, tablefmt="latex",
......@@ -384,9 +388,9 @@ def finalize(joblist, finaljob, cal_work_dir, out_path, version, title, author,
job_time_fmt = 'JobID,Start,End,Elapsed,Suspended,State'.split(',')
job_time_summary = get_job_info(joblist, job_time_fmt)
pipeline_time_summary = {
"request-time": request_time,
"submission-time": submission_time,
"report-compilation-time": datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
"request-time": datetime.fromisoformat(request_time),
"submission-time": datetime.fromisoformat(submission_time),
"report-compilation-time": datetime.now(timezone.utc),
}
make_timing_summary(cal_work_dir, job_time_summary, job_time_fmt, pipeline_time_summary)
metadata.update(
......
......@@ -14,6 +14,8 @@ notebooks = {
},
"PC": {
"notebook": "notebooks/AGIPD/Chracterize_AGIPD_Gain_PC_NBC.ipynb",
"dep_notebooks": [
"notebooks/AGIPD/Chracterize_AGIPD_Gain_PC_Summary.ipynb"],
"concurrency": {"parameter": "modules",
"default concurrency": 16,
"cluster cores": 32},
......@@ -214,6 +216,8 @@ notebooks = {
"DARK": {
"notebook":
"notebooks/Gotthard2/Characterize_Darks_Gotthard2_NBC.ipynb",
"dep_notebooks": [
"notebooks/Gotthard2/Summary_Darks_Gotthard2_NBC.ipynb"],
"concurrency": {"parameter": "karabo_da",
"default concurrency": list(range(2)),
"cluster cores": 4},
......
......@@ -18,6 +18,7 @@ from cal_tools.tools import (
get_pdu_from_db,
map_seq_files,
module_index_to_qm,
raw_data_location_string,
recursive_update,
send_to_db,
write_constants_fragment,
......@@ -567,3 +568,16 @@ def test_reorder_axes():
to_order = ('gain', 'fast_scan', 'slow_scan', 'cells')
assert reorder_axes(a, from_order, to_order).shape == (3, 256, 32, 10)
def test_raw_data_location_string():
raw_data_loc = raw_data_location_string("p900203", [9008, 9009, 9010])
assert raw_data_loc == "proposal:p900203 runs:9008 9009 9010"
def test_raise_raw_data_location_string():
with pytest.raises(ValueError):
raw_data_location_string(900203, [9008, 9009, 9010])
with pytest.raises(ValueError):
raw_data_location_string("900203", [9008, 9009, 9010])