Commit bc4c791e authored by Philipp Schmidt

Include PPT entry for each pulse in triggers table

parent 9d059b35
Merge request !744: [REMI] Include PPT entry for each pulse in triggers table
%% Cell type:code id: tags:

``` python
# Data selection parameters.
run = 104  # Run ID.
in_folder = '/gpfs/exfel/exp/SQS/202101/p002535/raw'  # Partial input path appended with run ID.
out_folder = '/gpfs/exfel/exp/SQS/202101/p002535/scratch/cal_test'  # Full path to output folder.
calib_config_path = '/gpfs/exfel/exp/SQS/202101/p002535/usr/config_board2+4.yaml'  # Path to correction and transform configuration.

# These parameters are required by xfel-calibrate but ignored in this notebook.
cycle = ''  # Proposal cycle, currently not used.
cal_db_timeout = 0  # Calibration DB timeout, currently not used.
cal_db_interface = 'foo'  # Calibration DB interface, currently not used.
karabo_da = 'bar'  # Karabo data aggregator name, currently not used.

# Output parameters.
karabo_id = 'SQS_REMI_DLD6'  # Karabo device ID root for virtual output device.
proposal = ''  # Proposal, leave empty for auto detection based on in_folder.
out_aggregator = 'REMI01'  # Aggregator name for output files.
out_seq_len = 5000  # Number of trains per sequence file in output.
det_device_id = '{karabo_id}/DET/{det_name}'  # Karabo device ID for virtual output device.
det_output_key = 'output'  # Pipeline name for fast data output.
save_raw_triggers = True  # Whether to save trigger positions in files.
save_raw_edges = True  # Whether to save digitized edge positions in files.
save_rec_signals = True  # Whether to save reconstructed signals (u1-w2, mcp) in files.
save_rec_hits = True  # Whether to save reconstructed hits (x, y, t, m) in files.
chunks_triggers = [500]  # HDF chunk size for triggers.
chunks_edges = [500, 7, 50]  # HDF chunk size for edges.
chunks_hits = [50, 50]  # HDF chunk size for hits.
chunks_signals = [50, 50]  # HDF chunk size for signals.
dataset_compression = 'gzip'  # HDF compression method.
dataset_compression_opts = 3  # HDF GZIP compression level.

# Trigger parameters.
ppt_source = 'SQS_RR_UTC/TSYS/TIMESERVER:outputBunchPattern'
ppl_offset = 0  # In units of the PPT.
laser_ppt_mask = -1  # Bit mask for used laser, negative to auto-detect from instrument.
instrument_sase = 3
first_pulse_offset = 1000
single_pulse_length = 25000

# Parallelization parameters.
mp_find_triggers = 0.5  # Parallelization for finding triggers.
mp_find_edges = 0.5  # Parallelization for digitizing analog signal.
mt_avg_trace = 2  # Parallelization for trace averaging.
mp_rec_hits = 1.0  # Parallelization for hit reconstruction.
```
%% Cell type:code id: tags:

``` python
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from threadpoolctl import threadpool_limits

import re
import h5py
from pathlib import Path
from datetime import datetime

import pasha as psh
from euxfel_bunch_pattern import indices_at_sase, indices_at_laser
from extra_data import RunDirectory

from extra_remi import Analysis, trigger_dt
from extra_remi.util import timing
from extra_remi.plots import plot_detector_diagnostics
from extra_remi.rd_resort import signal_dt, hit_dt
from extra_remi.files import DataFile, sequence_pulses

%matplotlib inline
```
%% Cell type:code id: tags:

``` python
calib_config_path = Path(calib_config_path)

if not calib_config_path.is_file():
    # If the path cannot be resolved right now, try the same path relative to in_folder.
    calib_config_path = Path(in_folder) / calib_config_path

    if not calib_config_path.is_file():
        # Disallow implicit config file creation.
        raise ValueError('calib_config_path not found - neither absolute nor relative to in_folder')

remi = Analysis(calib_config_path)

with timing('open_run'):
    dc = remi.prepare_dc(RunDirectory(Path(in_folder) / f'r{run:04d}', inc_suspect_trains=True))
```
%% Cell type:markdown id: tags:

# Transformation parameters

%% Cell type:code id: tags:

``` python
def print_leaf(leaf, indent=0):
    for key, value in leaf.items():
        if isinstance(value, dict):
            print(indent * 4 * ' ' + key)
            print_leaf(value, indent=indent+1)
        else:
            print(indent * 4 * ' ' + f'{key}: {value}')

print_leaf(remi.tree)
```
%% Cell type:markdown id: tags:

# Pulse and trigger information

%% Cell type:markdown id: tags:

### Count pulses per train and compute offsets

%% Cell type:code id: tags:

``` python
# Based on the pulse pattern tables, three global variables are obtained:
#
# * `pulse_counts [uint64: len(dc.train_ids)]` containing the number of pulses per train.
# * `pulse_offsets [uint64: len(dc.train_ids)]` containing the global offset for the first pulse of each train.
# * `num_pulses = pulse_counts.sum(axis=0)`

ppt_data = dc[ppt_source, 'data.bunchPatternTable']

def get_pulse_positions(ppt, sase, laser, ppl_offset):
    # Combine FEL and PPL positions.
    fel_pos = indices_at_sase(ppt, sase)

    if len(fel_pos) > 0:
        ppl_pos = indices_at_laser(ppt, laser) + fel_pos[0] + ppl_offset
    else:
        # Just take them as they are.
        ppl_pos = indices_at_laser(ppt, laser)

    return np.union1d(fel_pos, ppl_pos), fel_pos, ppl_pos

if laser_ppt_mask < 0:
    # If laser PPT mask is not specified, try to figure it out from device IDs.
    from euxfel_bunch_pattern import PPL_BITS

    instrument = karabo_id[:karabo_id.index('_')]

    try:
        laser_ppt_mask = PPL_BITS[f'LP_{instrument}']
    except KeyError:
        raise ValueError(f'Laser PPT mask unknown for instrument `{instrument}`')

with timing('pulse_info'):
    psh.set_default_context('processes', num_workers=remi.get_num_workers(mp_find_triggers))

    # Build the pulse index.
    pulse_counts = psh.alloc(shape=len(dc.train_ids), dtype=np.uint64)
    has_ppt = psh.alloc(shape=len(dc.train_ids), dtype=bool, fill=False)

    def count_pulses(wid, index, tid, ppt):
        pulse_counts[index] = len(get_pulse_positions(ppt, instrument_sase, laser_ppt_mask, ppl_offset)[0])
        has_ppt[index] = True

    psh.map(count_pulses, ppt_data)

    # Fill any missing values with the highest.
    pulse_counts[~has_ppt] = pulse_counts.max()

    # Compute offsets based on pulse counts.
    pulse_offsets = np.zeros_like(pulse_counts)
    np.cumsum(pulse_counts[:-1], out=pulse_offsets[1:])

    # Total number of pulses.
    num_pulses = int(pulse_counts.sum())
```
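%% Cell type:markdown id: tags:

As a minimal illustration of the indexing scheme (toy numbers only, not part of the pipeline): `pulse_offsets` turns a `(train, pulse)` pair into a row index of the flat per-pulse arrays used below.

%% Cell type:code id: tags:

``` python
# Sketch with toy data - pulse_counts/pulse_offsets above are built the same
# way from the actual pulse pattern table.
toy_counts = np.array([3, 2, 4], dtype=np.uint64)  # pulses per train
toy_offsets = np.zeros_like(toy_counts)
np.cumsum(toy_counts[:-1], out=toy_offsets[1:])  # [0, 3, 5]

train_idx, pulse_idx = 2, 1  # second pulse of the third train
global_idx = int(toy_offsets[train_idx]) + pulse_idx
print(toy_offsets, global_idx)  # [0 3 5] 6
```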
%% Cell type:code id: tags:

``` python
fig, ax = plt.subplots(num=1, ncols=1, nrows=1, figsize=(9, 4), clear=True)

ax.set_title('Pulse count')
ax.plot(dc.train_ids, pulse_counts, lw=1)
ax.set_xlabel('Train ID')
ax.set_ylabel('Number of pulses')
ax.set_ylim(0, max(300, pulse_counts.max() + 10))
ax.ticklabel_format(style='plain')
pass
```
%% Cell type:markdown id: tags:

### Find triggers

The trigger defines the boundary of a pulse on the digitizer trace, which is stored per train.

%% Cell type:code id: tags:

``` python
# A trigger defines the boundary of a pulse on the digitizer trace stored per train. This cell creates a
# global variable:
#
# * `triggers [(start: int32, stop: int32, offset: float64, pulse: int16, fel: bool, ppl: bool): num_pulses]`
#   containing the triggers for each pulse.
#
# This uses the pulse pattern table to locate the pulse positions on the trace. Only the number of pulses
# and their spacing can be derived this way, leaving the absolute offset for the very first pulse to be
# configured via `trigger/ppt/first_pulse_offset`. If a PPL is used, it will be included in the trigger
# pattern. The `ppl_offset` parameter allows taking into account an offset between PPL and FEL.

psh.set_default_context('processes', num_workers=remi.get_num_workers(mp_find_triggers))

triggers = psh.alloc(shape=(num_pulses,), dtype=trigger_dt, fill=(-1, -1, np.nan, -1, 0, 0))
clock_factor = remi['digitizer']['clock_factor']

def trigger_by_ppt(worker_id, index, train_id, ppt):
    all_pos, fel_pos, ppl_pos = get_pulse_positions(ppt, instrument_sase, laser_ppt_mask, ppl_offset)
    num_pulses = len(all_pos)

    if num_pulses == 0:
        return

    rel_pos = all_pos - all_pos[0]

    if num_pulses > 1:
        pulse_lengths = np.unique(rel_pos[1:] - rel_pos[:-1])

        if len(pulse_lengths) > 1:
            print('WARNING: Differing pulse lengths encountered, minimum is used!')

        pulse_len = pulse_lengths.min()
    elif num_pulses == 1:
        pulse_len = single_pulse_length

    start_frac = first_pulse_offset + rel_pos * 2 * clock_factor
    start_int = start_frac.astype(int)

    pulse_offset = pulse_offsets[index]
    pulse_count = pulse_counts[index]
    train_triggers = triggers[pulse_offset:pulse_offset+pulse_count]
    train_triggers['start'] = start_int
    train_triggers['stop'] = start_int + int(pulse_len * 2 * clock_factor) - 1
    train_triggers['offset'] = start_frac - start_int
    train_triggers['pulse'] = all_pos.astype(np.int16)
    train_triggers['fel'] = [pos in fel_pos for pos in all_pos]
    train_triggers['ppl'] = [pos in ppl_pos for pos in all_pos]

with timing('find_triggers'):
    psh.map(trigger_by_ppt, ppt_data)
```
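%% Cell type:markdown id: tags:

A toy sketch of the window arithmetic above: relative PPT positions are scaled by `2 * clock_factor` to digitizer samples and shifted by `first_pulse_offset`, with the fractional remainder kept as the subsample `offset`. The clock factor used below is an assumption for illustration only; the real value comes from the calibration configuration.

%% Cell type:code id: tags:

``` python
# Toy demonstration of the trigger window arithmetic, not run on actual data.
demo_clock_factor = 110.0  # assumption for illustration only
demo_rel_pos = np.array([0, 24, 48])  # PPT units relative to the first pulse

demo_start_frac = first_pulse_offset + demo_rel_pos * 2 * demo_clock_factor
demo_start_int = demo_start_frac.astype(int)
demo_pulse_len = np.unique(demo_rel_pos[1:] - demo_rel_pos[:-1]).min()
demo_stop_int = demo_start_int + int(demo_pulse_len * 2 * demo_clock_factor) - 1

print(demo_start_int, demo_stop_int, demo_start_frac - demo_start_int)
```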
%% Cell type:code id: tags:

``` python
fig, (lx, rx) = plt.subplots(num=2, ncols=2, nrows=1, figsize=(9, 4), clear=True,
                             gridspec_kw=dict(top=0.75))

# Display ~200 pulses, but at least 5 trains.
n_trains = max(abs(pulse_offsets - 200).argmin(), 5)

visible_triggers = triggers[:pulse_offsets[n_trains]]

pulse_index = np.arange(len(visible_triggers))
pumped = visible_triggers['fel'] & visible_triggers['ppl']
fel_only = visible_triggers['fel'] & ~pumped
ppl_only = visible_triggers['ppl'] & ~pumped

lx.plot(pulse_index[pumped], visible_triggers[pumped]['start'], ' .', ms=3, c='C0', label='FEL+PPL')
lx.plot(pulse_index[fel_only], visible_triggers[fel_only]['start'], '.', ms=3, c='C1', label='FEL-only')
lx.plot(pulse_index[ppl_only], visible_triggers[ppl_only]['start'], '.', ms=2, c='C2', label='PPL-only')
max_start = visible_triggers['start'].max()

lx.vlines(pulse_offsets[:n_trains], 0, max_start, color='grey', linewidth=1, alpha=0.2)
lx.tick_params(right=True)
lx.set_xlabel('Pulse index')
lx.set_xlim(-15, pulse_offsets[n_trains]+15)
lx.set_ylabel('Trigger position')
lx.set_ylim(-max_start // 20, max_start + max_start // 20)
lx.legend(fontsize='small', loc='lower right')

train_lx = lx.twiny()
train_lx.set_xlabel('Train ID', labelpad=8)
train_lx.set_xlim(lx.get_xlim())
train_lx.set_xticks(pulse_offsets[:n_trains])
train_lx.set_xticklabels([str(int(x)) for x in dc.train_ids[:n_trains]],
                         rotation=-45, fontsize='x-small')

rx.plot(triggers['start'], lw=0.2)
rx.set_xlabel('Pulse index')
rx.tick_params(left=False, labelleft=False, right=True, labelright=True)
pass
```
%% Cell type:markdown id: tags:

# Analog signal to digital edges

%% Cell type:markdown id: tags:

### Find edges in analog signal

%% Cell type:code id: tags:

``` python
psh.set_default_context('processes', num_workers=remi.get_num_workers(mp_find_edges))
threadpool_limits(limits=remi.get_num_workers(mt_avg_trace))

edges_by_det = {}
avg_traces_by_det = {}

for det_name, det in remi['detector'].items():
    det_sourcekeys = remi.get_detector_sourcekeys(det_name)
    det_get_traces = remi.get_traces_getter(det_name)
    trace_len = dc[next(iter(det_sourcekeys))].entry_shape[0]

    edges = psh.alloc(shape=(num_pulses, 7, det['max_hits']),
                      dtype=np.float64, fill=np.nan)
    avg_traces = psh.alloc_per_worker(shape=(7, trace_len), dtype=np.float64)

    def prepare_edge_worker(worker_id):
        correct_func = remi.get_baseline_corrector()
        discr_func, discr_params = remi.get_discriminator(det['channels'])

        source_name = remi['digitizer']['source']
        bl_start, bl_stop, _ = remi.get_baseline_limits(trace_len)
        bl_sym = remi['digitizer']['baseline_symmetry']
        time_cal = remi.get_time_calibration()

        traces_corr = np.empty((7, trace_len), dtype=np.float64)
        baselines = np.empty(bl_sym, dtype=np.float64)

        yield

    @psh.with_init(prepare_edge_worker)
    def find_edges(worker_id, index, train_id, data):
        try:
            data = det_get_traces(data[source_name])
        except KeyError:
            return

        for channel_idx in range(7):
            correct_func(data[channel_idx], traces_corr[channel_idx],
                         baselines, bl_start, bl_stop)

        avg_traces[worker_id] += traces_corr

        pulses_slice = np.s_[pulse_offsets[index]:pulse_offsets[index]+pulse_counts[index]]

        for trigger, pulse_edges in zip(triggers[pulses_slice], edges[pulses_slice]):
            trigger_slice = np.s_[trigger['start']:trigger['stop']]

            for trace, channel_params, channel_edges in zip(traces_corr, discr_params, pulse_edges):
                discr_func(trace[trigger_slice], channel_edges, **channel_params)

            pulse_edges += trigger['offset']
            pulse_edges *= time_cal

    with timing(f'find_edges, {det_name}'):
        psh.map(find_edges, dc.select(det_sourcekeys))

    edges_by_det[det_name] = edges
    avg_traces_by_det[det_name] = avg_traces.sum(axis=0) / len(dc.train_ids)

    with np.printoptions(precision=2, suppress=True):
        print(edges[:5, :, :8])
```
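%% Cell type:markdown id: tags:

Since `edges` is a NaN-padded array of shape `(num_pulses, 7, max_hits)`, simple channel statistics fall out of `np.isfinite`; a short sketch:

%% Cell type:code id: tags:

``` python
# Sketch: mean number of detected edges per pulse for each of the seven
# channels (u1, u2, v1, v2, w1, w2, mcp), per detector.
for det_name, edges in edges_by_det.items():
    edge_counts = np.isfinite(edges).sum(axis=2)  # (num_pulses, 7)
    print(det_name, edge_counts.mean(axis=0))
```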
%% Cell type:markdown id: tags:

### Global average of analog signals

%% Cell type:code id: tags:

``` python
for i, det_name in enumerate(remi['detector'].keys()):
    fig, axs = plt.subplots(num=10+i, nrows=7, figsize=(9.5, 8), clear=True,
                            gridspec_kw=dict(left=0.1, right=0.98, top=0.98, bottom=0.1))
    fig.text(0.02, 0.98, det_name.upper(), rotation=90, ha='left', va='top', size='x-large')

    for edge_idx, edge_name in enumerate(['u1', 'u2', 'v1', 'v2', 'w1', 'w2', 'mcp']):
        axs[edge_idx].plot(avg_traces_by_det[det_name][edge_idx], lw=1)
        axs[edge_idx].tick_params(labelbottom=False)
        axs[edge_idx].set_ylabel(edge_name)

    axs[-1].tick_params(labelbottom=True)
pass
```
%% Cell type:markdown id: tags:

### Sample for digitized traces

%% Cell type:code id: tags:

``` python
for i, det_name in enumerate(remi['detector'].keys()):
    edges = edges_by_det[det_name]

    fig = plt.figure(num=100+i, figsize=(9.5, 8))
    grid = fig.add_gridspec(ncols=2, nrows=4, left=0.1, right=0.98, top=0.98, bottom=0.1)
    fig.text(0.02, 0.98, det_name.upper(), rotation=90, ha='left', va='top', size='x-large')

    for signal_idx, signal_name in enumerate(['u1', 'u2', 'v1', 'v2', 'w1', 'w2', 'mcp']):
        row = (1 + signal_idx // 2) if signal_idx < 6 else 0
        col = (signal_idx % 2) if signal_idx < 6 else np.s_[:]
        ax = fig.add_subplot(grid[row, col])

        finite_edges = np.isfinite(edges[:, signal_idx, 0])
        if not finite_edges.any():
            continue

        pulse_idx = finite_edges.nonzero()[0][0]
        train_idx = (pulse_idx >= pulse_offsets).nonzero()[0][-1]
        trigger = triggers[pulse_idx]

        sourcekey = remi.get_channel_sourcekey(
            remi['detector'][det_name]['channels'][signal_idx])
        train_trace = dc[sourcekey].select_trains(np.s_[train_idx:train_idx+1]).ndarray()[0]
        corr_trace = np.zeros_like(train_trace, dtype=np.float64)
        remi.get_baseline_corrector()(
            train_trace, corr_trace,
            np.empty(remi['digitizer']['baseline_symmetry'], dtype=np.float64),
            *remi.get_baseline_limits(len(train_trace))[:2])

        pulse_trace = corr_trace[np.s_[trigger['start']:trigger['stop']]]
        x_time = remi.get_time_calibration() * (np.arange(len(pulse_trace)) + trigger['offset'])

        ax.plot(x_time, pulse_trace, lw=1)
        ax.set_xlim(x_time[0], x_time[-1])
        ax.set_ylim(-200, pulse_trace.max()*1.1)
        ax.text(x_time[-1], pulse_trace.max(),
                f'T{train_idx} P{pulse_idx - pulse_offsets[train_idx]} ',
                va='top', ha='right')
        ax.tick_params(labelbottom=False)
        ax.set_ylabel(signal_name)
        ax.vlines(edges[pulse_idx, signal_idx, :], *ax.get_ylim(), color='red', linewidth=1)
pass
```
%% Cell type:markdown id: tags:

### Digitized channel spectra

%% Cell type:code id: tags:

``` python
for i, det_name in enumerate(remi['detector'].keys()):
    fig = plt.figure(num=20+i, figsize=(9.5, 6))

    edges = edges_by_det[det_name]
    min_edge = edges[np.isfinite(edges)].min()
    max_edge = edges[np.isfinite(edges)].max()

    grid = fig.add_gridspec(ncols=3, nrows=3, left=0.08, right=0.98, top=0.95, hspace=0.4)
    fig.text(0.02, 0.98, det_name.upper(), rotation=90, ha='left', va='top', size='x-large')

    numx = fig.add_subplot(grid[0, 0])
    numx.set_title('Edges per pulse')

    agg_window = num_pulses // 60

    max_num_edges = 0.0
    max_spectral_intensity = 0
    hist_axs = []

    for edge_idx, edge_name in enumerate(['u1', 'u2', 'v1', 'v2', 'w1', 'w2', 'mcp']):
        num_edges = np.isfinite(edges[:, edge_idx, :]).sum(axis=1)
        num_edges = num_edges[:((len(num_edges) // agg_window) * agg_window)]
        num_edges = num_edges.reshape(-1, agg_window).mean(axis=1)

        if edge_idx < 6:
            plot_kwargs = dict(c=f'C{edge_idx}', ls='solid', lw=1.0)
        else:
            plot_kwargs = dict(c='k', ls='dashed', lw=1.0)

        numx.plot(np.arange(len(num_edges)) * agg_window, num_edges, label=edge_name, **plot_kwargs)
        max_num_edges = max(max_num_edges, num_edges.max())

        cur_edges = edges[:, edge_idx, :].flatten()

        if edge_idx < 6:
            row = 1 + edge_idx % 2
            col = edge_idx // 2
        else:
            row = 0
            col = np.s_[1:3]

        ax = fig.add_subplot(grid[row, col])
        ax.set_title(f'TOF spectrum: {edge_name}')
        y, _, _ = ax.hist(cur_edges[np.isfinite(cur_edges)], bins=int((max_edge - min_edge) // 5),
                          range=(min_edge, max_edge), color=plot_kwargs['c'], histtype='step', linewidth=1)
        hist_axs.append(ax)

        max_spectral_intensity = max(max_spectral_intensity, y.max())

    numx.tick_params(labelbottom=False)
    numx.set_ylim(0, 1.2*max_num_edges)

    for ax in hist_axs:
        ax.set_ylim(0, max_spectral_intensity*1.1)
        ax.ticklabel_format(axis='y', style='sci', scilimits=(0, 3))
pass
```
%% Cell type:markdown id: tags:

# Detector diagnostics

%% Cell type:code id: tags:

``` python
for i, det_name in enumerate(remi['detector'].keys()):
    edges = edges_by_det[det_name]
    sort = remi.get_dld_sorter(det_name)

    sum_shifts = sort.sum_shifts if sort.sum_shifts != (0.0, 0.0, 0.0) else None

    is_valid = remi.get_presort_mask(edges, edge_idx=0,
                                     sum_limit=max(sort.uncorrected_time_sum_half_widths),
                                     sum_shifts=sum_shifts)
    signals, sums = remi.get_signals_and_sums(edges, indices=sort.channel_indices, sum_shifts=sum_shifts,
                                              mask=is_valid)
    fig = plot_detector_diagnostics(signals=signals, sums=sums, fig_num=30+i, im_scale=1.5,
                                    sum_range=max(sort.uncorrected_time_sum_half_widths),
                                    sorter=sort)
    fig.text(0.02, 0.98, det_name.upper() + ' before corrections', rotation=90, ha='left', va='top', size='x-large')

    if remi['detector'][det_name]['use_sum_correction'] or remi['detector'][det_name]['use_pos_correction']:
        n_masked = is_valid.sum()

        signals = np.full((n_masked, 3), np.nan, dtype=np.float64)
        sums = np.full((n_masked, 3), np.nan, dtype=np.float64)

        sort.correct(edges[is_valid], signals, sums)

        fig = plot_detector_diagnostics(signals=signals, sums=sums, fig_num=40+i, im_scale=1.5,
                                        sum_range=max(sort.uncorrected_time_sum_half_widths),
                                        sorter=sort)
        fig.text(0.02, 0.98, det_name.upper() + ' after corrections', rotation=90, ha='left', va='top', size='x-large')
pass
```
%% Cell type:markdown id: tags:

# Hit reconstruction

%% Cell type:code id: tags:

``` python
psh.set_default_context('processes', num_workers=remi.get_num_workers(mp_rec_hits))

signals_by_det = {}
hits_by_det = {}
hit_counts_by_det = {}

for det_name, det in remi['detector'].items():
    edges = edges_by_det[det_name]

    signals = psh.alloc(shape=(num_pulses, 50), dtype=signal_dt, fill=np.nan)
    hits = psh.alloc(shape=(num_pulses, 50), dtype=hit_dt, fill=(np.nan, np.nan, np.nan, -1))
    hit_counts = psh.alloc(shape=len(dc.train_ids), dtype=np.uint32)

    def prepare_hit_worker(worker_id):
        sort = remi.get_dld_sorter(det_name)
        yield

    @psh.with_init(prepare_hit_worker)
    def reconstruct_hits(worker_id, index, train_id):
        hit_counts[index] += sort.run_on_train(
            edges, signals, hits, pulse_offsets[index], pulse_offsets[index] + pulse_counts[index])

    with timing(f'rec_hits, {det_name}'):
        psh.map(reconstruct_hits, dc.train_ids)

    signals_by_det[det_name] = signals
    hits_by_det[det_name] = hits
    hit_counts_by_det[det_name] = hit_counts
```
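%% Cell type:markdown id: tags:

A quick sketch summarizing the counters filled above, total reconstructed hits per detector:

%% Cell type:code id: tags:

``` python
# Sketch: total number of reconstructed hits per detector plus train average.
for det_name, hit_counts in hit_counts_by_det.items():
    print(f'{det_name}: {int(hit_counts.sum())} hits '
          f'({hit_counts.mean():.1f} per train)')
```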
%% Cell type:code id: tags:

``` python
fig, ax = plt.subplots(num=50+i, figsize=(9.5, 4), ncols=1, clear=True,
                       gridspec_kw=dict(top=0.92, right=0.98, left=0.05, bottom=0.12))

max_num_hits = 0.0

for det_name in remi['detector'].keys():
    agg_window = num_pulses // 1000

    num_hits = np.isfinite(hits_by_det[det_name]['x']).sum(axis=1)
    num_hits = num_hits[:(len(num_hits) // agg_window) * agg_window]
    num_hits = num_hits.reshape(-1, agg_window).mean(axis=1)
    max_num_hits = max(max_num_hits, num_hits.max())

    ax.plot(np.arange(0, (num_pulses // agg_window) * agg_window, agg_window), num_hits,
            lw=1, label=det_name.upper())

ax.set_title('Hits per pulse')
ax.set_xlabel('Pulse index')
ax.set_ylim(0, max_num_hits*1.1)
ax.legend()
pass
```
%% Cell type:markdown id: tags:

### Reconstruction methods

Each hit may be reconstructed by one of 20 different methods. These differ by the number of real signals across the channels, which could be combined to form the hit. Each of these methods is designated by a number between `0` and `19` (with empty hits using `-1`), which can be found in the `m` key of a hit, e.g.:

* `0`: All six anode signals and the corresponding MCP signal were found.
* `4`: One signal on layer `u` is missing, all other signals for this event were found.
* `18`: Only one anode signal on each layer was found and the MCP signal is missing. There is no way to check whether this combination of signals is actually valid.

| Method | `u+v+w +mcp` |
| - | - |
| 0 | `2+2+2 +1` |
| 1 | `0+2+2 +1` |
| 2 | `2+0+2 +1` |
| 3 | `2+2+0 +1` |
| 4 | `1+2+2 +1` (2 permutations) |
| 5 | `2+1+2 +1` (2 permutations) |
| 6 | `2+2+1 +1` (2 permutations) |
| 7 | `2+2+2 +0` |
| 8 | `0+2+2 +0` |
| 9 | `2+0+2 +0` |
| 10 | `2+2+0 +0` |
| 11 | `1+2+2 +0` (2 permutations) |
| 12 | `2+1+2 +0` (2 permutations) |
| 13 | `2+2+1 +0` (2 permutations) |
| 14 | `2+1+1 +1` `1+2+1 +1` `1+1+2 +1` (12 permutations) |
| 15 | `2+1+0 +1` `2+0+1 +1` `1+2+0 +1` `1+0+2 +1` `0+2+1 +1` `0+1+2 +1` (12 permutations) |
| 16 | `1+1+1 +1` (8 permutations) |
| 17 | `2+1+1 +0` `1+2+1 +0` `1+1+2 +0` (12 permutations) |
| 18 | `1+1+1 +0` (8 permutations) |
| 19 | `2+1+0 +0` `2+0+1 +0` `1+2+0 +0` `1+0+2 +0` `0+1+2 +0` `0+2+1 +0` (12 permutations) |

* For hits reconstructed with method `> 10`, extra attention should be given to ensure they add meaningful signal.
* Any method `> 14` has to be considered risky, because neither a time sum nor the position can be checked. If the scale factors and/or `w` shift are not correct, then the number of events reconstructed with the risky methods will increase. They will most likely be *ghost hits*, which do not correspond to actual impacts on the detector. These hits can be flagged directly, as shown in the sketch below.
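%% Cell type:markdown id: tags:

A small sketch of flagging risky hits directly on the `m` field, using the same `m > 14` criterion as the shaded region in the plots below:

%% Cell type:code id: tags:

``` python
# Sketch: fraction of hits reconstructed by risky methods (m > 14).
for det_name, hits in hits_by_det.items():
    m = hits['m']
    num_valid = (m >= 0).sum()
    num_risky = (m >= 15).sum()
    print(f'{det_name}: {100 * num_risky / max(num_valid, 1):.2f}% risky hits')
```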
%% Cell type:code id: tags:

``` python
for i, det_name in enumerate(remi['detector'].keys()):
    hits = hits_by_det[det_name]

    fig, ax = plt.subplots(num=60+i, figsize=(9.5, 5), ncols=1, clear=True,
                           gridspec_kw=dict(left=0.08, right=0.91, top=0.8))
    fig.text(0.02, 0.98, det_name.upper(), rotation=90, ha='left', va='top', size='x-large')

    method_bins = np.bincount(hits['m'][hits['m'] >= 0], minlength=20)
    ax.bar(np.arange(20), method_bins, width=0.5)

    ax.set_xlabel('Reconstruction method')
    ax.set_xlim(-0.5, 19.5)
    ax.set_xticks(np.arange(20))
    ax.set_ylabel('Number of hits')
    ax.set_ylim(0, method_bins.max()*1.05)
    ylims = ax.get_ylim()
    ax.tick_params(which='both', right=True, labelright=True)

    num_risky = method_bins[15:].sum()
    num_total = method_bins.sum()

    ax.text(14.2, method_bins.max(), f'{(100*(num_total-num_risky)/num_total):.2g}%',
            va='top', ha='right', color='black')
    ax.text(14.8, method_bins.max(), f'{(100*num_risky/num_total):.2g}%',
            va='top', ha='left', color='red')
    ax.fill([14.5, 19.5, 19.5, 14.5], [ylims[0], ylims[0], ylims[1], ylims[1]], c='r', alpha=0.2)

    labelx = ax.twiny()
    labelx.set_xlim(*ax.get_xlim())
    labelx.set_xticks(ax.get_xticks())
    labelx.set_xticklabels([
        '2+2+2 +1',
        '0+2+2 +1', '2+0+2 +1', '2+2+0 +1',
        '1+2+2 +1', '2+1+2 +1', '2+2+1 +1',
        '2+2+2 +0',
        '0+2+2 +0', '2+0+2 +0', '2+2+0 +0', '1+2+2 +0', '2+1+2 +0', '2+2+1 +0',
        '2+1+1 +1',
        '2+1+0 +1',
        '1+1+1 +1',
        '2+1+1 +0',
        '1+1+1 +0',
        '2+1+0 +0',
    ], rotation=90)

    min_rel_tick = np.ceil((ax.get_ylim()[0] / num_total) / 0.1) * 0.1
    max_rel_tick = np.floor((method_bins.max() / num_total) / 0.1) * 0.1

    rely = ax.twinx()
    rely.set_ylim(*ax.get_ylim())
    rely.set_yticks(np.arange(0.0, max_rel_tick+0.01, 0.1)*num_total)
    rely.set_yticks(np.arange(0.0, ylims[1]/num_total, 0.02)*num_total, minor=True)
    rely.set_yticklabels([f'{(y/num_total)*100:.0f}%' for y in rely.get_yticks()])
    rely.set_ylabel('Percentage of total hits')
pass
```
%% Cell type:markdown id: tags:

### Detector image and fishes

%% Cell type:code id: tags:

``` python
for i, det_name in enumerate(remi['detector'].keys()):
    flat_hits = hits_by_det[det_name].reshape(-1)
    flat_hits = flat_hits[np.isfinite(flat_hits[:]['x'])]
    flat_hits = flat_hits[flat_hits['m'] < 10]

    fig = plt.figure(num=70+i, figsize=(9, 13.5))

    fig.text(0.02, 0.98, det_name.upper(), rotation=90, ha='left', va='top', size='x-large')
    fig.text(0.02, 0.02, det_name.upper(), rotation=90, ha='left', va='bottom', size='x-large')

    imp = fig.add_axes([0.1 + 0.25/2, 0.56, 0.6, 0.4])
    txp = fig.add_axes([0.1, 0.28, 0.85, 0.22])
    typ = fig.add_axes([0.1, 0.04, 0.85, 0.22])

    im_radius = remi['detector'][det_name]['mcp_radius']*1.1

    min_tof = flat_hits['t'].min()
    max_tof = flat_hits['t'].max()

    imp.hist2d(flat_hits['x'], flat_hits['y'], bins=(256, 256),
               range=[[-im_radius, im_radius], [-im_radius, im_radius]], norm=LogNorm())
    imp.xaxis.set_label_position('top')
    imp.set_xlabel('X / mm')
    imp.set_ylabel('Y / mm')
    imp.tick_params(right=True, labelright=True, top=True, labeltop=True)
    imp.grid()

    for ax, dim_label in zip([txp, typ], ['x', 'y']):
        ax.hist2d(flat_hits['t'], flat_hits[dim_label], bins=(int((max_tof - min_tof) // 5), 256),
                  range=[[min_tof, max_tof], [-im_radius, im_radius]], norm=LogNorm())
        ax.set_ylabel(f'{dim_label.upper()} / mm')

    typ.set_xlabel('Time-of-flight / ns')
    txp.tick_params(bottom=True, labelbottom=False, top=True, labeltop=True, right=True, labelright=True)
    typ.tick_params(right=True, labelright=True, top=True)
pass
```
%% Cell type:markdown id: tags:

# Transformed data files

%% Cell type:code id: tags:

``` python
seq_len = out_seq_len if out_seq_len > 0 else len(dc.files[0].train_ids)
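# Collect the dataset_compression* parameters above into h5py dataset keyword
# arguments by stripping the 'dataset_' prefix (8 characters), i.e.
# 'compression' and 'compression_opts'.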
dataset_kwargs = {k[8:]: v for k, v in locals().items() if k.startswith('dataset_compression')}

Path(out_folder).mkdir(parents=True, exist_ok=True)

print('Writing sequence files', flush=True, end='')

t_write = timing('write_files')
t_write.__enter__()

for seq_id, train_mask, pulse_mask in sequence_pulses(dc.train_ids, pulse_counts, pulse_offsets, seq_len):
    seq_train_ids = dc.train_ids[train_mask]

    with DataFile.from_details(out_folder, out_aggregator, run, seq_id) as outp:
        outp.create_index(seq_train_ids)

        for det_name in remi['detector']:
            cur_device_id = det_device_id.format(karabo_id=karabo_id, det_name=det_name.upper())

            cur_control_data = outp.create_control_source(cur_device_id)
            # Manually manipulate the file here, still creates the index properly.
            remi.attach_detector_config(det_name, cur_control_data.get_run_group())
            cur_control_data.create_index(len(seq_train_ids))

            cur_fast_data = outp.create_instrument_source(f'{cur_device_id}:{det_output_key}')

            if save_raw_triggers:
                cur_fast_data.create_key('raw.triggers', triggers[pulse_mask],
                                         chunks=tuple(chunks_triggers), **dataset_kwargs)
            if save_raw_edges:
                cur_fast_data.create_key('raw.edges', edges_by_det[det_name][pulse_mask],
                                         chunks=tuple(chunks_edges), **dataset_kwargs)
            if save_rec_signals:
                cur_fast_data.create_key('rec.signals', signals_by_det[det_name][pulse_mask],
                                         chunks=tuple(chunks_signals), **dataset_kwargs)
            if save_rec_hits:
                cur_fast_data.create_key('rec.hits', hits_by_det[det_name][pulse_mask],
                                         chunks=tuple(chunks_hits), **dataset_kwargs)

            cur_fast_data.create_index(raw=pulse_counts[train_mask], rec=pulse_counts[train_mask])

        outp.create_metadata(like=dc)

    print('.', flush=True, end='')
print('')

t_write.__exit__()
```
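%% Cell type:markdown id: tags:

As a quick sanity check, the sequence files written above can be read back with `extra_data` like any other run directory; a hedged sketch, assuming `save_rec_hits` was enabled:

%% Cell type:code id: tags:

``` python
# Sketch: open the transformed files and inspect one key per detector.
out_dc = RunDirectory(out_folder)

for det_name in remi['detector']:
    src = det_device_id.format(karabo_id=karabo_id, det_name=det_name.upper()) + ':' + det_output_key
    print(src, out_dc[src, 'rec.hits'].shape)
```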