diff --git a/DEPENDS b/DEPENDS
new file mode 100644
index 0000000000000000000000000000000000000000..adead8b96d81197f033bdccf55a7921887c14b23
--- /dev/null
+++ b/DEPENDS
@@ -0,0 +1,4 @@
+TrainMatcher, 1.2.0-2.10.2
+PipeToZeroMQ, 3.2.6-2.11.0
+calngDeps, 0.0.3-2.10.0
+calibrationClient, 9.0.6
diff --git a/README.md b/README.md
index 4a09a144da1115a598849e46e4c73125e8112c2d..5933bd80822722eb9565d323d09ca6bda7bf3ab0 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,35 @@
 # calng
 
 calng is a collection of Karabo devices to perform online processing of 2D X-ray detector data at runtime. It is the successor of the calPy package.
+
+# CalCat secrets and deployment
+Correction devices each run their own `calibration_client.CalibrationClient`, so they need CalCat credentials.
+They expect to load these from a JSON file; by default `$KARABO/var/data/calibration-client-secrets.json` (`var/data` is the CWD of Karabo devices).
+The file should look something like:
+
+```json
+{
+	"base_url": "https://in.xfel.eu/test_calibration",
+	"client_id": "[sort of secret]",
+	"client_secret": "[actual secret]",
+	"user_email": "[eh, not that secret]",
+	"caldb_store_path": "/gpfs/exfel/d/cal/caldb_store"
+}
+```
+
+For deployment, use `/calibration` instead of `/test_calibration`; as seen from the ONC, the caldb store is at `/common/cal/caldb_store`.
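+
+To check that the file is in place and parses, something like the following can be used (a minimal sketch; it assumes `$KARABO` is set in the environment and only checks for the keys shown above):
+
+```python
+import json
+import os
+from pathlib import Path
+
+secrets_path = Path(os.environ["KARABO"]) / "var" / "data" / "calibration-client-secrets.json"
+secrets = json.loads(secrets_path.read_text())
+# keys the correction devices will expect to find
+missing = {"base_url", "client_id", "client_secret", "user_email", "caldb_store_path"} - secrets.keys()
+if missing:
+    raise SystemExit(f"calibration-client-secrets.json is missing keys: {missing}")
+```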
diff --git a/setup.py b/setup.py
index 85f6b2ada7ee2cb26c065409707b0bc22cf94e4e..31a693e453590a0797e9e86020bd14e063034c53 100644
--- a/setup.py
+++ b/setup.py
@@ -24,15 +24,21 @@ setup(name='calng',
       packages=find_packages('src'),
       entry_points={
           'karabo.bound_device': [
+              'AgipdCorrection = calng.AgipdCorrection:AgipdCorrection',
               'DsscCorrection = calng.DsscCorrection:DsscCorrection',
+              'JungfrauCorrection = calng.JungfrauCorrection:JungfrauCorrection',
               'ModuleStacker = calng.ModuleStacker:ModuleStacker',
+              'ManualAgipdGeometry = calng.ManualAgipdGeometry:ManualAgipdGeometry',
+              'ManualDsscGeometry = calng.ManualDsscGeometry:ManualDsscGeometry',
+              'ManualJungfrauGeometry = calng.ManualJungfrauGeometry:ManualJungfrauGeometry',
               'ShmemToZMQ = calng.ShmemToZMQ:ShmemToZMQ',
+              'SimpleAssembler = calng.SimpleAssembler:SimpleAssembler',
           ],
 
           'karabo.middlelayer_device': [
               'CalibrationManager = calng.CalibrationManager:CalibrationManager'
           ],
       },
-      package_data={'': ['*.cpp']},
+      package_data={'': ['kernels/*']},
       requires=[],
 )
diff --git a/src/calng/AgipdCorrection.py b/src/calng/AgipdCorrection.py
new file mode 100644
index 0000000000000000000000000000000000000000..36edc80d124a71e09aeaab51454b656bc42888bc
--- /dev/null
+++ b/src/calng/AgipdCorrection.py
@@ -0,0 +1,838 @@
+import enum
+
+import cupy
+import numpy as np
+from karabo.bound import (
+    BOOL_ELEMENT,
+    DOUBLE_ELEMENT,
+    FLOAT_ELEMENT,
+    KARABO_CLASSINFO,
+    NODE_ELEMENT,
+    OUTPUT_CHANNEL,
+    OVERWRITE_ELEMENT,
+    STRING_ELEMENT,
+    VECTOR_STRING_ELEMENT,
+    State,
+)
+
+from . import base_gpu, calcat_utils, utils
+from ._version import version as deviceVersion
+from .base_correction import BaseCorrection, add_correction_step_schema, preview_schema
+
+
+class AgipdConstants(enum.Enum):
+    SlopesFF = enum.auto()
+    ThresholdsDark = enum.auto()
+    Offset = enum.auto()
+    SlopesPC = enum.auto()
+    BadPixelsDark = enum.auto()
+    BadPixelsPC = enum.auto()
+    BadPixelsFF = enum.auto()
+
+
+# from pycalibration's enum.py
+class AgipdGainMode(enum.IntEnum):
+    ADAPTIVE_GAIN = 0
+    FIXED_HIGH_GAIN = 1
+    FIXED_MEDIUM_GAIN = 2
+    FIXED_LOW_GAIN = 3
+
+
+class BadPixelValues(enum.IntFlag):
+    """The European XFEL Bad Pixel Encoding
+
+    Straight from pycalibration's enum.py"""
+
+    OFFSET_OUT_OF_THRESHOLD = 2 ** 0
+    NOISE_OUT_OF_THRESHOLD = 2 ** 1
+    OFFSET_NOISE_EVAL_ERROR = 2 ** 2
+    NO_DARK_DATA = 2 ** 3
+    CI_GAIN_OUT_OF_THRESHOLD = 2 ** 4
+    CI_LINEAR_DEVIATION = 2 ** 5
+    CI_EVAL_ERROR = 2 ** 6
+    FF_GAIN_EVAL_ERROR = 2 ** 7
+    FF_GAIN_DEVIATION = 2 ** 8
+    FF_NO_ENTRIES = 2 ** 9
+    CI2_EVAL_ERROR = 2 ** 10
+    VALUE_IS_NAN = 2 ** 11
+    VALUE_OUT_OF_RANGE = 2 ** 12
+    GAIN_THRESHOLDING_ERROR = 2 ** 13
+    DATA_STD_IS_ZERO = 2 ** 14
+    ASIC_STD_BELOW_NOISE = 2 ** 15
+    INTERPOLATED = 2 ** 16
+    NOISY_ADC = 2 ** 17
+    OVERSCAN = 2 ** 18
+    NON_SENSITIVE = 2 ** 19
+    NON_LIN_RESPONSE_REGION = 2 ** 20
+
+
+class CorrectionFlags(enum.IntFlag):
+    NONE = 0
+    THRESHOLD = 1
+    OFFSET = 2
+    BLSHIFT = 4
+    REL_GAIN_PC = 8
+    GAIN_XRAY = 16
+    BPMASK = 32
+
+
+class AgipdGpuRunner(base_gpu.BaseGpuRunner):
+    _kernel_source_filename = "agipd_gpu.cu"
+    _corrected_axis_order = "cxy"
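+    # "cxy": memory cell, x, y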
+
+    def __init__(
+        self,
+        pixels_x,
+        pixels_y,
+        memory_cells,
+        constant_memory_cells,
+        input_data_dtype=cupy.uint16,
+        output_data_dtype=cupy.float32,
+        bad_pixel_mask_value=cupy.nan,
+        gain_mode=AgipdGainMode.ADAPTIVE_GAIN,
+        g_gain_value=1,
+    ):
+        self.gain_mode = gain_mode
+        # default gain only matters when not thresholding (missing constant or fixed)
+        # note: gain stage (result of thresholding) is 0, 1, or 2
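+        # ADAPTIVE_GAIN (0) maps to stage 0; fixed modes 1/2/3 map to stages 0/1/2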
+        if self.gain_mode is AgipdGainMode.ADAPTIVE_GAIN:
+            self.default_gain = cupy.uint8(gain_mode)
+        else:
+            self.default_gain = cupy.uint8(gain_mode - 1)
+        self.input_shape = (memory_cells, 2, pixels_x, pixels_y)
+        self.processed_shape = (memory_cells, pixels_x, pixels_y)
+        super().__init__(
+            pixels_x,
+            pixels_y,
+            memory_cells,
+            constant_memory_cells,
+            input_data_dtype,
+            output_data_dtype,
+        )
+        self.gain_map_gpu = cupy.empty(self.processed_shape, dtype=cupy.float32)
+        self.preview_buffer_getters.extend(
+            [self._get_raw_gain_for_preview, self._get_gain_map_for_preview]
+        )
+
+        self.map_shape = (self.constant_memory_cells, self.pixels_x, self.pixels_y)
+        self.gm_map_shape = self.map_shape + (3,)  # for gain-mapped constants
+        self.threshold_map_shape = self.map_shape + (2,)
+        # constants
+        self.gain_thresholds_gpu = cupy.empty(
+            self.threshold_map_shape, dtype=cupy.float32
+        )
+        self.offset_map_gpu = cupy.zeros(self.gm_map_shape, dtype=cupy.float32)
+        self.rel_gain_pc_map_gpu = cupy.ones(self.gm_map_shape, dtype=cupy.float32)
+        # not gm_map_shape because it only applies to medium gain pixels
+        self.md_additional_offset_gpu = cupy.zeros(self.map_shape, dtype=cupy.float32)
+        self.rel_gain_xray_map_gpu = cupy.ones(self.map_shape, dtype=cupy.float32)
+        self.bad_pixel_map_gpu = cupy.zeros(self.gm_map_shape, dtype=cupy.uint32)
+        self.set_bad_pixel_mask_value(bad_pixel_mask_value)
+        self.set_g_gain_value(g_gain_value)
+
+        self.update_block_size((1, 1, 64))
+
+    def _get_raw_for_preview(self):
+        return self.input_data_gpu[:, 0]
+
+    def _get_corrected_for_preview(self):
+        return self.processed_data_gpu
+
+    # special to AGIPD
+    def _get_raw_gain_for_preview(self):
+        return self.input_data_gpu[:, 1]
+
+    def _get_gain_map_for_preview(self):
+        return self.gain_map_gpu
+
+    def load_thresholds(self, threshold_map):
+        # shape: y, x, memory cell, thresholds and gain values
+        # note: the gain values are something like means used to derive thresholds
+        self.gain_thresholds_gpu.set(
+            np.transpose(threshold_map[..., :2], (2, 1, 0, 3)).astype(np.float32)
+        )
+
+    def load_offset_map(self, offset_map):
+        # shape: y, x, memory cell, gain stage
+        self.offset_map_gpu.set(
+            np.transpose(offset_map, (2, 1, 0, 3)).astype(np.float32)
+        )
+
+    def load_rel_gain_pc_map(self, slopes_pc_map, override_md_additional_offset=None):
+        # pc has funny shape (11, 352, 128, 512) from file
+        # this is (fit index, memory cell, y, x)
+        slopes_pc_map = slopes_pc_map.astype(np.float32)
+        # the following may contain NaNs, though...
+        hg_slope = slopes_pc_map[0]
+        hg_intercept = slopes_pc_map[1]
+        mg_slope = slopes_pc_map[3]
+        mg_intercept = slopes_pc_map[4]
+        # TODO: remove sanitization (should happen in constant preparation notebook)
+        # from agipdlib.py: replace NaN with median (per memory cell)
+        # note: suffixes in agipdlib are "_m" and "_l", should probably be "_I"
+        for naughty_array in (hg_slope, hg_intercept, mg_slope, mg_intercept):
+            medians = np.nanmedian(naughty_array, axis=(1, 2))
+            nan_bool = np.isnan(naughty_array)
+            nan_cell, _, _ = np.where(nan_bool)
+            naughty_array[nan_bool] = medians[nan_cell]
+
+            too_low_bool = naughty_array < 0.8 * medians[:, np.newaxis, np.newaxis]
+            too_low_cell, _, _ = np.where(too_low_bool)
+            naughty_array[too_low_bool] = medians[too_low_cell]
+
+            too_high_bool = naughty_array > 1.2 * medians[:, np.newaxis, np.newaxis]
+            too_high_cell, _, _ = np.where(too_high_bool)
+            naughty_array[too_high_bool] = medians[too_high_cell]
+
+        frac_hg_mg = hg_slope / mg_slope
+        rel_gain_map = np.ones(
+            (3, self.constant_memory_cells, self.pixels_y, self.pixels_x),
+            dtype=np.float32,
+        )
+        rel_gain_map[1] = rel_gain_map[0] * frac_hg_mg
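+        # 4.48 is the nominal medium-to-low gain ratio; the PC scan gives no low-gain slope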
+        rel_gain_map[2] = rel_gain_map[1] * 4.48
+        self.rel_gain_pc_map_gpu.set(np.transpose(rel_gain_map, (1, 3, 2, 0)))
+        if override_md_additional_offset is None:
+            md_additional_offset = (hg_intercept - mg_intercept * frac_hg_mg).astype(
+                np.float32
+            )
+            self.md_additional_offset_gpu.set(
+                np.transpose(md_additional_offset, (0, 2, 1))
+            )
+        else:
+            self.override_md_additional_offset(override_md_additional_offset)
+
+    def override_md_additional_offset(self, override_value):
+        self.md_additional_offset_gpu.fill(override_value)
+
+    def load_rel_gain_ff_map(self, slopes_ff_map):
+        # constant shape: y, x, memory cell
+        if slopes_ff_map.shape[2] == 2:
+            # TODO: remove support for old format
+            # old format, is per pixel only (shape is y, x, 2)
+            # note: we should not support this in online
+            slopes_ff_map = np.broadcast_to(
+                slopes_ff_map[..., 0][..., np.newaxis],
+                (self.pixels_y, self.pixels_x, self.constant_memory_cells),
+            )
+        self.rel_gain_xray_map_gpu.set(np.transpose(slopes_ff_map).astype(np.float32))
+
+    def set_g_gain_value(self, override_value):
+        self.g_gain_value = cupy.float32(override_value)
+
+    def load_bad_pixels_map(self, bad_pixels_map, override_flags_to_use=None):
+        # ORs into whatever is already loaded; does not track which constants set which flags
+        # TODO: inquire what "mask for double size pixels" means
+        if len(bad_pixels_map.shape) == 3:
+            if bad_pixels_map.shape == (
+                self.pixels_y,
+                self.pixels_x,
+                self.constant_memory_cells,
+            ):
+                # BadPixelsFF is not per gain stage - broadcasting along gain dimension
+                self.bad_pixel_map_gpu |= cupy.asarray(
+                    np.broadcast_to(
+                        np.transpose(bad_pixels_map)[..., np.newaxis],
+                        self.gm_map_shape,
+                    ),
+                    dtype=np.uint32,
+                )
+            elif bad_pixels_map.shape == (
+                self.constant_memory_cells,
+                self.pixels_y,
+                self.pixels_x,
+            ):
+                # can also be old-format BadPixelsPC, shaped (memory cell, y, x)
+                self.bad_pixel_map_gpu |= cupy.asarray(
+                    np.broadcast_to(
+                        np.transpose(bad_pixels_map, (0, 2, 1))[..., np.newaxis],
+                        self.gm_map_shape,
+                    ),
+                    dtype=np.uint32,
+                )
+            else:
+                raise ValueError(
+                    f"Unsupported bad pixel map shape: {bad_pixels_map.shape}"
+                )
+        else:
+            self.bad_pixel_map_gpu |= cupy.asarray(
+                np.transpose(bad_pixels_map, (2, 1, 0, 3)), dtype=np.uint32
+            )
+
+        if override_flags_to_use is not None:
+            self.override_bad_pixel_flags_to_use(override_flags_to_use)
+
+    def override_bad_pixel_flags_to_use(self, override_value):
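+        # ANDing clears deselected flag bits; reloading constants restores them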
+        self.bad_pixel_map_gpu &= cupy.uint32(override_value)
+
+    def set_bad_pixel_mask_value(self, mask_value):
+        self.bad_pixel_mask_value = cupy.float32(mask_value)
+
+    def flush_buffers(self):
+        self.offset_map_gpu.fill(0)
+        self.rel_gain_pc_map_gpu.fill(1)
+        self.md_additional_offset_gpu.fill(0)
+        self.rel_gain_xray_map_gpu.fill(1)
+        self.bad_pixel_map_gpu.fill(0)
+
+    # TODO: baseline shift
+
+    def correct(self, flags):
+        if flags & CorrectionFlags.BLSHIFT:
+            raise NotImplementedError("Baseline shift not implemented yet")
+
+        self.correction_kernel(
+            self.full_grid,
+            self.full_block,
+            (
+                self.input_data_gpu,
+                self.cell_table_gpu,
+                cupy.uint8(flags),
+                self.default_gain,
+                self.gain_thresholds_gpu,
+                self.offset_map_gpu,
+                self.rel_gain_pc_map_gpu,
+                self.md_additional_offset_gpu,
+                self.rel_gain_xray_map_gpu,
+                self.g_gain_value,
+                self.bad_pixel_map_gpu,
+                self.bad_pixel_mask_value,
+                self.gain_map_gpu,
+                self.processed_data_gpu,
+            ),
+        )
+
+    def _init_kernels(self):
+        kernel_source = self._kernel_template.render(
+            {
+                "pixels_x": self.pixels_x,
+                "pixels_y": self.pixels_y,
+                "data_memory_cells": self.memory_cells,
+                "constant_memory_cells": self.constant_memory_cells,
+                "input_data_dtype": utils.np_dtype_to_c_type(self.input_data_dtype),
+                "output_data_dtype": utils.np_dtype_to_c_type(self.output_data_dtype),
+                "corr_enum": utils.enum_to_c_template(CorrectionFlags),
+            }
+        )
+        self.source_module = cupy.RawModule(code=kernel_source)
+        self.correction_kernel = self.source_module.get_function("correct")
+
+
+class AgipdCalcatFriend(calcat_utils.BaseCalcatFriend):
+    _constant_enum_class = AgipdConstants
+
+    def __init__(self, device, *args, **kwargs):
+        super().__init__(device, *args, **kwargs)
+        self._constants_need_conditions = {
+            AgipdConstants.ThresholdsDark: self.dark_condition,
+            AgipdConstants.Offset: self.dark_condition,
+            AgipdConstants.SlopesPC: self.dark_condition,
+            AgipdConstants.SlopesFF: self.illuminated_condition,
+            AgipdConstants.BadPixelsDark: self.dark_condition,
+            AgipdConstants.BadPixelsPC: self.dark_condition,
+            AgipdConstants.BadPixelsFF: self.illuminated_condition,
+        }
+
+    @staticmethod
+    def add_schema(
+        schema,
+        managed_keys,
+        param_prefix="constantParameters",
+        status_prefix="foundConstants",
+    ):
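+        # add_schema is a static method, hence the explicit two-argument super() form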
+        super(AgipdCalcatFriend, AgipdCalcatFriend).add_schema(
+            schema, managed_keys, "AGIPD-Type", param_prefix, status_prefix
+        )
+
+        (
+            OVERWRITE_ELEMENT(schema)
+            .key(f"{param_prefix}.memoryCells")
+            .setNewDefaultValue(352)
+            .commit(),
+
+            OVERWRITE_ELEMENT(schema)
+            .key(f"{param_prefix}.biasVoltage")
+            .setNewDefaultValue(300)
+            .commit()
+        )
+
+        (
+            DOUBLE_ELEMENT(schema)
+            .key(f"{param_prefix}.acquisitionRate")
+            .assignmentOptional()
+            .defaultValue(1.1)
+            .reconfigurable()
+            .commit(),
+
+            DOUBLE_ELEMENT(schema)
+            .key(f"{param_prefix}.gainSetting")
+            .assignmentOptional()
+            .defaultValue(0)
+            .reconfigurable()
+            .commit(),
+
+            DOUBLE_ELEMENT(schema)
+            .key(f"{param_prefix}.photonEnergy")
+            .assignmentOptional()
+            .defaultValue(9.2)
+            .reconfigurable()
+            .commit(),
+
+            STRING_ELEMENT(schema)
+            .key(f"{param_prefix}.gainMode")
+            .assignmentOptional()
+            .defaultValue("ADAPTIVE_GAIN")
+            .options(",".join(gain_mode.name for gain_mode in AgipdGainMode))
+            .reconfigurable()
+            .commit(),
+
+            DOUBLE_ELEMENT(schema)
+            .key(f"{param_prefix}.integrationTime")
+            .assignmentOptional()
+            .defaultValue(12)
+            .reconfigurable()
+            .commit(),
+        )
+        managed_keys.add(f"{param_prefix}.acquisitionRate")
+        managed_keys.add(f"{param_prefix}.gainSetting")
+        managed_keys.add(f"{param_prefix}.photonEnergy")
+        managed_keys.add(f"{param_prefix}.gainMode")
+        managed_keys.add(f"{param_prefix}.integrationTime")
+
+        calcat_utils.add_status_schema_from_enum(schema, status_prefix, AgipdConstants)
+
+    def dark_condition(self):
+        res = calcat_utils.OperatingConditions()
+        res["Memory cells"] = self._get_param("memoryCells")
+        res["Sensor Bias Voltage"] = self._get_param("biasVoltage")
+        res["Pixels X"] = self._get_param("pixelsX")
+        res["Pixels Y"] = self._get_param("pixelsY")
+        res["Acquisition rate"] = self._get_param("acquisitionRate")
+
+        # TODO: remove this workaround after CalCat update
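+        # (CalCat does not know this parameter yet; only send non-default values)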
+        integration_time = self._get_param("integrationTime")
+        if integration_time != 12:
+            res["Integration Time"] = integration_time
+
+        gain_mode = AgipdGainMode[self._get_param("gainMode")]
+        if gain_mode is not AgipdGainMode.ADAPTIVE_GAIN:
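+            # the CalCat condition only flags whether gain is fixed, not which fixed mode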
+            res["Gain Mode"] = 1
+
+        # TODO: make configurable whether or not to include gain setting?
+        res["Gain Setting"] = self._get_param("gainSetting")
+
+        return res
+
+    def illuminated_condition(self):
+        res = self.dark_condition()
+
+        # note: can consider always setting memory cells to 352 for FF
+        # (deviation on constants in database should remove need for this, though)
+
+        # for now, FF constants don't care about gain mode
+        if "Gain Mode" in res:
+            del res["Gain Mode"]
+
+        res["Source Energy"] = self._get_param("photonEnergy")
+
+        return res
+
+
+@KARABO_CLASSINFO("AgipdCorrection", deviceVersion)
+class AgipdCorrection(BaseCorrection):
+    # subclass *must* set these attributes
+    _correction_flag_class = CorrectionFlags
+    _correction_field_names = (
+        ("thresholding", CorrectionFlags.THRESHOLD),
+        ("offset", CorrectionFlags.OFFSET),
+        ("relGainPc", CorrectionFlags.REL_GAIN_PC),
+        ("gainXray", CorrectionFlags.GAIN_XRAY),
+        ("badPixels", CorrectionFlags.BPMASK),
+    )
+    _kernel_runner_class = AgipdGpuRunner
+    _calcat_friend_class = AgipdCalcatFriend
+    _constant_enum_class = AgipdConstants
+    _managed_keys = BaseCorrection._managed_keys.copy()
+
+    @staticmethod
+    def expectedParameters(expected):
+        (
+            OVERWRITE_ELEMENT(expected)
+            .key("dataFormat.memoryCells")
+            .setNewDefaultValue(352)
+            .commit(),
+
+            OVERWRITE_ELEMENT(expected)
+            .key("preview.selectionMode")
+            .setNewDefaultValue("cell")
+            .commit(),
+        )
+
+        (
+            OUTPUT_CHANNEL(expected)
+            .key("preview.outputRawGain")
+            .dataSchema(preview_schema)
+            .commit(),
+
+            OUTPUT_CHANNEL(expected)
+            .key("preview.outputGainMap")
+            .dataSchema(preview_schema)
+            .commit(),
+        )
+
+        AgipdCalcatFriend.add_schema(expected, AgipdCorrection._managed_keys)
+        # this is not automatically done by superclass for complicated class reasons
+        add_correction_step_schema(
+            expected,
+            AgipdCorrection._managed_keys,
+            AgipdCorrection._correction_field_names,
+        )
+
+        # additional settings specific to AGIPD correction steps
+        (
+            BOOL_ELEMENT(expected)
+            .key("corrections.relGainPc.overrideMdAdditionalOffset")
+            .displayedName("Override md_additional_offset")
+            .description(
+                "Toggling this on will use the value in the next field globally for "
+                "md_additional_offset. Note that the correction map on GPU gets "
+                "overwritten as long as this boolean is True, so reload constants "
+                "after turning off."
+            )
+            .assignmentOptional()
+            .defaultValue(False)
+            .reconfigurable()
+            .commit(),
+
+            FLOAT_ELEMENT(expected)
+            .key("corrections.relGainPc.mdAdditionalOffset")
+            .displayedName("Value for md_additional_offset (if overriding)")
+            .description(
+                "Normally, md_additional_offset (part of relative gain correction) is "
+                "computed when loading SlopesPC. In case you want to use a different "
+                "value (global for all medium gain pixels), you can specify it here "
+                "and set corrections.overrideMdAdditionalOffset to True."
+            )
+            .assignmentOptional()
+            .defaultValue(0)
+            .reconfigurable()
+            .commit(),
+
+            FLOAT_ELEMENT(expected)
+            .key("corrections.gainXray.gGainValue")
+            .displayedName("G_gain_value")
+            .description(
+                "Newer X-ray gain correction constants are absolute. The default "
+                "G_gain_value of 1 means that output is expected to be in keV. If "
+                "this is not desired, one can here specify the mean X-ray gain value "
+                "this is not desired, one can instead specify here the mean X-ray "
+                "gain value over all modules to get ADU values out - the operator "
+                "must find this mean value manually."
+            .assignmentOptional()
+            .defaultValue(1)
+            .reconfigurable()
+            .commit(),
+
+            STRING_ELEMENT(expected)
+            .key("corrections.badPixels.maskingValue")
+            .displayedName("Bad pixel masking value")
+            .description(
+                "Any pixels masked by the bad pixel mask will have their value "
+                "replaced with this. Note that this parameter is to be interpreted as "
+                "a numpy.float32; use 'nan' to get a NaN value."
+            )
+            .assignmentOptional()
+            .defaultValue("nan")
+            .reconfigurable()
+            .commit(),
+
+            NODE_ELEMENT(expected)
+            .key("corrections.badPixels.subsetToUse")
+            .displayedName("Bad pixel flags to use")
+            .description(
+                "The booleans under this node allow for selecting a subset of bad "
+                "pixel types to take into account when doing bad pixel masking. "
+                "Upon updating these flags, the map used for bad pixel masking will "
+                "be ANDed with this selection. Turning disabled flags back on causes "
+                "reloading of cached constants."
+            )
+            .commit(),
+        )
+        AgipdCorrection._managed_keys.add(
+            "corrections.relGainPc.overrideMdAdditionalOffset"
+        )
+        AgipdCorrection._managed_keys.add("corrections.relGainPc.mdAdditionalOffset")
+        AgipdCorrection._managed_keys.add("corrections.gainXray.gGainValue")
+        AgipdCorrection._managed_keys.add("corrections.badPixels.maskingValue")
+        # TODO: DRY / encapsulate
+        for field in BadPixelValues:
+            (
+                BOOL_ELEMENT(expected)
+                .key(f"corrections.badPixels.subsetToUse.{field.name}")
+                .assignmentOptional()
+                .defaultValue(True)
+                .reconfigurable()
+                .commit()
+            )
+            AgipdCorrection._managed_keys.add(
+                f"corrections.badPixels.subsetToUse.{field.name}"
+            )
+
+        # mandatory: manager needs this in schema
+        (
+            VECTOR_STRING_ELEMENT(expected)
+            .key("managedKeys")
+            .assignmentOptional()
+            .defaultValue(list(AgipdCorrection._managed_keys))
+            .commit()
+        )
+
+    @property
+    def input_data_shape(self):
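+        # shape: (memory cells, data/raw gain, x, y)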
+        return (
+            self.unsafe_get("dataFormat.memoryCells"),
+            2,
+            self.unsafe_get("dataFormat.pixelsX"),
+            self.unsafe_get("dataFormat.pixelsY"),
+        )
+
+    def __init__(self, config):
+        super().__init__(config)
+        # note: gain mode is single-sourced from the constant retrieval node
+        self.gain_mode = AgipdGainMode[config.get("constantParameters.gainMode")]
+
+        try:
+            self.bad_pixel_mask_value = np.float32(
+                config.get("corrections.badPixels.maskingValue")
+            )
+        except ValueError:
+            self.bad_pixel_mask_value = np.float32("nan")
+
+        self._kernel_runner_init_args = {
+            "gain_mode": self.gain_mode,
+            "bad_pixel_mask_value": self.bad_pixel_mask_value,
+            "g_gain_value": config.get("corrections.gainXray.gGainValue"),
+        }
+
+        # configurability: overriding md_additional_offset
+        if config.get("corrections.relGainPc.overrideMdAdditionalOffset"):
+            self._override_md_additional_offset = config.get(
+                "corrections.relGainPc.mdAdditionalOffset"
+            )
+        else:
+            self._override_md_additional_offset = None
+
+        self._has_updated_bad_pixel_selection = False
+
+    def _initialization(self):
+        self._update_bad_pixel_selection()
+        super()._initialization()
+
+    def process_data(
+        self,
+        data_hash,
+        metadata,
+        source,
+        train_id,
+        image_data,
+        cell_table,
+        do_generate_preview,
+    ):
+        """Called by input_handler for each data hash. Should correct data, optionally
+        compute preview, write data output, and optionally write preview outputs."""
+        # original shape: memory_cell, data/raw_gain, x, y
+
+        pulse_table = np.squeeze(data_hash.get("image.pulseId"))
+        if self._frame_filter is not None:
+            try:
+                cell_table = cell_table[self._frame_filter]
+                pulse_table = pulse_table[self._frame_filter]
+                image_data = image_data[self._frame_filter]
+            except IndexError:
+                self.log_status_warn(
+                    "Failed to apply frame filter, please check that it is valid!"
+                )
+                return
+
+        try:
+            self.kernel_runner.load_data(image_data)
+        except ValueError as e:
+            self.log_status_warn(f"Failed to load data: {e}")
+            return
+        except Exception as e:
+            self.log_status_warn(f"Unknown exception when loading data to GPU: {e}")
+
+        buffer_handle, buffer_array = self._shmem_buffer.next_slot()
+        self.kernel_runner.load_cell_table(cell_table)
+        self.kernel_runner.correct(self._correction_flag_enabled)
+        self.kernel_runner.reshape(
+            output_order=self.unsafe_get("dataFormat.outputAxisOrder"),
+            out=buffer_array,
+        )
+        # after reshape, data for dataOutput is now safe in its own buffer
+        if do_generate_preview:
+            if self._correction_flag_enabled != self._correction_flag_preview:
+                self.kernel_runner.correct(self._correction_flag_preview)
+            (
+                preview_slice_index,
+                preview_cell,
+                preview_pulse,
+            ) = utils.pick_frame_index(
+                self.unsafe_get("preview.selectionMode"),
+                self.unsafe_get("preview.index"),
+                cell_table,
+                pulse_table,
+                warn_func=self.log_status_warn,
+            )
+            (
+                preview_raw,
+                preview_corrected,
+                preview_raw_gain,
+                preview_gain_map,
+            ) = self.kernel_runner.compute_previews(preview_slice_index)
+
+        # reusing input data hash for sending
+        data_hash.set("image.data", buffer_handle)
+        data_hash.set("calngShmemPaths", ["image.data"])
+
+        data_hash.set("image.cellId", cell_table[:, np.newaxis])
+        data_hash.set("image.pulseId", pulse_table[:, np.newaxis])
+
+        self._write_output(data_hash, metadata)
+        if do_generate_preview:
+            self._write_combiner_previews(
+                (
+                    ("preview.outputRaw", preview_raw),
+                    ("preview.outputCorrected", preview_corrected),
+                    ("preview.outputRawGain", preview_raw_gain),
+                    ("preview.outputGainMap", preview_gain_map),
+                ),
+                train_id,
+                source,
+            )
+
+    def _load_constant_to_runner(self, constant, constant_data):
+        # TODO: encode correction / constant dependencies in a clever way
+        if constant is AgipdConstants.ThresholdsDark:
+            field_name = "thresholding"  # TODO: (reverse) mapping, DRY
+            if self.gain_mode is not AgipdGainMode.ADAPTIVE_GAIN:
+                self.log.INFO("Loaded ThresholdsDark ignored due to fixed gain mode")
+                return
+            self.kernel_runner.load_thresholds(constant_data)
+        elif constant is AgipdConstants.Offset:
+            field_name = "offset"
+            self.kernel_runner.load_offset_map(constant_data)
+        elif constant is AgipdConstants.SlopesPC:
+            field_name = "relGainPc"
+            self.kernel_runner.load_rel_gain_pc_map(constant_data)
+            if self._override_md_additional_offset is not None:
+                self.kernel_runner.md_additional_offset_gpu.fill(
+                    self._override_md_additional_offset
+                )
+        elif constant is AgipdConstants.SlopesFF:
+            field_name = "gainXray"
+            self.kernel_runner.load_rel_gain_ff_map(constant_data)
+        elif "BadPixels" in constant.name:
+            field_name = "badPixels"
+            self.kernel_runner.load_bad_pixels_map(
+                constant_data, override_flags_to_use=self._override_bad_pixel_flags
+            )
+
+        # switch relevant correction on if it just now became available
+        if not self.get(f"corrections.{field_name}.available"):
+            # TODO: turn off again when flushing
+            self.set(f"corrections.{field_name}.available", True)
+
+        self._update_correction_flags()
+        self.log_status_info(f"Done loading {constant.name} to GPU")
+
+    def _update_bad_pixel_selection(self):
+        selection = 0
+        for field in BadPixelValues:
+            if self.get(f"corrections.badPixels.subsetToUse.{field.name}"):
+                selection |= field
+        self._override_bad_pixel_flags = selection
+
+    def preReconfigure(self, config):
+        super().preReconfigure(config)
+        if config.has("corrections.badPixels.maskingValue"):
+            # only check if it is valid; postReconfigure will use it
+            try:
+                np.float32(config.get("corrections.badPixels.maskingValue"))
+            except ValueError:
+                self.log_status_warn("Invalid masking value, ignoring.")
+                config.erase("corrections.badPixels.maskingValue")
+
+    def postReconfigure(self):
+        super().postReconfigure()
+
+        # TODO: move after getting cached update, check if necessary
+        if self.get("corrections.relGainPc.overrideMdAdditionalOffset"):
+            self._override_md_additional_offset = self.get(
+                "corrections.relGainPc.mdAdditionalOffset"
+            )
+            self.kernel_runner.override_md_additional_offset(
+                self._override_md_additional_offset
+            )
+        else:
+            self._override_md_additional_offset = None
+
+        if not hasattr(self, "_prereconfigure_update_hash"):
+            return
+
+        update = self._prereconfigure_update_hash
+
+        if update.has("constantParameters.gainMode"):
+            self.gain_mode = AgipdGainMode[update["constantParameters.gainMode"]]
+            self._update_buffers()
+
+        if update.has("corrections.gainXray.gGainValue"):
+            self.kernel_runner.set_g_gain_value(
+                self.get("corrections.gainXray.gGainValue")
+            )
+            self._kernel_runner_init_args["g_gain_value"] = self.get(
+                "corrections.gainXray.gGainValue"
+            )
+
+        if update.has("corrections.badPixels.maskingValue"):
+            self.bad_pixel_mask_value = np.float32(
+                self.get("corrections.badPixels.maskingValue")
+            )
+            self.kernel_runner.set_bad_pixel_mask_value(self.bad_pixel_mask_value)
+            self._kernel_runner_init_args[
+                "bad_pixel_mask_value"
+            ] = self.bad_pixel_mask_value
+
+        if any(
+            path.startswith("corrections.badPixels.subsetToUse")
+            for path in update.getPaths()
+        ):
+            self.log_status_info("Updating bad pixel maps based on subset specified")
+            if any(
+                update.get(
+                    f"corrections.badPixels.subsetToUse.{field.name}", default=False
+                )
+                for field in BadPixelValues
+            ):
+                self.log_status_info(
+                    "Some fields reenabled, reloading cached bad pixel constants"
+                )
+                with self.calcat_friend.cached_constants_lock:
+                    for (
+                        constant,
+                        data,
+                    ) in self.calcat_friend.cached_constants.items():
+                        if "BadPixels" in constant.name:
+                            self._load_constant_to_runner(constant, data)
+            self._update_bad_pixel_selection()
+            self.kernel_runner.override_bad_pixel_flags_to_use(
+                self._override_bad_pixel_flags
+            )
diff --git a/src/calng/CalibrationManager.py b/src/calng/CalibrationManager.py
index 7fc7cd61dceb10ad09ca8d952053a609f9818d4f..07f1277266c948308b66d1fd86856e5421cb5cf1 100644
--- a/src/calng/CalibrationManager.py
+++ b/src/calng/CalibrationManager.py
@@ -23,7 +23,7 @@ from karabo.middlelayer import (
     KaraboError, Device, DeviceClientBase, Descriptor, Hash, Configurable,
     Slot, Node, Type,
     AccessMode, AccessLevel, Assignment, DaqPolicy, State, Unit,
-    UInt16, UInt32, Bool, Double, String, VectorString, VectorHash,
+    UInt16, UInt32, Bool, Double, Schema, String, VectorString, VectorHash,
     background, call, callNoWait, setNoWait, sleep, instantiate, slot, coslot,
     getDevice, getTopology, getConfiguration, getConfigurationFromPast,
     get_property)
@@ -31,6 +31,7 @@ from karabo.middlelayer_api.proxy import ProxyFactory
 
 from karabo import version as karaboVersion
 from ._version import version as deviceVersion
+from . import scenes
 
 
 '''
@@ -81,14 +82,6 @@ class ClassIdsNode(Configurable):
         accessMode=AccessMode.INITONLY,
         assignment=Assignment.MANDATORY)
 
-    previewMatcherClass = String(
-        displayedName='Preview matcher class',
-        description='Device class to use for matching the output of a preview '
-                    'layer.',
-        defaultValue='ModuleStacker',
-        accessMode=AccessMode.INITONLY,
-        assignment=Assignment.MANDATORY)
-
     assemblerClass = String(
         displayedName='Assembler class',
         description='Device class to use for assembling the matched output of '
@@ -124,14 +117,6 @@ class DeviceIdsNode(Configurable):
         accessMode=AccessMode.INITONLY,
         assignment=Assignment.MANDATORY)
 
-    previewMatcherSuffix = String(
-        displayedName='Preview matcher suffix',
-        description='Suffix for preview layer matching device IDs. The '
-                    'formatting placeholder \'layer\' may be used.',
-        defaultValue='MATCH_{layer}',
-        accessMode=AccessMode.INITONLY,
-        assignment=Assignment.MANDATORY)
-
     assemblerSuffix = String(
         displayedName='Assembler suffix',
         description='Suffix for assembler device IDs. The formatting '
@@ -302,6 +287,46 @@ class CalibrationManager(DeviceClientBase, Device):
             else []),
         accessMode=AccessMode.READONLY)
 
+    availableScenes = VectorString(
+        displayedName='Available scenes',
+        displayType='Scenes',
+        requiredAccessLevel=AccessLevel.OBSERVER,
+        accessMode=AccessMode.READONLY,
+        defaultValue=['overview', 'managed_keys'],
+        daqPolicy=DaqPolicy.OMIT)
+
+    @slot
+    def requestScene(self, params):
+        name = params.get('name', default='overview')
+        if name == 'overview':
+            # Assumes there are correction devices known to manager
+            scene_data = scenes.manager_device_overview_scene(
+                self.deviceId,
+                self.getDeviceSchema(),
+                self._correction_device_schema,
+                self._correction_device_ids,
+                self._domain_device_ids,
+            )
+            payload = Hash('success', True, 'name', name, 'data', scene_data)
+        elif name.startswith('browse_schema'):
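+            # scene name format: 'browse_schema:<prefix>' (bare name falls back to 'managed')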
+            if ':' in name:
+                prefix = name[len('browse_schema:'):]
+            else:
+                prefix = 'managed'
+            scene_data = scenes.recursive_subschema_scene(
+                self.deviceId,
+                self.getDeviceSchema(),
+                prefix,
+            )
+            payload = Hash('success', True, 'name', name, 'data', scene_data)
+        else:
+            payload = Hash('success', False, 'name', name)
+
+        return Hash('type', 'deviceScene',
+                    'origin', self.deviceId,
+                    'payload', payload)
+
     detectorType = String(
         displayedName='Detector type',
         description='Type of the detector to manage.',
@@ -368,6 +392,13 @@ class CalibrationManager(DeviceClientBase, Device):
         self.deviceServers = value
         self._servers_changed = True
 
+    imageDataPath = String(
+        displayedName='Image data path',
+        description='Path in DAQ hash to actual image data, used for preview',
+        accessMode=AccessMode.RECONFIGURABLE,
+        assignment=Assignment.OPTIONAL,
+        defaultValue='image.data')
+
     geometryDevice = String(
         displayedName='Geometry device',
         description='[NYI] Device ID for a geometry device defining the '
@@ -591,6 +622,9 @@ class CalibrationManager(DeviceClientBase, Device):
         # Obtain the device schema from a correction device server.
         managed_schema, _, _ = await call(corr_server, 'slotGetClassSchema',
                                           self._correction_class_id)
+        # keep a copy of the managed schema for scene generation (requestScene)
+        self._correction_device_schema = Schema()
+        self._correction_device_schema.copy(managed_schema)
 
         if managed_schema.name != self._correction_class_id:
             self._set_fatal(
@@ -1012,7 +1046,7 @@ class CalibrationManager(DeviceClientBase, Device):
         device_id_templates = {}
 
         class_args = (self.detectorType.value.lower().capitalize(),)
-        for role in ['correction', 'groupMatcher', 'bridge', 'previewMatcher',
+        for role in ['correction', 'groupMatcher', 'bridge',
                      'assembler']:
             class_ids[role] = getattr(
                 self.classIds, f'{role}Class').value.format(*class_args)
@@ -1061,10 +1095,8 @@ class CalibrationManager(DeviceClientBase, Device):
 
             config = Hash()
 
-            # Legacy keys for calibrationBase.
-            config['det_type'] = self.detectorType
-            config['det_identifier'] = self.detectorIdentifier
-            config['da_name'] = aggregator
+            config['constantParameters.detectorName'] = self.detectorIdentifier.value
+            config['constantParameters.karaboDa'] = aggregator
             config['dataInput.connectedOutputChannels'] = [input_channel]
             config['fastSources'] = [input_source]
 
@@ -1153,78 +1185,26 @@ class CalibrationManager(DeviceClientBase, Device):
 
                     background(_activate_bridge(bridge_device_id))
 
-        # Instantiate preview layer matchers and assemblers.
+        # Instantiate preview layer assemblers.
+        geometry_device_id = self.geometryDevice.value
         for layer, output_pipeline, server in self.previewLayers.value:
-            # Preview matcher.
-            matcher_device_id = device_id_templates['previewMatcher'].format(
+            assembler_device_id = device_id_templates['assembler'].format(
                 layer=layer)
 
             config = Hash()
-            config['channels'] = [
-                f'{device_id}:{output_pipeline}'
-                for device_id in correct_device_id_by_module.values()]
+            # TODO: put _image_data_path in corr dev schema, get from there
+            config['pathToStack'] = self.imageDataPath.value
             config['fastSources'] = [
                 Hash('fsSelect', True,
                      'fsSource',
                      f'{input_source_by_module[virtual_id]}')
                 for (virtual_id, device_id)
                 in correct_device_id_by_module.items()]
-            config['pathToStack'] = 'data.adc'
-
-            if not await self._instantiate_device(
-                server, class_ids['previewMatcher'], matcher_device_id, config
-            ):
-                return
-
-            # Preview assembler.
-            assembler_device_id = device_id_templates['assembler'].format(
-                layer=layer)
-
-            config = Hash()
-            config['input.connectedOutputChannels'] = [
-                f'{matcher_device_id}:output']
-            config['modules'] = [
-                Hash('source', input_source_by_module.get('Q1M1', ''),
-                     'offX', 474, 'offY', 612, 'rot', 90),
-                Hash('source', input_source_by_module.get('Q1M2', ''),
-                     'offX', 316, 'offY', 612, 'rot', 90),
-                Hash('source', input_source_by_module.get('Q1M3', ''),
-                     'offX', 158, 'offY', 612, 'rot', 90),
-                Hash('source', input_source_by_module.get('Q1M4', ''),
-                     'offX', 0, 'offY', 612, 'rot', 90),
-                Hash('source', input_source_by_module.get('Q2M1', ''),
-                     'offX', 1136, 'offY', 612, 'rot', 90),
-                Hash('source', input_source_by_module.get('Q2M2', ''),
-                     'offX', 978, 'offY', 612, 'rot', 90),
-                Hash('source', input_source_by_module.get('Q2M3', ''),
-                     'offX', 820, 'offY', 612, 'rot', 90),
-                Hash('source', input_source_by_module.get('Q2M4', ''),
-                     'offX', 662, 'offY', 612, 'rot', 90),
-                Hash('source', input_source_by_module.get('Q3M1', ''),
-                     'offX', 712, 'offY', 0, 'rot', 270),
-                Hash('source', input_source_by_module.get('Q3M2', ''),
-                     'offX', 870, 'offY', 0, 'rot', 270),
-                Hash('source', input_source_by_module.get('Q3M3', ''),
-                     'offX', 1028, 'offY', 0, 'rot', 270),
-                Hash('source', input_source_by_module.get('Q3M4', ''),
-                     'offX', 1186, 'offY', 0, 'rot', 270),
-                Hash('source', input_source_by_module.get('Q4M1', ''),
-                     'offX', 50, 'offY', 0, 'rot', 270),
-                Hash('source', input_source_by_module.get('Q4M2', ''),
-                     'offX', 208, 'offY', 0, 'rot', 270),
-                Hash('source', input_source_by_module.get('Q4M3', ''),
-                     'offX', 366, 'offY', 0, 'rot', 270),
-                Hash('source', input_source_by_module.get('Q4M4', ''),
-                     'offX', 524, 'offY', 0, 'rot', 270),
-            ]
-            config['pathsToCombine'] = ['data.adc']
-            config['trainIdPath'] = 'image.trainId'
-            config['pulseIdPath'] = 'image.pulseId'
-            config['preview.enablePreview'] = True
-            config['preview.pathToPreview'] = 'data.adc'
-            config['preview.downSample'] = 2
-            config['badpixelPath'] = 'image.bad_pixels'
-            config['rotated90Grad'] = True
+            config['channels'] = [
+                f'{device_id}:{output_pipeline}'
+                for device_id in correct_device_id_by_module.values()]
+            config['geometryInput.connectedOutputChannels'] = [
+                f'{geometry_device_id}:geometryOutput']
 
             if not await self._instantiate_device(
                 server, class_ids['assembler'], assembler_device_id, config
diff --git a/src/calng/DsscCorrection.py b/src/calng/DsscCorrection.py
index f86153cbf96fcf7ee8691853e14893cc78e8c0de..c228648a01057c0d44719a2a316792497f705adb 100644
--- a/src/calng/DsscCorrection.py
+++ b/src/calng/DsscCorrection.py
@@ -1,838 +1,290 @@
-import threading
-import timeit
+import enum
 
-import calibrationBase
-import hashToSchema
+import cupy
 import numpy as np
 from karabo.bound import (
-    BOOL_ELEMENT,
-    FLOAT_ELEMENT,
-    INPUT_CHANNEL,
-    INT32_ELEMENT,
+    DOUBLE_ELEMENT,
     KARABO_CLASSINFO,
-    NDARRAY_ELEMENT,
-    NODE_ELEMENT,
-    OUTPUT_CHANNEL,
-    STRING_ELEMENT,
-    UINT32_ELEMENT,
-    UINT64_ELEMENT,
+    OVERWRITE_ELEMENT,
     VECTOR_STRING_ELEMENT,
-    VECTOR_UINT32_ELEMENT,
-    ChannelMetaData,
-    Epochstamp,
-    Hash,
-    MetricPrefix,
-    Schema,
-    Timestamp,
-    Trainstamp,
-    Unit,
+    State,
 )
-from karabo.common.states import State
-
-from . import shmem_utils
-from . import utils
 
+from . import base_gpu, calcat_utils, utils
 from ._version import version as deviceVersion
-from .dssc_gpu import DsscGpuRunner
+from .base_correction import BaseCorrection, add_correction_step_schema
 
 
-@KARABO_CLASSINFO("DsscCorrection", deviceVersion)
-class DsscCorrection(calibrationBase.CalibrationReceiverBaseDevice):
-    _dict_cache_slots = {
-        "applyCorrection",
-        "doAnything",
-        "dataFormat.memoryCells",
-        "dataFormat.memoryCellsCorrection",
-        "dataFormat.pixelsX",
-        "dataFormat.pixelsY",
-        "preview.enable",
-        "preview.pulse",
-        "preview.trainIdModulo",
-        "processingStateTimeout",
-        "performance.rateUpdateOnEachInput",
-        "state",
-    }
+class CorrectionFlags(enum.IntFlag):
+    NONE = 0
+    OFFSET = 1
 
-    @staticmethod
-    def expectedParameters(expected):
-        DsscCorrection.addConstant(
-            "Offset", "Dark", expected, optional=True, mandatoryForIteration=True
-        )
 
-        (
-            BOOL_ELEMENT(expected)
-            .key("doAnything")
-            .displayedName("Enable input processing")
-            .description(
-                "Toggle handling of input (at all). If False, the input handler of "
-                "this device will be skipped. Useful to decrease logspam if device is "
-                "misconfigured."
-            )
-            .assignmentOptional()
-            .defaultValue(True)
-            .reconfigurable()
-            .commit(),
-            BOOL_ELEMENT(expected)
-            .key("applyCorrection")
-            .displayedName("Enable correction(s)")
-            .description(
-                "Toggle whether not correction(s) are applied to image data. If "
-                "false, this device still reshapes data to output shape, applies the "
-                "pulse filter, and casts to output dtype. Useful if constants are "
-                "missing / bad, or if data is sent to application doing its own "
-                "correction."
-            )
-            .assignmentOptional()
-            .defaultValue(True)
-            .reconfigurable()
-            .commit(),
-            INPUT_CHANNEL(expected).key("dataInput").commit(),
-            # note: output schema not set, will be updated to match data later
-            OUTPUT_CHANNEL(expected).key("dataOutput").commit(),
-            VECTOR_STRING_ELEMENT(expected)
-            .key("fastSources")
-            .displayedName("Fast data sources")
-            .description(
-                "Sources to get data from. Only incoming hashes from these sources "
-                "will be processed."
-            )
-            .assignmentMandatory()
-            .commit(),
-            STRING_ELEMENT(expected)
-            .key("pulseFilter")
-            .displayedName("[disabled] Pulse filter")
-            .description(
-                "Filter pulses: will be evaluated as array of indices to keep from "
-                "data. Can be anything which can be turned into numpy uint16 array. "
-                "Numpy is available as np. Take care not to include duplicates. If "
-                "empty, will not filter at all."
-            )
-            .readOnly()
-            .initialValue("")
-            .commit(),
-            UINT32_ELEMENT(expected)
-            .key("outputShmemBufferSize")
-            .displayedName("Output buffer size limit (GB)")
-            .description(
-                "Corrected trains are written to shared memory locations. These are "
-                "pre-allocated and re-used. This parameter determines how big (number "
-                "of GB) the circular buffer will be."
-            )
-            .assignmentOptional()
-            .defaultValue(10)
-            .commit(),
-        )
+class DsscConstants(enum.Enum):
+    Offset = enum.auto()
 
-        (
-            NODE_ELEMENT(expected)
-            .key("dataFormat")
-            .displayedName("Data format (in/out)")
-            .commit(),
-            STRING_ELEMENT(expected)
-            .key("dataFormat.inputImageDtype")
-            .displayedName("Input image data dtype")
-            .description("The (numpy) dtype to expect for incoming image data.")
-            .options("uint16,float32")
-            .assignmentOptional()
-            .defaultValue("uint16")
-            .commit(),
-            STRING_ELEMENT(expected)
-            .key("dataFormat.outputImageDtype")
-            .displayedName("Output image data dtype")
-            .description(
-                "The (numpy) dtype to use for outgoing image data. Input is "
-                "cast to float32, corrections are applied, and only then will "
-                "the result be cast back to outputImageDtype (all on GPU)."
-            )
-            .options("float16,float32,uint16")
-            .assignmentOptional()
-            .defaultValue("float32")
-            .commit(),
-            # important: shape of data as going into correction
-            UINT32_ELEMENT(expected)
-            .key("dataFormat.pixelsX")
-            .displayedName("Pixels x")
-            .description("Number of pixels of image data along X axis")
-            .assignmentMandatory()
-            .commit(),
-            UINT32_ELEMENT(expected)
-            .key("dataFormat.pixelsY")
-            .displayedName("Pixels y")
-            .description("Number of pixels of image data along Y axis")
-            .assignmentMandatory()
-            .commit(),
-            UINT32_ELEMENT(expected)
-            .key("dataFormat.memoryCells")
-            .displayedName("Memory cells")
-            .description("Full number of memory cells in incoming data")
-            .assignmentMandatory()
-            .commit(),
-            STRING_ELEMENT(expected)
-            .key("dataFormat.outputAxisOrder")
-            .displayedName("Output axis order")
-            .description(
-                "Axes of main data output can be reordered after correction. Choose "
-                "between 'pixels-fast' (memory_cell, x, y), 'memorycells-fast' "
-                "(x, y, memory_cell), and 'no-reshape' (memory_cell, y, x)"
-            )
-            .options("pixels-fast,memorycells-fast,no-reshape")
-            .assignmentOptional()
-            .defaultValue("pixels-fast")
-            .commit(),
-            UINT32_ELEMENT(expected)
-            .key("dataFormat.memoryCellsCorrection")
-            .displayedName("(Debug) Memory cells in correction map")
-            .description(
-                "Full number of memory cells in currently loaded correction map. "
-                "May exceed memory cell number in input if veto is on. "
-                "This value just displayed for debugging."
-            )
-            .readOnly()
-            .initialValue(0)
-            .commit(),
-            VECTOR_UINT32_ELEMENT(expected)
-            .key("dataFormat.inputDataShape")
-            .displayedName("Input data shape")
-            .description(
-                "Image data shape in incoming data (from reader / DAQ). This value is "
-                "computed from pixelsX, pixelsY, and memoryCells - this field just "
-                "shows you what is currently expected."
-            )
-            .readOnly()
-            .initialValue([])
-            .commit(),
-            VECTOR_UINT32_ELEMENT(expected)
-            .key("dataFormat.outputDataShape")
-            .displayedName("Output data shape")
-            .description(
-                "Image data shape for data output from this device. This value is "
-                "computed from pixelsX, pixelsY, and the size of the pulse filter - "
-                "this field just shows what is currently expected."
-            )
-            .readOnly()
-            .initialValue([])
-            .commit(),
-        )
 
-        preview_schema = Schema()
-        (
-            NODE_ELEMENT(expected).key("preview").displayedName("Preview").commit(),
-            NODE_ELEMENT(preview_schema).key("data").commit(),
-            NDARRAY_ELEMENT(preview_schema).key("data.adc").dtype("FLOAT").commit(),
-            OUTPUT_CHANNEL(expected)
-            .key("preview.outputRaw")
-            .dataSchema(preview_schema)
-            .commit(),
-            OUTPUT_CHANNEL(expected)
-            .key("preview.outputCorrected")
-            .dataSchema(preview_schema)
-            .commit(),
-            BOOL_ELEMENT(expected)
-            .key("preview.enable")
-            .displayedName("Enable preview data generation")
-            .assignmentOptional()
-            .defaultValue(True)
-            .reconfigurable()
-            .commit(),
-            INT32_ELEMENT(expected)
-            .key("preview.pulse")
-            .displayedName("Pulse (or stat) for preview")
-            .description(
-                "If this value is ≥ 0, the corresponding index from data will be "
-                "sliced for the preview. If this value is ≤ 0, preview will be one of "
-                "the following stats:"
-                "-1: max, "
-                "-2: mean, "
-                "-3: sum, "
-                "-4: stdev. "
-                "Max means selecting the pulse with the maximum integrated value. The "
-                "others are computed across all filtered pulses in the train."
-            )
-            .assignmentOptional()
-            .defaultValue(0)
-            .reconfigurable()
-            .commit(),
-            UINT32_ELEMENT(expected)
-            .key("preview.trainIdModulo")
-            .displayedName("Train modulo for throttling")
-            .description(
-                "Preview will only be generated for trains whose ID modulo this "
-                "number is zero. Higher values means fewer preview updates. Should be "
-                "adjusted based on input rate. Keep in mind that the GUI has limited "
-                "refresh rate anyway and that network is precious."
-            )
-            .assignmentOptional()
-            .defaultValue(6)
-            .reconfigurable()
-            .commit(),
-        )
+class DsscGpuRunner(base_gpu.BaseGpuRunner):
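+    # used by the base_gpu.BaseGpuRunner machinery: which CUDA template to
+    # render, and the axis order of the corrected buffer, here (cell, y, x)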
+    _kernel_source_filename = "dssc_gpu.cu"
+    _corrected_axis_order = "cyx"
 
-        (
-            NODE_ELEMENT(expected)
-            .key("performance")
-            .displayedName("Performance measures")
-            .commit(),
-            FLOAT_ELEMENT(expected)
-            .key("performance.rateUpdateInterval")
-            .displayedName("Rate update interval")
-            .description(
-                "Maximum interval (seconds) between updates of the rate. Mostly "
-                "relevant if not rateUpdateOnEachInput or if input is slow."
-            )
-            .assignmentOptional()
-            .defaultValue(1)
-            .reconfigurable()
-            .commit(),
-            FLOAT_ELEMENT(expected)
-            .key("performance.rateBufferSpan")
-            .displayedName("Rate measurement buffer span")
-            .description("Event buffer timespan (in seconds) for measuring rate")
-            .assignmentOptional()
-            .defaultValue(20)
-            .reconfigurable()
-            .commit(),
-            BOOL_ELEMENT(expected)
-            .key("performance.rateUpdateOnEachInput")
-            .displayedName("Update rate on each input")
-            .description(
-                "Whether or not to update the device rate for each input (otherwise "
-                "only based on rateUpdateInterval). Note that processed trains are "
-                "always registered - this just impacts when the rate is computed "
-                "based on this."
-            )
-            .assignmentOptional()
-            .defaultValue(False)
-            .reconfigurable()
-            .commit(),
-            FLOAT_ELEMENT(expected)
-            .key("processingStateTimeout")
-            .description(
-                "Timeout after which the device goes from PROCESSING back to ACTIVE "
-                "if no new input is processed"
-            )
-            .assignmentOptional()
-            .defaultValue(10)
-            .reconfigurable()
-            .commit(),
-            # just measurements and counters to display
-            UINT64_ELEMENT(expected)
-            .key("trainId")
-            .displayedName("Train ID")
-            .description("ID of latest train processed by this device.")
-            .readOnly()
-            .initialValue(0)
-            .commit(),
-            FLOAT_ELEMENT(expected)
-            .key("performance.lastProcessingDuration")
-            .displayedName("Processing time")
-            .description(
-                "Amount of time spent in processing latest train. Time includes "
-                "generating preview and sending data."
-            )
-            .unit(Unit.SECOND)
-            .metricPrefix(MetricPrefix.MILLI)
-            .readOnly()
-            .initialValue(0)
-            .commit(),
-            FLOAT_ELEMENT(expected)
-            .key("performance.rate")
-            .displayedName("Rate")
-            .description(
-                "Actual rate with which this device gets / processes / sends trains"
-            )
-            .unit(Unit.HERTZ)
-            .readOnly()
-            .initialValue(0)
-            .commit(),
-            FLOAT_ELEMENT(expected)
-            .key("performance.theoreticalRate")
-            .displayedName("Processing rate (hypothetical)")
-            .description(
-                "Rate with which this device could hypothetically process trains. "
-                "Based on lastProcessingDuration."
-            )
-            .unit(Unit.HERTZ)
-            .readOnly()
-            .initialValue(float("NaN"))
-            .warnLow(10)
-            .info("Processing not fast enough for full speed")
-            .needsAcknowledging(False)
-            .commit(),
-        )
-
-    def __init__(self, config):
-        self._dict_cache = {k: config.get(k) for k in self._dict_cache_slots}
-        super().__init__(config)
-
-        self.KARABO_ON_DATA("dataInput", self.process_input)
-        self.KARABO_ON_EOS("dataInput", self.handle_eos)
-
-        self.sources = set(config.get("fastSources"))
-
-        self.input_data_dtype = getattr(np, config.get("dataFormat.inputImageDtype"))
-        self.output_data_dtype = getattr(np, config.get("dataFormat.outputImageDtype"))
-        output_axis_order = config.get("dataFormat.outputAxisOrder")
-        if output_axis_order == "pixels-fast":
-            self._output_transpose = (0, 2, 1)
-        elif output_axis_order == "memorycells-fast":
-            self._output_transpose = (2, 1, 0)
-        else:
-            self._output_transpose = None
-        self._offset_map = None
-        self._update_pulse_filter(config.get("pulseFilter"))
-        self._shmem_buffer = None
-        self._update_shapes(
-            config.get("dataFormat.pixelsX"),
-            config.get("dataFormat.pixelsY"),
-            config.get("dataFormat.memoryCells"),
-            self.pulse_filter,
-            self._output_transpose,
-        )
-        self._has_set_output_schema = False
-        self._rate_tracker = calibrationBase.utils.UpdateRate(
-            interval=config.get("performance.rateBufferSpan")
-        )
-        self._state_reset_timer = None
-
-        self._buffered_status_update = Hash(
-            "trainId",
-            0,
-            "performance.rate",
-            0,
-            "performance.theoreticalRate",
-            float("NaN"),
-            "performance.lastProcessingDuration",
-            0,
-        )
-        self._rate_update_timer = utils.RepeatingTimer(
-            interval=config.get("performance.rateUpdateInterval"),
-            callback=self._update_actual_rate,
+    def __init__(
+        self,
+        pixels_x,
+        pixels_y,
+        memory_cells,
+        constant_memory_cells,
+        input_data_dtype=np.uint16,
+        output_data_dtype=np.float32,
+    ):
+        self.input_shape = (memory_cells, pixels_y, pixels_x)
+        self.processed_shape = self.input_shape
+        super().__init__(
+            pixels_x,
+            pixels_y,
+            memory_cells,
+            constant_memory_cells,
+            input_data_dtype,
+            output_data_dtype,
         )
-        self._buffer_lock = threading.Lock()
-
-        self.updateState(State.ON)
-
-    def get(self, key):
-        if key in self._dict_cache_slots:
-            return self._dict_cache.get(key)
-        else:
-            return super().get(key)
-
-    def set(self, *args):
-        if len(args) == 2:
-            key, value = args
-            if key in self._dict_cache_slots:
-                self._dict_cache[key] = value
-        super().set(*args)
-
-    def preReconfigure(self, config):
-        if config.has("pulseFilter"):
-            with self._buffer_lock:
-                # apply new pulse filter
-                self._update_pulse_filter(config.get("pulseFilter"))
-                # but existing shapes (not reconfigurable)
-                # TODO: avoid double compilation here if constants are loaded
-                self._update_shapes(
-                    self.get("dataFormat.pixelsX"),
-                    self.get("dataFormat.pixelsY"),
-                    self.get("dataFormat.memoryCells"),
-                    self.pulse_filter,
-                )
-
-        if config.has("performance.rateUpdateInterval"):
-            self._rate_update_timer.stop()
-            self._rate_update_timer = utils.RepeatingTimer(
-                interval=config.get("performance.rateUpdateInterval"),
-                callback=self._update_actual_rate,
-            )
-
-        if config.has("performance.rateBufferSpan"):
-            self._rate_tracker = calibrationBase.utils.UpdateRate(
-                interval=config.get("performance.rateBufferSpan")
-            )
-
-        for path in config.getPaths():
-            if path in self._dict_cache_slots:
-                self._dict_cache[path] = config.get(path)
 
-    def process_input(self, data, metadata):
-        """Registered for dataInput, handles all processing and sending
+        self.map_shape = (self.constant_memory_cells, self.pixels_y, self.pixels_x)
+        self.offset_map_gpu = cupy.empty(self.map_shape, dtype=np.float32)
 
-        Comparable to StreamBase.onInput but hopefully faster
+        self._init_kernels()
 
-        """
 
-        if not self.get("doAnything"):
-            if self.get("state") is State.PROCESSING:
-                self.updateState(State.ACTIVE)
-            return
-
-        # TODO: compare KARABO_ON_INPUT (old) against KARABO_ON_DATA (current)
-        source = metadata.get("source")
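+        # blocks of 64 threads; full_grid / full_block are presumably derived
+        # from this and the data shape by the base class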
+        self.update_block_size((1, 1, 64))
 
-        if source not in self.sources:
-            self.log.INFO(f"Ignoring unknown source {source}")
-            return
-
-        # TODO: what are these empty things for?
-        if not data.has("image"):
-            self.log.INFO("Ignoring hash without image node")
-            return
+    def _get_raw_for_preview(self):
+        return self.input_data_gpu
 
-        time_start = timeit.default_timer()
+    def _get_corrected_for_preview(self):
+        return self.processed_data_gpu
 
-        train_id = metadata.getAttribute("timestamp", "tid")
-        cell_table = np.squeeze(data.get("image.cellId"))
-        assert isinstance(cell_table, np.ndarray), "image.cellId should be ndarray"
-        if len(cell_table.shape) == 0:
-            msg = "cellId had 0 dimensions. DAQ may not be sending data."
-            self.set("status", msg)
-            self.log.WARN(msg)
-            return
-        # original shape: 400, 1, 128, 512 (memory cells, something, y, x)
-        # TODO: consider making paths configurable
-        image_data = data.get("image.data")
-        if image_data.shape[0] != self.get("dataFormat.memoryCells"):
-            self.set(
-                "status", f"Updating input shapes based on received {image_data.shape}"
-            )
-            # TODO: truncate if > 800
-            self.set("dataFormat.memoryCells", image_data.shape[0])
-            with self._buffer_lock:
-                self._update_pulse_filter(self.get("pulseFilter"))
-                self._update_shapes(
-                    self.get("dataFormat.pixelsX"),
-                    self.get("dataFormat.pixelsY"),
-                    self.get("dataFormat.memoryCells"),
-                    self.pulse_filter,
-                    self._output_transpose,
-                )
-        # TODO: check shape (DAQ fake data and RunToPipe don't agree)
-        # TODO: consider just updating shapes based on whatever comes in
-
-        correction_cell_num = self.get("dataFormat.memoryCellsCorrection")
-        do_generate_preview = train_id % self.get(
-            "preview.trainIdModulo"
-        ) == 0 and self.get("preview.enable")
-        can_apply_correction = correction_cell_num > 0
-        do_apply_correction = self.get("applyCorrection")
-
-        if not self.get("state") is State.PROCESSING:
-            self.updateState(State.PROCESSING)
-            self.set("status", "Processing data")
-        if self._state_reset_timer is None:
-            self._state_reset_timer = utils.DelayableTimer(
-                timeout=self.get("processingStateTimeout"),
-                callback=self._reset_state_from_processing,
-            )
-        else:
-            self._state_reset_timer.set_timeout(self.get("processingStateTimeout"))
-
-        with self._buffer_lock:
-            cell_table = cell_table[self.pulse_filter]
-            pulse_table = np.squeeze(data.get("image.pulseId"))[self.pulse_filter]
-
-            cell_table_max = np.max(cell_table)
-            if do_apply_correction:
-                if not can_apply_correction:
-                    msg = "No constant loaded, correction will not be applied."
-                    self.log.WARN(msg)
-                    self.set("status", msg)
-                    do_apply_correction = False
-                elif cell_table_max >= correction_cell_num:
-                    msg = (
-                        f"Max cell ID ({cell_table_max}) exceeds range for loaded "
-                        f"constant (has {correction_cell_num} cells). Some frames "
-                        "will not be corrected."
-                    )
-                    self.log.WARN(msg)
-                    self.set("status", msg)
-
-            self.gpu_runner.load_data(image_data)
-            buffer_handle, buffer_array = self._shmem_buffer.next_slot()
-            if do_apply_correction:
-                self.gpu_runner.load_cell_table(cell_table)
-                self.gpu_runner.correct()
-            else:
-                self.gpu_runner.only_cast()
-            self.gpu_runner.reshape(out=buffer_array)
-            if do_generate_preview:
-                preview_slice_index = self.get("preview.pulse")
-                if preview_slice_index >= 0:
-                    # look at pulse_table to find which index this pulse ID is in
-                    pulse_id_found = np.where(pulse_table == preview_slice_index)[0]
-                    if len(pulse_id_found) == 0:
-                        pulse_found_instead = pulse_table[0]
-                        msg = (
-                            f"Pulse {preview_slice_index} not found in "
-                            f"image.pulseId, arbitrary pulse "
-                            f"{pulse_found_instead} will be shown."
-                        )
-                        preview_slice_index = 0
-                        self.log.WARN(msg)
-                        self.set("status", msg)
-                    else:
-                        preview_slice_index = pulse_id_found[0]
-                if not do_apply_correction:
-                    if can_apply_correction:
-                        # in this case, cell table has not been loaded, but needs to be now
-                        self.gpu_runner.load_cell_table(cell_table)
-                    else:
-                        # in this case, there will be no corrected preview
-                        self.log.WARN(
-                            "Corrected preview will not actually be corrected."
-                        )
-                preview_raw, preview_corrected = self.gpu_runner.compute_preview(
-                    preview_slice_index,
-                    have_corrected=do_apply_correction,
-                    can_correct=can_apply_correction,
-                )
+    def load_offset_map(self, offset_map):
+        # can have an extra dimension for some reason
+        if len(offset_map.shape) == 4:  # old format (see offsetcorrection_dssc.py)?
+            offset_map = offset_map[..., 0]
+        # shape (now): x, y, memory cell
+        offset_map = np.transpose(offset_map).astype(np.float32)
+        self.offset_map_gpu.set(offset_map)
 
-        data.set("image.data", buffer_handle)
-        data.set("image.cellId", cell_table[:, np.newaxis])
-        data.set("image.pulseId", pulse_table[:, np.newaxis])
-        data.set("calngShmemPaths", ["image.data"])
-        self.write_output(data, metadata)
-        if do_generate_preview:
-            self.write_combiner_preview(
-                preview_raw, preview_corrected, train_id, source
-            )
-
-        # update rate etc.
-        self._buffered_status_update.set("trainId", train_id)
-        self._rate_tracker.update()
-        time_spent = timeit.default_timer() - time_start
-        self._buffered_status_update.set(
-            "performance.lastProcessingDuration", time_spent * 1000
+    def correct(self, flags):
+        self.correction_kernel(
+            self.full_grid,
+            self.full_block,
+            (
+                self.input_data_gpu,
+                self.cell_table_gpu,
+                np.uint8(flags),
+                self.offset_map_gpu,
+                self.processed_data_gpu,
+            ),
         )
-        if self.get("performance.rateUpdateOnEachInput"):
-            self._update_actual_rate()
-
-    def handle_eos(self, channel):
-        self._has_set_output_schema = False
-        self.updateState(State.ON)
-        self.signalEndOfStream("dataOutput")
-
-    def write_output(self, data, old_metadata):
-        metadata = ChannelMetaData(
-            old_metadata.get("source"),
-            Timestamp.fromHashAttributes(old_metadata.getAttributes("timestamp")),
-        )
-
-        if "image.passport" not in data:
-            data["image.passport"] = []
-        data["image.passport"].append(self.getInstanceId())
-
-        if not self._has_set_output_schema:
-            self.updateState(State.CHANGING)
-            self._update_output_schema(data)
-            self.updateState(State.PROCESSING)
-
-        channel = self.signalSlotable.getOutputChannel("dataOutput")
-        channel.write(data, metadata, False)
-        channel.update()
-
-    def write_combiner_preview(self, data_raw, data_corrected, train_id, source):
-        # TODO: take into account updated pulse table after pulse filter
-        preview_hash = Hash()
-        preview_hash.set("image.passport", [self.getInstanceId()])
-        preview_hash.set("image.trainId", train_id)
-        preview_hash.set("image.pulseId", self.get("preview.pulse"))
-
-        # note: have to construct because setting .tid after init is broken
-        timestamp = Timestamp(Epochstamp(), Trainstamp(train_id))
-        metadata = ChannelMetaData(source, timestamp)
-        for channel_name, data in (
-            ("preview.outputRaw", data_raw),
-            ("preview.outputCorrected", data_corrected),
-        ):
-            preview_hash.set("data.adc", data[..., np.newaxis])
-            channel = self.signalSlotable.getOutputChannel(channel_name)
-            channel.write(preview_hash, metadata, False)
-            channel.update()
-
-    def getConstant(self, name):
-        """Hacky override of getConstant to actually return None on failure
-
-        Full function is from CalibrationReceiverBaseDevice
-
-        """
-
-        const = super().getConstant(name)
-        if const is not None and len(const.shape) == 1:
-            self.log.WARN(
-                f"Constant {name} should probably be None, but is array"
-                f" of size {const.size}, shape {const.shape}"
-            )
-            const = None
-        return const
 
-    def constantLoaded(self):
-        """Hook from CalibrationReceiverBaseDevice called after each getConstant
+    def _init_kernels(self):
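+        # shapes and dtypes are baked into the CUDA source at render time, so
+        # kernels must be re-rendered and recompiled whenever shapes change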
+        kernel_source = self._kernel_template.render(
+            {
+                "pixels_x": self.pixels_x,
+                "pixels_y": self.pixels_y,
+                "data_memory_cells": self.memory_cells,
+                "constant_memory_cells": self.constant_memory_cells,
+                "input_data_dtype": utils.np_dtype_to_c_type(self.input_data_dtype),
+                "output_data_dtype": utils.np_dtype_to_c_type(self.output_data_dtype),
+                "corr_enum": utils.enum_to_c_template(CorrectionFlags),
+            }
+        )
+        self.source_module = cupy.RawModule(code=kernel_source)
+        self.correction_kernel = self.source_module.get_function("correct")
 
-        Here, used to load the received constants (or correction maps derived
-        from them) onto GPU.
 
-        TODO: call after receiving *all* constants instead of calling once per
-        new constant (will cause some overhead for bigger devices)
+class DsscCalcatFriend(calcat_utils.BaseCalcatFriend):
+    _constant_enum_class = DsscConstants
 
-        """
+    def __init__(self, device, *args, **kwargs):
+        super().__init__(device, *args, **kwargs)
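+        # maps each constant to the callable building the CalCat operating
+        # conditions used when querying for that constant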
+        self._constants_need_conditions = {
+            DsscConstants.Offset: self.dark_condition,
+        }
 
-        offset_map = self.getConstant("Offset")
-        input_memory_cells = self.get("dataFormat.memoryCells")
-        if offset_map is None:
-            msg = (
-                "Warning: Did not find offset constant, offset correction "
-                "will not be applied"
-            )
-            self.set("status", msg)
-            self.log.WARN(msg)
-            self._offset_map = None
-        elif len(offset_map.shape) not in (3, 4):
-            msg = (
-                f"Offset map had unexpected shape {offset_map.shape}, "
-                "offset correction will not be applied"
-            )
-            self.set("status", msg)
-            self.log.WARN(msg)
-        else:
-            self.log.INFO(f"Offset map loaded has shape {offset_map.shape}")
-            if len(offset_map.shape) == 4:  # old format (see offsetcorrection_dssc.py)?
-                offset_map = offset_map[..., 0]
-            constant_memory_cells = offset_map.shape[-1]
-            if input_memory_cells > constant_memory_cells:
-                msg = (
-                    f"Warning: Memory cells in input {input_memory_cells} > "
-                    f"memory cells in constant {constant_memory_cells}, some "
-                    "frames may not get correction applied."
-                )
-                self.set("status", msg)
-                self.log.WARN(msg)
-            self._offset_map = offset_map.astype(np.float32)
-            msg = f"Offset map with shape {self._offset_map.shape} ready to load to GPU"
-            self.set("status", msg)
-            self.log.INFO(msg)
-            if constant_memory_cells != self.get("dataFormat.memoryCellsCorrection"):
-                self.log.INFO("Will first have to update buffers on GPU")
-                self.set("dataFormat.memoryCellsCorrection", constant_memory_cells)
+    @staticmethod
+    def add_schema(
+        schema,
+        managed_keys,
+        param_prefix="constantParameters",
+        status_prefix="foundConstants",
+    ):
+        super(DsscCalcatFriend, DsscCalcatFriend).add_schema(
+            schema, managed_keys, "DSSC-Type", param_prefix, status_prefix
+        )
+        (
+            OVERWRITE_ELEMENT(schema)
+            .key(f"{param_prefix}.memoryCells")
+            .setNewDefaultValue(400)
+            .commit(),
 
-        self._update_maps_on_gpu()
+            OVERWRITE_ELEMENT(schema)
+            .key(f"{param_prefix}.biasVoltage")
+            .setNewDefaultValue(100)  # TODO: proper
+            .commit()
+        )
+        (
+            DOUBLE_ELEMENT(schema)
+            .key(f"{param_prefix}.pulseIdChecksum")
+            .assignmentOptional()
+            .defaultValue(2.8866323107820637e-36)
+            .commit(),
 
-    def registerManager(self, instance_id):
-        """A hook from stream.py for Manager devices to register themselves
+            DOUBLE_ELEMENT(schema)
+            .key(f"{param_prefix}.acquisitionRate")
+            .assignmentOptional()
+            .defaultValue(4.5)
+            .commit(),
 
-        instance_id should be the instance id of the manager device. The
-        registration is currently not really used, I think.
+            DOUBLE_ELEMENT(schema)
+            .key(f"{param_prefix}.encodedGain")
+            .assignmentOptional()
+            .defaultValue(67328)
+            .commit(),
+        )
 
-        """
+        calcat_utils.add_status_schema_from_enum(schema, status_prefix, DsscConstants)
 
-        self.managerInstance = instance_id
-        self.log.INFO(f"Registered calibration manager {instance_id}")
+    def dark_condition(self):
+        res = calcat_utils.OperatingConditions()
+        res["Memory cells"] = self._get_param("memoryCells")
+        res["Sensor Bias Voltage"] = self._get_param("biasVoltage")
+        res["Pixels X"] = self._get_param("pixelsX")
+        res["Pixels Y"] = self._get_param("pixelsY")
+        # res["Pulse id checksum"] = self._get_param("pulseIdChecksum")
+        # res["Acquisition rate"] = self._get_param("acquisitionRate")
+        # res["Encoded gain"] = self._get_param("encodedGain")
+        return res
 
-    def _update_output_schema(self, data):
-        """Updates the schema of dataOutput based on parameter data (a Hash)
 
-        This should only be called once: when handling output for the first
-        time, we update the schema to match the modified data we'd send.
+@KARABO_CLASSINFO("DsscCorrection", deviceVersion)
+class DsscCorrection(BaseCorrection):
+    # subclass *must* set these attributes
+    _correction_flag_class = CorrectionFlags
+    _correction_field_names = (("offset", CorrectionFlags.OFFSET),)
+    _kernel_runner_class = DsscGpuRunner
+    _calcat_friend_class = DsscCalcatFriend
+    _constant_enum_class = DsscConstants
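+    # copied so the schema helpers below can add DSSC-specific keys without
+    # mutating the base class' managed key set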
+    _managed_keys = BaseCorrection._managed_keys.copy()
 
-        """
+    @staticmethod
+    def expectedParameters(expected):
+        (
+            OVERWRITE_ELEMENT(expected)
+            .key("dataFormat.memoryCells")
+            .setNewDefaultValue(400)
+            .commit(),
 
-        self.log.INFO("Updating output schema")
-        my_schema_update = Schema()
-        data_schema = hashToSchema.HashToSchema(data).schema
+            OVERWRITE_ELEMENT(expected)
+            .key("preview.selectionMode")
+            .setNewDefaultValue("pulse")
+            .commit(),
+        )
+        DsscCalcatFriend.add_schema(expected, DsscCorrection._managed_keys)
+        add_correction_step_schema(
+            expected,
+            DsscCorrection._managed_keys,
+            DsscCorrection._correction_field_names,
+        )
         (
-            OUTPUT_CHANNEL(my_schema_update)
-            .key("dataOutput")
-            .dataSchema(data_schema)
+            VECTOR_STRING_ELEMENT(expected)
+            .key("managedKeys")
+            .assignmentOptional()
+            .defaultValue(list(DsscCorrection._managed_keys))
             .commit()
         )
-        self.updateSchema(my_schema_update)
-        self._has_set_output_schema = True
-
-    def _update_pulse_filter(self, filter_string):
-        """Called whenever the pulse filter changes, typically followed by
-        _update_shapes"""
-
-        if filter_string.strip() == "":
-            new_filter = np.arange(self.get("dataFormat.memoryCells"), dtype=np.uint16)
-        else:
-            new_filter = np.array(eval(filter_string), dtype=np.uint16)
-        assert np.max(new_filter) < self.get("dataFormat.memoryCells")
-        self.pulse_filter = new_filter
-
-    def _update_shapes(
-        self, pixels_x, pixels_y, memory_cells, pulse_filter, output_transpose
-    ):
-        """(Re)initialize (GPU) buffers according to expected data shapes"""
 
-        input_data_shape = (memory_cells, 1, pixels_y, pixels_x)
-        # reflect the axis reordering in the expected output shape
-        output_data_shape = utils.shape_after_transpose(
-            input_data_shape, output_transpose
+    @property
+    def input_data_shape(self):
+        return (
+            self.get("dataFormat.memoryCells"),
+            1,
+            self.get("dataFormat.pixelsY"),
+            self.get("dataFormat.pixelsX"),
         )
-        self.set("dataFormat.inputDataShape", list(input_data_shape))
-        self.set("dataFormat.outputDataShape", list(output_data_shape))
-
-        if self._shmem_buffer is None:
-            shmem_buffer_name = self.getInstanceId() + ":dataOutput"
-            memory_budget = self.get("outputShmemBufferSize") * 2 ** 30
-            self.log.INFO(f"Opening new shmem buffer: {shmem_buffer_name}")
-            self._shmem_buffer = shmem_utils.ShmemCircularBuffer(
-                memory_budget,
-                output_data_shape,
-                self.output_data_dtype,
-                shmem_buffer_name,
-            )
-        else:
-            self._shmem_buffer.change_shape(output_data_shape)
 
-        self.gpu_runner = DsscGpuRunner(
-            pixels_x,
-            pixels_y,
-            memory_cells,
-            output_transpose=output_transpose,
-            input_data_dtype=self.input_data_dtype,
-            output_data_dtype=self.output_data_dtype,
-        )
-
-        self._update_maps_on_gpu()
-
-    def _update_maps_on_gpu(self):
-        """Updates the correction maps stored on GPU based on constants known
-
-        This only does something useful if constants have been retrieved from
-        CalCat.  Should be called automatically upon retrieval and after
-        changing the data shape.
-
-        """
-
-        self.set("status", "Updating constants on GPU using known constants")
-        self.updateState(State.CHANGING)
-        if self._offset_map is not None:
-            self.gpu_runner.load_constants(self._offset_map)
-            msg = "Done transferring known constant(s) to GPU"
-            self.log.INFO(msg)
-            self.set("status", msg)
-
-        self.updateState(State.ON)
-
-    def _reset_state_from_processing(self):
-        if self.get("state") is State.PROCESSING:
-            self.updateState(State.ON)
-            self._state_reset_timer = None
+    def process_data(
+        self,
+        data_hash,
+        metadata,
+        source,
+        train_id,
+        image_data,
+        cell_table,
+        do_generate_preview,
+    ):
+        pulse_table = np.squeeze(data_hash.get("image.pulseId"))
+        if self._frame_filter is not None:
+            try:
+                cell_table = cell_table[self._frame_filter]
+                pulse_table = pulse_table[self._frame_filter]
+                image_data = image_data[self._frame_filter]
+            except IndexError:
+                self.log_status_warn(
+                    "Failed to apply frame filter, please check that it is valid!"
+                )
+                return
 
-    def _update_actual_rate(self):
-        if not self.get("state") is State.PROCESSING:
-            self._rate_update_timer.delay()
+        try:
+            self.kernel_runner.load_data(image_data)
+        except ValueError as e:
+            self.log_status_warn(f"Failed to load data: {e}")
             return
-        self._buffered_status_update.set("performance.rate", self._rate_tracker.rate())
-        last_processing = self._buffered_status_update.get(
-            "performance.lastProcessingDuration"
+        except Exception as e:
+            self.log_status_warn(f"Unknown exception when loading data to GPU: {e}")
+
+        buffer_handle, buffer_array = self._shmem_buffer.next_slot()
+        self.kernel_runner.load_cell_table(cell_table)
+        self.kernel_runner.correct(self._correction_flag_enabled)
+        self.kernel_runner.reshape(
+            output_order=self.unsafe_get("dataFormat.outputAxisOrder"),
+            out=buffer_array,
         )
-        if last_processing > 0:
-            theoretical_rate = 1000 / last_processing
-            self._buffered_status_update.set(
-                "performance.theoreticalRate", theoretical_rate
+        if do_generate_preview:
+            if self._correction_flag_enabled != self._correction_flag_preview:
+                self.kernel_runner.correct(self._correction_flag_preview)
+            (
+                preview_slice_index,
+                preview_cell,
+                preview_pulse,
+            ) = utils.pick_frame_index(
+                self.unsafe_get("preview.selectionMode"),
+                self.unsafe_get("preview.index"),
+                cell_table,
+                pulse_table,
+                warn_func=self.log_status_warn,
             )
-        self.set(self._buffered_status_update)
-        self._rate_update_timer.delay()
+            preview_raw, preview_corrected = self.kernel_runner.compute_previews(
+                preview_slice_index,
+            )
+
+        data_hash.set(self._image_data_path, buffer_handle)
+        data_hash.set(self._cell_table_path, cell_table[:, np.newaxis])
+        data_hash.set("image.pulseId", pulse_table[:, np.newaxis])
+        data_hash.set("calngShmemPaths", [self._image_data_path])
+        self._write_output(data_hash, metadata)
+        if do_generate_preview:
+            self._write_combiner_previews(
+                (
+                    ("preview.outputRaw", preview_raw),
+                    ("preview.outputCorrected", preview_corrected),
+                ),
+                train_id,
+                source,
+            )
+
+    def _load_constant_to_runner(self, constant, constant_data):
+        assert constant is DsscConstants.Offset
+        self.kernel_runner.load_offset_map(constant_data)
+        if not self.get("corrections.offset.available"):
+            self.set("corrections.offset.available", True)
+
+        self._update_correction_flags()
+        self.log_status_info(f"Done loading {constant.name} to GPU")
diff --git a/src/calng/JungfrauCorrection.py b/src/calng/JungfrauCorrection.py
new file mode 100644
index 0000000000000000000000000000000000000000..e78d671ba8c5919523b0286fc86d9386132c652e
--- /dev/null
+++ b/src/calng/JungfrauCorrection.py
@@ -0,0 +1,417 @@
+import enum
+
+import cupy
+import numpy as np
+from karabo.bound import (
+    DOUBLE_ELEMENT,
+    KARABO_CLASSINFO,
+    OUTPUT_CHANNEL,
+    OVERWRITE_ELEMENT,
+    STRING_ELEMENT,
+    VECTOR_STRING_ELEMENT,
+)
+
+from . import base_gpu, calcat_utils, utils
+from ._version import version as deviceVersion
+from .base_correction import BaseCorrection, add_correction_step_schema, preview_schema
+
+
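+# JUNGFRAU data carries no pulse table; this stand-in lets the generic
+# frame-picking logic (utils.pick_frame_index) run unchanged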
+_pretend_pulse_table = np.arange(16, dtype=np.uint8)
+
+
+class JungfrauConstants(enum.Enum):
+    Offset10Hz = enum.auto()
+    BadPixelsDark10Hz = enum.auto()
+    BadPixelsFF10Hz = enum.auto()
+    RelativeGain10Hz = enum.auto()
+
+
+class CorrectionFlags(enum.IntFlag):
+    NONE = 0
+    OFFSET = 1
+    REL_GAIN = 2
+    BPMASK = 4
+
+
+class JungfrauGpuRunner(base_gpu.BaseGpuRunner):
+    _kernel_source_filename = "jungfrau_gpu.cu"
+    _corrected_axis_order = "cyx"
+
+    def __init__(
+        self,
+        pixels_x,
+        pixels_y,
+        memory_cells,
+        constant_memory_cells,
+        input_data_dtype=cupy.uint16,
+        output_data_dtype=cupy.float32,
+        bad_pixel_mask_value=cupy.nan,
+    ):
+        self.input_shape = (memory_cells, pixels_y, pixels_x)
+        self.processed_shape = self.input_shape
+        super().__init__(
+            pixels_x,
+            pixels_y,
+            memory_cells,
+            constant_memory_cells,
+            input_data_dtype,
+            output_data_dtype,
+        )
+        # TODO: avoid superclass creating cell table with wrong dtype first
+        self.cell_table_gpu = cupy.empty(self.memory_cells, dtype=cupy.uint8)
+        self.input_gain_map_gpu = cupy.empty(self.input_shape, dtype=cupy.uint8)
+        self.preview_buffer_getters.append(self._get_gain_map_for_preview)
+        self.map_shape = (self.constant_memory_cells, self.pixels_y, self.pixels_x, 3)
+        self.offset_map_gpu = cupy.zeros(self.map_shape, dtype=cupy.float32)
+        self.rel_gain_map_gpu = cupy.ones(self.map_shape, dtype=cupy.float32)
+        self.bad_pixel_map_gpu = cupy.zeros(self.map_shape, dtype=cupy.uint32)
+        self.bad_pixel_mask_value = bad_pixel_mask_value
+
+        self.update_block_size((1, 1, 64))
+
+    def _init_kernels(self):
+        kernel_source = self._kernel_template.render(
+            {
+                "pixels_x": self.pixels_x,
+                "pixels_y": self.pixels_y,
+                "data_memory_cells": self.memory_cells,
+                "constant_memory_cells": self.constant_memory_cells,
+                "input_data_dtype": utils.np_dtype_to_c_type(self.input_data_dtype),
+                "output_data_dtype": utils.np_dtype_to_c_type(self.output_data_dtype),
+                "corr_enum": utils.enum_to_c_template(CorrectionFlags),
+                "burst_mode": self.burst_mode,
+            }
+        )
+        self.source_module = cupy.RawModule(code=kernel_source)
+        self.correction_kernel = self.source_module.get_function("correct")
+
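+    # with a single memory cell the cell table is meaningless and skipped;
+    # more cells means the detector is running in burst mode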
+    @property
+    def burst_mode(self):
+        return self.memory_cells > 1
+
+    def _get_raw_for_preview(self):
+        return self.input_data_gpu
+
+    def _get_corrected_for_preview(self):
+        return self.processed_data_gpu
+
+    def _get_gain_map_for_preview(self):
+        return self.input_gain_map_gpu
+
+    def load_data(self, image_data, input_gain_map, cell_table):
+        """Experiment: loading all three in one function as they are tied"""
+        self.input_data_gpu.set(image_data)
+        self.input_gain_map_gpu.set(input_gain_map)
+        if self.burst_mode:
+            self.cell_table_gpu.set(cell_table)
+
+    def flush_buffers(self):
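+        # reset all constant maps to neutral values: zero offset, unit gain,
+        # and an empty bad pixel mask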
+        self.offset_map_gpu.fill(0)
+        self.rel_gain_map_gpu.fill(1)
+        self.bad_pixel_map_gpu.fill(0)
+
+    def correct(self, flags):
+        self.correction_kernel(
+            self.full_grid,
+            self.full_block,
+            (
+                self.input_data_gpu,
+                self.input_gain_map_gpu,
+                self.cell_table_gpu,
+                cupy.uint8(flags),
+                self.offset_map_gpu,
+                self.rel_gain_map_gpu,
+                self.bad_pixel_map_gpu,
+                self.bad_pixel_mask_value,
+                self.processed_data_gpu,
+            )
+        )
+
+
+class JungfrauCalcatFriend(calcat_utils.BaseCalcatFriend):
+    _constant_enum_class = JungfrauConstants
+
+    def __init__(self, device, *args, **kwargs):
+        super().__init__(device, *args, **kwargs)
+        self._constants_need_conditions = {
+            JungfrauConstants.Offset10Hz: self.dark_condition,
+            JungfrauConstants.BadPixelsDark10Hz: self.dark_condition,
+            JungfrauConstants.BadPixelsFF10Hz: self.dark_condition,
+            JungfrauConstants.RelativeGain10Hz: self.dark_condition,
+        }
+
+    @staticmethod
+    def add_schema(
+        schema,
+        managed_keys,
+        param_prefix="constantParameters",
+        status_prefix="foundConstants",
+    ):
+        super(JungfrauCalcatFriend, JungfrauCalcatFriend).add_schema(
+            schema, managed_keys, "jungfrau-Type", param_prefix, status_prefix
+        )
+
+        # set some defaults for common parameters
+        (
+            OVERWRITE_ELEMENT(schema)
+            .key(f"{param_prefix}.pixelsX")
+            .setNewDefaultValue(1024)
+            .commit(),
+
+            OVERWRITE_ELEMENT(schema)
+            .key(f"{param_prefix}.pixelsY")
+            .setNewDefaultValue(512)
+            .commit(),
+
+            OVERWRITE_ELEMENT(schema)
+            .key(f"{param_prefix}.memoryCells")
+            .setNewDefaultValue(1)
+            .commit(),
+
+            OVERWRITE_ELEMENT(schema)
+            .key(f"{param_prefix}.biasVoltage")
+            .setNewDefaultValue(90)
+            .commit(),
+        )
+
+        # add extra parameters
+        (
+            DOUBLE_ELEMENT(schema)
+            .key(f"{param_prefix}.integrationTime")
+            .displayedName("Integration time")
+            .description("Integration time in ms")
+            .assignmentOptional()
+            .defaultValue(350)
+            .reconfigurable()
+            .commit(),
+
+            DOUBLE_ELEMENT(schema)
+            .key(f"{param_prefix}.sensorTemperature")
+            .displayedName("Sensor temperature")
+            .description("Sensor temperature in K")
+            .assignmentOptional()
+            .defaultValue(291)
+            .reconfigurable()
+            .commit(),
+
+            DOUBLE_ELEMENT(schema)
+            .key(f"{param_prefix}.gainSetting")
+            .displayedName("Gain setting")
+            .description("Feedback capacitor setting; 0 is default, 1 is HG0")
+            .assignmentOptional()
+            .defaultValue(0)
+            .reconfigurable()
+            .commit(),
+
+            STRING_ELEMENT(schema)
+            .key(f"{param_prefix}.gainMode")
+            .displayedName("Gain mode")
+            .description(
+                "Detector may be operating in one of several gain modes. For this "
+                "device to query appropriate constants, it is sufficient to know "
+                "whether gain mode is dynamic or fixed."
+            )
+            .assignmentOptional()
+            .defaultValue("dynamicgain")
+            .options("dynamicgain,fixedgain")
+            .commit(),
+        )
+        managed_keys.add(f"{param_prefix}.integrationTime")
+        managed_keys.add(f"{param_prefix}.sensorTemperature")
+        managed_keys.add(f"{param_prefix}.gainSetting")
+        managed_keys.add(f"{param_prefix}.gainMode")
+
+        calcat_utils.add_status_schema_from_enum(
+            schema, status_prefix, JungfrauConstants
+        )
+
+    def dark_condition(self):
+        res = calcat_utils.OperatingConditions()
+        res["Memory cells"] = self._get_param("memoryCells")
+        res["Sensor Bias Voltage"] = self._get_param("biasVoltage")
+        res["Pixels X"] = self._get_param("pixelsX")
+        res["Pixels Y"] = self._get_param("pixelsY")
+        res["Integration Time"] = self._get_param("integrationTime")
+        res["Sensor Temperature"] = self._get_param("sensorTemperature")
+        res["Gain Setting"] = self._get_param("gainSetting")
+        gain_mode = self._get_param("gainMode")
+        if gain_mode != "dynamicgain":
+            # NOTE: include this unconditionally once CalCat knows the parameter
+            res["Gain mode"] = 1
+        return res
+
+
+@KARABO_CLASSINFO("JungfrauCorrection", deviceVersion)
+class JungfrauCorrection(BaseCorrection):
+    _correction_flag_class = CorrectionFlags
+    _correction_field_names = (
+        ("offset", CorrectionFlags.OFFSET),
+        ("relGain", CorrectionFlags.REL_GAIN),
+        ("badPixels", CorrectionFlags.BPMASK),
+    )
+    _kernel_runner_class = JungfrauGpuRunner
+    _calcat_friend_class = JungfrauCalcatFriend
+    _constant_enum_class = JungfrauConstants
+    _managed_keys = BaseCorrection._managed_keys.copy()
+    _image_data_path = "data.adc"
+    _cell_table_path = "data.memoryCell"
+
+    @staticmethod
+    def expectedParameters(expected):
+        super(JungfrauCorrection, JungfrauCorrection).expectedParameters(expected)
+        (
+            OVERWRITE_ELEMENT(expected)
+            .key("dataFormat.pixelsX")
+            .setNewDefaultValue(1024)
+            .commit(),
+
+            OVERWRITE_ELEMENT(expected)
+            .key("dataFormat.pixelsY")
+            .setNewDefaultValue(512)
+            .commit(),
+
+            OVERWRITE_ELEMENT(expected)
+            .key("dataFormat.memoryCells")
+            .setNewDefaultValue(1)
+            .commit(),
+
+            OVERWRITE_ELEMENT(expected)
+            .key("preview.selectionMode")
+            .setNewDefaultValue("frame")
+            .commit(),
+        )
+
+        (
+            OUTPUT_CHANNEL(expected)
+            .key("preview.outputGainMap")
+            .dataSchema(preview_schema)
+            .commit(),
+        )
+
+        JungfrauCalcatFriend.add_schema(expected, JungfrauCorrection._managed_keys)
+        add_correction_step_schema(
+            expected,
+            JungfrauCorrection._managed_keys,
+            JungfrauCorrection._correction_field_names,
+        )
+
+        # mandatory: manager needs this in schema
+        (
+            VECTOR_STRING_ELEMENT(expected)
+            .key("managedKeys")
+            .assignmentOptional()
+            .defaultValue(list(JungfrauCorrection._managed_keys))
+            .commit()
+        )
+
+    @property
+    def input_data_shape(self):
+        return (
+            self.unsafe_get("dataFormat.memoryCells"),
+            self.unsafe_get("dataFormat.pixelsY"),
+            self.unsafe_get("dataFormat.pixelsX"),
+        )
+
+    def __init__(self, config):
+        super().__init__(config)
+        # TODO: gain mode as constant parameter and / or device configuration
+
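+        # maskingValue arrives as a string; anything that does not parse as a
+        # float32 falls back to NaN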
+        try:
+            self.bad_pixel_mask_value = np.float32(
+                config.get("corrections.badPixels.maskingValue")
+            )
+        except ValueError:
+            self.bad_pixel_mask_value = np.float32("nan")
+
+        self._kernel_runner_init_args = {
+            "bad_pixel_mask_value": self.bad_pixel_mask_value,
+        }
+
+    def process_data(
+        self,
+        data_hash,
+        metadata,
+        source,
+        train_id,
+        image_data,
+        cell_table,
+        do_generate_preview,
+    ):
+        if len(cell_table.shape) == 0:
+            cell_table = cell_table[np.newaxis]
+        try:
+            self.kernel_runner.load_data(
+                image_data, data_hash.get("data.gain"), cell_table
+            )
+        except ValueError as e:
+            self.log_status_warn(f"Failed to load data: {e}")
+            return
+        except Exception as e:
+            self.log_status_warn(f"Unknown exception when loading data to GPU: {e}")
+
+        buffer_handle, buffer_array = self._shmem_buffer.next_slot()
+        self.kernel_runner.correct(self._correction_flag_enabled)
+        self.kernel_runner.reshape(
+            output_order=self.unsafe_get("dataFormat.outputAxisOrder"),
+            out=buffer_array,
+        )
+
+        if do_generate_preview:
+            if self._correction_flag_enabled != self._correction_flag_preview:
+                self.kernel_runner.correct(self._correction_flag_preview)
+            (
+                preview_slice_index,
+                preview_cell,
+                preview_pulse,
+            ) = utils.pick_frame_index(
+                self.unsafe_get("preview.selectionMode"),
+                self.unsafe_get("preview.index"),
+                cell_table,
+                _pretend_pulse_table,
+                warn_func=self.log_status_warn,
+            )
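+            # three previews: the runner registered a gain map getter in
+            # addition to the raw and corrected ones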
+            (
+                preview_raw,
+                preview_corrected,
+                preview_gain_map
+            ) = self.kernel_runner.compute_previews(preview_slice_index)
+
+        # reusing input data hash for sending
+        data_hash.set(self._image_data_path, buffer_handle)
+        data_hash.set("calngShmemPaths", [self._image_data_path])
+
+        self._write_output(data_hash, metadata)
+
+        if do_generate_preview:
+            self._write_combiner_previews(
+                (
+                    ("preview.outputRaw", preview_raw),
+                    ("preview.outputCorrected", preview_corrected),
+                    ("preview.outputGainMap", preview_gain_map),
+                ),
+                train_id,
+                source,
+            )
+
+    def _load_constant_to_runner(self, constant, constant_data):
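+        # constants may arrive as (x, y, cell, gain) or (y, x, cell, gain);
+        # transpose either way to the runner's (cell, y, x, gain) map layout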
+        if constant_data.shape[0] == self.get("dataFormat.pixelsX"):
+            constant_data = np.transpose(constant_data, (2, 1, 0, 3))
+        else:
+            constant_data = np.transpose(constant_data, (2, 0, 1, 3))
+        if constant is JungfrauConstants.Offset10Hz:
+            self.kernel_runner.offset_map_gpu.set(constant_data.astype(np.float32))
+            if not self.get("corrections.offset.available"):
+                self.set("corrections.offset.available", True)
+        elif constant is JungfrauConstants.RelativeGain10Hz:
+            self.kernel_runner.rel_gain_map_gpu.set(constant_data.astype(np.float32))
+            if not self.get("corrections.relGain.available"):
+                self.set("corrections.relGain.available", True)
+        elif constant in (
+                JungfrauConstants.BadPixelsDark10Hz, JungfrauConstants.BadPixelsFF10Hz
+        ):
+            self.kernel_runner.bad_pixel_map_gpu |= cupy.asarray(constant_data)
+            if not self.get("corrections.badPixels.available"):
+                self.set("corrections.badPixels.available", True)
+
+        self._update_correction_flags()
+        self.log_status_info(f"Done loading {constant.name} to GPU")
diff --git a/src/calng/ManualAgipdGeometry.py b/src/calng/ManualAgipdGeometry.py
new file mode 100644
index 0000000000000000000000000000000000000000..20428d6bdbfe38709305c6fc90820445e25e14f2
--- /dev/null
+++ b/src/calng/ManualAgipdGeometry.py
@@ -0,0 +1,26 @@
+import extra_geom
+from karabo.bound import KARABO_CLASSINFO
+
+from ._version import version as deviceVersion
+from .manual_geometry_base import ManualQuadrantsGeometryBase
+
+
+@KARABO_CLASSINFO("ManualAgipdGeometry", deviceVersion)
+class ManualAgipdGeometry(ManualQuadrantsGeometryBase):
+    geometry_class = extra_geom.AGIPD_1MGeometry
+
+    @staticmethod
+    def expectedParameters(expected):
+        super(ManualAgipdGeometry, ManualAgipdGeometry).expectedParameters(expected)
+
+        expected.setDefaultValue("quadrantCorners.Q1.x", -525)
+        expected.setDefaultValue("quadrantCorners.Q1.y", 625)
+
+        expected.setDefaultValue("quadrantCorners.Q2.x", -550)
+        expected.setDefaultValue("quadrantCorners.Q2.y", -10)
+
+        expected.setDefaultValue("quadrantCorners.Q3.x", 520)
+        expected.setDefaultValue("quadrantCorners.Q3.y", -160)
+
+        expected.setDefaultValue("quadrantCorners.Q4.x", 542.5)
+        expected.setDefaultValue("quadrantCorners.Q4.y", 475)
diff --git a/src/calng/ManualDsscGeometry.py b/src/calng/ManualDsscGeometry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b3d9b2e4d2394bf67b78c7fed2cc084d384612f
--- /dev/null
+++ b/src/calng/ManualDsscGeometry.py
@@ -0,0 +1,26 @@
+import extra_geom
+from karabo.bound import KARABO_CLASSINFO
+
+from ._version import version as deviceVersion
+from .manual_geometry_base import ManualQuadrantsGeometryBase
+
+
+@KARABO_CLASSINFO("ManualDsscGeometry", deviceVersion)
+class ManualDsscGeometry(ManualQuadrantsGeometryBase):
+    geometry_class = extra_geom.DSSC_1MGeometry
+
+    @staticmethod
+    def expectedParameters(expected):
+        super(ManualDsscGeometry, ManualDsscGeometry).expectedParameters(expected)
+
+        expected.setDefaultValue("quadrantCorners.Q1.x", -130)
+        expected.setDefaultValue("quadrantCorners.Q1.y", 5)
+
+        expected.setDefaultValue("quadrantCorners.Q2.x", -130)
+        expected.setDefaultValue("quadrantCorners.Q2.y", -125)
+
+        expected.setDefaultValue("quadrantCorners.Q3.x", 5)
+        expected.setDefaultValue("quadrantCorners.Q3.y", -125)
+
+        expected.setDefaultValue("quadrantCorners.Q4.x", 5)
+        expected.setDefaultValue("quadrantCorners.Q4.y", 5)
diff --git a/src/calng/ManualJungfrauGeometry.py b/src/calng/ManualJungfrauGeometry.py
new file mode 100644
index 0000000000000000000000000000000000000000..57f7da914e77cb7aee28ffd078fe64c13231c259
--- /dev/null
+++ b/src/calng/ManualJungfrauGeometry.py
@@ -0,0 +1,28 @@
+import extra_geom
+from karabo.bound import KARABO_CLASSINFO, OVERWRITE_ELEMENT, Hash
+
+from ._version import version as deviceVersion
+from .manual_geometry_base import ManualModulesGeometryBase
+
+
+@KARABO_CLASSINFO("ManualJungfrauGeometry", deviceVersion)
+class ManualJungfrauGeometry(ManualModulesGeometryBase):
+    geometry_class = extra_geom.JUNGFRAUGeometry
+
+    @staticmethod
+    def expectedParameters(expected):
+        # TODO: come up with some sweet defaults (these two modules are from the 4M example in the docs)
+        (
+            OVERWRITE_ELEMENT(expected)
+            .key("modules")
+            .setNewDefaultValue(
+                [
+                    Hash(
+                        "posX", 95, "posY", 564, "orientationX", -1, "orientationY", -1
+                    ),
+                    Hash(
+                        "posX", 95, "posY", 17, "orientationX", -1, "orientationY", -1
+                    ),
+                ]
+            )
+            .commit()
+        )
diff --git a/src/calng/ModuleStacker.py b/src/calng/ModuleStacker.py
index 63842cb5f65ead76bb48f0f35048da615637eb6d..95abe95ee79f05007fffe98bb4c73b1fad6c6f08 100644
--- a/src/calng/ModuleStacker.py
+++ b/src/calng/ModuleStacker.py
@@ -1,10 +1,10 @@
 import numpy as np
 from karabo.bound import (
-    ChannelMetaData,
     FLOAT_ELEMENT,
     KARABO_CLASSINFO,
     NODE_ELEMENT,
     STRING_ELEMENT,
+    ChannelMetaData,
     Epochstamp,
     Hash,
     MetricPrefix,
@@ -31,7 +31,6 @@ class ModuleStacker(TrainMatcher.TrainMatcher):
 
     @staticmethod
     def expectedParameters(expected):
-        super(ModuleStacker, ModuleStacker).expectedParameters(expected)
         (
             FLOAT_ELEMENT(expected)
             .key("timeOfFlight")
@@ -45,6 +44,7 @@ class ModuleStacker(TrainMatcher.TrainMatcher):
             .metricPrefix(MetricPrefix.MILLI)
             .readOnly()
             .commit(),
+
             STRING_ELEMENT(expected)
             .key("pathToStack")
             .displayedName("Data path to stack")
@@ -79,8 +79,11 @@ class ModuleStacker(TrainMatcher.TrainMatcher):
             "These nodes are not used."
         )
         schema = Schema()
-        NODE_ELEMENT(schema).key("start").description(desc).commit()
-        NODE_ELEMENT(schema).key("stop").description(desc).commit()
+        (
+            NODE_ELEMENT(schema).key("start").description(desc).commit(),
+
+            NODE_ELEMENT(schema).key("stop").description(desc).commit(),
+        )
         self.path_to_stack = self.get("pathToStack")
         self.updateSchema(schema)
 
@@ -133,9 +136,6 @@ class ModuleStacker(TrainMatcher.TrainMatcher):
         out_hash[self.path_to_stack] = stacked_data
         out_hash["sources"] = stacked_sources
         out_hash["modulesPresent"] = stacked_present
-        if not out_hash.has("image.passport"):
-            out_hash.set("image.passport", [])
-        out_hash["image.passport"].append(self.getInstanceId())
         channel = self.signalSlotable.getOutputChannel("output")
         channel.write(out_hash, ChannelMetaData(self.getInstanceId(), timestamp))
         channel.update()
diff --git a/src/calng/ShmemToZMQ.py b/src/calng/ShmemToZMQ.py
index 1fcdaa4f09ea414c77ba37a0c7553db768dfbed6..1e0af8e14f5b3b71f1fa29db231e95370ac714a0 100644
--- a/src/calng/ShmemToZMQ.py
+++ b/src/calng/ShmemToZMQ.py
@@ -1,11 +1,9 @@
-import threading
 from time import time
 
 from karabo.bound import KARABO_CLASSINFO
 from PipeToZeroMQ import PipeToZeroMQ, conversion, device_schema
 
 from . import shmem_utils
-
 from ._version import version as deviceVersion
 
 
@@ -13,32 +11,7 @@ from ._version import version as deviceVersion
 class ShmemToZMQ(PipeToZeroMQ.PipeToZeroMQ):
     def initialization(self):
         super().initialization()
-        self._source_to_shmem_ary = {}
-        self._source_to_shmem_mem = {}
-        self._buffer_lock = threading.Lock()
-
-    def _get_shmem_buffer_data(self, source, shmem_handle):
-        # TODO: handle failure if this was not a shmem handle
-        name, dtype, shape, index = shmem_utils.parse_shmem_handle(shmem_handle)
-        with self._buffer_lock:
-            # may have to open shared memory buffer
-            if source not in self._source_to_shmem_ary:
-                self.log.INFO(f"Opening buffer {name} for source {source}")
-                try:
-                    mem, ary = shmem_utils.open_shmem_from_handle(shmem_handle)
-                except OSError:
-                    self.log.WARN(f"Failed to open buffer {name}")
-                    return None
-                self._source_to_shmem_mem[source] = mem
-                self._source_to_shmem_ary[source] = ary
-            elif self._source_to_shmem_ary[source].shape != shape:
-                self.log.INFO(f"Updating buffer shape for {source} to {shape}")
-                self._source_to_shmem_ary[source] = self._source_to_shmem_mem[
-                    source
-                ].ndarray(shape=shape, dtype=dtype)
-
-            # grab data from shared memory buffer
-            return self._source_to_shmem_ary[source][index]
+        self._shmem_handler = shmem_utils.ShmemCircularBufferReceiver()
 
     def onInput(self, input_channel):
         actual = self.getActualTimestamp()
@@ -80,7 +53,12 @@ class ShmemToZMQ(PipeToZeroMQ.PipeToZeroMQ):
                         f"Hash from {source} did not have {shmem_handle_path}"
                     )
                     continue
-                actual_data = self._get_shmem_buffer_data(source, shmem_handle)
+                elif shmem_handle_path == "":
+                    self.log.INFO(
+                        f"Hash from {source} had empty {shmem_handle_path}"
+                    )
+                    continue
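+                # a handle encodes buffer name, dtype, shape, and slot index
+                # (cf. shmem_utils.parse_shmem_handle used by the removed code above)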
+                actual_data = self._shmem_handler.get(shmem_handle)
                 arr[shmem_handle_path] = actual_data
 
             data[source] = (dic, arr)
@@ -95,8 +73,3 @@ class ShmemToZMQ(PipeToZeroMQ.PipeToZeroMQ):
         self._updateProperties(output_tic)
         # block if device is in passive state
         self.monitoring.wait()
-
-    def preDestruction(self):
-        for ary in self._source_to_shmem_ary.values():
-            del ary
-        super().preDestruction()
diff --git a/src/calng/SimpleAssembler.py b/src/calng/SimpleAssembler.py
new file mode 100644
index 0000000000000000000000000000000000000000..847265f6a2b40b718afc3132f224824549ffeec5
--- /dev/null
+++ b/src/calng/SimpleAssembler.py
@@ -0,0 +1,279 @@
+import functools
+import pickle
+import re
+import threading
+import time
+
+import numpy as np
+from karabo.bound import (
+    FLOAT_ELEMENT,
+    IMAGEDATA_ELEMENT,
+    INPUT_CHANNEL,
+    KARABO_CLASSINFO,
+    OUTPUT_CHANNEL,
+    OVERWRITE_ELEMENT,
+    STRING_ELEMENT,
+    UINT32_ELEMENT,
+    UINT64_ELEMENT,
+    ChannelMetaData,
+    Dims,
+    Encoding,
+    Epochstamp,
+    Hash,
+    ImageData,
+    MetricPrefix,
+    Schema,
+    Timestamp,
+    Trainstamp,
+    Unit,
+)
+from karabo.common.api import KARABO_SCHEMA_DISPLAY_TYPE_SCENES as DT_SCENES
+from TrainMatcher import TrainMatcher
+from TrainMatcher import scenes as trainmatcher_scenes
+
+from . import scenes
+from ._version import version as deviceVersion
+
+preview_schema = Schema()
+(
+    IMAGEDATA_ELEMENT(preview_schema).key("image").commit(),
+
+    UINT64_ELEMENT(preview_schema).key("trainId").readOnly().commit(),
+)
+
+xtdf_source_re = re.compile(r".*\/DET\/(\d+)CH0:xtdf")
+daq_source_re = re.compile(r".*\/DET\/.*?(\d+):daqOutput")
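+# illustrative matches (hypothetical source names):
+#   "SPB_DET_AGIPD1M-1/DET/11CH0:xtdf"      -> module index 11
+#   "FXE_XAD_JF1M/DET/RECEIVER-3:daqOutput" -> module index 2 (name is 1-based)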
+
+
+# TODO: merge scene with TrainMatcher's nice overview
+@KARABO_CLASSINFO("SimpleAssembler", deviceVersion)
+class SimpleAssembler(TrainMatcher.TrainMatcher):
+    @staticmethod
+    def expectedParameters(expected):
+        (
+            OVERWRITE_ELEMENT(expected)
+            .key("availableScenes")
+            .setNewDefaultValue(["overview", "trainMatcherScene"])
+            .commit(),
+
+            FLOAT_ELEMENT(expected)
+            .key("processingTime")
+            .unit(Unit.SECOND)
+            .metricPrefix(MetricPrefix.MILLI)
+            .readOnly()
+            .initialValue(0)
+            .warnHigh(500)
+            .info("Cannot keep up with GUI limit")
+            .needsAcknowledging(False)
+            .commit(),
+
+            FLOAT_ELEMENT(expected)
+            .key("timeOfFlight")
+            .unit(Unit.SECOND)
+            .metricPrefix(MetricPrefix.MILLI)
+            .readOnly()
+            .initialValue(0)
+            .warnHigh(1000)
+            .info("Time of flight exceeding 1 s")
+            .needsAcknowledging(False)
+            .commit(),
+
+            STRING_ELEMENT(expected)
+            .key("pathToStack")
+            .assignmentOptional()
+            .defaultValue("image.data")
+            .commit(),
+
+            UINT32_ELEMENT(expected)
+            .key("downsamplingFactor")
+            .description(
+                "If greater than 1, the assembled image will be downsampled by this "
+                "factor in x and y dimensions before sending. This is only to save "
+                "bandwidth in case GUI updates start lagging."
+            )
+            .assignmentOptional()
+            .defaultValue(1)
+            .options("1,2,4,8")
+            .reconfigurable()
+            .commit(),
+
+            STRING_ELEMENT(expected)
+            .key("downsamplingFunction")
+            .description("Reduction function used during downsampling.")
+            .assignmentOptional()
+            .defaultValue("nanmax")
+            .options("nanmax,nanmean,nanmin,nanmedian")
+            .reconfigurable()
+            .commit(),
+
+            INPUT_CHANNEL(expected)
+            .key("geometryInput")
+            .displayedName("Geometry input")
+            .commit(),
+
+            OUTPUT_CHANNEL(expected)  # can OVERWRITE_ELEMENT even do this?
+            .key("output")
+            .dataSchema(preview_schema)
+            .commit(),
+        )
+
+    def initialization(self):
+        super().initialization()
+
+        # TODO: match inside device, fill multiple independent buffers
+
+        self._path_to_stack = self.get("pathToStack")
+        self.geometry = None
+        self.input_buffer = None
+
+        self.KARABO_ON_DATA("geometryInput", self.receive_geometry)
+        self.KARABO_SLOT(self.requestScene)
+
+        self.ask_for_geometry()
+        self.start()
+
+    def requestScene(self, params):
+        # TODO: unify with TrainMatcher overview
+        scene_name = params.get("name", default="")
+        if scene_name == "overview":
+            payload = Hash("name", scene_name, "success", True)
+            payload["data"] = scenes.simple_assembler_overview(
+                device_id=self.getInstanceId(),
+                geometry_device_id=self.get("geometryInput.connectedOutputChannels")[
+                    0
+                ].split(":")[0],
+            )
+            self.reply(
+                Hash(
+                    "type",
+                    "deviceScene",
+                    "origin",
+                    self.getInstanceId(),
+                    "payload",
+                    payload,
+                )
+            )
+        elif scene_name == "trainMatcherScene":
+            params["name"] = "scene"
+            return super().requestScene(params)
+
+    def receive_geometry(self, data, metadata):
+        self.log.INFO("Received a new geometry")
+        self.geometry = pickle.loads(data.get("pickledGeometry"))
+        # TODO: allow multiple memory cells (extra geom notion of extra dimensions)
+        self.input_buffer = np.zeros(self.geometry.expected_data_shape)
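+        # the unpickled object is assumed to be an extra_geom geometry, whose
+        # expected_data_shape is (n_modules, slow_scan, fast_scan)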
+
+    def ask_for_geometry(self):
+        def runner():
+            self.log.INFO("Will ask around for a geometry")
+            max_tries = 10
+            for i in range(max_tries):
+                time.sleep(np.random.random() * 10)
+                if self.geometry is None:
+                    missing_connections = set(
+                        self.get("geometryInput.missingConnections")
+                    )
+                    # note: connectedOutputChannels not necessarily connected...
+                    geometry_device_list = [
+                        channel
+                        for channel in self.get("geometryInput.connectedOutputChannels")
+                        if channel not in missing_connections
+                    ]
+                    if not geometry_device_list:
+                        self.log.INFO("No geometry device connected")
+                        continue
+                    geometry_device = geometry_device_list[0].split(":")[0]
+                    self.log.INFO(f"Asking {geometry_device} for a geometry")
+                    self.signalSlotable.call(geometry_device, "pleaseSendYourGeometry")
+                    time.sleep(1)
+
+                if self.geometry is not None:
+                    return
+            self.log.INFO(f"Failed to get geometry in {max_tries} tries, need help")
+        threading.Thread(target=runner, daemon=True).start()
+
+    def _send(self, train_id, sources):
+        # TODO: adapt to appropriate hook for new TrainMatcher (no _send)
+        if self.geometry is None:
+            self.log.WARN("Have not received a geometry yet")
+            return
+
+        timestamp = Timestamp(Epochstamp(), Trainstamp(train_id))
+
+        module_indices_unfilled = set(range(self.input_buffer.shape[0]))
+        for source, (data, metadata) in sources.items():
+            # TODO: handle failure to "parse" source, get data out
+            module_index = self._source_to_index(source)
+            self.input_buffer[module_index] = np.squeeze(data.get(self._path_to_stack))
+            module_indices_unfilled.discard(module_index)
+
+        for unfilled_module in module_indices_unfilled:
+            self.input_buffer[unfilled_module].fill(0)
+            # TODO: configurable treatment of missing modules
+
+        # TODO: reusable output buffer to save on allocation
+        assembled, _ = self.geometry.position_modules_fast(self.input_buffer)
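+        # extra_geom leaves pixels not covered by any module as NaN, hence the
+        # nan-aware reduction functions offered for downsampling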
+
+        downsampling_factor = self.get("downsamplingFactor")
+        if downsampling_factor > 1:
+            assembled = downsample_2d(
+                assembled,
+                downsampling_factor,
+                reduction_fun=getattr(np, self.get("downsamplingFunction"))
+            )
+
+        # TODO: optionally include control data
+        out_hash = Hash(
+            "image",
+            ImageData(
+                # TODO: get around this being mirrored...
+                (assembled[::-1, ::-1]).astype(np.int32),
+                Dims(*assembled.shape),
+                Encoding.GRAY,
+            ),
+            "trainId",
+            train_id,
+        )
+        channel = self.signalSlotable.getOutputChannel("output")
+        channel.write(out_hash, ChannelMetaData(self.getInstanceId(), timestamp))
+        channel.update()
+        self.rate_out.update()
+
+    @functools.lru_cache()
+    def _source_to_index(self, source):
+        # note: cache means warning only shows up once (also not performance-critical)
+        # TODO: allow user to inspect, modify the mapping
+
+        match = xtdf_source_re.match(source)
+        if match is not None:
+            return int(match.group(1))
+
+        match = daq_source_re.match(source)
+        if match is not None:
+            return int(match.group(1)) - 1
+
+        self.log.WARN(f"Couldn't figure out index for source {source}")
+        return 0
+
+
+def downsample_2d(arr, factor, reduction_fun=np.nanmax):
+    """Generalization of downsampling from FemDataAssembler
+
+    Expects first two dimensions of arr to be multiples of factor (a power of
+    two). Useful if you're sitting at home and the ssh connection is too slow
+    for full-resolution previews."""
+
+    # halve both axes log2(factor) times, so the result is smaller by factor
+    for _ in range(int(np.log2(factor))):
+        arr = reduction_fun(
+            (
+                arr[:-1:2],
+                arr[1::2],
+            ), axis=0
+        )
+        arr = reduction_fun(
+            (
+                arr[:, :-1:2],
+                arr[:, 1::2],
+            ), axis=0
+        )
+    return arr
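+
+
+# quick sanity check (illustrative, not executed by the device):
+#   downsample_2d(np.arange(16, dtype=float).reshape(4, 4), 2)
+#   -> array([[ 5.,  7.], [13., 15.]]) via pairwise nanmax along each axis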
diff --git a/src/calng/base_correction.py b/src/calng/base_correction.py
new file mode 100644
index 0000000000000000000000000000000000000000..4876936288d05ffa8b8bb3b84e74d09ec2360edb
--- /dev/null
+++ b/src/calng/base_correction.py
@@ -0,0 +1,1112 @@
+import collections
+import enum
+import pathlib
+import threading
+from timeit import default_timer
+
+import dateutil.parser
+import numpy as np
+from karabo.bound import (
+    BOOL_ELEMENT,
+    DOUBLE_ELEMENT,
+    INPUT_CHANNEL,
+    INT32_ELEMENT,
+    INT64_ELEMENT,
+    KARABO_CLASSINFO,
+    NDARRAY_ELEMENT,
+    NODE_ELEMENT,
+    OUTPUT_CHANNEL,
+    OVERWRITE_ELEMENT,
+    SLOT_ELEMENT,
+    STRING_ELEMENT,
+    UINT32_ELEMENT,
+    UINT64_ELEMENT,
+    VECTOR_STRING_ELEMENT,
+    VECTOR_UINT32_ELEMENT,
+    ChannelMetaData,
+    Epochstamp,
+    Hash,
+    MetricPrefix,
+    PythonDevice,
+    Schema,
+    State,
+    Timestamp,
+    Trainstamp,
+    Unit,
+)
+from karabo.common.api import KARABO_SCHEMA_DISPLAY_TYPE_SCENES as DT_SCENES
+from karabo import version as karaboVersion
+from pkg_resources import parse_version
+
+from . import scenes, shmem_utils, utils
+from ._version import version as deviceVersion
+
+PROCESSING_STATE_TIMEOUT = 10
+
+
+class FramefilterSpecType(enum.Enum):
+    NONE = "none"
+    RANGE = "range"
+    COMMASEPARATED = "commaseparated"
+
+
+preview_schema = Schema()
+(
+    NODE_ELEMENT(preview_schema).key("image").commit(),
+
+    NDARRAY_ELEMENT(preview_schema).key("image.data").dtype("FLOAT").commit(),
+
+    UINT64_ELEMENT(preview_schema)
+    .key("image.trainId")
+    .displayedName("Train ID")
+    .assignmentOptional()
+    .defaultValue(0)
+    .commit(),
+)
+
+# TODO: trim output schema / adapt to specific detectors
+# currently: based on snapshot of actual output reusing AGIPD hash
+output_schema = Schema()
+(
+    NODE_ELEMENT(output_schema).key("image").commit(),
+
+    STRING_ELEMENT(output_schema)
+    .key("image.data")
+    .assignmentOptional()
+    .defaultValue("")
+    .commit(),
+
+    NDARRAY_ELEMENT(output_schema).key("image.length").dtype("UINT32").commit(),
+
+    NDARRAY_ELEMENT(output_schema).key("image.cellId").dtype("UINT16").commit(),
+
+    NDARRAY_ELEMENT(output_schema).key("image.pulseId").dtype("UINT64").commit(),
+
+    NDARRAY_ELEMENT(output_schema).key("image.status").commit(),
+
+    NDARRAY_ELEMENT(output_schema).key("image.trainId").dtype("UINT64").commit(),
+
+    VECTOR_STRING_ELEMENT(output_schema)
+    .key("calngShmemPaths")
+    .assignmentOptional()
+    .defaultValue(["image.data"])
+    .commit(),
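+    # (presumably the paths that ShmemToZMQ dereferences from shared memory)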
+
+    NODE_ELEMENT(output_schema).key("metadata").commit(),
+
+    STRING_ELEMENT(output_schema)
+    .key("metadata.source")
+    .assignmentOptional()
+    .defaultValue("")
+    .commit(),
+
+    NODE_ELEMENT(output_schema).key("metadata.timestamp").commit(),
+
+    INT32_ELEMENT(output_schema)
+    .key("metadata.timestamp.tid")
+    .assignmentOptional()
+    .defaultValue(0)
+    .commit(),
+
+    NODE_ELEMENT(output_schema).key("header").commit(),
+
+    INT32_ELEMENT(output_schema)
+    .key("header.minorTrainFormatVersion")
+    .assignmentOptional()
+    .defaultValue(0)
+    .commit(),
+
+    INT32_ELEMENT(output_schema)
+    .key("header.majorTrainFormatVersion")
+    .assignmentOptional()
+    .defaultValue(0)
+    .commit(),
+
+    INT32_ELEMENT(output_schema)
+    .key("header.trainId")
+    .assignmentOptional()
+    .defaultValue(0)
+    .commit(),
+
+    INT64_ELEMENT(output_schema)
+    .key("header.linkId")
+    .assignmentOptional()
+    .defaultValue(0)
+    .commit(),
+
+    INT64_ELEMENT(output_schema)
+    .key("header.dataId")
+    .assignmentOptional()
+    .defaultValue(0)
+    .commit(),
+
+    INT64_ELEMENT(output_schema)
+    .key("header.pulseCount")
+    .assignmentOptional()
+    .defaultValue(0)
+    .commit(),
+
+    NDARRAY_ELEMENT(output_schema).key("header.reserved").commit(),
+
+    NDARRAY_ELEMENT(output_schema).key("header.magicNumberBegin").commit(),
+
+    NODE_ELEMENT(output_schema).key("detector").commit(),
+
+    INT32_ELEMENT(output_schema)
+    .key("detector.trainId")
+    .assignmentOptional()
+    .defaultValue(0)
+    .commit(),
+
+    NDARRAY_ELEMENT(output_schema).key("detector.data").commit(),
+
+    NODE_ELEMENT(output_schema).key("trailer").commit(),
+
+    NDARRAY_ELEMENT(output_schema).key("trailer.checksum").commit(),
+
+    NDARRAY_ELEMENT(output_schema).key("trailer.magicNumberEnd").commit(),
+
+    INT32_ELEMENT(output_schema)
+    .key("trailer.status")
+    .assignmentOptional()
+    .defaultValue(0)
+    .commit(),
+
+    INT32_ELEMENT(output_schema)
+    .key("trailer.trainId")
+    .assignmentOptional()
+    .defaultValue(0)
+    .commit(),
+)
+
+
+@KARABO_CLASSINFO("BaseCorrection", deviceVersion)
+class BaseCorrection(PythonDevice):
+    _correction_flag_class = None  # subclass must set (ex.: dssc_gpu.CorrectionFlags)
+    _kernel_runner_class = None  # subclass must set (ex.: dssc_gpu.DsscGpuRunner)
+    _kernel_runner_init_args = {}  # optional extra args for runner
+    _managed_keys = {
+        "outputShmemBufferSize",
+        "dataFormat.outputAxisOrder",
+        "dataFormat.outputImageDtype",
+        "dataFormat.overrideInputAxisOrder",
+        "frameFilter.type",
+        "frameFilter.spec",
+        "preview.enable",
+        "preview.index",
+        "preview.selectionMode",
+        "preview.trainIdModulo",
+        "loadMostRecentConstants",
+    }  # subclass can extend this, /must/ put it in schema as managedKeys
+    _image_data_path = "image.data"  # customize for *some* subclasses
+    _cell_table_path = "image.cellId"
+
+    def _load_constant_to_runner(self, constant_name, constant_data):
+        """Subclass must define how to process constants into correction maps and store
+        into appropriate buffers in (GPU or main) memory."""
+        raise NotImplementedError()
+
+    @property
+    def input_data_shape(self):
+        """Subclass must define expected input data shape in terms of dataFormat.{
+        memoryCells,pixelsX,pixelsY} and any other axes."""
+        raise NotImplementedError()
+
+    @property
+    def output_data_shape(self):
+        """Shape of corrected image data sent on dataOutput. Depends on data format
+        parameters pixels x / y, and number of cells (optionally after frame filter)."""
+        axis_lengths = {
+            "x": self.unsafe_get("dataFormat.pixelsX"),
+            "y": self.unsafe_get("dataFormat.pixelsY"),
+            "c": self.unsafe_get("dataFormat.filteredFrames"),
+        }
+        return tuple(
+            axis_lengths[axis]
+            for axis in self.unsafe_get("dataFormat.outputAxisOrder")
+        )
+
+    def process_data(
+        self,
+        data_hash,
+        metadata,
+        source,
+        train_id,
+        image_data,
+        cell_table,
+        do_generate_preview,
+    ):
+        """Subclass must define data processing (presumably using the kernel runner).
+        Will be called by input_handler, which will take care of some common checks and
+        extracting the parameters given to process_data."""
+        raise NotImplementedError()
+
+    @staticmethod
+    def expectedParameters(expected):
+        (
+            OVERWRITE_ELEMENT(expected)
+            .key("state")
+            .setNewDefaultValue(State.INIT)
+            .commit(),
+
+            INPUT_CHANNEL(expected).key("dataInput").commit(),
+
+            OUTPUT_CHANNEL(expected)
+            .key("dataOutput")
+            .dataSchema(output_schema)
+            .commit(),
+
+            VECTOR_STRING_ELEMENT(expected)
+            .key("fastSources")
+            .displayedName("Fast data sources")
+            .description(
+                "Sources to get data from. Only incoming hashes from these sources "
+                "will be processed. This will typically be a single entry of the form: "
+                "'[instrument]_DET_[detector]/DET/[channel]:xtdf'."
+            )
+            .assignmentOptional()
+            .defaultValue([])
+            .commit(),
+
+            NODE_ELEMENT(expected)
+            .key("frameFilter")
+            .displayedName("Frame filter")
+            .description(
+                "The frame filter - if set - slices the input data. Frames not in the "
+                "filter will be discarded before any processing happens and will not "
+                "get to dataOutput or preview. Note that this filter goes by frame "
+                "index rather than cell ID or pulse ID; set accordingly. Handle with "
+                "care - an invalid filter can prevent all processing. How the filter "
+                "is specified depends on frameFilter.type. See frameFilter.current to "
+                "inspect the currently set frame filter array (if any)."
+            )
+            .commit(),
+
+            STRING_ELEMENT(expected)
+            .key("frameFilter.type")
+            .displayedName("Filter definition type")
+            .description(
+                "Controls how frameFilter.spec is used. The default value of 'none' "
+                "means that no filter is set (regardless of frameFilter.spec). "
+                "'arange' allows between one and three integers separated by ',' which "
+                "are parsed and passed directly to numpy.arange. 'commaseparated' "
+                "reads a list of integers separated by commas."
+            )
+            .options(",".join(spectype.value for spectype in FramefilterSpecType))
+            .assignmentOptional()
+            .defaultValue("none")
+            .reconfigurable()
+            .commit(),
+
+            STRING_ELEMENT(expected)
+            .key("frameFilter.spec")
+            .assignmentOptional()
+            .defaultValue("")
+            .reconfigurable()
+            .commit(),
+
+            VECTOR_UINT32_ELEMENT(expected)
+            .key("frameFilter.current")
+            .displayedName("Current filter")
+            .description(
+                "This read-only value is used to display the contents of the current "
+                "frame filter. An empty array means no filtering is done."
+            )
+            .readOnly()
+            .initialValue([])
+            .commit(),
+
+            UINT32_ELEMENT(expected)
+            .key("outputShmemBufferSize")
+            .displayedName("Output buffer size limit")
+            .unit(Unit.BYTE)
+            .metricPrefix(MetricPrefix.GIGA)
+            .description(
+                "Corrected trains are written to shared memory locations. These are "
+                "pre-allocated and re-used (circular buffer). This parameter "
+                "determines how much memory to set aside for that buffer."
+            )
+            .assignmentOptional()
+            .defaultValue(10)
+            .commit(),
+
+            VECTOR_STRING_ELEMENT(expected)
+            .key("availableScenes")
+            .setSpecialDisplayType(DT_SCENES)
+            .readOnly()
+            .initialValue(["overview"])
+            .commit(),
+        )
+
+        (
+            NODE_ELEMENT(expected)
+            .key("dataFormat")
+            .displayedName("Data format (in/out)")
+            .commit(),
+
+            BOOL_ELEMENT(expected)
+            .key("dataFormat.overrideInputAxisOrder")
+            .displayedName("Override input axis order")
+            .description(
+                "The shape of the image data ndarray as received from the "
+                "DataAggregator is sometimes wrong - the axes are actually in a "
+                "different order than the ndarray shape suggests. If this flag is on, "
+                "the shape of the ndarray will be overridden with the axis order which "
+                "was expected."
+            )
+            .assignmentOptional()
+            .defaultValue(True)
+            .reconfigurable()
+            .commit(),
+
+            STRING_ELEMENT(expected)
+            .key("dataFormat.inputImageDtype")
+            .displayedName("Input image data dtype")
+            .description("The (numpy) dtype to expect for incoming image data.")
+            .options("uint16,float32")
+            .assignmentOptional()
+            .defaultValue("uint16")
+            .commit(),
+
+            STRING_ELEMENT(expected)
+            .key("dataFormat.outputImageDtype")
+            .displayedName("Output image data dtype")
+            .description(
+                "The (numpy) dtype to use for outgoing image data. Input is cast to "
+                "float32, corrections are applied, and only then will the result be "
+                "cast to outputImageDtype. Be aware that casting to integer type "
+                "causes truncation rather than rounding."
+            )
+            # TODO: consider adding rounding / binning for integer output
+            .options("float16,float32,uint16")
+            .assignmentOptional()
+            .defaultValue("float32")
+            .commit(),
+
+            # important: determines shape of data as going into correction
+            UINT32_ELEMENT(expected)
+            .key("dataFormat.pixelsX")
+            .displayedName("Pixels x")
+            .description("Number of pixels of image data along X axis")
+            .assignmentOptional()
+            .defaultValue(512)
+            .commit(),
+
+            UINT32_ELEMENT(expected)
+            .key("dataFormat.pixelsY")
+            .displayedName("Pixels y")
+            .description("Number of pixels of image data along Y axis")
+            .assignmentOptional()
+            .defaultValue(128)
+            .commit(),
+
+            UINT32_ELEMENT(expected)
+            .key("dataFormat.memoryCells")
+            .displayedName("Memory cells")
+            .description("Full number of memory cells in incoming data")
+            .assignmentOptional()
+            .defaultValue(1)  # subclass will want to set a default value
+            .commit(),
+
+            UINT32_ELEMENT(expected)
+            .key("dataFormat.filteredFrames")
+            .displayedName("Frames after filter")
+            .description("Number of frames left after applying frame filter")
+            .readOnly()
+            .initialValue(0)
+            .commit(),
+
+            STRING_ELEMENT(expected)
+            .key("dataFormat.outputAxisOrder")
+            .displayedName("Output axis order")
+            .description(
+                "Axes of main data output can be reordered after correction. Axis "
+                "order is specified as string consisting of 'x', 'y', and 'c', with "
+                "the latter indicating the memory cell axis. The default value of "
+                "'cxy' puts pixels on the fast axes."
+            )
+            .options("cxy,cyx,xcy,xyc,ycx,yxc")
+            .assignmentOptional()
+            .defaultValue("cxy")
+            .commit(),
+
+            VECTOR_UINT32_ELEMENT(expected)
+            .key("dataFormat.inputDataShape")
+            .displayedName("Input data shape")
+            .description(
+                "Image data shape in incoming data (from reader / DAQ). This value is "
+                "computed from pixelsX, pixelsY, and memoryCells - this field just "
+                "shows what is currently expected."
+            )
+            .readOnly()
+            .initialValue([])
+            .commit(),
+
+            VECTOR_UINT32_ELEMENT(expected)
+            .key("dataFormat.outputDataShape")
+            .displayedName("Output data shape")
+            .description(
+                "Image data shape for data output from this device. This value is "
+                "computed from pixelsX, pixelsY, and the size of the frame filter - "
+                "this field just shows what is currently expected."
+            )
+            .readOnly()
+            .initialValue([])
+            .commit(),
+        )
+
+        (
+            SLOT_ELEMENT(expected)
+            .key("loadMostRecentConstants")
+            .displayedName("Load most recent constants")
+            .description(
+                "Calling this slot will flush all constant buffers and cause the "
+                "device to start querying CalCat for the most recent constants - all "
+                "constants applicable for this device - available with the currently "
+                "set constant parameters. This is typically called after "
+                "instantiating pipeline, after changing parameters, or after "
+                "generating new constants."
+            )
+            .commit()
+        )
+
+        (
+            NODE_ELEMENT(expected).key("preview").displayedName("Preview").commit(),
+
+            OUTPUT_CHANNEL(expected)
+            .key("preview.outputRaw")
+            .dataSchema(preview_schema)
+            .commit(),
+
+            OUTPUT_CHANNEL(expected)
+            .key("preview.outputCorrected")
+            .dataSchema(preview_schema)
+            .commit(),
+
+            BOOL_ELEMENT(expected)
+            .key("preview.enable")
+            .displayedName("Enable preview")
+            .assignmentOptional()
+            .defaultValue(True)
+            .reconfigurable()
+            .commit(),
+
+            INT32_ELEMENT(expected)
+            .key("preview.index")
+            .displayedName("Index (or stat) for preview")
+            .description(
+                "If this value is ≥ 0, the corresponding index (frame, cell, or pulse) "
+                "will be sliced for the preview output. If this value is < 0, preview "
+                "will be one of the following stats: -1: max, -2: mean, -3: sum, -4: "
+                "stdev. These stats are computed across memory cells."
+            )
+            .assignmentOptional()
+            .defaultValue(0)
+            .minInc(-4)
+            .reconfigurable()
+            .commit(),
+
+            STRING_ELEMENT(expected)
+            .key("preview.selectionMode")
+            .displayedName("Index selection mode")
+            .description(
+                "The value of preview.index can be used in multiple ways, controlled "
+                "by this value. If this is set to 'frame', preview.index is sliced "
+                "directly from data. If 'cell' (or 'pulse') is selected, I will look "
+                "at cell (or pulse) table for the requested cell (or pulse ID). "
+                "Special (stat) index values <0 are not affected by this."
+            )
+            .options("frame,cell,pulse")
+            .assignmentOptional()
+            .defaultValue("frame")
+            .reconfigurable()
+            .commit(),
+
+            UINT32_ELEMENT(expected)
+            .key("preview.trainIdModulo")
+            .displayedName("Preview train stride")
+            .description(
+                "Preview will only be generated for trains whose ID modulo this "
+                "number is zero. Higher values means less frequent preview updates. "
+                "Keep in mind that the GUI has limited refresh rate. Extra care must "
+                "be taken if DAQ train stride is >1."
+            )
+            .assignmentOptional()
+            .defaultValue(6)
+            .reconfigurable()
+            .commit(),
+        )
+
+        # just measurements and counters to display
+        (
+            UINT64_ELEMENT(expected)
+            .key("trainId")
+            .displayedName("Train ID")
+            .description("ID of latest train processed by this device.")
+            .readOnly()
+            .initialValue(0)
+            .commit(),
+
+            NODE_ELEMENT(expected)
+            .key("performance")
+            .displayedName("Performance measures")
+            .commit(),
+
+            DOUBLE_ELEMENT(expected)
+            .key("performance.processingTime")
+            .displayedName("Processing time")
+            .unit(Unit.SECOND)
+            .metricPrefix(MetricPrefix.MILLI)
+            .readOnly()
+            .initialValue(0)
+            .warnHigh(100)
+            .info("Processing too slow to reach 10 Hz")
+            .needsAcknowledging(False)
+            .commit(),
+
+            DOUBLE_ELEMENT(expected)
+            .key("performance.rate")
+            .displayedName("Rate")
+            .description(
+                "Actual rate with which this device gets, processes, and sends trains. "
+                "This is a simple windowed moving average."
+            )
+            .unit(Unit.HERTZ)
+            .readOnly()
+            .initialValue(0)
+            .commit(),
+
+            DOUBLE_ELEMENT(expected)
+            .key("performance.ratioOfRecentTrainsReceived")
+            .description(
+                "Of the latest trains (from last received train, going back "
+                "[some buffer range]), how many did we receive? This estimate is "
+                "updated when new trains come in, so is unreliable if nothing is "
+                "coming at all."
+            )
+            .unit(Unit.PERCENT)
+            .readOnly()
+            .initialValue(0)
+            .commit(),
+        )
+
+        # this node will be filled out by subclass
+        (
+            NODE_ELEMENT(expected)
+            .key("corrections")
+            .displayedName("Correction steps")
+            .commit(),
+        )
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.input_data_dtype = np.dtype(config["dataFormat.inputImageDtype"])
+        self.output_data_dtype = np.dtype(config["dataFormat.outputImageDtype"])
+
+        self.sources = set(config.get("fastSources"))
+
+        self.kernel_runner = None  # must call _update_buffers to initialize
+        self._shmem_buffer = None  # ditto
+
+        self._correction_flag_enabled = self._correction_flag_class.NONE
+        self._correction_flag_preview = self._correction_flag_class.NONE
+        self._buffer_lock = threading.Lock()
+        self._last_processing_started = 0  # used for processing time and timeout
+
+        # register slots
+        if parse_version(karaboVersion) >= parse_version("2.11"):
+            # TODO: the CalCatFriend could add these for us
+            # note: overly complicated for closure to work
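+            # (a plain lambda in the loop below would late-bind `constant`,
+            # leaving every slot pointing at the last enum member)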
+            def make_wrapper_capturing_constant(constant):
+                def aux():
+                    self.calcat_friend.get_specific_constant_version_and_call_me_back(
+                        constant, self._load_constant_to_runner
+                    )
+
+                return aux
+
+            for constant in self._constant_enum_class:
+                slot_name = f"foundConstants.{constant.name}.overrideConstantVersion"
+                meth_name = slot_name.replace(".", "_")
+                self.KARABO_SLOT(
+                    make_wrapper_capturing_constant(constant),
+                    slotName=meth_name,
+                )
+
+        self.KARABO_SLOT(self.loadMostRecentConstants)
+        self.KARABO_SLOT(self.requestScene)
+
+        self.registerInitialFunction(self._initialization)
+
+    def _initialization(self):
+        self.calcat_friend = self._calcat_friend_class(
+            self, pathlib.Path.cwd() / "calibration-client-secrets.json"
+        )
+        try:
+            self._frame_filter = _parse_frame_filter(self._parameters)
+        except (ValueError, TypeError):
+            self.log_status_warn("Failed to parse initial frame filter, will not use")
+            self._frame_filter = None
+        self._update_frame_filter()
+
+        self._buffered_status_update = Hash(
+            "trainId",
+            0,
+            "performance.rate",
+            0,
+            "performance.processingTime",
+            0,
+            "performance.ratioOfRecentTrainsReceived",
+            0,
+        )
+        self._processing_time_ema = utils.ExponentialMovingAverage(alpha=0.3)
+        self._rate_tracker = utils.WindowRateTracker()
+        self._rate_update_timer = utils.RepeatingTimer(
+            interval=1,
+            callback=self._update_rate_and_state,
+        )
+        self._train_ratio_tracker = utils.TrainRatioTracker(
+            warn_callback=self.log_status_warn
+        )
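+        # (the trackers come from calng.utils; the repeating timer drives
+        # _update_rate_and_state once per second)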
+
+        self.KARABO_ON_INPUT("dataInput", self.input_handler)
+        self.KARABO_ON_EOS("dataInput", self.handle_eos)
+
+        self.updateState(State.ON)
+
+    def __del__(self):
+        del self._shmem_buffer
+        super().__del__()
+
+    def preReconfigure(self, config):
+        # validation
+        for ts_path in (
+            "constantParameters.deviceMappingSnapshotAt",
+            "constantParameters.constantVersionEventAt",
+        ):
+            if config.has(ts_path):
+                ts_string = config.get(ts_path)
+                if ts_string.strip() == "":
+                    config.set(ts_path, "")
+                else:
+                    # allow exception to reach operator
+                    timestamp = dateutil.parser.isoparse(ts_string)
+                    config.set(ts_path, timestamp.isoformat())
+
+        if config.has("constantParameters.deviceMappingSnapshotAt"):
+            self.calcat_friend.flush_pdu_mapping()
+
+        # update device based on changes
+        if config.has("frameFilter"):
+            self._frame_filter = _parse_frame_filter(
+                utils.ChainHash(config, self._parameters)
+            )
+
+        self._prereconfigure_update_hash = config
+
+    def postReconfigure(self):
+        if not hasattr(self, "_prereconfigure_update_hash"):
+            self.log_status_warn("postReconfigure without knowing update hash")
+            return
+
+        update = self._prereconfigure_update_hash
+
+        if update.has("frameFilter"):
+            with self._buffer_lock:
+                self._update_frame_filter()
+        elif any(
+            update.has(shape_param)
+            for shape_param in (
+                "dataFormat.pixelsX",
+                "dataFormat.pixelsY",
+                "dataFormat.memoryCells",
+                "constantParameters.memoryCells",
+                "frameFilter",
+            )
+        ):
+            with self._buffer_lock:
+                self._update_buffers()
+        # TODO: only call this if they are changed (is cheap, though)
+        self._update_correction_flags()
+
+    def loadMostRecentConstants(self):
+        self.flush_constants()
+        self.calcat_friend.flush_constants()
+        for constant in self._constant_enum_class:
+            self.calcat_friend.get_constant_version_and_call_me_back(
+                constant, self._load_constant_to_runner
+            )
+
+    def flush_constants(self):
+        """Reset constant buffers and disable corresponding correction steps"""
+        for correction_step, _ in self._correction_field_names:
+            self.set(f"corrections.{correction_step}.available", False)
+        self.kernel_runner.flush_buffers()
+        self._update_correction_flags()
+
+    def log_status_info(self, msg):
+        self.log.INFO(msg)
+        self.set("status", msg)
+
+    def log_status_warn(self, msg):
+        self.log.WARN(msg)
+        self.set("status", msg)
+
+    def log_status_error(self, msg):
+        self.set("status", msg)
+        self.log.ERROR(msg)
+        self.updateState(State.ERROR)
+
+    def requestScene(self, params):
+        payload = Hash()
+        name = params.get("name", default="")
+        payload["name"] = name
+        payload["success"] = True
+        if name == "overview":
+            payload["data"] = scenes.correction_device_overview_scene(
+                device_id=self.getInstanceId(),
+                schema=self.getFullSchema(),
+            )
+        elif name.startswith("browse_schema"):
+            if ":" in name:
+                prefix = name[len("browse_schema:") :]
+            else:
+                prefix = "managed"
+            payload["data"] = scenes.recursive_subschema_scene(
+                self.getInstanceId(),
+                self.getFullSchema(),
+                prefix,
+            )
+        else:
+            payload["success"] = False
+        response = Hash()
+        response["type"] = "deviceScene"
+        response["origin"] = self.getInstanceId()
+        response["payload"] = payload
+        self.reply(response)
+
+    def _write_output(self, data, old_metadata):
+        """For dataOutput: reusing incoming data hash and setting source and timestamp
+        to be same as input"""
+        metadata = ChannelMetaData(
+            old_metadata.get("source"),
+            Timestamp.fromHashAttributes(old_metadata.getAttributes("timestamp")),
+        )
+
+        channel = self.signalSlotable.getOutputChannel("dataOutput")
+        channel.write(data, metadata, False)
+        channel.update()
+
+    def _write_combiner_previews(self, channel_data_pairs, train_id, source):
+        # TODO: send as ImageData (requires updated assembler)
+        # TODO: allow sending *all* frames for commissioning (request: Jola)
+        preview_hash = Hash()
+        preview_hash.set("image.trainId", train_id)
+
+        # note: have to construct because setting .tid after init is broken
+        timestamp = Timestamp(Epochstamp(), Trainstamp(train_id))
+        metadata = ChannelMetaData(source, timestamp)
+        for channel_name, data in channel_data_pairs:
+            preview_hash.set(self._image_data_path, data)
+            channel = self.signalSlotable.getOutputChannel(channel_name)
+            channel.write(preview_hash, metadata, False)
+            channel.update()
+
+    def _update_correction_flags(self):
+        """Based on constants loaded and settings, update bit mask flags for kernel"""
+        available = self._correction_flag_class.NONE
+        enabled = self._correction_flag_class.NONE
+        preview = self._correction_flag_class.NONE
+        for field_name, flag in self._correction_field_names:
+            if self.get(f"corrections.{field_name}.available"):
+                available |= flag
+            if self.get(f"corrections.{field_name}.enable"):
+                enabled |= flag
+            if self.get(f"corrections.{field_name}.preview"):
+                preview |= flag
+        enabled &= available
+        preview &= available
+        self._correction_flag_enabled = enabled
+        self._correction_flag_preview = preview
+        self.log.DEBUG(f"Corrections for dataOutput: {str(enabled)}")
+        self.log.DEBUG(f"Corrections for preview: {str(preview)}")
+
+    def _update_frame_filter(self, update_buffers=True):
+        """Parse frameFilter string (if set) and update cached filter array. May update
+        dataFormat.filteredFrames - will therefore by default call _update_buffers
+        afterwards."""
+        # TODO: add some validation to preReconfigure
+        self.log.DEBUG("Updating frame filter")
+
+        if self._frame_filter is None:
+            self.set("dataFormat.filteredFrames", self.get("dataFormat.memoryCells"))
+            self.set("frameFilter.current", [])
+        else:
+            self.set("dataFormat.filteredFrames", self._frame_filter.size)
+            self.set("frameFilter.current", list(map(int, self._frame_filter)))
+
+        if self._frame_filter is not None and (
+            self._frame_filter.min() < 0
+            or self._frame_filter.max() >= self.get("dataFormat.memoryCells")
+        ):
+            self.log_status_warn("Invalid frame filter set, expect exceptions!")
+
+        if update_buffers:
+            self._update_buffers()
+
+    def _update_buffers(self):
+        """(Re)initialize buffers / kernel runner according to expected data shapes"""
+        self.log.INFO("Updating buffers according to data shapes")
+        # reflect the axis reordering in the expected output shape
+        self.set("dataFormat.inputDataShape", list(self.input_data_shape))
+        self.set("dataFormat.outputDataShape", list(self.output_data_shape))
+        self.log.INFO(f"Input shape: {self.input_data_shape}")
+        self.log.INFO(f"Output shape: {self.output_data_shape}")
+
+        if self._shmem_buffer is None:
+            shmem_buffer_name = self.getInstanceId() + ":dataOutput"
+            memory_budget = self.get("outputShmemBufferSize") * 2 ** 30
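+            # (the setting is gigabyte-denominated; 2 ** 30 converts it to the
+            # byte count the circular buffer expects)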
+            self.log.INFO(f"Opening new shmem buffer: {shmem_buffer_name}")
+            self._shmem_buffer = shmem_utils.ShmemCircularBuffer(
+                memory_budget,
+                self.output_data_shape,
+                self.output_data_dtype,
+                shmem_buffer_name,
+            )
+            self.log.INFO("Trying to pin the shmem buffer memory")
+            self._shmem_buffer.cuda_pin()
+            self.log.INFO("Done, shmem buffer is ready")
+        else:
+            self._shmem_buffer.change_shape(self.output_data_shape)
+
+        self.kernel_runner = self._kernel_runner_class(
+            self.get("dataFormat.pixelsX"),
+            self.get("dataFormat.pixelsY"),
+            self.get("dataFormat.filteredFrames"),
+            int(self.get("constantParameters.memoryCells")),
+            input_data_dtype=self.input_data_dtype,
+            output_data_dtype=self.output_data_dtype,
+            **self._kernel_runner_init_args,
+        )
+
+        # TODO: gracefully handle change in constantParameters.memoryCells
+        with self.calcat_friend.cached_constants_lock:
+            for (
+                constant,
+                data,
+            ) in self.calcat_friend.cached_constants.items():
+                self.log_status_info(f"Reload constant {constant}")
+                self._load_constant_to_runner(constant, data)
+
+    def input_handler(self, input_channel):
+        """Main handler for data input: Do a few simple checks to determine whether to
+        even try processing. If yes, will pass data and information to process_data
+        method provided by subclass."""
+
+        # Is device even ready for this?
+        state = State[self.unsafe_get("state")]
+        if state is State.ERROR:
+            # in this case, we should have already issued warning
+            return
+        elif self.kernel_runner is None:
+            self.log_status_warn("Received data, but have not initialized kernels yet")
+            return
+
+        all_metadata = input_channel.getMetaData()
+        for input_index in range(input_channel.size()):
+            self._last_processing_started = default_timer()
+            data_hash = input_channel.read(input_index)
+            metadata = all_metadata[input_index]
+            source = metadata.get("source")
+
+            if source not in self.sources:
+                self.log_status_info(f"Ignoring hash with unknown source {source}")
+                return
+            elif not data_hash.has(self._image_data_path):
+                self.log_status_info("Ignoring hash without image node")
+                return
+
+            train_id = metadata.getAttribute("timestamp", "tid")
+            self._train_ratio_tracker.update(train_id)
+            cell_table = data_hash.get(self._cell_table_path)
+            if (
+                isinstance(cell_table, np.ndarray) and cell_table.size == 0
+            ) or len(cell_table) == 0:
+                self.log_status_warn(
+                    "Empty cell table, DAQ probably not sending data."
+                )
+                return
+            cell_table = np.squeeze(cell_table)
+
+            # no more common reasons to skip input, so go to processing
+            if state is State.ON:
+                self.updateState(State.PROCESSING)
+                self.log_status_info("Processing data")
+
+            correction_cell_num = self.unsafe_get("constantParameters.memoryCells")
+            cell_table_max = np.max(cell_table)
+
+            image_data = data_hash.get(self._image_data_path)
+            if cell_table.size != self.unsafe_get("dataFormat.memoryCells"):
+                self.log_status_info(
+                    f"Updating new input shape {image_data.shape}, updating buffers"
+                )
+                self.set("dataFormat.memoryCells", cell_table.size)
+                with self._buffer_lock:
+                    self._update_frame_filter()
+
+            # DataAggregator typically tells us the wrong axis order
+            if self.unsafe_get("dataFormat.overrideInputAxisOrder"):
+                expected_shape = self.input_data_shape
+                if expected_shape != image_data.shape:
+                    image_data.shape = expected_shape
+
+            do_generate_preview = (
+                train_id % self.unsafe_get("preview.trainIdModulo") == 0
+                and self.unsafe_get("preview.enable")
+            )
+
+            with self._buffer_lock:
+                self.process_data(
+                    data_hash,
+                    metadata,
+                    source,
+                    train_id,
+                    image_data,
+                    cell_table,
+                    do_generate_preview,
+                )
+            self._buffered_status_update.set("trainId", train_id)
+            self._processing_time_ema.update(
+                default_timer() - self._last_processing_started
+            )
+            self._rate_tracker.update()
+
+    def _update_rate_and_state(self):
+        if self.get("state") is State.PROCESSING:
+            self._buffered_status_update.set(
+                "performance.rate", self._rate_tracker.get()
+            )
+            self._buffered_status_update.set(
+                "performance.processingTime", self._processing_time_ema.get() * 1000
+            )
+            self._buffered_status_update.set(
+                "performance.ratioOfRecentTrainsReceived",
+                self._train_ratio_tracker.get(),
+            )
+            # trainId in _buffered_status_update should be updated in input handler
+            self.set(self._buffered_status_update)
+            if (
+                default_timer() - self._last_processing_started
+                > PROCESSING_STATE_TIMEOUT
+            ):
+                self.updateState(State.ON)
+                self.log_status_info(
+                    f"No new train in {PROCESSING_STATE_TIMEOUT} s, switching state."
+                )
+
+    def handle_eos(self, channel):
+        self.updateState(State.ON)
+        self.signalEndOfStream("dataOutput")
+
+
+# forward-compatible unsafe_get proposed by @haufs
+if not hasattr(BaseCorrection, "unsafe_get"):
+    def unsafe_get(self, key):
+        """Look up key in device schema quickly, but without consistency locks
+
+        This is only relevant for use in hot path (input handler).  Circumvents the
+        locking done by PythonDevice.get. Note that PythonDevice.get does handle some
+        special types (by looking at full schema for type information).  In particular,
+        device state enum: `self.get("state")` will return a State whereas
+        `self.unsafe_get("state")` will return a string. Handle with care!"""
+
+        # at least until Karabo 2.14, self._parameters is maintained by PythonDevice
+        return self._parameters.get(key)
+
+    setattr(BaseCorrection, "unsafe_get", unsafe_get)
+
+
+def add_correction_step_schema(schema, managed_keys, field_flag_mapping):
+    """Using the fields in the provided mapping, will add nodes to schema
+
+    field_flag_mapping is assumed to be iterable of pairs where first entry in each
+    pair is the name of a correction step as it will appear in device schema (second
+    entry - typically an enum field - is ignored). For correction step, a node and some
+    booleans are added to the schema and the toggleable booleans are added to
+    managed_keys. Subclass can customize / add additional keys under node later.
+
+    This method should be called in expectedParameters of subclass after the same for
+    BaseCorrection has been called. Would be nice to include in BaseCorrection instead,
+    but that is tricky: static method of superclass will need _correction_field_names
+    of subclass or device server gets mad. A nice solution with classmethods would be
+    welcome.
+    """
+
+    for field_name, _ in field_flag_mapping:
+        node_name = f"corrections.{field_name}"
+        (
+            NODE_ELEMENT(schema).key(node_name).commit(),
+
+            BOOL_ELEMENT(schema)
+            .key(f"{node_name}.available")
+            .displayedName("Available")
+            .description(
+                "This boolean indicates whether the necessary constants have been "
+                "loaded for this correction step to be applied. Enabling the "
+                "correction will have no effect unless this is True."
+            )
+            .readOnly()
+            .initialValue(False)
+            .commit(),
+
+            BOOL_ELEMENT(schema)
+            .key(f"{node_name}.enable")
+            .displayedName("Enable")
+            .description(
+                "Controls whether to apply this correction step for main data "
+                "output - subject to availability."
+            )
+            .assignmentOptional()
+            .defaultValue(True)
+            .reconfigurable()
+            .commit(),
+
+            BOOL_ELEMENT(schema)
+            .key(f"{node_name}.preview")
+            .displayedName("Preview")
+            .description(
+                "Whether to apply this correction step for corrected preview "
+                "output - subject to availability."
+            )
+            .assignmentOptional()
+            .defaultValue(True)
+            .reconfigurable()
+            .commit(),
+        )
+        managed_keys.add(f"{node_name}.enable")
+        managed_keys.add(f"{node_name}.preview")
+
+
+def _parse_frame_filter(config):
+    filter_type = FramefilterSpecType(config["frameFilter.type"])
+    filter_string = config["frameFilter.spec"]
+
+    if filter_type is FramefilterSpecType.NONE or filter_string.strip() == "":
+        return None
+    elif filter_type is FramefilterSpecType.RANGE:
+        # allow exceptions
+        numbers = tuple(int(part) for part in filter_string.split(","))
+        return np.arange(*numbers, dtype=np.uint16)
+    elif filter_type is FramefilterSpecType.COMMASEPARATED:
+        # np.fromstring is too lenient I think
+        return np.array([int(s) for s in filter_string.split(",")], dtype=np.uint16)
+    else:
+        raise TypeError(f"Unknown frame filter type {filter_type}")
diff --git a/src/calng/base_gpu.py b/src/calng/base_gpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0619d0ae253b7cb736dbc2fe7275092c7493a3c
--- /dev/null
+++ b/src/calng/base_gpu.py
@@ -0,0 +1,195 @@
+import pathlib
+
+import cupy
+import jinja2
+import numpy as np
+
+from . import utils
+
+
+class BaseGpuRunner:
+    """Class to handle GPU buffers and execution of CUDA kernels on image data
+
+    All GPU buffers are kept within this class and it is intentionally very stateful.
+    This generally means that you will want to load data into it and then do something.
+    Typical usage in correct order:
+
+    1. instantiate
+    2. load constants
+    3. load_data
+    4. load_cell_table
+    5. correct
+    6a. reshape (only here does data transfer back to host)
+    6b. compute_previews (optional)
+
+    repeat from 2. or 3.
+
+    In case no constants are available / correction is not desired, steps 2 and 4 can
+    be skipped and CorrectionFlags.NONE passed to correct(...). Generally, the user
+    must handle which correction steps are appropriate given the constants loaded so
+    far.
+    """
+
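+    # typical driver (illustrative sketch; `runner` is a concrete subclass and
+    # constant loading is subclass-specific):
+    #   runner.load_data(raw); runner.load_cell_table(cells)
+    #   runner.correct(flags)
+    #   out = runner.reshape("cxy")
+    #   raw_prev, corr_prev = runner.compute_previews(-1)  # max across cells
+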
+    # These must be set by subclass
+    _kernel_source_filename = None
+    _corrected_axis_order = None
+
+    def __init__(
+        self,
+        pixels_x,
+        pixels_y,
+        memory_cells,
+        constant_memory_cells,
+        input_data_dtype=np.uint16,
+        output_data_dtype=np.float32,
+    ):
+        _src_dir = pathlib.Path(__file__).absolute().parent
+        # subclass must define _kernel_source_filename
+        with (_src_dir / "kernels" / self._kernel_source_filename).open("r") as fd:
+            self._kernel_template = jinja2.Template(fd.read())
+
+        self.pixels_x = pixels_x
+        self.pixels_y = pixels_y
+        self.memory_cells = memory_cells
+        if constant_memory_cells == 0:
+            # if not set, guess same as input; may save one recompilation
+            self.constant_memory_cells = memory_cells
+        else:
+            self.constant_memory_cells = constant_memory_cells
+        # preview will only be single memory cell
+        self.preview_shape = (self.pixels_x, self.pixels_y)
+        self.input_data_dtype = input_data_dtype
+        self.output_data_dtype = output_data_dtype
+
+        self._init_kernels()
+
+        # reuse buffers for input / output
+        self.cell_table_gpu = cupy.empty(self.memory_cells, dtype=np.uint16)
+        self.input_data_gpu = cupy.empty(self.input_shape, dtype=input_data_dtype)
+        self.processed_data_gpu = cupy.empty(
+            self.processed_shape, dtype=output_data_dtype
+        )
+        self.reshaped_data_gpu = None  # currently not reusing buffer
+
+        # default preview layers: raw and corrected (subclass can extend)
+        self.preview_buffer_getters = [
+            self._get_raw_for_preview,
+            self._get_corrected_for_preview,
+        ]
+
+    # to get data from respective buffers to cell, x, y shape for preview computation
+    def _get_raw_for_preview(self):
+        """Should return view of self.input_data_gpu with shape (cell, x/y, x/y)"""
+        raise NotImplementedError()
+
+    def _get_corrected_for_preview(self):
+        """Should return view of self.processed_data_gpu with shape (cell, x/y, x/y)"""
+        raise NotImplementedError()
+
+    def flush_buffers(self):
+        """Optional reset GPU buffers (implement in subclasses which need this)"""
+        pass
+
+    def correct(self, flags):
+        """Correct (already loaded) image data according to flags
+
+        Subclass must define this method. It should assume that image data, cell table,
+        and other data (including constants) have already been loaded. It should
+        probably run some GPU kernel and output should go into self.processed_data_gpu.
+
+        Keep in mind that the user only gets output from compute_previews or reshape
+        (either of these should come after correct).
+
+        The submodules providing subclasses should have some IntFlag enums defining
+        which flags are available to pass along to the kernel. A zero flag should allow
+        the kernel to do no actual correction - but still copy the data between buffers
+        and cast it to desired output type.
+
+        """
+        raise NotImplementedError()
+
+    def reshape(self, output_order, out=None):
+        """Move axes to desired output order and copy to host memory
+
+        The out parameter is passed directly to the get function of GPU array: if
+        None, then a new ndarray (in host memory) is returned. If not None, then data
+        will be loaded into the provided array, which must match shape / dtype.
+        """
+        # TODO: avoid copy
+        if output_order == self._corrected_axis_order:
+            self.reshaped_data_gpu = self.processed_data_gpu
+        else:
+            self.reshaped_data_gpu = cupy.transpose(
+                self.processed_data_gpu,
+                utils.transpose_order(self._corrected_axis_order, output_order),
+            )
+
+        return self.reshaped_data_gpu.get(out=out)
+
+    def load_data(self, raw_data):
+        self.input_data_gpu.set(np.squeeze(raw_data))
+
+    def load_cell_table(self, cell_table):
+        self.cell_table_gpu.set(cell_table)
+
+    def compute_previews(self, preview_index):
+        """Generate single slice or reduction preview of raw and corrected data
+
+        Special values of preview_index are -1 for max, -2 for mean, -3 for sum, and
+        -4 for stdev (across cells).
+
+        Note that preview_index is taken from data without checking cell table.
+        Caller has to figure out which index along memory cell dimension they
+        actually want to preview in case it needs to be a specific pulse.
+
+        Will reuse data from corrected output buffer. Therefore, correct(...) must have
+        been called with the appropriate flags before compute_previews(...).
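+
+        Example (illustrative): preview_index=-2 returns, for raw and corrected
+        data each, the per-pixel mean over the memory cell axis as float32.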
+        """
+
+        if preview_index < -4:
+            raise ValueError(f"No statistic with code {preview_index} defined")
+        elif preview_index >= self.memory_cells:
+            raise ValueError(f"Memory cell index {preview_index} out of range")
+
+        # TODO: enum around reduction type
+        return tuple(
+            self._compute_a_preview(image_data=getter(), preview_index=preview_index)
+            for getter in self.preview_buffer_getters
+        )
+
+    def _compute_a_preview(self, image_data, preview_index):
+        """image_data must have cells on first axis; X and Y order is not important
+        here for now (and can differ between AGIPD and DSSC)"""
+        if preview_index >= 0:
+            # TODO: reuse pinned buffers for this
+            return image_data[preview_index].astype(np.float32).get()
+        elif preview_index == -1:
+            # TODO: confirm that max is pixel and not integrated intensity
+            # separate from next case because dtype not applicable here
+            return cupy.nanmax(image_data, axis=0).astype(cupy.float32).get()
+        elif preview_index in (-2, -3, -4):
+            stat_fun = {
+                -2: cupy.nanmean,
+                -3: cupy.nansum,
+                -4: cupy.nanstd,
+            }[preview_index]
+            return stat_fun(image_data, axis=0, dtype=cupy.float32).get()
+
+    def update_block_size(self, full_block):
+        """Set execution grid such that it covers processed_shape with full_blocks
+
+        Execution is scheduled with 3d "blocks" of CUDA threads. Tuning can affect
+        performance. Correction kernels are "monolithic" for simplicity (i.e. each
+        logical thread handles one entry in output data), so in each dimension we
+        parallelize over, grid * block must be >= length to cover all entries.
+
+        Note that individual kernels must themselves check whether they go out of
+        bounds; grid dimensions get rounded up in case ndarray size is not multiple of
+        block size.
+
+        """
+        assert len(full_block) == 3
+        self.full_block = tuple(full_block)
+        self.full_grid = tuple(
+            utils.ceil_div(a_length, block_length)
+            for (a_length, block_length) in zip(self.processed_shape, full_block)
+        )
diff --git a/src/calng/calcat_utils.py b/src/calng/calcat_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..76b27adf0d18e3129daf5e331bd5492caef59ce1
--- /dev/null
+++ b/src/calng/calcat_utils.py
@@ -0,0 +1,558 @@
+import copy
+import functools
+import json
+import pathlib
+import threading
+
+import calibration_client
+import h5py
+import numpy as np
+from calibration_client.modules import (
+    Calibration,
+    CalibrationConstant,
+    CalibrationConstantVersion,
+    Detector,
+    DetectorType,
+    PhysicalDetectorUnit,
+)
+from karabo.bound import (
+    BOOL_ELEMENT,
+    DOUBLE_ELEMENT,
+    NODE_ELEMENT,
+    SLOT_ELEMENT,
+    STRING_ELEMENT,
+    UINT32_ELEMENT,
+    VECTOR_UINT32_ELEMENT,
+)
+from karabo import version as karaboVersion
+from pkg_resources import parse_version
+
+from . import utils
+
+
+class ConditionNotFound(Exception):
+    pass
+
+
+class DetectorNotFound(Exception):
+    pass
+
+
+class ModuleNotFound(Exception):
+    pass
+
+
+class CalibrationNotFound(Exception):
+    pass
+
+
+class CalibrationClientConfigError(Exception):
+    pass
+
+
+def add_status_schema_from_enum(schema, prefix, enum_class):
+    for constant in enum_class:
+        constant_node = f"{prefix}.{constant.name}"
+        (
+            NODE_ELEMENT(schema).key(constant_node).commit(),
+
+            BOOL_ELEMENT(schema)
+            .key(f"{constant_node}.found")
+            .readOnly()
+            .initialValue(False)
+            .commit(),
+
+            STRING_ELEMENT(schema)
+            .key(f"{constant_node}.validFrom")
+            .readOnly()
+            .initialValue("")
+            .commit(),
+
+            STRING_ELEMENT(schema)
+            .key(f"{constant_node}.calibrationId")
+            .readOnly()
+            .initialValue("")
+            .commit(),
+
+            VECTOR_UINT32_ELEMENT(schema)
+            .key(f"{constant_node}.conditionIds")
+            .readOnly()
+            .initialValue([])
+            .commit(),
+
+            VECTOR_UINT32_ELEMENT(schema)
+            .key(f"{constant_node}.constantIds")
+            .readOnly()
+            .initialValue([])
+            .commit(),
+
+            STRING_ELEMENT(schema)
+            .key(f"{constant_node}.constantVersionId")
+            .description(
+                "This field is editable - if for any reason a specific constant "
+                "version is desired, the constant version ID (as used in CalCat) can "
+                "be set here and the slot below can be called to load this particular "
+                "version, overriding the automatic loading of latest constants."
+            )
+            .assignmentOptional()
+            .defaultValue("")
+            .reconfigurable()
+            .commit(),
+        )
+        if parse_version(karaboVersion) >= parse_version("2.11"):
+            (
+                SLOT_ELEMENT(schema)
+                .key(f"{constant_node}.overrideConstantVersion")
+                .displayedName("Override constant version")
+                .commit(),
+            )
+
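+# Hypothetical usage when building a correction device's schema (assuming a
+# Schema object `expected` and the AgipdConstants enum):
+#     add_status_schema_from_enum(expected, "foundConstants", AgipdConstants)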
+
+class OperatingConditions(dict):
+    # TODO: support deviation?
+    def encode(self):
+        return {
+            "parameters": [
+                {
+                    "parameter_name": key,
+                    "lower_deviation_value": 0.0,
+                    "upper_deviation_value": 0.0,
+                    "flg_available": False,
+                    "value": value,
+                }
+                for (key, value) in self.items()
+            ]
+        }
+
+    def __hash__(self):
+        # this takes me back to pre-screening interview time...
+        return hash(tuple(sorted(self.items())))
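+
+    # Example for illustration:
+    #     OperatingConditions({"Memory cells": 400.0}).encode() returns
+    #     {"parameters": [{"parameter_name": "Memory cells",
+    #      "lower_deviation_value": 0.0, "upper_deviation_value": 0.0,
+    #      "flg_available": False, "value": 400.0}]}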
+
+
+class BaseCalcatFriend:
+    """Base class for CalCat friends - handles interacting with CalCat for the device
+
+    A CalCat friend uses the device schema to build up parameters for CalCat queries.
+    It focuses on two nodes (added by the static method add_schema): param_prefix and
+    status_prefix. The former is primarily used to get the parameters which are used
+    to look for constants (via condition methods - see for example dark_condition of
+    DsscCalcatFriend). The latter is primarily used to give the user information
+    about what was found.
+    """
+
+    _constant_enum_class = None  # subclass should set
+    _constants_need_conditions = None  # subclass should set
+
+    @staticmethod
+    def add_schema(
+        schema,
+        managed_keys,
+        detector_type,
+        param_prefix="constantParameters",
+        status_prefix="foundConstants",
+    ):
+        """Add elements needed by this object to device's schema (expectedSchema)
+
+        All elements added to schema go under prefixes which should end with name of
+        node which does not exist yet. To change default values and add more fields,
+        extend this method in subclass.
+
+        The param_prefix node will hold all the parameters needed to build constant
+        condition dicts for querying CalCat. These values are set either directly on
+        the device or via manager and this class gets them from the device using helper
+        function _get_param. See for example AgipdCalcatFriend.dark_condition.
+
+        The status_prefix node is used to report information about what was found in
+        CalCat. This class will update the values on the device using the helper
+        function _set_status. This should not need to happen in subclass methods.
+        """
+
+        (
+            NODE_ELEMENT(schema)
+            .key(param_prefix)
+            .displayedName("Constant retrieval parameters")
+            .commit(),
+
+            NODE_ELEMENT(schema)
+            .key(status_prefix)
+            .displayedName("Constants retrieved")
+            .commit(),
+        )
+
+        # Parameters which any detector would probably have (extend this in subclass)
+        # TODO: probably switch to floating point for everything, including mem cells
+        (
+            STRING_ELEMENT(schema)
+            .key(f"{param_prefix}.deviceMappingSnapshotAt")
+            .displayedName("Snapshot timestamp (for device mapping)")
+            .description(
+                "CalCat supports querying with a specific snapshot of the database. "
+                "When playing back a run from the file system, this feature is useful "
+                "to look up the device mapping at the time of the run. If this field "
+                "is left empty, the latest device mapping is used. Date format should "
+                "be 'YYYY-MM-DD' with optional time of day starting with 'T' followed "
+                "by 'hh:mm:ss.mil+02:00'."
+            )
+            .assignmentOptional()
+            .defaultValue("")
+            .reconfigurable()
+            .commit(),
+
+            STRING_ELEMENT(schema)
+            .key(f"{param_prefix}.constantVersionEventAt")
+            .displayedName("Event at timestamp (for constant version)")
+            .description("TODO")
+            .assignmentOptional()
+            .defaultValue("")
+            .reconfigurable()
+            .commit(),
+
+            STRING_ELEMENT(schema)
+            .key(f"{param_prefix}.detectorType")
+            .displayedName("Detector type name")
+            .description(
+                "Name of detector type in CalCat; typically has suffix '-Type'"
+            )
+            .readOnly()
+            .initialValue(detector_type)
+            .commit(),
+
+            STRING_ELEMENT(schema)
+            .key(f"{param_prefix}.detectorTypeId")
+            .readOnly()
+            .initialValue("")
+            .commit(),
+
+            STRING_ELEMENT(schema)
+            .key(f"{param_prefix}.detectorName")
+            .assignmentOptional()
+            .defaultValue("")
+            .commit(),
+
+            STRING_ELEMENT(schema)
+            .key(f"{param_prefix}.detectorId")
+            .readOnly()
+            .initialValue("")
+            .commit(),
+
+            STRING_ELEMENT(schema)
+            .key(f"{param_prefix}.karaboDa")
+            .assignmentOptional()
+            .defaultValue("")
+            .commit(),
+
+            STRING_ELEMENT(schema)
+            .key(f"{param_prefix}.moduleId")
+            .readOnly()
+            .initialValue("")
+            .commit(),
+
+            UINT32_ELEMENT(schema)
+            .key(f"{param_prefix}.memoryCells")
+            .displayedName("Memory cells")
+            .description(
+                "Number of memory cells / frames per train. Relevant for burst mode."
+            )
+            .assignmentOptional()
+            .defaultValue(1)
+            .reconfigurable()
+            .commit(),
+
+            UINT32_ELEMENT(schema)
+            .key(f"{param_prefix}.pixelsX")
+            .displayedName("Pixels X")
+            .assignmentOptional()
+            .defaultValue(512)
+            .commit(),
+
+            UINT32_ELEMENT(schema)
+            .key(f"{param_prefix}.pixelsY")
+            .displayedName("Pixels Y")
+            .assignmentOptional()
+            .defaultValue(128)
+            .commit(),
+
+            DOUBLE_ELEMENT(schema)
+            .key(f"{param_prefix}.biasVoltage")
+            .displayedName("Bias voltage")
+            .description("Sensor bias voltage")
+            .assignmentOptional()
+            .defaultValue(300)
+            .reconfigurable()
+            .commit(),
+        )
+        managed_keys.add(f"{param_prefix}.deviceMappingSnapshotAt")
+        managed_keys.add(f"{param_prefix}.constantVersionEventAt")
+        managed_keys.add(f"{param_prefix}.memoryCells")
+        managed_keys.add(f"{param_prefix}.pixelsX")
+        managed_keys.add(f"{param_prefix}.pixelsY")
+        managed_keys.add(f"{param_prefix}.biasVoltage")
+
+    def __init__(
+        self,
+        device,
+        secrets_fn: pathlib.Path,
+        param_prefix="constantParameters",
+        status_prefix="foundConstants",
+    ):
+        self.device = device
+        self.param_prefix = param_prefix
+        self.status_prefix = status_prefix
+        self.cached_constants = {}
+        self.cached_constants_lock = threading.Lock()
+        # api lock used to force queries to be sequential (SSL issue on ONC)
+        self.api_lock = threading.Lock()
+
+        if not secrets_fn.is_file():
+            self.device.log_status_warn(
+                f"Missing CalCat secrets file (expected {secrets_fn})"
+            )
+        with secrets_fn.open("r") as fd:
+            calcat_secrets = json.load(fd)
+
+        self.caldb_store = pathlib.Path(calcat_secrets["caldb_store_path"])
+        if not self.caldb_store.is_dir():
+            raise ValueError(f"caldb_store location '{self.caldb_store}' is not dir")
+
+        self.device.log.INFO(f"Connecting to CalCat at {calcat_secrets['base_url']}")
+        base_url = calcat_secrets["base_url"]
+        self.client = calibration_client.CalibrationClient(
+            client_id=calcat_secrets["client_id"],
+            client_secret=calcat_secrets["client_secret"],
+            user_email=calcat_secrets["user_email"],
+            base_api_url=f"{base_url}/api/",
+            token_url=f"{base_url}/oauth/token",
+            refresh_url=f"{base_url}/oauth/token",
+            auth_url=f"{base_url}/oauth/authorize",
+            scope="public",
+            session_token=None,
+        )
+        self.device.log_status_info("CalCat connection established")
+
+    def _get_param(self, key):
+        """Helper to get value from attached device schema"""
+        return self.device.get(f"{self.param_prefix}.{key}")
+
+    def _set_param(self, key, value):
+        self.device.set(f"{self.param_prefix}.{key}", value)
+
+    def _get_status(self, constant, key):
+        return self.device.get(f"{self.status_prefix}.{constant.name}.{key}")
+
+    def _set_status(self, constant, key, value):
+        """Helper to update information about found constants on device"""
+        self.device.set(f"{self.status_prefix}.{constant.name}.{key}", value)
+
+    # Python 3.6 does not have functools.cached_property or even functools.cache
+    @property
+    @functools.lru_cache()
+    def detector_id(self):
+        detector_name = self._get_param("detectorName")
+        resp = Detector.get_by_identifier(self.client, detector_name)
+        self._check_resp(resp, DetectorNotFound, f"Detector {detector_name} not found")
+        res = resp["data"]["id"]
+        self._set_param("detectorId", str(res))
+        return res
+
+    @property
+    @functools.lru_cache()
+    def detector_type_id(self):
+        detector_type = self._get_param("detectorType")
+        resp = DetectorType.get_by_name(self.client, detector_type)
+        self._check_resp(
+            resp, DetectorNotFound, f"Detector type {detector_type} not found"
+        )
+        res = resp["data"]["id"]
+        self._set_param("detectorTypeId", str(res))
+        return res
+
+    @property
+    @functools.lru_cache()
+    def pdus(self):
+        resp = PhysicalDetectorUnit.get_all_by_detector(
+            self.client, self.detector_id, self._get_param("deviceMappingSnapshotAt")
+        )
+        self._check_resp(resp, warning="Failed to retrieve module mapping")
+        for irrelevant_key in ("detector", "detector_type", "flg_available"):
+            for pdu in resp["data"]:
+                del pdu[irrelevant_key]
+        return resp["data"]
+
+    @property
+    @functools.lru_cache()
+    def _karabo_da_to_float_uuid(self):
+        return {pdu["karabo_da"]: pdu["float_uuid"] for pdu in self.pdus}
+
+    @property
+    @functools.lru_cache()
+    def _karabo_da_to_id(self):
+        return {pdu["karabo_da"]: pdu["id"] for pdu in self.pdus}
+
+    def flush_pdu_mapping(self):
+        # note: delattr does not work on properties (and hasattr would trigger a
+        # CalCat query), so clear the lru_cache on each property's getter instead
+        for prop in (
+            type(self).pdus,
+            type(self)._karabo_da_to_float_uuid,
+            type(self)._karabo_da_to_id,
+        ):
+            prop.fget.cache_clear()
+        self._set_param("moduleId", "")
+
+    @utils.threadsafe_cache
+    def calibration_id(self, calibration_name: str):
+        resp = Calibration.get_by_name(self.client, calibration_name)
+        self._check_resp(
+            resp, CalibrationNotFound, f"Calibration type {calibration_name} not found!"
+        )
+        return resp["data"]["id"]
+
+    @utils.threadsafe_cache
+    def condition_ids(self, pdu, condition):
+        # modifying condition parameter messes with cache
+        condition_with_detector = copy.copy(condition)
+        condition_with_detector["Detector UUID"] = pdu
+        self.device.log.DEBUG(f"Look for condition: {condition_with_detector}")
+        resp = self.client.search_possible_conditions_from_dict(
+            "", condition_with_detector.encode()
+        )
+        self._check_resp(
+            resp,
+            ConditionNotFound,
+            f"Failed to find condition {condition} for pdu {pdu}",
+        )
+        return [d["id"] for d in resp["data"]]
+
+    def constant_ids(self, calibration_id, condition_ids):
+        resp = CalibrationConstant.get_all_by_conditions(
+            self.client,
+            calibration_id=calibration_id,
+            detector_type_id=self.detector_type_id,
+            condition_ids=condition_ids,
+        )
+        self._check_resp(resp, warning="Failed to retrieve constant ID")
+        return [d["id"] for d in resp["data"]]
+
+    def get_constant_version(self, constant):
+        # TODO: catch exceptions, give warnings appropriately
+        karabo_da = self._get_param("karaboDa")
+        self.device.log_status_info(f"Attempting to find {constant} for {karabo_da}")
+
+        if karabo_da not in self._karabo_da_to_float_uuid:
+            self.device.log_status_warn(
+                f"Module {karabo_da} not found in mapping, check configuration!"
+            )
+            raise ModuleNotFound(f"Module map did not include {karabo_da}")
+        self._set_param("moduleId", str(self._karabo_da_to_id[karabo_da]))
+
+        if isinstance(constant, str):
+            constant = self._constant_enum_class[constant]
+
+        calibration_id = self.calibration_id(constant.name)
+        self._set_status(constant, "calibrationId", calibration_id)
+
+        condition = self._constants_need_conditions[constant]()
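+        # condition is an OperatingConditions mapping built by the subclass;
+        # hypothetically e.g. {"Memory cells": 400.0, "Sensor Bias Voltage": 300.0}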
+        condition_ids = self.condition_ids(
+            self._karabo_da_to_float_uuid[karabo_da], condition
+        )
+        self._set_status(constant, "conditionIds", condition_ids)
+
+        constant_ids = self.constant_ids(
+            calibration_id=calibration_id, condition_ids=condition_ids
+        )
+        self._set_status(constant, "constantIds", constant_ids)
+
+        resp = CalibrationConstantVersion.get_closest_by_time(
+            self.client,
+            calibration_constant_ids=constant_ids,
+            physical_detector_unit_id=self._karabo_da_to_id[karabo_da],
+            event_at=self._get_param("constantVersionEventAt"),
+            snapshot_at=None,
+        )
+        self._check_resp(resp, warning="Failed to find calibration constant version")
+        # TODO: replace with start date and end date
+        timestamp = resp["data"]["begin_validity_at"]
+        self._set_status(constant, "validFrom", timestamp)
+        self._set_status(constant, "constantVersionId", resp["data"]["id"])
+
+        file_path = (
+            self.caldb_store / resp["data"]["path_to_file"] / resp["data"]["file_name"]
+        )
+        # TODO: handle FileNotFoundError if we are led astray
+        with h5py.File(file_path, "r") as fd:
+            constant_data = np.array(fd[resp["data"]["data_set_name"]]["data"])
+        with self.cached_constants_lock:
+            self.cached_constants[constant] = constant_data
+        self._set_status(constant, "found", True)
+        self.device.log_status_info(f"Done finding {constant} for {karabo_da}")
+
+        return constant_data
+
+    def get_specific_constant_version(self, constant):
+        # TODO: warn if PDU or constant type does not match
+        # TODO: warn if result is list (happens for empty version ID)
+        constant_version_id = self.device.get(
+            f"{self.status_prefix}.{constant.name}.constantVersionId"
+        )
+
+        resp = CalibrationConstantVersion.get_by_id(self.client, constant_version_id)
+        self._check_resp(resp, warning="Failed to find calibration constant version")
+        file_path = (
+            self.caldb_store / resp["data"]["path_to_file"] / resp["data"]["file_name"]
+        )
+        with h5py.File(file_path, "r") as fd:
+            constant_data = np.array(fd[resp["data"]["data_set_name"]]["data"])
+        with self.cached_constants_lock:
+            self.cached_constants[constant] = constant_data
+        self._set_status(constant, "validFrom", resp["data"]["begin_at"])
+        self._set_status(constant, "calibrationId", "manual override")
+        self._set_status(constant, "conditionId", "manual override")
+        self._set_status(constant, "constantId", "manual override")
+        self._set_status(constant, "constantVersionId", constant_version_id)
+        self._set_status(constant, "found", True)
+        return constant_data
+
+    def get_constant_version_and_call_me_back(self, constant, callback):
+        """Runs get_constant_version in thread, will call callback on completion"""
+        # TODO: do we want to use asyncio / "modern" async?
+        # TODO: consider moving out of this class, closer to correction device
+        def aux():
+            with self.api_lock:
+                data = self.get_constant_version(constant)
+            callback(constant, data)
+
+        thread = threading.Thread(target=aux)
+        thread.start()
+        return thread
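+
+    # Hypothetical usage from a correction device, with a device-side handler
+    # _load_constant_to_runner(constant, data) (name assumed for illustration):
+    #     friend.get_constant_version_and_call_me_back(
+    #         AgipdConstants.Offset, self._load_constant_to_runner
+    #     )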
+
+    def get_specific_constant_version_and_call_me_back(self, constant, callback):
+        """Blindly load whatever CalCat points to for CCV - user must be confident that
+        this CCV corresponds to correct kind of constant."""
+
+        # TODO: warn user about all the things that go wrong
+        def aux():
+            with self.api_lock:
+                data = self.get_specific_constant_version(constant)
+            callback(constant, data)
+
+        thread = threading.Thread(target=aux)
+        thread.start()
+        return thread
+
+    def flush_constants(self):
+        for constant in self._constant_enum_class:
+            self._set_status(constant, "validFrom", "")
+            self._set_status(constant, "found", False)
+
+    def _check_resp(self, resp, exception=Exception, warning=None):
+        # TODO: probably verify using "info" that exception is the right one
+        to_raise = None
+        if not resp["success"]:
+            # TODO: probably more types of app_info errors?
+            if resp["app_info"]:
+                if "not found" in resp["info"]:
+                    # this was likely the expected exception
+                    to_raise = exception(resp["info"])
+                else:
+                    # but could also be authorization or similar issue
+                    to_raise = CalibrationClientConfigError(resp["app_info"])
+            else:
+                to_raise = exception(resp["info"])
+        if to_raise is not None:
+            if warning is not None:
+                self.device.log_status_warn(warning)
+            raise to_raise
diff --git a/src/calng/dssc_gpu.py b/src/calng/dssc_gpu.py
deleted file mode 100644
index 61b00655fe77026472d203224d502edee406433c..0000000000000000000000000000000000000000
--- a/src/calng/dssc_gpu.py
+++ /dev/null
@@ -1,233 +0,0 @@
-import pathlib
-
-import cupy
-import cupyx
-import jinja2
-import numpy as np
-
-from . import utils
-
-
-class DsscGpuRunner:
-    """Class to handle instantiation and execution of CUDA kernels on trains
-
-    All GPU buffers are kept within this class. This generally means that you will
-    want to load data into it and then do something. Typical usage in correct order:
-
-    1. instantiate
-    2. load_constants
-    3. load_data
-    4. load_cell_table
-    5. correct
-    6a. reshape (only here does data transfer back to host)
-    6b. compute_preview (optional)
-
-    repeat from 2. or 3.
-
-    In case no constants are available / correction is not desired, can skip 3 and 4
-    and use only_cast in step 5 instead of correct (taking care to call
-    compute_preview with parameters set accordingly).
-    """
-
-    _src_dir = pathlib.Path(__file__).absolute().parent
-    with (_src_dir / "gpu-dssc-correct.cpp").open("r") as fd:
-        _kernel_template = jinja2.Template(fd.read())
-
-    def __init__(
-        self,
-        pixels_x,
-        pixels_y,
-        memory_cells,
-        output_transpose=(2, 1, 0),  # default: memorycells-fast
-        constant_memory_cells=None,
-        input_data_dtype=np.uint16,
-        output_data_dtype=np.float32,
-    ):
-        self.pixels_x = pixels_x
-        self.pixels_y = pixels_y
-        self.memory_cells = memory_cells
-        self.output_transpose = output_transpose
-        if constant_memory_cells is None:
-            self.constant_memory_cells = memory_cells
-        else:
-            self.constant_memory_cells = constant_memory_cells
-        self.input_shape = (self.memory_cells, self.pixels_y, self.pixels_x)
-        self.output_shape = utils.shape_after_transpose(
-            self.input_shape, self.output_transpose
-        )
-        self.map_shape = (self.pixels_x, self.pixels_y, self.constant_memory_cells)
-        # preview will only be single memory cell
-        self.preview_shape = (self.pixels_x, self.pixels_y)
-        self.input_data_dtype = input_data_dtype
-        self.output_data_dtype = output_data_dtype
-
-        self._init_kernels()
-
-        self.offset_map_gpu = cupy.empty(self.map_shape, dtype=np.float32)
-
-        # reuse output arrays
-        self.cell_table_gpu = cupy.empty(self.memory_cells, dtype=np.uint16)
-        self.input_data_gpu = cupy.empty(self.input_shape, dtype=input_data_dtype)
-        self.processed_data_gpu = cupy.empty(self.input_shape, dtype=output_data_dtype)
-        self.reshaped_data_gpu = cupy.empty(self.output_shape, dtype=output_data_dtype)
-        self.preview_raw = cupyx.empty_pinned(self.preview_shape, dtype=np.float32)
-        self.preview_corrected = cupyx.empty_pinned(
-            self.preview_shape, dtype=np.float32
-        )
-        self.output_buffer_next_index = 0
-
-        self.update_block_size((1, 1, 64))
-
-    def load_constants(self, offset_map):
-        constant_memory_cells = offset_map.shape[-1]
-        if constant_memory_cells != self.constant_memory_cells:
-            self.constant_memory_cells = constant_memory_cells
-            self.map_shape = (self.pixels_x, self.pixels_y, self.constant_memory_cells)
-            self.offset_map_gpu = cupy.empty(self.map_shape, dtype=np.float32)
-            self._init_kernels()
-        self.offset_map_gpu.set(offset_map)
-
-    def load_data(self, raw_data):
-        self.input_data_gpu.set(np.squeeze(raw_data))
-
-    def load_cell_table(self, cell_table):
-        self.cell_table_gpu.set(cell_table)
-
-    def update_block_size(self, full_block):
-        """Execution is scheduled with 3d "blocks" of CUDA threads, tuning can
-        affect performance
-
-        Grid size is automatically computed based on block size. Note that
-        individual kernels must themselves check whether they go out of bounds;
-        grid dimensions get rounded up in case ndarray size is not multiple of
-        block size.
-
-        """
-        assert len(full_block) == 3
-        self.full_block = tuple(full_block)
-        self.full_grid = tuple(
-            utils.ceil_div(a_length, block_length)
-            for (a_length, block_length) in zip(self.input_shape, full_block)
-        )
-
-    def correct(self):
-        """Apply corrections to data (must load constant, data, and cell_table first)
-
-        Applies corrections to input data and casts to desired output dtype.
-        Parameter cell_table allows out of order or non-contiguous memory cells
-        in input data.  Both input ndarrays are assumed to be on GPU already,
-        preferably wrapped in GPU arrays (cupy array).
-
-        Will return string encoded handle to shared memory output buffer and
-        (view of) said buffer as an ndarray.  Keep in mind that the output
-        buffers will get overwritten eventually (circular buffer).
-        """
-        self.correction_kernel(
-            self.full_grid,
-            self.full_block,
-            (
-                self.input_data_gpu,
-                self.cell_table_gpu,
-                self.offset_map_gpu,
-                self.processed_data_gpu,
-            ),
-        )
-
-    def only_cast(self):
-        """Like correct without the correction
-
-        This currently means just casting to output dtype.
-        """
-        self.casting_kernel(
-            self.full_grid,
-            self.full_block,
-            (
-                self.input_data_gpu,
-                self.processed_data_gpu,
-            ),
-        )
-
-    def reshape(self, out=None):
-        """Move axes to desired output order
-
-        The out parameter is passed directly to the get function of GPU array: if
-        None, then a new ndarray (in host memory) is returned. If not None, then data
-        will be loaded into the provided array, which must match shape / dtype.
-        """
-        # TODO: avoid copy
-        if self.output_transpose is None:
-            self.reshaped_data_gpu = cupy.ascontiguousarray(
-                cupy.squeeze(self.processed_data_gpu)
-            )
-        else:
-            self.reshaped_data_gpu = cupy.ascontiguousarray(
-                cupy.transpose(
-                    cupy.squeeze(self.processed_data_gpu), self.output_transpose
-                )
-            )
-        return self.reshaped_data_gpu.get(out=out)
-
-    def compute_preview(self, preview_index, have_corrected=True, can_correct=True):
-        """Generate single slice or reduction preview of raw and corrected data
-
-        Special values of preview_index are -1 for max, -2 for mean, -3 for
-        sum, and -4 for stdev (across cells).
-
-        Note that preview_index is taken from data without checking cell table.
-        Caller has to figure out which index along memory cell dimension they
-        actually want to preview.
-
-        Can reuse data from corrected output buffer with have_corrected parameter.
-        Note that preview requires relevant data to be loaded (raw data for raw
-        preview, correction map and cell table in addition for corrected preview).
-        """
-
-        if preview_index < -4:
-            raise ValueError(f"No statistic with code {preview_index} defined")
-        elif preview_index >= self.memory_cells:
-            raise ValueError(f"Memory cell index {preview_index} out of range")
-
-        if (not have_corrected) and can_correct:
-            self.correct()
-            # if not have_corrected and not can_correct, assume only_cast already done
-
-        # TODO: enum around reduction type
-        for (image_data, output_buffer) in (
-            (self.input_data_gpu, self.preview_raw),
-            (self.processed_data_gpu, self.preview_corrected),
-        ):
-            if preview_index >= 0:
-                # TODO: change axis order when moving reshape to after correction
-                image_data[preview_index].astype(np.float32).transpose().get(
-                    out=output_buffer
-                )
-            elif preview_index == -1:
-                # TODO: select argmax independently for raw and corrected?
-                # TODO: send frame sums somewhere to compute global max frame
-                max_index = cupy.argmax(
-                    cupy.sum(image_data, axis=(1, 2), dtype=cupy.float32)
-                )
-                image_data[max_index].astype(np.float32).transpose().get(
-                    out=output_buffer
-                )
-            elif preview_index in (-2, -3, -4):
-                stat_fun = {-2: cupy.mean, -3: cupy.sum, -4: cupy.std}[preview_index]
-                stat_fun(image_data, axis=0, dtype=cupy.float32).transpose().get(
-                    out=output_buffer
-                )
-        return self.preview_raw, self.preview_corrected
-
-    def _init_kernels(self):
-        kernel_source = self._kernel_template.render(
-            {
-                "pixels_x": self.pixels_x,
-                "pixels_y": self.pixels_y,
-                "data_memory_cells": self.memory_cells,
-                "constant_memory_cells": self.constant_memory_cells,
-                "input_data_dtype": utils.np_dtype_to_c_type(self.input_data_dtype),
-                "output_data_dtype": utils.np_dtype_to_c_type(self.output_data_dtype),
-            }
-        )
-        self.source_module = cupy.RawModule(code=kernel_source)
-        self.correction_kernel = self.source_module.get_function("correct")
-        self.casting_kernel = self.source_module.get_function("only_cast")
diff --git a/src/calng/gpu_utils.py b/src/calng/gpu_utils.py
deleted file mode 100644
index c945aee1a5bf8b64fa2621de49cf19511cabd41b..0000000000000000000000000000000000000000
--- a/src/calng/gpu_utils.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import re
-
-import pycuda.driver
-import pycuda.gpuarray
-
-_gpuptr_re = re.compile(
-    r"GPUPTR:(?P<gpu_pointer>\w+)" r"DEVID:(?P<device_id>.+)" r"SHAPE:(?P<shape>.+)"
-)
-
-
-def get_shape_from_ipc_handle(handle_string):
-    match = _gpuptr_re.match(handle_string)
-    return tuple(int(num) for num in match.group("shape").split(","))
-
-
-class IPCGPUArray:
-    """Context manager providing a GPUArray opened from string encoding IPC handle
-
-    Arguments:
-    handle_string: String encoding a "GPU pointer" (IPC address) plus some more
-    stuff.  This is "parsed" using _gpuptr_re.
-    dtype: self-explanatory (but make sure it is correct)
-    aray shape is parsed from the handle_string
-    """
-
-    def __init__(self, handle_string, dtype, gpu_pointer_re=None):
-        match = _gpuptr_re.match(handle_string)
-        assert match is not None
-
-        self.dtype = dtype
-        self.handle_address = bytearray.fromhex(match.group("gpu_pointer"))
-        self.shape = tuple(int(num) for num in match.group("shape").split(","))
-        # assuming contiguous C-order strides probably
-        # TODO: smarter
-
-        self.open_handle = None
-        self.gpu_array = None
-
-    def __enter__(self):
-        self.open_handle = pycuda.driver.IPCMemoryHandle(self.handle_address)
-        self.gpu_array = pycuda.gpuarray.GPUArray(
-            self.shape, dtype=self.dtype, gpudata=self.open_handle
-        )
-        return self.gpu_array
-
-    def __exit__(self, t, v, tb):
-        self.open_handle.close()
-
-
-class GPUContextContext:
-    def __init__(self, gpu_context):
-        self.gpu_context = gpu_context
-
-    def __enter__(self):
-        self.gpu_context.push()
-        return self.gpu_context
-
-    def __exit__(self, t, v, tb):
-        self.gpu_context.pop()
diff --git a/src/calng/kernels/agipd_gpu.cu b/src/calng/kernels/agipd_gpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..20b08d43d26b5f37d25c8633adb60ed3179db648
--- /dev/null
+++ b/src/calng/kernels/agipd_gpu.cu
@@ -0,0 +1,148 @@
+#include <cuda_fp16.h>
+
+{{corr_enum}}
+
+extern "C" {
+	/*
+	  Perform corrections; see agipd_gpu.CorrectionFlags
+	  Note that THRESHOLD and OFFSET should be applied for any later corrections to make sense
+	  Will take cell_table into account when getting correction values
+	  Will convert from input dtype to float for correction
+	  Will convert to output dtype for output
+	*/
+	__global__ void correct(const {{input_data_dtype}}* data,
+	                        const unsigned short* cell_table,
+	                        const unsigned char corr_flags,
+	                        // default_gain can be 0, 1, or 2, and is relevant for fixed gain mode (no THRESHOLD)
+	                        const unsigned char default_gain,
+	                        const float* threshold_map,
+	                        const float* offset_map,
+	                        const float* rel_gain_pc_map,
+	                        const float* md_additional_offset,
+	                        const float* rel_gain_xray_map,
+	                        const float g_gain_value,
+	                        const unsigned int* bad_pixel_map,
+	                        const float bad_pixel_mask_value,
+	                        float* gain_map, // TODO: more compact yet plottable representation
+	                        {{output_data_dtype}}* output) {
+		const size_t X = {{pixels_x}};
+		const size_t Y = {{pixels_y}};
+		const size_t input_cells = {{data_memory_cells}};
+		const size_t map_cells = {{constant_memory_cells}};
+
+		const size_t cell = blockIdx.x * blockDim.x + threadIdx.x;
+		const size_t x = blockIdx.y * blockDim.y + threadIdx.y;
+		const size_t y = blockIdx.z * blockDim.z + threadIdx.z;
+
+		if (cell >= input_cells || y >= Y || x >= X) {
+			return;
+		}
+
+		// data shape: memory cell, data/raw_gain (dim size 2), x, y
+		const size_t data_stride_y = 1;
+		const size_t data_stride_x = Y * data_stride_y;
+		const size_t data_stride_raw_gain = X * data_stride_x;
+		const size_t data_stride_cell = 2 * data_stride_raw_gain;
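+		// illustrative numbers: with X=512 and Y=128 these are data_stride_x=128,
+		// data_stride_raw_gain=65536, data_stride_cell=131072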
+		const size_t data_index = cell * data_stride_cell +
+			0 * data_stride_raw_gain +
+			y * data_stride_y +
+			x * data_stride_x;
+		const size_t raw_gain_index = cell * data_stride_cell +
+			1 * data_stride_raw_gain +
+			y * data_stride_y +
+			x * data_stride_x;
+		float corrected = (float)data[data_index];
+		const float raw_gain_val = (float)data[raw_gain_index];
+
+		const size_t output_stride_y = 1;
+		const size_t output_stride_x = output_stride_y * Y;
+		const size_t output_stride_cell = output_stride_x * X;
+		const size_t output_index = cell * output_stride_cell + x * output_stride_x + y * output_stride_y;
+
+		// per-pixel only constant: cell, x, y
+		const size_t map_stride_y = 1;
+		const size_t map_stride_x = Y * map_stride_y;
+		const size_t map_stride_cell = X * map_stride_x;
+
+		// threshold constant shape: cell, x, y, threshold (dim size 2)
+		const size_t threshold_map_stride_threshold = 1;
+		const size_t threshold_map_stride_y = 2 * threshold_map_stride_threshold;
+		const size_t threshold_map_stride_x = Y * threshold_map_stride_y;
+		const size_t threshold_map_stride_cell = X * threshold_map_stride_x;
+
+		// gain mapped constant shape: cell, x, y, gain_level (dim size 3)
+		const size_t gm_map_stride_gain = 1;
+		const size_t gm_map_stride_y = 3 * gm_map_stride_gain;
+		const size_t gm_map_stride_x = Y * gm_map_stride_y;
+		const size_t gm_map_stride_cell = X * gm_map_stride_x;
+		// note: assuming all maps have same shape (in terms of cells / x / y)
+
+		const size_t map_cell = cell_table[cell];
+
+		if (map_cell < map_cells) {
+			unsigned char gain = default_gain;
+			if (corr_flags & THRESHOLD) {
+				const float threshold_0 = threshold_map[0 * threshold_map_stride_threshold +
+				                                        map_cell * threshold_map_stride_cell +
+				                                        y * threshold_map_stride_y +
+				                                        x * threshold_map_stride_x];
+				const float threshold_1 = threshold_map[1 * threshold_map_stride_threshold +
+				                                        map_cell * threshold_map_stride_cell +
+				                                        y * threshold_map_stride_y +
+				                                        x * threshold_map_stride_x];
+				// could consider making this const using ternaries / tiny function
+				if (raw_gain_val <= threshold_0) {
+					gain = 0;
+				} else if (raw_gain_val <= threshold_1) {
+					gain = 1;
+				} else {
+					gain = 2;
+				}
+			}
+			gain_map[output_index] = (float)gain;
+
+			const size_t map_index = map_cell * map_stride_cell +
+				y * map_stride_y +
+				x * map_stride_x;
+
+			const size_t gm_map_index = gain * gm_map_stride_gain +
+				map_cell * gm_map_stride_cell +
+				y * gm_map_stride_y +
+				x * gm_map_stride_x;
+
+			if ((corr_flags & BPMASK) && bad_pixel_map[gm_map_index]) {
+				corrected = bad_pixel_mask_value;
+				gain_map[output_index] = bad_pixel_mask_value;
+			} else {
+				if (corr_flags & OFFSET) {
+					corrected -= offset_map[gm_map_index];
+					// TODO: optionally reassign gain stage for this pixel based on new value
+				}
+				// TODO: baseline shift
+				if (corr_flags & REL_GAIN_PC) {
+					corrected *= rel_gain_pc_map[gm_map_index];
+					if (gain == 1) {
+						corrected += md_additional_offset[map_index];
+					}
+				}
+				if (corr_flags & GAIN_XRAY) {
+					corrected = (corrected / rel_gain_xray_map[map_index]) * g_gain_value;
+				}
+			}
+			{% if output_data_dtype == "half" %}
+			output[output_index] = __float2half(corrected);
+			{% else %}
+			output[output_index] = ({{output_data_dtype}})corrected;
+			{% endif %}
+		} else {
+			// TODO: decide what to do when we cannot threshold
+			// note: write via output_index, not data_index - the output buffers
+			// lack the raw_gain dimension, so data_index would go out of bounds
+			{% if output_data_dtype == "half" %}
+			output[output_index] = __float2half(corrected);
+			{% else %}
+			output[output_index] = ({{output_data_dtype}})corrected;
+			{% endif %}
+
+			gain_map[output_index] = 255;
+		}
+	}
+}
diff --git a/src/calng/gpu-dssc-correct.cpp b/src/calng/kernels/dssc_gpu.cu
similarity index 51%
rename from src/calng/gpu-dssc-correct.cpp
rename to src/calng/kernels/dssc_gpu.cu
index 2412a86ac89d3eb07335f59d2630793e2f04b1d7..a35eed986a4483e0b84ca84e35d2cc3d56d11cb5 100644
--- a/src/calng/gpu-dssc-correct.cpp
+++ b/src/calng/kernels/dssc_gpu.cu
@@ -1,16 +1,21 @@
 #include <cuda_fp16.h>
 
+{{corr_enum}}
+
 extern "C" {
 	/*
-	  Perform correction: offset
+	  Perform corrections: NONE or OFFSET
 	  Take cell_table into account when getting correction values
-	  Converting to float for doing the correction
+	  Converting to float while correcting
 	  Converting to output dtype at the end
+	  Shape of input data: memory cell, 1, y, x
+	  Shape of offset constant: x, y, memory cell
 	*/
-	__global__ void correct(const {{input_data_dtype}}* data,
-							const unsigned short* cell_table,
-							const float* offset_map,
-							{{output_data_dtype}}* output) {
+	__global__ void correct(const {{input_data_dtype}}* data, // shape: memory cell, 1, y, x
+	                        const unsigned short* cell_table,
+	                        const unsigned char corr_flags,
+	                        const float* offset_map,
+	                        {{output_data_dtype}}* output) {
 		const size_t X = {{pixels_x}};
 		const size_t Y = {{pixels_y}};
 		const size_t memory_cells = {{data_memory_cells}};
@@ -31,13 +36,16 @@ extern "C" {
 		const size_t data_index = memory_cell * data_stride_cell + y * data_stride_y + x * data_stride_x;
 		const float raw = (float)data[data_index];
 
-		const size_t map_stride_cell = 1;
-		const size_t map_stride_y = map_memory_cells * map_stride_cell;
-		const size_t map_stride_x = Y * map_stride_y;
+		const size_t map_stride_x = 1;
+		const size_t map_stride_y = X * map_stride_x;
+		const size_t map_stride_cell = Y * map_stride_y;
 		const size_t map_cell = cell_table[memory_cell];
 		if (map_cell < map_memory_cells) {
 			const size_t map_index = map_cell * map_stride_cell + y * map_stride_y + x * map_stride_x;
-			const float corrected = raw - offset_map[map_index];
+			float corrected = raw;
+			if (corr_flags & OFFSET) {
+				corrected -= offset_map[map_index];
+			}
 			{% if output_data_dtype == "half" %}
 			output[data_index] = __float2half(corrected);
 			{% else %}
@@ -51,34 +59,4 @@ extern "C" {
 			{% endif %}
 		}
 	}
-
-	/*
-	  Same as correction, except don't do any correction
-	*/
-	__global__ void only_cast(const {{input_data_dtype}}* data,
-							  {{output_data_dtype}}* output) {
-		const size_t X = {{pixels_x}};
-		const size_t Y = {{pixels_y}};
-		const size_t memory_cells = {{data_memory_cells}};
-
-		const size_t data_stride_x = 1;
-		const size_t data_stride_y = X * data_stride_x;
-		const size_t data_stride_cell = Y * data_stride_y;
-
-		const size_t cell = blockIdx.x * blockDim.x + threadIdx.x;
-		const size_t y = blockIdx.y * blockDim.y + threadIdx.y;
-		const size_t x = blockIdx.z * blockDim.z + threadIdx.z;
-
-		if (cell >= memory_cells || y >= Y || x >= X) {
-			return;
-		}
-
-		const size_t data_index = cell * data_stride_cell + y * data_stride_y + x * data_stride_x;
-		const float raw = (float)data[data_index];
-		{% if output_data_dtype == "half" %}
-		output[data_index] = __float2half(raw);
-		{% else %}
-		output[data_index] = ({{output_data_dtype}})raw;
-		{% endif %}
-	}
 }
diff --git a/src/calng/kernels/jungfrau_gpu.cu b/src/calng/kernels/jungfrau_gpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d111c0b903e1d67aa85783f5a7914271a15c8d9c
--- /dev/null
+++ b/src/calng/kernels/jungfrau_gpu.cu
@@ -0,0 +1,86 @@
+#include <cuda_fp16.h>
+
+{{corr_enum}}
+
+extern "C" {
+	__global__ void correct(const {{input_data_dtype}}* data, // shape: memory cell, y, x
+	                        const unsigned char* gain_stage, // same shape
+	                        const unsigned char* cell_table,
+	                        const unsigned char corr_flags,
+	                        const float* offset_map,
+	                        const float* rel_gain_map,
+	                        const unsigned int* bad_pixel_map,
+	                        const float bad_pixel_mask_value,
+	                        {{output_data_dtype}}* output) {
+		const size_t X = {{pixels_x}};
+		const size_t Y = {{pixels_y}};
+		const size_t memory_cells = {{data_memory_cells}};
+		const size_t map_memory_cells = {{constant_memory_cells}};
+
+		const size_t memory_cell = blockIdx.x * blockDim.x + threadIdx.x;
+		const size_t y = blockIdx.y * blockDim.y + threadIdx.y;
+		const size_t x = blockIdx.z * blockDim.z + threadIdx.z;
+
+		if (memory_cell >= memory_cells || y >= Y || x >= X) {
+			return;
+		}
+
+		const size_t data_stride_x = 1;
+		const size_t data_stride_y = X * data_stride_x;
+		const size_t data_stride_cell = Y * data_stride_y;
+		const size_t data_index = memory_cell * data_stride_cell +
+			y * data_stride_y +
+			x * data_stride_x;
+		float res = (float)data[data_index];
+
+		// gain mapped constant shape: cell, y, x, gain_level (dim size 3)
+		// note: in fixed gain mode, constant still provides data for three stages
+		const size_t map_stride_gain = 1;
+		const size_t map_stride_x = 3 * map_stride_gain;
+		const size_t map_stride_y = X * map_stride_x;
+		const size_t map_stride_cell = Y * map_stride_y;
+
+		// TODO: warn user about cell_table value of 255 in either mode
+		// note: cell table may contain 255 if data didn't arrive
+		{% if burst_mode %}
+		// burst mode: "cell 255" will get copied
+		// TODO: consider masking "cell 255"
+		const size_t map_cell = cell_table[memory_cell];
+		{% else %}
+		// single cell: "cell 255" will get "corrected"
+		const size_t map_cell = 0;
+		{% endif %}
+
+		if (map_cell < map_memory_cells) {
+			unsigned char gain = gain_stage[data_index];
+			if (gain == 2) {
+				// gain should be read as 0, 1, or 3; value of 2 indicates issue
+				res = bad_pixel_mask_value;
+			} else {
+				if (gain == 3) {
+					gain = 2;
+				}
+				const size_t map_index = map_cell * map_stride_cell +
+					y * map_stride_y +
+					x * map_stride_x +
+					gain * map_stride_gain;
+				if ((corr_flags & BPMASK) && bad_pixel_map[map_index]) {
+					res = bad_pixel_mask_value;
+				} else {
+					if (corr_flags & OFFSET) {
+						res -= offset_map[map_index];
+					}
+					if (corr_flags & REL_GAIN) {
+						res /= rel_gain_map[map_index];
+					}
+				}
+			}
+		}
+
+		{% if output_data_dtype == "half" %}
+		output[data_index] = __float2half(res);
+		{% else %}
+		output[data_index] = ({{output_data_dtype}})res;
+		{% endif %}
+	}
+}
diff --git a/src/calng/manual_geometry_base.py b/src/calng/manual_geometry_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee3748fc3eb95f5d374ba10693fe9eee3679b63f
--- /dev/null
+++ b/src/calng/manual_geometry_base.py
@@ -0,0 +1,239 @@
+import pickle
+
+import matplotlib.pyplot as plt
+import numpy as np
+from karabo.bound import (
+    DOUBLE_ELEMENT,
+    IMAGEDATA_ELEMENT,
+    INT32_ELEMENT,
+    KARABO_CLASSINFO,
+    NODE_ELEMENT,
+    OUTPUT_CHANNEL,
+    SLOT_ELEMENT,
+    TABLE_ELEMENT,
+    VECTOR_CHAR_ELEMENT,
+    VECTOR_STRING_ELEMENT,
+    Encoding,
+    Hash,
+    ImageData,
+    PythonDevice,
+    Schema,
+    State,
+)
+from karabo.common.api import KARABO_SCHEMA_DISPLAY_TYPE_SCENES as DT_SCENES
+from matplotlib.backends.backend_agg import FigureCanvasAgg
+
+from . import scenes
+from ._version import version as deviceVersion
+
+geometry_schema = Schema()
+(
+    VECTOR_CHAR_ELEMENT(geometry_schema)
+    .key("pickledGeometry")
+    .displayedName("Pickled geometry")
+    .assignmentOptional()
+    .defaultValue([])
+    .commit()
+)
+
+preview_schema = Schema()
+(IMAGEDATA_ELEMENT(preview_schema).key("overview").commit())
+
+ModuleColumn = Schema()
+(
+    DOUBLE_ELEMENT(ModuleColumn)
+    .key("posX")
+    .assignmentOptional()
+    .defaultValue(95)
+    .reconfigurable()
+    .commit(),
+
+    DOUBLE_ELEMENT(ModuleColumn)
+    .key("posY")
+    .assignmentOptional()
+    .defaultValue(564)
+    .reconfigurable()
+    .commit(),
+
+    INT32_ELEMENT(ModuleColumn)
+    .key("orientationX")
+    .assignmentOptional()
+    .defaultValue(-1)
+    .reconfigurable()
+    .commit(),
+
+    INT32_ELEMENT(ModuleColumn)
+    .key("orientationY")
+    .assignmentOptional()
+    .defaultValue(-1)
+    .reconfigurable()
+    .commit(),
+)
+
+
+@KARABO_CLASSINFO("ManualGeometryBase", deviceVersion)
+class ManualGeometryBase(PythonDevice):
+    @staticmethod
+    def expectedParameters(expected):
+        # "mandatory" for geometry serving device
+        (
+            OUTPUT_CHANNEL(expected)
+            .key("geometryOutput")
+            .dataSchema(geometry_schema)
+            .commit(),
+
+            SLOT_ELEMENT(expected).key("pleaseSendYourGeometry").commit(),
+
+            OUTPUT_CHANNEL(expected)
+            .key("previewOutput")
+            .dataSchema(preview_schema)
+            .commit(),
+
+            IMAGEDATA_ELEMENT(expected).key("layoutPreview").commit(),
+        )
+
+        # scenes are fun
+        (
+            VECTOR_STRING_ELEMENT(expected)
+            .key("availableScenes")
+            .setSpecialDisplayType(DT_SCENES)
+            .readOnly()
+            .initialValue(["overview"])
+            .commit(),
+        )
+
+    def update_geom(self):
+        raise NotImplementedError()
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.KARABO_SLOT(self.pleaseSendYourGeometry)
+        self.KARABO_SLOT(self.requestScene)
+        self.update_geom()
+        plt.switch_backend("agg")
+        self.registerInitialFunction(self._initialization)
+
+    def _initialization(self):
+        self.updateState(State.ON)
+        self.pleaseSendYourGeometry()
+
+    def requestScene(self, params):
+        payload = Hash()
+        scene_name = params.get("name", default="")
+        payload["name"] = scene_name
+        payload["success"] = True
+        if scene_name == "overview":
+            payload["data"] = scenes.manual_geometry_overview(
+                device_id=self.getInstanceId()
+            )
+        else:
+            payload["success"] = False
+        response = Hash()
+        response["type"] = "deviceScene"
+        response["origin"] = self.getInstanceId()
+        response["payload"] = payload
+        self.reply(response)
+
+    def pleaseSendYourGeometry(self):
+        self.update_geom()
+        self.writeChannel("geometryOutput", Hash("pickledGeometry", self.pickled))
+        axis = self.geom.inspect()
+        axis.figure.tight_layout(pad=0)
+        axis.figure.set_facecolor("none")
+        # axis.figure.set_size_inches(6, 6)
+        # axis.figure.set_dpi(300)
+        canvas = FigureCanvasAgg(axis.figure)
+        canvas.draw()
+        image_buffer = np.frombuffer(canvas.buffer_rgba(), dtype=np.uint8).reshape(
+            canvas.get_width_height()[::-1] + (4,)
+        )
+        self.set(
+            "layoutPreview",
+            ImageData(image_buffer, encoding=Encoding.RGBA, bitsPerPixel=4 * 8),
+        )
+
+    def preReconfigure(self, config):
+        self._prereconfigure_update_hash = config
+
+    def postReconfigure(self):
+        del self._prereconfigure_update_hash
+
+
+@KARABO_CLASSINFO("ManualQuadrantsGeometryBase", deviceVersion)
+class ManualQuadrantsGeometryBase(ManualGeometryBase):
+    @staticmethod
+    def expectedParameters(expected):
+        # note: subclasses should set better defaults
+        (NODE_ELEMENT(expected).key("quadrantCorners").commit(),)
+        for q in range(1, 5):
+            (
+                NODE_ELEMENT(expected).key(f"quadrantCorners.Q{q}").commit(),
+                DOUBLE_ELEMENT(expected)
+                .key(f"quadrantCorners.Q{q}.x")
+                .assignmentOptional()
+                .defaultValue(0)
+                .reconfigurable()
+                .commit(),
+
+                DOUBLE_ELEMENT(expected)
+                .key(f"quadrantCorners.Q{q}.y")
+                .assignmentOptional()
+                .defaultValue(0)
+                .reconfigurable()
+                .commit(),
+            )
+
+    def postReconfigure(self):
+        if any(
+            path.startswith("quadrantCorners")
+            for path in self._prereconfigure_update_hash.getPaths()
+        ):
+            self.update_geom()
+
+        super().postReconfigure()
+
+    def update_geom(self):
+        self.quadrant_corners = tuple(
+            (self.get(f"quadrantCorners.Q{q}.x"), self.get(f"quadrantCorners.Q{q}.y"))
+            for q in range(1, 5)
+        )
+        self.geom = self.geometry_class.from_quad_positions(self.quadrant_corners)
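+        # geometry_class comes from the concrete subclass; with an extra_geom
+        # style class the call above resembles (hypothetical corner values):
+        #     DSSC_1MGeometry.from_quad_positions(
+        #         [(-130, 5), (-130, -125), (5, -125), (5, 5)])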
+        self.pickled = pickle.dumps(self.geom)
+        # TODO: send to anyone who asks? make slot for that? send on connect?
+        self.writeChannel("geometryOutput", Hash("pickledGeometry", self.pickled))
+
+
+@KARABO_CLASSINFO("ManualModulesGeometryBase", deviceVersion)
+class ManualModulesGeometryBase(ManualGeometryBase):
+    @staticmethod
+    def expectedParameters(expected):
+        (
+            TABLE_ELEMENT(expected)
+            .key("modules")
+            .setColumns(ModuleColumn)
+            .assignmentOptional()
+            .defaultValue([])
+            .reconfigurable()
+            .commit(),
+        )
+
+    def postReconfigure(self):
+        if self._prereconfigure_update_hash.has("modules"):
+            self.update_geom()
+
+        super().postReconfigure()
+
+    def update_geom(self):
+        modules = self.get("modules")
+        module_pos = [(module.get("posX"), module.get("posY")) for module in modules]
+        orientations = [
+            (module.get("orientationX"), module.get("orientationY"))
+            for module in modules
+        ]
+        self.geom = self.geometry_class.from_module_positions(
+            module_pos, orientations=orientations
+        )
+        self.pickled = pickle.dumps(self.geom)
+        # TODO: send to anyone who asks? make slot for that?
+        self.writeChannel("geometryOutput", Hash("pickledGeometry", self.pickled))
diff --git a/src/calng/scenes.py b/src/calng/scenes.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d714f9d2a8b1cd1eb701178429712adf3ab716d
--- /dev/null
+++ b/src/calng/scenes.py
@@ -0,0 +1,824 @@
+import enum
+
+import karabo.native
+import karathon
+from karabo.common.scenemodel.api import (
+    CheckBoxModel,
+    ColorBoolModel,
+    ComboBoxModel,
+    DeviceSceneLinkModel,
+    DetectorGraphModel,
+    DisplayCommandModel,
+    DisplayLabelModel,
+    DisplayStateColorModel,
+    DisplayTextLogModel,
+    DoubleLineEditModel,
+    EvaluatorModel,
+    IntLineEditModel,
+    LabelModel,
+    LineEditModel,
+    RectangleModel,
+    SceneModel,
+    SceneTargetWindow,
+    write_scene,
+)
+
+
+# section: common setup
+
+
+BASE_INC = 25
+NARROW_INC = 20
+PADDING = 5
+RECONFIGURABLE = 4  # TODO: look up proper enum
+NODE_TYPE_NODE = 1
+
+_type_to_display_model = {
+    "BOOL": CheckBoxModel
+}
+_type_to_line_editable = {
+    "BOOL": (CheckBoxModel, {"klass": "EditableCheckBox"}),
+    "DOUBLE": (DoubleLineEditModel, {}),
+    "FLOAT": (DoubleLineEditModel, {}),
+    "INT32": (IntLineEditModel, {}),
+    "UINT32": (IntLineEditModel, {}),
+    "INT64": (IntLineEditModel, {}),
+    "UINT64": (IntLineEditModel, {}),
+    "STRING": (LineEditModel, {"klass": "EditableLineEdit"}),
+}
+
+
+def safe_render(obj, x, y):
+    if hasattr(obj, "render"):
+        return obj.render(x, y)
+    else:
+        obj.x = x
+        obj.y = y
+        return [obj]
+
+
+class Align(enum.Enum):
+    CENTER = enum.auto()
+    TOP = enum.auto()
+    BOTTOM = enum.auto()
+    LEFT = enum.auto()
+    RIGHT = enum.auto()
+
+
+# section: nice component decorators
+
+
+def titled(title, width=8 * NARROW_INC):
+    def actual_decorator(component_class):
+        class new_class(component_class):
+            def render(self, x, y, *args, **kwargs):
+                return [
+                    LabelModel(
+                        frame_width=1,
+                        text=title,
+                        width=width,
+                        height=NARROW_INC,
+                        x=x,
+                        y=y,
+                    )
+                ] + component_class.render(self, x, y + NARROW_INC, *args, **kwargs)
+
+            @property
+            def width(self):
+                return max(component_class.width.fget(self), width)
+
+            @property
+            def height(self):
+                return component_class.height.fget(self) + NARROW_INC
+
+        return new_class
+
+    return actual_decorator
+
+
+def boxed(component_class):
+    class new_class(component_class):
+        def render(self, x, y, *args, **kwargs):
+            return [
+                RectangleModel(
+                    x=x,
+                    y=y,
+                    width=component_class.width.fget(self) + 2 * PADDING,
+                    height=component_class.height.fget(self) + 2 * PADDING,
+                    stroke="#000000",
+                )
+            ] + component_class.render(self, x + PADDING, y + PADDING, *args, **kwargs)
+
+        @property
+        def width(self):
+            return component_class.width.fget(self) + 2 * PADDING
+
+        @property
+        def height(self):
+            return component_class.height.fget(self) + 2 * PADDING
+
+    return new_class
+
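+# Illustrative sketch: the decorators stack around anything exposing
+# render/width/height, e.g. a framed, titled column (a pattern used in the
+# scene generators below):
+#
+#     FramedColumn = titled("Example title")(boxed(VerticalLayout))
+#     framed = FramedColumn(children=[], padding=0)
+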
+
+# section: useful layout and utility classes
+
+
+def DisplayRoundedFloat(*args, decimals=2, **kwargs):
+    # note: naive subclass breaks as registry looks for writer based on exact class
+    return EvaluatorModel(*args, expression=f"f'{{x:.{decimals}f}}'", **kwargs)
+
+
+class Space:
+    def __init__(self, width, height):
+        self.width = width
+        self.height = height
+
+    def render(self, x, y):
+        return []
+
+
+def dummy_wrap(model_class):
+    class Wrapper:
+        def __init__(self, *args, **kwargs):
+            self.thing = model_class(*args, **kwargs)
+
+        def render(self, x, y):
+            self.thing.x = x
+            self.thing.y = y
+            return [self.thing]
+
+        @property
+        def width(self):
+            return self.thing.width
+
+        @property
+        def height(self):
+            return self.thing.height
+
+    return Wrapper
+
+
+class HorizontalLayout:
+    def __init__(self, *arg_children, children=None, padding=PADDING):
+        self.children = list(arg_children)
+        if children is not None:
+            self.children.extend(children)
+        self.padding = padding
+
+    def render(self, x, y, align=Align.TOP):
+        if align is not Align.TOP:
+            height = self.height
+        res = []
+        for child in self.children:
+            if align is Align.TOP:
+                y_ = y
+            elif align is Align.CENTER:
+                y_ = y + (height - child.height) / 2
+            elif align is Align.BOTTOM:
+                y_ = y + (height - child.height)
+            else:
+                raise ValueError(f"Invalid align {align} for HorizontalLayout")
+            res.extend(safe_render(child, x, y_))
+            x += child.width + self.padding
+        return res
+
+    @property
+    def width(self):
+        if not self.children:
+            return 0
+        return self.padding * (len(self.children) - 1) + sum(
+            c.width for c in self.children
+        )
+
+    @property
+    def height(self):
+        if not self.children:
+            return 0
+        return max(c.height for c in self.children)
+
+
+class VerticalLayout:
+    def __init__(self, *arg_children, children=None, padding=PADDING):
+        self.children = list(arg_children)
+        if children is not None:
+            self.children.extend(children)
+        self.padding = padding
+
+    def render(self, x, y):
+        res = []
+        for child in self.children:
+            res.extend(safe_render(child, x, y))
+            y += child.height + self.padding
+        return res
+
+    @property
+    def width(self):
+        if not self.children:
+            return 0
+        return max(c.width for c in self.children)
+
+    @property
+    def height(self):
+        if not self.children:
+            return 0
+        return self.padding * (len(self.children) - 1) + sum(
+            c.height for c in self.children
+        )
+
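+# Illustrative sketch: layouts nest and compute their own bounding boxes, so a
+# scene is assembled by composing layouts and rendering once at the end:
+#
+#     row = HorizontalLayout(
+#         dummy_wrap(LabelModel)(text="a", width=BASE_INC, height=BASE_INC),
+#         Space(BASE_INC, BASE_INC),
+#     )
+#     models = VerticalLayout(row).render(PADDING, PADDING)
+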
+
+class MaybeEditableRow(HorizontalLayout):
+    def __init__(
+        self,
+        device_id,
+        schema_hash,
+        key_path,
+        label_width=7 * NARROW_INC,
+        display_width=5 * NARROW_INC,
+        edit_width=5 * NARROW_INC,
+        height=NARROW_INC,
+    ):
+        super().__init__(padding=0)
+        key_attr = schema_hash.getAttributes(key_path)
+        label_text = (
+            key_attr["displayedName"]
+            if "displayedName" in key_attr
+            else key_path.split(".")[-1]
+        )
+        if "valueType" not in key_attr:
+            print(f"Key {key_path} on {device_id} had no valueType")
+            return
+        value_type = key_attr["valueType"]
+        self.children.extend(
+            [
+                LabelModel(
+                    text=label_text,
+                    width=label_width,
+                    height=height,
+                ),
+                _type_to_display_model.get(value_type, DisplayLabelModel)(
+                    keys=[f"{device_id}.{key_path}"],
+                    width=display_width,
+                    height=height,
+                ),
+            ]
+        )
+        if key_attr["accessMode"] == RECONFIGURABLE:
+            if "options" in key_attr:
+                self.children.append(
+                    ComboBoxModel(
+                        keys=[f"{device_id}.{key_path}"],
+                        width=edit_width,
+                        height=height,
+                        klass="EditableComboBox",
+                    )
+                )
+            elif value_type in _type_to_line_editable:
+                line_editable_class, extra_args = _type_to_line_editable[value_type]
+                self.children.append(
+                    line_editable_class(
+                        keys=[f"{device_id}.{key_path}"],
+                        width=edit_width,
+                        height=height,
+                        **extra_args,
+                    )
+                )
+            else:
+                self.children.append(
+                    LabelModel(
+                        text=f"Not implemented: editing {value_type} ({key_path})",
+                        width=edit_width,
+                        height=height,
+                    )
+                )
+
+
+# section: specific handcrafted components for device classes
+
+
+@titled("Found constants", width=6 * NARROW_INC)
+@boxed
+class FoundConstantsColumn(VerticalLayout):
+    def __init__(self, device_id, schema_hash, prefix="foundConstants"):
+        super().__init__(padding=0)
+        self.children.extend(
+            [
+                HorizontalLayout(
+                    LabelModel(
+                        text=constant_name,
+                        width=6 * NARROW_INC,
+                        height=NARROW_INC,
+                    ),
+                    ColorBoolModel(
+                        width=NARROW_INC,
+                        height=NARROW_INC,
+                        keys=[f"{device_id}.{prefix}.{constant_name}.found"],
+                    ),
+                    DisplayLabelModel(
+                        keys=[f"{device_id}.{prefix}.{constant_name}.validFrom"],
+                        width=8 * BASE_INC,
+                        height=BASE_INC,
+                    ),
+                    padding=0,
+                )
+                for constant_name in schema_hash.get(prefix).getKeys()
+            ]
+        )
+
+
+class ConstantLoadedAmpeln(HorizontalLayout):
+    def __init__(self, device_id, schema_hash, prefix="foundConstants"):
+        super().__init__(padding=0)
+        self.children.extend(
+            [
+                ColorBoolModel(
+                    keys=[f"{device_id}.{prefix}.{key}.found"],
+                    height=BASE_INC,
+                    width=BASE_INC,
+                )
+                for key in schema_hash.get(prefix).getKeys()
+            ]
+        )
+
+
+@titled("Manager status", width=6 * NARROW_INC)
+@boxed
+class ManagerDeviceStatus(VerticalLayout):
+    def __init__(self, device_id):
+        super().__init__(padding=0)
+        name = DisplayLabelModel(
+            keys=[f"{device_id}.deviceId"],
+            width=14 * BASE_INC,
+            height=BASE_INC,
+        )
+        state = DisplayStateColorModel(
+            show_string=True,
+            keys=[f"{device_id}.state"],
+            width=7 * BASE_INC,
+            height=BASE_INC,
+        )
+        restart_button = DisplayCommandModel(
+            keys=[f"{device_id}.restartServers"],
+            width=7 * BASE_INC,
+            height=BASE_INC,
+        )
+        instantiate_button = DisplayCommandModel(
+            keys=[f"{device_id}.startInstantiate"],
+            width=7 * BASE_INC,
+            height=BASE_INC,
+        )
+        apply_button = DisplayCommandModel(
+            keys=[f"{device_id}.applyManagedValues"],
+            width=7 * BASE_INC,
+            height=BASE_INC,
+        )
+        status_log = DisplayTextLogModel(
+            keys=[f"{device_id}.status"],
+            width=14 * BASE_INC,
+            height=14 * BASE_INC,
+        )
+        self.children.extend(
+            [
+                name,
+                HorizontalLayout(
+                    state,
+                    restart_button,
+                    padding=0,
+                ),
+                HorizontalLayout(
+                    instantiate_button,
+                    apply_button,
+                    padding=0,
+                ),
+                DeviceSceneLinkModel(
+                    text="All managed properties",
+                    keys=[f"{device_id}.availableScenes"],
+                    target="browse_schema",
+                    target_window=SceneTargetWindow.Dialog,
+                    width=7 * BASE_INC,
+                    height=BASE_INC,
+                ),
+                status_log,
+            ]
+        )
+
+
+@titled("Device status", width=6 * NARROW_INC)
+@boxed
+class CorrectionDeviceStatus(VerticalLayout):
+    def __init__(self, device_id):
+        super().__init__(padding=0)
+        name = DisplayLabelModel(
+            keys=[f"{device_id}.deviceId"],
+            width=14 * BASE_INC,
+            height=BASE_INC,
+        )
+        state = DisplayStateColorModel(
+            show_string=True,
+            keys=[f"{device_id}.state"],
+            width=7 * BASE_INC,
+            height=BASE_INC,
+        )
+        rate = DisplayRoundedFloat(
+            keys=[f"{device_id}.performance.rate"],
+            width=7 * BASE_INC,
+            height=BASE_INC,
+        )
+        processing_time = DisplayRoundedFloat(
+            keys=[f"{device_id}.performance.processingTime"],
+            width=7 * BASE_INC,
+            height=BASE_INC,
+        )
+        tid = DisplayLabelModel(
+            keys=[f"{device_id}.trainId"],
+            width=7 * BASE_INC,
+            height=BASE_INC,
+        )
+        status_log = DisplayTextLogModel(
+            keys=[f"{device_id}.status"],
+            width=14 * BASE_INC,
+            height=14 * BASE_INC,
+        )
+        self.children.extend(
+            [
+                name,
+                HorizontalLayout(
+                    state,
+                    tid,
+                    padding=0,
+                ),
+                HorizontalLayout(
+                    rate,
+                    processing_time,
+                    padding=0,
+                ),
+                status_log,
+            ]
+        )
+
+
+class CompactCorrectionDeviceOverview(HorizontalLayout):
+    def __init__(self, device_id, schema_hash):
+        super().__init__(padding=0)
+        self.children.extend(
+            [
+                DeviceSceneLinkModel(
+                    text=device_id.split("/")[-1],
+                    keys=[f"{device_id}.availableScenes"],
+                    target="overview",
+                    target_window=SceneTargetWindow.Dialog,
+                    width=5 * BASE_INC,
+                    height=BASE_INC,
+                ),
+                DisplayStateColorModel(
+                    show_string=True,
+                    keys=[f"{device_id}.state"],
+                    width=6 * BASE_INC,
+                    height=BASE_INC,
+                ),
+                DisplayRoundedFloat(
+                    keys=[f"{device_id}.performance.rate"],
+                    width=4 * BASE_INC,
+                    height=BASE_INC,
+                ),
+                DisplayLabelModel(
+                    keys=[f"{device_id}.trainId"],
+                    width=4 * BASE_INC,
+                    height=BASE_INC,
+                ),
+                ConstantLoadedAmpeln(device_id, schema_hash),
+            ]
+        )
+
+
+@titled("Other devices managed")
+@boxed
+class CompactDeviceLinkList(VerticalLayout):
+    def __init__(self, device_ids):
+        super().__init__()
+        self.children.extend(
+            [
+                HorizontalLayout(
+                    DeviceSceneLinkModel(
+                        text=device_id.split("/")[-1],
+                        keys=[f"{device_id}.availableScenes"],
+                        width=7 * BASE_INC,
+                        height=BASE_INC,
+                    ),
+                    DisplayStateColorModel(
+                        show_string=True,
+                        keys=[f"{device_id}.state"],
+                        width=7 * BASE_INC,
+                        height=BASE_INC,
+                    ),
+                    padding=0,
+                )
+                for device_id in device_ids
+            ]
+        )
+
+
+@titled("Assembler status", width=8 * NARROW_INC)
+@boxed
+class AssemblerDeviceStatus(VerticalLayout):
+    def __init__(self, device_id):
+        super().__init__(padding=0)
+        name = DisplayLabelModel(
+            keys=[f"{device_id}.deviceId"],
+            width=14 * BASE_INC,
+            height=BASE_INC,
+        )
+        state = DisplayStateColorModel(
+            show_string=True,
+            keys=[f"{device_id}.state"],
+            width=7 * BASE_INC,
+            height=BASE_INC,
+        )
+        train_id = DisplayLabelModel(
+            keys=[f"{device_id}.trainId"],
+            width=7 * BASE_INC,
+            height=BASE_INC,
+        )
+        self.children.extend(
+            [
+                name,
+                HorizontalLayout(
+                    state,
+                    train_id,
+                    padding=0,
+                ),
+                LabelModel(
+                    text="Image downsampling",
+                    width=14 * BASE_INC,
+                    height=BASE_INC,
+                ),
+                HorizontalLayout(
+                    LabelModel(
+                        text="Factor",
+                        width=7 * BASE_INC,
+                        height=BASE_INC,
+                    ),
+                    ComboBoxModel(
+                        keys=[f"{device_id}.downsamplingFactor"],
+                        width=7 * BASE_INC,
+                        height=BASE_INC,
+                        klass="EditableComboBox",
+                    ),
+                    padding=0,
+                ),
+                HorizontalLayout(
+                    LabelModel(
+                        text="Function",
+                        width=7 * BASE_INC,
+                        height=BASE_INC,
+                    ),
+                    ComboBoxModel(
+                        keys=[f"{device_id}.downsamplingFunction"],
+                        width=7 * BASE_INC,
+                        height=BASE_INC,
+                        klass="EditableComboBox",
+                    ),
+                    padding=0,
+                ),
+                DeviceSceneLinkModel(
+                    text="I'm actually a TrainMatcher",
+                    keys=[f"{device_id}.availableScenes"],
+                    target="trainMatcherScene",
+                    target_window=SceneTargetWindow.Dialog,
+                    width=14 * BASE_INC,
+                    height=BASE_INC,
+                ),
+            ]
+        )
+
+
+# section: generating actual scenes
+
+
+def schema_to_hash(schema):
+    if isinstance(schema, (karathon.Hash, karabo.native.Hash)):
+        return schema
+    elif isinstance(schema, karathon.Schema):
+        return schema.getParameterHash()
+    else:
+        return schema.hash
+
+
+def scene_generator(fun):
+    # TODO: pretty decorator
+    def aux(*args, **kwargs):
+        content = fun(*args, **kwargs)
+
+        scene = SceneModel(
+            children=content.render(PADDING, PADDING),
+            width=content.width + 2 * PADDING,
+            height=content.height + 2 * PADDING,
+        )
+        return write_scene(scene)
+
+    return aux
+
+
+@scene_generator
+def correction_device_overview_scene(device_id, schema):
+    schema_hash = schema_to_hash(schema)
+
+    return HorizontalLayout(
+        CorrectionDeviceStatus(device_id),
+        VerticalLayout(
+            recursive_maybe_editable(
+                device_id,
+                schema_hash,
+                "constantParameters",
+            ),
+            DisplayCommandModel(
+                keys=[f"{device_id}.loadMostRecentConstants"],
+                width=10 * BASE_INC,
+                height=BASE_INC,
+            ),
+        ),
+        FoundConstantsColumn(device_id, schema_hash),
+        recursive_maybe_editable(
+            device_id,
+            schema_hash,
+            "corrections",
+            max_depth=2,
+        ),
+    )
+
+
+@scene_generator
+def manager_device_overview_scene(
+    manager_device_id,
+    manager_device_schema,
+    correction_device_schema,
+    correction_device_ids,
+    domain_device_ids,
+):
+    mds_hash = schema_to_hash(manager_device_schema)
+    cds_hash = schema_to_hash(correction_device_schema)
+
+    return VerticalLayout(
+        HorizontalLayout(
+            ManagerDeviceStatus(manager_device_id),
+            VerticalLayout(
+                recursive_maybe_editable(
+                    manager_device_id,
+                    mds_hash,
+                    "managed.constantParameters",
+                ),
+                DisplayCommandModel(
+                    keys=[f"{manager_device_id}.managed.loadMostRecentConstants"],
+                    width=10 * BASE_INC,
+                    height=BASE_INC,
+                ),
+                recursive_maybe_editable(
+                    manager_device_id,
+                    mds_hash,
+                    "managed.preview",
+                    max_depth=2,
+                ),
+            ),
+            recursive_maybe_editable(
+                manager_device_id,
+                mds_hash,
+                "managed.corrections",
+                max_depth=2,
+            ),
+        ),
+        HorizontalLayout(
+            titled("Correction devices", width=8 * NARROW_INC)(boxed(VerticalLayout))(
+                children=[
+                    CompactCorrectionDeviceOverview(device_id, cds_hash)
+                    for device_id in sorted(correction_device_ids)
+                ],
+                padding=0,
+            ),
+            CompactDeviceLinkList(
+                sorted(
+                    set(domain_device_ids)
+                    - set(correction_device_ids)
+                    - {manager_device_id}
+                )
+            ),
+        ),
+    )
+
+
+@scene_generator
+def simple_assembler_overview(device_id, geometry_device_id):
+    return VerticalLayout(
+        HorizontalLayout(
+            AssemblerDeviceStatus(device_id),
+            titled("My geometry device")(boxed(VerticalLayout))(
+                DeviceSceneLinkModel(
+                    text=f"Geometry device: {geometry_device_id}",
+                    keys=[f"{geometry_device_id}.availableScenes"],
+                    target="overview",
+                    target_window=SceneTargetWindow.Dialog,
+                    frame_width=1,
+                    width=14 * BASE_INC,
+                    height=BASE_INC,
+                ),
+                DisplayCommandModel(
+                    keys=[f"{geometry_device_id}.pleaseSendYourGeometry"],
+                    width=14 * BASE_INC,
+                    height=BASE_INC,
+                ),
+                padding=0,
+            ),
+        ),
+        titled("Preview image")(boxed(dummy_wrap(DetectorGraphModel)))(
+            keys=[f"{device_id}.preview.output.schema.image"],
+            colormap="viridis",
+            width=30 * BASE_INC,
+            height=30 * BASE_INC,
+            x=PADDING,
+            y=PADDING,
+        ),
+    )
+
+
+# section: here be monsters
+
+
+def recursive_maybe_editable(
+    device_id, schema_hash, prefix, depth=1, max_depth=3, title=None
+):
+    schema_hash = schema_to_hash(schema_hash)
+    # note: not just using sets because that loses ordering
+    node_keys = []
+    value_keys = []
+    slot_keys = []
+    attr = schema_hash.getAttributes(prefix)
+
+    if title is None:
+        if "displayedName" in attr:
+            title = attr.get("displayedName")
+        else:
+            title = prefix.split(".")[-1]
+
+    for key in schema_hash.get(prefix).getKeys():
+        attrs = schema_hash.getAttributes(f"{prefix}.{key}")
+        if attrs.get("nodeType") == NODE_TYPE_NODE:
+            if "classId" in attrs and attrs.get("classId") == "Slot":
+                slot_keys.append(key)
+            else:
+                node_keys.append(key)
+        else:
+            value_keys.append(key)
+    res = titled(title)(boxed(VerticalLayout))(
+        children=[
+            MaybeEditableRow(device_id, schema_hash, f"{prefix}.{key}")
+            for key in value_keys
+        ]
+        + [
+            DisplayCommandModel(
+                keys=[f"{device_id}.{prefix}.{key}"],
+                width=10 * BASE_INC,
+                height=BASE_INC,
+            )
+            for key in slot_keys
+        ],
+        padding=0,
+    )
+    if depth < max_depth:
+        res.children.append(
+            VerticalLayout(
+                children=[
+                    recursive_maybe_editable(
+                        device_id,
+                        schema_hash,
+                        f"{prefix}.{key}",
+                        depth=depth + 1,
+                        max_depth=max_depth,
+                    )
+                    for key in node_keys
+                ]
+            )
+        )
+    else:
+        res.children.extend(
+            [
+                VerticalLayout(
+                    DeviceSceneLinkModel(
+                        text=key,
+                        keys=[f"{device_id}.availableScenes"],
+                        target=f"browse_schema:{prefix}.{key}",
+                        target_window=SceneTargetWindow.Dialog,
+                        width=5 * BASE_INC,
+                        height=BASE_INC,
+                    ),
+                )
+                for key in node_keys
+            ]
+        )
+    return res
+
+
+@scene_generator
+def recursive_subschema_scene(
+    device_id,
+    device_schema,
+    prefix="managed",
+):
+    mds_hash = schema_to_hash(device_schema)
+    return recursive_maybe_editable(device_id, mds_hash, prefix)
diff --git a/src/calng/shmem_utils.py b/src/calng/shmem_utils.py
index d4cc860fdb2f253514b00170bc8262d8e1104652..0a82cdf6eaa6a5e0bf065e6eda1812597db1f3d0 100644
--- a/src/calng/shmem_utils.py
+++ b/src/calng/shmem_utils.py
@@ -23,6 +23,30 @@ def open_shmem_from_handle(handle_string):
     return buffer_mem, array
 
 
+class ShmemCircularBufferReceiver:
+    def __init__(self):
+        self._name_to_mem = {}
+        self._name_to_ary = {}
+
+    def get(self, handle_string):
+        name, dtype, shape, index = parse_shmem_handle(handle_string)
+        if name not in self._name_to_mem:
+            mem = posixshmem.SharedMemory(name=name, rw=False)
+            self._name_to_mem[name] = mem
+            ary = mem.ndarray(shape=shape, dtype=dtype)
+            self._name_to_ary[name] = ary
+            return ary[index]
+
+        ary = self._name_to_ary[name]
+        if ary.shape != shape or ary.dtype != dtype:
+            del ary
+            mem = self._name_to_mem[name]
+            ary = mem.ndarray(shape=shape, dtype=dtype)
+            self._name_to_ary[name] = ary
+
+        return ary[index]
+
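+# Illustrative sketch: the receiver maps each shared memory segment once and
+# only re-maps if the sender's shape or dtype changed; handle strings are those
+# produced by the sending side.
+#
+#     receiver = ShmemCircularBufferReceiver()
+#     data = receiver.get(handle_string)  # handle_string from input channel
+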
+
 class ShmemCircularBuffer:
     """Convenience wrapper around posixshmem-backed ndarray buffers
 
@@ -42,6 +66,9 @@ class ShmemCircularBuffer:
         )
         self._buffer_ary = None
         self._update_shape(array_shape, dtype)
+        self._cuda_pinned = False
+        # important for performance and pinning: touch memory to actually allocate
+        self._buffer_ary.fill(0)
 
     def _update_shape(self, array_shape, dtype):
         array_shape = tuple(array_shape)
@@ -72,6 +99,22 @@ class ShmemCircularBuffer:
             dtype = self._buffer_ary.dtype
         self._update_shape(array_shape, dtype)
 
+    def cuda_pin(self):
+        import cupy
+        self._memory_pointer = self._buffer_ary.ctypes.get_data()
+        cupy.cuda.runtime.hostRegister(
+            self._memory_pointer,
+            self._shared_memory.size,
+            0,  # cudaHostRegisterDefault
+        )
+        # remember this so __del__ knows to hostUnregister
+        self._cuda_pinned = True
+
+    def __del__(self):
+        if self._cuda_pinned:
+            import cupy
+            cupy.cuda.runtime.hostUnregister(self._memory_pointer)
+        del self._buffer_ary
+        del self._shared_memory
+
     @property
     def num_slots(self):
         return self._buffer_ary.shape[0]
diff --git a/src/calng/utils.py b/src/calng/utils.py
index 424e45de992a6f2e7599ed089d27f4ba72ec798c..a623731de3faf347817b816e74f295a7ce7f6bf2 100644
--- a/src/calng/utils.py
+++ b/src/calng/utils.py
@@ -1,9 +1,115 @@
+import collections
+import functools
+import inspect
 import threading
 import time
-import timeit
+from timeit import default_timer
 
 import numpy as np
 
+
+def pick_frame_index(selection_mode, index, cell_table, pulse_table, warn_func=None):
+    """When selecting a single frame to preview, an obvious question is whether the
+    number the operator provides is a frame index, a cell ID, or a pulse ID. This
+    function allows any of the three, translating into frame index.
+
+    As this will be used by correction devices, the warn_func parameter lets the
+    function report problems as warnings via that callable instead of raising
+    exceptions.
+
+    Indices below zero are special values and thus returned directly.
+
+    Returns: (found frame index, corresponding cell ID, corresponding pulse ID)"""
+
+    if index < 0:
+        return index, index, index
+
+    # TODO: enum
+    if selection_mode == "frame":
+        if index >= cell_table.size:
+            if warn_func is not None:
+                warn_func(
+                    f"Index {index} out of range for cell table of length "
+                    f"{len(cell_table)}, returning index 0 instead"
+                )
+            frame_index = 0
+        else:
+            frame_index = index
+
+        return frame_index, cell_table[frame_index], pulse_table[frame_index]
+    elif selection_mode == "cell":
+        found = np.where(cell_table == index)[0]
+        if len(found) > 0:
+            cell = index
+            frame_index = found[0]
+        else:
+            cell = cell_table[0]
+            if warn_func is not None:
+                warn_func(
+                    f"Cell {index} not found, arbitrary cell {cell} returned instead"
+                )
+            frame_index = 0
+        return frame_index, cell, pulse_table[frame_index]
+    elif selection_mode == "pulse":
+        found = np.where(pulse_table == index)[0]
+        if len(found) > 0:
+            pulse = index
+            frame_index = found[0]
+        else:
+            pulse = pulse_table[0]
+            if warn_func is not None:
+                warn_func(
+                    f"Pulse {index} not found, arbitrary pulse {pulse} returned instead"
+                )
+            frame_index = 0
+        return frame_index, cell_table[frame_index], pulse
+    else:
+        raise ValueError(f"Invalid selection mode '{selection_mode}'")
+
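+# Illustrative example (made-up tables): with cell_table = np.array([4, 0, 2])
+# and pulse_table = np.array([8, 0, 4]), pick_frame_index("cell", 2, cell_table,
+# pulse_table) returns (2, 2, 4): frame 2 is the one exposed during cell 2.
+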
+
+def threadsafe_cache(fun):
+    """This decorator imitates functools.cache, but threadsafer
+
+    With multiple threads hitting a function cached by functools.cache, it is possible
+    to trigger recomputation. This decorator adds granular locking: each key in the
+    cache (derived from arguments) has its own lock.
+    """
+
+    locks = {}
+    results = {}
+    fun_sig = inspect.signature(fun)
+
+    @functools.wraps(fun)
+    def aux(*args, **kwargs):
+        bound_args = fun_sig.bind(*args, **kwargs)
+        bound_args.apply_defaults()
+        key = bound_args.args + tuple(bound_args.kwargs.items())
+        if key in results:
+            return results[key]
+        with locks.setdefault(key, threading.Lock()):
+            if key in results:
+                # another thread computed this while we waited for the lock
+                return results[key]
+            else:
+                res = fun(*args, **kwargs)
+                results[key] = res
+                return res
+
+    return aux
+
+
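+# Illustrative sketch (expensive_lookup is hypothetical): concurrent first calls
+# with the same arguments compute the value only once. Note that exceptions are
+# not cached (see src/tests/problem.py), so a failed call will be retried.
+#
+#     @threadsafe_cache
+#     def load_constant(name):
+#         return expensive_lookup(name)
+
+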
+@functools.lru_cache()
+def transpose_order(axes_in, axes_out):
+    """Computes the order of axes_out relative to axes_in for transposition purposes
+
+    Both axes_in and axes_out are assumed to be strings in which each letter represents
+    an axis (duck typing accepts: any iterable of hashable elements). They should
+    probably be of the same length and have no repetitions, but this is not enforced.
+    Off-label use voids warranty.
+    """
+    axis_order = {axis: index for index, axis in enumerate(axes_in)}
+    return tuple(axis_order[axis] for axis in axes_out)
+
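+# Illustrative example: transpose_order("fxy", "xyf") == (1, 2, 0), suitable for
+# np.transpose(data, transpose_order("fxy", "xyf")) on frame/x/y-ordered data.
+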
+
 _np_typechar_to_c_typestring = {
     "?": "bool",
     "B": "unsigned char",
@@ -31,6 +137,14 @@ def np_dtype_to_c_type(dtype):
     return _np_typechar_to_c_typestring[as_char]
 
 
+def enum_to_c_template(enum_class):
+    res = [f"enum {enum_class.__name__} {{"]
+    for field in enum_class:
+        res.append(f"\t{field.name} = {field.value},")
+    res.append("};")
+    return "\n".join(res)
+
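+# Illustrative example: for an enum class CorrectionFlags with a single member
+# THRESHOLD = 1, this returns (tab-indented) C source along the lines of:
+#
+#     enum CorrectionFlags {
+#         THRESHOLD = 1,
+#     };
+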
+
 def ceil_div(num, denom):
     return (num + denom - 1) // denom
 
@@ -43,99 +157,201 @@ def shape_after_transpose(input_shape, transpose_pattern, squeeze=True):
     return tuple(np.array(input_shape)[list(transpose_pattern)].tolist())
 
 
-class DelayableTimer:
-    """Start a timer which can be extended
-
-    Useful for reverting to state after inactivity, for instance.
-
-    timer defaults to timeit.default_timer - it should be a timer returning
-    globally increasing number of seconds.
-    """
-
-    def __init__(self, timeout, callback, timer=timeit.default_timer):
-        self.timer = timer
-        self.stop_time = self.timer() + timeout
-
-        def runner():
-            now = self.timer()
-            while now < self.stop_time:
-                diff = self.stop_time - now
-                time.sleep(diff)
-                now = self.timer()
-            callback()
-
-        self.thread = threading.Thread(target=runner)
-        self.thread.start()
-
-    def set_timeout(self, timeout):
-        """Delay stop time to now + timeout
-
-        If now + timeout is sooner than already set timeout, this does nothing"""
-        self.stop_time = self.timer() + timeout
-
-    def add_timeout(self, timeout):
-        """Simply add timeout to current stop time"""
-        self.stop_time += timeout
-
-
 class RepeatingTimer:
-    """Similar to DelayableTimer, but will keep running with pre-set intervals"""
-
-    def __init__(self, interval, callback, timer=timeit.default_timer, start_now=True):
-        self.timer = timer
+    """A timer which will call callback every interval seconds"""
+
+    def __init__(
+        self,
+        interval,
+        callback,
+        start_now=True,
+        daemon=True,
+    ):
         self.stopped = True
         self.interval = interval
         self.callback = callback
+        self.daemonize = daemon
         if start_now:
             self.start()
 
-    def delay(self):
-        self.stop_time = self.timer() + self.interval
-
     def start(self):
         self.stopped = False
-        self.stop_time = self.timer() + self.interval
+        self.wakeup_time = default_timer() + self.interval
 
         def runner():
             while not self.stopped:
-                now = self.timer()
-                while now < self.stop_time:
-                    diff = self.stop_time - now
+                now = default_timer()
+                while now < self.wakeup_time:
+                    diff = self.wakeup_time - now
                     time.sleep(diff)
                     if self.stopped:
                         return
-                    now = self.timer()
+                    now = default_timer()
                 self.callback()
-                self.stop_time = self.timer() + self.interval
+                self.wakeup_time = default_timer() + self.interval
 
-        self.thread = threading.Thread(target=runner)
+        self.thread = threading.Thread(target=runner, daemon=self.daemonize)
         self.thread.start()
 
     def stop(self):
         self.stopped = True
 
 
-class Throttler:
-    """Similar to DelayableTimer, but will keep running with pre-set intervals"""
-
-    def __init__(self, interval, timer=timeit.default_timer):
-        self.timer = timer
-        self.interval = interval
-        self.latest_call = None
+class ExponentialMovingAverage:
+    def __init__(self, alpha, use_first_value=True):
+        self.alpha = alpha
+        self.initialised = not use_first_value
+        self.mean = 0
 
-    def ready(self):
-        if self.latest_call is None:
-            return True
+    def update(self, value):
+        if self.initialised:
+            self.mean += self.alpha * (value - self.mean)
         else:
-            return self.latest_call + self.interval <= self.timer()
+            self.mean = value
+            self.initialised = True
+
+    def get(self):
+        return self.mean
+
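+# Illustrative example: with use_first_value=True the first sample seeds the
+# mean; afterwards mean += alpha * (value - mean) applies.
+#
+#     ema = ExponentialMovingAverage(alpha=0.5)
+#     ema.update(0)   # mean seeded to 0.0
+#     ema.update(10)  # mean is now 5.0
+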
+
+class WindowRateTracker:
+    def __init__(self, buffer_size=20, time_window=10):
+        self.time_window = time_window
+        self.buffer_size = buffer_size
+        self.deque = collections.deque(maxlen=self.buffer_size)
 
     def update(self):
-        self.latest_call = self.timer()
+        self.deque.append(default_timer())
+
+    def get(self):
+        now = default_timer()
+        cutoff = now - self.time_window
+        try:
+            while self.deque[0] < cutoff:
+                self.deque.popleft()
+        except IndexError:
+            return 0
+        if len(self.deque) < 2:
+            return 0
+        if len(self.deque) < self.buffer_size:
+            # TODO: estimator avoiding ramp-up when starting anew
+            return len(self.deque) / self.time_window
+        else:
+            # if going faster than buffer size per time window, look at timestamps
+            oldest, newest = self.deque[0], self.deque[-1]
+            buffer_span = newest - oldest
+            period = buffer_span / (self.buffer_size - 1)
+            if (now - newest) < period:
+                # no new estimate yet, expecting new event after period
+                return 1 / period
+            else:
+                return self.buffer_size / (now - oldest)
+
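+# Illustrative sketch: call update() once per event; get() then estimates the
+# recent event rate (per second) over the last time_window seconds, e.g. for a
+# performance.rate property.
+#
+#     rate_tracker = WindowRateTracker()
+#     rate_tracker.update()      # on each processed train
+#     rate = rate_tracker.get()
+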
+
+class Stopwatch:
+    """Context manager measuring time spent in context.
+
+    Keyword arguments:
+    name: if not None, it will appear in the string representation, and the
+          stopwatch will automatically print itself when the context exits
+    """
 
-    def ready_update(self):
-        time = self.timer()
-        if self.latest_call is None or self.latest_call + self.interval <= time:
-            self.latest_call = time
-            return True
+    def __init__(self, name=None):
+        self.stop_time = None
+        self.name = name
+
+    def __enter__(self):
+        self.start_time = default_timer()
+        return self
+
+    def __exit__(self, t, v, tb):  # type, value and traceback irrelevant
+        self.stop_time = default_timer()
+        if self.name is not None:
+            print(repr(self))
+
+    @property
+    def elapsed(self):
+        if self.stop_time is not None:
+            return self.stop_time - self.start_time
+        else:
+            return default_timer() - self.start_time
+
+    def __str__(self):
+        return self.__repr__()
+
+    def __repr__(self):
+        if self.name is None:
+            return f"{self.elapsed:.3f} s"
         else:
-            return False
+            return f"{self.name}: {self.elapsed:.3f} s"
+
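+# Illustrative sketch (do_work is hypothetical): a named stopwatch prints
+# itself when the context exits; elapsed remains available afterwards.
+#
+#     with Stopwatch(name="correct") as timer:
+#         do_work()
+#     # prints "correct: 0.123 s"; timer.elapsed still accessible here
+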
+
+class TrainRatioTracker:
+    """Measure how many percent of recent train IDs (from contiguous set) were seen
+
+    The tracker will maintain a queue of buffer_size train IDs going back at most
+    buffer_size from latest train ID (depending on calls to get). Call update(train_id)
+    when you see a new train, and call get() for the ratio of recent trains seen.
+
+    In case warn_callback is given, update can issue a warning in case invalid train
+    IDs are received. The tracker assumes trains are strictly increasing and that they
+    are supposed to be contiguous - hence the ability to infer when some are missing.
+    """
+
+    def __init__(self, buffer_size=50, warn_callback=None):
+        self._train_id_queue = collections.deque(maxlen=buffer_size)
+        self._train_id_range = buffer_size
+        self._warn_callback = warn_callback
+
+    def get(self, current_train=None):
+        """Get the ratio of recent trains until current_train or latest updated.
+
+        If current_train is not provided, then the range considered in computing the
+        ratio will be from latest train_id in update (going back buffer_size trains).
+
+        If you happen to know a more current train ID that has not been given in a
+        call to update (maybe you are receiving invalid trains and don't count them),
+        you can give this as current_train, yielding a lower ratio. Note that this will
+        trim the queue, so a subsequent call with lower or no current_train will return
+        an incorrectly low ratio.
+        """
+        if len(self._train_id_queue) == 0:
+            return 0
+
+        if current_train is None:
+            current_train = self._train_id_queue[-1]
+
+        cutoff = current_train - self._train_id_range + 1
+        try:
+            while self._train_id_queue[0] < cutoff:
+                self._train_id_queue.popleft()
+        except IndexError:
+            return 0
+
+        # TODO: avoid estimator ramp-up (don't initially divide by full range)
+        return len(self._train_id_queue) * 100 / self._train_id_range
+
+    def update(self, train_id):
+        if (
+            len(self._train_id_queue) > 0
+            and self._train_id_queue[-1] >= train_id
+            and self._warn_callback is not None
+        ):
+            self._warn_callback(
+                f"New train ID {train_id} not greater than last thing in queue, "
+                f"{self._train_id_queue[-1]}, just thought you should know..."
+            )
+        self._train_id_queue.append(train_id)
+
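+# Illustrative sketch: with buffer_size=50, having seen 45 of the last 50
+# contiguous train IDs makes get() return 90.
+#
+#     tracker = TrainRatioTracker(buffer_size=50)
+#     tracker.update(train_id)       # on each train received
+#     ratio_percent = tracker.get()  # 0-100
+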
+
+class ChainHash:
+    """Like read-only ChainMap, but for karabo.bound.Hash(es) instead!"""
+
+    def __init__(self, *hashes):
+        self._hashes = hashes
+
+    def __getitem__(self, key):
+        for h in self._hashes:
+            if h.has(key):
+                return h[key]
+        raise KeyError()
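+
+
+# Illustrative sketch (override_hash and default_hash are hypothetical): earlier
+# hashes take precedence, later ones act as fallbacks.
+#
+#     merged = ChainHash(override_hash, default_hash)
+#     merged["some.key"]  # value from override_hash if set there, else default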
diff --git a/src/tests/problem.py b/src/tests/problem.py
new file mode 100644
index 0000000000000000000000000000000000000000..97c2ca2cbb25310492561d267a482147c0751ce9
--- /dev/null
+++ b/src/tests/problem.py
@@ -0,0 +1,20 @@
+from calng import utils
+
+
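+# Demonstrates a caveat of utils.threadsafe_cache: exceptions are not cached,
+# so a first call that raises leaves the cache empty and a second call with
+# the same arguments recomputes (and here succeeds).
+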
+calls = 0
+
+
+@utils.threadsafe_cache
+def will_raise_once(argument):
+    global calls
+    calls += 1
+    if calls == 1:
+        raise Exception("That's just what I do")
+    return argument + 1
+
+try:
+    will_raise_once(0)
+except Exception as ex:
+    print("As expected, firs call raised:", ex)
+
+print("Now calling again:")
+print(will_raise_once(0))
diff --git a/src/tests/test_agipd_kernels.py b/src/tests/test_agipd_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..73b776aba85acf5e34bb5a671f3b484e3c57b8b9
--- /dev/null
+++ b/src/tests/test_agipd_kernels.py
@@ -0,0 +1,119 @@
+import h5py
+import numpy as np
+import pathlib
+import pytest
+
+from calng import AgipdCorrection
+
+input_dtype = np.uint16
+output_dtype = np.float16
+corr_dtype = np.float32
+pixels_x = 512
+pixels_y = 128
+memory_cells = 352
+
+raw_data = np.random.randint(
+    low=0, high=2000, size=(memory_cells, 2, pixels_x, pixels_y), dtype=input_dtype
+)
+image_data = raw_data[:, 0]
+raw_gain = raw_data[:, 1]
+cell_table = np.arange(memory_cells, dtype=np.uint16)
+np.random.shuffle(cell_table)
+
+caldb_store = pathlib.Path("/gpfs/exfel/d/cal/caldb_store/xfel/cal")
+caldb_prefix = caldb_store / "agipd-type/agipd_siv1_agipdv11_m305"
+
+with h5py.File(caldb_prefix / "cal.1619543695.4679213.h5", "r") as fd:
+    thresholds = np.array(fd["/AGIPD_SIV1_AGIPDV11_M305/ThresholdsDark/0/data"])
+with h5py.File(caldb_prefix / "cal.1619543664.1545036.h5", "r") as fd:
+    offset_map = np.array(fd["/AGIPD_SIV1_AGIPDV11_M305/Offset/0/data"])
+with h5py.File(caldb_prefix / "cal.1615377705.8904035.h5", "r") as fd:
+    slopes_pc_map = np.array(fd["/AGIPD_SIV1_AGIPDV11_M305/SlopesPC/0/data"])
+
+kernel_runner = AgipdCorrection.AgipdGpuRunner(
+    pixels_x,
+    pixels_y,
+    memory_cells,
+    constant_memory_cells=memory_cells,
+    input_data_dtype=input_dtype,
+    output_data_dtype=output_dtype,
+)
+
+
+def thresholding_cpu(data, cell_table, thresholds):
+    # get to memory_cell, x, y
+    raw_gain = data[:, 1, ...].astype(corr_dtype)
+    # get to threshold, memory_cell, x, y
+    thresholds = np.transpose(thresholds)[:, cell_table]
+    res = np.zeros((memory_cells, pixels_x, pixels_y), dtype=np.uint8)
+    res[raw_gain > thresholds[0]] = 1
+    res[raw_gain > thresholds[1]] = 2
+    return res
+
+
+gain_map_cpu = thresholding_cpu(raw_data, cell_table, thresholds)
+
+
+def corr_offset_cpu(data, cell_table, gain_map, offset):
+    image_data = data[:, 0].astype(corr_dtype)
+    offset = np.transpose(offset)[:, cell_table]
+    return (image_data - np.choose(gain_map, offset)).astype(output_dtype)
+
+
+def corr_rel_gain_pc_cpu(data, cell_table, gain_map, slopes_pc):
+    slopes_pc = slopes_pc.astype(np.float32)
+    pc_high_m = slopes_pc[0]
+    pc_high_I = slopes_pc[1]
+    pc_med_m = slopes_pc[3]
+    pc_med_I = slopes_pc[4]
+    frac_high_med = pc_high_m / pc_med_m
+    md_additional_offset = pc_high_I - pc_med_I * frac_high_med
+    rel_gain_map = np.ones((3, pixels_x, pixels_y, memory_cells), dtype=np.float32)
+    rel_gain_map[0] = 1  # rel xray gain can come after
+    rel_gain_map[1] = rel_gain_map[0] * np.transpose(frac_high_med)
+    rel_gain_map[2] = rel_gain_map[1] * 4.48
+    res = data[:, 0].astype(corr_dtype, copy=True)
+    res *= np.choose(gain_map, np.transpose(rel_gain_map, (0, 3, 1, 2)))
+    pixels_in_medium_gain = gain_map == 1
+    res[pixels_in_medium_gain] += np.transpose(md_additional_offset, (0, 2, 1))[
+        pixels_in_medium_gain
+    ]
+    return res
+
+
+def test_thresholding():
+    kernel_runner.load_cell_table(cell_table)
+    kernel_runner.load_data(raw_data)
+    kernel_runner.load_thresholds(thresholds)
+    kernel_runner.correct(AgipdCorrection.CorrectionFlags.THRESHOLD)
+    gpu_res = kernel_runner.gain_map_gpu.get()
+    assert np.allclose(gpu_res, gain_map_cpu)
+
+
+def test_offset():
+    kernel_runner.load_cell_table(cell_table)
+    kernel_runner.load_data(raw_data)
+    kernel_runner.load_thresholds(thresholds)
+    kernel_runner.load_offset_map(offset_map)
+    # have to do thresholding, otherwise all is treated as high gain
+    kernel_runner.correct(
+        AgipdCorrection.CorrectionFlags.THRESHOLD
+        | AgipdCorrection.CorrectionFlags.OFFSET
+    )
+    cpu_res = corr_offset_cpu(raw_data, cell_table, gain_map_cpu, offset_map)
+    gpu_res = kernel_runner.processed_data_gpu.get()
+    assert np.allclose(gpu_res, cpu_res)
+
+
+def test_rel_gain_pc():
+    kernel_runner.load_cell_table(cell_table)
+    kernel_runner.load_data(raw_data)
+    kernel_runner.load_thresholds(thresholds)
+    kernel_runner.load_rel_gain_pc_map(slopes_pc_map)
+    kernel_runner.correct(
+        AgipdCorrection.CorrectionFlags.THRESHOLD
+        | AgipdCorrection.CorrectionFlags.REL_GAIN_PC
+    )
+    cpu_res = corr_rel_gain_pc_cpu(raw_data, cell_table, gain_map_cpu, slopes_pc_map)
+    gpu_res = kernel_runner.processed_data_gpu.get()
+    assert np.allclose(gpu_res, cpu_res)
diff --git a/src/tests/test_calcat_utils.py b/src/tests/test_calcat_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b647c805876766ccc9fc66a36576788f3f216e5
--- /dev/null
+++ b/src/tests/test_calcat_utils.py
@@ -0,0 +1,154 @@
+import pathlib
+
+from calng import AgipdCorrection, DsscCorrection
+from calng.utils import Stopwatch
+from karabo.bound import Hash, Schema
+import pytest
+
+# TODO: secrets management
+_test_dir = pathlib.Path(__file__).absolute().parent
+_test_calcat_secrets_fn = _test_dir / "calibration-client-secrets.json"
+
+
+class DummyLogger:
+    DEBUG = print
+    INFO = print
+    WARN = print
+
+
+class DummyBaseDevice:
+    log = DummyLogger()
+
+    def log_status_info(self, msg):
+        self.log.INFO(msg)
+
+    def log_status_warn(self, msg):
+        self.log.WARN(msg)
+
+    def get(self, key):
+        return self.schema.get(key)
+
+    def set(self, key, value):
+        print(f'Would set "{key}" = {value}')
+
+
+# TODO: consider testing by attaching to real karabo.bound.PythonDevice
+class DummyAgipdDevice(DummyBaseDevice):
+    device_class_schema = Schema()
+    managed_keys = set()
+
+    @staticmethod
+    def expectedParameters(expected):
+        AgipdCorrection.AgipdCalcatFriend.add_schema(
+            expected, DummyAgipdDevice.managed_keys
+        )
+
+    def __init__(self, config):
+        self.schema = config
+        self.calibration_constant_manager = AgipdCorrection.AgipdCalcatFriend(
+            self,
+            _test_calcat_secrets_fn,
+        )
+        print(self.managed_keys)
+
+
+DummyAgipdDevice.expectedParameters(DummyAgipdDevice.device_class_schema)
+
+
+class DummyDsscDevice(DummyBaseDevice):
+    device_class_schema = Schema()
+    managed_keys = set()
+
+    @staticmethod
+    def expectedParameters(expected):
+        DsscCorrection.DsscCalcatFriend.add_schema(
+            expected, DummyDsscDevice.managed_keys
+        )
+
+    def __init__(self, config):
+        # TODO: check config against schema (as Karabo would)
+        self.schema = config
+        self.calibration_constant_manager = DsscCorrection.DsscCalcatFriend(
+            self,
+            _test_calcat_secrets_fn,
+        )
+
+
+DummyDsscDevice.expectedParameters(DummyDsscDevice.device_class_schema)
+
+
+@pytest.mark.skip(reason="Async currently behind lock, so no concurrent fun")
+def test_agipd_constants_and_caching_and_async():
+    # def test_agipd_constants():
+    conf = Hash()
+    conf["constantParameters.detectorType"] = "AGIPD-Type"
+    conf["constantParameters.detectorName"] = "SPB_DET_AGIPD1M-1"
+    conf["constantParameters.karaboDa"] = "AGIPD00"
+    conf["constantParameters.pixelsX"] = 512
+    conf["constantParameters.pixelsY"] = 128
+    conf["constantParameters.memoryCells"] = 352
+    conf["constantParameters.acquisitionRate"] = 1.1
+    conf["constantParameters.biasVoltage"] = 300
+    conf["constantParameters.gainSetting"] = 0
+    conf["constantParameters.photonEnergy"] = 9.2
+    device = DummyAgipdDevice(conf)
+
+    def backcall(constant_name, metadata_and_data):
+        # TODO: think of something reasonable to check
+        data = metadata_and_data
+        assert data.nbytes > 1000
+
+    with Stopwatch() as timer_async_cold:
+        # TODO: put this sort of thing in BaseCalcatFriend
+        threads = []
+        for constant in AgipdCorrection.AgipdConstants:
+            thread = device.calibration_constant_manager.get_constant_version_and_call_me_back(
+                constant, backcall
+            )
+            threads.append(thread)
+        for thread in threads:
+            thread.join()
+
+    with Stopwatch() as timer_async_warm:
+        threads = []
+        for constant in AgipdCorrection.AgipdConstants:
+            thread = device.calibration_constant_manager.get_constant_version_and_call_me_back(
+                constant, backcall
+            )
+            threads.append(thread)
+        for thread in threads:
+            thread.join()
+
+    with Stopwatch() as timer_sync_warm:
+        for constant in AgipdCorrection.AgipdConstants:
+            data = device.calibration_constant_manager.get_constant_version(
+                constant,
+            )
+            assert data.nbytes > 1000, "Should find some constant data"
+
+    print(f"Cold async took {timer_async_cold.elapsed} s")
+    print(f"Warm async took {timer_async_warm.elapsed} s")
+    print(f"Warm sync took {timer_sync_warm.elapsed} s")
+    assert (
+        timer_async_cold.elapsed > timer_async_warm.elapsed
+    ), "Caching should make second go faster"
+    assert timer_sync_warm.elapsed > timer_async_warm.elapsed, "Async should be faster"
+
+
+def test_dssc_constants():
+    conf = Hash()
+    conf["constantParameters.detectorType"] = "DSSC-Type"
+    conf["constantParameters.detectorName"] = "SCS_DET_DSSC1M-1"
+    conf["constantParameters.karaboDa"] = "DSSC00"
+    conf["constantParameters.memoryCells"] = 400
+    conf["constantParameters.biasVoltage"] = 100
+    conf["constantParameters.pixelsX"] = 512
+    conf["constantParameters.pixelsY"] = 128
+    # conf["constantParameters.pulseIdChecksum"] = 2.8866323107820637e-36
+    # conf["constantParameters.acquisitionRate"] = 4.5
+    # conf["constantParameters.encodedGain"] = 67328
+    device = DummyDsscDevice(conf)
+    offset_map = device.calibration_constant_manager.get_constant_version("Offset")
+
+    assert offset_map is not None
diff --git a/src/tests/test_dssc_kernels.py b/src/tests/test_dssc_kernels.py
index 02369c9892a4380f2aed68578cb721afe3150401..95c3ec55c580aa216eef871006955109c0a31ee4 100644
--- a/src/tests/test_dssc_kernels.py
+++ b/src/tests/test_dssc_kernels.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
 
-from calng import dssc_gpu
+from calng import DsscCorrection
 
 input_dtype = np.uint16
 output_dtype = np.float16
@@ -31,7 +31,7 @@ corrected_data = correct_cpu(raw_data, cell_table, offset_map)
 only_cast_data = np.squeeze(raw_data).astype(output_dtype)
 
 
-kernel_runner = dssc_gpu.DsscGpuRunner(
+kernel_runner = DsscCorrection.DsscGpuRunner(
     pixels_x,
     pixels_y,
     memory_cells,
@@ -43,28 +43,28 @@ kernel_runner = dssc_gpu.DsscGpuRunner(
 
 def test_only_cast():
     kernel_runner.load_data(raw_data)
-    kernel_runner.only_cast()
+    kernel_runner.correct(DsscCorrection.CorrectionFlags.NONE)
     assert np.allclose(
         kernel_runner.processed_data_gpu.get(), raw_data.astype(output_dtype)
     )
 
 
 def test_correct():
-    kernel_runner.load_constants(offset_map)
+    kernel_runner.load_offset_map(offset_map)
     kernel_runner.load_data(raw_data)
     kernel_runner.load_cell_table(cell_table)
-    kernel_runner.correct()
+    kernel_runner.correct(DsscCorrection.CorrectionFlags.OFFSET)
     assert np.allclose(kernel_runner.processed_data_gpu.get(), corrected_data)
 
 
 def test_correct_oob_cells():
-    kernel_runner.load_constants(offset_map)
+    kernel_runner.load_offset_map(offset_map)
     kernel_runner.load_data(raw_data)
     # here, half the cell IDs will be out of bounds
     wild_cell_table = cell_table * 2
     kernel_runner.load_cell_table(wild_cell_table)
     # should not crash
-    kernel_runner.correct()
+    kernel_runner.correct(DsscCorrection.CorrectionFlags.OFFSET)
     # should correct as much as possible
     assert np.allclose(
         kernel_runner.processed_data_gpu.get(),
@@ -74,81 +74,68 @@ def test_correct_oob_cells():
 
 def test_reshape():
     kernel_runner.processed_data_gpu.set(corrected_data)
-    assert np.allclose(kernel_runner.reshape(), corrected_data.transpose())
+    assert np.allclose(
+        kernel_runner.reshape(output_order="xyc"), corrected_data.transpose()
+    )
 
 
 def test_preview_slice():
     kernel_runner.load_data(raw_data)
     kernel_runner.processed_data_gpu.set(corrected_data)
-    preview_raw, preview_corrected = kernel_runner.compute_preview(42)
+    preview_raw, preview_corrected = kernel_runner.compute_previews(42)
     assert np.allclose(
         preview_raw,
-        raw_data[42].astype(np.float32).transpose(),
+        raw_data[42].astype(np.float32),
     )
     assert np.allclose(
         preview_corrected,
-        corrected_data[42].astype(np.float32).transpose(),
+        corrected_data[42].astype(np.float32),
     )
 
 
 def test_preview_max():
-    # can it find max intensity frame?
     # note: in case correction failed, still test this separately
     kernel_runner.load_data(raw_data)
     kernel_runner.processed_data_gpu.set(corrected_data)
-    preview_raw, preview_corrected = kernel_runner.compute_preview(-1)
-    assert np.allclose(
-        preview_raw,
-        raw_data[np.argmax(np.sum(raw_data, axis=(1, 2), dtype=np.float32))]
-        .astype(np.float32)
-        .transpose(),
-    )
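+    # index -1 requests a pixelwise maximum over memory cells (axis 0)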
+    preview_raw, preview_corrected = kernel_runner.compute_previews(-1)
+    assert np.allclose(preview_raw, np.max(raw_data, axis=0).astype(np.float32))
     assert np.allclose(
-        preview_corrected,
-        corrected_data[np.argmax(np.sum(corrected_data, axis=(1, 2), dtype=np.float32))]
-        .astype(np.float32)
-        .transpose(),
+        preview_corrected, np.max(corrected_data, axis=0).astype(np.float32)
     )
 
 
 def test_preview_mean():
     kernel_runner.load_data(raw_data)
     kernel_runner.processed_data_gpu.set(corrected_data)
-    preview_raw, preview_corrected = kernel_runner.compute_preview(-2)
+    preview_raw, preview_corrected = kernel_runner.compute_previews(-2)
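+    # nan-aware reductions: NaNs (presumably from masked pixels) should not
+    # poison the preview statistics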
+    assert np.allclose(preview_raw, np.nanmean(raw_data, axis=0, dtype=np.float32))
     assert np.allclose(
-        preview_raw, np.mean(raw_data, axis=0, dtype=np.float32).transpose()
-    )
-    assert np.allclose(
-        preview_corrected, np.mean(corrected_data, axis=0, dtype=np.float32).transpose()
+        preview_corrected, np.nanmean(corrected_data, axis=0, dtype=np.float32)
     )
 
 
 def test_preview_sum():
     kernel_runner.load_data(raw_data)
     kernel_runner.processed_data_gpu.set(corrected_data)
-    preview_raw, preview_corrected = kernel_runner.compute_preview(-3)
-    assert np.allclose(
-        preview_raw, np.sum(raw_data, axis=0, dtype=np.float32).transpose()
-    )
+    preview_raw, preview_corrected = kernel_runner.compute_previews(-3)
+    assert np.allclose(preview_raw, np.nansum(raw_data, axis=0, dtype=np.float32))
     assert np.allclose(
-        preview_corrected, np.sum(corrected_data, axis=0, dtype=np.float32).transpose()
+        preview_corrected, np.nansum(corrected_data, axis=0, dtype=np.float32)
     )
 
 
 def test_preview_std():
     kernel_runner.load_data(raw_data)
     kernel_runner.processed_data_gpu.set(corrected_data)
-    preview_raw, preview_corrected = kernel_runner.compute_preview(-4)
-    assert np.allclose(
-        preview_raw, np.std(raw_data, axis=0, dtype=np.float32).transpose()
-    )
+    preview_raw, preview_corrected = kernel_runner.compute_previews(-4)
+    assert np.allclose(preview_raw, np.nanstd(raw_data, axis=0, dtype=np.float32))
     assert np.allclose(
-        preview_corrected, np.std(corrected_data, axis=0, dtype=np.float32).transpose()
+        preview_corrected, np.nanstd(corrected_data, axis=0, dtype=np.float32)
     )
 
 
 def test_preview_valid_index():
     with pytest.raises(ValueError):
-        kernel_runner.compute_preview(-5)
+        kernel_runner.compute_previews(-5)
     with pytest.raises(ValueError):
-        kernel_runner.compute_preview(memory_cells)
+        kernel_runner.compute_previews(memory_cells)
diff --git a/src/tests/test_utils.py b/src/tests/test_utils.py
index d8e8c6ce407e70e1fb9e9386e2a44a3868051c76..91b1f280f63bb61f54e0325d94216ecfe5d90a44 100644
--- a/src/tests/test_utils.py
+++ b/src/tests/test_utils.py
@@ -1,7 +1,12 @@
-import numpy as np
+import random
+import threading
+import time
+import timeit
 
+import numpy as np
 from calng import utils
 
+
 def test_get_c_type():
     assert utils.np_dtype_to_c_type(np.float16) == "half"
     assert utils.np_dtype_to_c_type(np.float32) == "float"
@@ -16,3 +21,82 @@ def test_get_c_type():
     assert utils.np_dtype_to_c_type(np.int16) == "short"
     assert utils.np_dtype_to_c_type(np.int32) == "int"
     assert utils.np_dtype_to_c_type(np.int64) == "long"
+
+
+class TestThreadsafeCache:
+    def test_arg_key_wrap(self):
+        calls = []
+
+        @utils.threadsafe_cache
+        def fun(a, b, c=1, d=2, *args, **kwargs):
+            calls.append((a, b, c, d, args, kwargs))
+
+        # reordering kwargs /does/ matter because dicts preserve insertion order
+        # (note: functools.lru_cache doesn't sort kwargs either, reportedly for speed)
+        fun(1, 2, 3, 4, 5, six=6, seven=7)
+        fun(1, 2, 3, 4, 5, seven=7, six=6)
+        assert len(calls) == 2, "kwargs order matters"
+        calls.clear()
+
+        # reordering kw-style positional args does not matter
+        fun(1, 2, 1, 2)
+        fun(a=1, c=1, b=2, d=2)
+        assert len(calls) == 1, "reordering regular args as kws doesn't matter"
+        # and omitting default values does not matter
+        fun(b=2, a=1)
+        fun(1, 2)
+        assert len(calls) == 1, "omitting default args doesn't matter"
+
+    def test_threadsafeness(self):
+        # note: running this *without* the decorator takes far longer,
+        # presumably due to synchronization overhead
+        from_was_called = []
+
+        base_sleep = 1
+        random_sleep = 0.1
+
+        @utils.threadsafe_cache
+        def was_called(x):
+            time.sleep(random.random() * random_sleep + base_sleep)
+            from_was_called.append(x)
+
+        threads = []
+        num_threads = 1000  # per letter, so len(letters) * num_threads in total
+        letters = "abcd"
+        start_ts = timeit.default_timer()
+        for _ in range(num_threads):
+            for letter in letters:
+                thread = threading.Thread(target=was_called, args=(letter,))
+                thread.start()
+                threads.append(thread)
+        submitted_ts = timeit.default_timer()
+        print(f"Right after: {len(from_was_called)}")
+        for thread in threads:
+            thread.join()
+        stop_ts = timeit.default_timer()
+        total_time = stop_ts - start_ts
+        print(f"After join: {len(from_was_called)}")
+        print(f"Time to submit: {submitted_ts - start_ts}")
+        print(f"Wait for join: {stop_ts - submitted_ts}")
+        print(f"Total: {total_time}")
+
+        # check that the function was actually called only once per letter;
+        # functools.lru_cache would fail here, as it does not lock during
+        # computation, so concurrent first calls can each recompute
+        assert len(from_was_called) == len(
+            letters
+        ), "Caching should prevent recomputation under threading"
+
+        # check that locking was not too broad: the run should finish faster
+        # than the sequential lower bound computed below
+        reasonable_time_to_spawn_thread = 0.45 / 1000  # ~0.45 ms per thread
+        cutoff = (
+            len(letters) * base_sleep
+            + reasonable_time_to_spawn_thread * num_threads * len(letters)
+        )
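+        # with these defaults: 4 letters * 1 s of serialized unique-call sleep
+        # plus ~0.45 ms * 4000 spawns, i.e. a cutoff near 5.8 s; a fully
+        # serialized run of all 4000 calls would instead take over an hour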
+        print(f"Cutoff (sequential lower bound): {cutoff}")
+        assert (
+            total_time < cutoff
+        ), "Locking should not be so broad as to make execution sequential"
+        print(
+            f"Each thread would have slept [{base_sleep}, {base_sleep + random_sleep})"
+        )
+
+        # sanity check: the measured total must cover at least one full sleep
+        assert total_time >= base_sleep, "These tests should measure time correctly"
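+
+
+# For reference, a minimal cache in the spirit of utils.threadsafe_cache can
+# be sketched with one lock per key, so that only callers sharing arguments
+# wait for each other (illustrative only; the real decorator additionally
+# normalizes the argument key, as tested above):
+#
+#     import functools
+#     import threading
+#
+#     def threadsafe_cache_sketch(fun):
+#         cache = {}
+#         locks = {}
+#         locks_lock = threading.Lock()
+#
+#         @functools.wraps(fun)
+#         def wrapped(*args, **kwargs):
+#             key = (args, tuple(kwargs.items()))
+#             with locks_lock:
+#                 key_lock = locks.setdefault(key, threading.Lock())
+#             with key_lock:  # only callers sharing a key wait here
+#                 if key not in cache:
+#                     cache[key] = fun(*args, **kwargs)
+#             return cache[key]
+#
+#         return wrapped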