From df209b62dc4f90679015c80a2cf7bf22e7497db8 Mon Sep 17 00:00:00 2001
From: David Hammer <dhammer@mailbox.org>
Date: Fri, 3 Jun 2022 12:36:27 +0200
Subject: [PATCH] JF: Add CPU runner version

Still to do: speed up preview generation; single-threaded numpy reduction
functions are a bit of an issue.
---
 setup.py                           |  16 ++-
 src/calng/AgipdCorrection.py       |   4 +-
 src/calng/DsscCorrection.py        |   4 +-
 src/calng/Gotthard2Correction.py   |  62 ++++-------
 src/calng/JungfrauCorrection.py    | 172 +++++++++++++++++++++++++----
 src/calng/LpdCorrection.py         |   4 +-
 src/calng/base_kernel_runner.py    |  37 +++++--
 src/calng/kernels/jungfrau_cpu.pyx |  87 +++++++++++++++
 8 files changed, 303 insertions(+), 83 deletions(-)
 create mode 100644 src/calng/kernels/jungfrau_cpu.pyx

diff --git a/setup.py b/setup.py
index 5b4bb5da..5555d8dc 100644
--- a/setup.py
+++ b/setup.py
@@ -49,8 +49,18 @@ setup(name='calng',
           [
               Extension(
                   'calng.kernels.gotthard2_cython',
-                  ['src/calng/kernels/gotthard2_cpu.pyx']
-              )
-          ]
+                  ['src/calng/kernels/gotthard2_cpu.pyx'],
+                  extra_compile_args = ['-O3', '-march=native'],
+              ),
+              Extension(
+                  'calng.kernels.jungfrau_cython',
+                  ['src/calng/kernels/jungfrau_cpu.pyx'],
+                  extra_compile_args = ['-O3', '-march=native', '-fopenmp' ],
+                  extra_link_args=['-fopenmp'],
+              ),
+          ],
+          compiler_directives={
+              'language_level': 3
+          },
       ),
 )
diff --git a/src/calng/AgipdCorrection.py b/src/calng/AgipdCorrection.py
index d86b953e..7d294611 100644
--- a/src/calng/AgipdCorrection.py
+++ b/src/calng/AgipdCorrection.py
@@ -14,7 +14,7 @@ from karabo.bound import (
     VECTOR_STRING_ELEMENT,
 )
 
-from . import base_calcat, base_gpu, utils
+from . import base_calcat, base_kernel_runner, utils
 from ._version import version as deviceVersion
 from .base_correction import BaseCorrection, add_correction_step_schema, preview_schema
 
@@ -47,7 +47,7 @@ class CorrectionFlags(enum.IntFlag):
     BPMASK = 32
 
 
-class AgipdGpuRunner(base_gpu.BaseGpuRunner):
+class AgipdGpuRunner(base_kernel_runner.BaseGpuRunner):
     _kernel_source_filename = "agipd_gpu.cu"
     _corrected_axis_order = "cxy"
 
diff --git a/src/calng/DsscCorrection.py b/src/calng/DsscCorrection.py
index 21b2df28..c6605dcf 100644
--- a/src/calng/DsscCorrection.py
+++ b/src/calng/DsscCorrection.py
@@ -9,7 +9,7 @@ from karabo.bound import (
     VECTOR_STRING_ELEMENT,
 )
 
-from . import base_calcat, base_gpu, utils
+from . import base_calcat, base_kernel_runner, utils
 from ._version import version as deviceVersion
 from .base_correction import BaseCorrection, add_correction_step_schema
 
@@ -23,7 +23,7 @@ class DsscConstants(enum.Enum):
     Offset = enum.auto()
 
 
-class DsscGpuRunner(base_gpu.BaseGpuRunner):
+class DsscGpuRunner(base_kernel_runner.BaseGpuRunner):
     _kernel_source_filename = "dssc_gpu.cu"
     _corrected_axis_order = "cyx"
 
diff --git a/src/calng/Gotthard2Correction.py b/src/calng/Gotthard2Correction.py
index 39c6c46e..f561ec17 100644
--- a/src/calng/Gotthard2Correction.py
+++ b/src/calng/Gotthard2Correction.py
@@ -24,6 +24,7 @@ from .base_correction import (
     add_correction_step_schema,
     preview_schema,
 )
+from .base_kernel_runner import BaseKernelRunner
 
 
 _pretend_pulse_table = np.arange(2720, dtype=np.uint8)
@@ -52,7 +53,7 @@ class CorrectionFlags(enum.IntFlag):
     GAIN = 4
 
 
-class Gotthard2CpuRunner:
+class Gotthard2CpuRunner(BaseKernelRunner):
     def __init__(
         self,
         pixels_x,
@@ -63,12 +64,18 @@ class Gotthard2CpuRunner:
         output_data_dtype=np.float32,
         bad_pixel_mask_value=np.nan,
     ):
-        from .kernels import gotthard2_cython  # TODO
+        super().__init__(
+            pixels_x,
+            pixels_y,
+            memory_cells,
+            constant_memory_cells,
+            input_data_dtype,
+            output_data_dtype,
+        )
+
+        from .kernels import gotthard2_cython
 
         self.correction_kernel = gotthard2_cython.correct
-        self.pixels_x = pixels_x
-        self.memory_cells = memory_cells
-        self.constant_memory_cells = constant_memory_cells
         self.input_shape = (memory_cells, pixels_x)
         self.processed_shape = self.input_shape
         # model: 2 buffers (corresponding to actual memory cells), 2720 frames
@@ -79,13 +86,10 @@ class Gotthard2CpuRunner:
         self.offset_map = np.empty(self.map_shape, dtype=np.float32)
         self.rel_gain_map = np.empty(self.map_shape, dtype=np.float32)
         self.flush_buffers()
+
         self.input_data = None  # will just point to data coming in
         self.input_gain_stage = None  # will just point to data coming in
         self.processed_data = None  # will just point to buffer we're given
-        self.preview_buffer_getters = [
-            self._get_raw_for_preview,
-            self._get_corrected_for_preview,
-        ]
 
     def _get_raw_for_preview(self):
         return self.input_data
@@ -94,7 +98,7 @@ class Gotthard2CpuRunner:
         return self.processed_data
 
     def load_data(self, image_data, input_gain_stage):
-        """Experiment: loading all three in one function as they are tied"""
+        """Experiment: loading both in one function as they are tied"""
         self.input_data = image_data.astype(np.uint16, copy=False)
         self.input_gain_stage = input_gain_stage.astype(np.uint8, copy=False)
 
@@ -122,33 +126,6 @@ class Gotthard2CpuRunner:
         self.processed_data = out
         return out
 
-    def compute_previews(self, preview_index):
-        """See BaseGpuRunner.compute_previews"""
-
-        if preview_index < -4:
-            raise ValueError(f"No statistic with code {preview_index} defined")
-        elif preview_index >= self.memory_cells:
-            raise ValueError(f"Memory cell index {preview_index} out of range")
-
-        # TODO: enum around reduction type
-        return tuple(
-            self._compute_a_preview(image_data=getter(), preview_index=preview_index)
-            for getter in self.preview_buffer_getters
-        )
-
-    def _compute_a_preview(self, image_data, preview_index):
-        if preview_index >= 0:
-            return image_data[preview_index].astype(np.float32, copy=False)
-        elif preview_index == -1:
-            return np.nanmax(image_data, axis=0).astype(np.float32, copy=False)
-        elif preview_index in (-2, -3, -4):
-            stat_fun = {
-                -2: np.nanmean,
-                -3: np.nansum,
-                -4: np.nanstd,
-            }[preview_index]
-            return stat_fun(image_data, axis=0, dtype=np.float32)
-
 
 class Gotthard2CalcatFriend(base_calcat.BaseCalcatFriend):
     _constant_enum_class = Gotthard2Constants
@@ -184,11 +161,6 @@ class Gotthard2CalcatFriend(base_calcat.BaseCalcatFriend):
             .key(f"{param_prefix}.memoryCells")
             .setNewDefaultValue(2)
             .commit(),
-
-            OVERWRITE_ELEMENT(expected)
-            .key("outputShmemBufferSize")
-            .setNewDefaultValue(2)
-            .commit(),
         )
 
         base_calcat.add_status_schema_from_enum(
@@ -236,6 +208,11 @@ class Gotthard2Correction(BaseCorrection):
             .key("preview.selectionMode")
             .setNewDefaultValue("frame")
             .commit(),
+
+            OVERWRITE_ELEMENT(expected)
+            .key("outputShmemBufferSize")
+            .setNewDefaultValue(2)
+            .commit(),
         )
 
         (
@@ -327,7 +304,6 @@ class Gotthard2Correction(BaseCorrection):
         buffer_handle, buffer_array = self._shmem_buffer.next_slot()
         self.kernel_runner.correct(self._correction_flag_enabled, out=buffer_array)
 
-
         if do_generate_preview:
             if self._correction_flag_enabled != self._correction_flag_preview:
                 self.kernel_runner.correct(self._correction_flag_preview)
diff --git a/src/calng/JungfrauCorrection.py b/src/calng/JungfrauCorrection.py
index 5aa8bb5c..788da429 100644
--- a/src/calng/JungfrauCorrection.py
+++ b/src/calng/JungfrauCorrection.py
@@ -11,9 +11,10 @@ from karabo.bound import (
     VECTOR_STRING_ELEMENT,
 )
 
-from . import base_calcat, base_gpu, utils
+from . import base_calcat, utils
 from ._version import version as deviceVersion
 from .base_correction import BaseCorrection, add_correction_step_schema, preview_schema
+from .base_kernel_runner import BaseGpuRunner, BaseKernelRunner
 
 
 _pretend_pulse_table = np.arange(16, dtype=np.uint8)
@@ -33,7 +34,12 @@ class CorrectionFlags(enum.IntFlag):
     BPMASK = 4
 
 
-class JungfrauGpuRunner(base_gpu.BaseGpuRunner):
+class KernelRunnerVersions(enum.Enum):
+    CPU = enum.auto()
+    GPU = enum.auto()
+
+
+class JungfrauGpuRunner(BaseGpuRunner):
     _kernel_source_filename = "jungfrau_gpu.cu"
     _corrected_axis_order = "cyx"
 
@@ -59,8 +65,8 @@ class JungfrauGpuRunner(base_gpu.BaseGpuRunner):
         )
         # TODO: avoid superclass creating cell table with wrong dtype first
         self.cell_table_gpu = cupy.empty(self.memory_cells, dtype=cupy.uint8)
-        self.input_gain_map_gpu = cupy.empty(self.input_shape, dtype=cupy.uint8)
-        self.preview_buffer_getters.append(self._get_gain_map_for_preview)
+        self.input_gain_stage_gpu = cupy.empty(self.input_shape, dtype=cupy.uint8)
+        self.preview_buffer_getters.append(self._get_gain_stage_for_preview)
         self.map_shape = (self.constant_memory_cells, self.pixels_y, self.pixels_x, 3)
         self.offset_map_gpu = cupy.zeros(self.map_shape, dtype=cupy.float32)
         self.rel_gain_map_gpu = cupy.ones(self.map_shape, dtype=cupy.float32)
@@ -95,13 +101,13 @@ class JungfrauGpuRunner(base_gpu.BaseGpuRunner):
     def _get_corrected_for_preview(self):
         return self.processed_data_gpu
 
-    def _get_gain_map_for_preview(self):
-        return self.input_gain_map_gpu
+    def _get_gain_stage_for_preview(self):
+        return self.input_gain_stage_gpu
 
-    def load_data(self, image_data, input_gain_map, cell_table):
+    def load_data(self, image_data, input_gain_stage, cell_table):
         """Experiment: loading all three in one function as they are tied"""
         self.input_data_gpu.set(image_data)
-        self.input_gain_map_gpu.set(input_gain_map)
+        self.input_gain_stage_gpu.set(input_gain_stage)
         if self.burst_mode:
             self.cell_table_gpu.set(cell_table)
 
@@ -116,7 +122,7 @@ class JungfrauGpuRunner(base_gpu.BaseGpuRunner):
             self.full_block,
             (
                 self.input_data_gpu,
-                self.input_gain_map_gpu,
+                self.input_gain_stage_gpu,
                 self.cell_table_gpu,
                 cupy.uint8(flags),
                 self.offset_map_gpu,
@@ -128,6 +134,98 @@ class JungfrauGpuRunner(base_gpu.BaseGpuRunner):
         )
 
 
+class JungfrauCpuRunner(BaseKernelRunner):
+    _corrected_axis_order = "cyx"
+
+    def __init__(
+        self,
+        pixels_x,
+        pixels_y,
+        memory_cells,
+        constant_memory_cells,
+        input_data_dtype=np.uint16,
+        output_data_dtype=np.float32,  # TODO: configurable
+        bad_pixel_mask_value=np.nan,
+    ):
+        super().__init__(
+            pixels_x,
+            pixels_y,
+            memory_cells,
+            constant_memory_cells,
+            input_data_dtype,
+            output_data_dtype,
+        )
+
+        from .kernels import jungfrau_cython
+        self.correction_kernel_single = jungfrau_cython.correct_single
+        self.correction_kernel_burst = jungfrau_cython.correct_burst
+        self.input_shape = (memory_cells, pixels_y, pixels_x)
+        self.preview_buffer_getters.append(self._get_gain_stage_for_preview)
+        self.processed_shape = self.input_shape
+        self.map_shape = (self.constant_memory_cells, self.pixels_y, self.pixels_x, 3)
+        self.offset_map = np.empty(self.map_shape, dtype=np.float32)
+        self.rel_gain_map = np.empty(self.map_shape, dtype=np.float32)
+        self.bad_pixel_map = np.empty(self.map_shape, dtype=np.uint32)
+        self.bad_pixel_mask_value = bad_pixel_mask_value
+        self.flush_buffers()
+
+        self.input_data = None  # will just point to data coming in
+        self.input_gain_stage = None  # will just point to data coming in
+        self.processed_data = None  # will just point to buffer we're given
+
+    def _get_raw_for_preview(self):
+        return self.input_data
+
+    def _get_corrected_for_preview(self):
+        return self.processed_data
+
+    def _get_gain_stage_for_preview(self):
+        return self.input_gain_stage
+
+    @property
+    def burst_mode(self):
+        return self.memory_cells > 1
+
+    def load_data(self, image_data, input_gain_stage, cell_table):
+        self.input_data = image_data.astype(np.uint16, copy=False)
+        self.input_gain_stage = input_gain_stage.astype(np.uint8, copy=False)
+        self.input_cell_table = cell_table.astype(np.uint8, copy=False)
+
+    def flush_buffers(self):
+        self.offset_map.fill(0)
+        self.rel_gain_map.fill(1)
+        self.bad_pixel_map.fill(0)
+
+    def correct(self, flags, out=None):
+        if out is None:
+            out = np.empty(self.processed_shape, dtype=np.float32)
+
+        if self.burst_mode:
+            self.correction_kernel_burst(
+                self.input_data,
+                self.input_gain_stage,
+                self.input_cell_table,
+                flags,
+                self.offset_map,
+                self.rel_gain_map,
+                self.bad_pixel_map,
+                self.bad_pixel_mask_value,
+                out,
+            )
+        else:
+            self.correction_kernel_single(
+                self.input_data,
+                self.input_gain_stage,
+                flags,
+                self.offset_map,
+                self.rel_gain_map,
+                self.bad_pixel_map,
+                self.bad_pixel_mask_value,
+                out,
+            )
+        self.processed_data = out
+
+
 class JungfrauCalcatFriend(base_calcat.BaseCalcatFriend):
     _constant_enum_class = JungfrauConstants
 
@@ -172,12 +270,6 @@ class JungfrauCalcatFriend(base_calcat.BaseCalcatFriend):
             .key(f"{param_prefix}.biasVoltage")
             .setNewDefaultValue(90)
             .commit(),
-
-            # JUNGFRAU data is small, can fit plenty of trains in here
-            OVERWRITE_ELEMENT(expected)
-            .key("outputShmemBufferSize")
-            .setNewDefaultValue(2)
-            .commit(),
         )
 
         # add extra parameters
@@ -255,7 +347,7 @@ class JungfrauCorrection(BaseCorrection):
         ("relGain", CorrectionFlags.REL_GAIN),
         ("badPixels", CorrectionFlags.BPMASK),
     )
-    _kernel_runner_class = JungfrauGpuRunner
+    _kernel_runner_class = None  # note: set in __init__ based on config
     _calcat_friend_class = JungfrauCalcatFriend
     _constant_enum_class = JungfrauConstants
     _managed_keys = BaseCorrection._managed_keys.copy()
@@ -264,7 +356,6 @@ class JungfrauCorrection(BaseCorrection):
 
     @staticmethod
     def expectedParameters(expected):
-        super(JungfrauCorrection, JungfrauCorrection).expectedParameters(expected)
         (
             OVERWRITE_ELEMENT(expected)
             .key("dataFormat.pixelsX")
@@ -285,8 +376,26 @@ class JungfrauCorrection(BaseCorrection):
             .key("preview.selectionMode")
             .setNewDefaultValue("frame")
             .commit(),
+
+            # JUNGFRAU data is small, can fit plenty of trains in here
+            OVERWRITE_ELEMENT(expected)
+            .key("outputShmemBufferSize")
+            .setNewDefaultValue(2)
+            .commit(),
         )
 
+        # support both CPU and GPU kernels
+        (
+            STRING_ELEMENT(expected)
+            .key("kernelType")
+            .assignmentOptional()
+            .defaultValue(KernelRunnerVersions.CPU.name)
+            .options(",".join(kernel_type.name for kernel_type in KernelRunnerVersions))
+            .reconfigurable()
+            .commit(),
+        )
+        JungfrauCorrection._managed_keys.add("kernelType")
+
         (
             OUTPUT_CHANNEL(expected)
             .key("preview.outputGainMap")
@@ -320,6 +429,11 @@ class JungfrauCorrection(BaseCorrection):
 
     def __init__(self, config):
         super().__init__(config)
+        kernel_type = KernelRunnerVersions[config["kernelType"]]
+        if kernel_type is KernelRunnerVersions.CPU:
+            self._kernel_runner_class = JungfrauCpuRunner
+        else:
+            self._kernel_runner_class = JungfrauGpuRunner
         # TODO: gain mode as constant parameter and / or device configuration
 
         try:
@@ -406,20 +520,34 @@ class JungfrauCorrection(BaseCorrection):
             constant_data = np.transpose(constant_data, (2, 1, 0, 3))
         else:
             constant_data = np.transpose(constant_data, (2, 0, 1, 3))
+
+        kernel_type = KernelRunnerVersions[self.get("kernelType")]
         if constant is JungfrauConstants.Offset10Hz:
-            self.kernel_runner.offset_map_gpu.set(constant_data.astype(np.float32))
+            if kernel_type is KernelRunnerVersions.CPU:
+                self.kernel_runner.offset_map[:] = constant_data.astype(np.float32)
+            else:
+                self.kernel_runner.offset_map_gpu.set(constant_data.astype(np.float32))
             if not self.get("corrections.offset.available"):
                 self.set("corrections.offset.available", True)
         elif constant is JungfrauConstants.RelativeGain10Hz:
-            self.kernel_runner.rel_gain_map_gpu.set(constant_data.astype(np.float32))
+            if kernel_type is KernelRunnerVersions.CPU:
+                self.kernel_runner.rel_gain_map[:] = constant_data.astype(np.float32)
+            else:
+                self.kernel_runner.rel_gain_map_gpu.set(
+                    constant_data.astype(np.float32)
+                )
             if not self.get("corrections.relGain.available"):
                 self.set("corrections.relGain.available", True)
         elif constant in (
-                JungfrauConstants.BadPixelsDark10Hz, JungfrauConstants.BadPixelsFF10Hz
+            JungfrauConstants.BadPixelsDark10Hz,
+            JungfrauConstants.BadPixelsFF10Hz,
         ):
-            self.kernel_runner.bad_pixel_map_gpu |= cupy.asarray(constant_data)
+            if kernel_type is KernelRunnerVersions.CPU:
+                self.kernel_runner.bad_pixel_map |= constant_data
+            else:
+                self.kernel_runner.bad_pixel_map_gpu |= cupy.asarray(constant_data)
             if not self.get("corrections.badPixels.available"):
                 self.set("corrections.badPixels.available", True)
 
         self._update_correction_flags()
-        self.log_status_info(f"Done loading {constant.name} to GPU")
+        self.log_status_info(f"Done loading {constant.name} to runner")
diff --git a/src/calng/LpdCorrection.py b/src/calng/LpdCorrection.py
index c159132c..2f832130 100644
--- a/src/calng/LpdCorrection.py
+++ b/src/calng/LpdCorrection.py
@@ -11,7 +11,7 @@ from karabo.bound import (
     VECTOR_STRING_ELEMENT,
 )
 
-from . import base_gpu, base_calcat, utils
+from . import base_kernel_runner, base_calcat, utils
 from ._version import version as deviceVersion
 from .base_correction import BaseCorrection, add_correction_step_schema, preview_schema
 
@@ -34,7 +34,7 @@ class CorrectionFlags(enum.IntFlag):
     BPMASK = 16
 
 
-class LpdGpuRunner(base_gpu.BaseGpuRunner):
+class LpdGpuRunner(base_kernel_runner.BaseGpuRunner):
     _kernel_source_filename = "lpd_gpu.cu"
     _corrected_axis_order = "cxy"
 
diff --git a/src/calng/base_kernel_runner.py b/src/calng/base_kernel_runner.py
index 38127f06..1d0d8123 100644
--- a/src/calng/base_kernel_runner.py
+++ b/src/calng/base_kernel_runner.py
@@ -43,7 +43,7 @@ class BaseKernelRunner:
         Detector-specific subclass must define this method. It should assume that image
         data, cell table, and other data (including constants) has already been loaded.
         It should probably run some GPU kernel and output should go into
-        self.processed_data_gpu.
+        self.processed_data(_gpu).
 
         Keep in mind that user only gets output from compute_preview or reshape
         (either of these should come after correct).
@@ -62,7 +62,7 @@ class BaseKernelRunner:
         raise NotImplementedError()
 
     def _get_corrected_for_preview(self):
-        """Should return view of self.processed_data_gpu with shape (cell, x/y, x/y)"""
+        """Should return view of self.processed_data(_gpu) with shape (cell, x/y, x/y)"""
         raise NotImplementedError()
 
     def flush_buffers(self):
@@ -99,21 +99,37 @@ class BaseKernelRunner:
         here for now (and can differ between AGIPD and DSSC)"""
         if preview_index >= 0:
             # TODO: reuse pinned buffers for this
-            return image_data[preview_index].astype(np.float32).get()
+            return image_data[preview_index].astype(np.float32)
         elif preview_index == -1:
             # TODO: confirm that max is pixel and not integrated intensity
             # separate from next case because dtype not applicable here
-            return cupy.nanmax(image_data, axis=0).astype(cupy.float32).get()
+            return np.nanmax(image_data, axis=0).astype(np.float32)
         elif preview_index in (-2, -3, -4):
             stat_fun = {
-                -2: cupy.nanmean,
-                -3: cupy.nansum,
-                -4: cupy.nanstd,
+                -2: np.nanmean,
+                -3: np.nansum,
+                -4: np.nanstd,
             }[preview_index]
-            return stat_fun(image_data, axis=0, dtype=cupy.float32).get()
+            return stat_fun(image_data, axis=0, dtype=np.float32)
 
+    def reshape(self, output_order, out=None):
+        """Move axes to desired output order"""
+        # TODO: avoid copy
+        if output_order == self._corrected_axis_order:
+            self.reshaped_data = self.processed_data
+        else:
+            self.reshaped_data = np.transpose(
+                self.processed_data,
+                utils.transpose_order(self._corrected_axis_order, output_order),
+            )
 
-class BaseGpuRunner(base_kernel_runner):
+        if out is None:
+            return self.reshaped_data
+        else:
+            out[:] = self.reshaped_data
+
+
+class BaseGpuRunner(BaseKernelRunner):
     """Class to handle GPU buffers and execution of CUDA kernels on image data
 
     All GPU buffers are kept within this class and it is intentionally very stateful.
@@ -214,3 +230,6 @@ class BaseGpuRunner(base_kernel_runner):
             utils.ceil_div(a_length, block_length)
             for (a_length, block_length) in zip(self.processed_shape, full_block)
         )
+
+    def _compute_a_preview(self, image_data, preview_index):
+        return super()._compute_a_preview(image_data, preview_index).get()
diff --git a/src/calng/kernels/jungfrau_cpu.pyx b/src/calng/kernels/jungfrau_cpu.pyx
new file mode 100644
index 00000000..02d92537
--- /dev/null
+++ b/src/calng/kernels/jungfrau_cpu.pyx
@@ -0,0 +1,87 @@
+# cython: boundscheck=False
+# cython: cdivision=True
+# cython: wrapararound=False
+
+cdef unsigned char NONE = 0
+cdef unsigned char OFFSET = 1
+cdef unsigned char REL_GAIN = 2
+cdef unsigned char BPMASK = 4
+
+from cython.parallel import prange
+
+
+def correct_burst(
+    unsigned short[:, :, :] image_data,
+    unsigned char[:, :, :] gain_stage,
+    unsigned char[:] cell_table,
+    unsigned char flags,
+    float[:, :, :, :] offset_map,
+    float[:, :, :, :] relgain_map,
+    unsigned[:, :, :, :] badpixel_mask,
+    float badpixel_fill_value,
+    float[:, :, :] output,
+):
+    cdef int input_cell, map_cell, x, y
+    cdef unsigned char gain
+    cdef float corrected
+    for input_cell in prange(image_data.shape[0], nogil=True):
+        map_cell = cell_table[input_cell]
+        if map_cell >= offset_map.shape[0]:
+            for y in range(image_data.shape[1]):
+                for x in range(image_data.shape[2]):
+                    output[input_cell, y, x] = <float>image_data[input_cell, y, x]
+            continue
+        for y in range(image_data.shape[1]):
+            for x in range(image_data.shape[2]):
+                corrected = image_data[input_cell, y, x]
+                gain = gain_stage[input_cell, y, x]
+                # legal values: 0, 1, or 3
+                if gain == 2:
+                    corrected = badpixel_fill_value
+                else:
+                    if gain == 3:
+                        gain = 2
+
+                    if (flags & BPMASK) and badpixel_mask[map_cell, y, x, gain] != 0:
+                        corrected = badpixel_fill_value
+                    else:
+                        if (flags & OFFSET):
+                            corrected = corrected - offset_map[map_cell, y, x, gain]
+                        if (flags & REL_GAIN):
+                            corrected = corrected / relgain_map[map_cell, y, x, gain]
+                output[input_cell, y, x] = corrected
+
+
+def correct_single(
+    unsigned short[:, :, :] image_data,
+    unsigned char[:, :, :] gain_stage,
+    unsigned char flags,
+    float[:, :, :] offset_map,
+    float[:, :, :] relgain_map,
+    unsigned[:, :, :] badpixel_mask,
+    float badpixel_fill_value,
+    float[:, :, :] output,
+):
+    # ignore "cell table", constant pretends cell 0
+    cdef int x, y
+    cdef unsigned char gain
+    cdef float corrected
+    for y in range(image_data.shape[1]):
+        for x in range(image_data.shape[2]):
+            corrected = image_data[0, y, x]
+            gain = gain_stage[0, y, x]
+            # legal values: 0, 1, or 3
+            if gain == 2:
+                corrected = badpixel_fill_value
+            else:
+                if gain == 3:
+                    gain = 2
+
+                if (flags & BPMASK) and badpixel_mask[y, x, gain] != 0:
+                    corrected = badpixel_fill_value
+                else:
+                    if (flags & OFFSET):
+                        corrected = corrected - offset_map[y, x, gain]
+                    if (flags & REL_GAIN):
+                        corrected = corrected / relgain_map[y, x, gain]
+            output[0, y, x] = corrected
-- 
GitLab