diff --git a/setup.py b/setup.py
index 5b4bb5da6e85b50e71aed81b379a07453066af6b..5555d8dc6832f0807be98fa16b4ba8cb524ad9ab 100644
--- a/setup.py
+++ b/setup.py
@@ -49,8 +49,18 @@ setup(name='calng',
           [
               Extension(
                   'calng.kernels.gotthard2_cython',
-                  ['src/calng/kernels/gotthard2_cpu.pyx']
-              )
-          ]
+                  ['src/calng/kernels/gotthard2_cpu.pyx'],
+                  extra_compile_args = ['-O3', '-march=native'],
+              ),
+              Extension(
+                  'calng.kernels.jungfrau_cython',
+                  ['src/calng/kernels/jungfrau_cpu.pyx'],
+                  extra_compile_args = ['-O3', '-march=native', '-fopenmp' ],
+                  extra_link_args=['-fopenmp'],
+              ),
+          ],
+          compiler_directives={
+              'language_level': 3
+          },
       ),
 )
diff --git a/src/calng/AgipdCorrection.py b/src/calng/AgipdCorrection.py
index d86b953ed6960619a8b17323537d34bb300d6ff0..7d2946113c9f124432bfa8b5fecc34a01cb61380 100644
--- a/src/calng/AgipdCorrection.py
+++ b/src/calng/AgipdCorrection.py
@@ -14,7 +14,7 @@ from karabo.bound import (
     VECTOR_STRING_ELEMENT,
 )
 
-from . import base_calcat, base_gpu, utils
+from . import base_calcat, base_kernel_runner, utils
 from ._version import version as deviceVersion
 from .base_correction import BaseCorrection, add_correction_step_schema, preview_schema
 
@@ -47,7 +47,7 @@ class CorrectionFlags(enum.IntFlag):
     BPMASK = 32
 
 
-class AgipdGpuRunner(base_gpu.BaseGpuRunner):
+class AgipdGpuRunner(base_kernel_runner.BaseGpuRunner):
     _kernel_source_filename = "agipd_gpu.cu"
     _corrected_axis_order = "cxy"
 
diff --git a/src/calng/DsscCorrection.py b/src/calng/DsscCorrection.py
index 21b2df287579319a0ccf01b323608086cebc1a56..c6605dcf535417f1fa415c50ee10243c4b0da4be 100644
--- a/src/calng/DsscCorrection.py
+++ b/src/calng/DsscCorrection.py
@@ -9,7 +9,7 @@ from karabo.bound import (
     VECTOR_STRING_ELEMENT,
 )
 
-from . import base_calcat, base_gpu, utils
+from . import base_calcat, base_kernel_runner, utils
 from ._version import version as deviceVersion
 from .base_correction import BaseCorrection, add_correction_step_schema
 
@@ -23,7 +23,7 @@ class DsscConstants(enum.Enum):
     Offset = enum.auto()
 
 
-class DsscGpuRunner(base_gpu.BaseGpuRunner):
+class DsscGpuRunner(base_kernel_runner.BaseGpuRunner):
     _kernel_source_filename = "dssc_gpu.cu"
     _corrected_axis_order = "cyx"
 
diff --git a/src/calng/Gotthard2Correction.py b/src/calng/Gotthard2Correction.py
index 39c6c46e229b18c123051b1bc0e7dd379d47f775..f561ec1732d390c95c60a7988797933af0b5c76b 100644
--- a/src/calng/Gotthard2Correction.py
+++ b/src/calng/Gotthard2Correction.py
@@ -24,6 +24,7 @@ from .base_correction import (
     add_correction_step_schema,
     preview_schema,
 )
+from .base_kernel_runner import BaseKernelRunner
 
 
 _pretend_pulse_table = np.arange(2720, dtype=np.uint8)
@@ -52,7 +53,7 @@ class CorrectionFlags(enum.IntFlag):
     GAIN = 4
 
 
-class Gotthard2CpuRunner:
+class Gotthard2CpuRunner(BaseKernelRunner):
     def __init__(
         self,
         pixels_x,
@@ -63,12 +64,18 @@ class Gotthard2CpuRunner:
         output_data_dtype=np.float32,
         bad_pixel_mask_value=np.nan,
     ):
-        from .kernels import gotthard2_cython  # TODO
+        super().__init__(
+            pixels_x,
+            pixels_y,
+            memory_cells,
+            constant_memory_cells,
+            input_data_dtype,
+            output_data_dtype,
+        )
+
+        from .kernels import gotthard2_cython
 
         self.correction_kernel = gotthard2_cython.correct
-        self.pixels_x = pixels_x
-        self.memory_cells = memory_cells
-        self.constant_memory_cells = constant_memory_cells
         self.input_shape = (memory_cells, pixels_x)
         self.processed_shape = self.input_shape
         # model: 2 buffers (corresponding to actual memory cells), 2720 frames
@@ -79,13 +86,10 @@ class Gotthard2CpuRunner:
         self.offset_map = np.empty(self.map_shape, dtype=np.float32)
         self.rel_gain_map = np.empty(self.map_shape, dtype=np.float32)
         self.flush_buffers()
+
         self.input_data = None  # will just point to data coming in
         self.input_gain_stage = None  # will just point to data coming in
         self.processed_data = None  # will just point to buffer we're given
-        self.preview_buffer_getters = [
-            self._get_raw_for_preview,
-            self._get_corrected_for_preview,
-        ]
 
     def _get_raw_for_preview(self):
         return self.input_data
@@ -94,7 +98,7 @@ class Gotthard2CpuRunner:
         return self.processed_data
 
     def load_data(self, image_data, input_gain_stage):
-        """Experiment: loading all three in one function as they are tied"""
+        """Experiment: loading both in one function as they are tied"""
         self.input_data = image_data.astype(np.uint16, copy=False)
         self.input_gain_stage = input_gain_stage.astype(np.uint8, copy=False)
 
@@ -122,33 +126,6 @@ class Gotthard2CpuRunner:
         self.processed_data = out
         return out
 
-    def compute_previews(self, preview_index):
-        """See BaseGpuRunner.compute_previews"""
-
-        if preview_index < -4:
-            raise ValueError(f"No statistic with code {preview_index} defined")
-        elif preview_index >= self.memory_cells:
-            raise ValueError(f"Memory cell index {preview_index} out of range")
-
-        # TODO: enum around reduction type
-        return tuple(
-            self._compute_a_preview(image_data=getter(), preview_index=preview_index)
-            for getter in self.preview_buffer_getters
-        )
-
-    def _compute_a_preview(self, image_data, preview_index):
-        if preview_index >= 0:
-            return image_data[preview_index].astype(np.float32, copy=False)
-        elif preview_index == -1:
-            return np.nanmax(image_data, axis=0).astype(np.float32, copy=False)
-        elif preview_index in (-2, -3, -4):
-            stat_fun = {
-                -2: np.nanmean,
-                -3: np.nansum,
-                -4: np.nanstd,
-            }[preview_index]
-            return stat_fun(image_data, axis=0, dtype=np.float32)
-
 
 class Gotthard2CalcatFriend(base_calcat.BaseCalcatFriend):
     _constant_enum_class = Gotthard2Constants
@@ -184,11 +161,6 @@ class Gotthard2CalcatFriend(base_calcat.BaseCalcatFriend):
             .key(f"{param_prefix}.memoryCells")
             .setNewDefaultValue(2)
             .commit(),
-
-            OVERWRITE_ELEMENT(expected)
-            .key("outputShmemBufferSize")
-            .setNewDefaultValue(2)
-            .commit(),
         )
 
         base_calcat.add_status_schema_from_enum(
@@ -236,6 +208,11 @@ class Gotthard2Correction(BaseCorrection):
             .key("preview.selectionMode")
             .setNewDefaultValue("frame")
             .commit(),
+
+            OVERWRITE_ELEMENT(expected)
+            .key("outputShmemBufferSize")
+            .setNewDefaultValue(2)
+            .commit(),
         )
 
         (
@@ -327,7 +304,6 @@ class Gotthard2Correction(BaseCorrection):
         buffer_handle, buffer_array = self._shmem_buffer.next_slot()
         self.kernel_runner.correct(self._correction_flag_enabled, out=buffer_array)
 
-
         if do_generate_preview:
             if self._correction_flag_enabled != self._correction_flag_preview:
                 self.kernel_runner.correct(self._correction_flag_preview)
diff --git a/src/calng/JungfrauCorrection.py b/src/calng/JungfrauCorrection.py
index 5aa8bb5cb22eb402ae153e32a86b8e3a18f00acb..788da429a28dfada638c3f0401e86c486374b9ce 100644
--- a/src/calng/JungfrauCorrection.py
+++ b/src/calng/JungfrauCorrection.py
@@ -11,9 +11,10 @@ from karabo.bound import (
     VECTOR_STRING_ELEMENT,
 )
 
-from . import base_calcat, base_gpu, utils
+from . import base_calcat, utils
 from ._version import version as deviceVersion
 from .base_correction import BaseCorrection, add_correction_step_schema, preview_schema
+from .base_kernel_runner import BaseGpuRunner, BaseKernelRunner
 
 
 _pretend_pulse_table = np.arange(16, dtype=np.uint8)
@@ -33,7 +34,12 @@ class CorrectionFlags(enum.IntFlag):
     BPMASK = 4
 
 
-class JungfrauGpuRunner(base_gpu.BaseGpuRunner):
+class KernelRunnerVersions(enum.Enum):
+    CPU = enum.auto()
+    GPU = enum.auto()
+
+
+class JungfrauGpuRunner(BaseGpuRunner):
     _kernel_source_filename = "jungfrau_gpu.cu"
     _corrected_axis_order = "cyx"
 
@@ -59,8 +65,8 @@ class JungfrauGpuRunner(base_gpu.BaseGpuRunner):
         )
         # TODO: avoid superclass creating cell table with wrong dtype first
         self.cell_table_gpu = cupy.empty(self.memory_cells, dtype=cupy.uint8)
-        self.input_gain_map_gpu = cupy.empty(self.input_shape, dtype=cupy.uint8)
-        self.preview_buffer_getters.append(self._get_gain_map_for_preview)
+        self.input_gain_stage_gpu = cupy.empty(self.input_shape, dtype=cupy.uint8)
+        self.preview_buffer_getters.append(self._get_gain_stage_for_preview)
         self.map_shape = (self.constant_memory_cells, self.pixels_y, self.pixels_x, 3)
         self.offset_map_gpu = cupy.zeros(self.map_shape, dtype=cupy.float32)
         self.rel_gain_map_gpu = cupy.ones(self.map_shape, dtype=cupy.float32)
@@ -95,13 +101,13 @@ class JungfrauGpuRunner(base_gpu.BaseGpuRunner):
     def _get_corrected_for_preview(self):
         return self.processed_data_gpu
 
-    def _get_gain_map_for_preview(self):
-        return self.input_gain_map_gpu
+    def _get_gain_stage_for_preview(self):
+        return self.input_gain_stage_gpu
 
-    def load_data(self, image_data, input_gain_map, cell_table):
+    def load_data(self, image_data, input_gain_stage, cell_table):
         """Experiment: loading all three in one function as they are tied"""
         self.input_data_gpu.set(image_data)
-        self.input_gain_map_gpu.set(input_gain_map)
+        self.input_gain_stage_gpu.set(input_gain_stage)
         if self.burst_mode:
             self.cell_table_gpu.set(cell_table)
 
@@ -116,7 +122,7 @@ class JungfrauGpuRunner(base_gpu.BaseGpuRunner):
             self.full_block,
             (
                 self.input_data_gpu,
-                self.input_gain_map_gpu,
+                self.input_gain_stage_gpu,
                 self.cell_table_gpu,
                 cupy.uint8(flags),
                 self.offset_map_gpu,
@@ -128,6 +134,98 @@ class JungfrauGpuRunner(base_gpu.BaseGpuRunner):
         )
 
 
+class JungfrauCpuRunner(BaseKernelRunner):
+    _corrected_axis_order = "cyx"
+
+    def __init__(
+        self,
+        pixels_x,
+        pixels_y,
+        memory_cells,
+        constant_memory_cells,
+        input_data_dtype=np.uint16,
+        output_data_dtype=np.float32,  # TODO: configurable
+        bad_pixel_mask_value=np.nan,
+    ):
+        super().__init__(
+            pixels_x,
+            pixels_y,
+            memory_cells,
+            constant_memory_cells,
+            input_data_dtype,
+            output_data_dtype,
+        )
+
+        from .kernels import jungfrau_cython
+        self.correction_kernel_single = jungfrau_cython.correct_single
+        self.correction_kernel_burst = jungfrau_cython.correct_burst
+        self.input_shape = (memory_cells, pixels_y, pixels_x)
+        self.preview_buffer_getters.append(self._get_gain_stage_for_preview)
+        self.processed_shape = self.input_shape
+        self.map_shape = (self.constant_memory_cells, self.pixels_y, self.pixels_x, 3)
+        self.offset_map = np.empty(self.map_shape, dtype=np.float32)
+        self.rel_gain_map = np.empty(self.map_shape, dtype=np.float32)
+        self.bad_pixel_map = np.empty(self.map_shape, dtype=np.uint32)
+        self.bad_pixel_mask_value = bad_pixel_mask_value
+        self.flush_buffers()
+
+        self.input_data = None  # will just point to data coming in
+        self.input_gain_stage = None  # will just point to data coming in
+        self.processed_data = None  # will just point to buffer we're given
+
+    def _get_raw_for_preview(self):
+        return self.input_data
+
+    def _get_corrected_for_preview(self):
+        return self.processed_data
+
+    def _get_gain_stage_for_preview(self):
+        return self.input_gain_stage
+
+    @property
+    def burst_mode(self):
+        return self.memory_cells > 1
+
+    def load_data(self, image_data, input_gain_stage, cell_table):
+        self.input_data = image_data.astype(np.uint16, copy=False)
+        self.input_gain_stage = input_gain_stage.astype(np.uint8, copy=False)
+        self.input_cell_table = cell_table.astype(np.uint8, copy=False)
+
+    def flush_buffers(self):
+        self.offset_map.fill(0)
+        self.rel_gain_map.fill(1)
+        self.bad_pixel_map.fill(0)
+
+    def correct(self, flags, out=None):
+        if out is None:
+            out = np.empty(self.processed_shape, dtype=np.float32)
+
+        if self.burst_mode:
+            self.correction_kernel_burst(
+                self.input_data,
+                self.input_gain_stage,
+                self.input_cell_table,
+                flags,
+                self.offset_map,
+                self.rel_gain_map,
+                self.bad_pixel_map,
+                self.bad_pixel_mask_value,
+                out,
+            )
+        else:
+            self.correction_kernel_single(
+                self.input_data,
+                self.input_gain_stage,
+                flags,
+                self.offset_map,
+                self.rel_gain_map,
+                self.bad_pixel_map,
+                self.bad_pixel_mask_value,
+                out,
+            )
+        self.processed_data = out
+
+
 class JungfrauCalcatFriend(base_calcat.BaseCalcatFriend):
     _constant_enum_class = JungfrauConstants
 
@@ -172,12 +270,6 @@ class JungfrauCalcatFriend(base_calcat.BaseCalcatFriend):
             .key(f"{param_prefix}.biasVoltage")
             .setNewDefaultValue(90)
             .commit(),
-
-            # JUNGFRAU data is small, can fit plenty of trains in here
-            OVERWRITE_ELEMENT(expected)
-            .key("outputShmemBufferSize")
-            .setNewDefaultValue(2)
-            .commit(),
         )
 
         # add extra parameters
@@ -255,7 +347,7 @@ class JungfrauCorrection(BaseCorrection):
         ("relGain", CorrectionFlags.REL_GAIN),
         ("badPixels", CorrectionFlags.BPMASK),
     )
-    _kernel_runner_class = JungfrauGpuRunner
+    _kernel_runner_class = None  # note: set in __init__ based on config
     _calcat_friend_class = JungfrauCalcatFriend
     _constant_enum_class = JungfrauConstants
     _managed_keys = BaseCorrection._managed_keys.copy()
@@ -264,7 +356,6 @@ class JungfrauCorrection(BaseCorrection):
 
     @staticmethod
     def expectedParameters(expected):
-        super(JungfrauCorrection, JungfrauCorrection).expectedParameters(expected)
         (
             OVERWRITE_ELEMENT(expected)
             .key("dataFormat.pixelsX")
@@ -285,8 +376,26 @@ class JungfrauCorrection(BaseCorrection):
             .key("preview.selectionMode")
             .setNewDefaultValue("frame")
             .commit(),
+
+            # JUNGFRAU data is small, can fit plenty of trains in here
+            OVERWRITE_ELEMENT(expected)
+            .key("outputShmemBufferSize")
+            .setNewDefaultValue(2)
+            .commit(),
         )
 
+        # support both CPU and GPU kernels
+        (
+            STRING_ELEMENT(expected)
+            .key("kernelType")
+            .assignmentOptional()
+            .defaultValue(KernelRunnerVersions.CPU.name)
+            .options(",".join(kernel_type.name for kernel_type in KernelRunnerVersions))
+            .reconfigurable()
+            .commit(),
+        )
+        JungfrauCorrection._managed_keys.add("kernelType")
+
         (
             OUTPUT_CHANNEL(expected)
             .key("preview.outputGainMap")
@@ -320,6 +429,11 @@ class JungfrauCorrection(BaseCorrection):
 
     def __init__(self, config):
         super().__init__(config)
+        kernel_type = KernelRunnerVersions[config["kernelType"]]
+        if kernel_type is KernelRunnerVersions.CPU:
+            self._kernel_runner_class = JungfrauCpuRunner
+        else:
+            self._kernel_runner_class = JungfrauGpuRunner
         # TODO: gain mode as constant parameter and / or device configuration
 
         try:
@@ -406,20 +520,34 @@ class JungfrauCorrection(BaseCorrection):
             constant_data = np.transpose(constant_data, (2, 1, 0, 3))
         else:
             constant_data = np.transpose(constant_data, (2, 0, 1, 3))
+
+        kernel_type = KernelRunnerVersions[self.get("kernelType")]
         if constant is JungfrauConstants.Offset10Hz:
-            self.kernel_runner.offset_map_gpu.set(constant_data.astype(np.float32))
+            if kernel_type is KernelRunnerVersions.CPU:
+                self.kernel_runner.offset_map[:] = constant_data.astype(np.float32)
+            else:
+                self.kernel_runner.offset_map_gpu.set(constant_data.astype(np.float32))
             if not self.get("corrections.offset.available"):
                 self.set("corrections.offset.available", True)
         elif constant is JungfrauConstants.RelativeGain10Hz:
-            self.kernel_runner.rel_gain_map_gpu.set(constant_data.astype(np.float32))
+            if kernel_type is KernelRunnerVersions.CPU:
+                self.kernel_runner.rel_gain_map[:] = constant_data.astype(np.float32)
+            else:
+                self.kernel_runner.rel_gain_map_gpu.set(
+                    constant_data.astype(np.float32)
+                )
             if not self.get("corrections.relGain.available"):
                 self.set("corrections.relGain.available", True)
         elif constant in (
-                JungfrauConstants.BadPixelsDark10Hz, JungfrauConstants.BadPixelsFF10Hz
+            JungfrauConstants.BadPixelsDark10Hz,
+            JungfrauConstants.BadPixelsFF10Hz,
         ):
-            self.kernel_runner.bad_pixel_map_gpu |= cupy.asarray(constant_data)
+            if kernel_type is KernelRunnerVersions.CPU:
+                self.kernel_runner.bad_pixel_map |= constant_data
+            else:
+                self.kernel_runner.bad_pixel_map_gpu |= cupy.asarray(constant_data)
             if not self.get("corrections.badPixels.available"):
                 self.set("corrections.badPixels.available", True)
 
         self._update_correction_flags()
-        self.log_status_info(f"Done loading {constant.name} to GPU")
+        self.log_status_info(f"Done loading {constant.name} to runner")
diff --git a/src/calng/LpdCorrection.py b/src/calng/LpdCorrection.py
index c159132cb659bc5031a26bdca66f81eab0b1ad65..2f8321302f34df976bd85c36dfee29fc79b6fdc7 100644
--- a/src/calng/LpdCorrection.py
+++ b/src/calng/LpdCorrection.py
@@ -11,7 +11,7 @@ from karabo.bound import (
     VECTOR_STRING_ELEMENT,
 )
 
-from . import base_gpu, base_calcat, utils
+from . import base_kernel_runner, base_calcat, utils
 from ._version import version as deviceVersion
 from .base_correction import BaseCorrection, add_correction_step_schema, preview_schema
 
@@ -34,7 +34,7 @@ class CorrectionFlags(enum.IntFlag):
     BPMASK = 16
 
 
-class LpdGpuRunner(base_gpu.BaseGpuRunner):
+class LpdGpuRunner(base_kernel_runner.BaseGpuRunner):
     _kernel_source_filename = "lpd_gpu.cu"
     _corrected_axis_order = "cxy"
 
diff --git a/src/calng/base_kernel_runner.py b/src/calng/base_kernel_runner.py
index 38127f06549fefcd1bf684e86c5f742c249b00b1..1d0d81233725b25ae1cfe7acf89c5d1086b0afb7 100644
--- a/src/calng/base_kernel_runner.py
+++ b/src/calng/base_kernel_runner.py
@@ -43,7 +43,7 @@ class BaseKernelRunner:
         Detector-specific subclass must define this method. It should assume that image
         data, cell table, and other data (including constants) has already been loaded.
         It should probably run some GPU kernel and output should go into
-        self.processed_data_gpu.
+        self.processed_data(_gpu).
 
         Keep in mind that user only gets output from compute_preview or reshape
         (either of these should come after correct).
@@ -62,7 +62,7 @@ class BaseKernelRunner:
         raise NotImplementedError()
 
     def _get_corrected_for_preview(self):
-        """Should return view of self.processed_data_gpu with shape (cell, x/y, x/y)"""
+        """Should return view of self.processed_data(_gpu) with shape (cell, x/y, x/y)"""
         raise NotImplementedError()
 
     def flush_buffers(self):
@@ -99,21 +99,37 @@ class BaseKernelRunner:
         here for now (and can differ between AGIPD and DSSC)"""
         if preview_index >= 0:
             # TODO: reuse pinned buffers for this
-            return image_data[preview_index].astype(np.float32).get()
+            return image_data[preview_index].astype(np.float32)
         elif preview_index == -1:
             # TODO: confirm that max is pixel and not integrated intensity
             # separate from next case because dtype not applicable here
-            return cupy.nanmax(image_data, axis=0).astype(cupy.float32).get()
+            return np.nanmax(image_data, axis=0).astype(np.float32)
         elif preview_index in (-2, -3, -4):
             stat_fun = {
-                -2: cupy.nanmean,
-                -3: cupy.nansum,
-                -4: cupy.nanstd,
+                -2: np.nanmean,
+                -3: np.nansum,
+                -4: np.nanstd,
             }[preview_index]
-            return stat_fun(image_data, axis=0, dtype=cupy.float32).get()
+            return stat_fun(image_data, axis=0, dtype=np.float32)
 
+    def reshape(self, output_order, out=None):
+        """Move axes to desired output order"""
+        # TODO: avoid copy
+        if output_order == self._corrected_axis_order:
+            self.reshaped_data = self.processed_data
+        else:
+            self.reshaped_data = np.transpose(
+                self.processed_data,
+                utils.transpose_order(self._corrected_axis_order, output_order),
+            )
 
-class BaseGpuRunner(base_kernel_runner):
+        if out is None:
+            return self.reshaped_data
+        else:
+            out[:] = self.reshaped_data
+
+
+class BaseGpuRunner(BaseKernelRunner):
     """Class to handle GPU buffers and execution of CUDA kernels on image data
 
     All GPU buffers are kept within this class and it is intentionally very stateful.
@@ -214,3 +230,6 @@ class BaseGpuRunner(base_kernel_runner):
             utils.ceil_div(a_length, block_length)
             for (a_length, block_length) in zip(self.processed_shape, full_block)
         )
+
+    def _compute_a_preview(self, image_data, preview_index):
+        return super()._compute_a_preview(image_data, preview_index).get()
diff --git a/src/calng/kernels/jungfrau_cpu.pyx b/src/calng/kernels/jungfrau_cpu.pyx
new file mode 100644
index 0000000000000000000000000000000000000000..02d92537c8d214bd30103bf29ee7acaba734db4f
--- /dev/null
+++ b/src/calng/kernels/jungfrau_cpu.pyx
@@ -0,0 +1,87 @@
+# cython: boundscheck=False
+# cython: cdivision=True
+# cython: wrapararound=False
+
+cdef unsigned char NONE = 0
+cdef unsigned char OFFSET = 1
+cdef unsigned char REL_GAIN = 2
+cdef unsigned char BPMASK = 4
+
+from cython.parallel import prange
+
+
+def correct_burst(
+    unsigned short[:, :, :] image_data,
+    unsigned char[:, :, :] gain_stage,
+    unsigned char[:] cell_table,
+    unsigned char flags,
+    float[:, :, :, :] offset_map,
+    float[:, :, :, :] relgain_map,
+    unsigned[:, :, :, :] badpixel_mask,
+    float badpixel_fill_value,
+    float[:, :, :] output,
+):
+    cdef int input_cell, map_cell, x, y
+    cdef unsigned char gain
+    cdef float corrected
+    for input_cell in prange(image_data.shape[0], nogil=True):
+        map_cell = cell_table[input_cell]
+        if map_cell >= offset_map.shape[0]:
+            for y in range(image_data.shape[1]):
+                for x in range(image_data.shape[2]):
+                    output[input_cell, y, x] = <float>image_data[input_cell, y, x]
+            continue
+        for y in range(image_data.shape[1]):
+            for x in range(image_data.shape[2]):
+                corrected = image_data[input_cell, y, x]
+                gain = gain_stage[input_cell, y, x]
+                # legal values: 0, 1, or 3
+                if gain == 2:
+                    corrected = badpixel_fill_value
+                else:
+                    if gain == 3:
+                        gain = 2
+
+                    if (flags & BPMASK) and badpixel_mask[map_cell, y, x, gain] != 0:
+                        corrected = badpixel_fill_value
+                    else:
+                        if (flags & OFFSET):
+                            corrected = corrected - offset_map[map_cell, y, x, gain]
+                        if (flags & REL_GAIN):
+                            corrected = corrected / relgain_map[map_cell, y, x, gain]
+                output[input_cell, y, x] = corrected
+
+
+def correct_single(
+    unsigned short[:, :, :] image_data,
+    unsigned char[:, :, :] gain_stage,
+    unsigned char flags,
+    float[:, :, :] offset_map,
+    float[:, :, :] relgain_map,
+    unsigned[:, :, :] badpixel_mask,
+    float badpixel_fill_value,
+    float[:, :, :] output,
+):
+    # ignore "cell table", constant pretends cell 0
+    cdef int x, y
+    cdef unsigned char gain
+    cdef float corrected
+    for y in range(image_data.shape[1]):
+        for x in range(image_data.shape[2]):
+            corrected = image_data[0, y, x]
+            gain = gain_stage[0, y, x]
+            # legal values: 0, 1, or 3
+            if gain == 2:
+                corrected = badpixel_fill_value
+            else:
+                if gain == 3:
+                    gain = 2
+
+                if (flags & BPMASK) and badpixel_mask[y, x, gain] != 0:
+                    corrected = badpixel_fill_value
+                else:
+                    if (flags & OFFSET):
+                        corrected = corrected - offset_map[y, x, gain]
+                    if (flags & REL_GAIN):
+                        corrected = corrected / relgain_map[y, x, gain]
+            output[0, y, x] = corrected