JF: Add CPU runner version

Still to do: speed up preview generation; single-threaded numpy reduction functions are a bit of an issue.

JF: Add CPU runner version
Still to do: speed up preview generation; single-threaded numpy reduction functions are a bit of an issue.
df209b62 · David Hammer · d234558a · df209b62 · df209b62 · df209b62
Commit df209b62 authored 2 years ago by David Hammer
--- a/setup.py
+++ b/setup.py
@@ -49,8 +49,18 @@ setup(name='calng',
          [
              Extension(
                  'calng.kernels.gotthard2_cython',
-                  ['src/calng/kernels/gotthard2_cpu.pyx']
+                  ['src/calng/kernels/gotthard2_cpu.pyx'],
-              )
+                  extra_compile_args = ['-O3', '-march=native'],
-          ]
+              ),
+              Extension(
+                  'calng.kernels.jungfrau_cython',
+                  ['src/calng/kernels/jungfrau_cpu.pyx'],
+                  extra_compile_args = ['-O3', '-march=native', '-fopenmp' ],
+                  extra_link_args=['-fopenmp'],
+              ),
+          ],
+          compiler_directives={
+              'language_level': 3
+          },
      ),
 )
--- a/src/calng/AgipdCorrection.py
+++ b/src/calng/AgipdCorrection.py
@@ -14,7 +14,7 @@ from karabo.bound import (
    VECTOR_STRING_ELEMENT,
 )
-from . import base_calcat, base_gpu, utils
+from . import base_calcat, base_kernel_runner, utils
 from ._version import version as deviceVersion
 from .base_correction import BaseCorrection, add_correction_step_schema, preview_schema
@@ -47,7 +47,7 @@ class CorrectionFlags(enum.IntFlag):
    BPMASK = 32
-class AgipdGpuRunner(base_gpu.BaseGpuRunner):
+class AgipdGpuRunner(base_kernel_runner.BaseGpuRunner):
    _kernel_source_filename = "agipd_gpu.cu"
    _corrected_axis_order = "cxy"

--- a/src/calng/DsscCorrection.py
+++ b/src/calng/DsscCorrection.py
@@ -9,7 +9,7 @@ from karabo.bound import (
    VECTOR_STRING_ELEMENT,
 )
-from . import base_calcat, base_gpu, utils
+from . import base_calcat, base_kernel_runner, utils
 from ._version import version as deviceVersion
 from .base_correction import BaseCorrection, add_correction_step_schema
@@ -23,7 +23,7 @@ class DsscConstants(enum.Enum):
    Offset = enum.auto()
-class DsscGpuRunner(base_gpu.BaseGpuRunner):
+class DsscGpuRunner(base_kernel_runner.BaseGpuRunner):
    _kernel_source_filename = "dssc_gpu.cu"
    _corrected_axis_order = "cyx"

--- a/src/calng/Gotthard2Correction.py
+++ b/src/calng/Gotthard2Correction.py
@@ -24,6 +24,7 @@ from .base_correction import (
    add_correction_step_schema,
    preview_schema,
 )
+from .base_kernel_runner import BaseKernelRunner
 _pretend_pulse_table = np.arange(2720, dtype=np.uint8)
@@ -52,7 +53,7 @@ class CorrectionFlags(enum.IntFlag):
    GAIN = 4
-class Gotthard2CpuRunner:
+class Gotthard2CpuRunner(BaseKernelRunner):
    def __init__(
        self,
        pixels_x,
@@ -63,12 +64,18 @@ class Gotthard2CpuRunner:
        output_data_dtype=np.float32,
        bad_pixel_mask_value=np.nan,
    ):
-        from .kernels import gotthard2_cython  # TODO
+        super().__init__(
+            pixels_x,
+            pixels_y,
+            memory_cells,
+            constant_memory_cells,
+            input_data_dtype,
+            output_data_dtype,
+        )
+        from .kernels import gotthard2_cython
        self.correction_kernel = gotthard2_cython.correct
-        self.pixels_x = pixels_x
-        self.memory_cells = memory_cells
-        self.constant_memory_cells = constant_memory_cells
        self.input_shape = (memory_cells, pixels_x)
        self.processed_shape = self.input_shape
        # model: 2 buffers (corresponding to actual memory cells), 2720 frames
@@ -79,13 +86,10 @@ class Gotthard2CpuRunner:
        self.offset_map = np.empty(self.map_shape, dtype=np.float32)
        self.rel_gain_map = np.empty(self.map_shape, dtype=np.float32)
        self.flush_buffers()
        self.input_data = None  # will just point to data coming in
        self.input_gain_stage = None  # will just point to data coming in
        self.processed_data = None  # will just point to buffer we're given
-        self.preview_buffer_getters = [
-            self._get_raw_for_preview,
-            self._get_corrected_for_preview,
-        ]
    def _get_raw_for_preview(self):
        return self.input_data
@@ -94,7 +98,7 @@ class Gotthard2CpuRunner:
        return self.processed_data
    def load_data(self, image_data, input_gain_stage):
-        """Experiment: loading all three in one function as they are tied"""
+        """Experiment: loading both in one function as they are tied"""
        self.input_data = image_data.astype(np.uint16, copy=False)
        self.input_gain_stage = input_gain_stage.astype(np.uint8, copy=False)
@@ -122,33 +126,6 @@ class Gotthard2CpuRunner:
        self.processed_data = out
        return out
-    def compute_previews(self, preview_index):
-        """See BaseGpuRunner.compute_previews"""
-        if preview_index < -4:
-            raise ValueError(f"No statistic with code {preview_index} defined")
-        elif preview_index >= self.memory_cells:
-            raise ValueError(f"Memory cell index {preview_index} out of range")
-        # TODO: enum around reduction type
-        return tuple(
-            self._compute_a_preview(image_data=getter(), preview_index=preview_index)
-            for getter in self.preview_buffer_getters
-        )
-    def _compute_a_preview(self, image_data, preview_index):
-        if preview_index >= 0:
-            return image_data[preview_index].astype(np.float32, copy=False)
-        elif preview_index == -1:
-            return np.nanmax(image_data, axis=0).astype(np.float32, copy=False)
-        elif preview_index in (-2, -3, -4):
-            stat_fun = {
-                -2: np.nanmean,
-                -3: np.nansum,
-                -4: np.nanstd,
-            }[preview_index]
-            return stat_fun(image_data, axis=0, dtype=np.float32)
 class Gotthard2CalcatFriend(base_calcat.BaseCalcatFriend):
    _constant_enum_class = Gotthard2Constants
@@ -184,11 +161,6 @@ class Gotthard2CalcatFriend(base_calcat.BaseCalcatFriend):
            .key(f"{param_prefix}.memoryCells")
            .setNewDefaultValue(2)
            .commit(),
-            OVERWRITE_ELEMENT(expected)
-            .key("outputShmemBufferSize")
-            .setNewDefaultValue(2)
-            .commit(),
        )
        base_calcat.add_status_schema_from_enum(
@@ -236,6 +208,11 @@ class Gotthard2Correction(BaseCorrection):
            .key("preview.selectionMode")
            .setNewDefaultValue("frame")
            .commit(),
+            OVERWRITE_ELEMENT(expected)
+            .key("outputShmemBufferSize")
+            .setNewDefaultValue(2)
+            .commit(),
        )
        (
@@ -327,7 +304,6 @@ class Gotthard2Correction(BaseCorrection):
        buffer_handle, buffer_array = self._shmem_buffer.next_slot()
        self.kernel_runner.correct(self._correction_flag_enabled, out=buffer_array)
        if do_generate_preview:
            if self._correction_flag_enabled != self._correction_flag_preview:
                self.kernel_runner.correct(self._correction_flag_preview)

--- a/src/calng/JungfrauCorrection.py
+++ b/src/calng/JungfrauCorrection.py
@@ -11,9 +11,10 @@ from karabo.bound import (
    VECTOR_STRING_ELEMENT,
 )
-from . import base_calcat, base_gpu, utils
+from . import base_calcat, utils
 from ._version import version as deviceVersion
 from .base_correction import BaseCorrection, add_correction_step_schema, preview_schema
+from .base_kernel_runner import BaseGpuRunner, BaseKernelRunner
 _pretend_pulse_table = np.arange(16, dtype=np.uint8)
@@ -33,7 +34,12 @@ class CorrectionFlags(enum.IntFlag):
    BPMASK = 4
-class JungfrauGpuRunner(base_gpu.BaseGpuRunner):
+class KernelRunnerVersions(enum.Enum):
+    CPU = enum.auto()
+    GPU = enum.auto()
+class JungfrauGpuRunner(BaseGpuRunner):
    _kernel_source_filename = "jungfrau_gpu.cu"
    _corrected_axis_order = "cyx"
@@ -59,8 +65,8 @@ class JungfrauGpuRunner(base_gpu.BaseGpuRunner):
        )
        # TODO: avoid superclass creating cell table with wrong dtype first
        self.cell_table_gpu = cupy.empty(self.memory_cells, dtype=cupy.uint8)
-        self.input_gain_map_gpu = cupy.empty(self.input_shape, dtype=cupy.uint8)
+        self.input_gain_stage_gpu = cupy.empty(self.input_shape, dtype=cupy.uint8)
-        self.preview_buffer_getters.append(self._get_gain_map_for_preview)
+        self.preview_buffer_getters.append(self._get_gain_stage_for_preview)
        self.map_shape = (self.constant_memory_cells, self.pixels_y, self.pixels_x, 3)
        self.offset_map_gpu = cupy.zeros(self.map_shape, dtype=cupy.float32)
        self.rel_gain_map_gpu = cupy.ones(self.map_shape, dtype=cupy.float32)
@@ -95,13 +101,13 @@ class JungfrauGpuRunner(base_gpu.BaseGpuRunner):
    def _get_corrected_for_preview(self):
        return self.processed_data_gpu
-    def _get_gain_map_for_preview(self):
+    def _get_gain_stage_for_preview(self):
-        return self.input_gain_map_gpu
+        return self.input_gain_stage_gpu
-    def load_data(self, image_data, input_gain_map, cell_table):
+    def load_data(self, image_data, input_gain_stage, cell_table):
        """Experiment: loading all three in one function as they are tied"""
        self.input_data_gpu.set(image_data)
-        self.input_gain_map_gpu.set(input_gain_map)
+        self.input_gain_stage_gpu.set(input_gain_stage)
        if self.burst_mode:
            self.cell_table_gpu.set(cell_table)
@@ -116,7 +122,7 @@ class JungfrauGpuRunner(base_gpu.BaseGpuRunner):
            self.full_block,
            (
                self.input_data_gpu,
-                self.input_gain_map_gpu,
+                self.input_gain_stage_gpu,
                self.cell_table_gpu,
                cupy.uint8(flags),
                self.offset_map_gpu,
@@ -128,6 +134,98 @@ class JungfrauGpuRunner(base_gpu.BaseGpuRunner):
        )
+class JungfrauCpuRunner(BaseKernelRunner):
+    _corrected_axis_order = "cyx"
+    def __init__(
+        self,
+        pixels_x,
+        pixels_y,
+        memory_cells,
+        constant_memory_cells,
+        input_data_dtype=np.uint16,
+        output_data_dtype=np.float32,  # TODO: configurable
+        bad_pixel_mask_value=np.nan,
+    ):
+        super().__init__(
+            pixels_x,
+            pixels_y,
+            memory_cells,
+            constant_memory_cells,
+            input_data_dtype,
+            output_data_dtype,
+        )
+        from .kernels import jungfrau_cython
+        self.correction_kernel_single = jungfrau_cython.correct_single
+        self.correction_kernel_burst = jungfrau_cython.correct_burst
+        self.input_shape = (memory_cells, pixels_y, pixels_x)
+        self.preview_buffer_getters.append(self._get_gain_stage_for_preview)
+        self.processed_shape = self.input_shape
+        self.map_shape = (self.constant_memory_cells, self.pixels_y, self.pixels_x, 3)
+        self.offset_map = np.empty(self.map_shape, dtype=np.float32)
+        self.rel_gain_map = np.empty(self.map_shape, dtype=np.float32)
+        self.bad_pixel_map = np.empty(self.map_shape, dtype=np.uint32)
+        self.bad_pixel_mask_value = bad_pixel_mask_value
+        self.flush_buffers()
+        self.input_data = None  # will just point to data coming in
+        self.input_gain_stage = None  # will just point to data coming in
+        self.processed_data = None  # will just point to buffer we're given
+    def _get_raw_for_preview(self):
+        return self.input_data
+    def _get_corrected_for_preview(self):
+        return self.processed_data
+    def _get_gain_stage_for_preview(self):
+        return self.input_gain_stage
+    @property
+    def burst_mode(self):
+        return self.memory_cells > 1
+    def load_data(self, image_data, input_gain_stage, cell_table):
+        self.input_data = image_data.astype(np.uint16, copy=False)
+        self.input_gain_stage = input_gain_stage.astype(np.uint8, copy=False)
+        self.input_cell_table = cell_table.astype(np.uint8, copy=False)
+    def flush_buffers(self):
+        self.offset_map.fill(0)
+        self.rel_gain_map.fill(1)
+        self.bad_pixel_map.fill(0)
+    def correct(self, flags, out=None):
+        if out is None:
+            out = np.empty(self.processed_shape, dtype=np.float32)
+        if self.burst_mode:
+            self.correction_kernel_burst(
+                self.input_data,
+                self.input_gain_stage,
+                self.input_cell_table,
+                flags,
+                self.offset_map,
+                self.rel_gain_map,
+                self.bad_pixel_map,
+                self.bad_pixel_mask_value,
+                out,
+            )
+        else:
+            self.correction_kernel_single(
+                self.input_data,
+                self.input_gain_stage,
+                flags,
+                self.offset_map,
+                self.rel_gain_map,
+                self.bad_pixel_map,
+                self.bad_pixel_mask_value,
+                out,
+            )
+        self.processed_data = out
 class JungfrauCalcatFriend(base_calcat.BaseCalcatFriend):
    _constant_enum_class = JungfrauConstants
@@ -172,12 +270,6 @@ class JungfrauCalcatFriend(base_calcat.BaseCalcatFriend):
            .key(f"{param_prefix}.biasVoltage")
            .setNewDefaultValue(90)
            .commit(),
-            # JUNGFRAU data is small, can fit plenty of trains in here
-            OVERWRITE_ELEMENT(expected)
-            .key("outputShmemBufferSize")
-            .setNewDefaultValue(2)
-            .commit(),
        )
        # add extra parameters
@@ -255,7 +347,7 @@ class JungfrauCorrection(BaseCorrection):
        ("relGain", CorrectionFlags.REL_GAIN),
        ("badPixels", CorrectionFlags.BPMASK),
    )
-    _kernel_runner_class = JungfrauGpuRunner
+    _kernel_runner_class = None  # note: set in __init__ based on config
    _calcat_friend_class = JungfrauCalcatFriend
    _constant_enum_class = JungfrauConstants
    _managed_keys = BaseCorrection._managed_keys.copy()
@@ -264,7 +356,6 @@ class JungfrauCorrection(BaseCorrection):
    @staticmethod
    def expectedParameters(expected):
-        super(JungfrauCorrection, JungfrauCorrection).expectedParameters(expected)
        (
            OVERWRITE_ELEMENT(expected)
            .key("dataFormat.pixelsX")
@@ -285,8 +376,26 @@ class JungfrauCorrection(BaseCorrection):
            .key("preview.selectionMode")
            .setNewDefaultValue("frame")
            .commit(),
+            # JUNGFRAU data is small, can fit plenty of trains in here
+            OVERWRITE_ELEMENT(expected)
+            .key("outputShmemBufferSize")
+            .setNewDefaultValue(2)
+            .commit(),
        )
+        # support both CPU and GPU kernels
+        (
+            STRING_ELEMENT(expected)
+            .key("kernelType")
+            .assignmentOptional()
+            .defaultValue(KernelRunnerVersions.CPU.name)
+            .options(",".join(kernel_type.name for kernel_type in KernelRunnerVersions))
+            .reconfigurable()
+            .commit(),
+        )
+        JungfrauCorrection._managed_keys.add("kernelType")
        (
            OUTPUT_CHANNEL(expected)
            .key("preview.outputGainMap")
@@ -320,6 +429,11 @@ class JungfrauCorrection(BaseCorrection):
    def __init__(self, config):
        super().__init__(config)
+        kernel_type = KernelRunnerVersions[config["kernelType"]]
+        if kernel_type is KernelRunnerVersions.CPU:
+            self._kernel_runner_class = JungfrauCpuRunner
+        else:
+            self._kernel_runner_class = JungfrauGpuRunner
        # TODO: gain mode as constant parameter and / or device configuration
        try:
@@ -406,20 +520,34 @@ class JungfrauCorrection(BaseCorrection):
            constant_data = np.transpose(constant_data, (2, 1, 0, 3))
        else:
            constant_data = np.transpose(constant_data, (2, 0, 1, 3))
+        kernel_type = KernelRunnerVersions[self.get("kernelType")]
        if constant is JungfrauConstants.Offset10Hz:
-            self.kernel_runner.offset_map_gpu.set(constant_data.astype(np.float32))
+            if kernel_type is KernelRunnerVersions.CPU:
+                self.kernel_runner.offset_map[:] = constant_data.astype(np.float32)
+            else:
+                self.kernel_runner.offset_map_gpu.set(constant_data.astype(np.float32))
            if not self.get("corrections.offset.available"):
                self.set("corrections.offset.available", True)
        elif constant is JungfrauConstants.RelativeGain10Hz:
-            self.kernel_runner.rel_gain_map_gpu.set(constant_data.astype(np.float32))
+            if kernel_type is KernelRunnerVersions.CPU:
+                self.kernel_runner.rel_gain_map[:] = constant_data.astype(np.float32)
+            else:
+                self.kernel_runner.rel_gain_map_gpu.set(
+                    constant_data.astype(np.float32)
+                )
            if not self.get("corrections.relGain.available"):
                self.set("corrections.relGain.available", True)
        elif constant in (
-                JungfrauConstants.BadPixelsDark10Hz, JungfrauConstants.BadPixelsFF10Hz
+            JungfrauConstants.BadPixelsDark10Hz,
+            JungfrauConstants.BadPixelsFF10Hz,
        ):
-            self.kernel_runner.bad_pixel_map_gpu |= cupy.asarray(constant_data)
+            if kernel_type is KernelRunnerVersions.CPU:
+                self.kernel_runner.bad_pixel_map |= constant_data
+            else:
+                self.kernel_runner.bad_pixel_map_gpu |= cupy.asarray(constant_data)
            if not self.get("corrections.badPixels.available"):
                self.set("corrections.badPixels.available", True)
        self._update_correction_flags()
-        self.log_status_info(f"Done loading {constant.name} to GPU")
+        self.log_status_info(f"Done loading {constant.name} to runner")
--- a/src/calng/LpdCorrection.py
+++ b/src/calng/LpdCorrection.py
@@ -11,7 +11,7 @@ from karabo.bound import (
    VECTOR_STRING_ELEMENT,
 )
-from . import base_gpu, base_calcat, utils
+from . import base_kernel_runner, base_calcat, utils
 from ._version import version as deviceVersion
 from .base_correction import BaseCorrection, add_correction_step_schema, preview_schema
@@ -34,7 +34,7 @@ class CorrectionFlags(enum.IntFlag):
    BPMASK = 16
-class LpdGpuRunner(base_gpu.BaseGpuRunner):
+class LpdGpuRunner(base_kernel_runner.BaseGpuRunner):
    _kernel_source_filename = "lpd_gpu.cu"
    _corrected_axis_order = "cxy"

--- a/src/calng/base_kernel_runner.py
+++ b/src/calng/base_kernel_runner.py
@@ -43,7 +43,7 @@ class BaseKernelRunner:
        Detector-specific subclass must define this method. It should assume that image
        data, cell table, and other data (including constants) has already been loaded.
        It should probably run some GPU kernel and output should go into
-        self.processed_data_gpu.
+        self.processed_data(_gpu).
        Keep in mind that user only gets output from compute_preview or reshape
        (either of these should come after correct).
@@ -62,7 +62,7 @@ class BaseKernelRunner:
        raise NotImplementedError()
    def _get_corrected_for_preview(self):
-        """Should return view of self.processed_data_gpu with shape (cell, x/y, x/y)"""
+        """Should return view of self.processed_data(_gpu) with shape (cell, x/y, x/y)"""
        raise NotImplementedError()
    def flush_buffers(self):
@@ -99,21 +99,37 @@ class BaseKernelRunner:
        here for now (and can differ between AGIPD and DSSC)"""
        if preview_index >= 0:
            # TODO: reuse pinned buffers for this
-            return image_data[preview_index].astype(np.float32).get()
+            return image_data[preview_index].astype(np.float32)
        elif preview_index == -1:
            # TODO: confirm that max is pixel and not integrated intensity
            # separate from next case because dtype not applicable here
-            return cupy.nanmax(image_data, axis=0).astype(cupy.float32).get()
+            return np.nanmax(image_data, axis=0).astype(np.float32)
        elif preview_index in (-2, -3, -4):
            stat_fun = {
-                -2: cupy.nanmean,
+                -2: np.nanmean,
-                -3: cupy.nansum,
+                -3: np.nansum,
-                -4: cupy.nanstd,
+                -4: np.nanstd,
            }[preview_index]
-            return stat_fun(image_data, axis=0, dtype=cupy.float32).get()
+            return stat_fun(image_data, axis=0, dtype=np.float32)
+    def reshape(self, output_order, out=None):
+        """Move axes to desired output order"""
+        # TODO: avoid copy
+        if output_order == self._corrected_axis_order:
+            self.reshaped_data = self.processed_data
+        else:
+            self.reshaped_data = np.transpose(
+                self.processed_data,
+                utils.transpose_order(self._corrected_axis_order, output_order),
+            )
-class BaseGpuRunner(base_kernel_runner):
+        if out is None:
+            return self.reshaped_data
+        else:
+            out[:] = self.reshaped_data
+class BaseGpuRunner(BaseKernelRunner):
    """Class to handle GPU buffers and execution of CUDA kernels on image data
    All GPU buffers are kept within this class and it is intentionally very stateful.
@@ -214,3 +230,6 @@ class BaseGpuRunner(base_kernel_runner):
            utils.ceil_div(a_length, block_length)
            for (a_length, block_length) in zip(self.processed_shape, full_block)
        )
+    def _compute_a_preview(self, image_data, preview_index):
+        return super()._compute_a_preview(image_data, preview_index).get()
--- a/src/calng/kernels/jungfrau_cpu.pyx
+++ b/src/calng/kernels/jungfrau_cpu.pyx
+# cython: boundscheck=False
+# cython: cdivision=True
+# cython: wrapararound=False
+cdef unsigned char NONE = 0
+cdef unsigned char OFFSET = 1
+cdef unsigned char REL_GAIN = 2
+cdef unsigned char BPMASK = 4
+from cython.parallel import prange
+def correct_burst(
+    unsigned short[:, :, :] image_data,
+    unsigned char[:, :, :] gain_stage,
+    unsigned char[:] cell_table,
+    unsigned char flags,
+    float[:, :, :, :] offset_map,
+    float[:, :, :, :] relgain_map,
+    unsigned[:, :, :, :] badpixel_mask,
+    float badpixel_fill_value,
+    float[:, :, :] output,
+):
+    cdef int input_cell, map_cell, x, y
+    cdef unsigned char gain
+    cdef float corrected
+    for input_cell in prange(image_data.shape[0], nogil=True):
+        map_cell = cell_table[input_cell]
+        if map_cell >= offset_map.shape[0]:
+            for y in range(image_data.shape[1]):
+                for x in range(image_data.shape[2]):
+                    output[input_cell, y, x] = <float>image_data[input_cell, y, x]
+            continue
+        for y in range(image_data.shape[1]):
+            for x in range(image_data.shape[2]):
+                corrected = image_data[input_cell, y, x]
+                gain = gain_stage[input_cell, y, x]
+                # legal values: 0, 1, or 3
+                if gain == 2:
+                    corrected = badpixel_fill_value
+                else:
+                    if gain == 3:
+                        gain = 2
+                    if (flags & BPMASK) and badpixel_mask[map_cell, y, x, gain] != 0:
+                        corrected = badpixel_fill_value
+                    else:
+                        if (flags & OFFSET):
+                            corrected = corrected - offset_map[map_cell, y, x, gain]
+                        if (flags & REL_GAIN):
+                            corrected = corrected / relgain_map[map_cell, y, x, gain]
+                output[input_cell, y, x] = corrected
+def correct_single(
+    unsigned short[:, :, :] image_data,
+    unsigned char[:, :, :] gain_stage,
+    unsigned char flags,
+    float[:, :, :] offset_map,
+    float[:, :, :] relgain_map,
+    unsigned[:, :, :] badpixel_mask,
+    float badpixel_fill_value,
+    float[:, :, :] output,
+):
+    # ignore "cell table", constant pretends cell 0
+    cdef int x, y
+    cdef unsigned char gain
+    cdef float corrected
+    for y in range(image_data.shape[1]):
+        for x in range(image_data.shape[2]):
+            corrected = image_data[0, y, x]
+            gain = gain_stage[0, y, x]
+            # legal values: 0, 1, or 3
+            if gain == 2:
+                corrected = badpixel_fill_value
+            else:
+                if gain == 3:
+                    gain = 2
+                if (flags & BPMASK) and badpixel_mask[y, x, gain] != 0:
+                    corrected = badpixel_fill_value
+                else:
+                    if (flags & OFFSET):
+                        corrected = corrected - offset_map[y, x, gain]
+                    if (flags & REL_GAIN):
+                        corrected = corrected / relgain_map[y, x, gain]
+            output[0, y, x] = corrected