From a4afb6d0dab7304fc1cb3d57f740e40b007118f2 Mon Sep 17 00:00:00 2001
From: David Hammer <dhammer@mailbox.org>
Date: Mon, 30 Aug 2021 20:09:44 +0200
Subject: [PATCH] Refactor: correct / cast before reshape

---
 src/calng/DsscCorrection.py    |   6 +-
 src/calng/dssc_gpu.py          | 151 +++++++++++++++------------------
 src/calng/gpu-dssc-correct.cpp |  42 ++++-----
 src/tests/test_dssc_kernels.py |  82 ++++++++++--------
 4 files changed, 142 insertions(+), 139 deletions(-)

diff --git a/src/calng/DsscCorrection.py b/src/calng/DsscCorrection.py
index e922572e..995b8689 100644
--- a/src/calng/DsscCorrection.py
+++ b/src/calng/DsscCorrection.py
@@ -544,13 +544,13 @@ class DsscCorrection(calibrationBase.CalibrationReceiverBaseDevice):
                     self.set("status", msg)
 
             self.gpu_runner.load_data(image_data)
-            self.gpu_runner.reshape()
             buffer_handle, buffer_array = self._shmem_buffer.next_slot()
             if do_apply_correction:
                 self.gpu_runner.load_cell_table(cell_table)
-                self.gpu_runner.correct(out=buffer_array)
+                self.gpu_runner.correct()
             else:
-                self.gpu_runner.only_cast(out=buffer_array)
+                self.gpu_runner.only_cast()
+            self.gpu_runner.reshape(out=buffer_array)
             if do_generate_preview:
                 preview_slice_index = self.get("preview.pulse")
                 if preview_slice_index >= 0:
diff --git a/src/calng/dssc_gpu.py b/src/calng/dssc_gpu.py
index 3d5e9349..7dacb363 100644
--- a/src/calng/dssc_gpu.py
+++ b/src/calng/dssc_gpu.py
@@ -18,15 +18,15 @@ class DsscGpuRunner:
     2. load_constants
     3. load_data
     4. load_cell_table
-    5. reshape
-    6. correct
-    7. compute_preview (optional)
+    5. correct
+    6a. reshape (only here is the processed data transferred back to host)
+    6b. compute_preview (optional)
 
     repeat from 2. or 3.
 
-    In case no constants are available / correction is not desired, can skip 3. and 4.
-    and use only_cast instead of correct (taking care to call compute_preview with
-    parameters set accordingly).
+    In case no constants are available / correction is not desired, can skip 2 and 4
+    (load_data is still required) and use only_cast in step 5 instead of correct
+    (taking care to call compute_preview with parameters set accordingly).
     """
 
     _src_dir = pathlib.Path(__file__).absolute().parent
@@ -58,10 +58,10 @@ class DsscGpuRunner:
         self.offset_map_gpu = cupy.empty(self.map_shape, dtype=np.float32)
 
         # reuse output arrays
-        self.input_data_gpu = cupy.empty(self.input_shape, dtype=input_data_dtype)
-        self.reshaped_data_gpu = cupy.empty(self.output_shape, dtype=input_data_dtype)
         self.cell_table_gpu = cupy.empty(self.memory_cells, dtype=np.uint16)
-        self.output_data_gpu = cupy.empty(self.output_shape, dtype=output_data_dtype)
+        self.input_data_gpu = cupy.empty(self.input_shape, dtype=input_data_dtype)
+        self.processed_data_gpu = cupy.empty(self.input_shape, dtype=output_data_dtype)
+        self.reshaped_data_gpu = cupy.empty(self.output_shape, dtype=output_data_dtype)
         self.preview_raw = cupyx.empty_pinned(self.preview_shape, dtype=np.float32)
         self.preview_corrected = cupyx.empty_pinned(
             self.preview_shape, dtype=np.float32
@@ -99,23 +99,10 @@ class DsscGpuRunner:
         self.full_block = tuple(full_block)
         self.full_grid = tuple(
             utils.ceil_div(a_length, block_length)
-            for (a_length, block_length) in zip(self.output_shape, full_block)
+            for (a_length, block_length) in zip(self.input_shape, full_block)
         )
 
-    def reshape(self):
-        """Do the reshaping that the splitter would have done
-
-        equivalent to:
-        output_data[:] = np.moveaxis(
-            np.squeeze(input_data), (0, 1, 2), (2, 1, 0)
-        )
-        """
-        # TODO: Move to somewhere else
-        self.reshaped_data_gpu[:] = cupy.ascontiguousarray(
-            cupy.transpose(cupy.squeeze(self.input_data_gpu))
-        )
-
-    def correct(self, out=None):
+    def correct(self):
         """Apply corrections to data (must load constant, data, and cell_table first)
 
         Applies corrections to input data and casts to desired output dtype.
@@ -127,16 +114,45 @@ class DsscGpuRunner:
         (view of) said buffer as an ndarray.  Keep in mind that the output
         buffers will get overwritten eventually (circular buffer).
         """
-        self._run_correct()
-        return self.output_data_gpu.get(out=out)
+        self.correction_kernel(
+            self.full_grid,
+            self.full_block,
+            (
+                self.input_data_gpu,
+                self.cell_table_gpu,
+                self.offset_map_gpu,
+                self.processed_data_gpu,
+            ),
+        )
 
-    def only_cast(self, out=None):
+    def only_cast(self):
         """Like correct without the correction
 
         This currently means just casting to output dtype.
         """
-        self._run_only_cast()
-        return self.output_data_gpu.get(out=out)
+        self.casting_kernel(
+            self.full_grid,
+            self.full_block,
+            (
+                self.input_data_gpu,
+                self.processed_data_gpu,
+            ),
+        )
+
+    def reshape(self, out=None):
+        # TODO: make order configurable
+        """Move axes to desired output order
+
+        equivalent to:
+        output_data[:] = np.moveaxis(
+            np.squeeze(processed_data), (0, 1, 2), (2, 1, 0)
+        )
+        """
+        # TODO: avoid copy
+        self.reshaped_data_gpu = cupy.ascontiguousarray(
+            cupy.transpose(cupy.squeeze(self.processed_data_gpu))
+        )
+        return self.reshaped_data_gpu.get(out=out)
 
     def compute_preview(self, preview_index, have_corrected=True, can_correct=True):
         """Generate single slice or reduction preview of raw and corrected data
@@ -160,40 +176,35 @@ class DsscGpuRunner:
 
         if not have_corrected:
             if can_correct:
-                self._run_correct()
+                self.correct()
             else:
                 print("Warning: corrected preview will not actually be corrected.")
-                self._run_only_cast()
+                self.only_cast()
 
         # TODO: enum around reduction type
-        if preview_index >= 0:
-            # TODO: change axis order when moving reshape to after correction
-            self.input_data_gpu[preview_index].astype(np.float32).transpose().get(
-                out=self.preview_corrected
-            )
-            self.output_data_gpu[..., preview_index].astype(np.float32).get(
-                out=self.preview_corrected
-            )
-        elif preview_index == -1:
-            # TODO: select argmax independently for raw and corrected?
-            # TODO: send frame sums somewhere to compute global max frame
-            max_index = cupy.argmax(
-                cupy.sum(self.output_data_gpu, axis=(0, 1), dtype=cupy.float64)
-            )
-            self.input_data_gpu[max_index].astype(np.float32).transpose().get(
-                out=self.preview_raw
-            )
-            self.output_data_gpu[..., max_index].astype(np.float32).get(
-                out=self.preview_corrected
-            )
-        elif preview_index in (-2, -3, -4):
-            stat_fun = {-2: cupy.mean, -3: cupy.sum, -4: cupy.std}[preview_index]
-            stat_fun(self.input_data_gpu, axis=0, dtype=cupy.float32).transpose().get(
-                out=self.preview_raw
-            )
-            stat_fun(self.output_data_gpu, axis=2, dtype=cupy.float32).get(
-                out=self.preview_corrected
-            )
+        for (image_data, output_buffer) in (
+            (self.input_data_gpu, self.preview_raw),
+            (self.processed_data_gpu, self.preview_corrected),
+        ):
+            if preview_index >= 0:
+                # data is in input axis order; transpose to match reshaped output
+                image_data[preview_index].astype(np.float32).transpose().get(
+                    out=output_buffer
+                )
+            elif preview_index == -1:
+                # TODO: select argmax independently for raw and corrected?
+                # TODO: send frame sums somewhere to compute global max frame
+                max_index = cupy.argmax(
+                    cupy.sum(image_data, axis=(1, 2), dtype=cupy.float32)
+                )
+                image_data[max_index].astype(np.float32).transpose().get(
+                    out=output_buffer
+                )
+            elif preview_index in (-2, -3, -4):
+                stat_fun = {-2: cupy.mean, -3: cupy.sum, -4: cupy.std}[preview_index]
+                stat_fun(
+                    image_data, axis=0, dtype=cupy.float32
+                ).transpose().get(out=output_buffer)
         return self.preview_raw, self.preview_corrected
 
     def _init_kernels(self):
@@ -214,25 +225,3 @@ class DsscGpuRunner:
         self.source_module = cupy.RawModule(code=kernel_source)
         self.correction_kernel = self.source_module.get_function("correct")
         self.casting_kernel = self.source_module.get_function("only_cast")
-
-    def _run_correct(self):
-        self.correction_kernel(
-            self.full_grid,
-            self.full_block,
-            (
-                self.reshaped_data_gpu,
-                self.cell_table_gpu,
-                self.offset_map_gpu,
-                self.output_data_gpu,
-            ),
-        )
-
-    def _run_only_cast(self):
-        self.casting_kernel(
-            self.full_grid,
-            self.full_block,
-            (
-                self.input_data_gpu,
-                self.output_data_gpu,
-            ),
-        )
diff --git a/src/calng/gpu-dssc-correct.cpp b/src/calng/gpu-dssc-correct.cpp
index 7154bbd2..2412a86a 100644
--- a/src/calng/gpu-dssc-correct.cpp
+++ b/src/calng/gpu-dssc-correct.cpp
@@ -16,27 +16,27 @@ extern "C" {
 		const size_t memory_cells = {{data_memory_cells}};
 		const size_t map_memory_cells = {{constant_memory_cells}};
 
-		const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
-		const size_t j = blockIdx.y * blockDim.y + threadIdx.y;
-		const size_t k = blockIdx.z * blockDim.z + threadIdx.z;
+		const size_t memory_cell = blockIdx.x * blockDim.x + threadIdx.x;
+		const size_t y = blockIdx.y * blockDim.y + threadIdx.y;
+		const size_t x = blockIdx.z * blockDim.z + threadIdx.z;
 
-		if (i >= X || j >= Y || k >= memory_cells) {
+		if (memory_cell >= memory_cells || y >= Y || x >= X) {
 			return;
 		}
 
 		// note: strides differ from numpy strides because unit here is sizeof(...), not byte
-		const size_t data_stride_2 = 1;
-		const size_t data_stride_1 = memory_cells * data_stride_2;
-		const size_t data_stride_0 = Y * data_stride_1;
-		const size_t data_index = i * data_stride_0 + j * data_stride_1 + k * data_stride_2;
+		const size_t data_stride_x = 1;
+		const size_t data_stride_y = X * data_stride_x;
+		const size_t data_stride_cell = Y * data_stride_y;
+		const size_t data_index = memory_cell * data_stride_cell + y * data_stride_y + x * data_stride_x;
 		const float raw = (float)data[data_index];
 
-		const size_t map_stride_2 = 1;
-		const size_t map_stride_1 = map_memory_cells * map_stride_2;
-		const size_t map_stride_0 = Y * map_stride_1;
-		const size_t map_cell = cell_table[k];
+		const size_t map_stride_cell = 1;
+		const size_t map_stride_y = map_memory_cells * map_stride_cell;
+		const size_t map_stride_x = Y * map_stride_y;
+		const size_t map_cell = cell_table[memory_cell];
 		if (map_cell < map_memory_cells) {
-			const size_t map_index = i * map_stride_0 + j * map_stride_1 + map_cell * map_stride_2;
+			const size_t map_index = map_cell * map_stride_cell + y * map_stride_y + x * map_stride_x;
 			const float corrected = raw - offset_map[map_index];
 			{% if output_data_dtype == "half" %}
 			output[data_index] = __float2half(corrected);
@@ -61,19 +61,19 @@ extern "C" {
 		const size_t Y = {{pixels_y}};
 		const size_t memory_cells = {{data_memory_cells}};
 
-		const size_t data_stride_2 = 1;
-		const size_t data_stride_1 = memory_cells * data_stride_2;
-		const size_t data_stride_0 = Y * data_stride_1;
+		const size_t data_stride_x = 1;
+		const size_t data_stride_y = X * data_stride_x;
+		const size_t data_stride_cell = Y * data_stride_y;
 
-		const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
-		const size_t j = blockIdx.y * blockDim.y + threadIdx.y;
-		const size_t k = blockIdx.z * blockDim.z + threadIdx.z;
+		const size_t cell = blockIdx.x * blockDim.x + threadIdx.x;
+		const size_t y = blockIdx.y * blockDim.y + threadIdx.y;
+		const size_t x = blockIdx.z * blockDim.z + threadIdx.z;
 
-		if (i >= X || j >= Y || k >= memory_cells) {
+		if (cell >= memory_cells || y >= Y || x >= X) {
 			return;
 		}
 
-		const size_t data_index = i * data_stride_0 + j * data_stride_1 + k * data_stride_2;
+		const size_t data_index = cell * data_stride_cell + y * data_stride_y + x * data_stride_x;
 		const float raw = (float)data[data_index];
 		{% if output_data_dtype == "half" %}
 		output[data_index] = __float2half(raw);
diff --git a/src/tests/test_dssc_kernels.py b/src/tests/test_dssc_kernels.py
index ca108003..bbaafa22 100644
--- a/src/tests/test_dssc_kernels.py
+++ b/src/tests/test_dssc_kernels.py
@@ -3,22 +3,24 @@ import pytest
 
 from calng import dssc_gpu
 
+input_dtype = np.uint16
+output_dtype = np.float16
+corr_dtype = np.float32
 pixels_x = 512
 pixels_y = 128
 memory_cells = 400
 offset_map = (
-    np.random.random(size=(pixels_x, pixels_y, memory_cells)).astype(np.float32) * 20
+    np.random.random(size=(pixels_x, pixels_y, memory_cells)).astype(corr_dtype) * 20
 )
 cell_table = np.arange(memory_cells, dtype=np.uint16)
 # TODO: also test out of (constant map) bound cell ID handling
 np.random.shuffle(cell_table)
-input_image_data = np.random.randint(
-    low=0, high=2000, size=(memory_cells, 1, pixels_y, pixels_x), dtype=np.uint16
+raw_data = np.random.randint(
+    low=0, high=2000, size=(memory_cells, pixels_y, pixels_x), dtype=input_dtype
 )
-reshaped_image_data = np.ascontiguousarray(np.transpose(np.squeeze(input_image_data)))
-corrected_image_data = (
-    reshaped_image_data.astype(np.float32) - offset_map[..., cell_table]
-).astype(np.float16)
+corrected_data = (
+    np.squeeze(raw_data).astype(np.float32) - offset_map.transpose()[cell_table, ...]
+).astype(output_dtype)
 
 # TODO: test non-contiguous memory cells
 # TODO: test graceful handling of cells not covered by correction map
@@ -27,77 +29,89 @@ kernel_runner = dssc_gpu.DsscGpuRunner(
     pixels_x,
     pixels_y,
     memory_cells,
-    input_data_dtype=np.uint16,
-    output_data_dtype=np.float16,
+    input_data_dtype=input_dtype,
+    output_data_dtype=output_dtype,
 )
 # TODO: initialize with map (avoid reallocation of buffer, recompilation of kernel)
 
 
-def test_reshape():
-    kernel_runner.load_data(input_image_data)
-    kernel_runner.reshape()
-    assert np.allclose(kernel_runner.reshaped_data_gpu.get(), reshaped_image_data)
+def test_only_cast():
+    kernel_runner.load_data(raw_data)
+    kernel_runner.only_cast()
+    assert np.allclose(
+        kernel_runner.processed_data_gpu.get(), raw_data.astype(output_dtype)
+    )
 
 
 def test_correct():
     kernel_runner.load_constants(offset_map)
-    kernel_runner.load_data(input_image_data)
+    kernel_runner.load_data(raw_data)
     kernel_runner.load_cell_table(cell_table)
-    kernel_runner.reshape()
-    res = kernel_runner.correct()
-    assert np.allclose(res, corrected_image_data)
+    kernel_runner.correct()
+    assert np.allclose(kernel_runner.processed_data_gpu.get(), corrected_data)
+
+
+def test_reshape():
+    kernel_runner.processed_data_gpu.set(corrected_data)
+    assert np.allclose(kernel_runner.reshape(), corrected_data.transpose())
+
+
+# TODO: test preview slice
 
 
 def test_preview_max():
     # can it find max intensity frame?
     # note: in case correction failed, still test this separately
-    kernel_runner.load_data(input_image_data)
-    kernel_runner.output_data_gpu.set(corrected_image_data)
+    kernel_runner.load_data(raw_data)
+    kernel_runner.processed_data_gpu.set(corrected_data)
     preview_raw, preview_corrected = kernel_runner.compute_preview(-1)
-    max_index = np.argmax(np.sum(corrected_image_data, axis=(0, 1), dtype=np.float32))
     assert np.allclose(
         preview_raw,
-        reshaped_image_data[..., max_index].astype(np.float32),
+        raw_data[np.argmax(np.sum(raw_data, axis=(1, 2), dtype=np.float32))]
+        .astype(np.float32)
+        .transpose(),
     )
     assert np.allclose(
         preview_corrected,
-        corrected_image_data[..., max_index].astype(np.float32),
+        corrected_data[np.argmax(np.sum(corrected_data, axis=(1, 2), dtype=np.float32))]
+        .astype(np.float32)
+        .transpose(),
     )
 
 
 def test_preview_mean():
-    kernel_runner.load_data(input_image_data)
-    kernel_runner.output_data_gpu.set(corrected_image_data)
+    kernel_runner.load_data(raw_data)
+    kernel_runner.processed_data_gpu.set(corrected_data)
     preview_raw, preview_corrected = kernel_runner.compute_preview(-2)
     assert np.allclose(
-        preview_raw, np.mean(reshaped_image_data, axis=2, dtype=np.float32)
+        preview_raw, np.mean(raw_data, axis=0, dtype=np.float32).transpose()
     )
     assert np.allclose(
-        preview_corrected, np.mean(corrected_image_data, axis=2, dtype=np.float32)
+        preview_corrected, np.mean(corrected_data, axis=0, dtype=np.float32).transpose()
     )
 
 
 def test_preview_sum():
-    kernel_runner.load_data(input_image_data)
-    kernel_runner.output_data_gpu.set(corrected_image_data)
+    kernel_runner.load_data(raw_data)
+    kernel_runner.processed_data_gpu.set(corrected_data)
     preview_raw, preview_corrected = kernel_runner.compute_preview(-3)
     assert np.allclose(
-        preview_raw, np.sum(reshaped_image_data, axis=2, dtype=np.float32)
+        preview_raw, np.sum(raw_data, axis=0, dtype=np.float32).transpose()
     )
     assert np.allclose(
-        preview_corrected, np.sum(corrected_image_data, axis=2, dtype=np.float32)
+        preview_corrected, np.sum(corrected_data, axis=0, dtype=np.float32).transpose()
     )
 
 
 def test_preview_std():
-    kernel_runner.load_data(input_image_data)
-    kernel_runner.output_data_gpu.set(corrected_image_data)
+    kernel_runner.load_data(raw_data)
+    kernel_runner.processed_data_gpu.set(corrected_data)
     preview_raw, preview_corrected = kernel_runner.compute_preview(-4)
     assert np.allclose(
-        preview_raw, np.std(reshaped_image_data, axis=2, dtype=np.float64)
+        preview_raw, np.std(raw_data, axis=0, dtype=np.float32).transpose()
     )
     assert np.allclose(
-        preview_corrected, np.std(corrected_image_data, axis=2, dtype=np.float64)
+        preview_corrected, np.std(corrected_data, axis=0, dtype=np.float32).transpose()
     )
 
 
-- 
GitLab