From a4afb6d0dab7304fc1cb3d57f740e40b007118f2 Mon Sep 17 00:00:00 2001 From: David Hammer <dhammer@mailbox.org> Date: Mon, 30 Aug 2021 20:09:44 +0200 Subject: [PATCH] Refactor: correct / cast before reshape --- src/calng/DsscCorrection.py | 6 +- src/calng/dssc_gpu.py | 151 +++++++++++++++------------------ src/calng/gpu-dssc-correct.cpp | 42 ++++----- src/tests/test_dssc_kernels.py | 82 ++++++++++-------- 4 files changed, 142 insertions(+), 139 deletions(-) diff --git a/src/calng/DsscCorrection.py b/src/calng/DsscCorrection.py index e922572e..995b8689 100644 --- a/src/calng/DsscCorrection.py +++ b/src/calng/DsscCorrection.py @@ -544,13 +544,13 @@ class DsscCorrection(calibrationBase.CalibrationReceiverBaseDevice): self.set("status", msg) self.gpu_runner.load_data(image_data) - self.gpu_runner.reshape() buffer_handle, buffer_array = self._shmem_buffer.next_slot() if do_apply_correction: self.gpu_runner.load_cell_table(cell_table) - self.gpu_runner.correct(out=buffer_array) + self.gpu_runner.correct() else: - self.gpu_runner.only_cast(out=buffer_array) + self.gpu_runner.only_cast() + self.gpu_runner.reshape(out=buffer_array) if do_generate_preview: preview_slice_index = self.get("preview.pulse") if preview_slice_index >= 0: diff --git a/src/calng/dssc_gpu.py b/src/calng/dssc_gpu.py index 3d5e9349..7dacb363 100644 --- a/src/calng/dssc_gpu.py +++ b/src/calng/dssc_gpu.py @@ -18,15 +18,15 @@ class DsscGpuRunner: 2. load_constants 3. load_data 4. load_cell_table - 5. reshape - 6. correct - 7. compute_preview (optional) + 5. correct + 6a. reshape (only here does data transfer back to host) + 6b. compute_preview (optional) repeat from 2. or 3. - In case no constants are available / correction is not desired, can skip 3. and 4. - and use only_cast instead of correct (taking care to call compute_preview with - parameters set accordingly). + In case no constants are available / correction is not desired, can skip 3 and 4 + and use only_cast in step 5 instead of correct (taking care to call + compute_preview with parameters set accordingly). """ _src_dir = pathlib.Path(__file__).absolute().parent @@ -58,10 +58,10 @@ class DsscGpuRunner: self.offset_map_gpu = cupy.empty(self.map_shape, dtype=np.float32) # reuse output arrays - self.input_data_gpu = cupy.empty(self.input_shape, dtype=input_data_dtype) - self.reshaped_data_gpu = cupy.empty(self.output_shape, dtype=input_data_dtype) self.cell_table_gpu = cupy.empty(self.memory_cells, dtype=np.uint16) - self.output_data_gpu = cupy.empty(self.output_shape, dtype=output_data_dtype) + self.input_data_gpu = cupy.empty(self.input_shape, dtype=input_data_dtype) + self.processed_data_gpu = cupy.empty(self.input_shape, dtype=output_data_dtype) + self.reshaped_data_gpu = cupy.empty(self.output_shape, dtype=output_data_dtype) self.preview_raw = cupyx.empty_pinned(self.preview_shape, dtype=np.float32) self.preview_corrected = cupyx.empty_pinned( self.preview_shape, dtype=np.float32 @@ -99,23 +99,10 @@ class DsscGpuRunner: self.full_block = tuple(full_block) self.full_grid = tuple( utils.ceil_div(a_length, block_length) - for (a_length, block_length) in zip(self.output_shape, full_block) + for (a_length, block_length) in zip(self.input_shape, full_block) ) - def reshape(self): - """Do the reshaping that the splitter would have done - - equivalent to: - output_data[:] = np.moveaxis( - np.squeeze(input_data), (0, 1, 2), (2, 1, 0) - ) - """ - # TODO: Move to somewhere else - self.reshaped_data_gpu[:] = cupy.ascontiguousarray( - cupy.transpose(cupy.squeeze(self.input_data_gpu)) - ) - - def correct(self, out=None): + def correct(self): """Apply corrections to data (must load constant, data, and cell_table first) Applies corrections to input data and casts to desired output dtype. @@ -127,16 +114,45 @@ class DsscGpuRunner: (view of) said buffer as an ndarray. Keep in mind that the output buffers will get overwritten eventually (circular buffer). """ - self._run_correct() - return self.output_data_gpu.get(out=out) + self.correction_kernel( + self.full_grid, + self.full_block, + ( + self.input_data_gpu, + self.cell_table_gpu, + self.offset_map_gpu, + self.processed_data_gpu, + ), + ) - def only_cast(self, out=None): + def only_cast(self): """Like correct without the correction This currently means just casting to output dtype. """ - self._run_only_cast() - return self.output_data_gpu.get(out=out) + self.casting_kernel( + self.full_grid, + self.full_block, + ( + self.input_data_gpu, + self.processed_data_gpu, + ), + ) + + def reshape(self, out=None): + # TODO: make order configurable + """Move axes to desired output order + + equivalent to: + output_data[:] = np.moveaxis( + np.squeeze(input_data), (0, 1, 2), (2, 1, 0) + ) + """ + # TODO: avoid copy + self.reshaped_data_gpu = cupy.ascontiguousarray( + cupy.transpose(cupy.squeeze(self.processed_data_gpu)) + ) + return self.reshaped_data_gpu.get(out=out) def compute_preview(self, preview_index, have_corrected=True, can_correct=True): """Generate single slice or reduction preview of raw and corrected data @@ -160,40 +176,35 @@ class DsscGpuRunner: if not have_corrected: if can_correct: - self._run_correct() + self.correct() else: print("Warning: corrected preview will not actually be corrected.") - self._run_only_cast() + self.only_cast() # TODO: enum around reduction type - if preview_index >= 0: - # TODO: change axis order when moving reshape to after correction - self.input_data_gpu[preview_index].astype(np.float32).transpose().get( - out=self.preview_corrected - ) - self.output_data_gpu[..., preview_index].astype(np.float32).get( - out=self.preview_corrected - ) - elif preview_index == -1: - # TODO: select argmax independently for raw and corrected? - # TODO: send frame sums somewhere to compute global max frame - max_index = cupy.argmax( - cupy.sum(self.output_data_gpu, axis=(0, 1), dtype=cupy.float64) - ) - self.input_data_gpu[max_index].astype(np.float32).transpose().get( - out=self.preview_raw - ) - self.output_data_gpu[..., max_index].astype(np.float32).get( - out=self.preview_corrected - ) - elif preview_index in (-2, -3, -4): - stat_fun = {-2: cupy.mean, -3: cupy.sum, -4: cupy.std}[preview_index] - stat_fun(self.input_data_gpu, axis=0, dtype=cupy.float32).transpose().get( - out=self.preview_raw - ) - stat_fun(self.output_data_gpu, axis=2, dtype=cupy.float32).get( - out=self.preview_corrected - ) + for (image_data, output_buffer) in ( + (self.input_data_gpu, self.preview_raw), + (self.processed_data_gpu, self.preview_corrected), + ): + if preview_index >= 0: + # TODO: change axis order when moving reshape to after correction + image_data[preview_index].astype(np.float32).transpose().get( + out=output_buffer + ) + elif preview_index == -1: + # TODO: select argmax independently for raw and corrected? + # TODO: send frame sums somewhere to compute global max frame + max_index = cupy.argmax( + cupy.sum(image_data, axis=(1, 2), dtype=cupy.float32) + ) + image_data[max_index].astype(np.float32).transpose().get( + out=output_buffer + ) + elif preview_index in (-2, -3, -4): + stat_fun = {-2: cupy.mean, -3: cupy.sum, -4: cupy.std}[preview_index] + stat_fun( + image_data, axis=0, dtype=cupy.float32 + ).transpose().get(out=output_buffer) return self.preview_raw, self.preview_corrected def _init_kernels(self): @@ -214,25 +225,3 @@ class DsscGpuRunner: self.source_module = cupy.RawModule(code=kernel_source) self.correction_kernel = self.source_module.get_function("correct") self.casting_kernel = self.source_module.get_function("only_cast") - - def _run_correct(self): - self.correction_kernel( - self.full_grid, - self.full_block, - ( - self.reshaped_data_gpu, - self.cell_table_gpu, - self.offset_map_gpu, - self.output_data_gpu, - ), - ) - - def _run_only_cast(self): - self.casting_kernel( - self.full_grid, - self.full_block, - ( - self.input_data_gpu, - self.output_data_gpu, - ), - ) diff --git a/src/calng/gpu-dssc-correct.cpp b/src/calng/gpu-dssc-correct.cpp index 7154bbd2..2412a86a 100644 --- a/src/calng/gpu-dssc-correct.cpp +++ b/src/calng/gpu-dssc-correct.cpp @@ -16,27 +16,27 @@ extern "C" { const size_t memory_cells = {{data_memory_cells}}; const size_t map_memory_cells = {{constant_memory_cells}}; - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - const size_t j = blockIdx.y * blockDim.y + threadIdx.y; - const size_t k = blockIdx.z * blockDim.z + threadIdx.z; + const size_t memory_cell = blockIdx.x * blockDim.x + threadIdx.x; + const size_t y = blockIdx.y * blockDim.y + threadIdx.y; + const size_t x = blockIdx.z * blockDim.z + threadIdx.z; - if (i >= X || j >= Y || k >= memory_cells) { + if (memory_cell >= memory_cells || y >= Y || x >= X) { return; } // note: strides differ from numpy strides because unit here is sizeof(...), not byte - const size_t data_stride_2 = 1; - const size_t data_stride_1 = memory_cells * data_stride_2; - const size_t data_stride_0 = Y * data_stride_1; - const size_t data_index = i * data_stride_0 + j * data_stride_1 + k * data_stride_2; + const size_t data_stride_x = 1; + const size_t data_stride_y = X * data_stride_x; + const size_t data_stride_cell = Y * data_stride_y; + const size_t data_index = memory_cell * data_stride_cell + y * data_stride_y + x * data_stride_x; const float raw = (float)data[data_index]; - const size_t map_stride_2 = 1; - const size_t map_stride_1 = map_memory_cells * map_stride_2; - const size_t map_stride_0 = Y * map_stride_1; - const size_t map_cell = cell_table[k]; + const size_t map_stride_cell = 1; + const size_t map_stride_y = map_memory_cells * map_stride_cell; + const size_t map_stride_x = Y * map_stride_y; + const size_t map_cell = cell_table[memory_cell]; if (map_cell < map_memory_cells) { - const size_t map_index = i * map_stride_0 + j * map_stride_1 + map_cell * map_stride_2; + const size_t map_index = map_cell * map_stride_cell + y * map_stride_y + x * map_stride_x; const float corrected = raw - offset_map[map_index]; {% if output_data_dtype == "half" %} output[data_index] = __float2half(corrected); @@ -61,19 +61,19 @@ extern "C" { const size_t Y = {{pixels_y}}; const size_t memory_cells = {{data_memory_cells}}; - const size_t data_stride_2 = 1; - const size_t data_stride_1 = memory_cells * data_stride_2; - const size_t data_stride_0 = Y * data_stride_1; + const size_t data_stride_x = 1; + const size_t data_stride_y = X * data_stride_x; + const size_t data_stride_cell = Y * data_stride_y; - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - const size_t j = blockIdx.y * blockDim.y + threadIdx.y; - const size_t k = blockIdx.z * blockDim.z + threadIdx.z; + const size_t cell = blockIdx.x * blockDim.x + threadIdx.x; + const size_t y = blockIdx.y * blockDim.y + threadIdx.y; + const size_t x = blockIdx.z * blockDim.z + threadIdx.z; - if (i >= X || j >= Y || k >= memory_cells) { + if (cell >= memory_cells || y >= Y || x >= X) { return; } - const size_t data_index = i * data_stride_0 + j * data_stride_1 + k * data_stride_2; + const size_t data_index = cell * data_stride_cell + y * data_stride_y + x * data_stride_x; const float raw = (float)data[data_index]; {% if output_data_dtype == "half" %} output[data_index] = __float2half(raw); diff --git a/src/tests/test_dssc_kernels.py b/src/tests/test_dssc_kernels.py index ca108003..bbaafa22 100644 --- a/src/tests/test_dssc_kernels.py +++ b/src/tests/test_dssc_kernels.py @@ -3,22 +3,24 @@ import pytest from calng import dssc_gpu +input_dtype = np.uint16 +output_dtype = np.float16 +corr_dtype = np.float32 pixels_x = 512 pixels_y = 128 memory_cells = 400 offset_map = ( - np.random.random(size=(pixels_x, pixels_y, memory_cells)).astype(np.float32) * 20 + np.random.random(size=(pixels_x, pixels_y, memory_cells)).astype(corr_dtype) * 20 ) cell_table = np.arange(memory_cells, dtype=np.uint16) # TODO: also test out of (constant map) bound cell ID handling np.random.shuffle(cell_table) -input_image_data = np.random.randint( - low=0, high=2000, size=(memory_cells, 1, pixels_y, pixels_x), dtype=np.uint16 +raw_data = np.random.randint( + low=0, high=2000, size=(memory_cells, pixels_y, pixels_x), dtype=input_dtype ) -reshaped_image_data = np.ascontiguousarray(np.transpose(np.squeeze(input_image_data))) -corrected_image_data = ( - reshaped_image_data.astype(np.float32) - offset_map[..., cell_table] -).astype(np.float16) +corrected_data = ( + np.squeeze(raw_data).astype(np.float32) - offset_map.transpose()[cell_table, ...] +).astype(output_dtype) # TODO: test non-contiguous memory cells # TODO: test graceful handling of cells not covered by correction map @@ -27,77 +29,89 @@ kernel_runner = dssc_gpu.DsscGpuRunner( pixels_x, pixels_y, memory_cells, - input_data_dtype=np.uint16, - output_data_dtype=np.float16, + input_data_dtype=input_dtype, + output_data_dtype=output_dtype, ) # TODO: initialize with map (avoid reallocation of buffer, recompilation of kernel) -def test_reshape(): - kernel_runner.load_data(input_image_data) - kernel_runner.reshape() - assert np.allclose(kernel_runner.reshaped_data_gpu.get(), reshaped_image_data) +def test_only_cast(): + kernel_runner.load_data(raw_data) + kernel_runner.only_cast() + assert np.allclose( + kernel_runner.processed_data_gpu.get(), raw_data.astype(output_dtype) + ) def test_correct(): kernel_runner.load_constants(offset_map) - kernel_runner.load_data(input_image_data) + kernel_runner.load_data(raw_data) kernel_runner.load_cell_table(cell_table) - kernel_runner.reshape() - res = kernel_runner.correct() - assert np.allclose(res, corrected_image_data) + kernel_runner.correct() + assert np.allclose(kernel_runner.processed_data_gpu.get(), corrected_data) + + +def test_reshape(): + kernel_runner.processed_data_gpu.set(corrected_data) + assert np.allclose(kernel_runner.reshape(), corrected_data.transpose()) + + +# TODO: test preview slice def test_preview_max(): # can it find max intensity frame? # note: in case correction failed, still test this separately - kernel_runner.load_data(input_image_data) - kernel_runner.output_data_gpu.set(corrected_image_data) + kernel_runner.load_data(raw_data) + kernel_runner.processed_data_gpu.set(corrected_data) preview_raw, preview_corrected = kernel_runner.compute_preview(-1) - max_index = np.argmax(np.sum(corrected_image_data, axis=(0, 1), dtype=np.float32)) assert np.allclose( preview_raw, - reshaped_image_data[..., max_index].astype(np.float32), + raw_data[np.argmax(np.sum(raw_data, axis=(1, 2), dtype=np.float32))] + .astype(np.float32) + .transpose(), ) assert np.allclose( preview_corrected, - corrected_image_data[..., max_index].astype(np.float32), + corrected_data[np.argmax(np.sum(corrected_data, axis=(1, 2), dtype=np.float32))] + .astype(np.float32) + .transpose(), ) def test_preview_mean(): - kernel_runner.load_data(input_image_data) - kernel_runner.output_data_gpu.set(corrected_image_data) + kernel_runner.load_data(raw_data) + kernel_runner.processed_data_gpu.set(corrected_data) preview_raw, preview_corrected = kernel_runner.compute_preview(-2) assert np.allclose( - preview_raw, np.mean(reshaped_image_data, axis=2, dtype=np.float32) + preview_raw, np.mean(raw_data, axis=0, dtype=np.float32).transpose() ) assert np.allclose( - preview_corrected, np.mean(corrected_image_data, axis=2, dtype=np.float32) + preview_corrected, np.mean(corrected_data, axis=0, dtype=np.float32).transpose() ) def test_preview_sum(): - kernel_runner.load_data(input_image_data) - kernel_runner.output_data_gpu.set(corrected_image_data) + kernel_runner.load_data(raw_data) + kernel_runner.processed_data_gpu.set(corrected_data) preview_raw, preview_corrected = kernel_runner.compute_preview(-3) assert np.allclose( - preview_raw, np.sum(reshaped_image_data, axis=2, dtype=np.float32) + preview_raw, np.sum(raw_data, axis=0, dtype=np.float32).transpose() ) assert np.allclose( - preview_corrected, np.sum(corrected_image_data, axis=2, dtype=np.float32) + preview_corrected, np.sum(corrected_data, axis=0, dtype=np.float32).transpose() ) def test_preview_std(): - kernel_runner.load_data(input_image_data) - kernel_runner.output_data_gpu.set(corrected_image_data) + kernel_runner.load_data(raw_data) + kernel_runner.processed_data_gpu.set(corrected_data) preview_raw, preview_corrected = kernel_runner.compute_preview(-4) assert np.allclose( - preview_raw, np.std(reshaped_image_data, axis=2, dtype=np.float64) + preview_raw, np.std(raw_data, axis=0, dtype=np.float32).transpose() ) assert np.allclose( - preview_corrected, np.std(corrected_image_data, axis=2, dtype=np.float64) + preview_corrected, np.std(corrected_data, axis=0, dtype=np.float32).transpose() ) -- GitLab