From b90d24cf77358735ee12f16a2672aacce2c11ba5 Mon Sep 17 00:00:00 2001 From: David Hammer <dhammer@mailbox.org> Date: Thu, 13 Feb 2025 17:47:52 +0100 Subject: [PATCH] Custom kernel to count --- src/calng/kernels/gaincount.cu | 54 ++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 src/calng/kernels/gaincount.cu diff --git a/src/calng/kernels/gaincount.cu b/src/calng/kernels/gaincount.cu new file mode 100644 index 00000000..a6176a1d --- /dev/null +++ b/src/calng/kernels/gaincount.cu @@ -0,0 +1,54 @@ +extern "C" { + __global__ void count_pixels_per_gain_stage(const char* gain_map, // num_frames x ss_dim x fs_dim + const unsigned short num_frames, + unsigned int* counts) { // output: num_frames x num_gain_stages + if (blockIdx.x >= num_frames) { + return; + } + const size_t ss_dim = {{ss_dim}}; + const size_t fs_dim = {{fs_dim}}; + const char* frame_start = gain_map + ss_dim * fs_dim * blockIdx.x; + // block x: handle frame x + // block y: block within frame x (merge slow scan / fast scan, it doesn't matter) + // so blockDim y is size of group for parallel reduction + assert(blockDim.x == 1); + assert(blockDim.y == {{block_size}}); + + // first: grid-stride loop to compute local part + unsigned int my_res[{{num_gain_stages}}] = {0}; + for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < ss_dim * fs_dim; i += blockDim.y * gridDim.y) { + const char gain_value = frame_start[i]; + if (gain_value < 0 || gain_value >= {{num_gain_stages}}) { + continue; + } + my_res[gain_value] += 1; + } + + // share with the class + // each thread will write num_gain_stages adjacent values + __shared__ unsigned int group_res[{{block_size * num_gain_stages}}]; + for (int i=0; i<{{num_gain_stages}}; ++i) { + group_res[threadIdx.y * {{num_gain_stages}} + i] = my_res[i]; + } + + // now for parallel reduction + __syncthreads(); + int active_threads = blockDim.y; + while (active_threads > 1) { + active_threads /= 2; + if (threadIdx.y < active_threads) { + for (int i=0; i<{{num_gain_stages}}; ++i) { + group_res[threadIdx.y * {{num_gain_stages}} + i] += group_res[(threadIdx.y + active_threads) * {{num_gain_stages}} + i]; + } + } + __syncthreads(); + } + + // and write out the result + if (threadIdx.y == 0) { + for (int i=0; i<{{num_gain_stages}}; ++i) { + counts[blockIdx.x * {{num_gain_stages}} + i] = group_res[i]; + } + } + } +} -- GitLab