From b90d24cf77358735ee12f16a2672aacce2c11ba5 Mon Sep 17 00:00:00 2001
From: David Hammer <dhammer@mailbox.org>
Date: Thu, 13 Feb 2025 17:47:52 +0100
Subject: [PATCH] Custom kernel to count pixels per gain stage

---
 src/calng/kernels/gaincount.cu | 54 ++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 src/calng/kernels/gaincount.cu

diff --git a/src/calng/kernels/gaincount.cu b/src/calng/kernels/gaincount.cu
new file mode 100644
index 00000000..a6176a1d
--- /dev/null
+++ b/src/calng/kernels/gaincount.cu
@@ -0,0 +1,54 @@
extern "C" {
    // Count, per frame, how many pixels fall into each gain stage.
    //
    // Template parameters (Jinja2, substituted before compilation):
    //   {{ss_dim}}, {{fs_dim}}  - slow-scan / fast-scan dimensions of one frame
    //   {{num_gain_stages}}     - number of valid gain stages
    //   {{block_size}}          - threads per block along y; must be a power of
    //                             two (the tree reduction below halves it)
    //
    // Expected launch layout:
    //   blockDim  = (1, {{block_size}})
    //   gridDim.x = num_frames (blocks with blockIdx.x >= num_frames exit early)
    //   gridDim.y = number of cooperating blocks per frame (>= 1)
    //
    // When gridDim.y > 1 each y-block holds only a partial histogram, so the
    // partials are combined with atomicAdd and `counts` MUST be zero-initialized
    // by the caller. With gridDim.y == 1 the result is stored directly (no
    // zeroing needed), matching the single-block-per-frame behavior.
    __global__ void count_pixels_per_gain_stage(const char* gain_map,  // num_frames x ss_dim x fs_dim
                                                const unsigned short num_frames,
                                                unsigned int* counts) {  // output: num_frames x num_gain_stages
        // blockIdx.x is uniform within a block, so this early return cannot
        // desynchronize the __syncthreads() calls below.
        if (blockIdx.x >= num_frames) {
            return;
        }
        const size_t ss_dim = {{ss_dim}};
        const size_t fs_dim = {{fs_dim}};
        const char* frame_start = gain_map + ss_dim * fs_dim * blockIdx.x;

        // block x: handle frame x
        // block y: block within frame x (merge slow scan / fast scan)
        assert(blockDim.x == 1);
        assert(blockDim.y == {{block_size}});
        // The tree reduction below drops threads if the block size is not a
        // power of two, which would silently lose counts.
        assert((blockDim.y & (blockDim.y - 1)) == 0);

        // Grid-stride loop (striding over gridDim.y) accumulating this
        // thread's private histogram. size_t avoids signed/unsigned mixing and
        // int overflow when ss_dim * fs_dim is large.
        unsigned int my_res[{{num_gain_stages}}] = {0};
        const size_t stride = (size_t)blockDim.y * gridDim.y;
        for (size_t i = (size_t)blockIdx.y * blockDim.y + threadIdx.y;
             i < ss_dim * fs_dim;
             i += stride) {
            const char gain_value = frame_start[i];
            // Skip out-of-range values (e.g. sentinel or corrupt pixels).
            if (gain_value < 0 || gain_value >= {{num_gain_stages}}) {
                continue;
            }
            my_res[gain_value] += 1;
        }

        // Publish each thread's histogram to shared memory; thread t owns the
        // {{num_gain_stages}} adjacent slots starting at t * {{num_gain_stages}}.
        __shared__ unsigned int group_res[{{block_size * num_gain_stages}}];
        for (int i = 0; i < {{num_gain_stages}}; ++i) {
            group_res[threadIdx.y * {{num_gain_stages}} + i] = my_res[i];
        }
        __syncthreads();

        // Tree reduction: halve the active thread count each round, each
        // surviving thread folding in its partner's histogram. The barrier is
        // outside the divergent branch so all threads reach it.
        for (int active_threads = blockDim.y / 2; active_threads > 0; active_threads /= 2) {
            if (threadIdx.y < active_threads) {
                for (int i = 0; i < {{num_gain_stages}}; ++i) {
                    group_res[threadIdx.y * {{num_gain_stages}} + i] +=
                        group_res[(threadIdx.y + active_threads) * {{num_gain_stages}} + i];
                }
            }
            __syncthreads();
        }

        // Write out the block's result. With several y-blocks per frame each
        // block has only a partial sum, so the partials must be accumulated
        // atomically (a plain store would race and keep just one partial);
        // a single block per frame can store directly, as before.
        if (threadIdx.y == 0) {
            for (int i = 0; i < {{num_gain_stages}}; ++i) {
                if (gridDim.y == 1) {
                    counts[blockIdx.x * {{num_gain_stages}} + i] = group_res[i];
                } else {
                    atomicAdd(&counts[blockIdx.x * {{num_gain_stages}} + i], group_res[i]);
                }
            }
        }
    }
}
-- 
GitLab