Custom kernel to count

b90d24cf · David Hammer · f6d60ea9 · b90d24cf
Commit b90d24cf authored 1 month ago by David Hammer
--- a/src/calng/kernels/gaincount.cu
+++ b/src/calng/kernels/gaincount.cu
+extern "C" {
+    __global__ void count_pixels_per_gain_stage(const char* gain_map,  // num_frames x ss_dim x fs_dim
+                                                const unsigned short num_frames,
+                                                unsigned int* counts) {  // output: num_frames x num_gain_stages
+        if (blockIdx.x >= num_frames) {
+            return;
+        }
+        const size_t ss_dim = {{ss_dim}};
+        const size_t fs_dim = {{fs_dim}};
+        const char* frame_start = gain_map + ss_dim * fs_dim * blockIdx.x;
+        // block x: handle frame x
+        // block y: block within frame x (merge slow scan / fast scan, it doesn't matter)
+        // so blockDim y is size of group for parallel reduction
+        assert(blockDim.x == 1);
+        assert(blockDim.y == {{block_size}});
+
+        // first: grid-stride loop to compute local part
+        unsigned int my_res[{{num_gain_stages}}] = {0};
+        for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < ss_dim * fs_dim; i += blockDim.y * gridDim.y) {
+            const char gain_value = frame_start[i];
+            if (gain_value < 0 || gain_value >= {{num_gain_stages}}) {
+                continue;
+            }
+            my_res[gain_value] += 1;
+        }
+
+        // share with the class
+        // each thread will write num_gain_stages adjacent values
+        __shared__ unsigned int group_res[{{block_size * num_gain_stages}}];
+        for (int i=0; i<{{num_gain_stages}}; ++i) {
+            group_res[threadIdx.y * {{num_gain_stages}} + i] = my_res[i];
+        }
+
+        // now for parallel reduction
+        __syncthreads();
+        int active_threads = blockDim.y;
+        while (active_threads > 1) {
+            active_threads /= 2;
+            if (threadIdx.y < active_threads) {
+                for (int i=0; i<{{num_gain_stages}}; ++i) {
+                    group_res[threadIdx.y * {{num_gain_stages}} + i] += group_res[(threadIdx.y + active_threads) * {{num_gain_stages}} + i];
+                }
+            }
+            __syncthreads();
+        }
+
+        // and write out the result
+        if (threadIdx.y == 0) {
+            for (int i=0; i<{{num_gain_stages}}; ++i) {
+                counts[blockIdx.x * {{num_gain_stages}} + i] = group_res[i];
+            }
+        }
+    }
+}