#include <cuda_fp16.h> __device__ unsigned short pulse_filter[{{pulse_filter|length}}] = { {{pulse_filter|join(', ')}} }; extern "C" { /* Reshuffle data from shape like (400, 1, 128, 512) to shape like (512, 128, <=400) That is, (cell, ???, y, x) to (x, y, cell) Applies pulse filter; essentially taking subset of indices along memory cell axis Equivalent to np.moveaxis(np.squeeze(data, (0, 1, 2), (2, 1, 0))) */ __global__ void reshape_4_3(const {{input_data_dtype}}* data, {{input_data_dtype}}* output) { const size_t X = {{pixels_x}}; const size_t Y = {{pixels_y}}; const size_t extra_dim = 1; // mysterious extra dimension in incoming data const size_t pulse_filter_size = {{pulse_filter|length}}; const size_t i = blockIdx.x * blockDim.x + threadIdx.x; const size_t j = blockIdx.y * blockDim.y + threadIdx.y; const size_t k = blockIdx.z * blockDim.z + threadIdx.z; if (i >= X || j >= Y || k >= pulse_filter_size) { // in case block size doesn't fit perfectly, some threads do nothing return; } const size_t in_stride_3 = 1; const size_t in_stride_2 = in_stride_3 * X; const size_t in_stride_1 = in_stride_2 * Y; const size_t in_stride_0 = in_stride_1 * extra_dim; const size_t in_index = pulse_filter[k] * in_stride_0 // k is cell + 0 * in_stride_1 // for completeness, the squeezed dimension + j * in_stride_2 // j is y + i * in_stride_3; // i is x const size_t out_stride_2 = 1; const size_t out_stride_1 = out_stride_2 * pulse_filter_size; const size_t out_stride_0 = out_stride_1 * Y; const size_t out_index = i * out_stride_0 + j * out_stride_1 + k * out_stride_2; output[out_index] = data[in_index]; } /* Perform correction: offset Take cell_table into account when getting correction values Converting to float for doing the correction Converting to output dtype at the end */ __global__ void correct(const {{input_data_dtype}}* data, const unsigned short* cell_table, const float* offset_map, {{output_data_dtype}}* output) { const size_t X = {{pixels_x}}; const size_t Y = {{pixels_y}}; // reshaped and output data have pulse filter length memory cells dim const size_t filtered_memory_cells = {{pulse_filter|length}}; // but correction map has some number which may even exceed input data's (due to veto pattern) const size_t map_memory_cells = {{constant_memory_cells}}; const size_t i = blockIdx.x * blockDim.x + threadIdx.x; const size_t j = blockIdx.y * blockDim.y + threadIdx.y; const size_t k = blockIdx.z * blockDim.z + threadIdx.z; if (i >= X || j >= Y || k >= filtered_memory_cells) { return; } // note: strides differ from numpy strides because unit here is sizeof(...), not byte const size_t data_stride_2 = 1; const size_t data_stride_1 = filtered_memory_cells * data_stride_2; const size_t data_stride_0 = Y * data_stride_1; const size_t data_index = i * data_stride_0 + j * data_stride_1 + k * data_stride_2; const float raw = (float)data[data_index]; const size_t map_stride_2 = 1; const size_t map_stride_1 = map_memory_cells * map_stride_2; const size_t map_stride_0 = Y * map_stride_1; const size_t map_cell = cell_table[k]; if (map_cell < map_memory_cells) { const size_t map_index = i * map_stride_0 + j * map_stride_1 + map_cell * map_stride_2; const float corrected = raw - offset_map[map_index]; {% if output_data_dtype == "half" %} output[data_index] = __float2half(corrected); {% else %} output[data_index] = ({{output_data_dtype}})corrected; {% endif %} } else { {% if output_data_dtype == "half" %} output[data_index] = __float2half(raw); {% else %} output[data_index] = ({{output_data_dtype}})raw; {% endif %} } } /* Same as correction, except don't do any correction */ __global__ void only_cast(const {{input_data_dtype}}* data, {{output_data_dtype}}* output) { const size_t X = {{pixels_x}}; const size_t Y = {{pixels_y}}; const size_t memory_cells = {{pulse_filter|length}}; const size_t data_stride_2 = 1; const size_t data_stride_1 = memory_cells * data_stride_2; const size_t data_stride_0 = Y * data_stride_1; const size_t i = blockIdx.x * blockDim.x + threadIdx.x; const size_t j = blockIdx.y * blockDim.y + threadIdx.y; const size_t k = blockIdx.z * blockDim.z + threadIdx.z; if (i >= X || j >= Y || k >= memory_cells) { return; } const size_t data_index = i * data_stride_0 + j * data_stride_1 + k * data_stride_2; const float raw = (float)data[data_index]; {% if output_data_dtype == "half" %} output[data_index] = __float2half(raw); {% else %} output[data_index] = ({{output_data_dtype}})raw; {% endif %} } /* Kernels for preview ≥0: just slice desired cell; uses cell_slice_preview_* -1: slice cell with max integrated intensity (hybrid) -2: mean; uses cell_stat_preview_* -3: sum; ditto -4: stdev; ditto Note: for loop in template due to differing dtypes TODO: simplify - [ ] set up C++ compilation chain on ONC - [ ] use C++ templates Or even better: - [ ] switch to cupy, all of this becomes trivial */ {% for (name_suffix, data_dtype) in (("raw", input_data_dtype), ("corrected", output_data_dtype)) %} __global__ void cell_slice_preview_{{name_suffix}}(const {{data_dtype}}* data, const short cell_to_preview, float* preview) { const size_t X = {{pixels_x}}; const size_t Y = {{pixels_y}}; const size_t filtered_memory_cells = {{pulse_filter|length}}; const size_t i = blockIdx.x * blockDim.x + threadIdx.x; const size_t j = blockIdx.y * blockDim.y + threadIdx.y; if (i >= X || j >= Y) { return; } const size_t preview_stride_1 = 1; const size_t preview_stride_0 = Y * preview_stride_1; const size_t preview_index = i * preview_stride_0 + j * preview_stride_1; const size_t data_stride_2 = 1; const size_t data_stride_1 = filtered_memory_cells * data_stride_2; const size_t data_stride_0 = Y * data_stride_1; const size_t data_index = i * data_stride_0 + j * data_stride_1 + cell_to_preview * data_stride_2; preview[preview_index] = (float)data[data_index]; } __global__ void cell_stat_preview_{{name_suffix}}(const {{data_dtype}}* data, const short preview_stat, float* preview) { const size_t X = {{pixels_x}}; const size_t Y = {{pixels_y}}; const size_t filtered_memory_cells = {{pulse_filter|length}}; const size_t i = blockIdx.x * blockDim.x + threadIdx.x; const size_t j = blockIdx.y * blockDim.y + threadIdx.y; if (i >= X || j >= Y) { return; } const size_t preview_stride_1 = 1; const size_t preview_stride_0 = Y * preview_stride_1; const size_t preview_index = i * preview_stride_0 + j * preview_stride_1; const size_t data_stride_2 = 1; const size_t data_stride_1 = filtered_memory_cells * data_stride_2; const size_t data_stride_0 = Y * data_stride_1; float sum = 0; for (int k=0; k<filtered_memory_cells; ++k) { const size_t data_index = i * data_stride_0 + j * data_stride_1 + k * data_stride_2; sum += (float)data[data_index]; } if (preview_stat == -3) { // just sum preview[preview_index] = sum; } else if (preview_stat == -2) { // mean preview[preview_index] = sum / filtered_memory_cells; } else if (preview_stat == -4) { // standard deviation const double mean = sum / filtered_memory_cells; // try to reduce error by increasing precision on accumulator double var = 0; for (int k=0; k<filtered_memory_cells; ++k) { const size_t data_index = i * data_stride_0 + j * data_stride_1 + k * data_stride_2; // but "compute" values the same (floats) var += pow((double)data[data_index] - mean, 2); } var /= filtered_memory_cells; preview[preview_index] = (float)sqrt(var); } } {% endfor %} // used to find max integrated intensity frame __global__ void sum_frames({{output_data_dtype}}* data, float* sums) { const size_t X = {{pixels_x}}; const size_t Y = {{pixels_y}}; const size_t filtered_memory_cells = {{pulse_filter|length}}; const size_t memory_cell = blockIdx.z * blockDim.z + threadIdx.z; if (memory_cell >= filtered_memory_cells) { return; } const size_t data_stride_2 = 1; const size_t data_stride_1 = filtered_memory_cells * data_stride_2; const size_t data_stride_0 = Y * data_stride_1; float my_res = 0; for (int i=0; i<X; ++i) { for (int j=0; j<Y; ++j) { const size_t data_index = i * data_stride_0 + j * data_stride_1 + memory_cell * data_stride_2; const float raw = (float)data[data_index]; my_res += raw; } } sums[memory_cell] = my_res; } }