diff --git a/src/calng/kernels/peakfinder9_gpu.cu b/src/calng/kernels/peakfinder9_gpu.cu index 7d172c9eacfc49e5b333e7f6306e9b5a7b948c51..a07bdfb3c27140dd9c807cc6836abdd1480f8932 100644 --- a/src/calng/kernels/peakfinder9_gpu.cu +++ b/src/calng/kernels/peakfinder9_gpu.cu @@ -4,15 +4,15 @@ class MaskedImageFrame { public: const float* data_start; - const unsigned short num_rows; - const unsigned short num_cols; - __device__ MaskedImageFrame(const float* data_start, const unsigned short num_rows, const unsigned short num_cols): - data_start(data_start), num_rows(num_rows), num_cols(num_cols) {} + const unsigned short ss_dim; + const unsigned short fs_dim; + __device__ MaskedImageFrame(const float* data_start, const unsigned short ss_dim, const unsigned short fs_dim): + data_start(data_start), ss_dim(ss_dim), fs_dim(fs_dim) {} __device__ bool is_masked(const unsigned short i, const unsigned short j) { - return isnan(data_start[i * num_cols + j]); + return isnan(data_start[i * fs_dim + j]); } __device__ float get(const unsigned short i, const unsigned short j) { - return data_start[i * num_cols + j]; + return data_start[i * fs_dim + j]; } __device__ float get(const unsigned short i, const unsigned short j, const float fallback) { if (is_masked(i, j)) { @@ -56,8 +56,8 @@ public: }; extern "C" __global__ void pf9(const unsigned short num_frames, - const unsigned short num_rows, - const unsigned short num_cols, + const unsigned short ss_dim, + const unsigned short fs_dim, const unsigned short window_radius, const float min_peak_over_border, const float min_snr_biggest_pixel, @@ -73,37 +73,37 @@ extern "C" __global__ void pf9(const unsigned short num_frames, // execution model: one thread handles one pixel in one frame const unsigned short frame = blockDim.x * blockIdx.x + threadIdx.x; - const unsigned short row = blockDim.y * blockIdx.y + threadIdx.y; - const unsigned short col = blockDim.z * blockIdx.z + threadIdx.z; + const unsigned short ss_index = blockDim.y * blockIdx.y + threadIdx.y; + const unsigned short fs_index = blockDim.z * blockIdx.z + threadIdx.z; if (frame >= num_frames || - row < window_radius || row >= num_rows - window_radius || - col < window_radius || col >= num_cols - window_radius) { + ss_index < window_radius || ss_index >= ss_dim - window_radius || + fs_index < window_radius || fs_index >= fs_dim - window_radius) { return; } // wrap thin helper class around for convenience - MaskedImageFrame masked_frame(image + frame * (num_rows * num_cols), - num_rows, - num_cols); + MaskedImageFrame masked_frame(image + frame * (ss_dim * fs_dim), + ss_dim, + fs_dim); // candidate should not be masked - if (masked_frame.is_masked(row, col)) { + if (masked_frame.is_masked(ss_index, fs_index)) { return; } - float pixel_val = masked_frame.get(row, col); + float pixel_val = masked_frame.get(ss_index, fs_index); // candidate should be greater than immediate neighbors - // with tie breaking: in case of equality, lowest row, col is candidate - if (pixel_val <= masked_frame.get(row-1, col-1, -INFINITY) || - pixel_val <= masked_frame.get(row-1, col, -INFINITY) || - pixel_val <= masked_frame.get(row-1, col+1, -INFINITY) || - pixel_val <= masked_frame.get(row, col-1, -INFINITY) || - pixel_val < masked_frame.get(row, col+1, -INFINITY) || - pixel_val < masked_frame.get(row+1, col, -INFINITY) || - pixel_val < masked_frame.get(row+1, col, -INFINITY) || - pixel_val < masked_frame.get(row+1, col, -INFINITY)) { + // with tie breaking: in case of equality, lowest row, fs_index is candidate + if (pixel_val <= masked_frame.get(ss_index-1, fs_index-1, -INFINITY) || + pixel_val <= masked_frame.get(ss_index-1, fs_index, -INFINITY) || + pixel_val <= masked_frame.get(ss_index-1, fs_index+1, -INFINITY) || + pixel_val <= masked_frame.get(ss_index, fs_index-1, -INFINITY) || + pixel_val < masked_frame.get(ss_index, fs_index+1, -INFINITY) || + pixel_val < masked_frame.get(ss_index+1, fs_index, -INFINITY) || + pixel_val < masked_frame.get(ss_index+1, fs_index, -INFINITY) || + pixel_val < masked_frame.get(ss_index+1, fs_index, -INFINITY)) { return; } @@ -112,7 +112,7 @@ extern "C" __global__ void pf9(const unsigned short num_frames, // (full border for window radius 2, less for higher) { float border_max = -INFINITY; - masked_frame.fun_ring(row, col, window_radius, [&] (unsigned short, unsigned short, float val) { + masked_frame.fun_ring(ss_index, fs_index, window_radius, [&] (unsigned short, unsigned short, float val) { border_max = max(border_max, val); }); if (pixel_val - min_peak_over_border <= border_max) { @@ -127,7 +127,7 @@ extern "C" __global__ void pf9(const unsigned short num_frames, // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm unsigned short count = 0; float M2 = 0; - masked_frame.fun_ring(row, col, window_radius, [&] (unsigned short, unsigned short, float val) { + masked_frame.fun_ring(ss_index, fs_index, window_radius, [&] (unsigned short, unsigned short, float val) { ++count; float delta = val - mean; mean += delta / static_cast<float>(count); @@ -147,44 +147,47 @@ extern "C" __global__ void pf9(const unsigned short num_frames, } // whole peak should have sufficent SNR - float peak_weighted_row; - float peak_weighted_col; - float peak_total_mass = pixel_val - mean; + float peak_weighted_ss; + float peak_weighted_fs; + float peak_total_mass = 0; { /* TODO: more compact form */ - float peak_weighted_row_nom = static_cast<float>(row) * pixel_val; - float peak_weighted_col_nom = static_cast<float>(col) * pixel_val; - const float peak_pixel_threshold = mean + min_snr_peak_pixels * sigma; - for (unsigned short layer=1; layer<=window_radius; ++layer) { - float total_mass_before = peak_total_mass; - masked_frame.fun_ring(row, col, layer, [&] (unsigned short i, unsigned short j, float val) { - if (val > peak_pixel_threshold) { - float val_over_mean = val - mean; - peak_total_mass += val_over_mean; - peak_weighted_row_nom += val_over_mean * static_cast<float>(i); - peak_weighted_col_nom += val_over_mean * static_cast<float>(j); - } - }); - // in case nothing was added, stop expanding - if (peak_total_mass == total_mass_before) { - break; - } - } + float peak_weighted_ss_nom = 0; + float peak_weighted_fs_nom = 0; + // expand peak in rings + { + const float peak_pixel_threshold = mean + min_snr_peak_pixels * sigma; + for (unsigned short layer=1; layer<=window_radius; ++layer) { + float total_mass_before = peak_total_mass; + masked_frame.fun_ring(ss_index, fs_index, layer, [&] (unsigned short i, unsigned short j, float val) { + if (val > peak_pixel_threshold) { + float val_over_mean = val - mean; + peak_total_mass += val_over_mean; + peak_weighted_ss_nom += val_over_mean * static_cast<float>(i); + peak_weighted_fs_nom += val_over_mean * static_cast<float>(j); + } + }); + // in case nothing was added, stop expanding + if (peak_total_mass == total_mass_before) { + break; + } + } + } if (peak_total_mass <= mean + min_snr_whole_peak * sigma) { return; } if (peak_total_mass == 0) { - float peak_weighted_row = static_cast<float>(row); - float peak_weighted_col = static_cast<float>(col); + peak_weighted_ss = static_cast<float>(ss_index); + peak_weighted_fs = static_cast<float>(fs_index); } else { - peak_weighted_row = peak_weighted_row_nom / peak_total_mass; - peak_weighted_col = peak_weighted_col_nom / peak_total_mass; + peak_weighted_ss = peak_weighted_ss_nom / peak_total_mass; + peak_weighted_fs = peak_weighted_fs_nom / peak_total_mass; } } unsigned int output_index = atomicInc(output_counts + frame, max_peaks); unsigned int output_pos = frame * max_peaks + output_index; - output_x[output_pos] = peak_weighted_row; - output_y[output_pos] = peak_weighted_col; + output_x[output_pos] = peak_weighted_ss; + output_y[output_pos] = peak_weighted_fs; output_intensity[output_pos] = peak_total_mass; }