From 4b6ce2cec453b49a4625c9a300a26173d2f1cced Mon Sep 17 00:00:00 2001
From: David Hammer <dhammer@mailbox.org>
Date: Mon, 16 Aug 2021 16:35:21 +0200
Subject: [PATCH] Hotfixes applied during beamtime at SQS

---
 .../combined_correction_dssc.py               | 147 ++++++++++-----
 src/DsscCorrection/cuda_pipeline.py           | 173 ++++++++++--------
 src/DsscCorrection/gpu-dssc-correct.cpp       |  31 ++--
 3 files changed, 223 insertions(+), 128 deletions(-)

diff --git a/src/DsscCorrection/combined_correction_dssc.py b/src/DsscCorrection/combined_correction_dssc.py
index 429798a7..a0452368 100644
--- a/src/DsscCorrection/combined_correction_dssc.py
+++ b/src/DsscCorrection/combined_correction_dssc.py
@@ -24,6 +24,7 @@ class DsscCombinedCorrection(calibrationBase.CalibrationReceiverBaseDevice):
         "applyCorrection",
         "doAnything",
         "dataFormat.memoryCells",
+        "dataFormat.memoryCellsCorrection",
         "dataFormat.pixelsX",
         "dataFormat.pixelsY",
         "preview.enable",
@@ -131,6 +132,15 @@ class DsscCombinedCorrection(calibrationBase.CalibrationReceiverBaseDevice):
         ).description(
             "Full number of memory cells in incoming data"
         ).assignmentMandatory().commit()
+        bound.UINT32_ELEMENT(expected).key(
+            "dataFormat.memoryCellsCorrection"
+        ).displayedName("(Debug) Memory cells in correction map").description(
+            "Full number of memory cells in currently loaded correction map. "
+            "May exceed memory cell number in input if veto is on. "
+            "This value just displayed for debugging."
+        ).readOnly().initialValue(
+            0
+        ).commit()
         bound.VECTOR_UINT32_ELEMENT(expected).key(
             "dataFormat.inputDataShape"
         ).displayedName("Input data shape").description(
@@ -186,9 +196,6 @@ class DsscCombinedCorrection(calibrationBase.CalibrationReceiverBaseDevice):
             "-4: stdev\n"
             "Max means selecting the pulse with the maximum integrated value. "
             "The others are computed across all filtered pulses in the train."
-            "Note that index slicing (â‰¥ 0 case) currently does not take "
-            "image.pulseId into account, so certain pulse filters may yield "
-            "unexpected preview pulse shown."
         ).assignmentOptional().defaultValue(
             0
         ).reconfigurable().commit()
@@ -318,6 +325,7 @@ class DsscCombinedCorrection(calibrationBase.CalibrationReceiverBaseDevice):
 
         self.input_data_dtype = getattr(np, config.get("dataFormat.inputImageDtype"))
         self.output_data_dtype = getattr(np, config.get("dataFormat.outputImageDtype"))
+        self._offset_map = None
         self._update_pulse_filter(config.get("pulseFilter"))
         self._update_shapes(
             config.get("dataFormat.pixelsX"),
@@ -373,6 +381,7 @@ class DsscCombinedCorrection(calibrationBase.CalibrationReceiverBaseDevice):
                 # apply new pulse filter
                 self._update_pulse_filter(config.get("pulseFilter"))
                 # but existing shapes (not reconfigurable)
+                # TODO: avoid double compilation here if constants are loaded
                 self._update_shapes(
                     self.get("dataFormat.pixelsX"),
                     self.get("dataFormat.pixelsY"),
@@ -425,10 +434,10 @@ class DsscCombinedCorrection(calibrationBase.CalibrationReceiverBaseDevice):
         train_id = metadata.getAttribute("timestamp", "tid")
         cell_table = np.squeeze(data.get("image.cellId"))
         assert isinstance(cell_table, np.ndarray), "image.cellId should be ndarray"
-        if not len(cell_table.shape) == 1:
-            self.set(
-                "status", f"Failed to process, cell table had shape {cell_table.shape}"
-            )
+        if len(cell_table.shape) == 0:
+            msg = "cellId had 0 dimensions. DAQ may not be sending data."
+            self.set("status", msg)
+            self.log.WARN(msg)
             return
         # original shape: 400, 1, 128, 512 (memory cells, something, y, x)
         # TODO: consider making paths configurable
@@ -440,6 +449,7 @@ class DsscCombinedCorrection(calibrationBase.CalibrationReceiverBaseDevice):
             # TODO: truncate if > 800
             self.set("dataFormat.memoryCells", image_data.shape[0])
             with self._buffer_lock:
+                self._update_pulse_filter(self.get("pulseFilter"))
                 self._update_shapes(
                     self.get("dataFormat.pixelsX"),
                     self.get("dataFormat.pixelsY"),
@@ -452,6 +462,7 @@ class DsscCombinedCorrection(calibrationBase.CalibrationReceiverBaseDevice):
         do_generate_preview = train_id % self.get(
             "preview.trainIdModulo"
         ) == 0 and self.get("preview.enable")
+        do_apply_correction = self.get("applyCorrection")
 
         if not self.get("state") is State.PROCESSING:
             self.updateState(State.PROCESSING)
@@ -468,6 +479,23 @@ class DsscCombinedCorrection(calibrationBase.CalibrationReceiverBaseDevice):
             cell_table = cell_table[self.pulse_filter]
             pulse_table = np.squeeze(data.get("image.pulseId"))[self.pulse_filter]
 
+            cell_table_max = np.max(cell_table)
+            correction_cell_num = self.get("dataFormat.memoryCellsCorrection")
+            if do_apply_correction:
+                if correction_cell_num == 0:
+                    msg = "No constant loaded, correction will not be applied."
+                    self.log.WARN(msg)
+                    self.set("status", msg)
+                    do_apply_correction = False
+                elif cell_table_max >= correction_cell_num:
+                    msg = (
+                        f"Max cell ID ({cell_table_max}) exceeds range for loaded "
+                        f"constant (has {correction_cell_num} cells). Some frames "
+                        "will not be corrected."
+                    )
+                    self.log.WARN(msg)
+                    self.set("status", msg)
+
             with gpu_utils.GPUContextContext(self.gpu_context):
                 self.gpu_buffer_cell_table.set(cell_table)
                 self.gpu_buffer_input_image_data.set(image_data)
@@ -475,7 +503,7 @@ class DsscCombinedCorrection(calibrationBase.CalibrationReceiverBaseDevice):
                     self.gpu_buffer_input_image_data,
                     self.gpu_buffer_reshaped_image_data,
                 )
-                if self.get("applyCorrection"):
+                if do_apply_correction:
                     buffer_handle, result = self.pipeline.correct(
                         self.gpu_buffer_reshaped_image_data,
                         self.gpu_buffer_cell_table,
@@ -485,9 +513,27 @@ class DsscCombinedCorrection(calibrationBase.CalibrationReceiverBaseDevice):
                         self.gpu_buffer_reshaped_image_data
                     )
                 if do_generate_preview:
+                    preview_slice_index = self.get("preview.pulse")
+                    if preview_slice_index >= 0:
+                        # look at pulse_table to find which index this pulse ID is in
+                        pulse_id_found = np.where(pulse_table == preview_slice_index)[0]
+                        if len(pulse_id_found) == 0:
+                            pulse_found_instead = pulse_table[0]
+                            msg = (
+                                f"Pulse {preview_slice_index} not found in "
+                                f"image.pulseId, arbitrary pulse "
+                                f"{pulse_found_instead} will be shown."
+                            )
+                            preview_slice_index = 0
+                            self.log.WARN(msg)
+                            self.set("status", msg)
+                        else:
+                            preview_slice_index = pulse_id_found[0]
                     preview_raw, preview_corrected = self.pipeline.compute_preview(
                         self.gpu_buffer_reshaped_image_data,
-                        self.get("preview.pulse"),
+                        preview_slice_index,
+                        reuse_corrected=do_apply_correction,
+                        cell_table=self.gpu_buffer_cell_table,
                     )
 
         data.set("image.data", buffer_handle)
@@ -579,6 +625,44 @@ class DsscCombinedCorrection(calibrationBase.CalibrationReceiverBaseDevice):
 
         """
 
+        offset_map = self.getConstant("Offset")
+        input_memory_cells = self.get("dataFormat.memoryCells")
+        if offset_map is None:
+            msg = (
+                "Warning: Did not find offset constant, offset correction "
+                "will not be applied"
+            )
+            self.set("status", msg)
+            self.log.WARN(msg)
+            self._offset_map = None
+        elif len(offset_map.shape) not in (3, 4):
+            msg = (
+                f"Offset map had unexpected shape {offset_map.shape}, "
+                "offset correction will not be applied"
+            )
+            self.set("status", msg)
+            self.log.WARN(msg)
+        else:
+            self.log.INFO(f"Offset map loaded has shape {offset_map.shape}")
+            if len(offset_map.shape) == 4:  # old format (see offsetcorrection_dssc.py)?
+                offset_map = offset_map[..., 0]
+            constant_memory_cells = offset_map.shape[-1]
+            if input_memory_cells > constant_memory_cells:
+                msg = (
+                    f"Warning: Memory cells in input {input_memory_cells} > "
+                    f"memory cells in constant {constant_memory_cells}, some "
+                    "frames may not get correction applied."
+                )
+                self.set("status", msg)
+                self.log.WARN(msg)
+            self._offset_map = offset_map.astype(np.float32)
+            msg = f"Offset map with shape {self._offset_map.shape} ready to load to GPU"
+            self.set("status", msg)
+            self.log.INFO(msg)
+            if constant_memory_cells != self.get("dataFormat.memoryCellsCorrection"):
+                self.log.INFO("Will first have to update buffers on GPU")
+                self.set("dataFormat.memoryCellsCorrection", constant_memory_cells)
+
         self._update_maps_on_gpu()
 
     def registerManager(self, instance_id):
@@ -692,37 +776,12 @@ class DsscCombinedCorrection(calibrationBase.CalibrationReceiverBaseDevice):
 
         self.set("status", "Updating constants on GPU using known constants")
         self.updateState(State.CHANGING)
-
-        offset_map = self.getConstant("Offset")
-        memory_cells = self.get("dataFormat.memoryCells")
-        if offset_map is None:
-            msg = f"Warning: Did not find offset constant, offset correction will not be applied"
+        if self._offset_map is not None:
+            with gpu_utils.GPUContextContext(self.gpu_context):
+                self.pipeline.load_constants(self._offset_map)
+            msg = "Done transferring known constant(s) to GPU"
+            self.log.INFO(msg)
             self.set("status", msg)
-            self.log.WARN(msg)
-        else:
-            if len(offset_map.shape) in (3, 4):
-                self.log.INFO(f"Offset map known has shape {offset_map.shape}")
-                # this is from offsetcorrection_dssc.py
-                if len(offset_map.shape) == 4:  # old format?
-                    offset_map = np.squeeze(offset_map[..., 0])
-                constant_memory_cells = offset_map.shape[-1]
-                if memory_cells > constant_memory_cells:
-                    msg = (
-                        f"Warning: Memory cells in input ({memory_cells}) exceeded memory cells in constant ({constant_memory_cells}), offset correction will not be applied",
-                    )
-                    self.set("status", msg)
-                    self.log.WARN(msg)
-                else:
-                    offset_map = offset_map[..., :memory_cells, :].astype(np.float32)
-                    with gpu_utils.GPUContextContext(self.gpu_context):
-                        self.pipeline.offset_map.set(offset_map)
-                    msg = "Done transferring known constant(s) to GPU"
-                    self.set("status", msg)
-                    self.log.INFO(msg)
-            else:
-                msg = f"Offset map had unexpected shape {offset_map.shape}, offset correction will not be applied"
-                self.set("status", msg)
-                self.log.WARN(msg)
 
         self.updateState(State.ON)
 
@@ -736,11 +795,13 @@ class DsscCombinedCorrection(calibrationBase.CalibrationReceiverBaseDevice):
             self._rate_update_timer.delay()
             return
         self._buffered_status_update.set("performance.rate", self._rate_tracker.rate())
-        theoretical_rate = 1000 / self._buffered_status_update.get(
+        last_processing = self._buffered_status_update.get(
             "performance.lastProcessingDuration"
         )
-        self._buffered_status_update.set(
-            "performance.theoreticalRate", theoretical_rate
-        )
+        if last_processing > 0:
+            theoretical_rate = 1000 / last_processing
+            self._buffered_status_update.set(
+                "performance.theoreticalRate", theoretical_rate
+            )
         self.set(self._buffered_status_update)
         self._rate_update_timer.delay()
diff --git a/src/DsscCorrection/cuda_pipeline.py b/src/DsscCorrection/cuda_pipeline.py
index f4a4e533..82fb4da8 100644
--- a/src/DsscCorrection/cuda_pipeline.py
+++ b/src/DsscCorrection/cuda_pipeline.py
@@ -34,43 +34,18 @@ class PyCudaPipeline:
         self.pixels_x = pixels_x
         self.pixels_y = pixels_y
         self.memory_cells = memory_cells
+        self.constant_memory_cells = 0
         self.pulse_filter = pulse_filter
         self.output_shape = (self.pixels_x, self.pixels_y, self.pulse_filter.size)
-        self.map_shape = (self.pixels_x, self.pixels_y, self.memory_cells)
+        self.map_shape = (self.pixels_x, self.pixels_y, self.constant_memory_cells)
         # preview will only be single memory cell
         self.preview_shape = self.output_shape[:-1]
+        self.input_data_dtype = input_data_dtype
+        self.output_data_dtype = output_data_dtype
 
-        kernel_source = self._kernel_template.render(
-            {
-                "pixels_x": self.pixels_x,
-                "pixels_y": self.pixels_y,
-                "memory_cells": self.memory_cells,
-                "input_data_dtype": utils.numpy_dtype_to_c_type_str[input_data_dtype],
-                "output_data_dtype": utils.numpy_dtype_to_c_type_str[output_data_dtype],
-                "pulse_filter": pulse_filter,
-            }
-        )
-        self.source_module = pycuda.compiler.SourceModule(
-            kernel_source, no_extern_c=True
-        )
-        self.reshaping_kernel = self.source_module.get_function("reshape_4_3")
-        self.correction_kernel = self.source_module.get_function("correct")
-        self.casting_kernel = self.source_module.get_function("only_cast")
-        self.preview_slice_raw_kernel = self.source_module.get_function(
-            "cell_slice_preview_raw"
-        )
-        self.preview_slice_corrected_kernel = self.source_module.get_function(
-            "cell_slice_preview_corrected"
-        )
-        self.preview_stat_raw_kernel = self.source_module.get_function(
-            "cell_stat_preview_raw"
-        )
-        self.preview_stat_corrected_kernel = self.source_module.get_function(
-            "cell_stat_preview_corrected"
-        )
-        self.frame_sum_kernel = self.source_module.get_function("sum_frames")
+        self._init_kernels()
 
-        self.offset_map = pycuda.gpuarray.zeros(self.map_shape, dtype=np.float32)
+        self.offset_map = pycuda.gpuarray.empty(self.map_shape, dtype=np.float32)
 
         # reuse output arrays
         self.gpu_result = pycuda.gpuarray.empty(
@@ -105,6 +80,51 @@ class PyCudaPipeline:
 
         self.update_block_size(full_block=(1, 1, 64), preview_block=(1, 64, 1))
 
+    def load_constants(self, offset_map_host):
+        constant_memory_cells = offset_map_host.shape[-1]
+        if constant_memory_cells != self.constant_memory_cells:
+            self.constant_memory_cells = constant_memory_cells
+            self.map_shape = (self.pixels_x, self.pixels_y, self.constant_memory_cells)
+            self.offset_map = pycuda.gpuarray.empty(self.map_shape, dtype=np.float32)
+            self._init_kernels()
+        self.offset_map.set(offset_map_host)
+
+    def _init_kernels(self):
+        kernel_source = self._kernel_template.render(
+            {
+                "pixels_x": self.pixels_x,
+                "pixels_y": self.pixels_y,
+                "memory_cells": self.memory_cells,
+                "constant_memory_cells": self.constant_memory_cells,
+                "input_data_dtype": utils.numpy_dtype_to_c_type_str[
+                    self.input_data_dtype
+                ],
+                "output_data_dtype": utils.numpy_dtype_to_c_type_str[
+                    self.output_data_dtype
+                ],
+                "pulse_filter": self.pulse_filter,
+            }
+        )
+        self.source_module = pycuda.compiler.SourceModule(
+            kernel_source, no_extern_c=True
+        )
+        self.reshaping_kernel = self.source_module.get_function("reshape_4_3")
+        self.correction_kernel = self.source_module.get_function("correct")
+        self.casting_kernel = self.source_module.get_function("only_cast")
+        self.preview_slice_raw_kernel = self.source_module.get_function(
+            "cell_slice_preview_raw"
+        )
+        self.preview_slice_corrected_kernel = self.source_module.get_function(
+            "cell_slice_preview_corrected"
+        )
+        self.preview_stat_raw_kernel = self.source_module.get_function(
+            "cell_stat_preview_raw"
+        )
+        self.preview_stat_corrected_kernel = self.source_module.get_function(
+            "cell_stat_preview_corrected"
+        )
+        self.frame_sum_kernel = self.source_module.get_function("sum_frames")
+
     def update_block_size(self, full_block=None, preview_block=None):
         """Execution is scheduled with 3d "blocks" of CUDA threads, tuning can
         affect performance
@@ -161,7 +181,6 @@ class PyCudaPipeline:
         Will return string encoded handle to shared memory output buffer and
         (view of) said buffer as an ndarray.  Keep in mind that the output
         buffers will get overwritten eventually (circular buffer).
-
         """
         self.correction_kernel(
             data,
@@ -201,17 +220,20 @@ class PyCudaPipeline:
         return handle, output_buffer
 
     def compute_preview(
-        self, raw_data, cell_to_preview, has_just_corrected=True, verify=False
+        self,
+        raw_data,
+        preview_index,
+        reuse_corrected=True,
+        cell_table=None,
     ):
         """Generate single slice or reduction preview of raw and corrected data
 
-        Special values of cell_to_preview are -1 for max, -2 for mean, -3 for
+        Special values of preview_index are -1 for max, -2 for mean, -3 for
         sum, and -4 for stdev (across cells).
 
-        Note that cell_to_preview is taken from data without checking cell
-        table, so if a pulse filter not contiguous from 0 has been applied
-        first, the resulting cell will be offset.  Cell table is only used to
-        get the correct slice of the correction map.
+        Note that preview_index is taken from data without checking cell table.
+        Caller has to figure out which index along memory cell dimension they
+        actually want to preview.
 
         raw_data should be a gpuarray
 
@@ -220,35 +242,57 @@ class PyCudaPipeline:
 
         """
 
-        if cell_to_preview < -4:
-            raise ValueError(f"No statistic with code {cell_to_preview} defined")
-        elif cell_to_preview >= self.memory_cells:
-            raise ValueError(f"Memory cell index {cell_to_preview} out of range")
+        if preview_index < -4:
+            raise ValueError(f"No statistic with code {preview_index} defined")
+        elif preview_index >= self.memory_cells:
+            raise ValueError(f"Memory cell index {preview_index} out of range")
+
+        if not reuse_corrected:
+            # if we didn't already correct, need to do so to get corrected data in buffer
+            if self.offset_map.size == 0 or cell_table is None:
+                self.casting_kernel(
+                    raw_data,
+                    self.gpu_result,
+                    block=self.full_block,
+                    grid=self.full_grid,
+                )
+                if self.offset_map.size == 0:
+                    print(
+                        "Warning: no offset map loaded, corrected preview "
+                        "will be not actually have correction applied."
+                    )
+                if cell_table is None:
+                    print(
+                        "Warning: missing parameter cell_table for applying "
+                        "correction for preview."
+                    )
+            else:
+                self.correction_kernel(
+                    raw_data,
+                    cell_table,
+                    self.offset_map,
+                    self.gpu_result,
+                    block=self.full_block,
+                    grid=self.full_grid,
+                )
 
-        # TODO: lift this restriction.
-        assert has_just_corrected
         # TODO: enum around reduction type
-        if cell_to_preview >= 0:
+        if preview_index >= 0:
             self.preview_slice_raw_kernel(
                 raw_data,
-                np.int16(cell_to_preview),
+                np.int16(preview_index),
                 self.gpu_preview_raw,
                 block=self.preview_block,
                 grid=self.preview_grid,
             )
             self.preview_slice_corrected_kernel(
                 self.gpu_result,
-                np.int16(cell_to_preview),
+                np.int16(preview_index),
                 self.gpu_preview_corrected,
                 block=self.preview_block,
                 grid=self.preview_grid,
             )
-            if verify:
-                assert np.allclose(
-                    self.gpu_preview_raw.get(),
-                    raw_data.get()[..., cell_to_preview],
-                )
-        elif cell_to_preview == -1:
+        elif preview_index == -1:
             # TODO: select argmax independently for raw and corrected?
             # TODO: send frame sums somewhere to compute global max frame
             self.frame_sum_kernel(
@@ -272,38 +316,21 @@ class PyCudaPipeline:
                 block=self.preview_block,
                 grid=self.preview_grid,
             )
-            if verify:
-                assert np.allclose(
-                    self.gpu_preview_raw.get(),
-                    raw_data.get()[
-                        ...,
-                        np.argmax(
-                            np.sum(raw_data.get(), axis=(0, 1), dtype=np.float32)
-                        ),
-                    ],
-                )
-        elif cell_to_preview in (-2, -3, -4):
+        elif preview_index in (-2, -3, -4):
             self.preview_stat_raw_kernel(
                 raw_data,  # this is input_data_dtype
-                np.int16(cell_to_preview),
+                np.int16(preview_index),
                 self.gpu_preview_raw,
                 block=self.preview_block,
                 grid=self.preview_grid,
             )
             self.preview_stat_corrected_kernel(
                 self.gpu_result,  # this is output_data_dtype
-                np.int16(cell_to_preview),
+                np.int16(preview_index),
                 self.gpu_preview_corrected,
                 block=self.preview_block,
                 grid=self.preview_grid,
             )
-            if verify:
-                assert np.allclose(
-                    self.gpu_preview_raw.get(),
-                    {-2: np.mean, -3: np.sum, -4: np.std}[cell_to_preview](
-                        raw_data.get(), axis=2
-                    ),
-                )
         self.gpu_preview_raw.get(ary=self.preview_raw)
         self.gpu_preview_corrected.get(ary=self.preview_corrected)
         return self.preview_raw, self.preview_corrected
diff --git a/src/DsscCorrection/gpu-dssc-correct.cpp b/src/DsscCorrection/gpu-dssc-correct.cpp
index a86d9707..d40c4ddf 100644
--- a/src/DsscCorrection/gpu-dssc-correct.cpp
+++ b/src/DsscCorrection/gpu-dssc-correct.cpp
@@ -55,8 +55,8 @@ extern "C" {
 		const size_t Y = {{pixels_y}};
 		// reshaped and output data have pulse filter length memory cells dim
 		const size_t filtered_memory_cells = {{pulse_filter|length}};
-		// but correction map has "full size"
-		const size_t full_memory_cells = {{memory_cells}};
+		// but correction map has some number which may even exceed input data's (due to veto pattern)
+		const size_t map_memory_cells = {{constant_memory_cells}};
 
 		const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
 		const size_t j = blockIdx.y * blockDim.y + threadIdx.y;
@@ -71,20 +71,27 @@ extern "C" {
 		const size_t data_stride_1 = filtered_memory_cells * data_stride_2;
 		const size_t data_stride_0 = Y * data_stride_1;
 		const size_t data_index = i * data_stride_0 + j * data_stride_1 + k * data_stride_2;
+		const float raw = (float)data[data_index];
 
 		const size_t map_stride_2 = 1;
-		const size_t map_stride_1 = full_memory_cells * map_stride_2;
+		const size_t map_stride_1 = map_memory_cells * map_stride_2;
 		const size_t map_stride_0 = Y * map_stride_1;
 		const size_t map_cell = cell_table[k];
-		const size_t map_index = i * map_stride_0 + j * map_stride_1 + map_cell * map_stride_2;
-
-		const float raw = (float)data[data_index];
-		const float corrected = raw - offset_map[map_index];
-		{% if output_data_dtype == "half" %}
-		output[data_index] = __float2half(corrected);
-		{% else %}
-		output[data_index] = ({{output_data_dtype}})corrected;
-		{% endif %}
+		if (map_cell < map_memory_cells) {
+			const size_t map_index = i * map_stride_0 + j * map_stride_1 + map_cell * map_stride_2;
+			const float corrected = raw - offset_map[map_index];
+			{% if output_data_dtype == "half" %}
+			output[data_index] = __float2half(corrected);
+			{% else %}
+			output[data_index] = ({{output_data_dtype}})corrected;
+			{% endif %}
+		} else {
+			{% if output_data_dtype == "half" %}
+			output[data_index] = __float2half(raw);
+			{% else %}
+			output[data_index] = ({{output_data_dtype}})raw;
+			{% endif %}
+		}
 	}
 
 	/*
-- 
GitLab