From 876fbbe82c9fe5cbc67237253616e78cd86a74b3 Mon Sep 17 00:00:00 2001
From: David Hammer <dhammer@mailbox.org>
Date: Thu, 2 Jun 2022 15:31:20 +0200
Subject: [PATCH] GH/JF: Decrease default shmem buffer size

---
 src/calng/Gotthard2Correction.py              |   5 +
 src/calng/JungfrauCorrection.py               |   6 +
 .../{base_gpu.py => base_kernel_runner.py}    | 175 ++++++++++--------
 3 files changed, 109 insertions(+), 77 deletions(-)
 rename src/calng/{base_gpu.py => base_kernel_runner.py} (90%)

diff --git a/src/calng/Gotthard2Correction.py b/src/calng/Gotthard2Correction.py
index d5ac7722..39c6c46e 100644
--- a/src/calng/Gotthard2Correction.py
+++ b/src/calng/Gotthard2Correction.py
@@ -184,6 +184,11 @@ class Gotthard2CalcatFriend(base_calcat.BaseCalcatFriend):
             .key(f"{param_prefix}.memoryCells")
             .setNewDefaultValue(2)
             .commit(),
+
+            OVERWRITE_ELEMENT(expected)
+            .key("outputShmemBufferSize")
+            .setNewDefaultValue(2)
+            .commit(),
         )
 
         base_calcat.add_status_schema_from_enum(
diff --git a/src/calng/JungfrauCorrection.py b/src/calng/JungfrauCorrection.py
index 7be0e149..553f86db 100644
--- a/src/calng/JungfrauCorrection.py
+++ b/src/calng/JungfrauCorrection.py
@@ -172,6 +172,12 @@ class JungfrauCalcatFriend(base_calcat.BaseCalcatFriend):
             .key(f"{param_prefix}.biasVoltage")
             .setNewDefaultValue(90)
             .commit(),
+
+            # JUNGFRAU data is small: a smaller buffer still fits plenty of trains
+            OVERWRITE_ELEMENT(expected)
+            .key("outputShmemBufferSize")
+            .setNewDefaultValue(2)
+            .commit(),
         )
 
         # add extra parameters
diff --git a/src/calng/base_gpu.py b/src/calng/base_kernel_runner.py
similarity index 90%
rename from src/calng/base_gpu.py
rename to src/calng/base_kernel_runner.py
index 333eb053..38127f06 100644
--- a/src/calng/base_gpu.py
+++ b/src/calng/base_kernel_runner.py
@@ -7,32 +7,7 @@ import numpy as np
 from . import utils
 
 
-class BaseGpuRunner:
-    """Class to handle GPU buffers and execution of CUDA kernels on image data
-
-    All GPU buffers are kept within this class and it is intentionally very stateful.
-    This generally means that you will want to load data into it and then do something.
-    Typical usage in correct order:
-
-    1. instantiate
-    2. load constants
-    3. load_data
-    4. load_cell_table
-    5. correct
-    6a. reshape (only here does data transfer back to host)
-    6b. compute_preview (optional)
-
-    repeat from 2. or 3.
-
-    In case no constants are available / correction is not desired, can skip 3 and 4 and
-    pass CorrectionFlags.NONE to correct(...). Generally, user must handle which
-    correction steps are appropriate given the constants loaded so far.
-    """
-
-    # These must be set by subclass
-    _kernel_source_filename = None
-    _corrected_axis_order = None
-
+class BaseKernelRunner:
     def __init__(
         self,
         pixels_x,
@@ -42,11 +17,6 @@ class BaseGpuRunner:
         input_data_dtype=np.uint16,
         output_data_dtype=np.float32,
     ):
-        _src_dir = pathlib.Path(__file__).absolute().parent
-        # subclass must define _kernel_source_filename
-        with (_src_dir / "kernels" / self._kernel_source_filename).open("r") as fd:
-            self._kernel_template = jinja2.Template(fd.read())
-
         self.pixels_x = pixels_x
         self.pixels_y = pixels_y
         self.memory_cells = memory_cells
@@ -60,41 +30,20 @@ class BaseGpuRunner:
         self.input_data_dtype = input_data_dtype
         self.output_data_dtype = output_data_dtype
 
-        self._init_kernels()
-
-        # reuse buffers for input / output
-        self.cell_table_gpu = cupy.empty(self.memory_cells, dtype=np.uint16)
-        self.input_data_gpu = cupy.empty(self.input_shape, dtype=input_data_dtype)
-        self.processed_data_gpu = cupy.empty(
-            self.processed_shape, dtype=output_data_dtype
-        )
-        self.reshaped_data_gpu = None  # currently not reusing buffer
-
         # default preview layers: raw and corrected (subclass can extend)
         self.preview_buffer_getters = [
             self._get_raw_for_preview,
             self._get_corrected_for_preview,
         ]
 
-    # to get data from respective buffers to cell, x, y shape for preview computation
-    def _get_raw_for_preview(self):
-        """Should return view of self.input_data_gpu with shape (cell, x/y, x/y)"""
-        raise NotImplementedError()
-
-    def _get_corrected_for_preview(self):
-        """Should return view of self.processed_data_gpu with shape (cell, x/y, x/y)"""
-        raise NotImplementedError()
-
-    def flush_buffers(self):
-        """Optional reset GPU buffers (implement in subclasses which need this)"""
-        pass
 
     def correct(self, flags):
         """Correct (already loaded) image data according to flags
 
-        Subclass must define this method. It should assume that image data, cell table,
-        and other data (including constants) has already been loaded. It should
-        probably run some GPU kernel and output should go into self.processed_data_gpu.
+        Detector-specific subclass must define this method. It should assume that image
+        data, cell table, and other data (including constants) has already been loaded.
+        It should probably run some GPU kernel and output should go into
+        self.processed_data_gpu.
 
         Keep in mind that user only gets output from compute_preview or reshape
         (either of these should come after correct).
@@ -107,29 +56,18 @@ class BaseGpuRunner:
         """
         raise NotImplementedError()
 
-    def reshape(self, output_order, out=None):
-        """Move axes to desired output order and copy to host memory
-
-        The out parameter is passed directly to the get function of GPU array: if
-        None, then a new ndarray (in host memory) is returned. If not None, then data
-        will be loaded into the provided array, which must match shape / dtype.
-        """
-        # TODO: avoid copy
-        if output_order == self._corrected_axis_order:
-            self.reshaped_data_gpu = self.processed_data_gpu
-        else:
-            self.reshaped_data_gpu = cupy.transpose(
-                self.processed_data_gpu,
-                utils.transpose_order(self._corrected_axis_order, output_order),
-            )
-
-        return self.reshaped_data_gpu.get(out=out)
+    # to get data from respective buffers to cell, x, y shape for preview computation
+    def _get_raw_for_preview(self):
+        """Should return view of self.input_data_gpu with shape (cell, x/y, x/y)"""
+        raise NotImplementedError()
 
-    def load_data(self, raw_data):
-        self.input_data_gpu.set(raw_data)
+    def _get_corrected_for_preview(self):
+        """Should return view of self.processed_data_gpu with shape (cell, x/y, x/y)"""
+        raise NotImplementedError()
 
-    def load_cell_table(self, cell_table):
-        self.cell_table_gpu.set(cell_table)
+    def flush_buffers(self):
+        """Optional reset GPU buffers (implement in subclasses which need this)"""
+        pass
 
     def compute_previews(self, preview_index):
         """Generate single slice or reduction preview of raw and corrected data
@@ -174,6 +112,89 @@ class BaseGpuRunner:
             }[preview_index]
             return stat_fun(image_data, axis=0, dtype=cupy.float32).get()
 
+
+class BaseGpuRunner(BaseKernelRunner):
+    """Class to handle GPU buffers and execution of CUDA kernels on image data
+
+    All GPU buffers are kept within this class and it is intentionally very stateful.
+    This generally means that you will want to load data into it and then do something.
+    Typical usage in correct order:
+
+    1. instantiate
+    2. load constants
+    3. load_data
+    4. load_cell_table
+    5. correct
+    6a. reshape (only here does data transfer back to host)
+    6b. compute_previews (optional)
+
+    repeat from 2. or 3.
+
+    In case no constants are available / correction is not desired, can skip 3 and 4 and
+    pass CorrectionFlags.NONE to correct(...). Generally, user must handle which
+    correction steps are appropriate given the constants loaded so far.
+    """
+
+    # These must be set by subclass
+    _kernel_source_filename = None
+    _corrected_axis_order = None
+
+    def __init__(
+        self,
+        pixels_x,
+        pixels_y,
+        memory_cells,
+        constant_memory_cells,
+        input_data_dtype=np.uint16,
+        output_data_dtype=np.float32,
+    ):
+        super().__init__(
+            pixels_x,
+            pixels_y,
+            memory_cells,
+            constant_memory_cells,
+            input_data_dtype,
+            output_data_dtype,
+        )
+        _src_dir = pathlib.Path(__file__).absolute().parent
+        # subclass must define _kernel_source_filename
+        with (_src_dir / "kernels" / self._kernel_source_filename).open("r") as fd:
+            self._kernel_template = jinja2.Template(fd.read())
+
+        self._init_kernels()
+
+        # reuse buffers for input / output
+        self.cell_table_gpu = cupy.empty(self.memory_cells, dtype=np.uint16)
+        self.input_data_gpu = cupy.empty(self.input_shape, dtype=input_data_dtype)
+        self.processed_data_gpu = cupy.empty(
+            self.processed_shape, dtype=output_data_dtype
+        )
+        self.reshaped_data_gpu = None  # currently not reusing buffer
+
+    def reshape(self, output_order, out=None):
+        """Move axes to desired output order and copy to host memory
+
+        The out parameter is passed directly to the get function of GPU array: if
+        None, then a new ndarray (in host memory) is returned. If not None, then data
+        will be loaded into the provided array, which must match shape / dtype.
+        """
+        # TODO: avoid copy
+        if output_order == self._corrected_axis_order:
+            self.reshaped_data_gpu = self.processed_data_gpu
+        else:
+            self.reshaped_data_gpu = cupy.transpose(
+                self.processed_data_gpu,
+                utils.transpose_order(self._corrected_axis_order, output_order),
+            )
+
+        return self.reshaped_data_gpu.get(out=out)
+
+    def load_data(self, raw_data):
+        self.input_data_gpu.set(raw_data)
+
+    def load_cell_table(self, cell_table):
+        self.cell_table_gpu.set(cell_table)
+
     def update_block_size(self, full_block):
         """Set execution grid such that it covers processed_shape with full_blocks
 
-- 
GitLab