diff --git a/src/calng/base_correction.py b/src/calng/base_correction.py
index 5aa158c24e7c21d29f4af99b19010056eec664d1..2dccd1e22d27d1e50601f1033c7b7a1d461a6349 100644
--- a/src/calng/base_correction.py
+++ b/src/calng/base_correction.py
@@ -211,11 +211,13 @@ class BaseCorrection(PythonDevice):
         .commit(),
         UINT32_ELEMENT(expected)
         .key("outputShmemBufferSize")
-        .displayedName("Output buffer size limit (GB)")
+        .displayedName("Output buffer size limit")
+        .unit(Unit.BYTE)
+        .metricPrefix(MetricPrefix.GIGA)
         .description(
             "Corrected trains are written to shared memory locations. These are "
-            "pre-allocated and re-used. This parameter determines how big (number "
-            "of GB) the circular buffer will be."
+            "pre-allocated and re-used (circular buffer). This parameter determines "
+            "how much memory to set aside for the buffer."
         )
         .assignmentOptional()
         .defaultValue(10)
@@ -506,6 +508,11 @@ class BaseCorrection(PythonDevice):
             slotName=meth_name,
         )
 
+    def __del__(self):
+        # The shmem buffer is created lazily; guard against teardown happening
+        # before the first allocation (AttributeError inside __del__).
+        if hasattr(self, "_shmem_buffer"):
+            del self._shmem_buffer
+
     def preReconfigure(self, config):
         for ts_path in (
             "constantParameters.deviceMappingSnapshotAt",
@@ -667,6 +674,8 @@ class BaseCorrection(PythonDevice):
                 self.output_data_dtype,
                 shmem_buffer_name,
             )
+            self.log.INFO("Trying to pin the shmem buffer memory")
+            self._shmem_buffer.cuda_pin()
         else:
             self._shmem_buffer.change_shape(self.output_data_shape)
 
diff --git a/src/calng/shmem_utils.py b/src/calng/shmem_utils.py
index 54f3bc9df926dfbfc47d18e01ad880331810fece..0a82cdf6eaa6a5e0bf065e6eda1812597db1f3d0 100644
--- a/src/calng/shmem_utils.py
+++ b/src/calng/shmem_utils.py
@@ -66,6 +66,9 @@ class ShmemCircularBuffer:
         )
         self._buffer_ary = None
         self._update_shape(array_shape, dtype)
+        self._cuda_pinned = False
+        # important for performance and pinning: touch memory to actually allocate
+        self._buffer_ary.fill(0)
 
     def _update_shape(self, array_shape, dtype):
         array_shape = tuple(array_shape)
@@ -96,6 +99,25 @@ class ShmemCircularBuffer:
         dtype = self._buffer_ary.dtype
         self._update_shape(array_shape, dtype)
 
+    def cuda_pin(self):
+        # Page-lock (pin) the underlying host buffer so GPU<->host copies can DMA.
+        import cupy
+
+        self._memory_pointer = self._buffer_ary.ctypes.get_data()
+        cupy.cuda.runtime.hostRegister(
+            self._memory_pointer, self._shared_memory.size, 0
+        )
+        # Record that we pinned so __del__ knows to call hostUnregister.
+        self._cuda_pinned = True
+
+    def __del__(self):
+        # getattr guard: __del__ can run even if __init__ failed before the
+        # _cuda_pinned attribute was assigned.
+        if getattr(self, "_cuda_pinned", False):
+            import cupy
+            cupy.cuda.runtime.hostUnregister(self._memory_pointer)
+        del self._buffer_ary
+        del self._shared_memory
+
     @property
     def num_slots(self):
         return self._buffer_ary.shape[0]