diff --git a/src/calng/DetectorAssembler.py b/src/calng/DetectorAssembler.py
index 0259846f4f7a09e16457bbbf26233b7cccbdad09..9fc0dc68ca430f0fd08e2bd66dbfa19cd995b063 100644
--- a/src/calng/DetectorAssembler.py
+++ b/src/calng/DetectorAssembler.py
@@ -200,6 +200,8 @@ class DetectorAssembler(TrainMatcher.TrainMatcher):
         self._path_to_stack = self.get("pathToStack")
         self._geometry = None
         self._stack_input_buffer = None
+        self._position_output_buffer = None
+        self._extra_shape = ()
 
         self.KARABO_SLOT(self.requestScene)
 
@@ -256,9 +258,6 @@ class DetectorAssembler(TrainMatcher.TrainMatcher):
             return
         self._geometry = geom_utils.deserialize_geometry(serialized_geometry)
         # TODO: allow multiple memory cells (extra geom notion of extra dimensions)
-        self._stack_input_buffer = np.zeros(
-            self._geometry.expected_data_shape, dtype=np.float32
-        )
 
     def on_matched_data(self, train_id, sources):
         if self._geometry is None:
@@ -271,7 +270,26 @@ class DetectorAssembler(TrainMatcher.TrainMatcher):
             self.unsafe_get("outputForBridgeOutput")
         )
 
-        module_indices_unfilled = set(range(self._stack_input_buffer.shape[0]))
+        # check and maybe update stacking, output buffers
+        input_shape = next(iter(sources.values()))[0].get(self._path_to_stack).shape
+        input_extra_shape = input_shape[:-2]
+        if self._stack_input_buffer is None or input_extra_shape != self._extra_shape:
+            self._extra_shape = input_extra_shape
+            self._stack_input_buffer = np.zeros(
+                self._extra_shape + self._geometry.expected_data_shape,
+                dtype=np.float32
+            )
+            self._position_output_buffer = self._geometry.output_array_for_position_fast(
+                extra_shape=self._extra_shape, dtype=np.float32
+            )
+            self.log.INFO(
+                f"Updating stacking buffer to shape: {self._stack_input_buffer.shape}"
+            )
+            self.log.INFO(
+                f"Updating output buffer to shape: {self._position_output_buffer.shape}"
+            )
+
+        module_indices_unfilled = set(range(self._geometry.n_modules))
         earliest_source_timestamp = float("inf")
         for source, (data, source_timestamp) in sources.items():
             # regular TrainMatcher output
@@ -282,7 +300,7 @@ class DetectorAssembler(TrainMatcher.TrainMatcher):
             # prepare for assembly
             # TODO: handle failure to "parse" source, get data out
             module_index = self._source_to_index(source)
-            self._stack_input_buffer[module_index] = data.get(
+            self._stack_input_buffer[..., module_index, :, :] = data.get(
                 self._path_to_stack
             ).astype(np.float32, copy=False)  # TODO: set dtype based on input?
             module_indices_unfilled.discard(module_index)
@@ -299,7 +317,9 @@ class DetectorAssembler(TrainMatcher.TrainMatcher):
             # TODO: configurable treatment of missing modules
 
         # TODO: reusable output buffer to save on allocation
-        assembled, _ = self._geometry.position_modules_fast(self._stack_input_buffer)
+        assembled, _ = self._geometry.position_modules_fast(
+            self._stack_input_buffer, out=self._position_output_buffer
+        )
 
         # TODO: optionally include control data
         output_hash = Hash(
@@ -330,7 +350,7 @@ class DetectorAssembler(TrainMatcher.TrainMatcher):
                 "image.data",
                 ImageData(
                     # TODO: get around this being mirrored...
-                    assembled[::-1, ::-1],
+                    assembled[..., ::-1, ::-1],
                     Dims(*assembled.shape),
                     Encoding.GRAY,
                     bitsPerPixel=32,
@@ -390,22 +410,22 @@ class DetectorAssembler(TrainMatcher.TrainMatcher):
 def downsample_2d(arr, factor, reduction_fun=np.nanmax):
     """Generalization of downsampling from FemDataAssembler
 
-    Expects first two dimensions of arr to be multiple of 2 ** factor
+    Expects last two dimensions of arr to be multiple of 2 ** factor
     Useful if you're sitting at home and ssh connection is slow to get full-resolution
     previews."""
 
     for i in range(factor // 2):
         arr = reduction_fun(
             (
-                arr[:-1:2],
-                arr[1::2],
+                arr[..., :-1:2, :],
+                arr[..., 1::2, :],
             ),
             axis=0,
         )
         arr = reduction_fun(
             (
-                arr[:, :-1:2],
-                arr[:, 1::2],
+                arr[..., :-1:2],
+                arr[..., 1::2],
             ),
             axis=0,
         )
diff --git a/src/calng/base_correction.py b/src/calng/base_correction.py
index 3f58ca800c5e2a5c1837a2fbb0bd51711db4b43d..31b5bf3369f67201f600efd180c2ea8d31bfd26e 100644
--- a/src/calng/base_correction.py
+++ b/src/calng/base_correction.py
@@ -496,13 +496,15 @@ class BaseCorrection(PythonDevice):
             .displayedName("Index (or stat) for preview")
             .description(
                 "If this value is ≥ 0, the corresponding index (frame, cell, or pulse) "
-                "will be sliced for the preview output. If this value is < 0, preview "
-                "will be one of the following stats: -1: max, -2: mean, -3: sum, -4: "
-                "stdev. These stats are computed across memory cells."
+                "will be sliced for the preview output. If -4 ≤ this value ≤ -1, "
+                "preview will be one of the following stats: -1: max, -2: mean, -3: "
+                "sum, -4: stdev. These stats are computed across memory cells. "
+                "Finally, -5 will cause all memory cells to be sent. Be aware that "
+                "this can cause severe performance and bandwidth issues."
             )
             .assignmentOptional()
             .defaultValue(0)
-            .minInc(-4)
+            .minInc(-5)
             .reconfigurable()
             .commit(),
 
diff --git a/src/calng/base_gpu.py b/src/calng/base_gpu.py
index 333eb053815cd643540cca94e3b1aa0c457adc37..f41d06a0846fd594768a2cd08effb8271aa3861a 100644
--- a/src/calng/base_gpu.py
+++ b/src/calng/base_gpu.py
@@ -145,7 +145,7 @@ class BaseGpuRunner:
         been called with the appropriate flags before compute_preview(...).
         """
 
-        if preview_index < -4:
+        if preview_index < -5:
             raise ValueError(f"No statistic with code {preview_index} defined")
         elif preview_index >= self.memory_cells:
             raise ValueError(f"Memory cell index {preview_index} out of range")
@@ -173,6 +173,8 @@ class BaseGpuRunner:
                 -4: cupy.nanstd,
             }[preview_index]
             return stat_fun(image_data, axis=0, dtype=cupy.float32).get()
+        elif preview_index == -5:
+            return image_data.get()
 
     def update_block_size(self, full_block):
         """Set execution grid such that it covers processed_shape with full_blocks