diff --git a/src/calng/base_gpu.py b/src/calng/base_gpu.py
index 903cb1e35e46f03013b627d1a47032b30c969984..c0619d0ae253b7cb736dbc2fe7275092c7493a3c 100644
--- a/src/calng/base_gpu.py
+++ b/src/calng/base_gpu.py
@@ -168,31 +168,32 @@ class BaseGpuRunner:
             return cupy.nanmax(image_data, axis=0).astype(cupy.float32).get()
         elif preview_index in (-2, -3, -4):
             stat_fun = {
-                -1: cupy.nanmax,
                 -2: cupy.nanmean,
                 -3: cupy.nansum,
                 -4: cupy.nanstd,
             }[preview_index]
             return stat_fun(image_data, axis=0, dtype=cupy.float32).get()
 
-    def update_block_size(self, full_block, target_shape=None):
-        """Compute grid such that thread block grid covers target shape
+    def update_block_size(self, full_block):
+        """Set execution grid such that it covers processed_shape with full_blocks
 
-        Execution is scheduled with 3d "blocks" of CUDA threads, tuning can affect
+        Execution is scheduled with 3d "blocks" of CUDA threads. Tuning can affect
         performance. Correction kernels are "monolithic" for simplicity (i.e. each
         logical thread handles one entry in output data), so in each dimension we
-        parallelize, grid * block >= length.
+        parallelize over, grid * block >= length to cover all entries.
 
         Note that individual kernels must themselves check whether they go out of
         bounds; grid dimensions get rounded up in case ndarray size is not multiple of
         block size.
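+
+        For example (illustrative shapes only): with processed_shape (16, 512, 128)
+        and full_block (1, 16, 32), full_grid becomes
+        (ceil(16 / 1), ceil(512 / 16), ceil(128 / 32)) = (16, 32, 4).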
 
         """
-        if target_shape is None:
-            target_shape = self.processed_shape
         assert len(full_block) == 3
         self.full_block = tuple(full_block)
         self.full_grid = tuple(
             utils.ceil_div(a_length, block_length)
-            for (a_length, block_length) in zip(target_shape, full_block)
+            for (a_length, block_length) in zip(self.processed_shape, full_block)
         )
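
For context, a rough sketch of how a grid/block pair computed this way is typically fed to a monolithic kernel launch via cupy.RawKernel. The kernel source, variable names, and shapes below are illustrative assumptions, not taken from this module; the out-of-bounds guard corresponds to the requirement stated in the docstring, since the grid is rounded up past the data size.

import cupy

# Illustrative monolithic kernel: each thread handles one output entry and
# guards against the rounded-up grid overshooting the data size.
_scale_src = r"""
extern "C" __global__ void scale(float* data, unsigned int n, float factor) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;  // grid was rounded up, so some threads fall outside the data
    }
    data[idx] *= factor;
}
"""
scale_kernel = cupy.RawKernel(_scale_src, "scale")

data = cupy.arange(1000, dtype=cupy.float32)
full_block = (256, 1, 1)
# Same rounding as update_block_size: ceil_div(length, block_length) per dimension
full_grid = (-(-data.size // full_block[0]), 1, 1)
scale_kernel(full_grid, full_block, (data, cupy.uint32(data.size), cupy.float32(2.0)))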