diff --git a/src/calng/AgipdCorrection.py b/src/calng/AgipdCorrection.py
index 144295641c6805ec2d6a320bdc6331d4ee263f16..c295f6ac6fa0c676786e47bb526e49fd9c9e56d7 100644
--- a/src/calng/AgipdCorrection.py
+++ b/src/calng/AgipdCorrection.py
@@ -12,7 +12,6 @@ from karabo.bound import (
 )
 from karabo.common.states import State
 
-from . import utils
 from ._version import version as deviceVersion
 from .agipd_gpu import AgipdGainMode, AgipdGpuRunner, BadPixelValues, CorrectionFlags
 from .base_correction import BaseCorrection, add_correction_step_schema
@@ -164,17 +163,6 @@ class AgipdCorrection(BaseCorrection):
             self.get("dataFormat.pixelsY"),
         )
 
-    @property
-    def output_data_shape(self):
-        return utils.shape_after_transpose(
-            (
-                self.get("dataFormat.memoryCells"),
-                self.get("dataFormat.pixelsX"),
-                self.get("dataFormat.pixelsY"),
-            ),
-            self._output_transpose,
-        )
-
     def __init__(self, config):
         super().__init__(config)
         # TODO: different gpu runner for fixed gain mode
@@ -188,11 +176,6 @@ class AgipdCorrection(BaseCorrection):
             "g_gain_value": config.get("corrections.relGainXray.gGainValue"),
         }
 
-        self._output_transpose = {
-            "pixels-fast": None,
-            "memorycells-fast": (2, 1, 0),
-            "no-reshape": None,
-        }[config.get("dataFormat.outputAxisOrder")]
         self._update_shapes()
 
         # configurability: overriding md_additional_offset
@@ -268,7 +251,10 @@ class AgipdCorrection(BaseCorrection):
             buffer_handle, buffer_array = self._shmem_buffer.next_slot()
             self.gpu_runner.load_cell_table(cell_table)
             self.gpu_runner.correct(self._correction_flag_enabled)
-            self.gpu_runner.reshape(out=buffer_array)
+            self.gpu_runner.reshape(
+                output_order=self._schema_cache["dataFormat.outputAxisOrder"],
+                out=buffer_array,
+            )
             # after reshape, data for dataOutput is now safe in its own buffer
             if do_generate_preview:
                 if self._correction_flag_enabled != self._correction_flag_preview:
diff --git a/src/calng/DsscCorrection.py b/src/calng/DsscCorrection.py
index 349e5414f6d13e11b720fdcbedbec8a4fe1428a2..04916db116346e31999f0f4515d5b5fe1afeb34b 100644
--- a/src/calng/DsscCorrection.py
+++ b/src/calng/DsscCorrection.py
@@ -4,7 +4,6 @@ import numpy as np
 from karabo.bound import KARABO_CLASSINFO, VECTOR_STRING_ELEMENT
 from karabo.common.states import State
 
-from . import utils
 from ._version import version as deviceVersion
 from .base_correction import BaseCorrection, add_correction_step_schema
 from .calcat_utils import DsscCalcatFriend, DsscConstants
@@ -48,20 +47,8 @@ class DsscCorrection(BaseCorrection):
             self.get("dataFormat.pixelsX"),
         )
 
-    @property
-    def output_data_shape(self):
-        return utils.shape_after_transpose(
-            self.input_data_shape, self._output_transpose
-        )
-
     def __init__(self, config):
         super().__init__(config)
-        self._output_transpose = {
-            "pixels-fast": (0, 2, 1),
-            "memorycells-fast": (2, 1, 0),
-            "no-reshape": None,
-        }[config.get("dataFormat.outputAxisOrder")]
-        self._update_shapes()
         self.updateState(State.ON)
 
     def process_input(self, data, metadata):
@@ -124,7 +111,10 @@ class DsscCorrection(BaseCorrection):
             buffer_handle, buffer_array = self._shmem_buffer.next_slot()
             self.gpu_runner.load_cell_table(cell_table)
             self.gpu_runner.correct(self._correction_flag_enabled)
-            self.gpu_runner.reshape(out=buffer_array)
+            self.gpu_runner.reshape(
+                output_order=self._schema_cache["dataFormat.outputAxisOrder"],
+                out=buffer_array,
+            )
             if do_generate_preview:
                 if self._correction_flag_enabled != self._correction_flag_preview:
                     self.gpu_runner.correct(self._correction_flag_preview)
diff --git a/src/calng/agipd_gpu.py b/src/calng/agipd_gpu.py
index fd935ebade2ff0ea9794bc2018271f320f4e0a37..3b3802ab633f9cd89cb76934477bcb38b3e83101 100644
--- a/src/calng/agipd_gpu.py
+++ b/src/calng/agipd_gpu.py
@@ -26,6 +26,7 @@ class AgipdGainMode(enum.IntEnum):
 
 class AgipdGpuRunner(base_gpu.BaseGpuRunner):
     _kernel_source_filename = "agipd_gpu_kernels.cpp"
+    _corrected_axis_order = "cxy"
 
     def __init__(
         self,
@@ -33,7 +34,6 @@ class AgipdGpuRunner(base_gpu.BaseGpuRunner):
         pixels_y,
         memory_cells,
         constant_memory_cells,
-        output_transpose=(1, 2, 0),  # default: memorycells-fast
         input_data_dtype=cupy.uint16,
         output_data_dtype=cupy.float32,
         bad_pixel_mask_value=cupy.nan,
@@ -52,7 +52,6 @@ class AgipdGpuRunner(base_gpu.BaseGpuRunner):
             pixels_y,
             memory_cells,
             constant_memory_cells,
-            output_transpose,
             input_data_dtype,
             output_data_dtype,
         )
@@ -247,9 +246,12 @@ class AgipdGpuRunner(base_gpu.BaseGpuRunner):
             ),
         )
 
-    def get_gain_map(self, out=None):
+    def get_gain_map(self, output_order, out=None):
         return cupy.ascontiguousarray(
-            cupy.transpose(self.gain_map_gpu, self.output_transpose)
+            cupy.transpose(
+                self.gain_map_gpu,  # assumed laid out as _corrected_axis_order
+                utils.transpose_order(self._corrected_axis_order, output_order),
+            )
         ).get(out=out)
 
     def _init_kernels(self):
diff --git a/src/calng/base_correction.py b/src/calng/base_correction.py
index 82022c15acd2a2c40f72b65417490603a1896b5b..fd3fbdc0cbe514de13ef8aa7cd0a4d21cc669e5b 100644
--- a/src/calng/base_correction.py
+++ b/src/calng/base_correction.py
@@ -41,11 +41,12 @@ PROCESSING_STATE_TIMEOUT = 10
 
 @KARABO_CLASSINFO("BaseCorrection", deviceVersion)
 class BaseCorrection(PythonDevice):
-    _correction_flag_class = None  # subclass must override this with some enum class
+    _correction_flag_class = None  # subclass must set to some enum class
     _gpu_runner_class = None  # subclass must set this
     _gpu_runner_init_args = {}  # subclass can set this (TODO: remove, design better)
     _managed_keys = [
         "outputShmemBufferSize",
+        "dataFormat.outputAxisOrder",
         "dataFormat.outputImageDtype",
         "preview.enable",
         "preview.pulse",
@@ -59,12 +60,13 @@ class BaseCorrection(PythonDevice):
         "constantParameters.memoryCells",
         "dataFormat.pixelsX",
         "dataFormat.pixelsY",
+        "dataFormat.outputAxisOrder",
         "preview.enable",
         "preview.pulse",
         "preview.trainIdModulo",
         "processingStateTimeout",
         "state",
-    }
+    }  # subclass should be aware of cache, but does not need to extend
 
     def _load_constant_to_gpu(constant_name, constant_data):
         raise NotImplementedError()
@@ -75,7 +77,15 @@ class BaseCorrection(PythonDevice):
 
     @property
     def output_data_shape(self):
-        raise NotImplementedError()
+        axis_lengths = {  # axis letter -> configured length of that axis
+            "x": self._schema_cache["dataFormat.pixelsX"],
+            "y": self._schema_cache["dataFormat.pixelsY"],
+            "c": self._schema_cache["dataFormat.memoryCells"],  # memory cell axis
+        }
+        return tuple(  # one length per letter, in outputAxisOrder's letter order
+            axis_lengths[axis]
+            for axis in self._schema_cache["dataFormat.outputAxisOrder"]
+        )
 
     @staticmethod
     def expectedParameters(expected):
@@ -175,13 +185,13 @@ class BaseCorrection(PythonDevice):
             .key("dataFormat.outputAxisOrder")
             .displayedName("Output axis order")
             .description(
-                "Axes of main data output can be reordered after correction. Choose "
-                "between 'pixels-fast' (memory_cell, x, y), 'memorycells-fast' "
-                "(x, y, memory_cell), and 'no-reshape'"
+                "Axes of main data output can be reordered after correction. Axis order "
+                "is specified as string consisting of 'x', 'y', and 'c', with the "
+                "latter indicating the memory cell axis. The default value of 'cxy' "
+                "puts pixels along the fast axes."
             )
-            .options("pixels-fast,memorycells-fast,no-reshape")
             .assignmentOptional()
-            .defaultValue("pixels-fast")
+            .defaultValue("cxy")
             .commit(),
             VECTOR_UINT32_ELEMENT(expected)
             .key("dataFormat.inputDataShape")
@@ -337,6 +347,11 @@ class BaseCorrection(PythonDevice):
         }
         super().__init__(config)
 
+        if not sorted(config.get("dataFormat.outputAxisOrder")) == ["c", "x", "y"]:
+            # TODO: figure out how to get this information to operator
+            self.log_status_error("Invalid output axis order string")
+            return
+
         self.KARABO_ON_DATA("dataInput", self.process_input)
         self.KARABO_ON_EOS("dataInput", self.handle_eos)
 
@@ -574,7 +589,6 @@ class BaseCorrection(PythonDevice):
             self.get("dataFormat.pixelsY"),
             self.get("dataFormat.memoryCells"),
             int(self.get("constantParameters.memoryCells")),
-            output_transpose=self._output_transpose,
             input_data_dtype=self.input_data_dtype,
             output_data_dtype=self.output_data_dtype,
             **self._gpu_runner_init_args,
diff --git a/src/calng/base_gpu.py b/src/calng/base_gpu.py
index 868d42e0866e2be0866f6e2db45a9f92c93dabc2..e51bc99b0f67a24a2fc91cfbf39710af78be7e7f 100644
--- a/src/calng/base_gpu.py
+++ b/src/calng/base_gpu.py
@@ -29,13 +29,16 @@ class BaseGpuRunner:
     correction steps are appropriate given the constants loaded so far.
     """
 
+    # These must be set by subclass
+    _kernel_source_filename = None
+    _corrected_axis_order = None
+
     def __init__(
         self,
         pixels_x,
         pixels_y,
         memory_cells,
         constant_memory_cells,
-        output_transpose=(2, 1, 0),  # default: memorycells-fast
         input_data_dtype=np.uint16,
         output_data_dtype=np.float32,
     ):
@@ -47,15 +50,11 @@ class BaseGpuRunner:
         self.pixels_x = pixels_x
         self.pixels_y = pixels_y
         self.memory_cells = memory_cells
-        self.output_transpose = output_transpose
         if constant_memory_cells == 0:
             # if not set, guess same as input; may save one recompilation
             self.constant_memory_cells = memory_cells
         else:
             self.constant_memory_cells = constant_memory_cells
-        self.output_shape = utils.shape_after_transpose(
-            self.processed_shape, self.output_transpose
-        )
         # preview will only be single memory cell
         self.preview_shape = (self.pixels_x, self.pixels_y)
         self.input_data_dtype = input_data_dtype
@@ -69,7 +68,7 @@ class BaseGpuRunner:
         self.processed_data_gpu = cupy.empty(
             self.processed_shape, dtype=output_data_dtype
         )
-        self.reshaped_data_gpu = cupy.empty(self.output_shape, dtype=output_data_dtype)
+        self.reshaped_data_gpu = None  # currently not reusing buffer
         self.preview_raw = np.empty(self.preview_shape, dtype=np.float32)
         self.preview_corrected = np.empty(self.preview_shape, dtype=np.float32)
 
@@ -105,22 +104,22 @@ class BaseGpuRunner:
         """
         raise NotImplementedError()
 
-    def reshape(self, out=None):
-        """Move axes to desired output order
+    def reshape(self, output_order, out=None):
+        """Move axes to output_order (string of axis letters) and copy to host memory
 
         The out parameter is passed directly to the get function of GPU array: if
         None, then a new ndarray (in host memory) is returned. If not None, then data
         will be loaded into the provided array, which must match shape / dtype.
         """
         # TODO: avoid copy
-        if self.output_transpose is None:
-            self.reshaped_data_gpu = cupy.ascontiguousarray(
-                cupy.squeeze(self.processed_data_gpu)
-            )
+        # TODO: check if necessary to make contiguous
+        if output_order == self._corrected_axis_order:  # no transpose needed
+            self.reshaped_data_gpu = cupy.ascontiguousarray(self.processed_data_gpu)
         else:
             self.reshaped_data_gpu = cupy.ascontiguousarray(
                 cupy.transpose(
-                    cupy.squeeze(self.processed_data_gpu), self.output_transpose
+                    self.processed_data_gpu,
+                    utils.transpose_order(self._corrected_axis_order, output_order),
                 )
             )
         return self.reshaped_data_gpu.get(out=out)
diff --git a/src/calng/dssc_gpu.py b/src/calng/dssc_gpu.py
index 8c1995e39733f462c36f7a663da3b1ec2dbe0d16..2b4bac53f025f8bcbe3dde08355a35294319713d 100644
--- a/src/calng/dssc_gpu.py
+++ b/src/calng/dssc_gpu.py
@@ -13,6 +13,7 @@ class CorrectionFlags(enum.IntFlag):
 
 class DsscGpuRunner(base_gpu.BaseGpuRunner):
     _kernel_source_filename = "dssc_gpu_kernels.cpp"
+    _corrected_axis_order = "cyx"
 
     def __init__(
         self,
@@ -20,7 +21,6 @@ class DsscGpuRunner(base_gpu.BaseGpuRunner):
         pixels_y,
         memory_cells,
         constant_memory_cells,
-        output_transpose=(2, 1, 0),  # default: memorycells-fast
         input_data_dtype=np.uint16,
         output_data_dtype=np.float32,
     ):
@@ -31,7 +31,6 @@ class DsscGpuRunner(base_gpu.BaseGpuRunner):
             pixels_y,
             memory_cells,
             constant_memory_cells,
-            output_transpose,
             input_data_dtype,
             output_data_dtype,
         )
diff --git a/src/calng/utils.py b/src/calng/utils.py
index 7c4c78e5f47e1b338f9da54afe172632f72064dc..0acf8cfd401a8bb358dd6d33090f6861a0a671be 100644
--- a/src/calng/utils.py
+++ b/src/calng/utils.py
@@ -39,6 +39,19 @@ def threadsafe_cache(fun):
     return aux
 
 
+@functools.lru_cache
+def transpose_order(axes_in, axes_out):
+    """Computes the order of axes_out relative to axes_in for transposition purposes
+
+    Both axes_in and axes_out are assumed to be strings in which each letter represents
+    an axis. Because of lru_cache, both arguments must themselves be hashable (str or
+    tuple of hashable elements; a list raises TypeError). A letter in axes_out that
+    is missing from axes_in raises KeyError; equal length / uniqueness is not enforced.
+    """
+    axis_order = {axis: index for index, axis in enumerate(axes_in)}
+    return tuple(axis_order[axis] for axis in axes_out)
+
+
 _np_typechar_to_c_typestring = {
     "?": "bool",
     "B": "unsigned char",