diff --git a/src/calng/agipd_gpu.py b/src/calng/agipd_gpu.py
index 110939ec8509c5ddc322081423bc83b4a7537780..1d8d2d150ce3e22ac32f4166b42411fba1b1ba31 100644
--- a/src/calng/agipd_gpu.py
+++ b/src/calng/agipd_gpu.py
@@ -41,7 +41,7 @@ class AgipdGpuRunner(base_gpu.BaseGpuRunner):
         )
         self.gain_map_gpu = cupy.empty(self.processed_shape, dtype=np.uint8)
 
-        self.map_shape = (self.pixels_x, self.pixels_y, self.constant_memory_cells)
+        self.map_shape = (self.constant_memory_cells, self.pixels_x, self.pixels_y)
         self.gm_map_shape = self.map_shape + (3,)  # for gain-mapped constants
         self.threshold_map_shape = self.map_shape + (2,)
         # constants
@@ -50,8 +50,7 @@ class AgipdGpuRunner(base_gpu.BaseGpuRunner):
         )
         self.offset_map_gpu = cupy.zeros(self.gm_map_shape, dtype=np.float32)
         self.rel_gain_pc_map_gpu = cupy.ones(self.gm_map_shape, dtype=np.float32)
-        # optional extra offset for medium gain
-        self.md_additional_offset_gpu = cupy.zeros(1, dtype=np.float32)
+        self.md_additional_offset_gpu = cupy.zeros(self.map_shape, dtype=np.float32)
         self.rel_gain_xray_map_gpu = cupy.ones(self.map_shape, dtype=np.float32)
         self.badpixel_map_gpu = cupy.zeros(self.map_shape, dtype=np.uint32)
 
@@ -59,33 +58,36 @@ class AgipdGpuRunner(base_gpu.BaseGpuRunner):
 
     def load_thresholds(self, threshold_map):
         # shape: y, x, memory cell, threshold 0 / threshold 1 / 3 gain values
-        # TODO: do we need the gain values for anything?
-        to_set = np.transpose(threshold_map[..., :2], (1, 0, 2, 3)).astype(np.float32)
+        # TODO: do we need the gain values (in the constant) for anything?
         self.gain_thresholds_gpu.set(
-            to_set
+            np.transpose(threshold_map[..., :2], (2, 1, 0, 3)).astype(np.float32)
         )
 
     def load_offset_map(self, offset_map):
         # shape: y, x, memory cell, gain stage
         self.offset_map_gpu.set(
-            np.transpose(offset_map, (1, 0, 2, 3)).astype(np.float32)
+            np.transpose(offset_map, (2, 1, 0, 3)).astype(np.float32)
         )
 
     def load_rel_gain_pc_map(self, slopes_pc_map):
         # pc has funny shape (11, 352, 128, 512) from file
         # this is (fi, memory cell, y, x)
+        slopes_pc_map = slopes_pc_map.astype(np.float32)
         pc_high_m = slopes_pc_map[0]
         pc_high_I = slopes_pc_map[1]
         pc_med_m = slopes_pc_map[3]
         pc_med_I = slopes_pc_map[4]
         frac_high_med = pc_high_m / pc_med_m
         # TODO: verify formula
-        md_additional_offset = pc_high_I - pc_med_I * frac_high_med
-        self.rel_gain_pc_map_gpu[..., 0] = 1  # rel xray gain can come after
-        self.rel_gain_pc_map_gpu[..., 1] = self.rel_gain_pc_map[..., 0] * frac_high_med
-        self.rel_gain_pc_map_gpu[..., 2] = self.rel_gain_pc_map[..., 1] * 4.48
+        md_additional_offset = (pc_high_I - pc_med_I * frac_high_med).astype(np.float32)
+        rel_gain_map = np.ones(
+            (3, self.constant_memory_cells, self.pixels_y, self.pixels_x), dtype=np.float32
+        )
+        rel_gain_map[1] = rel_gain_map[0] * frac_high_med
+        rel_gain_map[2] = rel_gain_map[1] * 4.48
         # TODO: enable overriding this based on user input
-        self.md_additional_offset_gpu.set(md_additional_offset)
+        self.rel_gain_pc_map_gpu.set(np.transpose(rel_gain_map, (1, 3, 2, 0)))
+        self.md_additional_offset_gpu.set(np.transpose(md_additional_offset, (0, 2, 1)))
 
     def load_rel_gain_ff_map(self, slopes_ff_map):
         # constant shape: y, x, memory cell
@@ -93,7 +95,7 @@ class AgipdGpuRunner(base_gpu.BaseGpuRunner):
             # old format, is per pixel only
             raise NotImplementedError("Old slopes FF map format")
         self.rel_gain_xray_map_gpu.set(
-            np.transpose(slopes_ff_map, (1, 0, 2)).astype(np.float32)
+            np.transpose(slopes_ff_map).astype(np.float32)
         )
 
     def correct(self, flags):
@@ -108,6 +110,8 @@ class AgipdGpuRunner(base_gpu.BaseGpuRunner):
         (view of) said buffer as an ndarray.  Keep in mind that the output
         buffers will get overwritten eventually (circular buffer).
         """
+        if flags & CorrectionFlags.BLSHIFT:
+            raise NotImplementedError("Baseline shift not implemented yet")
         self.correction_kernel(
             self.full_grid,
             self.full_block,
@@ -137,7 +141,6 @@ class AgipdGpuRunner(base_gpu.BaseGpuRunner):
                 "output_data_dtype": utils.np_dtype_to_c_type(self.output_data_dtype),
             }
         )
-        print(kernel_source)
         self.source_module = cupy.RawModule(code=kernel_source)
         self.correction_kernel = self.source_module.get_function("correct")
         self.casting_kernel = self.source_module.get_function("only_cast")
diff --git a/src/calng/agipd_gpu_kernels.cpp b/src/calng/agipd_gpu_kernels.cpp
index 1212e00b14b89118a92751bdd4727afc12ed72da..d11af9737db09bbcb6eaf8f62a0881c1388feefc 100644
--- a/src/calng/agipd_gpu_kernels.cpp
+++ b/src/calng/agipd_gpu_kernels.cpp
@@ -21,7 +21,7 @@ extern "C" {
 							const float* threshold_map,
 							const float* offset_map,
 							const float* rel_gain_pc_map,
-	                        const float md_additional_offset,
+	                        const float* md_additional_offset,
 	                        const float* rel_gain_xray_map,
 							const unsigned int* bad_pixel_map,
 							unsigned char* gain_map,
@@ -60,17 +60,22 @@ extern "C" {
 		const size_t output_stride_cell = output_stride_x * X;
 		const size_t output_index = cell * output_stride_cell + x * output_stride_x + y * output_stride_y;
 
-		// threshold constant shape: x, y, cell, threshold (dim size 2)
+		// per-pixel only constant: cell, x, y
+		const size_t map_stride_y = 1;
+		const size_t map_stride_x = Y * map_stride_y;
+		const size_t map_stride_cell = X * map_stride_x;
+
+		// threshold constant shape: cell, x, y, threshold (dim size 2)
 		const size_t threshold_map_stride_threshold = 1;
-		const size_t threshold_map_stride_cell = 2      * threshold_map_stride_threshold;
-		const size_t threshold_map_stride_y = map_cells * threshold_map_stride_cell;
-		const size_t threshold_map_stride_x = Y         * threshold_map_stride_y;
+		const size_t threshold_map_stride_y = 2 * threshold_map_stride_threshold;
+		const size_t threshold_map_stride_x = Y * threshold_map_stride_y;
+		const size_t threshold_map_stride_cell = X * threshold_map_stride_x;
 
-		// gain mapped constant shape: x, y, memory cell, gain_level (dim size 3)
+		// gain mapped constant shape: cell, x, y, gain_level (dim size 3)
 		const size_t gm_map_stride_gain = 1;
-		const size_t gm_map_stride_cell = 3      * gm_map_stride_gain;
-		const size_t gm_map_stride_y = map_cells * gm_map_stride_cell;
-		const size_t gm_map_stride_x = Y         * gm_map_stride_y;
+		const size_t gm_map_stride_y = 3 * gm_map_stride_gain;
+		const size_t gm_map_stride_x = Y * gm_map_stride_y;
+		const size_t gm_map_stride_cell = X * gm_map_stride_x;
 		// TODO: handle multiple maps, multiple strides, multiple limits
 
 		const size_t map_cell = cell_table[cell];
@@ -96,6 +101,10 @@ extern "C" {
 				}
 			}
 
+			const size_t map_index = map_cell * map_stride_cell +
+				y * map_stride_y +
+				x * map_stride_x;
+
 			const size_t gm_map_index = gain * gm_map_stride_gain +
 				map_cell * gm_map_stride_cell +
 				y * gm_map_stride_y +
@@ -111,7 +120,7 @@ extern "C" {
 				if (corr_flags & CORR_REL_GAIN_PC) {
 					corrected *= rel_gain_pc_map[gm_map_index];
 					if (gain == 1) {
-						corrected += md_additional_offset;
+						corrected += md_additional_offset[map_index];
 					}
 				}
 				if (corr_flags & CORR_REL_GAIN_XRAY) {
diff --git a/src/calng/base_gpu.py b/src/calng/base_gpu.py
index 0c1d2cd763abcc9ef980f5f52e58e473c7686fa6..ef8d3d3cf5609fc066dc27d2c91dccf31954271d 100644
--- a/src/calng/base_gpu.py
+++ b/src/calng/base_gpu.py
@@ -65,7 +65,7 @@ class BaseGpuRunner:
         # reuse output arrays
         self.cell_table_gpu = cupy.empty(self.memory_cells, dtype=np.uint16)
         self.input_data_gpu = cupy.empty(self.input_shape, dtype=input_data_dtype)
-        self.processed_data_gpu = cupy.empty(self.input_shape, dtype=output_data_dtype)
+        self.processed_data_gpu = cupy.empty(self.processed_shape, dtype=output_data_dtype)
         self.reshaped_data_gpu = cupy.empty(self.output_shape, dtype=output_data_dtype)
         self.preview_raw = cupyx.empty_pinned(self.preview_shape, dtype=np.float32)
         self.preview_corrected = cupyx.empty_pinned(
@@ -107,8 +107,6 @@ class BaseGpuRunner:
         return self.reshaped_data_gpu.get(out=out)
 
     def load_data(self, raw_data):
-        print(raw_data.shape)
-        print(self.input_data_gpu.shape)
         self.input_data_gpu.set(np.squeeze(raw_data))
 
     def load_cell_table(self, cell_table):
diff --git a/src/tests/test_agipd_kernels.py b/src/tests/test_agipd_kernels.py
index 91015e6a0508bbdedbc4b35e487310c91dfeb871..4a5aa10db181738274ebbd2c21eb90e44c2b39c1 100644
--- a/src/tests/test_agipd_kernels.py
+++ b/src/tests/test_agipd_kernels.py
@@ -1,4 +1,6 @@
+import h5py
 import numpy as np
+import pathlib
 import pytest
 
 from calng import agipd_gpu
@@ -9,19 +11,24 @@ corr_dtype = np.float32
 pixels_x = 512
 pixels_y = 128
 memory_cells = 352
-offset_map = (
-    np.random.random(size=(pixels_x, pixels_y, memory_cells)).astype(corr_dtype) * 20
-)
-cell_table = np.arange(memory_cells, dtype=np.uint16)
-np.random.shuffle(cell_table)
+
 raw_data = np.random.randint(
     low=0, high=2000, size=(memory_cells, 2, pixels_x, pixels_y), dtype=input_dtype
 )
+image_data = raw_data[:, 0]
+raw_gain = raw_data[:, 1]
+cell_table = np.arange(memory_cells, dtype=np.uint16)
+np.random.shuffle(cell_table)
+
+caldb_store = pathlib.Path("/gpfs/exfel/d/cal/caldb_store/xfel/cal")
+caldb_prefix = caldb_store / "agipd-type/agipd_siv1_agipdv11_m305"
 
-thresholds = np.random.random(size=(pixels_y, pixels_x, memory_cells)) * 1000
-thresholds = np.stack((thresholds, thresholds*2), axis=3).astype(np.float32)
-gm_const_zeros = np.zeros((pixels_x, pixels_y, memory_cells, 3), dtype=np.float32)
-gm_const_ones = np.ones((pixels_x, pixels_y, memory_cells, 3), dtype=np.float32)
+with h5py.File(caldb_prefix / "cal.1619543695.4679213.h5", "r") as fd:
+    thresholds = np.array(fd["/AGIPD_SIV1_AGIPDV11_M305/ThresholdsDark/0/data"])
+with h5py.File(caldb_prefix / "cal.1619543664.1545036.h5", "r") as fd:
+    offset_map = np.array(fd["/AGIPD_SIV1_AGIPDV11_M305/Offset/0/data"])
+with h5py.File(caldb_prefix / "cal.1615377705.8904035.h5", "r") as fd:
+    slopes_pc_map = np.array(fd["/AGIPD_SIV1_AGIPDV11_M305/SlopesPC/0/data"])
 
 kernel_runner = agipd_gpu.AgipdGpuRunner(
     pixels_x,
@@ -32,29 +39,79 @@ kernel_runner = agipd_gpu.AgipdGpuRunner(
     output_data_dtype=output_dtype,
 )
 
+
 def thresholding_cpu(data, cell_table, thresholds):
     # get to memory_cell, x, y
-    raw_gain = data[:, 1, ...].astype(np.float32)
+    raw_gain = data[:, 1, ...].astype(corr_dtype)
     # get to threshold, memory_cell, x, y
-    thresholds = np.transpose(thresholds, (3, 2, 1, 0))[:, cell_table]
+    thresholds = np.transpose(thresholds)[:, cell_table]
     res = np.zeros((memory_cells, pixels_x, pixels_y), dtype=np.uint8)
     res[raw_gain > thresholds[0]] = 1
     res[raw_gain > thresholds[1]] = 2
     return res
 
-kernel_runner.load_cell_table(cell_table)
-kernel_runner.load_data(raw_data)
-kernel_runner.load_thresholds(thresholds)
-kernel_runner.correct(agipd_gpu.CorrectionFlags.THRESHOLD)
-gpu_res = kernel_runner.gain_map_gpu.get()
 
-print(np.sum(gpu_res, axis=(0,2)))
+gain_map_cpu = thresholding_cpu(raw_data, cell_table, thresholds)
+
+
+def corr_offset_cpu(data, cell_table, gain_map, offset):
+    image_data = data[:, 0].astype(corr_dtype)
+    offset = np.transpose(offset)[:, cell_table]
+    return (image_data - np.choose(gain_map, offset)).astype(output_dtype)
+
+
+def corr_rel_gain_pc_cpu(data, cell_table, gain_map, slopes_pc):
+    slopes_pc = slopes_pc.astype(np.float32)
+    pc_high_m = slopes_pc[0]
+    pc_high_I = slopes_pc[1]
+    pc_med_m = slopes_pc[3]
+    pc_med_I = slopes_pc[4]
+    frac_high_med = pc_high_m / pc_med_m
+    md_additional_offset = pc_high_I - pc_med_I * frac_high_med
+    rel_gain_map = np.ones((3, pixels_x, pixels_y, memory_cells), dtype=np.float32)
+    rel_gain_map[0] = 1  # rel xray gain can come after
+    rel_gain_map[1] = rel_gain_map[0] * np.transpose(frac_high_med)
+    rel_gain_map[2] = rel_gain_map[1] * 4.48
+    res = data[:, 0].astype(corr_dtype, copy=True)
+    res *= np.choose(gain_map, np.transpose(rel_gain_map, (0, 3, 1, 2)))
+    pixels_in_medium_gain = gain_map == 1
+    res[pixels_in_medium_gain] += np.transpose(md_additional_offset, (0, 2, 1))[
+        pixels_in_medium_gain
+    ]
+    return res
 
-def test_only_thresholding():
+
+def test_thresholding():
     kernel_runner.load_cell_table(cell_table)
     kernel_runner.load_data(raw_data)
     kernel_runner.load_thresholds(thresholds)
     kernel_runner.correct(agipd_gpu.CorrectionFlags.THRESHOLD)
     gpu_res = kernel_runner.gain_map_gpu.get()
-    cpu_res = thresholding_cpu(raw_data, cell_table, thresholds)
+    assert np.allclose(gpu_res, gain_map_cpu)
+
+
+def test_offset():
+    kernel_runner.load_cell_table(cell_table)
+    kernel_runner.load_data(raw_data)
+    kernel_runner.load_thresholds(thresholds)
+    kernel_runner.load_offset_map(offset_map)
+    # have to do thresholding, otherwise all is treated as high gain
+    kernel_runner.correct(
+        agipd_gpu.CorrectionFlags.THRESHOLD | agipd_gpu.CorrectionFlags.OFFSET
+    )
+    cpu_res = corr_offset_cpu(raw_data, cell_table, gain_map_cpu, offset_map)
+    gpu_res = kernel_runner.processed_data_gpu.get()
+    assert np.allclose(gpu_res, cpu_res)
+
+
+def test_rel_gain_pc():
+    kernel_runner.load_cell_table(cell_table)
+    kernel_runner.load_data(raw_data)
+    kernel_runner.load_thresholds(thresholds)
+    kernel_runner.load_rel_gain_pc_map(slopes_pc_map)
+    kernel_runner.correct(
+        agipd_gpu.CorrectionFlags.THRESHOLD | agipd_gpu.CorrectionFlags.REL_GAIN_PC
+    )
+    cpu_res = corr_rel_gain_pc_cpu(raw_data, cell_table, gain_map_cpu, slopes_pc_map)
+    gpu_res = kernel_runner.processed_data_gpu.get()
     assert np.allclose(gpu_res, cpu_res)