diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 5189ae59f64f4cd001e4e7170f3eb3748c9059b8..7ae2b28ae72dca0aff0b83c2059f13f209dde184 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -67,7 +67,7 @@ automated_test:
     - python3 -m pip install ".[automated_test]"
     - echo "Running automated test. This can take sometime to finish depending on the test data."
     - echo "Given variables are REFERENCE=$REFERENCE, OUTPUT=$OUTPUT, DETECTORS=$DETECTORS, CALIBRATION=$CALIBRATION"
-    - python3 -m pytest ./tests/test_reference_runs --color yes --verbose --release-test --reference-folder /gpfs/exfel/d/cal_tst/reference_folder --out-folder /gpfs/exfel/data/scratch/xcaltst/test/$OUTPUT  --detectors $DETECTORS --calibration $CALIBRATION  --find-difference
+    - python3 -m pytest ./tests/test_reference_runs --color yes --verbose --release-test --reference-folder /gpfs/exfel/d/cal_tst/reference_folder --out-folder /gpfs/exfel/data/scratch/xcaltst/test/$OUTPUT  --detectors $DETECTORS --calibration $CALIBRATION
   timeout: 24 hours
 
 cython-editable-install-test:
diff --git a/notebooks/AGIPD/Characterize_AGIPD_Gain_Darks_NBC.ipynb b/notebooks/AGIPD/Characterize_AGIPD_Gain_Darks_NBC.ipynb
index cc60547a57f228e12bf9315ef3088fbf8ef8425c..76987d79519ae0f2642d001ec22fd94b05d5c8d3 100644
--- a/notebooks/AGIPD/Characterize_AGIPD_Gain_Darks_NBC.ipynb
+++ b/notebooks/AGIPD/Characterize_AGIPD_Gain_Darks_NBC.ipynb
@@ -145,17 +145,8 @@
     "\n",
     "run_numbers = [run_high, run_med, run_low]\n",
     "\n",
-    "creation_time=None\n",
-    "if use_dir_creation_date:\n",
-    "    creation_time = get_dir_creation_date(in_folder, run_high)\n",
-    "\n",
-    "print(f\"Using {creation_time} as creation time of constant.\")\n",
-    "\n",
     "run, prop, seq = run_prop_seq_from_path(in_folder)\n",
     "\n",
-    "# Read report path and create file location tuple to add with the injection\n",
-    "file_loc = f\"proposal:{prop} runs:{run_low} {run_med} {run_high}\"\n",
-    "\n",
     "report = get_report(metadata_folder)\n",
     "cal_db_interface = get_random_db_interface(cal_db_interface)\n",
     "print(f'Calibration database interface: {cal_db_interface}')\n",
@@ -271,6 +262,13 @@
     ")\n",
     "# Update run_numbers list in case it was sorted.\n",
     "run_numbers = agipd_ctrl_dark.runs\n",
+    "\n",
+    "creation_time = None\n",
+    "if use_dir_creation_date:\n",
+    "    creation_time = get_dir_creation_date(in_folder, run_numbers[0])\n",
+    "\n",
+    "print(f\"Using {creation_time} as creation time of constant.\")\n",
+    "\n",
     "if mem_cells == 0:\n",
     "    mem_cells = agipd_ctrl_dark.get_memory_cells()\n",
     "\n",
@@ -648,6 +646,8 @@
     "step_timer.start()\n",
     "\n",
     "md = None\n",
+    "# Location of source data, injected with the constants\n",
+    "file_loc = f\"proposal:{prop} runs:{' '.join([str(r) for r in reversed(run_numbers)])}\"\n",
     "\n",
     "for qm in res:\n",
     "    db_module = qm_dict[qm][\"db_module\"]\n",
diff --git a/tests/test_reference_runs/conftest.py b/tests/test_reference_runs/conftest.py
index ce3b9745888821e5e75fe4c6ef02e76a35047c56..c2c50991ea1806d3454a79a8169bc1f324cef2ec 100644
--- a/tests/test_reference_runs/conftest.py
+++ b/tests/test_reference_runs/conftest.py
@@ -38,15 +38,6 @@ def pytest_addoption(parser):
               "apply validation test on numerical data only."),
     )
 
-    parser.addoption(
-        "--find-difference",
-        action="store_true",
-        default=False,
-        help=(
-            "In case of non numerical validation of h5file. "
-            "Find the different attribute and fail on the first one."),
-    )
-
     parser.addoption(
         "--use-slurm",
         action="store_true",
@@ -80,8 +71,6 @@ def release_test_config(request):
         "--no-numerical-validation")
     validate_only = request.config.getoption(
         "--validation-only")
-    find_difference = request.config.getoption(
-        "--find-difference")
     use_slurm = request.config.getoption(
         "--use-slurm")
     picked_test = request.config.getoption("--picked-test")
@@ -90,8 +79,7 @@ def release_test_config(request):
     return (
         detectors, calibration, picked_test,
         skip_numerical_validation, validate_only,
-        find_difference, use_slurm, reference_folder,
-        out_folder,
+        use_slurm, reference_folder, out_folder,
     )
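With `--find-difference` removed, the `release_test_config` fixture now returns an eight-element tuple; a sketch of the unpacking order consumers should use (the values below are hypothetical stand-ins for the parsed options):

```python
# Hypothetical stand-in for what the fixture would return.
release_test_config = (
    ["all"],   # detectors
    ["all"],   # calibration
    None,      # picked_test
    False,     # skip_numerical_validation
    False,     # validate_only
    True,      # use_slurm
    "/gpfs/exfel/d/cal_tst/reference_folder",      # reference_folder
    "/gpfs/exfel/data/scratch/xcaltst/test/demo",  # out_folder (hypothetical)
)

(detectors, calibration, picked_test, skip_numerical_validation,
 validate_only, use_slurm, reference_folder, out_folder) = release_test_config
```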
 
 
diff --git a/tests/test_reference_runs/test_pre_deployment.py b/tests/test_reference_runs/test_pre_deployment.py
index c6b2f573ac7b00d081725b7731fdea7db721fb61..cdd8a0f015baca42d15d9f3aa123ff992786fbdb 100644
--- a/tests/test_reference_runs/test_pre_deployment.py
+++ b/tests/test_reference_runs/test_pre_deployment.py
@@ -1,11 +1,10 @@
-import hashlib
 import io
 import logging
 import multiprocessing
 import pathlib
-import tempfile
 import time
 from contextlib import redirect_stdout
+from dataclasses import dataclass
 from datetime import datetime
 from functools import partial
 from subprocess import PIPE, run
@@ -22,253 +21,122 @@ from .callab_tests import automated_test_config
 LOGGER = logging.getLogger(__name__)
 
 
-def file_md5(
-    tested_file: str,
-    block_size: int = 2 ** 20,
-) -> bytes:
-    """Generating MD5 checksum for a file.
+@dataclass
+class ComparisonResult:
+    filename: str
+    new_dsets: list
+    missing_dsets: list
+    changed_dsets: list
 
-    Args:
-        tested_file: File to be tested.
-        block_size (_type_, optional): Block size for reading the file.
-            Defaults to 2**20.
-    """
-    f = open(tested_file, "rb")
-    md5 = hashlib.md5()
-    while True:
-        data = f.read(block_size)
-        if not data:
-            break
-        md5.update(data)
-    f.close()
-    return md5.digest()
-
-
-def collect_attrs(groups, datasets, objects, exclude_attrs, name, node):
-    """Collect h5 attrs in groups, datasets, and objects lists."""
-    if node.name not in exclude_attrs:
-        if isinstance(node, h5py.Group):
-            groups.append(name)
-        elif isinstance(node, h5py.Dataset):
-            if node.dtype == 'object':
-                objects.append(name)
-            else:
-                datasets.append(name)
+    def found_differences(self):
+        return bool(self.new_dsets or self.missing_dsets or self.changed_dsets)
 
+    def show(self):
+        if not self.found_differences():
+            print(f"{self.filename} - ✓ no changes")
+            return
 
-def compare_datasets(
-    file1,
-    file2,
-    datasets: list
-):
-    """Compare the values of datasets in two h5 files."""
-    h5_diff = []
-    for d in datasets:
-        try:
-            if not np.allclose(file1[d][()], file2[d][()], equal_nan=True):
-                h5_diff.append(d)
-        except ValueError as e:
-            LOGGER.error(f"ValueError: {e}, {d}")
-            h5_diff.append(d)
-        except AttributeError as e:
-            LOGGER.error(f"AttributeError: {e}, {d}")
-            h5_diff.append(d)
-    return h5_diff
-
-
-def compare_objects(
-    file1,
-    file2,
-    objects: list
-):
-    """Compare the objects in two h5 files."""
-    h5_diff = []
-    for d in objects:
-        try:
-            if isinstance(file1[d][()], bytes):
-                if (
-                    file1[d][()].decode('utf-8') != file2[d][()].decode('utf-8')  # noqa
-                ):
-                    h5_diff.append(d)
-            elif (
-                file1[d][()].dtype != file1[d][()].dtype and
-                not file1[d][()] != file2[d][()]
-            ):  # pnccd files has only list of bytes
-                h5_diff.append(d)
-        except ValueError as e:
-            LOGGER.error(f"ValueError: {e}, {d}")
-            h5_diff.append(d)
-        except AttributeError as e:
-            LOGGER.error(f"AttributeError: {e}, {d}, "
-                         f"{file1[d][()].decode('utf-8')}")
-            h5_diff.append(d)
-    return h5_diff
-
-
-def find_differences(
-    test_file,
-    reference_file,
-    exclude_attrs,
-):
-    """
-    Find difference in groups, datasets, and objects between two h5files.
-    Args:
-        file1: first h5 file.
-        file2: second h5 file.
-    """
+        print(self.filename)
+        for ds in self.new_dsets:
+            print(f"  + NEW: {ds}")
+        for ds in self.missing_dsets:
+            print(f"  - MISSING: {ds}")
+        for ds, detail in self.changed_dsets:
+            print(f"  ~ CHANGED: {ds} ({detail})")
 
-    groups_f1 = []
-    datasets_f1 = []
-    objects_f1 = []
-
-    groups_f2 = []
-    datasets_f2 = []
-    objects_f2 = []
-
-    with h5py.File(test_file, 'r') as file1, h5py.File(reference_file, 'r') as file2:  # noqa
-
-        # Fill groups, datasets, and objects list
-        # to compare both h5files' attrs.
-        file1.visititems(
-            partial(
-                collect_attrs,
-                groups_f1,
-                datasets_f1,
-                objects_f1,
-                exclude_attrs,
-                ))
-        file2.visititems(
-            partial(
-                collect_attrs,
-                groups_f2,
-                datasets_f2,
-                objects_f2,
-                exclude_attrs,
-                ))
 
-        start_time = time.perf_counter()
-        # Compare groups, datasets, and objects to have the same content.
-        assert set(groups_f1) == set(groups_f2), f"{test_file} and {reference_file} consists of different groups."  # noqa
-        assert set(datasets_f1) == set(datasets_f2), f"{test_file} and {reference_file} consists of different datasets."  # noqa
-        assert set(objects_f1) == set(objects_f2), f"{test_file} and {reference_file} consists of different datasets."  # noqa
-        duration = time.perf_counter() - start_time
-        LOGGER.debug("Elapsed time comparing groups, "
-                    f"datasets, and objects: {duration} seconds")
-        LOGGER.debug("Groups, datasets, and objects have the same content.")
-
-        # Compare datasets and objects.
-        start_time = time.perf_counter()
-        h5_diff_datasets = compare_datasets(file1, file2, datasets_f1)
-        duration = time.perf_counter() - start_time
-        LOGGER.debug(f"Elapsed time comparing datasets: {duration} seconds")
-        start_time = time.perf_counter()
-        h5_diff_objects = compare_objects(file1, file2, objects_f1)
-        LOGGER.debug(f"Elapsed time comparing objects: {duration} seconds")
-
-        assert not h5_diff_datasets, f"{[d for d in h5_diff_datasets]} datasets contain different values for {test_file} and {reference_file}"  # noqa
-        LOGGER.debug("Datasets are validated.")
-        assert not h5_diff_objects, f"{[d for d in h5_diff_objects]} objects contain different values for {test_file} and {reference_file}"  # noqa
-        LOGGER.debug("Objects are validated.")
-
-
-def validate_files(
+def gather_dsets(f: h5py.File):
+    res = set()
+
+    def visitor(name, obj):
+        if isinstance(obj, h5py.Dataset):
+            res.add(name)
+
+    f.visititems(visitor)
+    return res
+
+
+def iter_sized_chunks(ds: h5py.Dataset, chunk_size: int):
+    """Make slices of the dataset along the first axis
+
+    Aims for chunk_size bytes per chunk."""
+    if ds.ndim == 0:  # Scalar
+        yield ()
+        return
+
+    # Rows per chunk: aim for chunk_size bytes, but read at least one row,
+    # even if a single row exceeds chunk_size.
+    row_bytes = ds.dtype.itemsize * int(np.prod(ds.shape[1:]))
+    chunk_l = max(chunk_size // row_bytes, 1)
+    for start in range(0, ds.shape[0], chunk_l):
+        yield slice(start, start + chunk_l)
+
+
+def validate_file(
     ref_folder: pathlib.PosixPath,
     out_folder: pathlib.PosixPath,
-    exclude_attrs: list,
-    test_file: pathlib.PosixPath,
-) -> Tuple[bool, pathlib.PosixPath]:
-    """Validate file similarities. Create temporary files to exclude
-    h5 attributes known to be different. e.g `report` for constants.
-    If both files are not identical, the function is able to loop over
-    both files and find and fail on the difference.
-
-    Args:
-        ref_folder: The reference folder for validating the files
-        out_folder: The output folder for the test constant files.
-        test_file: The output file to be validated.
-        exclude_attrs: A list of datasets, groups to exclude
-          from validated files.
-    Returns:
-        result: validation result for metadata.
-        test_file: The validated file.
-    """
-    import h5py
-    start_validating = time.perf_counter()
-
-    def exclude_sources(source_file, dest, excluded_sources):
-        # Open the source file in read-only mode
-        with h5py.File(source_file, 'r') as source:
-
-            # Recursively visit all objects in the source file
-            def visit_func(name, obj):
-                # Check if the object should be excluded
-                if name in excluded_sources:
-                    return
-
-                # Check if the object is a dataset
-                if isinstance(obj, h5py.Dataset):
-                    # Create a new dataset in the destination
-                    # file and copy the data
-                    dest.create_dataset(name, data=obj[()])
-
-            # Visit all objects in the source file and
-            # copy them to the destination file
-            source.visititems(visit_func)
-
-    with tempfile.NamedTemporaryFile(
-        dir=out_folder,
-        suffix=".tmp",
-        prefix="cal_",
-        delete=True,
-        ) as out_tf, tempfile.NamedTemporaryFile(
-            dir=out_folder,
-            suffix=".tmp",
-            prefix="cal_",
-            delete=True,
-            ) as ref_tf:
-
-        # Create in-memory HDF5 files for validation
-        with h5py.File(out_tf.name, 'a') as hp1, h5py.File(ref_tf.name, 'a') as hp2:  # noqa
-
-            start_time = time.perf_counter()
-            # Copy h5 files for validation and exclude selected attrs.
-            exclude_sources(test_file, hp1, exclude_attrs)
-
-            duration = time.perf_counter() - start_time
-            LOGGER.debug(f"Elapsed time copying {test_file}: "
-                        f"{duration} seconds")
-
-            start_time = time.perf_counter()
-            exclude_sources(ref_folder / test_file.name, hp2, exclude_attrs)
-
-            duration = time.perf_counter() - start_time
-            LOGGER.debug(f"Elapsed time copying {ref_folder / test_file.name}: "
-                        f"{duration} seconds")
-
-            start_time = time.perf_counter()
-            result = file_md5(out_tf.name) == file_md5(ref_tf.name)
-            LOGGER.debug(f"MD5 validation for {test_file}: {duration} seconds")
-    duration = time.perf_counter() - start_validating
-    return result, test_file
+    exclude_dsets: set,
+    test_file: str,
+) -> ComparisonResult:
+    ref_file = ref_folder / test_file
+    out_file = out_folder / test_file
+    with h5py.File(ref_file) as fref, h5py.File(out_file) as fout:
+        ref_dsets = gather_dsets(fref)
+        out_dsets = gather_dsets(fout)
+        changed = []
+        for dsname in sorted((ref_dsets & out_dsets) - exclude_dsets):
+            ref_ds = fref[dsname]
+            out_ds = fout[dsname]
+            if out_ds.shape != ref_ds.shape:
+                changed.append((
+                    dsname, f"Shape: {ref_ds.shape} -> {out_ds.shape}"
+                ))
+            elif out_ds.dtype != ref_ds.dtype:
+                changed.append((
+                    dsname, f"Dtype: {ref_ds.dtype} -> {out_ds.dtype}"
+                ))
+            else:
+                floaty = np.issubdtype(ref_ds.dtype, np.floating) \
+                        or np.issubdtype(ref_ds.dtype, np.complexfloating)
+
+                # Compare data incrementally rather than loading it all at once;
+                # read in blocks of ~64 MB (arbitrary limit) along first axis.
+                for chunk_slice in iter_sized_chunks(ref_ds, 64 * 1024 * 1024):
+                    ref_chunk = ref_ds[chunk_slice]
+                    out_chunk = out_ds[chunk_slice]
+                    if floaty:
+                        eq = np.allclose(ref_chunk, out_chunk, equal_nan=True)
+                    else:
+                        eq = np.array_equal(ref_chunk, out_chunk)
+                    if not eq:
+                        # If just 1 entry, show the values
+                        if ref_ds.size == 1:
+                            r, o = np.squeeze(ref_chunk), np.squeeze(out_chunk)
+                            changed.append((dsname, f"Value: {r} -> {o}"))
+                        else:
+                            changed.append((dsname, "Data changed"))
+                        break
+
+    return ComparisonResult(
+        test_file,
+        new_dsets=sorted(out_dsets - ref_dsets),
+        missing_dsets=sorted(ref_dsets - out_dsets),
+        changed_dsets=changed,
+    )
 
 
 def parse_config(
-    cmd: List[str],
-    config: Dict[str, Any],
-    out_folder: str
+        cmd: List[str], config: Dict[str, Any], out_folder: str
 ) -> List[str]:
     """Convert a dictionary to a list of arguments.
 
-       Values that are not strings will be cast.
-       Lists will be converted to several strings following their `--key`
-       flag.
-       Booleans will be converted to a `--key` flag, where `key` is the
-       dictionary key.
+    Values that are not strings will be cast.
+    Lists will be converted to several strings following their `--key`
+    flag.
+    Booleans will be converted to a `--key` flag, where `key` is the
+    dictionary key.
     """
 
     for key, value in config.items():
-        if ' ' in key or (isinstance(value, str) and ' ' in value):
-            raise ValueError('Spaces are not allowed', key, value)
+        if " " in key or (isinstance(value, str) and " " in value):
+            raise ValueError("Spaces are not allowed", key, value)
 
         if isinstance(value, list):
             cmd.append(f"--{key}")
@@ -286,11 +154,9 @@ def parse_config(
 
 
 def validate_hdf5_files(
-    test_key: str,
     out_folder: pathlib.Path,
     reference_folder: pathlib.Path,
     cal_type: str,
-    find_difference: bool
 ):
     """Apply HDF5 data validation.
 
@@ -301,59 +167,48 @@ def validate_hdf5_files(
           the reference data to validate against
         cal_type (str): The type of calibration processing.
           e.g. dark or correct.
-        find_difference (bool): A flag indicating a need to find the
-          difference between two files if tested data was
-          not identical to the reference data.
     """
-    # 3rd Check number of produced h5 files.
-    h5files = list(out_folder.glob("*.h5"))
-    expected_h5files = list(reference_folder.glob("*.h5"))
-    assert len(h5files) == len(expected_h5files), f"{test_key} failure, number of files are not as expected."  # noqa
-    LOGGER.info(f"{test_key}'s calibration h5files numbers are as expected.")
+    print("\n--- Compare HDF5 files  ----")
+    print("REF:", reference_folder)
+    print("NEW:", out_folder)
+    ok = True
+
+    result_h5files = {p.name for p in out_folder.glob("*.h5")}
+    ref_h5files = {p.name for p in reference_folder.glob("*.h5")}
+    missing_files = ref_h5files - result_h5files
+    if missing_files:
+        print("Files missing from result (*.h5):", ", ".join(missing_files))
+        ok = False
+    new_files = result_h5files - ref_h5files
+    if new_files:
+        print("New files in result (*.h5):", ", ".join(new_files))
+        ok = False
+
+    files_to_check = sorted(result_h5files & ref_h5files)
 
-    non_valid_files = []
     # Hard coded datasets to exclude from numerical validation.
     # These datasets are known to be updated every time.
     if cal_type.lower() == "correct":
-        exclude_attrs = ["METADATA/creationDate", "METADATA/updateDate"]
+        exclude_attrs = {"METADATA/creationDate", "METADATA/updateDate"}
     else:
-        exclude_attrs = ["report"]
+        exclude_attrs = {"report"}
 
-    # 4th check that test and reference h5files are identical.
-    _validate_files = partial(
-        validate_files,
+    _validate_file = partial(
+        validate_file,
         reference_folder,
         out_folder,
         exclude_attrs,
     )
-    with multiprocessing.pool.ThreadPool(processes=8) as executor:
-        result = executor.map(_validate_files, h5files)
-
-    # Collect non-valid files, if any, to display them in the error message.
-    for valid, file in result:
-        if not valid:
-            non_valid_files.append(file)
-
-    if len(non_valid_files) > 0:
-        if find_difference:
-            LOGGER.error(f"Found non valid files: {non_valid_files}. "
-                         f"Checking differences for {non_valid_files[0]}")
-            find_differences(
-                non_valid_files[0],
-                reference_folder / non_valid_files[0].name,
-                exclude_attrs
-                )
-            LOGGER.info(f"No difference found for {non_valid_files[0]}")
-        else:
-            assert len(non_valid_files) == 0, f"{test_key} failure, while validating metadata for {non_valid_files}"  # noqa
-            LOGGER.info(f"{test_key}'s calibration h5files"
-                        " are validated successfully.")
+    with multiprocessing.Pool(processes=8) as pool:
+        for comparison in pool.imap(_validate_file, files_to_check):
+            comparison.show()
+            if comparison.found_differences():
+                ok = False
 
+    return ok
 
-def slurm_watcher(
-    test_key: str,
-    std_out: str
-):
+
+def slurm_watcher(test_key: str, std_out: str):
     """
     Watch for submitted slurm jobs and wait for them to finish.
     After they finish apply first test and check
@@ -380,19 +235,25 @@ def slurm_watcher(
         res = run(cmd, stdout=PIPE)
         states = res.stdout.decode().split("\n")[2:-1]
 
-        if not any(s.strip() in [
-            "COMPLETING",
-            "RUNNING",
-            "CONFIGURING",
-            "PENDING",
-        ] for s in states):
+        if not any(
+            s.strip()
+            in [
+                "COMPLETING",
+                "RUNNING",
+                "CONFIGURING",
+                "PENDING",
+            ]
+            for s in states
+        ):
             slurm_watcher = False
         else:
             time.sleep(2)
 
     # 1st check that all jobs were COMPLETED without errors.
     states = res.stdout.decode().split("\n")[2:-1]
-    assert all(s.strip() == "COMPLETED" for s in states), f"{test_key} failure, calibration jobs were not completed. {jobids}: {states}"  # noqa
+    assert all(
+        s.strip() == "COMPLETED" for s in states
+    ), f"{test_key} failure, calibration jobs were not completed. {jobids}: {states}"  # noqa
     LOGGER.info(f"{test_key}'s jobs were COMPLETED")
 
 
@@ -403,10 +264,9 @@ def slurm_watcher(
     ids=list(automated_test_config.keys()),
 )
 def test_xfel_calibrate(
-    test_key: str, val_dict: dict,
-    release_test_config: Tuple[bool, bool, bool, bool]
+        test_key: str, val_dict: dict, release_test_config: Tuple
 ):
-    """ Test xfel calibrate detectors and calibrations written
+    """Test xfel calibrate detectors and calibrations written
     in the given callab_test YAML file.
     Args:
         test_key : Key for the xfel-calibrate test.
@@ -416,9 +276,14 @@ def test_xfel_calibrate(
     """
 
     (
-        detectors, calibration, picked_test,
-        skip_numerical_validation, only_validate, find_difference,
-        use_slurm, reference_dir_base, out_dir_base,
+        detectors,
+        calibration,
+        picked_test,
+        skip_numerical_validation,
+        only_validate,
+        use_slurm,
+        reference_dir_base,
+        out_dir_base,
     ) = release_test_config
 
     cal_type = val_dict["cal_type"]
@@ -426,10 +291,9 @@ def test_xfel_calibrate(
 
     if not picked_test:
         # Skip non-selected detectors
-        if (
-            detectors != ["all"] and
-            det_type.lower() not in [d.lower() for d in detectors]
-        ):
+        if detectors != ["all"] and det_type.lower() not in [
+            d.lower() for d in detectors
+        ]:
             pytest.skip()
 
         # Skip non-selected calibration
@@ -444,32 +308,35 @@ def test_xfel_calibrate(
     cal_conf = val_dict["config"]
 
     out_folder = pathlib.Path(cal_conf["out-folder"].format(
-        out_dir_base, cal_conf["karabo-id"], test_key))
-    reference_folder = pathlib.Path(val_dict["reference-folder"].format(
-        reference_dir_base, cal_conf["karabo-id"], test_key))
+        out_dir_base, cal_conf["karabo-id"], test_key
+    ))
+    reference_folder = pathlib.Path(
+        val_dict["reference-folder"].format(
+            reference_dir_base, cal_conf["karabo-id"], test_key
+        )
+    )
 
-    report_name = (
-        out_folder /
-        f"{test_key}_{datetime.now().strftime('%y%m%d_%H%M%S')}")
+    report_name = out_folder / f"{test_key}_{datetime.now():%y%m%d_%H%M%S}"
 
     cal_conf["report-to"] = str(report_name)
 
     cmd = parse_config(cmd, cal_conf, out_folder)
 
     if only_validate:
-        validate_hdf5_files(
-            test_key,
-            out_folder,
-            reference_folder,
-            cal_type,
-            find_difference,
-            )
+        assert validate_hdf5_files(
+            out_folder, reference_folder, cal_type
+        ), "HDF5 files changed - see details above"
         return
 
     if not use_slurm:  # e.g. for Gitlab CI.
         cmd += ["--no-cluster-job"]
 
-    cmd += ["--slurm-name", test_key, "--cal-db-interface", "tcp://max-exfl-cal001:8015#8045"]
+    cmd += [
+        "--slurm-name",
+        test_key,
+        "--cal-db-interface",
+        "tcp://max-exfl-cal001:8015#8045",
+    ]
     f = io.StringIO()
     LOGGER.info(f"Submitting CL: {cmd}")
     with redirect_stdout(f):
@@ -495,10 +362,6 @@ def test_xfel_calibrate(
 
     # Stop tests at this point, if desired.
     if not skip_numerical_validation:
-        validate_hdf5_files(
-            test_key,
-            out_folder,
-            reference_folder,
-            cal_type,
-            find_difference,
-        )
+        assert validate_hdf5_files(
+            out_folder, reference_folder, cal_type
+        ), "HDF5 files changed - see details above"