Skip to content
Snippets Groups Projects

[Tests] clearer comparison of HDF5 files

Merged Thomas Kluyver requested to merge test/compare-h5-files into master
@@ -41,8 +41,8 @@ class ComparisonResult:
print(f" + NEW: {ds}")
for ds in self.missing_dsets:
print(f" - MISSING: {ds}")
for ds in self.changed_dsets:
print(f" ~ CHANGED: {ds}")
for ds, detail in self.changed_dsets:
print(f" ~ CHANGED: {ds} ({detail})")
def gather_dsets(f: h5py.File):
@@ -56,6 +56,19 @@ def gather_dsets(f: h5py.File):
return res
def iter_sized_chunks(ds: h5py.Dataset, chunk_size: int):
"""Make slices of the dataset along the first axis
Aims for block_size bytes per block"""
if ds.ndim == 0: # Scalar
yield ()
return
chunk_l = min(chunk_size // (ds.dtype.itemsize * np.prod(ds.shape[1:])), 1)
for start in range(0, ds.shape[0], chunk_l):
yield slice(start, start + chunk_l)
def validate_file(
ref_folder: pathlib.PosixPath,
out_folder: pathlib.PosixPath,
@@ -69,18 +82,33 @@ def validate_file(
out_dsets = gather_dsets(fout)
changed = []
for dsname in sorted((ref_dsets & out_dsets) - exclude_dsets):
ref_arr = fref[dsname][()]
out_arr = fout[dsname][()]
if isinstance(ref_arr, np.ndarray) ^ isinstance(out_arr, np.ndarray):
eq = False # One is an array, the other not
elif isinstance(ref_arr, np.ndarray):
# Both arrays
eq = np.array_equal(ref_arr, out_arr, equal_nan=True)
ref_ds = fref[dsname]
out_ds = fout[dsname]
if out_ds.shape != ref_ds.shape:
changed.append((dsname, f"Shape: {ref_ds.shape} -> {out_ds.shape}")) # noqa
elif out_ds.dtype != ref_ds.dtype:
changed.append((dsname, f"Dtype: {ref_ds.dtype} -> {out_ds.dtype}")) # noqa
else:
# Both single values
eq = ref_arr == out_arr
if not eq:
changed.append(dsname)
floaty = np.issubdtype(ref_ds.dtype, np.floating) \
or np.issubdtype(ref_ds.dtype, np.complexfloating)
# Compare data incrementally rather than loading it all at once;
# read in blocks of ~64 MB (arbitrary limit) along first axis.
for chunk_slice in iter_sized_chunks(ref_ds, 64 * 1024 * 1024):
ref_chunk = ref_ds[chunk_slice]
out_chunk = out_ds[chunk_slice]
if floaty:
eq = np.allclose(ref_chunk, out_chunk, equal_nan=True)
else:
eq = np.array_equal(ref_chunk, out_chunk)
if not eq:
# If just 1 entry, show the values
if ref_ds.size == 1:
r, o = np.squeeze(ref_chunk), np.squeeze(out_chunk)
changed.append((dsname, f"Value: {r} -> {o}"))
else:
changed.append((dsname, "Data changed"))
break
return ComparisonResult(
test_file,
@@ -90,7 +118,9 @@ def validate_file(
)
def parse_config(cmd: List[str], config: Dict[str, Any], out_folder: str) -> List[str]:
def parse_config(
cmd: List[str], config: Dict[str, Any], out_folder: str
) -> List[str]:
"""Convert a dictionary to a list of arguments.
Values that are not strings will be cast.
@@ -233,7 +263,7 @@ def slurm_watcher(test_key: str, std_out: str):
list(automated_test_config.items()),
ids=list(automated_test_config.keys()),
)
def test_xfel_calibrate(test_key: str, val_dict: dict, release_test_config: Tuple):
def test_xfel_calibrate(test_key: str, val_dict: dict, release_test_config: Tuple): # noqa
"""Test xfel calibrate detectors and calibrations written
in the given callab_test YAML file.
Args:
@@ -275,16 +305,16 @@ def test_xfel_calibrate(test_key: str, val_dict: dict, release_test_config: Tupl
cal_conf = val_dict["config"]
out_folder = pathlib.Path(
cal_conf["out-folder"].format(out_dir_base, cal_conf["karabo-id"], test_key)
)
out_folder = pathlib.Path(cal_conf["out-folder"].format(
out_dir_base, cal_conf["karabo-id"], test_key
))
reference_folder = pathlib.Path(
val_dict["reference-folder"].format(
reference_dir_base, cal_conf["karabo-id"], test_key
)
)
report_name = out_folder / f"{test_key}_{datetime.now().strftime('%y%m%d_%H%M%S')}"
report_name = out_folder / f"{test_key}_{datetime.now().strftime('%y%m%d_%H%M%S')}" # noqa
cal_conf["report-to"] = str(report_name)
@@ -301,7+331,7 @@
if not use_slurm: # e.g. for Gitlab CI.
cmd += ["--no-cluster-job"]
cmd += [
"--slurm-name",
test_key,
Loading