Skip to content
Snippets Groups Projects

[Tests] clearer comparison of HDF5 files

Merged Thomas Kluyver requested to merge test/compare-h5-files into master
4 unresolved threads
1 file
+ 103
46
Compare changes
  • Side-by-side
  • Inline
@@ -6,6 +6,7 @@ import pathlib
import tempfile
import time
from contextlib import redirect_stdout
from dataclasses import dataclass
from datetime import datetime
from functools import partial
from subprocess import PIPE, run
@@ -252,23 +253,83 @@ def validate_files(
return result, test_file
def parse_config(
cmd: List[str],
config: Dict[str, Any],
out_folder: str
) -> List[str]:
@dataclass
class ComparisonResult:
    """Outcome of comparing one HDF5 file against its reference.

    The three lists hold dataset paths (names inside the HDF5 file).
    """
    filename: str
    new_dsets: list  # reported as "+ NEW"
    missing_dsets: list  # reported as "- DEL"
    changed_dsets: list  # reported as "~ DIF"

    def found_differences(self):
        """Return True if any dataset is new, missing or changed."""
        return bool(self.new_dsets or self.missing_dsets or self.changed_dsets)

    def show(self):
        """Print a human-readable summary of this comparison to stdout.

        A file without differences gets a single "no changes" line; otherwise
        the filename is printed as a header followed by one line per dataset.
        """
        if not self.found_differences():
            print(f"{self.filename} - ✓ no changes")
            # Stop here: falling through would print the filename a second
            # time as an (empty) differences header.
            return

        print(self.filename)
        for ds in self.new_dsets:
            print(f" + NEW {ds}")
        for ds in self.missing_dsets:
            print(f" - DEL {ds}")
        for ds in self.changed_dsets:
            print(f" ~ DIF {ds}")
def gather_dsets(f: h5py.File):
    """Return the set of paths of all datasets found in an open HDF5 file.

    Groups are walked but not included; only ``h5py.Dataset`` objects
    contribute their name to the result.
    """
    dataset_names = set()

    def _collect(path, node):
        # Must return None so that visititems() keeps walking the file.
        if not isinstance(node, h5py.Dataset):
            return
        dataset_names.add(path)

    f.visititems(_collect)
    return dataset_names
def validate_file(
    ref_folder: pathlib.PosixPath,
    out_folder: pathlib.PosixPath,
    exclude_dsets: set,
    test_file: str
) -> ComparisonResult:
    """Compare one produced HDF5 file against its reference file.

    Datasets are matched by their path inside the file. Every dataset present
    in both files, and not listed in ``exclude_dsets``, is read completely and
    compared by value; NaNs in matching positions count as equal.

    Args:
        ref_folder: Folder containing the reference file.
        out_folder: Folder containing the newly produced file.
        exclude_dsets: Dataset paths to leave out of the value comparison.
            They are still reported if they are new or missing.
        test_file: File name, relative to both folders.

    Returns:
        A ComparisonResult with sorted lists of new, missing and changed
        dataset paths.
    """
    ref_file = ref_folder / test_file
    out_file = out_folder / test_file

    # Open read-only explicitly: a comparison must never create or modify a
    # file, whatever default mode the installed h5py version uses.
    with h5py.File(ref_file, "r") as fref, h5py.File(out_file, "r") as fout:
        ref_dsets = gather_dsets(fref)
        out_dsets = gather_dsets(fout)

        changed = []
        for dsname in sorted((ref_dsets & out_dsets) - exclude_dsets):
            ref_arr = fref[dsname][()]
            out_arr = fout[dsname][()]
            ref_is_array = isinstance(ref_arr, np.ndarray)
            out_is_array = isinstance(out_arr, np.ndarray)

            if ref_is_array != out_is_array:
                eq = False  # One is an array, the other not
            elif ref_is_array:
                # Both arrays
                eq = np.array_equal(ref_arr, out_arr, equal_nan=True)
            else:
                # Both single values
                eq = (ref_arr == out_arr)
                if (
                    not eq
                    and isinstance(ref_arr, np.inexact)
                    and isinstance(out_arr, np.inexact)
                ):
                    # NaN != NaN, but the array branch treats matching NaNs
                    # as equal; keep scalar datasets consistent with that.
                    eq = bool(np.isnan(ref_arr) and np.isnan(out_arr))

            if not eq:
                changed.append(dsname)

    return ComparisonResult(
        test_file,
        new_dsets=sorted(out_dsets - ref_dsets),
        missing_dsets=sorted(ref_dsets - out_dsets),
        changed_dsets=changed
    )
def parse_config(cmd: List[str], config: Dict[str, Any], out_folder: str) -> List[str]:
"""Convert a dictionary to a list of arguments.
Values that are not strings will be cast.
Lists will be converted to several strings following their `--key`
flag.
Booleans will be converted to a `--key` flag, where `key` is the
dictionary key.
Values that are not strings will be cast.
Lists will be converted to several strings following their `--key`
flag.
Booleans will be converted to a `--key` flag, where `key` is the
dictionary key.
"""
for key, value in config.items():
if ' ' in key or (isinstance(value, str) and ' ' in value):
raise ValueError('Spaces are not allowed', key, value)
if " " in key or (isinstance(value, str) and " " in value):
raise ValueError("Spaces are not allowed", key, value)
if isinstance(value, list):
cmd.append(f"--{key}")
@@ -290,7 +351,7 @@ def validate_hdf5_files(
out_folder: pathlib.Path,
reference_folder: pathlib.Path,
cal_type: str,
find_difference: bool
find_difference: bool,
):
"""Apply HDF5 data validation.
@@ -305,49 +366,45 @@ def validate_hdf5_files(
difference between two files if tested data was
not identical to the reference data.
"""
# 3rd Check number of produced h5 files.
h5files = list(out_folder.glob("*.h5"))
expected_h5files = list(reference_folder.glob("*.h5"))
assert len(h5files) == len(expected_h5files), f"{test_key} failure, number of files are not as expected." # noqa
LOGGER.info(f"{test_key}'s calibration h5files numbers are as expected.")
print("\n--- Compare HDF5 files ----")
print("REF:", reference_folder)
print("NEW:", out_folder)
ok = True
result_h5files = {p.name for p in out_folder.glob("*.h5")}
ref_h5files = {p.name for p in reference_folder.glob("*.h5")}
missing_files = ref_h5files - result_h5files
if missing_files:
print("Files missing from result (*.h5):", ", ".join(missing_files))
ok = False
new_files = result_h5files - ref_h5files
if new_files:
print("New files in result (*.h5):", ", ".join(new_files))
ok = False
files_to_check = sorted(result_h5files & ref_h5files)
non_valid_files = []
# Hard coded datasets to exclude from numerical validation.
# These datasets are known to be updated every time.
if cal_type.lower() == "correct":
exclude_attrs = ["METADATA/creationDate", "METADATA/updateDate"]
exclude_attrs = {"METADATA/creationDate", "METADATA/updateDate"}
else:
exclude_attrs = ["report"]
exclude_attrs = {"report"}
# 4th check that test and reference h5files are identical.
_validate_files = partial(
validate_files,
_validate_file = partial(
validate_file,
reference_folder,
out_folder,
exclude_attrs,
)
with multiprocessing.pool.ThreadPool(processes=8) as executor:
result = executor.map(_validate_files, h5files)
# Collect non-valid files, if any, to display them in the error message.
for valid, file in result:
if not valid:
non_valid_files.append(file)
if len(non_valid_files) > 0:
if find_difference:
LOGGER.error(f"Found non valid files: {non_valid_files}. "
f"Checking differences for {non_valid_files[0]}")
find_differences(
non_valid_files[0],
reference_folder / non_valid_files[0].name,
exclude_attrs
)
LOGGER.info(f"No difference found for {non_valid_files[0]}")
else:
assert len(non_valid_files) == 0, f"{test_key} failure, while validating metadata for {non_valid_files}" # noqa
LOGGER.info(f"{test_key}'s calibration h5files"
" are validated successfully.")
with multiprocessing.Pool(processes=8) as pool:
for comparison in pool.imap(_validate_file, files_to_check):
comparison.show()
if comparison.found_differences():
ok = False
assert ok, "HDF5 files changed - see details above"
def slurm_watcher(
Loading