From d8b2b7e7a2503c01443ea1dccb2ba36049c6c0bf Mon Sep 17 00:00:00 2001
From: Steffen Hauf <haufs@max-exfl027.desy.de>
Date: Fri, 16 Nov 2018 15:06:45 +0100
Subject: [PATCH] Move checksum tests to h5 fletcher32

---
 .gitignore                                 |  1 +
 cal_tools/cal_tools/agipdlib.py            | 55 +++++++------
 cal_tools/cal_tools/lpdlib.py              | 39 +++++++---
 docs/source/index.rst                      |  1 +
 docs/source/testing.rst                    | 90 ++++++++++++++++++++++
 notebooks/LPD/LPD_Correct_and_Verify.ipynb |  2 +-
 tests/correction_base.py                   | 80 +++++++++++--------
 7 files changed, 197 insertions(+), 71 deletions(-)
 create mode 100644 docs/source/testing.rst

diff --git a/.gitignore b/.gitignore
index d21e742f4..968dac8ff 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,4 @@ docs/source/available_notebooks.rst
 docs/build
 docs/source/_notebooks/*
 docs/source/test_rsts/*
+docs/source/test_results.rst
diff --git a/cal_tools/cal_tools/agipdlib.py b/cal_tools/cal_tools/agipdlib.py
index bb17e4393..121f91c84 100644
--- a/cal_tools/cal_tools/agipdlib.py
+++ b/cal_tools/cal_tools/agipdlib.py
@@ -1018,20 +1018,18 @@ class AgipdCorrections:
 
         # sanitize indices
         for do in ["image", ]:
-            existing = np.squeeze(
-                self.infile[idx_base + "{}/first".format(do)])
-            updated = existing - np.min(existing[valid])
-            self.outfile[idx_base + "{}/first".format(do)] = updated
-
-            existing = np.squeeze(
-                self.infile[idx_base + "{}/count".format(do)])
-            new_counts, _ = np.histogram(alltrains[firange],
-                                         bins=np.count_nonzero(valid) + 1,
-                                         range=(np.min(idxtrains[valid]),
-                                                np.max(idxtrains[valid] + 1)))
-            updated = existing
-            updated[valid] = new_counts[:-1]
-            self.outfile[idx_base + "{}/count".format(do)] = updated
+            uq, fidx, cnts = np.unique(alltrains[firange], return_index=True,
+                                       return_counts=True)
+            self.outfile.create_dataset(idx_base + "{}/first".format(do),
+                                        fidx.shape,
+                                        dtype=fidx.dtype,
+                                        data=fidx,
+                                        fletcher32=True)
+            self.outfile.create_dataset(idx_base + "{}/count".format(do),
+                                        cnts.shape,
+                                        dtype=cnts.dtype,
+                                        data=cnts,
+                                        fletcher32=True)
 
     def create_output_datasets(self):
         """ Initialize output data sets
@@ -1043,26 +1041,35 @@ class AgipdCorrections:
         chunks = (chunksize, oshape[1], oshape[2])
         self.ddset = self.outfile.create_dataset(agipd_base + "image/data",
                                                  oshape, chunks=chunks,
-                                                 dtype=np.float32)
+                                                 dtype=np.float32,
+                                                 fletcher32=True)
         self.gdset = self.outfile.create_dataset(agipd_base + "image/gain",
                                                  oshape, chunks=chunks,
                                                  dtype=np.uint8,
                                                  compression="gzip",
                                                  compression_opts=1,
-                                                 shuffle=True)
+                                                 shuffle=True,
+                                                 fletcher32=True)
         self.mdset = self.outfile.create_dataset(agipd_base + "image/mask",
                                                  oshape, chunks=chunks,
                                                  dtype=np.uint32,
                                                  compression="gzip",
                                                  compression_opts=1,
-                                                 shuffle=True)
-
-        fsz = firange.size
-        self.outfile[agipd_base + "image/cellId"] = np.zeros(fsz, np.uint16)
-        self.outfile[agipd_base + "image/trainId"] = np.zeros(fsz, np.uint64)
-        self.outfile[agipd_base + "image/pulseId"] = np.zeros(fsz, np.uint64)
-        self.outfile[agipd_base + "image/status"] = np.zeros(fsz, np.uint16)
-        self.outfile[agipd_base + "image/length"] = np.zeros(fsz, np.uint32)
+                                                 shuffle=True,
+                                                 fletcher32=True)
+
+        fsz = firange.shape
+
+        self.outfile.create_dataset(agipd_base + "image/cellId", fsz,
+                                    dtype=np.uint16, fletcher32=True)
+        self.outfile.create_dataset(agipd_base + "image/trainId", fsz,
+                                    dtype=np.uint64, fletcher32=True)
+        self.outfile.create_dataset(agipd_base + "image/pulseId", fsz,
+                                    dtype=np.uint64, fletcher32=True)
+        self.outfile.create_dataset(agipd_base + "image/status", fsz,
+                                    dtype=np.uint16, fletcher32=True)
+        self.outfile.create_dataset(agipd_base + "image/length", fsz,
+                                    dtype=np.uint32, fletcher32=True)
 
         self.outfile.flush()
 
     def get_histograms(self):
diff --git a/cal_tools/cal_tools/lpdlib.py b/cal_tools/cal_tools/lpdlib.py
index 9b0a5c35a..2a7c18469 100644
--- a/cal_tools/cal_tools/lpdlib.py
+++ b/cal_tools/cal_tools/lpdlib.py
@@ -391,8 +391,16 @@ class LpdCorrections:
         for do in ["image", ]:
             uq, fidx, cnts = np.unique(alltrains[firange], return_index=True,
                                        return_counts=True)
-            self.outfile[idx_base + "{}/first".format(do)] = fidx  # updated
-            self.outfile[idx_base + "{}/count".format(do)] = cnts  # updated
+            self.outfile.create_dataset(idx_base + "{}/first".format(do),
+                                        fidx.shape,
+                                        dtype=fidx.dtype,
+                                        data=fidx,
+                                        fletcher32=True)
+            self.outfile.create_dataset(idx_base + "{}/count".format(do),
+                                        cnts.shape,
+                                        dtype=cnts.dtype,
+                                        data=cnts,
+                                        fletcher32=True)
 
     def create_output_datasets(self):
         """ Initialize output data sets
@@ -404,26 +412,33 @@ class LpdCorrections:
         chunks = (chunksize, oshape[1], oshape[2])
         self.ddset = self.outfile.create_dataset(lpdbase + "image/data",
                                                  oshape, chunks=chunks,
-                                                 dtype=np.float32)
+                                                 dtype=np.float32,
+                                                 fletcher32=True)
         self.gdset = self.outfile.create_dataset(lpdbase + "image/gain",
                                                  oshape, chunks=chunks,
                                                  dtype=np.uint8,
                                                  compression="gzip",
                                                  compression_opts=1,
-                                                 shuffle=True)
+                                                 shuffle=True,
+                                                 fletcher32=True)
         self.mdset = self.outfile.create_dataset(lpdbase + "image/mask",
                                                  oshape, chunks=chunks,
                                                  dtype=np.uint32,
                                                  compression="gzip",
                                                  compression_opts=1,
-                                                 shuffle=True)
-        frs = firange.size
-        self.outfile[lpdbase + "image/cellId"] = np.zeros(frs, np.uint16)
-        self.outfile[lpdbase + "image/trainId"] = np.zeros(frs, np.uint64)
-        self.outfile[lpdbase + "image/pulseId"] = np.zeros(frs, np.uint64)
-        self.outfile[lpdbase + "image/status"] = np.zeros(frs, np.uint16)
-        self.outfile[lpdbase + "image/length"] = np.zeros(frs, np.uint32)
-        self.outfile.flush()
+                                                 shuffle=True,
+                                                 fletcher32=True)
+        fsz = firange.shape
+        self.outfile.create_dataset(lpdbase + "image/cellId", fsz,
+                                    dtype=np.uint16, fletcher32=True)
+        self.outfile.create_dataset(lpdbase + "image/trainId", fsz,
+                                    dtype=np.uint64, fletcher32=True)
+        self.outfile.create_dataset(lpdbase + "image/pulseId", fsz,
+                                    dtype=np.uint64, fletcher32=True)
+        self.outfile.create_dataset(lpdbase + "image/status", fsz,
+                                    dtype=np.uint16, fletcher32=True)
+        self.outfile.create_dataset(lpdbase + "image/length", fsz,
+                                    dtype=np.uint32, fletcher32=True)
 
     def get_histograms(self):
         """ Return preview histograms computed from the first chunk
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 953f6460d..1853a96de 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -19,6 +19,7 @@ Contents:
    advanced
    tutorial
    _notebooks/index
+   testing
    test_results
diff --git a/docs/source/testing.rst b/docs/source/testing.rst
new file mode 100644
index 000000000..3c6efff83
--- /dev/null
+++ b/docs/source/testing.rst
@@ -0,0 +1,90 @@
+Testing
+*******
+
+The `tests` directory contains tests which ensure that updates to
+notebooks and libraries do not result in unintended changes to
+notebook output. This keeps correction results consistent across
+versions.
+
+.. note::
+
+    Tests can be quite resource-intensive and should thus be run
+    on a dedicated cluster node, allocated using `salloc`.
+
+Running Tests
++++++++++++++
+
+Before running tests, commit your changes so that the test run
+can be assigned to that commit::
+
+    git add ...
+    git commit -m "Added test section to docs"
+
+To run all tests, navigate to the `tests` directory and execute::
+
+    python -m unittest discover
+
+This will usually entail executing a notebook under test via SLURM
+first, and then checking its output against the most recently
+committed artifacts of that test type.
+
+When running individual tests, e.g. for debugging, additional options
+are available to skip certain tests or the notebook execution::
+
+    python test_XXX.py --help
+
+where `test_XXX.py` is the name of the test. This prints a list of
+the options available for that test.
+
+If all tests pass, you can commit and push your updates. If there
+are failures, either review your changes or, if the changes are
+intended, generate new artifacts.
+
+.. note::
+
+    Running tests will generate entries for test reports in the
+    artifacts directory under the most recent commit.
+    Reviewers should check that such updates are present in the
+    list of changed files.
+
+
+Generating New Artifacts
+++++++++++++++++++++++++
+
+If an update intends to change the output, the tests can be used to
+generate new artifacts against which subsequent tests will then run.
+
+First, commit the changes for which you want to produce new
+artifacts::
+
+    git add ...
+    git commit -m "AGIPD corrections handle baseline shift"
+
+In contrast to running the tests, new artifacts need to be generated
+for each affected test individually::
+
+    python test_XXX.py --generate
+
+replacing `test_XXX.py` with the test you'd like to run. This will
+execute the notebook, create artifact entries in the artifact
+directory, and then check for consistency by executing the test
+against these artifacts (the artifact format is shown in the sketch
+at the end of this section). This last part is important: the test
+should not fail on its own input. If it does, something is very
+likely wrong!
+
+After the artifacts have been created and the tests using them have
+passed, commit the new artifacts and create a merge request for your
+branch::
+
+    git add tests/artifacts/
+    git commit -m "Added new artifacts for changes related to baseline shifts"
+
+Please also explain in the MR description why the artifacts have
+changed.
+
+.. note::
+
+    Reviewers should always question whether a change in test
+    artifacts is appropriate and intended.
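+
+For reference, the `.checksum` artifacts written by the tests are
+pickled dictionaries mapping each HDF5 dataset path to a flag that
+records whether the dataset was written with a Fletcher32 checksum.
+A minimal sketch for inspecting one such artifact (the file name is
+a placeholder)::
+
+    import pickle
+
+    # load the {dataset path: Fletcher32 flag} dictionary
+    with open("CORR-R0044-LPD00-S00000.h5.checksum", "rb") as fc:
+        checksums = pickle.load(fc)
+
+    for path, has_checksum in sorted(checksums.items()):
+        print(path, has_checksum)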
+
+Test Reports
+++++++++++++
+
diff --git a/notebooks/LPD/LPD_Correct_and_Verify.ipynb b/notebooks/LPD/LPD_Correct_and_Verify.ipynb
index 9646183bc..1ba6621ce 100644
--- a/notebooks/LPD/LPD_Correct_and_Verify.ipynb
+++ b/notebooks/LPD/LPD_Correct_and_Verify.ipynb
@@ -25,7 +25,7 @@
     "run = 44 # runs to process, required\n",
     "out_folder = \"/gpfs/exfel/data/scratch/haufs/test/\" # the folder to output to, required\n",
     "calfile = \"/gpfs/exfel/exp/FXE/201831/p900038/usr/calibration0818/cal_constants2.h5\" # path to constants extracted from the db into a file\n",
-    "sequences = [0] # sequences to correct, set to -1 for all, range allowed\n",
+    "sequences = [-1] # sequences to correct, set to -1 for all, range allowed\n",
     "mem_cells = 512 # memory cells in data\n",
     "overwrite = True # set to True if existing data should be overwritten\n",
     "no_relative_gain = False # do not do relative gain correction\n",
diff --git a/tests/correction_base.py b/tests/correction_base.py
index 6efb9b39d..810134fad 100644
--- a/tests/correction_base.py
+++ b/tests/correction_base.py
@@ -78,7 +78,9 @@ def get_artifact_dir(cls):
     print("Last git commit is: {}".format(last_commit))
     test_name = cls.__name__
     art_base = os.path.realpath(args.artefact_dir)
-    path = "{}/{}/{}".format(art_base, last_commit, test_name)
+    suffix = "G" if _do_generate() else "T"
+    path = "{}/{}/{}_{}".format(art_base, last_commit,
+                                test_name, suffix)
     print("Artifacts will be placed in: {}".format(path))
     return path
 
@@ -92,7 +94,7 @@ def get_artifact_comp_dir(cls):
     test_name = cls.__name__
     art_base = os.path.realpath(args.artefact_dir)
     for commit in r.iter_commits():
-        path = "{}/{}/{}".format(art_base, commit, test_name)
+        path = "{}/{}/{}_G".format(art_base, commit, test_name)
         if os.path.exists(path):
             return path
     msg = ("Could not find any previous artifacts for test {}"
@@ -319,61 +321,71 @@ class CorrectionTestBase:
                 break
             sleep(10)
 
-    def _create_md5(self, fname):
-        """ Create an MD5 checksum for the file at `fname`.
-        """
-        print("Creating MD5 checksum of: {}".format(fname))
-        hash_md5 = hashlib.md5()
-        with open(fname, "rb") as f:
-            for chunk in iter(lambda: f.read(4096), b""):
-                hash_md5.update(chunk)
-        return hash_md5.hexdigest()
-
     @unittest.skipUnless(_do_generate() and not args.skip_checksum,
                         "Artifact generation is not requested")
    def test_generate_checksums(self):
-        """ Generate MD5 checksums of output files from notebook
+        """ Record Fletcher32 checksum flags of the notebook's output files
         """
        out_folder = self._output_to_path()
        files_to_check = glob.glob(
-            "{}/*{}".format(out_folder, self.rel_file_ext))
+            "{}/*{}".format(out_folder, self.rel_file_ext))
+
        for fname in files_to_check:
-            m5 = self._create_md5(fname)
-            m5fname = "{}.md5".format(fname)
-            m5path = "{}/{}".format(self.artifact_dir,
-                                    os.path.basename(m5fname))
-            with open(m5path, "w") as f:
-                f.write(m5)
+
+            with h5py.File(fname, "r") as f:
+                d = {}
+
+                # record for each dataset whether it was written with
+                # HDF5's Fletcher32 checksum filter enabled
+                def visitor(k, item):
+                    if isinstance(item, h5py.Dataset):
+                        d[k] = item.fletcher32
+
+                f.visititems(visitor)
+
+            chkfname = "{}.checksum".format(fname)
+            chkpath = "{}/{}".format(self.artifact_dir,
+                                     os.path.basename(chkfname))
+            with open(chkpath, 'wb') as fc:
+                pickle.dump(d, fc, pickle.HIGHEST_PROTOCOL)
 
     @unittest.skipIf(args.skip_checksum,
                      "User requested to skip checksum test")
     def test_checksums(self):
-        """ Compare MD5 checksums of notebook's output files with artifacts
+        """ Compare Fletcher32 checksums of notebook's output with artifacts
 
-        This test will verify if output files are identical. Even for
-        small changes in the correction logic this test is likely to fail.
+        This test will verify that datasets with checksums are identical.
+        Even for small changes in the correction logic this test is likely
+        to fail.
 
         If this is the case, it is recommended to verify correctness
         using the other tests, which inspect data, and then create new
         checksums using the --generate option.
 
         If no previous checksum exists for a given file the test for that
-        file will fail.
+        file will fail. It will also fail if a dataset previously did not
+        have a checksum assigned.
         """
         out_folder = self._output_to_path()
         files_to_check = glob.glob(
             "{}/*{}".format(out_folder, self.rel_file_ext))
 
         for fname in files_to_check:
-            m5fname = "{}.md5".format(fname)
+            chkfname = "{}.checksum".format(fname)
             test_art_dir = get_artifact_comp_dir(self.__class__)
-            m5path = "{}/{}".format(test_art_dir, os.path.basename(m5fname))
-
-            with self.subTest(msg="Verifying against: {}".format(m5path)):
-                self.assertTrue(os.path.exists(m5path),
-                                "No comparison MD5 found")
-                m5 = self._create_md5(fname)
-                with open(m5path, "r") as f:
-                    m5test = f.read().strip()
-                self.assertEqual(m5.strip(), m5test)
+            chkpath = "{}/{}".format(test_art_dir, os.path.basename(chkfname))
+            with self.subTest(msg="Verifying against: {}".format(chkpath)):
+                self.assertTrue(os.path.exists(chkpath),
+                                "No comparison checksums found")
+                with open(chkpath, 'rb') as fc:
+                    d = pickle.load(fc)
+
+                with h5py.File(fname, "r") as f:
+
+                    # each dataset must be present in the artifact and
+                    # still carry its Fletcher32 checksum flag
+                    def visitor(k, item):
+                        if isinstance(item, h5py.Dataset):
+
+                            msg = "Verify checksum of: {}".format(k)
+                            with self.subTest(msg=msg):
+                                self.assertIn(k, d)
+                                self.assertEqual(d[k], item.fletcher32)
+
+                    f.visititems(visitor)
 
     @unittest.skipUnless(_do_generate() and not args.skip_histogram,
                          "Artifact generation is not requested")
-- 
GitLab
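
The dataset-level check this patch introduces can also be reproduced
standalone against any corrected file. A minimal sketch, assuming only
that `h5py` is installed; the file name is a placeholder::

    import h5py

    flags = {}

    def visitor(name, item):
        # Dataset.fletcher32 is True if the Fletcher32 checksum filter
        # was enabled when the dataset was written
        if isinstance(item, h5py.Dataset):
            flags[name] = item.fletcher32

    # placeholder file name; use any corrected output file
    with h5py.File("CORR-R0044-LPD00-S00000.h5", "r") as f:
        f.visititems(visitor)

    missing = [name for name, ok in sorted(flags.items()) if not ok]
    print("Datasets without Fletcher32 checksums:", missing or "none")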