From d8b2b7e7a2503c01443ea1dccb2ba36049c6c0bf Mon Sep 17 00:00:00 2001
From: Steffen Hauf <haufs@max-exfl027.desy.de>
Date: Fri, 16 Nov 2018 15:06:45 +0100
Subject: [PATCH] Move checksum tests to h5 fletcher32

---
 .gitignore                                 |  1 +
 cal_tools/cal_tools/agipdlib.py            | 55 +++++++------
 cal_tools/cal_tools/lpdlib.py              | 39 +++++++---
 docs/source/index.rst                      |  1 +
 docs/source/testing.rst                    | 90 ++++++++++++++++++++++
 notebooks/LPD/LPD_Correct_and_Verify.ipynb |  2 +-
 tests/correction_base.py                   | 80 +++++++++++--------
 7 files changed, 197 insertions(+), 71 deletions(-)
 create mode 100644 docs/source/testing.rst

diff --git a/.gitignore b/.gitignore
index d21e742f4..968dac8ff 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,4 @@ docs/source/available_notebooks.rst
 docs/build
 docs/source/_notebooks/*
 docs/source/test_rsts/*
+docs/source/test_results.rst
diff --git a/cal_tools/cal_tools/agipdlib.py b/cal_tools/cal_tools/agipdlib.py
index bb17e4393..121f91c84 100644
--- a/cal_tools/cal_tools/agipdlib.py
+++ b/cal_tools/cal_tools/agipdlib.py
@@ -1018,20 +1018,18 @@ class AgipdCorrections:
 
         # sanitize indices
         for do in ["image", ]:
-            existing = np.squeeze(
-                self.infile[idx_base + "{}/first".format(do)])
-            updated = existing - np.min(existing[valid])
-            self.outfile[idx_base + "{}/first".format(do)] = updated
-
-            existing = np.squeeze(
-                self.infile[idx_base + "{}/count".format(do)])
-            new_counts, _ = np.histogram(alltrains[firange],
-                                         bins=np.count_nonzero(valid) + 1,
-                                         range=(np.min(idxtrains[valid]),
-                                                np.max(idxtrains[valid] + 1)))
-            updated = existing
-            updated[valid] = new_counts[:-1]
-            self.outfile[idx_base + "{}/count".format(do)] = updated
+            uq, fidx, cnts = np.unique(alltrains[firange], return_index=True,
+                                       return_counts=True)
+            self.outfile.create_dataset(idx_base + "{}/first".format(do),
+                                        fidx.shape,
+                                        dtype=fidx.dtype,
+                                        data=fidx,
+                                        fletcher32=True)
+            self.outfile.create_dataset(idx_base + "{}/count".format(do),
+                                        cnts.shape,
+                                        dtype=cnts.dtype,
+                                        data=cnts,
+                                        fletcher32=True)
 
     def create_output_datasets(self):
         """ Initialize output data sets
@@ -1043,26 +1041,35 @@ class AgipdCorrections:
         chunks = (chunksize, oshape[1], oshape[2])
         self.ddset = self.outfile.create_dataset(agipd_base + "image/data",
                                                  oshape, chunks=chunks,
-                                                 dtype=np.float32)
+                                                 dtype=np.float32,
+                                                 fletcher32=True)
         self.gdset = self.outfile.create_dataset(agipd_base + "image/gain",
                                                  oshape, chunks=chunks,
                                                  dtype=np.uint8,
                                                  compression="gzip",
                                                  compression_opts=1,
-                                                 shuffle=True)
+                                                 shuffle=True,
+                                                 fletcher32=True)
         self.mdset = self.outfile.create_dataset(agipd_base + "image/mask",
                                                  oshape, chunks=chunks,
                                                  dtype=np.uint32,
                                                  compression="gzip",
                                                  compression_opts=1,
-                                                 shuffle=True)
-
-        fsz = firange.size
-        self.outfile[agipd_base + "image/cellId"] = np.zeros(fsz, np.uint16)
-        self.outfile[agipd_base + "image/trainId"] = np.zeros(fsz, np.uint64)
-        self.outfile[agipd_base + "image/pulseId"] = np.zeros(fsz, np.uint64)
-        self.outfile[agipd_base + "image/status"] = np.zeros(fsz, np.uint16)
-        self.outfile[agipd_base + "image/length"] = np.zeros(fsz, np.uint32)
+                                                 shuffle=True,
+                                                 fletcher32=True)
+
+        fsz = firange.shape
+        
+        self.outfile.create_dataset(agipd_base + "image/cellId", fsz,
+                                    dtype=np.uint16, fletcher32=True)
+        self.outfile.create_dataset(agipd_base + "image/trainId", fsz,
+                                    dtype=np.uint64, fletcher32=True)
+        self.outfile.create_dataset(agipd_base + "image/pulseId", fsz,
+                                    dtype=np.uint64, fletcher32=True)
+        self.outfile.create_dataset(agipd_base + "image/status", fsz,
+                                    dtype=np.uint16, fletcher32=True)
+        self.outfile.create_dataset(agipd_base + "image/length", fsz,
+                                    dtype=np.uint32, fletcher32=True)
         self.outfile.flush()
 
     def get_histograms(self):
diff --git a/cal_tools/cal_tools/lpdlib.py b/cal_tools/cal_tools/lpdlib.py
index 9b0a5c35a..2a7c18469 100644
--- a/cal_tools/cal_tools/lpdlib.py
+++ b/cal_tools/cal_tools/lpdlib.py
@@ -391,8 +391,16 @@ class LpdCorrections:
         for do in ["image", ]:
             uq, fidx, cnts = np.unique(alltrains[firange], return_index=True,
                                        return_counts=True)
-            self.outfile[idx_base + "{}/first".format(do)] = fidx  # updated
-            self.outfile[idx_base + "{}/count".format(do)] = cnts  # updated
+            self.outfile.create_dataset(idx_base + "{}/first".format(do),
+                                        fidx.shape,
+                                        dtype=fidx.dtype,
+                                        data=fidx,
+                                        fletcher32=True)
+            self.outfile.create_dataset(idx_base + "{}/count".format(do),
+                                        cnts.shape,
+                                        dtype=cnts.dtype,
+                                        data=cnts,
+                                        fletcher32=True)
 
     def create_output_datasets(self):
         """ Initialize output data sets
@@ -404,26 +412,33 @@ class LpdCorrections:
         chunks = (chunksize, oshape[1], oshape[2])
         self.ddset = self.outfile.create_dataset(lpdbase + "image/data",
                                                  oshape, chunks=chunks,
-                                                 dtype=np.float32)
+                                                 dtype=np.float32,
+                                                 fletcher32=True)
         self.gdset = self.outfile.create_dataset(lpdbase + "image/gain",
                                                  oshape, chunks=chunks,
                                                  dtype=np.uint8,
                                                  compression="gzip",
                                                  compression_opts=1,
-                                                 shuffle=True)
+                                                 shuffle=True,
+                                                 fletcher32=True)
         self.mdset = self.outfile.create_dataset(lpdbase + "image/mask",
                                                  oshape, chunks=chunks,
                                                  dtype=np.uint32,
                                                  compression="gzip",
                                                  compression_opts=1,
-                                                 shuffle=True)
-        frs = firange.size
-        self.outfile[lpdbase + "image/cellId"] = np.zeros(frs, np.uint16)
-        self.outfile[lpdbase + "image/trainId"] = np.zeros(frs, np.uint64)
-        self.outfile[lpdbase + "image/pulseId"] = np.zeros(frs, np.uint64)
-        self.outfile[lpdbase + "image/status"] = np.zeros(frs, np.uint16)
-        self.outfile[lpdbase + "image/length"] = np.zeros(frs, np.uint32)
-        self.outfile.flush()
+                                                 shuffle=True,
+                                                 fletcher32=True)
+        fsz = firange.shape
+        self.outfile.create_dataset(lpdbase + "image/cellId", fsz,
+                                    dtype=np.uint16, fletcher32=True)
+        self.outfile.create_dataset(lpdbase + "image/trainId", fsz,
+                                    dtype=np.uint64, fletcher32=True)
+        self.outfile.create_dataset(lpdbase + "image/pulseId", fsz,
+                                    dtype=np.uint64, fletcher32=True)
+        self.outfile.create_dataset(lpdbase + "image/status", fsz,
+                                    dtype=np.uint16, fletcher32=True)
+        self.outfile.create_dataset(lpdbase + "image/length", fsz,
+                                    dtype=np.uint32, fletcher32=True)
 
     def get_histograms(self):
         """ Return preview histograms computed from the first chunk
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 953f6460d..1853a96de 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -19,6 +19,7 @@ Contents:
    advanced
    tutorial
    _notebooks/index
+   testing
    test_results
 
 
diff --git a/docs/source/testing.rst b/docs/source/testing.rst
new file mode 100644
index 000000000..3c6efff83
--- /dev/null
+++ b/docs/source/testing.rst
@@ -0,0 +1,90 @@
+Testing
+*******
+
+The `tests` directory contains tests which ensure that updates to
+notebooks and libraries do not result in unintended changes to
+notebook output, keeping correction results consistent across
+versions.
+
+.. note::
+
+    Tests can be quite resource intensive, and thus should be run
+    on a dedicated cluster node, allocated using `salloc`.
+
+Running Tests
++++++++++++++
+
+Before you run tests, commit your changes, so that the test
+run can be assigned to that commit::
+
+    git add ...
+    git commit -m "Added test section to docs"
+
+To run all tests, navigate to the `tests` directory and execute::
+
+    python -m unittest discover
+
+This will usually entail executing a notebook under test via SLURM
+first, then checking its output against the last committed artifacts
+of that test type.
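+
+Under the hood, the checksum comparison relies on per-dataset HDF5
+Fletcher32 checksums, which the correction code now writes for its
+output datasets. The artifact stored for each output file is a pickled
+dictionary mapping dataset paths to their Fletcher32 flag. A minimal
+sketch of how such an artifact is produced (file names below are
+placeholders)::
+
+    import pickle
+
+    import h5py
+
+    def collect_checksum_flags(fname):
+        """Map each dataset path to whether Fletcher32 is enabled."""
+        flags = {}
+        with h5py.File(fname, "r") as f:
+            def visitor(name, item):
+                if isinstance(item, h5py.Dataset):
+                    flags[name] = item.fletcher32
+            f.visititems(visitor)
+        return flags
+
+    with open("CORR-EXAMPLE.h5.checksum", "wb") as fc:
+        pickle.dump(collect_checksum_flags("CORR-EXAMPLE.h5"), fc,
+                    pickle.HIGHEST_PROTOCOL)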
+
+If individual tests are run, e.g. for debugging, additional options
+exist to skip parts of a test or the notebook execution. Running::
+
+   python test_XXX.py --help
+
+with `test_XXX.py` replaced by the name of the test, will give you a
+list of the options available for that test.
+
+If all tests pass, you can commit and push your updates. If there are
+failures, either revise your changes or, if the changes in output are
+intended, generate new artifacts.
+
+.. note::
+
+    Running tests will generate entries for test reports in the
+    artifacts directory under the most recent commit, laid out as
+    sketched below. Reviewers should check that such updates are
+    present in the list of changed files.
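+
+Generated reference artifacts are grouped by commit hash and test class
+with a `_G` suffix, while plain test runs use a `_T` suffix, so the
+layout looks roughly like this (names below are placeholders)::
+
+    tests/artifacts/<commit-hash>/TestXXX_G/CORR-EXAMPLE.h5.checksum
+    tests/artifacts/<commit-hash>/TestXXX_T/...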
+
+
+Generating New Artifacts
+++++++++++++++++++++++++
+
+If an update intends to change output, the tests can be used to
+generate new artifacts against which subsequent tests will then run.
+
+First, commit the changes for which you want to produce new
+artifacts::
+
+    git add ...
+    git commit -m "AGIPD corrections handle baseline shift"
+
+In contrast to simply running the tests, new artifacts need to be
+generated for each affected test individually::
+
+    python test_XXX.py --generate
+    
+replacing `test_XXX.py` with the test you'd like to run. This
+will execute the notebook, create artifact entries in the artifact
+directory, and then check for consistency by executing the test
+against these artifacts. This last part is important: the test should
+not fail on its own input. If it does, something is very likely wrong!
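+
+The generated `.checksum` artifacts are the pickled dictionaries
+described above and can be inspected directly when debugging a
+failure. A minimal sketch (the artifact name is a placeholder)::
+
+    import pickle
+
+    # Placeholder name; real artifacts are named after the corrected
+    # output file with ".checksum" appended.
+    with open("CORR-EXAMPLE.h5.checksum", "rb") as fc:
+        flags = pickle.load(fc)
+
+    for path, has_checksum in sorted(flags.items()):
+        print(path, has_checksum)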
+
+After the artifacts are created and the tests using them have passed,
+commit the new artifacts and create a merge request for your branch::
+
+    git add tests/artifacts/
+    git commit -m "Added new artifacts for changes related to baseline shifts"
+    
+Please also add a comment to the MR description explaining why the
+artifacts have changed.
+
+.. note::
+
+    Reviewers should always question whether a change in test artifacts
+    is appropriate and intended.
+    
+Test Reports
+++++++++++++
+
diff --git a/notebooks/LPD/LPD_Correct_and_Verify.ipynb b/notebooks/LPD/LPD_Correct_and_Verify.ipynb
index 9646183bc..1ba6621ce 100644
--- a/notebooks/LPD/LPD_Correct_and_Verify.ipynb
+++ b/notebooks/LPD/LPD_Correct_and_Verify.ipynb
@@ -25,7 +25,7 @@
     "run = 44 # runs to process, required\n",
     "out_folder =  \"/gpfs/exfel/data/scratch/haufs/test/\" # the folder to output to, required\n",
     "calfile =  \"/gpfs/exfel/exp/FXE/201831/p900038/usr/calibration0818/cal_constants2.h5\" # path to constants extracted from the db into a file\n",
-    "sequences = [0] # sequences to correct, set to -1 for all, range allowed\n",
+    "sequences = [-1] # sequences to correct, set to -1 for all, range allowed\n",
     "mem_cells = 512 # memory cells in data\n",
     "overwrite = True # set to True if existing data should be overwritten\n",
     "no_relative_gain = False # do not do relative gain correction\n",
diff --git a/tests/correction_base.py b/tests/correction_base.py
index 6efb9b39d..810134fad 100644
--- a/tests/correction_base.py
+++ b/tests/correction_base.py
@@ -78,7 +78,9 @@ def get_artifact_dir(cls):
     print("Last git commit is: {}".format(last_commit))
     test_name = cls.__name__
     art_base = os.path.realpath(args.artefact_dir)
-    path = "{}/{}/{}".format(art_base, last_commit, test_name)
+    suffix = "G" if _do_generate else "T"
+    path = "{}/{}/{}_{}".format(art_base, last_commit,
+                                test_name, suffix)
     print("Artifacts will be placed in: {}".format(path))
     return path
 
@@ -92,7 +94,7 @@ def get_artifact_comp_dir(cls):
     test_name = cls.__name__
     art_base = os.path.realpath(args.artefact_dir)
     for commit in r.iter_commits():
-        path = "{}/{}/{}".format(art_base, commit, test_name)
+        path = "{}/{}/{}_G".format(art_base, commit, test_name)
         if os.path.exists(path):
             return path
     msg = ("Could not find any previous artifacts for test {}"
@@ -319,61 +321,71 @@ class CorrectionTestBase:
                 break
             sleep(10)
 
-    def _create_md5(self, fname):
-        """ Create an MD5 checksum for the file at `fname`.
-        """
-        print("Creating MD5 checksum of: {}".format(fname))
-        hash_md5 = hashlib.md5()
-        with open(fname, "rb") as f:
-            for chunk in iter(lambda: f.read(4096), b""):
-                hash_md5.update(chunk)
-        return hash_md5.hexdigest()
-
     @unittest.skipUnless(_do_generate() and not args.skip_checksum,
                          "Artifact generation is not requested")
     def test_generate_checksums(self):
-        """ Generate MD5 checksums of output files from notebook
+        """ Generate Fletcher32 checksums of output files from notebook
         """
         out_folder = self._output_to_path()
         files_to_check = glob.glob(
-            "{}/*{}".format(out_folder, self.rel_file_ext))
+            "{}/*{}".format(out_folder, self.rel_file_ext))  
+        
         for fname in files_to_check:
-            m5 = self._create_md5(fname)
-            m5fname = "{}.md5".format(fname)
-            m5path = "{}/{}".format(self.artifact_dir,
-                                    os.path.basename(m5fname))
-            with open(m5path, "w") as f:
-                f.write(m5)
+            
+            with h5py.File(fname, "r") as f:
+                d = {}
+                def visitor(k, item):
+                    if isinstance(item, h5py.Dataset):
+                        d[k] = item.fletcher32
+
+                f.visititems(visitor)
+                
+                chkfname = "{}.checksum".format(fname)
+                chkpath = "{}/{}".format(self.artifact_dir,
+                                         os.path.basename(chkfname))
+                with open(chkpath, 'wb') as fc:
+                    pickle.dump(d, fc, pickle.HIGHEST_PROTOCOL) 
 
     @unittest.skipIf(args.skip_checksum,
                      "User requested to skip checksum test")
     def test_checksums(self):
-        """ Compare MD5 checksums of notebook's output files with artifacts
+        """ Compare Fletcher32 checksums of notebook's output with artifacts
 
-        This test will verify if output files are identical. Even for
-        small changes in the correction logic this test is likely to fail.
+        This test will verify that datasets with checksums are identical.
+        Even for small changes in the correction logic this test is likely
+        to fail.
         If this is the case, it is recommended to verify correctness using
         the other tests, which inspect data, and the create new checksums
         using the --generate option.
 
         If no previous checksum exists for a given file the test for that
-        file will fail.
+        file will fail. It will also fail if a dataset previously did not
+        have a checksum assigned.
         """
         out_folder = self._output_to_path()
         files_to_check = glob.glob(
             "{}/*{}".format(out_folder, self.rel_file_ext))
         for fname in files_to_check:
-            m5fname = "{}.md5".format(fname)
+            chkfname = "{}.checksum".format(fname)
             test_art_dir = get_artifact_comp_dir(self.__class__)
-            m5path = "{}/{}".format(test_art_dir, os.path.basename(m5fname))
-
-            with self.subTest(msg="Verifying against: {}".format(m5path)):
-                self.assertTrue(os.path.exists(m5path),
-                                "No comparison MD5 found")
-                m5 = self._create_md5(fname)
-                with open(m5path, "r") as f:
-                    m5test = f.read().strip()
-                    self.assertEqual(m5.strip(), m5test)
+            chkpath = "{}/{}".format(test_art_dir, os.path.basename(chkfname))
+            with self.subTest(msg="Verifying against: {}".format(chkpath)):
+                self.assertTrue(os.path.exists(chkpath),
+                                "No comparison checksums found")
+                with open(chkpath, 'rb') as fc:
+                    d = pickle.load(fc)
+
+                    with h5py.File(fname, "r") as f:
+
+                        def visitor(k, item):
+                            if isinstance(item, h5py.Dataset):
+
+                                msg = "Verify checksum of: {}".format(k)
+                                with self.subTest(msg=msg):
+                                    self.assertIn(k, d)
+                                    self.assertEqual(d[k], item.fletcher32)
+
+                        f.visititems(visitor)
 
     @unittest.skipUnless(_do_generate() and not args.skip_histogram,
                          "Artifact generation is not requested")
-- 
GitLab