From 93198ab89a701b42db775901b8bb4095c7f1d4aa Mon Sep 17 00:00:00 2001
From: Thomas Kluyver <thomas.kluyver@xfel.eu>
Date: Wed, 17 May 2023 16:26:18 +0200
Subject: [PATCH] Use EXtra-data in LPD-1M darks notebook
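
Replace the manual h5py file handling in the LPD-1M darks notebook with
EXtra-data's RunDirectory API. Each gain stage now reads a whole run
instead of a single sequence file, so the 'sequence' parameter and the
map_gain_stages() file-mapping step are removed. Empty trains are
dropped, at most 'ntrains' trains are used per gain stage (after
skipping 'skip_first_ntrains'), and a new 'min_trains' check fails
early if a run is too short.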

---
 notebooks/LPD/LPDChar_Darks_NBC.ipynb | 114 +++++++++-----------------
 1 file changed, 39 insertions(+), 75 deletions(-)
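
For reviewers: the rewritten characterize_module() boils down to the
EXtra-data access pattern sketched below. The run path and module
number are illustrative; the slice bounds mirror the notebook defaults
(skip_first_ntrains = 10, ntrains = 500).

    from extra_data import RunDirectory

    run = RunDirectory("/gpfs/exfel/exp/FXE/202304/p003338/raw/r0133")
    source = "FXE_DET_LPD1M-1/DET/0CH0:xtdf"  # karabo_id + module number

    # Select the image data, dropping trains with no frames, then
    # slice by train index and load the result as a NumPy array.
    data = run[source, "image.data"].drop_empty_trains()
    data = data[10 : 10 + 500]
    frames = data.ndarray()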

diff --git a/notebooks/LPD/LPDChar_Darks_NBC.ipynb b/notebooks/LPD/LPDChar_Darks_NBC.ipynb
index 3bdc0114f..602cc793d 100644
--- a/notebooks/LPD/LPDChar_Darks_NBC.ipynb
+++ b/notebooks/LPD/LPDChar_Darks_NBC.ipynb
@@ -25,7 +25,6 @@
     "in_folder = \"/gpfs/exfel/exp/FXE/202304/p003338/raw\" # path to input data, required\n",
     "out_folder = \"/gpfs/exfel/data/scratch/kluyvert/lpd-darks-p3338-r133-134-135/\" # path to output to, required\n",
     "metadata_folder = \"\"  # Directory containing calibration_metadata.yml when run by xfel-calibrate\n",
-    "sequence = 0 # sequence files to evaluate\n",
     "modules = [-1] # list of modules to evaluate, RANGE ALLOWED\n",
     "run_high = 133 # run number in which high gain data was recorded, required\n",
     "run_med = 134 # run number in which medium gain data was recorded, required\n",
@@ -33,10 +32,7 @@
     "\n",
     "karabo_id = \"FXE_DET_LPD1M-1\" # karabo karabo_id\n",
     "karabo_da = ['-1']  # a list of data aggregators names, Default [-1] for selecting all data aggregators\n",
-    "receiver_id = \"{}CH0\" # inset for receiver devices\n",
-    "path_template = 'RAW-R{:04d}-{}-S{:05d}.h5' # the template to use to access data\n",
-    "h5path = '/INSTRUMENT/{}/DET/{}:xtdf/image' # path in the HDF5 file to images\n",
-    "h5path_idx = '/INDEX/{}/DET/{}:xtdf/image' # path in the HDF5 file to images\n",
+    "source_name = \"{}/DET/{}CH0:xtdf\"  # Source name for raw detector data - filled with karabo_id & module number\n",
     "\n",
     "use_dir_creation_date = True # use the creation date of the directory for database time derivation\n",
     "cal_db_interface = \"tcp://max-exfl016:8015#8025\" # the database interface to use\n",
@@ -51,13 +47,14 @@
     "thresholds_offset_hard = [400, 1500] # bad pixel hard threshold\n",
     "thresholds_noise_sigma = 7. # bad pixel relative threshold in terms of n sigma noise\n",
     "thresholds_noise_hard = [1, 35] # bad pixel hard threshold\n",
-    "skip_first_ntrains = 10 # Number of first trains to skip\n",
+    "\n",
+    "ntrains = 500  # maximum number of trains to use in each gain stage\n",
+    "skip_first_ntrains = 10  # Number of first trains to skip\n",
+    "min_trains = 370  # minimum number of trains needed for each gain stage\n",
     "\n",
     "# Parameters for plotting\n",
     "skip_plots = False  # exit after writing corrected files\n",
     "\n",
-    "instrument = \"FXE\" # instrument name\n",
-    "ntrains = 100 # number of trains to use\n",
     "high_res_badpix_3d = False # plot bad-pixel summary in high resolution\n",
     "test_for_normality = False # permorm normality test\n",
     "inject_cell_order = False  # Include memory cell order as part of the detector condition\n",
@@ -81,7 +78,6 @@
     "warnings.filterwarnings('ignore')\n",
     "\n",
     "import dateutil.parser\n",
-    "import h5py\n",
     "import matplotlib\n",
     "import pasha as psh\n",
     "import scipy.stats\n",
@@ -98,6 +94,7 @@
     "from iCalibrationDB import Conditions, Constants, Detectors, Versions\n",
     "from XFELDetAna.plotting.heatmap import heatmapPlot\n",
     "from XFELDetAna.plotting.simpleplot import simplePlot\n",
+    "from extra_data import RunDirectory\n",
     "\n",
     "from cal_tools.enums import BadPixels\n",
     "from cal_tools.plotting import (\n",
@@ -116,6 +113,7 @@
     "    map_gain_stages,\n",
     "    module_index_to_qm,\n",
     "    parse_runs,\n",
+    "    reorder_axes,\n",
     "    run_prop_seq_from_path,\n",
     "    save_const_to_h5,\n",
     "    send_to_db,\n",
@@ -140,17 +138,8 @@
     "else:\n",
     "    modules = [int(x[-2:]) for x in karabo_da]\n",
     "\n",
-    "gain_runs = {\n",
-    "    \"high\": run_high,\n",
-    "    \"med\": run_med,\n",
-    "    \"low\": run_low,\n",
-    "}\n",
-    "\n",
     "capacitor_setting_s = f'{capacitor_setting}pf'\n",
     "\n",
-    "h5path = h5path.format(karabo_id, receiver_id)\n",
-    "h5path_idx = h5path_idx.format(karabo_id, receiver_id)\n",
-    "\n",
     "creation_time = None\n",
     "if use_dir_creation_date:\n",
     "    creation_time = get_dir_creation_date(in_folder, run_high)\n",
@@ -165,7 +154,6 @@
     "print(\"Proposal: {}\".format(prop))\n",
     "print(\"Memory cells: {}/{}\".format(mem_cells, max_cells))\n",
     "print(\"Runs: {}, {}, {}\".format(run_high, run_med, run_low))\n",
-    "print(\"Sequence: {}\".format(sequence))\n",
     "print(\"Using DB: {}\".format(db_output))\n",
     "print(\"Input: {}\".format(in_folder))\n",
     "print(\"Output: {}\".format(out_folder))\n",
@@ -173,18 +161,6 @@
     "print(f\"Capacitor setting: {capacitor_setting}pf\")"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# set everything up filewise\n",
-    "gmf = map_gain_stages(in_folder, gain_runs, path_template, karabo_da, [sequence])\n",
-    "gain_mapped_files, total_sequences, total_file_size = gmf\n",
-    "print(f\"Will process a total of {total_sequences} files.\")"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -202,41 +178,35 @@
     "parallel_num_threads = multiprocessing.cpu_count() // parallel_num_procs\n",
     "\n",
     "# the actual characterization\n",
-    "def characterize_module(filename, channel, gg):\n",
-    "\n",
-    "    def splitOffGainLPD(d):\n",
-    "        msk = np.zeros(d.shape, np.uint16)\n",
-    "        msk[...] = 0b0000111111111111\n",
-    "        data = np.bitwise_and(d, msk)\n",
-    "        msk[...] = 0b0011000000000000\n",
-    "        gain = np.bitwise_and(d, msk)//4096\n",
-    "        gain[gain > 2] = 2\n",
-    "        return data, gain\n",
-    "\n",
-    "    infile = h5py.File(filename, \"r\")\n",
+    "def characterize_module(run_path, channel, gg):\n",
+    "    run = RunDirectory(run_path)\n",
+    "    det_source = source_name.format(karabo_id, channel)\n",
+    "    data = run[det_source, 'image.data'].drop_empty_trains()\n",
+    "    data = data[skip_first_ntrains : skip_first_ntrains + ntrains]\n",
+    "    cell_ids = run[det_source, 'image.cellId'].drop_empty_trains()\n",
+    "    cell_ids = cell_ids[skip_first_ntrains : skip_first_ntrains + ntrains]\n",
     "    \n",
-    "    instrument_src = h5path.format(channel)\n",
-    "    index_src = h5path_idx.format(channel)\n",
-    "    count = infile[f\"{index_src}/count\"][()]\n",
-    "    first = infile[f\"{index_src}/first\"][()]\n",
-    "    valid = count != 0\n",
-    "    count, first = count[valid], first[valid]\n",
-    "    first_image = int(first[skip_first_ntrains] if first.shape[0] > skip_first_ntrains else 0)\n",
-    "    last_image = int(first_image + np.sum(count[skip_first_ntrains:skip_first_ntrains+ntrains]))\n",
-    "\n",
-    "    im = np.array(infile[\"{}/data\".format(instrument_src, channel)][first_image:last_image, ...])\n",
-    "    cellid = np.squeeze(np.array(infile[\"{}/cellId\".format(instrument_src, channel)][first_image:last_image, ...]))\n",
-    "    infile.close()\n",
-    "    if im.shape[0] == 0:  # No data\n",
-    "        return None, None, channel, gg, None, None, None, None\n",
-    "\n",
-    "    cellid_pattern = cellid[:count[0]]\n",
-    "\n",
-    "    im, g = splitOffGainLPD(im[:, 0, ...])\n",
-    "    im = im.astype(np.float32)\n",
+    "    if len(data.train_ids) < min_trains:\n",
+    "        raise Exception(f\"Run {run_path} only contains {len(data.train_ids)} trains, but {min_trains} required\")\n",
+    "\n",
+    "    im = data.ndarray()\n",
+    "    if im.ndim > 3:\n",
+    "        im = im[:, 0]  # Drop extra dimension\n",
+    "    \n",
+    "    cellid = cell_ids.ndarray()\n",
+    "    cellid_pattern = cell_ids[0].ndarray()\n",
+    "    if cellid.ndim > 1:\n",
+    "        cellid = cellid[:, 0]\n",
+    "        cellid_pattern = cellid_pattern[:, 0]\n",
+    "\n",
+    "    # Mask off gain bits, leaving only data\n",
+    "    im &= 0b0000111111111111\n",
     "\n",
-    "    im = np.rollaxis(im, 2)\n",
-    "    im = np.rollaxis(im, 2, 1)\n",
+    "    im = im.astype(np.float32)\n",
+    "    im = reorder_axes(im,\n",
+    "        from_order=('frames', 'slow_scan', 'fast_scan'),\n",
+    "        to_order=('fast_scan', 'slow_scan', 'frames'),\n",
+    "    )\n",
     "\n",
     "    context = psh.context.ThreadContext(num_workers=parallel_num_threads)\n",
     "    offset = context.alloc(shape=(im.shape[0], im.shape[1], max_cells), dtype=np.float64)\n",
@@ -295,19 +265,12 @@
     "# Should be the same cell order for all modules & all gain stages\n",
     "cellid_patterns_g = {}\n",
     "\n",
-    "gg = 0\n",
-    "inp = []\n",
     "\n",
-    "# This loop needs to get runs in the order high, medium, low\n",
-    "for gain, mapped_files in gain_mapped_files.items():\n",
+    "inp = []\n",
+    "for gg, run_num in enumerate([run_high, run_med, run_low]):\n",
+    "    run_path = os.path.join(in_folder, f\"r{run_num:04d}\")\n",
     "    for i in modules:\n",
-    "        qm = module_index_to_qm(i)\n",
-    "        if qm in mapped_files and not mapped_files[qm].empty():\n",
-    "            fname_in = mapped_files[qm].get()\n",
-    "            print(\"Process file: \", fname_in)\n",
-    "            inp.append((fname_in, i, gg))\n",
-    "\n",
-    "    gg+=1\n",
+    "        inp.append((run_path, i, gg))\n",
     "\n",
     "with multiprocessing.Pool(processes=parallel_num_procs) as pool:\n",
     "    results = pool.starmap(characterize_module, inp)\n",
@@ -321,6 +284,7 @@
     "    qm = module_index_to_qm(i)\n",
     "    if qm not in offset_g:\n",
     "        offset_g[qm] = np.zeros(offset.shape[:3] + (3,))\n",
+    "        print(\"Constant shape:\", offset_g[qm].shape)\n",
     "        noise_g[qm] = np.zeros_like(offset_g[qm])\n",
     "        badpix_g[qm] = np.zeros_like(offset_g[qm], dtype=np.uint32)\n",
     "        data_g[qm] = np.full((ntrains, 3), np.nan)\n",
-- 
GitLab