From ed59542e873c34790d25adaed23420eb2bd9e174 Mon Sep 17 00:00:00 2001
From: ahmedk <karim.ahmed@xfel.eu>
Date: Tue, 8 Mar 2022 18:56:05 +0100
Subject: [PATCH] Remove source selection and select only instrument sources
 for the n_trains check

---
 .../Characterize_AGIPD_Gain_Darks_NBC.ipynb   | 34 ++++++-------------
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/notebooks/AGIPD/Characterize_AGIPD_Gain_Darks_NBC.ipynb b/notebooks/AGIPD/Characterize_AGIPD_Gain_Darks_NBC.ipynb
index be96463e0..32068cbac 100644
--- a/notebooks/AGIPD/Characterize_AGIPD_Gain_Darks_NBC.ipynb
+++ b/notebooks/AGIPD/Characterize_AGIPD_Gain_Darks_NBC.ipynb
@@ -209,43 +209,31 @@
     "# Create out_folder if it doesn't exist.\n",
     "Path(out_folder).mkdir(parents=True, exist_ok=True)\n",
     "\n",
-    "n_files = 0\n",
-    "total_file_sizes = 0\n",
     "max_trains_list = []\n",
-    "\n",
+    "file_sizes = []\n",
     "for run_dict in runs_dict.values():\n",
     "    missing_modules = []\n",
-    "    image_dc = run_dict[\"dc\"].select(f\"{karabo_id_control}*\", \"*\", require_all=True)\n",
     "    # This is important in case of no slurm parallelization over modules is done.\n",
     "    # (e.g. running notebook interactively)\n",
-    "    sources_l = [(f\"{karabo_id_control}*\", \"*\")]\n",
-    "    sources_l += [(instrument_src.format(m), \"*\") for m in modules]\n",
-    "    image_dc = run_dict[\"dc\"].select(sources_l, require_all=True)\n",
+    "    dc = run_dict[\"dc\"].select(\n",
+    "        [(instrument_src.format(m), \"*\") for m in modules],\n",
+    "        require_all=True\n",
+    "    )\n",
     "    # validate that there are trains and that data sources are\n",
     "    # present for any of the selected modules.\n",
-    "    if (\n",
-    "        len(image_dc.train_ids) == 0 or\n",
-    "        not np.any([\n",
-    "            karabo_id in s for s in run_dict[\"dc\"].select(sources_l, require_all=True).all_sources])  # noqa\n",
-    "    ):\n",
-    "        raise ValueError(f\"No images to process for run: {run_dict['number']}\")\n",
+    "    n_trains = len(dc.train_ids)\n",
     "\n",
-    "    max_trains_list.append(len(image_dc.train_ids))\n",
+    "    if n_trains == 0:\n",
+    "        raise ValueError(f\"No images to process for run: {run_dict['number']}\")\n",
     "\n",
-    "    # update run_dc with selected module sources\n",
-    "    run_dict[\"dc\"] = image_dc\n",
+    "    max_trains_list.append(n_trains)\n",
+    "    file_sizes += [os.path.getsize(f.filename) / 1e9 for f in dc.files]\n",
     "\n",
     "# Update modules and karabo_da lists based on available modules to processes.\n",
     "modules = [m for m in modules if m not in missing_modules]\n",
     "karabo_da = create_karabo_da_list(modules)\n",
     "\n",
-    "# Remodifing run data collections to display actual total files number and size. \n",
-    "for run_dict in runs_dict.values():\n",
-    "    file_sizes = [os.path.getsize(f.filename) / 1e9 for f in run_dict[\"dc\"].deselect(f\"{karabo_id_control}*\").files]\n",
-    "    total_file_sizes += sum(file_sizes)\n",
-    "    n_files += len(file_sizes)\n",
-    "\n",
-    "print(f\"Will process data in a total of {n_files} files ({total_file_sizes:.02f} GB).\")"
+    "print(f\"Will process data in a total of {len(file_sizes)} files ({sum(file_sizes):.02f} GB).\")"
    ]
   },
   {
-- 
GitLab