From cc8ebcd3e563da279aaa2a75d03bc1f490eb1d78 Mon Sep 17 00:00:00 2001
From: Danilo Ferreira de Lima <danilo.enoque.ferreira.de.lima@xfel.de>
Date: Mon, 28 Aug 2023 17:21:28 +0200
Subject: [PATCH] Added study of the effect of the number of channels.

---
 pes_to_spec/model.py                         |  10 +-
 pes_to_spec/test/channel_sensitivity.py      | 294 +++++++++++++++++++
 pes_to_spec/test/plot_channel_sensitivity.py |  89 ++++++
 run_tests_number_channels.sh                 |  53 ++++
 4 files changed, 442 insertions(+), 4 deletions(-)
 create mode 100755 pes_to_spec/test/channel_sensitivity.py
 create mode 100755 pes_to_spec/test/plot_channel_sensitivity.py
 create mode 100644 run_tests_number_channels.sh

diff --git a/pes_to_spec/model.py b/pes_to_spec/model.py
index 44c1428..3243074 100644
--- a/pes_to_spec/model.py
+++ b/pes_to_spec/model.py
@@ -260,12 +260,14 @@ class SelectRelevantLowResolution(TransformerMixin, BaseEstimator):
             raise NotImplementedError("The low-resolution data cannot be transformed before the prompt has been identified. Call the fit function first.")
         if pulse_spacing is None:
             pulse_spacing = {ch: [0] for ch in X.keys()}
-        y = X
+        y = {channel: item for channel, item in X.items()
+             if channel in self.channels}
         if self.delta_tof is not None:
             first = max(0, self.tof_start - self.delta_tof)
             last = min(X[self.channels[0]].shape[1], self.tof_start + self.delta_tof)
             y = {channel: np.stack([item[:, (first + delta):(last + delta)] for delta in pulse_spacing[channel]], axis=1)
-                 for channel, item in X.items()}
+                 for channel, item in X.items()
+                   if channel in self.channels}
         if not keep_dictionary_structure:
             selected = list(y.values())
             if pulse_energy is not None:
@@ -319,9 +321,9 @@ class SelectRelevantLowResolution(TransformerMixin, BaseEstimator):
         self.tof_start = self.estimate_prompt_peak(X)
         X_tr = self.transform(X, keep_dictionary_structure=True)
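+        # transform now keeps only the channels in self.channels, so the
+        # normalization statistics below are computed over X_tr rather than X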
         self.mean = {ch: np.mean(X_tr[ch], axis=0, keepdims=True)
-                     for ch in X.keys()}
+                     for ch in X_tr.keys()}
         self.std = {ch: np.std(X_tr[ch], axis=0, keepdims=True)
-                    for ch in X.keys()}
+                    for ch in X_tr.keys()}
         return self
 
     def debug_peak_finding(self, X: Dict[str, np.ndarray], filename: str):
diff --git a/pes_to_spec/test/channel_sensitivity.py b/pes_to_spec/test/channel_sensitivity.py
new file mode 100755
index 0000000..18cba12
--- /dev/null
+++ b/pes_to_spec/test/channel_sensitivity.py
@@ -0,0 +1,294 @@
+#!/usr/bin/env python
+
+import sys
+sys.path.append('.')
+sys.path.append('..')
+
+import os
+import argparse
+
+import numpy as np
+from extra_data import open_run, by_id, RunDirectory
+from pes_to_spec.model import Model, matching_ids
+
+# for helper plots
+from pes_to_spec.model import SelectRelevantLowResolution
+from sklearn.decomposition import PCA
+
+from itertools import product
+
+import pandas as pd
+from copy import deepcopy
+import scipy
+from scipy.signal import fftconvolve
+
+from typing import Dict, Optional
+
+from time import time_ns
+
+def get_gas(run, tids):
+    gas_sources = [
+                "SA3_XTD10_PES/DCTRL/V30300S_NITROGEN",
+                "SA3_XTD10_PES/DCTRL/V30310S_NEON",
+                "SA3_XTD10_PES/DCTRL/V30320S_KRYPTON",
+                "SA3_XTD10_PES/DCTRL/V30330S_XENON",
+            ]
+    gas_active = list()
+    for gas in gas_sources:
+        # check if this gas source is interlocked
+        if gas in run.all_sources and run[gas, "interlock.AActionState.value"].ndarray().sum() == 0:
+            # it is not, so this gas was used
+            gas_active += [gas.split("/")[-1].split("_")[-1]]
+    gas = "_".join(gas_active)
+    return gas
+
+
+def save_result(filename: str,
+                spec_pred: Dict[str, np.ndarray],
+                spec_smooth: np.ndarray,
+                spec_raw_pe: np.ndarray,
+                intensity: float,
+                #spec_raw_int: Optional[np.ndarray]=None,
+                pes: Optional[np.ndarray]=None,
+                pes_to_show: Optional[str]="",
+                first: Optional[int]=None,
+                last: Optional[int]=None,
+                ):
+    """
+    Save the prediction result and its uncertainty bands to a CSV file.
+
+    Args:
+      filename: Output file name.
+      spec_pred: Predicted result with uncertainty bands in a dictionary.
+      spec_smooth: Smoothed expected result with shape (features,).
+      spec_raw_pe: x axis with the photon energy in eV.
+      intensity: The XGM intensity in uJ.
+      pes: PES traces to store alongside the prediction.
+      pes_to_show: Name of the channel shown.
+      first: First sample of the selected PES window.
+      last: Last sample of the selected PES window.
+
+    """
+    unc_stat = spec_pred["unc"]
+    unc_pca = spec_pred["pca"]
+    unc = np.sqrt(unc_stat**2 + unc_pca**2)
+    df = pd.DataFrame(dict(energy=spec_raw_pe,
+                           spec=spec_smooth,
+                           prediction=spec_pred["expected"],
+                           unc=unc,
+                           unc_pca=unc_pca,
+                           unc_stat=unc_stat,
+                           beam_intensity=intensity*1e-3*np.ones_like(spec_raw_pe),
+                           deconvolved=spec_pred["deconvolved"]
+                           ))
+    df.to_csv(filename)
+    if pes is not None:
+        pes_data = deepcopy(pes)
+        pes_data['bin'] = np.arange(len(pes['channel_1_D']))
+        pes_data['first'] = first*np.ones_like(pes_data['bin'])
+        pes_data['last'] = last*np.ones_like(pes_data['bin'])
+        df = pd.DataFrame(pes_data)
+        df.to_csv(filename.replace('.pdf', '_pes.csv'))
+
+def save_pes_result(filename: str,
+                pes: Optional[np.ndarray]=None,
+                first: Optional[int]=None,
+                last: Optional[int]=None,
+                ):
+    """
+    Save the PES traces and the selected sample window to a CSV file.
+
+    Args:
+      filename: Output file name.
+      pes: PES traces for each channel.
+      first: First sample of the selected PES window.
+      last: Last sample of the selected PES window.
+
+    """
+    pes_data = deepcopy(pes)
+    pes_data['bin'] = np.arange(len(pes['channel_1_D']))
+    pes_data['first'] = first*np.ones_like(pes_data['bin'])
+    pes_data['last'] = last*np.ones_like(pes_data['bin'])
+    df = pd.DataFrame(pes_data)
+    df.to_csv(filename)
+
+def main():
+    """
+    Main entry point. Reads some data, trains and predicts.
+    """
+    parser = argparse.ArgumentParser(prog="offline_analysis", description="Test pes2spec doing an offline analysis of the data.")
+    parser.add_argument('-p', '--proposal', type=int, metavar='INT', help='Proposal number', default=2828)
+    parser.add_argument('-r', '--run', type=str, metavar='INT,INT,...', help='Run numbers, comma-separated.', default="206")
+    parser.add_argument('-t', '--test-run', type=int, metavar='INT', help='Run to test', default=None)
+    parser.add_argument('-d', '--directory', type=str, metavar='DIRECTORY', default=".", help='Where to save the results.')
+    parser.add_argument('-S', '--spec', type=str, metavar='NAME', default="SA3_XTD10_SPECT/MDL/SPECTROMETER_SCS_NAVITAR:output", help='SPEC name')
+    parser.add_argument('-P', '--pes', type=str, metavar='NAME', default="SA3_XTD10_PES/ADC/1:network", help='PES name')
+    parser.add_argument('-X', '--xgm', type=str, metavar='NAME', default="SA3_XTD10_XGM/XGM/DOOCS:output", help='XGM name')
+    parser.add_argument('-o', '--offset', type=int, metavar='INT', default=0, help='Train ID offset')
+    parser.add_argument('-c', '--xgm_cut', type=float, metavar='INTENSITY', default=0, help='XGM intensity threshold in uJ.')
+    parser.add_argument('-T', '--model-type', type=str, metavar='TYPE', default="ard", choices=["bnn", "bnn_rvm", "ridge", "ard"], help='Which model type to use.')
+    parser.add_argument('-w', '--weight', action="store_true", default=False, help='Whether to reweight data as a function of the pulse energy to make it invariant to that.')
+
+    args = parser.parse_args()
+
+    print("Opening run ...")
+    runs = args.run.split(',')
+    runs = [int(r) for r in runs]
+    # get run
+    run_list = [open_run(proposal=args.proposal, run=r) for r in runs]
+    run = run_list[0]
+    if len(run_list) > 1:
+        run = run.union(*run_list[1:])
+
+    run_test = run
+    other_run_test = False
+    if "test_run" in args and args.test_run is not None:
+        other_run_test = True
+        run_test = open_run(proposal=args.proposal, run=args.test_run)
+
+    spec_offset = args.offset
+    spec_name = args.spec
+    pes_name = args.pes
+    xgm_name = args.xgm
+
+    pes_tid = run[pes_name, "digitizers.trainId"].ndarray()
+    xgm_tid = run[xgm_name, "data.trainId"].ndarray()
+
+    spec_tid = spec_offset + run[spec_name, "data.trainId"].ndarray()
+    # train IDs common to SPEC, PES and XGM
+    # this could have been done with a select call on the RunDirectory, but that would not account for the spec_offset
+    tids = matching_ids(spec_tid, pes_tid, xgm_tid)
+
+    # read the spec photon energy and intensity
+    spec_raw_pe = run[spec_name, "data.photonEnergy"].select_trains(by_id[tids - spec_offset]).ndarray()
+    spec_raw_int = run[spec_name, "data.intensityDistribution"].select_trains(by_id[tids - spec_offset]).ndarray()
+
+
+    # reserve part of it for the test stage
+    train_tids = tids[:-10]
+    if other_run_test:
+        pes_tidt = run_test[pes_name, "digitizers.trainId"].ndarray()
+        xgm_tidt = run_test[xgm_name, "data.trainId"].ndarray()
+        spec_tidt = run_test[spec_name, "data.trainId"].ndarray()
+        test_tids = matching_ids(spec_tidt, pes_tidt, xgm_tidt)
+    else:
+        test_tids = tids
+    print(f"Number of train IDs: {len(train_tids)}")
+    print(f"Number of test IDs: {len(test_tids)}")
+
+    # read the PES data for each channel
+    channels = [f"channel_{i}_{l}"
+                for i, l in product([1,3,4], ["A", "B", "C", "D"])]
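+    # raw digitizer traces per channel, shape (trains, samples), for the training and test train IDs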
+    pes_raw = {ch: run[pes_name, f"digitizers.{ch}.raw.samples"].select_trains(by_id[tids]).ndarray()
+               for ch in channels}
+    pes_raw_t = {ch: run_test[pes_name, f"digitizers.{ch}.raw.samples"].select_trains(by_id[test_tids]).ndarray()
+                   for ch in channels}
+
+    # select test SPEC data
+    spec_raw_pe_t = run_test[spec_name, "data.photonEnergy"].select_trains(by_id[test_tids - spec_offset]).ndarray()
+    spec_raw_int_t = run_test[spec_name, "data.intensityDistribution"].select_trains(by_id[test_tids - spec_offset]).ndarray()
+
+    print("Data in memory.")
+
+    # read the XGM information
+    #xgm_pressure = run['SA3_XTD10_XGM/XGM/DOOCS', "pressure.pressureFiltered.value"].select_trains(by_id[tids]).ndarray()
+    #xgm_pe =  run['SA3_XTD10_XGM/XGM/DOOCS:output', "data.intensitySa3TD"].select_trains(by_id[tids]).ndarray()
+    #retvol_raw = run["SA3_XTD10_PES/MDL/DAQ_MPOD", "u212.value"].select_trains(by_id[tids]).ndarray()
+    #retvol_raw_timestamp = run["SA3_XTD10_PES/MDL/DAQ_MPOD", "u212.timestamp"].select_trains(by_id[tids]).ndarray()
+
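+    # per-train XGM pulse energy: take the first entry of the pulse-resolved intensity and keep it as a column vector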
+    xgm_flux =  run['SA3_XTD10_XGM/XGM/DOOCS:output', "data.intensitySa3TD"].select_trains(by_id[tids]).ndarray()[:, 0][:, np.newaxis]
+    xgm_flux_t =  run_test['SA3_XTD10_XGM/XGM/DOOCS:output', "data.intensitySa3TD"].select_trains(by_id[test_tids]).ndarray()[:, 0][:, np.newaxis]
+
+    print(f"Intensity in training: {np.mean(xgm_flux):.2e} +/- {np.std(xgm_flux):.2e}")
+    print(f"Intensity in testing: {np.mean(xgm_flux_t):.2e} +/- {np.std(xgm_flux_t):.2e}")
+
+    pressure = run["SA3_XTD10_PES/GAUGE/G30310F", "value"].select_trains(by_id[tids]).ndarray()
+    pressure_t = run_test["SA3_XTD10_PES/GAUGE/G30310F", "value"].select_trains(by_id[test_tids]).ndarray()
+    print(f"Pressure in training: {np.mean(pressure):.2e} +/- {np.std(pressure):.2e}")
+    print(f"Pressure in testing: {np.mean(pressure_t):.2e} +/- {np.std(pressure_t):.2e}")
+
+    voltage = run["SA3_XTD10_PES/MDL/DAQ_MPOD", "u212.value"].select_trains(by_id[tids]).ndarray()
+    voltage_t = run_test["SA3_XTD10_PES/MDL/DAQ_MPOD", "u212.value"].select_trains(by_id[test_tids]).ndarray()
+    print(f"Voltage in training: {np.mean(voltage):.2f} +/- {np.std(voltage):.2f}")
+    print(f"Voltage in testing: {np.mean(voltage_t):.2f} +/- {np.std(voltage_t):.2f}")
+
+    gas = get_gas(run, tids)
+    gas_t = get_gas(run_test, test_tids)
+    print(f"Gas in training: {gas}")
+    print(f"Gas in testing: {gas_t}")
+
+    t = list()
+    t_names = list()
+    t_nch = list()
+
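+    # select training trains: the reserved train IDs with XGM intensity above the threshold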
+    train_idx = np.isin(tids, train_tids) & (xgm_flux[:,0] > args.xgm_cut)
+
+    # keep only the training subset of the PES data; overwriting each channel avoids holding a full copy, which would blow up the memory usage
+    for k in pes_raw.keys():
+        pes_raw[k] = pes_raw[k][train_idx]
+
+
+    nch_axis = np.arange(1, len(channels)+1)
+    resolution = list()
+    rmse = list()
+    chi2_prepca = list()
+    unc = list()
+    for nch in nch_axis:
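+        # train and evaluate a model restricted to the first nch channels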
+        model = Model(channels=channels[:nch], model_type=args.model_type)
+
+        print(f"Fitting using {nch} channels")
+        start = time_ns()
+        model.uniformize(xgm_flux[train_idx])
+        model.fit(pes_raw,
+                   spec_raw_int[train_idx],
+                   spec_raw_pe[train_idx],
+                   pulse_energy=xgm_flux[train_idx],
+                   )
+        t += [time_ns() - start]
+        t_names += ["Fit"]
+        t_nch += [nch]
+
+        resolution += [model.resolution]
+
+        # transfer function
+        print(f"Resolution: {model.resolution:.2f} eV")
+
+        # test
+        print("Predict")
+        start = time_ns()
+        spec_pred = model.predict(pes_raw_t, pulse_energy=xgm_flux_t)
+        spec_smooth = model.preprocess_high_res(spec_raw_int_t)
+        t += [time_ns() - start]
+        t_names += ["Predict"]
+        t_nch += [nch]
+
+        spec_smooth_pca = model.y_model['pca'].transform(spec_smooth)
+        unc2 = spec_pred["expected_pca_unc"]**2
+        pca_var = (spec_pred["expected_pca"].std(axis=0, keepdims=True)**2).reshape(1, 1, -1)
+        ndof_prepca = float(spec_smooth_pca.shape[-1])
+        print("Expected pca std:", pca_var)
+        chi2_prepca += [np.mean(np.sum((spec_smooth_pca[:, np.newaxis, :] - spec_pred["expected_pca"])**2/unc2, axis=(-1, -2)))/ndof_prepca]
+        rmse += [np.mean(np.sqrt(np.mean((spec_smooth[:, np.newaxis, :] - spec_pred["expected"])**2, axis=(-1, -2))))]
+        unc += [np.mean(spec_pred["total_unc"])]
+
+    df = pd.DataFrame(dict(number_channels=nch_axis,
+                           resolution=resolution,
+                           rmse=rmse,
+                           chi2_prepca=chi2_prepca,
+                           unc=unc,
+                           ))
+    df.to_csv(os.path.join(args.directory, "number_channel_effect.csv"))
+
+
+    print("Time taken in ms")
+    df_time = pd.DataFrame(data=dict(time=t, name=t_names, nch=t_nch))
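+    # times were recorded in ns; convert to ms before saving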
+    df_time.time *= 1e-6
+    df_time.to_csv(os.path.join(args.directory, "number_channel_time.csv"))
+
+if __name__ == '__main__':
+    main()
+
diff --git a/pes_to_spec/test/plot_channel_sensitivity.py b/pes_to_spec/test/plot_channel_sensitivity.py
new file mode 100755
index 0000000..52c6cae
--- /dev/null
+++ b/pes_to_spec/test/plot_channel_sensitivity.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+
+import os
+import re
+
+from typing import Optional, Tuple, Dict
+
+import matplotlib
+matplotlib.use('Agg')
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib.gridspec import GridSpec
+import seaborn as sns
+
+SMALL_SIZE = 12
+MEDIUM_SIZE = 22
+BIGGER_SIZE = 26
+
+plt.rc('font', size=BIGGER_SIZE)         # controls default text sizes
+plt.rc('axes', titlesize=BIGGER_SIZE)    # fontsize of the axes title
+plt.rc('axes', labelsize=BIGGER_SIZE)    # fontsize of the x and y labels
+plt.rc('xtick', labelsize=BIGGER_SIZE)   # fontsize of the tick labels
+plt.rc('ytick', labelsize=BIGGER_SIZE)   # fontsize of the tick labels
+plt.rc('legend', fontsize=MEDIUM_SIZE)   # legend fontsize
+plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
+
+def plot_resolution(df: pd.DataFrame, filename: str):
+    fig = plt.figure(figsize=(12, 8))
+    gs = GridSpec(1, 1)
+    ax = fig.add_subplot(gs[0, 0])
+    ax.plot(df.number_channels, df.resolution, c='b', lw=3)
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+    ax.set(
+           xlabel="Number of channels",
+           ylabel="Average resolution [eV]",
+           )
+    plt.tight_layout()
+    fig.savefig(filename)
+    plt.close(fig)
+
+def plot_unc(df: pd.DataFrame, filename: str):
+    fig = plt.figure(figsize=(12, 8))
+    gs = GridSpec(1, 1)
+    ax = fig.add_subplot(gs[0, 0])
+    ax.plot(df.number_channels, df.unc, c='tab:blue', lw=3)
+    ax.spines['top'].set_visible(False)
+    ax.set(
+           xlabel="Number of channels",
+           ylabel="Uncertainty [a.u.]",
+           )
+    maxY = 1.1*df.unc.max()
+    ax.set_ylim((None, maxY))
+    rax = ax.twinx()
+    rax.plot(df.number_channels, df.rmse, c='tab:red', lw=3)
+    rax.spines['right'].set_color('tab:red')
+    rax.tick_params(axis='y', colors='tab:red')
+    rax.set_ylabel("Root-mean-squared error [a.u.]")
+    rax.yaxis.label.set_color("tab:red")
+    maxY = 1.1*df.rmse.max()
+    rax.set_ylim((None, maxY))
+    plt.tight_layout()
+    fig.savefig(filename)
+    plt.close(fig)
+
+def plot_rmse(df: pd.DataFrame, filename: str):
+    fig = plt.figure(figsize=(12, 8))
+    gs = GridSpec(1, 1)
+    ax = fig.add_subplot(gs[0, 0])
+    ax.plot(df.number_channels, df.rmse, c='b', lw=3)
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+    ax.set(
+           xlabel="Number of channels",
+           ylabel="Root-mean-squared error [a.u.]",
+           )
+    plt.tight_layout()
+    fig.savefig(filename)
+    plt.close(fig)
+
+if __name__ == '__main__':
+    indir = 'p900331r69t70'
+    df = pd.read_csv(f'{indir}/number_channel_effect.csv')
+
+    plot_rmse(df, "rmse.pdf")
+    plot_unc(df, "unc.pdf")
+    plot_resolution(df, "resolution.pdf")
+
diff --git a/run_tests_number_channels.sh b/run_tests_number_channels.sh
new file mode 100644
index 0000000..3609bfb
--- /dev/null
+++ b/run_tests_number_channels.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+#SBATCH --partition=exfel
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --time=8:00:00
+#SBATCH --job-name=nch_pes2spec
+#SBATCH -o slurm.%x.out.txt
+#SBATCH -e slurm.%x.err.txt
+#SBATCH --reservation=exfel_ml
+
+optstring="d:"
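+# -d DIR sets the output directory; any remaining arguments are forwarded to channel_sensitivity.py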
+
+DIR=results
+while getopts ${optstring} arg; do
+    case ${arg} in
+        d)
+            DIR=${OPTARG}
+            S=$OPTIND
+            ;;
+        *)
+            S=$((OPTIND))
+            break
+            ;;
+    esac
+done
+OPTS="${@:${S:-$OPTIND}}"
+echo "Options: $OPTS"
+
+source /usr/share/Modules/init/sh
+
+module load exfel exfel_anaconda3
+
+cd $HOME/scratch/karabo/devices/pes_to_spec
+source env/bin/activate
+pwd
+export PYTHONPATH=$PYTHONPATH:$PWD
+
+mkdir -p $DIR
+
+do_it() {
+    p=$1
+    r=$2
+    rt=$3
+    output=$DIR/p${p}r${r}t${rt}
+    mkdir -p $output
+    echo "Proposal $p, run $r, test at run $rt"
+    CMD=(./pes_to_spec/test/channel_sensitivity.py -p $p -r $r -t $rt -d $output ${@:4})
+    echo "${CMD[*]}"
+    ${CMD[*]} 2>&1 | tee $output/log.txt
+}
+
+do_it 900331  69 70 $OPTS
+
-- 
GitLab