From cc8ebcd3e563da279aaa2a75d03bc1f490eb1d78 Mon Sep 17 00:00:00 2001 From: Danilo Ferreira de Lima <danilo.enoque.ferreira.de.lima@xfel.de> Date: Mon, 28 Aug 2023 17:21:28 +0200 Subject: [PATCH] Added effect of number of channels. --- pes_to_spec/model.py | 10 +- pes_to_spec/test/channel_sensitivity.py | 294 +++++++++++++++++++ pes_to_spec/test/plot_channel_sensitivity.py | 89 ++++++ run_tests_number_channels.sh | 53 ++++ 4 files changed, 442 insertions(+), 4 deletions(-) create mode 100755 pes_to_spec/test/channel_sensitivity.py create mode 100755 pes_to_spec/test/plot_channel_sensitivity.py create mode 100644 run_tests_number_channels.sh diff --git a/pes_to_spec/model.py b/pes_to_spec/model.py index 44c1428..3243074 100644 --- a/pes_to_spec/model.py +++ b/pes_to_spec/model.py @@ -260,12 +260,14 @@ class SelectRelevantLowResolution(TransformerMixin, BaseEstimator): raise NotImplementedError("The low-resolution data cannot be transformed before the prompt has been identified. Call the fit function first.") if pulse_spacing is None: pulse_spacing = {ch: [0] for ch in X.keys()} - y = X + y = {channel: item for channel, item in X.items() + if channel in self.channels} if self.delta_tof is not None: first = max(0, self.tof_start - self.delta_tof) last = min(X[self.channels[0]].shape[1], self.tof_start + self.delta_tof) y = {channel: np.stack([item[:, (first + delta):(last + delta)] for delta in pulse_spacing[channel]], axis=1) - for channel, item in X.items()} + for channel, item in X.items() + if channel in self.channels} if not keep_dictionary_structure: selected = list(y.values()) if pulse_energy is not None: @@ -319,9 +321,9 @@ class SelectRelevantLowResolution(TransformerMixin, BaseEstimator): self.tof_start = self.estimate_prompt_peak(X) X_tr = self.transform(X, keep_dictionary_structure=True) self.mean = {ch: np.mean(X_tr[ch], axis=0, keepdims=True) - for ch in X.keys()} + for ch in X_tr.keys()} self.std = {ch: np.std(X_tr[ch], axis=0, keepdims=True) - for ch in X.keys()} + for ch in X_tr.keys()} return self def debug_peak_finding(self, X: Dict[str, np.ndarray], filename: str): diff --git a/pes_to_spec/test/channel_sensitivity.py b/pes_to_spec/test/channel_sensitivity.py new file mode 100755 index 0000000..18cba12 --- /dev/null +++ b/pes_to_spec/test/channel_sensitivity.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python + +import sys +sys.path.append('.') +sys.path.append('..') + +import os +import argparse + +import numpy as np +from extra_data import open_run, by_id, RunDirectory +from pes_to_spec.model import Model, matching_ids + +# for helper plots +from pes_to_spec.model import SelectRelevantLowResolution +from sklearn.decomposition import PCA + +from itertools import product + +import pandas as pd +from copy import deepcopy +import scipy +from scipy.signal import fftconvolve + +from typing import Dict, Optional + +from time import time_ns +import pandas as pd + +def get_gas(run, tids): + gas_sources = [ + "SA3_XTD10_PES/DCTRL/V30300S_NITROGEN", + "SA3_XTD10_PES/DCTRL/V30310S_NEON", + "SA3_XTD10_PES/DCTRL/V30320S_KRYPTON", + "SA3_XTD10_PES/DCTRL/V30330S_XENON", + ] + gas_active = list() + for gas in gas_sources: + # check if this gas source is interlocked + if gas in run.all_sources and run[gas, "interlock.AActionState.value"].ndarray().sum() == 0: + # it is not, so this gas was used + gas_active += [gas.split("/")[-1].split("_")[-1]] + gas = "_".join(gas_active) + return gas + + +def save_result(filename: str, + spec_pred: Dict[str, np.ndarray], + spec_smooth: np.ndarray, + spec_raw_pe: np.ndarray, + intensity: float, + #spec_raw_int: Optional[np.ndarray]=None, + pes: Optional[np.ndarray]=None, + pes_to_show: Optional[str]="", + first: Optional[int]=None, + last: Optional[int]=None, + ): + """ + Plot result with uncertainty band. + + Args: + filename: Output file name. + spec_pred: Predicted result with uncertainty bands in a dictionary. + spec_smooth: Smoothened expected result with shape (features,). + spec_raw_pe: x axis with the photon energy in eV. + spec_raw_int: Original true expected result with shape (features,). + pes: PES spectrum for the inset. + pes_to_show: Name of the channel shown. + intensity: The XGM intensity in uJ. + + """ + unc_stat = spec_pred["unc"] + unc_pca = spec_pred["pca"] + unc = np.sqrt(unc_stat**2 + unc_pca**2) + df = pd.DataFrame(dict(energy=spec_raw_pe, + spec=spec_smooth, + prediction=spec_pred["expected"], + unc=unc, + unc_pca=unc_pca, + unc_stat=unc_stat, + beam_intensity=intensity*1e-3*np.ones_like(spec_raw_pe), + deconvolved=spec_pred["deconvolved"] + )) + df.to_csv(filename) + if pes is not None: + pes_data = deepcopy(pes) + pes_data['bin'] = np.arange(len(pes['channel_1_D'])) + pes_data['first'] = first*np.ones_like(pes_data['bin']) + pes_data['last'] = last*np.ones_like(pes_data['bin']) + df = pd.DataFrame(pes_data) + df.to_csv(filename.replace('.pdf', '_pes.csv')) + +def save_pes_result(filename: str, + pes: Optional[np.ndarray]=None, + first: Optional[int]=None, + last: Optional[int]=None, + ): + """ + Plot result with uncertainty band. + + Args: + filename: Output file name. + spec_pred: Predicted result with uncertainty bands in a dictionary. + spec_smooth: Smoothened expected result with shape (features,). + spec_raw_pe: x axis with the photon energy in eV. + spec_raw_int: Original true expected result with shape (features,). + pes: PES spectrum for the inset. + pes_to_show: Name of the channel shown. + intensity: The XGM intensity in uJ. + + """ + pes_data = deepcopy(pes) + pes_data['bin'] = np.arange(len(pes['channel_1_D'])) + pes_data['first'] = first*np.ones_like(pes_data['bin']) + pes_data['last'] = last*np.ones_like(pes_data['bin']) + df = pd.DataFrame(pes_data) + df.to_csv(filename) + +def main(): + """ + Main entry point. Reads some data, trains and predicts. + """ + parser = argparse.ArgumentParser(prog="offline_analysis", description="Test pes2spec doing an offline analysis of the data.") + parser.add_argument('-p', '--proposal', type=int, metavar='INT', help='Proposal number', default=2828) + parser.add_argument('-r', '--run', type=str, metavar='INT,INT,...', help='Run numbers, comma-separated.', default=206) + parser.add_argument('-t', '--test-run', type=int, metavar='INT', help='Run to test', default=None) + parser.add_argument('-d', '--directory', type=str, metavar='DIRECTORY', default=".", help='Where to save the results.') + parser.add_argument('-S', '--spec', type=str, metavar='NAME', default="SA3_XTD10_SPECT/MDL/SPECTROMETER_SCS_NAVITAR:output", help='SPEC name') + parser.add_argument('-P', '--pes', type=str, metavar='NAME', default="SA3_XTD10_PES/ADC/1:network", help='PES name') + parser.add_argument('-X', '--xgm', type=str, metavar='NAME', default="SA3_XTD10_XGM/XGM/DOOCS:output", help='XGM name') + parser.add_argument('-o', '--offset', type=int, metavar='INT', default=0, help='Train ID offset') + parser.add_argument('-c', '--xgm_cut', type=float, metavar='INTENSITY', default=0, help='XGM intensity threshold in uJ.') + parser.add_argument('-T', '--model-type', type=str, metavar='TYPE', default="ard", choices=["bnn", "bnn_rvm", "ridge", "ard"], help='Which model type to use.') + parser.add_argument('-w', '--weight', action="store_true", default=False, help='Whether to reweight data as a function of the pulse energy to make it invariant to that.') + + args = parser.parse_args() + + print("Opening run ...") + runs = args.run.split(',') + runs = [int(r) for r in runs] + # get run + run_list = [open_run(proposal=args.proposal, run=r) for r in runs] + run = run_list[0] + if len(run_list) > 1: + run = run.union(*run_list[1:]) + + run_test = run + other_run_test = False + if "test_run" in args and args.test_run is not None: + other_run_test = True + run_test = open_run(proposal=args.proposal, run=args.test_run) + + spec_offset = args.offset + spec_name = args.spec + pes_name = args.pes + xgm_name = args.xgm + + pes_tid = run[pes_name, "digitizers.trainId"].ndarray() + xgm_tid = run[xgm_name, "data.trainId"].ndarray() + + spec_tid = spec_offset + run[spec_name, "data.trainId"].ndarray() + # these are the train ID intersection + # this could have been done by a select call in the RunDirectory, but it would not correct for the spec_offset + tids = matching_ids(spec_tid, pes_tid, xgm_tid) + + # read the spec photon energy and intensity + spec_raw_pe = run[spec_name, "data.photonEnergy"].select_trains(by_id[tids - spec_offset]).ndarray() + spec_raw_int = run[spec_name, "data.intensityDistribution"].select_trains(by_id[tids - spec_offset]).ndarray() + + + # reserve part of it for the test stage + train_tids = tids[:-10] + if other_run_test: + pes_tidt = run_test[pes_name, "digitizers.trainId"].ndarray() + xgm_tidt = run_test[xgm_name, "data.trainId"].ndarray() + spec_tidt = run_test[spec_name, "data.trainId"].ndarray() + test_tids = matching_ids(spec_tidt, pes_tidt, xgm_tidt) + else: + test_tids = tids + print(f"Number of train IDs: {len(train_tids)}") + print(f"Number of test IDs: {len(test_tids)}") + + # read the PES data for each channel + channels = [f"channel_{i}_{l}" + for i, l in product([1,3,4], ["A", "B", "C", "D"])] + pes_raw = {ch: run[pes_name, f"digitizers.{ch}.raw.samples"].select_trains(by_id[tids]).ndarray() + for ch in channels} + pes_raw_t = {ch: run_test[pes_name, f"digitizers.{ch}.raw.samples"].select_trains(by_id[test_tids]).ndarray() + for ch in channels} + + # select test SPEC data + spec_raw_pe_t = run_test[spec_name, "data.photonEnergy"].select_trains(by_id[test_tids - spec_offset]).ndarray() + spec_raw_int_t = run_test[spec_name, "data.intensityDistribution"].select_trains(by_id[test_tids - spec_offset]).ndarray() + + print("Data in memory.") + + # read the XGM information + #xgm_pressure = run['SA3_XTD10_XGM/XGM/DOOCS', "pressure.pressureFiltered.value"].select_trains(by_id[tids]).ndarray() + #xgm_pe = run['SA3_XTD10_XGM/XGM/DOOCS:output', "data.intensitySa3TD"].select_trains(by_id[tids]).ndarray() + #retvol_raw = run["SA3_XTD10_PES/MDL/DAQ_MPOD", "u212.value"].select_trains(by_id[tids]).ndarray() + #retvol_raw_timestamp = run["SA3_XTD10_PES/MDL/DAQ_MPOD", "u212.timestamp"].select_trains(by_id[tids]).ndarray() + + xgm_flux = run['SA3_XTD10_XGM/XGM/DOOCS:output', "data.intensitySa3TD"].select_trains(by_id[tids]).ndarray()[:, 0][:, np.newaxis] + xgm_flux_t = run_test['SA3_XTD10_XGM/XGM/DOOCS:output', "data.intensitySa3TD"].select_trains(by_id[test_tids]).ndarray()[:, 0][:, np.newaxis] + + print(f"Intensity in training: {np.mean(xgm_flux):.2e} +/- {np.std(xgm_flux):.2e}") + print(f"Intensity in testing: {np.mean(xgm_flux_t):.2e} +/- {np.std(xgm_flux_t):.2e}") + + pressure = run["SA3_XTD10_PES/GAUGE/G30310F", "value"].select_trains(by_id[tids]).ndarray() + pressure_t = run_test["SA3_XTD10_PES/GAUGE/G30310F", "value"].select_trains(by_id[test_tids]).ndarray() + print(f"Pressure in training: {np.mean(pressure):.2e} +/- {np.std(pressure):.2e}") + print(f"Pressure in testing: {np.mean(pressure_t):.2e} +/- {np.std(pressure_t):.2e}") + + voltage = run["SA3_XTD10_PES/MDL/DAQ_MPOD", "u212.value"].select_trains(by_id[tids]).ndarray() + voltage_t = run_test["SA3_XTD10_PES/MDL/DAQ_MPOD", "u212.value"].select_trains(by_id[test_tids]).ndarray() + print(f"Voltage in training: {np.mean(voltage):.2f} +/- {np.std(voltage):.2f}") + print(f"Voltage in testing: {np.mean(voltage_t):.2f} +/- {np.std(voltage_t):.2f}") + + gas = get_gas(run, tids) + gas_t = get_gas(run_test, test_tids) + print(f"Gas in training: {gas}") + print(f"Gas in testing: {gas_t}") + + t = list() + t_names = list() + t_nch = list() + + train_idx = np.isin(tids, train_tids) & (xgm_flux[:,0] > args.xgm_cut) + + # we just need this for training and we need to avoid copying it, which blows up the memoray usage + for k in pes_raw.keys(): + pes_raw[k] = pes_raw[k][train_idx] + + + nch_axis = np.arange(1, len(channels)+1) + resolution = list() + rmse = list() + chi2_prepca = list() + unc = list() + for nch in nch_axis: + model = Model(channels=channels[:nch], model_type=args.model_type) + + print(f"Fitting using {nch} channels") + start = time_ns() + model.uniformize(xgm_flux[train_idx]) + model.fit(pes_raw, + spec_raw_int[train_idx], + spec_raw_pe[train_idx], + pulse_energy=xgm_flux[train_idx], + ) + t += [time_ns() - start] + t_names += ["Fit"] + t_nch += [nch] + + resolution += [model.resolution] + + # transfer function + print(f"Resolution: {model.resolution:.2f} eV") + + # test + print("Predict") + start = time_ns() + spec_pred = model.predict(pes_raw_t, pulse_energy=xgm_flux_t) + spec_smooth = model.preprocess_high_res(spec_raw_int_t) + t += [time_ns() - start] + t_names += ["Predict"] + t_nch += [nch] + + spec_smooth_pca = model.y_model['pca'].transform(spec_smooth) + unc2 = spec_pred["expected_pca_unc"]**2 + pca_var = (spec_pred["expected_pca"].std(axis=0, keepdims=True)**2).reshape(1, 1, -1) + ndof_prepca = float(spec_smooth_pca.shape[-1]) + print("Expected pca std:", pca_var) + chi2_prepca += [np.mean(np.sum((spec_smooth_pca[:, np.newaxis, :] - spec_pred["expected_pca"])**2/unc2, axis=(-1, -2)))/ndof_prepca] + rmse += [np.mean(np.sqrt(np.mean((spec_smooth[:, np.newaxis, :] - spec_pred["expected"])**2, axis=(-1, -2))))] + unc += [np.mean(spec_pred["total_unc"])] + + df = pd.DataFrame(dict(number_channels=nch_axis, + resolution=resolution, + rmse=rmse, + chi2_prepca=chi2_prepca, + unc=unc, + )) + df.to_csv(os.path.join(args.directory, "number_channel_effect.csv")) + + + print("Time taken in ms") + df_time = pd.DataFrame(data=dict(time=t, name=t_names, nch=t_nch)) + df_time.time *= 1e-6 + df_time.to_csv(os.path.join(args.directory, "number_channel_time.csv")) + +if __name__ == '__main__': + main() + diff --git a/pes_to_spec/test/plot_channel_sensitivity.py b/pes_to_spec/test/plot_channel_sensitivity.py new file mode 100755 index 0000000..52c6cae --- /dev/null +++ b/pes_to_spec/test/plot_channel_sensitivity.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python + +import os +import re + +from typing import Optional, Tuple, Dict + +import matplotlib +matplotlib.use('Agg') +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.gridspec import GridSpec +import seaborn as sns + +SMALL_SIZE = 12 +MEDIUM_SIZE = 22 +BIGGER_SIZE = 26 + +plt.rc('font', size=BIGGER_SIZE) # controls default text sizes +plt.rc('axes', titlesize=BIGGER_SIZE) # fontsize of the axes title +plt.rc('axes', labelsize=BIGGER_SIZE) # fontsize of the x and y labels +plt.rc('xtick', labelsize=BIGGER_SIZE) # fontsize of the tick labels +plt.rc('ytick', labelsize=BIGGER_SIZE) # fontsize of the tick labels +plt.rc('legend', fontsize=MEDIUM_SIZE) # legend fontsize +plt.rc('figure', titlesize=BIGGER_SIZE) # fontsize of the figure title + +def plot_resolution(df: pd.DataFrame, filename: str): + fig = plt.figure(figsize=(12, 8)) + gs = GridSpec(1, 1) + ax = fig.add_subplot(gs[0, 0]) + ax.plot(df.number_channels, df.resolution, c='b', lw=3) + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.set( + xlabel="Number of channels", + ylabel="Average resolution [eV]", + ) + plt.tight_layout() + fig.savefig(filename) + plt.close(fig) + +def plot_unc(df: pd.DataFrame, filename: str): + fig = plt.figure(figsize=(12, 8)) + gs = GridSpec(1, 1) + ax = fig.add_subplot(gs[0, 0]) + ax.plot(df.number_channels, df.unc, c='tab:blue', lw=3) + ax.spines['top'].set_visible(False) + ax.set( + xlabel="Number of channels", + ylabel="Uncertainty [a.u.]", + ) + maxY = 1.1*df.unc.max() + ax.set_ylim((None, maxY)) + rax = ax.twinx() + rax.plot(df.number_channels, df.rmse, c='tab:red', lw=3) + rax.spines['right'].set_color('tab:red') + rax.tick_params(axis='y', colors='tab:red') + rax.set_ylabel("Root-mean-squared error [a.u.]") + rax.yaxis.label.set_color("tab:red") + maxY = 1.1*df.rmse.max() + rax.set_ylim((None, maxY)) + plt.tight_layout() + fig.savefig(filename) + plt.close(fig) + +def plot_rmse(df: pd.DataFrame, filename: str): + fig = plt.figure(figsize=(12, 8)) + gs = GridSpec(1, 1) + ax = fig.add_subplot(gs[0, 0]) + ax.plot(df.number_channels, df.rmse, c='b', lw=3) + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.set( + xlabel="Number of channels", + ylabel="Root-mean-squared error [a.u.]", + ) + plt.tight_layout() + fig.savefig(filename) + plt.close(fig) + +if __name__ == '__main__': + indir = 'p900331r69t70' + df = pd.read_csv(f'{indir}/number_channel_effect.csv') + + plot_rmse(df, "rmse.pdf") + plot_unc(df, "unc.pdf") + plot_resolution(df, "resolution.pdf") + diff --git a/run_tests_number_channels.sh b/run_tests_number_channels.sh new file mode 100644 index 0000000..3609bfb --- /dev/null +++ b/run_tests_number_channels.sh @@ -0,0 +1,53 @@ +#!/bin/bash +#SBATCH --partition=exfel +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --time=8:00:00 +#SBATCH --job-name=nch_pes2spec +#SBATCH -o slurm.%x.err.txt +#SBATCH -e slurm.%x.err.txt +#SBATCH --reservation=exfel_ml + +optstring="d:" + +DIR=results +while getopts ${optstring} arg; do + case ${arg} in + d) + DIR=${OPTARG} + S=$OPTIND + ;; + *) + S=$((OPTIND)) + break + ;; + esac +done +OPTS="${@:$S}" +echo "Options: $OPTS" + +source /usr/share/Modules/init/sh + +module load exfel exfel_anaconda3 + +cd $HOME/scratch/karabo/devices/pes_to_spec +source env/bin/activate +pwd +export PYTHONPATH=$PYTHONPATH:$PWD + +mkdir $DIR + +do_it() { + p=$1 + r=$2 + rt=$3 + output=$DIR/p${p}r${r}t${rt} + mkdir -p $output + echo "Proposal $p, run $r, test at run $rt" + CMD=(./pes_to_spec/test/channel_sensitivity.py -p $p -r $r -t $rt -d $output ${@:4}) + echo "${CMD[*]}" + ${CMD[*]} 2>&1 | tee $output/log.txt +} + +do_it 900331 69 70 $OPTS + -- GitLab