diff --git a/pes_to_spec/model.py b/pes_to_spec/model.py
index 891637ef43121b40213b97a3ba48a449cbf76afc..d0b22592a6e5dda8f32fc0d2091800e3fe2de170 100644
--- a/pes_to_spec/model.py
+++ b/pes_to_spec/model.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 import joblib
 import numpy as np
+import scipy.stats
 from scipy.signal import fftconvolve
 from scipy.signal import find_peaks_cwt
 from scipy.optimize import fmin_l_bfgs_b
@@ -156,7 +157,8 @@ from typing import Any, Dict, List, Optional, Union, Tuple
 #            x0,
 #            grad_loss,
 #            disp=True,
-#            factr=1e7,
+#            #factr=1e7,
+#            factr=10,
 #            maxiter=100,
 #            iprint=0)
 #
@@ -185,7 +187,7 @@ from typing import Any, Dict, List, Optional, Union, Tuple
 #        # input-dependent uncertainty
 #        y_eps = np.exp(X @ self.A_eps + y_unc)
 #        return y, y_eps
-
+#
 
 
 def matching_ids(a: np.ndarray, b: np.ndarray, c: np.ndarray) -> np.ndarray:
@@ -401,6 +403,8 @@ class SelectRelevantLowResolution(TransformerMixin, BaseEstimator):
         self.channels = channels
         self.tof_start = tof_start
         self.delta_tof = delta_tof
+        self.mean = dict()
+        self.std = dict()
 
     def transform(self, X: Dict[str, np.ndarray], keep_dictionary_structure: bool=False) -> np.ndarray:
         """
@@ -436,8 +440,14 @@ class SelectRelevantLowResolution(TransformerMixin, BaseEstimator):
         """
         # reduce on channel and on train ID
         sum_low_res = - np.mean(sum(list(X.values())), axis=0)
-        widths = np.arange(10, 50, step=1)
-        peak_idx = find_peaks_cwt(sum_low_res, widths)
+        axis = np.arange(0.0, sum_low_res.shape[1], 1.0)
+        #widths = np.arange(10, 50, step=5)
+        #peak_idx = find_peaks_cwt(sum_low_res, widths)
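+        # Gaussian kernel (sigma = 20 samples), centred on the window and normalised to unit sum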
""" + print("Finding peaks") self.tof_start = self.estimate_prompt_peak(X) + X_tr = self.transform(X, keep_dictionary_structure=True) + self.mean = {ch: np.mean(X_tr[ch], axis=0, keepdims=True) + for ch in X.keys()} + self.std = {ch: np.std(X_tr[ch], axis=0, keepdims=True) + for ch in X.keys()} + print("Found peaks") return self def debug_peak_finding(self, X: Dict[str, np.ndarray], filename: str): @@ -576,7 +593,7 @@ class Model(TransformerMixin, BaseEstimator): channels:List[str]=[f"channel_{j}_{k}" for j, k in product(range(1, 5), ["A", "B", "C", "D"])], n_pca_lr: int=600, - n_pca_hr: int=20, + n_pca_hr: int=40, high_res_sigma: float=0.2, tof_start: Optional[int]=None, delta_tof: Optional[int]=300, @@ -591,6 +608,7 @@ class Model(TransformerMixin, BaseEstimator): ]))] x_model_steps += [ ('pca', PCA(n_pca_lr, whiten=True)), + #('pca', TruncatedSVD(n_pca_lr)), ('unc', UncertaintyHolder()), ] self.x_model = Pipeline(x_model_steps) @@ -600,10 +618,11 @@ class Model(TransformerMixin, BaseEstimator): ('unc', UncertaintyHolder()), ]) #self.fit_model = FitModel() - self.fit_model = MultiOutputWithStd(ARDRegression(n_iter=30, tol=1e-4, verbose=True)) + self.fit_model = MultiOutputWithStd(ARDRegression(n_iter=30, tol=1e-5, verbose=True)) self.channel_mean = {ch: np.nan for ch in channels} - self.channel_relevance = {ch: np.nan for ch in channels} + + #self.channel_relevance = {ch: np.nan for ch in channels} # size of the test subset self.validation_size = validation_size @@ -618,7 +637,9 @@ class Model(TransformerMixin, BaseEstimator): def get_low_resolution_range(self) -> Tuple[int, int]: """Get bin number with first and last relevant bins in the low-resolution spectrum.""" - return self.x_model.named_steps['select'].tof_start, (self.x_model.named_steps['select'].tof_start + self.x_model.named_steps['select'].delta_tof) + first = (self.x_model.named_steps['select'].tof_start - self.x_model.named_steps['select'].delta_tof) + last = (self.x_model.named_steps['select'].tof_start + self.x_model.named_steps['select'].delta_tof) + return first, last def debug_peak_finding(self, low_res_data: Dict[str, np.ndarray], filename: str): """ @@ -683,49 +704,52 @@ class Model(TransformerMixin, BaseEstimator): low_pca_unc = np.mean(np.sqrt(np.mean((low_res - low_pca_rec)**2, axis=1, keepdims=True)), axis=0, keepdims=True) self.x_model['unc'].set_uncertainty(low_pca_unc) + # for consistency check + self.x_pca_mean = np.mean(low_pca, axis=0, keepdims=True) + self.x_pca_std = np.std(low_pca, axis=0, keepdims=True) + # for consistency check per channel - selection_model = self.x_model['select'] - for channel in self.get_channels(): - self.channel_mean[channel] = np.mean(low_res_data[channel], axis=0, keepdims=True) - print(f"Calculate PCA relevance on {channel}") - # freeze input data in one channel only - low_res_data_frozen = {ch: low_res_data[ch] if ch != channel - else np.repeat(self.channel_mean[channel], low_res_data[ch].shape[0], axis=0) - for ch in self.get_channels()} - low_res = selection_model.transform(low_res_data_frozen) - low_pca = pca_model.fit_transform(low_res) - low_pca_rec = pca_model.inverse_transform(low_pca) - low_pca_unc = np.mean(np.sqrt(np.mean((low_res - low_pca_rec)**2, axis=1, keepdims=True)), axis=0, keepdims=True) - self.channel_relevance[channel] = low_pca_unc + #selection_model = self.x_model['select'] + #for channel in self.get_channels(): + # self.channel_mean[channel] = np.mean(low_res_data[channel], axis=0, keepdims=True) + # print(f"Calculate PCA relevance on {channel}") + # # 
+
         # for consistency check per channel
-        selection_model = self.x_model['select']
-        for channel in self.get_channels():
-            self.channel_mean[channel] = np.mean(low_res_data[channel], axis=0, keepdims=True)
-            print(f"Calculate PCA relevance on {channel}")
-            # freeze input data in one channel only
-            low_res_data_frozen = {ch: low_res_data[ch] if ch != channel
-                                   else np.repeat(self.channel_mean[channel], low_res_data[ch].shape[0], axis=0)
-                                   for ch in self.get_channels()}
-            low_res = selection_model.transform(low_res_data_frozen)
-            low_pca = pca_model.fit_transform(low_res)
-            low_pca_rec = pca_model.inverse_transform(low_pca)
-            low_pca_unc = np.mean(np.sqrt(np.mean((low_res - low_pca_rec)**2, axis=1, keepdims=True)), axis=0, keepdims=True)
-            self.channel_relevance[channel] = low_pca_unc
+        #selection_model = self.x_model['select']
+        #for channel in self.get_channels():
+        #    self.channel_mean[channel] = np.mean(low_res_data[channel], axis=0, keepdims=True)
+        #    print(f"Calculate PCA relevance on {channel}")
+        #    # freeze input data in one channel only
+        #    low_res_data_frozen = {ch: low_res_data[ch] if ch != channel
+        #                           else np.repeat(self.channel_mean[channel], low_res_data[ch].shape[0], axis=0)
+        #                           for ch in self.get_channels()}
+        #    low_res = selection_model.transform(low_res_data_frozen)
+        #    low_pca = pca_model.fit_transform(low_res)
+        #    low_pca_rec = pca_model.inverse_transform(low_pca)
+        #    low_pca_unc = np.mean(np.sqrt(np.mean((low_res - low_pca_rec)**2, axis=1, keepdims=True)), axis=0, keepdims=True)
+        #    self.channel_relevance[channel] = low_pca_unc
         print("End of fit.")
 
         return high_res
 
-    def get_channel_quality(self, channel: str, low_res_data: Dict[str, np.ndarray],
-                            pca_model: PCA,
-                            channel_relevance: Dict[str, float],
-                            selection_model: SelectRelevantLowResolution,
-                            channel_mean: Dict[str, np.ndarray]) -> float:
-        """
-        Get the compatibility for a single channel.
-
-        Args:
-          channel: The channel.
-          low_res: The data in a dictionary.
-          pca_model: The PCA model.
-
-        Returns: the compatibility factor.
-        """
-        # freeze input data in one channel only
-        low_res_data_frozen = {ch: low_res_data[ch] if ch != channel
-                               else np.repeat(channel_mean[channel], low_res_data[ch].shape[0], axis=0)
-                               for ch in low_res_data.keys()}
-        low_res_selected = selection_model.transform(low_res_data_frozen)
-        low_pca = pca_model.transform(low_res_selected)
-        low_pca_rec = pca_model.inverse_transform(low_pca)
-        low_pca_unc = channel_relevance[channel]
-        low_pca_dev = np.sqrt(np.mean((low_res_selected - low_pca_rec)**2, axis=1, keepdims=True))
-        return low_pca_dev/low_pca_unc
+    #def get_channel_quality(self, channel: str, low_res_data: Dict[str, np.ndarray],
+    #                        pca_model: PCA,
+    #                        channel_relevance: Dict[str, float],
+    #                        selection_model: SelectRelevantLowResolution,
+    #                        channel_mean: Dict[str, np.ndarray]) -> float:
+    #    """
+    #    Get the compatibility for a single channel.
+    #    Args:
+    #      channel: The channel.
+    #      low_res: The data in a dictionary.
+    #      pca_model: The PCA model.
+    #    Returns: the compatibility factor.
+    #    """
+    #    # freeze input data in one channel only
+    #    low_res_data_frozen = {ch: low_res_data[ch] if ch != channel
+    #                           else np.repeat(channel_mean[channel], low_res_data[ch].shape[0], axis=0)
+    #                           for ch in low_res_data.keys()}
+    #    low_res_selected = selection_model.transform(low_res_data_frozen)
+    #    low_pca = pca_model.transform(low_res_selected)
+    #    low_pca_rec = pca_model.inverse_transform(low_pca)
+    #    low_pca_unc = channel_relevance[channel]
+    #    low_pca_dev = np.sqrt(np.mean((low_res_selected - low_pca_rec)**2, axis=1, keepdims=True))
+    #    quality = low_pca_dev/low_pca_unc
+    #    return quality
 
     def check_compatibility_per_channel(self, low_res_data: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
         """
@@ -740,19 +764,34 @@
         selection_model = self.x_model['select']
         quality = {channel: 0.0 for channel in low_res_data.keys()}
         channels = list(low_res_data.keys())
-        pca_model = self.x_model['pca']
-        if 'fex' in self.x_model.named_steps:
-            pca_model = self.x_model['fex'].named_steps['prepca']
-        #with mp.Pool(len(low_res_data.keys())) as p:
-        values = map(partial(self.get_channel_quality,
-                             low_res_data=low_res_data,
-                             pca_model=pca_model,
-                             channel_relevance=self.channel_relevance,
-                             selection_model=selection_model,
-                             channel_mean=self.channel_mean
-                             ), channels)
-        quality = dict(zip(channels, values))
-        return quality
+        # check if each channel is close to the mean
+        low_res_selected = selection_model.transform(low_res_data, keep_dictionary_structure=True)
+        deviation = {ch: (low_res_selected[ch] - selection_model.mean[ch])/selection_model.std[ch]
+                     for ch in channels}
+        ndof = {ch: float(deviation[ch].shape[1] - 1)
+                for ch in channels}
+        chi2 = {ch: np.sum(deviation[ch]**2, axis=1, keepdims=True)
+                for ch in channels}
+        cutoff = {ch: scipy.stats.chi2.ppf(0.05, df=deviation[ch].shape[1] - 1)
+                  for ch in channels}
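+        # test statistic per train: chi2 minus the chi2 distribution's 5% quantile, divided by the degrees of freedom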
+        ts = {ch: (chi2[ch] - cutoff[ch])/ndof[ch]
+              for ch in channels}
+        return ts
+
+        # checking channel relevance
+        #pca_model = self.x_model['pca']
+        #if 'fex' in self.x_model.named_steps:
+        #    pca_model = self.x_model['fex'].named_steps['prepca']
+        ##with mp.Pool(len(low_res_data.keys())) as p:
+        #values = map(partial(self.get_channel_quality,
+        #                     low_res_data=low_res_data,
+        #                     pca_model=pca_model,
+        #                     channel_relevance=self.channel_relevance,
+        #                     selection_model=selection_model,
+        #                     channel_mean=self.channel_mean
+        #                     ), channels)
+        #quality = dict(zip(channels, values))
+        #return quality
 
     def check_compatibility(self, low_res_data: Dict[str, np.ndarray]) -> np.ndarray:
         """
@@ -769,8 +808,16 @@
         if 'fex' in self.x_model.named_steps:
             pca_model = self.x_model['fex'].named_steps['prepca']
         low_pca = pca_model.transform(low_res)
-        low_pca_rec = pca_model.inverse_transform(low_pca)
-        low_pca_unc = self.x_model['unc'].uncertainty()
+
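+        # standardise the PCA components with the training-set statistics cached in fit()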
+        deviation = (low_pca - self.x_pca_mean)/self.x_pca_std
+        ndof = float(deviation.shape[1] - 1)
+        chi2 = np.sum(deviation**2, axis=1, keepdims=True)
+        cutoff = scipy.stats.chi2.ppf(0.05, df=deviation.shape[1] - 1)
+        ts = (chi2 - cutoff)/ndof
+        return ts
+
+        #low_pca_rec = pca_model.inverse_transform(low_pca)
+        #low_pca_unc = self.x_model['unc'].uncertainty()
 
         #fig = plt.figure(figsize=(8, 16))
         #ax = plt.gca()
@@ -787,8 +834,8 @@
         #plt.savefig("check.png")
         #plt.close(fig)
 
-        low_pca_dev = np.sqrt(np.mean((low_res - low_pca_rec)**2, axis=1, keepdims=True))
-        return low_pca_dev/low_pca_unc
+        #low_pca_dev = np.sqrt(np.mean((low_res - low_pca_rec)**2, axis=1, keepdims=True))
+        #return low_pca_dev/low_pca_unc
 
 
     def predict(self, low_res_data: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
@@ -829,7 +876,9 @@
                     self.y_model,
                     self.fit_model,
                     DataHolder(self.channel_mean),
-                    DataHolder(self.channel_relevance)
+                    DataHolder(self.x_pca_mean),
+                    DataHolder(self.x_pca_std),
+                    #DataHolder(self.channel_relevance)
                     ], filename, compress='zlib')
 
     @staticmethod
@@ -842,12 +891,14 @@
 
         Returns: A new model object.
         """
-        x_model, y_model, fit_model, channel_mean, channel_relevance = joblib.load(filename)
+        x_model, y_model, fit_model, channel_mean, x_pca_mean, x_pca_std = joblib.load(filename)
         obj = Model()
         obj.x_model = x_model
        obj.y_model = y_model
         obj.fit_model = fit_model
         obj.channel_mean = channel_mean.get_data()
-        obj.channel_relevance = channel_relevance.get_data()
+        obj.x_pca_mean = x_pca_mean.get_data()
+        obj.x_pca_std = x_pca_std.get_data()
+        #obj.channel_relevance = channel_relevance.get_data()
 
         return obj
diff --git a/pes_to_spec/test/offline_analysis.py b/pes_to_spec/test/offline_analysis.py
index 5b6716cc493336b122ff231dbe2c776473fd21e4..d1fcf77cf7621515cde1303a22fdf4a23ce6b68c 100755
--- a/pes_to_spec/test/offline_analysis.py
+++ b/pes_to_spec/test/offline_analysis.py
@@ -8,7 +8,7 @@ import os
 import argparse
 
 import numpy as np
-from extra_data import open_run, by_id
+from extra_data import open_run, by_id, RunDirectory
 from pes_to_spec.model import Model, matching_two_ids
 
 from itertools import product
@@ -137,8 +137,10 @@ def main():
 
     args = parser.parse_args()
 
+    print("Opening run ...")
     # get run
-    run = open_run(proposal=args.proposal, run=args.run)
+    #run = open_run(proposal=args.proposal, run=args.run)
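+    # read a local copy of run r0206 from scratch instead of opening it through open_run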
"numpy>=1.21", - "scipy>=1.6", + #"numpy>=1.21", + #"scipy>=1.6", + "intel-scipy>=1.7.3", "scikit-learn>=1.0.2", #"autograd", #"h5py"