From 81ad444afb0ffe0cf57f5abb595378190602305a Mon Sep 17 00:00:00 2001
From: Danilo Ferreira de Lima <danilo.enoque.ferreira.de.lima@xfel.de>
Date: Fri, 24 Feb 2023 10:26:46 +0100
Subject: [PATCH] Use BayesianRidge and KDE weights to uniformize as a function
 of XGM intensity. Evaluate inliers.

---
 pes_to_spec/model.py                 | 484 +++++++++++++++------------
 pes_to_spec/test/offline_analysis.py | 183 +++++++++-
 pyproject.toml                       |   2 +-
 3 files changed, 447 insertions(+), 222 deletions(-)

diff --git a/pes_to_spec/model.py b/pes_to_spec/model.py
index 9bd444a..8017ed5 100644
--- a/pes_to_spec/model.py
+++ b/pes_to_spec/model.py
@@ -5,25 +5,26 @@ import joblib
 import numpy as np
 import scipy.stats
 from scipy.signal import fftconvolve
-from scipy.signal import find_peaks_cwt
+#from scipy.signal import find_peaks_cwt
 from scipy.optimize import fmin_l_bfgs_b
 
-from sklearn.decomposition import PCA, IncrementalPCA
+from sklearn.decomposition import PCA
 from sklearn.pipeline import Pipeline, FeatureUnion
+from sklearn.preprocessing import FunctionTransformer
 from sklearn.base import TransformerMixin, BaseEstimator
 from sklearn.base import RegressorMixin
 from sklearn.kernel_approximation import Nystroem
-from sklearn.linear_model import ARDRegression
 from sklearn.preprocessing import PolynomialFeatures
-#from sklearn.svm import LinearSVR
-#from sklearn.gaussian_process import GaussianProcessRegressor
-#from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
+from sklearn.linear_model import ARDRegression
+from sklearn.linear_model import BayesianRidge
+from sklearn.neighbors import KernelDensity
+from sklearn.ensemble import IsolationForest
+#from sklearn.neighbors import LocalOutlierFactor
+#from sklearn.covariance import EllipticEnvelope
+from functools import reduce
 
 from itertools import product
-from sklearn.model_selection import train_test_split
 from sklearn.base import clone, MetaEstimatorMixin
 from joblib import Parallel, delayed
-from functools import partial
-import multiprocessing as mp
 
 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Union, Tuple
@@ -42,25 +43,22 @@ class FitModel(RegressorMixin, BaseEstimator):
     Linear regression model with uncertainties.
 
     Args:
-        l: Regularization coefficient.
""" - def __init__(self, l: float=1e-6): - self.l = l - + def __init__(self): # model parameter sizes self.Nx: int = 0 self.Ny: int = 0 # fit result - self.A_inf: Optional[np.ndarray] = None - self.b_inf: Optional[np.ndarray] = None - self.u_inf: Optional[np.ndarray] = None - + self.pars: Optional[Dict[str, np.ndarray]] = None + self.structure: Optional[Dict[str, Tuple[int, int]]] = None + # fit monitoring self.loss_train: List[float] = list() self.loss_test: List[float] = list() - - self.input_data = None + + self.nll_train: Optional[np.ndarray] = None + self.nll_train_expected: Optional[np.ndarray] = None def fit(self, X: np.ndarray, y: np.ndarray, **fit_params) -> RegressorMixin: """ @@ -85,11 +83,28 @@ class FitModel(RegressorMixin, BaseEstimator): self.Ny: int = int(y.shape[1]) # initial parameter values - A0: np.ndarray = np.eye(self.Nx, self.Ny).reshape(self.Nx*self.Ny) - b0: np.ndarray = np.zeros(self.Ny) - Aeps: np.ndarray = np.zeros(self.Nx) - u0: np.ndarray = np.zeros(self.Ny) - x0: np.ndarray = np.concatenate((A0, b0, u0, Aeps), axis=0) + self.structure = dict(A_inf=(self.Nx, self.Ny), + b_inf=(1, self.Ny), + Ap_inf=(self.Nx, self.Ny), + log_inv_alpha=(1, self.Ny), + log_inv_alpha_prime=(self.Nx, 1), + #log_inv_alpha_prime2=(self.Nx, 1), + #log_inv_tau1=(1, 1), + #log_inv_tau2=(1, 1), + ) + # initialize close to the solution + # pinv(X) @ y solves the problem Ax = y + # assume b is zero, since both X and y are mean subtracted after the PCA + # assume a small noise uncertainty in alpha and in tau + init_method = dict(A_inf=lambda shape: np.linalg.pinv(X) @ (y - np.mean(y, axis=0, keepdims=True)), + b_inf=lambda shape: np.mean(y, axis=0, keepdims=True), + Ap_inf=lambda shape: np.zeros(shape), + log_inv_alpha=lambda shape: 1.0*np.ones(shape), + log_inv_alpha_prime=lambda shape: 1.0*np.ones(shape), + #log_inv_tau1=lambda shape: 1.0*np.ones(shape), + #log_inv_tau2=lambda shape: 1.0*np.ones(shape), + ) + x0: np.ndarray = FitModel.get_pars_init(self.structure, init_method) # reset loss monitoring self.loss_train: List[float] = list() @@ -106,20 +121,8 @@ class FitModel(RegressorMixin, BaseEstimator): Returns: The loss. 
""" - # diag( (in @ x - out) @ (in @ x - out)^T ) - A = x[:self.Nx*self.Ny].reshape((self.Nx, self.Ny)) - b = x[self.Nx*self.Ny:(self.Nx*self.Ny+self.Ny)].reshape((1, self.Ny)) - - b_eps = x[(self.Nx*self.Ny+self.Ny):(self.Nx*self.Ny+self.Ny+self.Ny)].reshape((1, self.Ny)) - A_eps = x[(self.Nx*self.Ny+self.Ny+self.Ny):].reshape((self.Nx, 1)) - log_unc = anp.matmul(X, A_eps) + b_eps - - #log_unc = anp.log(anp.exp(log_unc) + anp.exp(log_eps)) - iunc2 = anp.exp(-2*log_unc) - - L = anp.mean( (0.5*((anp.matmul(X, A) + b - Y)**2)*iunc2 + log_unc).sum(axis=1), axis=0 ) - #weights2 = anp.sum(anp.square(A.ravel())) - return L #+ self.l/2 * weights2 + p = FitModel.get_pars(x, self.structure) + return anp.mean(self.nll(X, Y, pars=p), axis=0) def loss_history(x: np.ndarray) -> float: """ @@ -154,16 +157,15 @@ class FitModel(RegressorMixin, BaseEstimator): x0, grad_loss, disp=True, - #factr=1e7, - factr=10, - maxiter=100, + factr=1e7, + #factr=10, + maxiter=10000, iprint=0) # Inference - self.A_inf = sc_op[0][:self.Nx*self.Ny].reshape(self.Nx, self.Ny) - self.b_inf = sc_op[0][self.Nx*self.Ny:(self.Nx*self.Ny+self.Ny)].reshape(1, self.Ny) - self.u_inf = sc_op[0][(self.Nx*self.Ny+self.Ny):(self.Nx*self.Ny+self.Ny+self.Ny)].reshape(1, self.Ny) # removed np.exp - self.A_eps = sc_op[0][(self.Nx*self.Ny+self.Ny+self.Ny):].reshape(self.Nx, 1) + self.pars = FitModel.get_pars(sc_op[0], self.structure) + self.nll_train = sc_op[1] + self.nll_train_expected = np.mean(self.nll(X, pars=self.pars), axis=0, keepdims=True) def predict(self, X: np.ndarray, return_std: bool=False) -> Union[Tuple[np.ndarray, np.ndarray], np.ndarray]: """ @@ -174,18 +176,92 @@ class FitModel(RegressorMixin, BaseEstimator): Returns: Predicted Y and, if return_std is True, also its uncertainty. """ - # result - y = (X @ self.A_inf + self.b_inf) + A = self.pars["A_inf"] + b = self.pars["b_inf"] + X2 = anp.square(X) + Ap = self.pars["Ap_inf"] + y = anp.matmul(X2, Ap) + anp.matmul(X, A) + b if not return_std: return y - # flat uncertainty - y_unc = self.u_inf[0,:] # input-dependent uncertainty - y_eps = np.exp(X @ self.A_eps + y_unc) - return y, y_eps + log_inv_alpha = anp.matmul(X, self.pars["log_inv_alpha_prime"]) + self.pars["log_inv_alpha"] + sqrt_inv_alpha = anp.exp(0.5*log_inv_alpha) + return y, sqrt_inv_alpha + @staticmethod + def get_pars(x: np.ndarray, structure: Dict[str, Tuple[int, int]]) -> Dict[str, np.ndarray]: + pars = dict() + s = 0 + for key, value in structure.items(): + n = value[0]*value[1] + e = s + n + pars[key] = x[s:e].reshape(value) + s += n + return pars + @staticmethod + def get_pars_size(structure: Dict[str, Tuple[int, int]]) -> int: + size = [value[0]*value[1] for _, value in structure.items()] + return reduce(lambda x, y: x*y, size) + + @staticmethod + def get_pars_init(structure: Dict[str, Tuple[int, int]], init_method: Optional[Dict[str, Any]]=dict()) -> int: + init = {key: np.zeros((value[0], value[1])).reshape(-1) if not key in init_method + else init_method[key](value).reshape(-1) + for key, value in structure.items()} + return np.concatenate(list(init.values()), axis=0) + + def nll(self, X: np.ndarray, Y: Optional[np.ndarray]=None, pars: Optional[Dict[str, np.ndarray]]=None) -> np.ndarray: + """ + To estimate p(M|X) = int_Y p(M|X,Y)p(Y) dY, we assume p(Y) is Normal(mean(X), std(X)). + p(M|X,Y=Normal(mu(X), var(X))) = 1/2b exp(-(mu(X)-mu(X))/b) = 1/2b. 
+        -log p = log(2) + log(b)
+        We return -log [p(M|X, Y=mu(X))/p(M|X_train, Y_train)] = -log p(M|X, Y=mu(X)) + log p(M|X_train, Y_train)
+        The negative log-likelihood allows one to compare how well new data is described by the model, relative to the training data.
+
+        Args:
+          X: The input data.
+          Y: The true result. If None, set it to the expectation.
+
+        Returns: negative log likelihood.
+        """
+        if pars is None:
+            pars = self.pars
+
+        A = pars["A_inf"]
+        b = pars["b_inf"]
+        X2 = anp.square(X)
+        Ap = pars["Ap_inf"]
+        Y_pred = anp.matmul(X2, Ap) + anp.matmul(X, A) + b
+
+        log_inv_alpha = anp.matmul(X, pars["log_inv_alpha_prime"]) + pars["log_inv_alpha"]
+        sqrt_alpha = anp.exp(-0.5*log_inv_alpha)
+        #log_inv_tau1 = pars["log_inv_tau1"]
+        #tau1 = anp.exp(-log_inv_tau1)
+        #log_inv_tau2 = pars["log_inv_tau2"]
+        #tau2 = anp.exp(-log_inv_tau2)
+        if Y is None:
+            Y = self.predict(X)
+        # Laplace likelihood: p(D|x, y, pars) = 1/(2b) exp(-|Y_pred - y|/b), with b = exp(0.5*log_inv_alpha)
+        # the scale b depends on the input x, so the aleatoric uncertainty is data-dependent
+        L = anp.sum((anp.fabs(Y_pred - Y))*sqrt_alpha + 0.5*log_inv_alpha, axis=1)
+        # prior p(A_inf) p(b_inf) = p(A_inf) cte. (assume all b equally likely)
+        # A has normal prior with var = 1/tau
+        # 1/sqrt(2pi)1/sqrt(sigma**2) exp(-0.5 A**2/sigma**2)
+        # log p = -0.5 A**2/sigma**2 - 0.5 log sigma**2 - 0.5 log 2pi
+        # - log p = 0.5 A**2/sigma**2 + 0.5 log sigma**2 + 0.5 log 2pi
+        # 0.5 log sigma**2 = 0.5 log 1/tau
+        #L_prior = anp.sum(0.5*anp.square(pars["A_inf"].ravel())*tau.ravel() + 0.5*log_inv_tau.ravel())
+        #L_prior = (anp.sum(0.5*anp.square(A.ravel())*tau1 + 0.5*log_inv_tau1)
+        #           + anp.sum(0.5*anp.square(Ap.ravel())*tau2 + 0.5*log_inv_tau2)
+        #           )
+        #alpha = 2.0
+        #beta = 2.0
+        #L_hyper = anp.sum((alpha - 1)*log_inv_tau1 + beta*tau1
+        #                  + (alpha - 1)*log_inv_tau2 + beta*tau2
+        #                  )
+        return L
 
 def matching_ids(a: np.ndarray, b: np.ndarray, c: np.ndarray) -> np.ndarray:
     """Returns list of train IDs common to sets a, b and c."""
@@ -378,8 +454,6 @@ class DataHolder(TransformerMixin, BaseEstimator):
         """
         return X
 
-
-
 class SelectRelevantLowResolution(TransformerMixin, BaseEstimator):
     """
     Select only relevant entries in the low-resolution data.
@@ -390,16 +464,19 @@ class SelectRelevantLowResolution(TransformerMixin, BaseEstimator):
         Set to None to perform no selection
         delta_tof: Number of components to take from the low-resolution spectrometer.
         Set to None to perform no selection.
+        poly: Whether to output a polynomial expansion of the low-resolution data.
""" def __init__(self, channels:List[str]=[f"channel_{j}_{k}" for j, k in product(range(1, 5), ["A", "B", "C", "D"])], tof_start: Optional[int]=None, delta_tof: Optional[int]=300, + poly: bool=False ): self.channels = channels self.tof_start = tof_start self.delta_tof = delta_tof + self.poly = poly self.mean = dict() self.std = dict() @@ -423,7 +500,10 @@ class SelectRelevantLowResolution(TransformerMixin, BaseEstimator): last = min(X[self.channels[0]].shape[1], self.tof_start + self.delta_tof) y = {channel: item[:, first:last] for channel, item in X.items()} if not keep_dictionary_structure: - return np.concatenate(list(y.values()), axis=1) + selected = list(y.values()) + if self.poly: + selected += [np.sqrt(np.fabs(v)) for v in y.values()] + return np.concatenate(selected, axis=1) return y def estimate_prompt_peak(self, X: Dict[str, np.ndarray]) -> int: @@ -507,9 +587,9 @@ class SelectRelevantLowResolution(TransformerMixin, BaseEstimator): plt.savefig(filename) plt.close(fig) -def _fit_estimator(estimator, X: np.ndarray, y: np.ndarray): +def _fit_estimator(estimator, X: np.ndarray, y: np.ndarray, w: np.ndarray): estimator = clone(estimator) - estimator.fit(X, y) + estimator.fit(X, y, w) return estimator class MultiOutputWithStd(MetaEstimatorMixin, BaseEstimator): @@ -518,7 +598,7 @@ class MultiOutputWithStd(MetaEstimatorMixin, BaseEstimator): self.estimator = estimator self.n_jobs = n_jobs - def fit(self, X: np.ndarray, y: np.ndarray): + def fit(self, X: np.ndarray, y: np.ndarray, weights: Optional[np.ndarray]=None): """Fit the model to data, separately for each output variable. Args: @@ -535,10 +615,12 @@ class MultiOutputWithStd(MetaEstimatorMixin, BaseEstimator): "y must have at least two dimensions for " "multi-output regression but has only one." ) + if weights is None: + weights = np.ones(y.shape[0]) self.estimators_ = Parallel(n_jobs=self.n_jobs)( delayed(_fit_estimator)( - self.estimator, X, y[:, i] + self.estimator, X, y[:, i], weights ) for i in range(y.shape[1]) ) @@ -565,7 +647,7 @@ class MultiOutputWithStd(MetaEstimatorMixin, BaseEstimator): return np.asarray(y).T, np.asarray(unc).T return np.asarray(y).T - + class Model(TransformerMixin, BaseEstimator): """ Object representing a previous fit of the model to be used to predict high-resolution @@ -584,28 +666,33 @@ class Model(TransformerMixin, BaseEstimator): validation and systematic uncertainty estimate. n_nonlinear_kernel: Number of nonlinear kernel components added at the preprocessing stage to obtain nonlinearities as an input and improve the prediction. + poly: Whether to use a polynomial expantion of the low-resolution data. 
""" def __init__(self, channels:List[str]=[f"channel_{j}_{k}" for j, k in product(range(1, 5), ["A", "B", "C", "D"])], - n_pca_lr: int=600, - n_pca_hr: int=40, + n_pca_lr: int=400, + n_pca_hr: int=20, high_res_sigma: float=0.2, tof_start: Optional[int]=None, delta_tof: Optional[int]=300, validation_size: float=0.05, - n_nonlinear_kernel: int=0): + n_nonlinear_kernel: int=0, + poly: bool=False, + ): # models + self.x_select = SelectRelevantLowResolution(channels, tof_start, delta_tof, poly=poly) x_model_steps = list() - x_model_steps += [('select', SelectRelevantLowResolution(channels, tof_start, delta_tof))] + self.n_nonlinear_kernel = n_nonlinear_kernel if n_nonlinear_kernel > 0: + # Kernel PCA using Nystroem x_model_steps += [('fex', Pipeline([('prepca', PCA(n_pca_lr, whiten=True)), ('nystroem', Nystroem(n_components=n_nonlinear_kernel, kernel='rbf', gamma=None, n_jobs=8)), - ]))] + ])), + ] x_model_steps += [ ('pca', PCA(n_pca_lr, whiten=True)), - #('pca', TruncatedSVD(n_pca_lr)), ('unc', UncertaintyHolder()), ] self.x_model = Pipeline(x_model_steps) @@ -614,23 +701,27 @@ class Model(TransformerMixin, BaseEstimator): ('pca', PCA(n_pca_hr, whiten=True)), ('unc', UncertaintyHolder()), ]) - self.fit_model = Pipeline([ - #('fit', MultiOutputWithStd(ARDRegression(n_iter=30, tol=1e-6, verbose=True), n_jobs=10)), - ('fit', FitModel()), - ]) - - self.channel_mean = {ch: np.nan for ch in channels} - self.channel_pca = {ch: PCA(n_pca_lr, whiten=True) + self.ood = {ch: IsolationForest() + for ch in channels+['full']} + #self.fit_model = MultiOutputWithStd(ARDRegression(n_iter=300, tol=1e-8, verbose=True), n_jobs=8) + self.fit_model = MultiOutputWithStd(BayesianRidge(n_iter=300, tol=1e-8, verbose=True), n_jobs=8) + #self.fit_model = FitModel() + + self.mu_intensity = np.nan + self.std_intensity = np.nan + + # we are reducing per channel from 2*delta_tof to delta_tof to check correlations + n_pca_lr_per_channel = delta_tof + self.channel_pca = {ch: PCA(n_pca_lr_per_channel, whiten=True) for ch in channels} - - #self.channel_relevance = {ch: np.nan for ch in channels} + #self.channel_fit_model = {ch: FitModel() for ch in channels} # size of the test subset self.validation_size = validation_size def get_channels(self) -> List[str]: """Get channels used in training.""" - return self.x_model.named_steps["select"].channels + return self.x_select.channels def get_energy_values(self) -> np.ndarray: """Get x-axis of high-resolution data.""" @@ -638,8 +729,8 @@ class Model(TransformerMixin, BaseEstimator): def get_low_resolution_range(self) -> Tuple[int, int]: """Get bin number with first and last relevant bins in the low-resolution spectrum.""" - first = (self.x_model.named_steps['select'].tof_start - self.x_model.named_steps['select'].delta_tof) - last = (self.x_model.named_steps['select'].tof_start + self.x_model.named_steps['select'].delta_tof) + first = (self.x_select.tof_start - self.x_select.delta_tof) + last = (self.x_select.tof_start + self.x_select.delta_tof) return first, last def debug_peak_finding(self, low_res_data: Dict[str, np.ndarray], filename: str): @@ -651,7 +742,7 @@ class Model(TransformerMixin, BaseEstimator): filename: The file name where to save the plot. """ - self.x_model['select'].debug_peak_finding(low_res_data, filename) + self.x_select.debug_peak_finding(low_res_data, filename) def preprocess_high_res(self, high_res_data: np.ndarray) -> np.ndarray: """ @@ -663,8 +754,30 @@ class Model(TransformerMixin, BaseEstimator): Returns: Smoothened spectrum. 
""" return self.y_model['smoothen'].transform(high_res_data) - - def fit(self, low_res_data: Dict[str, np.ndarray], high_res_data: np.ndarray, high_res_photon_energy: np.ndarray) -> np.ndarray: + + def uniformize(self, intensity: np.ndarray) -> np.ndarray: + """ + Calculate weights to uniformize data in variable intensity. + + Args: + intensity: The variable to uniformize the weights on. + + Return: weights. + """ + kde = KernelDensity() + kde.fit(intensity) + q = np.quantile(intensity, [0.10, 0.90]) + l, h = q[0], q[1] + x = intensity*((intensity > l) & (intensity < h)) + l*(intensity <= l) + h*(intensity >= h) + log_prob = kde.score_samples(x) + w = np.exp(-log_prob) + w = w/np.median(w) + return w + + def fit(self, low_res_data: Dict[str, np.ndarray], + high_res_data: np.ndarray, high_res_photon_energy: np.ndarray, + weights: Optional[np.ndarray]=None + ) -> np.ndarray: """ Train the model. @@ -679,13 +792,24 @@ class Model(TransformerMixin, BaseEstimator): Returns: Smoothened high resolution spectrum. """ + if weights is None: + weights = np.ones(high_Res_data.shape[0]) print("Fitting PCA on low-resolution data.") - x_t = self.x_model.fit_transform(low_res_data) + low_res_select = self.x_select.fit_transform(low_res_data) + n_components = min(self.x_model["pca"].n_components, low_res_select.shape[0]) + self.x_model.set_params(pca__n_components=n_components) + x_t = self.x_model.fit_transform(low_res_select) print("Fitting PCA on high-resolution data.") y_t = self.y_model.fit_transform(high_res_data, smoothen__energy=high_res_photon_energy) + intensity = np.sum(y_t, axis=1) + self.mu_intensity = np.mean(intensity, axis=0) + self.sigma_intensity = np.mean(intensity, axis=0) #self.fit_model.set_params(fex__gamma=1.0/float(x_t.shape[0])) + print("Fitting outlier detection") + self.ood['full'].fit(x_t) + inliers = self.ood['full'].predict(x_t) > 0.0 print("Fitting model.") - self.fit_model.fit(x_t, y_t) + self.fit_model.fit(x_t[inliers], y_t[inliers], weights[inliers]) # calculate the effect of the PCA print("Calculate PCA unc. on high-resolution data.") @@ -695,55 +819,20 @@ class Model(TransformerMixin, BaseEstimator): high_pca_unc = np.sqrt(np.mean((high_res - high_pca_rec)**2, axis=0, keepdims=True)) self.y_model['unc'].set_uncertainty(high_pca_unc) - print("Calculate PCA unc. 
on low-resolution data.") - low_res = self.x_model['select'].transform(low_res_data) - pca_model = self.x_model['pca'] - if 'fex' in self.x_model.named_steps: - pca_model = self.x_model['fex'].named_steps['prepca'] - low_pca = pca_model.transform(low_res) - low_pca_rec = pca_model.inverse_transform(low_pca) - low_pca_unc = np.mean(np.sqrt(np.mean((low_res - low_pca_rec)**2, axis=1, keepdims=True)), axis=0, keepdims=True) - self.x_model['unc'].set_uncertainty(low_pca_unc) - - # for consistency check - self.x_pca_mean = np.mean(low_res - low_pca_rec, axis=0, keepdims=True) - self.x_pca_std = np.std(low_res - low_pca_rec, axis=0, keepdims=True) - # for consistency check per channel - selection_model = self.x_model['select'] + selection_model = self.x_select low_res_selected = selection_model.transform(low_res_data, keep_dictionary_structure=True) for channel in self.get_channels(): print(f"Calculate PCA on {channel}") low_pca = self.channel_pca[channel].fit_transform(low_res_selected[channel]) + self.ood[channel].fit(low_pca) + #low_pca_rec = self.channel_pca[channel].inverse_transform(low_pca) + #self.channel_fit_model[channel].fit(low_pca, y_t) + print("End of fit.") return high_res - #def get_channel_quality(self, channel: str, low_res_data: Dict[str, np.ndarray], - # pca_model: PCA, - # channel_relevance: Dict[str, float], - # selection_model: SelectRelevantLowResolution, - # channel_mean: Dict[str, np.ndarray]) -> float: - # """ - # Get the compatibility for a single channel. - # Args: - # channel: The channel. - # low_res: The data in a dictionary. - # pca_model: The PCA model. - # Returns: the compatibility factor. - # """ - # # freeze input data in one channel only - # low_res_data_frozen = {ch: low_res_data[ch] if ch != channel - # else np.repeat(channel_mean[channel], low_res_data[ch].shape[0], axis=0) - # for ch in low_res_data.keys()} - # low_res_selected = selection_model.transform(low_res_data_frozen) - # low_pca = pca_model.transform(low_res_selected) - # low_pca_rec = pca_model.inverse_transform(low_pca) - # low_pca_unc = channel_relevance[channel] - # low_pca_dev = np.sqrt(np.mean((low_res_selected - low_pca_rec)**2, axis=1, keepdims=True)) - # quality = low_pca_dev/low_pca_unc - # return quality - def check_compatibility_per_channel(self, low_res_data: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: """ Check if a new low-resolution data source is compatible with the one used in training, by @@ -752,89 +841,54 @@ class Model(TransformerMixin, BaseEstimator): Args: low_res_data: Low resolution data as in the fit step with shape (train_id, channel, ToF channel). - Returns: Ratio of root-mean-squared-error of the data reconstruction using the existing PCA model and the one from the original model per channel. + Returns: Outlier score. If it is bigger than 0, this is an outlier. 
""" - selection_model = self.x_model['select'] + selection_model = self.x_select quality = {channel: 0.0 for channel in low_res_data.keys()} channels = list(low_res_data.keys()) # check if each channel is close to the mean low_res_selected = selection_model.transform(low_res_data, keep_dictionary_structure=True) - # decorrelate it - deviation = {ch: self.channel_pca[ch].transform(low_res_selected[ch]) - for ch in channels} - # just whiten it - #deviation = {ch: (low_res_selected[ch] - selection_model.mean[ch])/selection_model.std[ch] - # for ch in channels} - ndof = {ch: float(deviation[ch].shape[1] - 1) - for ch in channels} - chi2 = {ch: np.sum(deviation[ch]**2, axis=1, keepdims=True) - for ch in channels} - chi2_mu = {ch: scipy.stats.chi2.mean(ndof[ch]) + low_pca = {ch: self.channel_pca[ch].transform(low_res_selected[ch]) for ch in channels} - chi2_sigma = {ch: scipy.stats.chi2.std(ndof[ch]) - for ch in channels} - return {ch: (chi2[ch] - chi2_mu[ch])/chi2_sigma[ch] - for ch in channels} - - # checking channel relevance - #pca_model = self.x_model['pca'] - #if 'fex' in self.x_model.named_steps: - # pca_model = self.x_model['fex'].named_steps['prepca'] - ##with mp.Pool(len(low_res_data.keys())) as p: - #values = map(partial(self.get_channel_quality, - # low_res_data=low_res_data, - # pca_model=pca_model, - # channel_relevance=self.channel_relevance, - # selection_model=selection_model, - # channel_mean=self.channel_mean - # ), channels) - #quality = dict(zip(channels, values)) - #return quality + return {ch: self.ood[ch].predict(low_pca[ch]) for ch in channels} + + ## for chi2 + #deviation = {ch: low_pca[ch] + # for ch in channels} + #ndof = {ch: float(deviation[ch].shape[1] - 1) + # for ch in channels} + #chi2 = {ch: np.sum(deviation[ch]**2, axis=1, keepdims=True) + # for ch in channels} + #chi2_mu = {ch: scipy.stats.chi2.mean(ndof[ch]) + # for ch in channels} + #chi2_sigma = {ch: scipy.stats.chi2.std(ndof[ch]) + # for ch in channels} + #return {ch: (chi2[ch] - chi2_mu[ch])/chi2_sigma[ch] + # for ch in channels} def check_compatibility(self, low_res_data: Dict[str, np.ndarray]) -> np.ndarray: """ Check if a new low-resolution data source is compatible with the one used in training, by - comparing the effect of the trained PCA model on it. + using a robust covariance matrix estimate of the data Args: low_res_data: Low resolution data as in the fit step with shape (train_id, channel, ToF channel). - Returns: Ratio of root-mean-squared-error of the data reconstruction using the existing PCA model and the one from the original model. + Returns: An outlier score: if it is greater than 0, this is an outlier. 
""" - low_res = self.x_model['select'].transform(low_res_data) - pca_model = self.x_model['pca'] - if 'fex' in self.x_model.named_steps: - pca_model = self.x_model['fex'].named_steps['prepca'] + low_res = self.x_select.transform(low_res_data) + pca_model = self.x_model + #pca_model = self.x_model['pca'] + #if 'fex' in self.x_model.named_steps: + # pca_model = self.x_model['fex'].named_steps['prepca'] low_pca = pca_model.transform(low_res) - low_pca_rec = pca_model.inverse_transform(low_pca) - - deviation = (low_pca_rec - low_res - self.x_pca_mean)/self.x_pca_std - ndof = float(deviation.shape[1] - 1) - chi2 = np.sum(deviation**2, axis=1, keepdims=True) - chi2_mu = scipy.stats.chi2.mean(ndof) - chi2_sigma = scipy.stats.chi2.std(ndof) - return (chi2 - chi2_mu)/chi2_sigma - - #low_pca_unc = self.x_model['unc'].uncertainty() - - #fig = plt.figure(figsize=(8, 16)) - #ax = plt.gca() - #ax.plot(low_res[0,...], - # c="b", - # label="LR") - #ax.plot(low_pca_rec[0,...], - # c="r", - # label="LR rec.") - #ax.set(title="", - # xlabel="Photon Spectrometer channel", - # ylabel="Low resolution spectrometer intensity") - #ax.legend() - #plt.savefig("check.png") - #plt.close(fig) - - #low_pca_dev = np.sqrt(np.mean((low_res - low_pca_rec)**2, axis=1, keepdims=True)) - #return low_pca_dev/low_pca_unc - + return self.ood['full'].predict(low_pca) + #deviation = low_pca + #ndof = float(deviation.shape[1] - 1) + #chi2 = np.sum(deviation**2, axis=1, keepdims=True) + #chi2_mu = scipy.stats.chi2.mean(ndof) + #chi2_sigma = scipy.stats.chi2.std(ndof) + #return (chi2 - chi2_mu)/chi2_sigma def predict(self, low_res_data: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: """ @@ -848,20 +902,33 @@ class Model(TransformerMixin, BaseEstimator): the expected prediction in key "expected", the stat. uncertainty in key "unc" and a (1, energy channel) array for the PCA syst. uncertainty in key "pca". """ - low_pca = self.x_model.transform(low_res_data) + low_res_pre = self.x_select.transform(low_res_data) + low_pca = self.x_model.transform(low_res_pre) high_pca, high_pca_unc = self.fit_model.predict(low_pca, return_std=True) + intensity = np.sum(high_pca, axis=1, keepdims=True) + Z_intensity = (intensity - self.mu_intensity)/self.sigma_intensity #high_pca = self.fit_model.predict(low_pca) #high_pca_unc = 0 n_trains = high_pca.shape[0] + # Note: The whiten=True setting in the PCA model leads to an affine transformation pca_y = np.concatenate((high_pca, - high_pca + high_pca_unc), + high_pca + high_pca_unc, + ), axis=0) high_res_predicted = self.y_model["pca"].inverse_transform(pca_y) expected = high_res_predicted[:n_trains, :] - unc = high_res_predicted[n_trains:, :] - expected + e_plus = high_res_predicted[n_trains:(2*n_trains), :] + unc = np.fabs(e_plus - expected) + pca_unc = self.y_model['unc'].uncertainty() + total_unc = np.sqrt(pca_unc**2 + unc**2) + return dict(expected=expected, unc=unc, - pca=self.y_model['unc'].uncertainty()) + pca=pca_unc, + total_unc=total_unc, + inlier=self.ood['full'].predict(low_pca), + Z_intensity=Z_intensity + ) def save(self, filename: str): """ @@ -870,14 +937,15 @@ class Model(TransformerMixin, BaseEstimator): Args: filename: File name where to save this. 
""" - joblib.dump([self.x_model, + joblib.dump([self.x_select, + self.x_model, self.y_model, self.fit_model, - DataHolder(self.channel_mean), - DataHolder(self.x_pca_mean), - DataHolder(self.x_pca_std), self.channel_pca, - #DataHolder(self.channel_relevance) + #self.channel_fit_model + DataHolder(self.mu_intensity), + DataHolder(self.sigma_intensity), + self.ood ], filename, compress='zlib') @staticmethod @@ -890,17 +958,23 @@ class Model(TransformerMixin, BaseEstimator): Returns: A new model object. """ - (x_model, y_model, fit_model, - channel_mean, x_pca_mean, x_pca_std, - channel_pca) = joblib.load(filename) + (x_select, + x_model, y_model, fit_model, + channel_pca, + #channel_fit_model + mu_intensity, + sigma_intensity, + ood + ) = joblib.load(filename) obj = Model() + obj.x_select = x_select obj.x_model = x_model obj.y_model = y_model obj.fit_model = fit_model - obj.channel_mean = channel_mean.get_data() - obj.x_pca_mean = x_pca_mean.get_data() - obj.x_pca_std = x_pca_std.get_data() obj.channel_pca = channel_pca - #obj.channel_relevance = channel_relevance.get_data() + #obj.channel_fit_model = channel_fit_model + obj.ood = ood + obj.mu_intensity = mu_intensity.get_data() + obj.sigma_intensity = sigma_intensity.get_data() return obj diff --git a/pes_to_spec/test/offline_analysis.py b/pes_to_spec/test/offline_analysis.py index ac12ba5..790a5b3 100755 --- a/pes_to_spec/test/offline_analysis.py +++ b/pes_to_spec/test/offline_analysis.py @@ -9,7 +9,7 @@ import argparse import numpy as np from extra_data import open_run, by_id, RunDirectory -from pes_to_spec.model import Model, matching_two_ids +from pes_to_spec.model import Model, matching_ids from itertools import product @@ -19,6 +19,7 @@ matplotlib.use('Agg') import matplotlib.pyplot as plt from matplotlib.gridspec import GridSpec from mpl_toolkits.axes_grid.inset_locator import InsetPosition +import seaborn as sns from typing import Dict, Optional @@ -82,8 +83,8 @@ def plot_result(filename: str, fig = plt.figure(figsize=(12, 8)) gs = GridSpec(1, 1) ax = fig.add_subplot(gs[0, 0]) - unc_stat = np.mean(spec_pred["unc"]) - unc_pca = np.mean(spec_pred["pca"]) + unc_stat = spec_pred["unc"] + unc_pca = spec_pred["pca"] unc = np.sqrt(unc_stat**2 + unc_pca**2) ax.plot(spec_raw_pe, spec_smooth, c='b', lw=3, label="High-res. measurement (smoothened)") ax.plot(spec_raw_pe, spec_pred["expected"], c='r', ls='--', lw=3, label="High-res. 
prediction") @@ -134,6 +135,8 @@ def main(): parser.add_argument('-P', '--pes', type=str, metavar='NAME', default="SA3_XTD10_PES/ADC/1:network", help='PES name') parser.add_argument('-X', '--xgm', type=str, metavar='NAME', default="SA3_XTD10_XGM/XGM/DOOCS:output", help='XGM name') parser.add_argument('-o', '--offset', type=int, metavar='INT', default=0, help='Train ID offset') + parser.add_argument('-c', '--xgm_cut', type=float, metavar='INTENSITY', default=0, help='XGM intensity threshold in uJ.') + parser.add_argument('-e', '--poly', action="store_true", default=False, help='Wheteher to expand PES data in higher order polynomials.') args = parser.parse_args() @@ -157,16 +160,16 @@ def main(): spec_offset = args.offset spec_name = args.spec pes_name = args.pes - #xgm_name = args.xgm + xgm_name = args.xgm pes_tid = run[pes_name, "digitizers.trainId"].ndarray() - #xgm_tid = run[xgm_name, "data.trainId"].ndarray() + xgm_tid = run[xgm_name, "data.trainId"].ndarray() if len(args.model) == 0: spec_tid = spec_offset + run[spec_name, "data.trainId"].ndarray() # these are the train ID intersection # this could have been done by a select call in the RunDirectory, but it would not correct for the spec_offset - tids = matching_two_ids(spec_tid, pes_tid) + tids = matching_ids(spec_tid, pes_tid, xgm_tid) # read the spec photon energy and intensity spec_raw_pe = run[spec_name, "data.photonEnergy"].select_trains(by_id[tids - spec_offset]).ndarray() @@ -194,22 +197,29 @@ def main(): #xgm_pe = run['SA3_XTD10_XGM/XGM/DOOCS:output', "data.intensitySa3TD"].select_trains(by_id[tids]).ndarray() #retvol_raw = run["SA3_XTD10_PES/MDL/DAQ_MPOD", "u212.value"].select_trains(by_id[tids]).ndarray() #retvol_raw_timestamp = run["SA3_XTD10_PES/MDL/DAQ_MPOD", "u212.timestamp"].select_trains(by_id[tids]).ndarray() + + xgm_flux = run['SA3_XTD10_XGM/XGM/DOOCS:output', "data.intensitySa3TD"].select_trains(by_id[tids]).ndarray()[:, 0][:, np.newaxis] + xgm_flux_t = run['SA3_XTD10_XGM/XGM/DOOCS:output', "data.intensitySa3TD"].select_trains(by_id[test_tids]).ndarray()[:, 0][:, np.newaxis] t = list() t_names = list() - model = Model() + model = Model(poly=args.poly) - train_idx = np.isin(tids, train_tids) + train_idx = np.isin(tids, train_tids) & (xgm_flux[:,0] > args.xgm_cut) model.debug_peak_finding(pes_raw, os.path.join(args.directory, "test_peak_finding.png")) if len(args.model) == 0: print("Fitting") start = time_ns() + w = model.uniformize(xgm_flux[train_idx]) + print("w", np.amin(w), np.amax(w), np.median(w)) model.fit({k: v[train_idx, :] for k, v in pes_raw.items()}, spec_raw_int[train_idx, :], - spec_raw_pe[train_idx, :]) + spec_raw_pe[train_idx, :], + w + ) t += [time_ns() - start] t_names += ["Fit"] @@ -231,10 +241,10 @@ def main(): print("Check consistency") start = time_ns() - rmse = model.check_compatibility(pes_raw_t) - print("Consistency check RMSE ratios:", rmse) - rmse = model.check_compatibility_per_channel(pes_raw_t) - print("Consistency per channel check (chi2 - icdf(p=0.05))/ndof:", rmse) + Z = model.check_compatibility(pes_raw_t) + print("Consistency check:", Z) + Z = model.check_compatibility_per_channel(pes_raw_t) + print("Consistency per channel:", Z) t += [time_ns() - start] t_names += ["Consistency"] @@ -255,6 +265,147 @@ def main(): if len(args.model) == 0: showSpec = True spec_smooth = model.preprocess_high_res(spec_raw_int) + + # chi2 w.r.t XGM intensity + erange = spec_raw_pe[0,-1] - spec_raw_pe[0,0] + de = (spec_raw_pe[0,1] - spec_raw_pe[0,0]) + chi2 = np.sum((spec_smooth - 
spec_pred["expected"])**2/(spec_pred["total_unc"]**2), axis=1) + ndof = float(spec_smooth.shape[1]) - 1.0 + fig = plt.figure(figsize=(12, 8)) + gs = GridSpec(1, 1) + ax = fig.add_subplot(gs[0, 0]) + ax.scatter(chi2/ndof, xgm_flux[:,0], c='r', s=20) + ax.set(title=f"", #avg(stat unc) = {unc_stat}, avg(pca unc) = {unc_pca}", + xlabel=r"$\chi^2/$ndof", + ylabel="XGM intensity [uJ]", + xlim=(0, 5), + ) + ax2 = plt.axes([0,0,1,1]) + # Manually set the position and relative size of the inset axes within ax1 + ip = InsetPosition(ax, [0.65,0.6,0.35,0.4]) + ax2.set_axes_locator(ip) + ax2.scatter(chi2/ndof, xgm_flux[:,0], c='r', s=30) + #ax2.scatter(chi2/ndof, np.sum(spec_pred["expected"], axis=1)*de, c='b', s=30) + #ax2.scatter(chi2/ndof, np.sum(spec_raw_int, axis=1)*de, c='g', s=30) + ax2.set(title="", + xlabel=r"$\chi^2/$ndof", + ylabel=f"XGM intensity [uJ]", + ) + ax2.title.set_size(SMALL_SIZE) + ax2.xaxis.label.set_size(SMALL_SIZE) + ax2.yaxis.label.set_size(SMALL_SIZE) + ax2.tick_params(axis='both', which='major', labelsize=SMALL_SIZE) + fig.savefig(os.path.join(args.directory, "intensity_vs_chi2.png")) + plt.close(fig) + + fig = plt.figure(figsize=(12, 8)) + gs = GridSpec(1, 1) + ax = fig.add_subplot(gs[0, 0]) + sns.kdeplot(x=chi2/ndof, ax=ax) + ax.set(title=f"", + xlabel=r"$\chi^2/$ndof", + ylabel="Density [a.u.]", + xlim=(0, 5), + ) + ax.text(0.90, 0.95, fr"$\mu = ${np.mean(chi2/ndof):.2f}", + verticalalignment='top', horizontalalignment='right', + transform=ax.transAxes, + color='black', fontsize=15) + ax.text(0.90, 0.90, fr"$\sigma = ${np.std(chi2/ndof):.2f}", + verticalalignment='top', horizontalalignment='right', + transform=ax.transAxes, + color='black', fontsize=15) + fig.savefig(os.path.join(args.directory, "chi2.png")) + plt.close(fig) + + fig = plt.figure(figsize=(12, 8)) + gs = GridSpec(1, 1) + ax = fig.add_subplot(gs[0, 0]) + sns.kdeplot(x=xgm_flux[:,0], ax=ax) + ax.set(title=f"", + xlabel="XGM intensity [uJ]", + ylabel="Density [a.u.]", + ) + ax.text(0.90, 0.95, fr"$\mu = ${np.mean(xgm_flux[:,0]):.2f}", + verticalalignment='top', horizontalalignment='right', + transform=ax.transAxes, + color='black', fontsize=15) + ax.text(0.90, 0.90, fr"$\sigma = ${np.std(xgm_flux[:,0]):.2f}", + verticalalignment='top', horizontalalignment='right', + transform=ax.transAxes, + color='black', fontsize=15) + fig.savefig(os.path.join(args.directory, "intensity.png")) + plt.close(fig) + + # rmse + rmse = np.sqrt(np.mean((spec_smooth - spec_pred["expected"])**2, axis=1)) + fig = plt.figure(figsize=(12, 8)) + gs = GridSpec(1, 1) + ax = fig.add_subplot(gs[0, 0]) + ax.scatter(rmse, xgm_flux[:,0], c='r', s=30) + ax = plt.gca() + ax.set(title=f"", + xlabel=r"Root-mean-squared error", + ylabel="XGM intensity [uJ]", + ) + fig.savefig(os.path.join(args.directory, "intensity_vs_rmse.png")) + plt.close(fig) + + fig = plt.figure(figsize=(12, 8)) + gs = GridSpec(1, 1) + ax = fig.add_subplot(gs[0, 0]) + sns.kdeplot(x=rmse, ax=ax) + ax.set(title=f"", + xlabel="Root-mean-squared error", + ylabel="Density [a.u.]", + xlim=(0, 20), + ) + ax.text(0.90, 0.95, fr"$\mu = ${np.mean(rmse):.2f}", + verticalalignment='top', horizontalalignment='right', + transform=ax.transAxes, + color='black', fontsize=15) + ax.text(0.90, 0.90, fr"$\sigma = ${np.std(rmse):.2f}", + verticalalignment='top', horizontalalignment='right', + transform=ax.transAxes, + color='black', fontsize=15) + fig.savefig(os.path.join(args.directory, "rmse.png")) + plt.close(fig) + + # SPEC integral w.r.t XGM intensity + fig = plt.figure(figsize=(12, 8)) + gs = 
GridSpec(1, 1) + ax = fig.add_subplot(gs[0, 0]) + sns.regplot(x=np.sum(spec_raw_int, axis=1)*de, y=xgm_flux[:,0], color='r', robust=True, ax=ax) + ax.set(title=f"", + xlabel="SPEC (raw) integral", + ylabel="XGM Intensity [uJ]", + ) + fig.savefig(os.path.join(args.directory, "xgm_vs_intensity.png")) + plt.close(fig) + + # SPEC integral w.r.t XGM intensity + fig = plt.figure(figsize=(12, 8)) + gs = GridSpec(1, 1) + ax = fig.add_subplot(gs[0, 0]) + sns.regplot(x=np.sum(spec_raw_int, axis=1)*de, y=np.sum(spec_pred["expected"], axis=1)*de, color='r', robust=True, ax=ax) + ax.set(title=f"", + xlabel="SPEC (raw) integral", + ylabel="Predicted integral", + ) + fig.savefig(os.path.join(args.directory, "expected_vs_intensity.png")) + plt.close(fig) + + fig = plt.figure(figsize=(12, 8)) + gs = GridSpec(1, 1) + ax = fig.add_subplot(gs[0, 0]) + sns.regplot(x=np.sum(spec_pred["expected"], axis=1)*de, y=xgm_flux[:,0], color='r', robust=True, ax=ax) + ax.set(title=f"", + xlabel="Predicted integral", + ylabel="XGM intensity [uJ]", + ) + fig.savefig(os.path.join(args.directory, "xgm_vs_expected.png")) + plt.close(fig) + first, last = model.get_low_resolution_range() first += 10 last -= 10 @@ -269,9 +420,9 @@ def main(): spec_smooth[idx, :] if showSpec else None, spec_raw_pe[idx, :] if showSpec else None, spec_raw_int[idx, :] if showSpec else None, - pes=-pes_raw[pes_to_show][idx, first:last], - pes_to_show=pes_to_show.replace('_', ' '), - pes_bin=np.arange(first, last) + #pes=-pes_raw[pes_to_show][idx, first:last], + #pes_to_show=pes_to_show.replace('_', ' '), + #pes_bin=np.arange(first, last) ) for ch in channels: plot_pes(os.path.join(args.directory, f"test_pes_{tid}_{ch}.png"), diff --git a/pyproject.toml b/pyproject.toml index fce19d2..32d5924 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ ] [project.optional-dependencies] -offline = ["matplotlib", "extra_data"] +offline = ["seaborn", "statsmodels", "matplotlib", "extra_data"] [project.scripts] offline_analysis = "pes_to_spec.test.offline_analysis:main" -- GitLab
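
Illustration (not part of the patch): a minimal, self-contained sketch of the inverse-KDE
reweighting that Model.uniformize implements, followed by a hypothetical usage of the new
API. The gamma-distributed synthetic intensities below are an assumption chosen only for
demonstration; the Model calls mirror the names used in offline_analysis.py above.

    import numpy as np
    from sklearn.neighbors import KernelDensity

    rng = np.random.default_rng(0)
    # fake XGM intensities in uJ, shaped (n_samples, 1) as KernelDensity expects
    intensity = rng.gamma(shape=3.0, scale=200.0, size=(1000, 1))

    kde = KernelDensity().fit(intensity)
    lo, hi = np.quantile(intensity, [0.10, 0.90])
    clipped = np.clip(intensity, lo, hi)       # same effect as the masked sums in uniformize
    w = np.exp(-kde.score_samples(clipped))    # w ~ 1/p(intensity), flattening the density
    w = w/np.median(w)
    # dense intensity regions get w < 1 and rare ones w > 1, so a weighted fit
    # effectively sees an approximately uniform intensity distribution

    # Hypothetical end-to-end usage with the API added by this patch:
    # model = Model(poly=True)
    # w = model.uniformize(xgm_flux[train_idx])
    # model.fit({k: v[train_idx, :] for k, v in pes_raw.items()},
    #           spec_raw_int[train_idx, :], spec_raw_pe[train_idx, :], w)
    # pred = model.predict(pes_raw_t)  # keys: "expected", "unc", "pca",
    #                                  # "total_unc", "inlier" (+1/-1), "Z_intensity"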