diff --git a/pes_to_spec/__init__.py b/pes_to_spec/__init__.py index 081a9d45da3389e4ac5e944c9736a8ba8a8fee76..28b89f0692f8818e9d8f27f564cf3488097134ee 100644 --- a/pes_to_spec/__init__.py +++ b/pes_to_spec/__init__.py @@ -2,4 +2,4 @@ Estimate high-resolution photon spectrometer data from low-resolution non-invasive measurements. """ -VERSION = "0.3.4" +VERSION = "0.3.6" diff --git a/pes_to_spec/model.py b/pes_to_spec/model.py index 02512dce0c401171d7d9cafd4e3d53d31b6930e2..389c0827bc759812fbdc28049f6301b37fc2dcd1 100644 --- a/pes_to_spec/model.py +++ b/pes_to_spec/model.py @@ -7,6 +7,7 @@ import joblib import numpy as np import scipy from scipy.signal import fftconvolve +from sklearn.covariance import EllipticEnvelope from sklearn.decomposition import IncrementalPCA, PCA from sklearn.base import TransformerMixin, BaseEstimator from sklearn.base import OutlierMixin @@ -308,15 +309,20 @@ class SelectRelevantLowResolution(TransformerMixin, BaseEstimator): if self.delta_tof is not None: first = max(0, self.tof_start - self.delta_tof) last = min(X[self.channels[0]].shape[1], self.tof_start + self.delta_tof) - y = {channel: np.stack([item[:, (first + delta):(last + delta)] for delta in pulse_spacing[channel]], axis=1) + y = {channel: [item[:, (first + delta):(last + delta)] for delta in pulse_spacing[channel]] for channel, item in X.items() if channel in self.channels} + # pad it with zeros, if we reach the edge of the array + for channel in y.keys(): + y[channel] = [np.pad(y[channel][j], ((0, 0), (0, 2*self.delta_tof - y[channel][j].shape[1]))) + for j in range(len(y[channel]))] + y[channel] = np.stack(y[channel], axis=1) if not keep_dictionary_structure: selected = list(y.values()) if pulse_energy is not None: - selected += [pulse_energy[:, np.newaxis, :]] + selected += [pulse_energy[:, :, np.newaxis]] if self.poly: - selected += [pulse_energy[:, np.newaxis, :]*v for v in y.values()] + selected += [pulse_energy*v for v in y.values()] return np.concatenate(selected, axis=-1) return y @@ -853,39 +859,55 @@ class Model(TransformerMixin, BaseEstimator): print("Checking data quality in high-resolution data.") peaks = self.count_peaks(high_res_data, high_res_photon_energy) filter_hr = (peaks >= self.n_peaks) - print(f"Selected {np.sum(filter_hr)} of {high_res_data.shape[0]} samples") - print("Fitting PCA on low-resolution data.") + print("Finding region-of-interest") self.x_select.fit(low_res_data) low_res_select = self.x_select.transform(low_res_data, pulse_energy=pulse_energy) # keep the number of pulses B, P, _ = low_res_select.shape low_res_select = low_res_select.reshape((B*P, -1)) + print("Excluding outliers") + lr_sums = np.stack([low_res_select[:, :-1].sum(1), low_res_select[:, -1]], axis=1) + # exclude outliers + cov = EllipticEnvelope(random_state=0).fit(lr_sums) + good_low_res = cov.predict(lr_sums) + filter_lr = (good_low_res > 0) + low_res_filter = low_res_select[filter_hr & filter_lr, :] + high_res_filter = high_res_data[filter_hr & filter_lr, :] + weights_filter = weights + if weights is not None: + weights_filter = weights[filter_hr & filter_lr] + + print(f"Selected {np.sum(filter_hr & filter_lr)} of {high_res_data.shape[0]} samples.") + + print("Fitting PCA on low-resolution data.") # estimate number of PCA components pca_test = PCA(None, whiten=True) - pca_test.fit(low_res_select) + pca_test.fit(low_res_filter) n_components = np.where(np.cumsum(pca_test.explained_variance_ratio_) > self.pca_threshold)[0] if len(n_components) > 0: n_components = n_components[0] + n_components = max(600, n_components) print(f"Using {n_components} comp. for PES PCA.") self.x_model.set_params(pca__n_components=n_components) - x_t = self.x_model.fit_transform(low_res_select) + x_t = self.x_model.fit_transform(low_res_filter) #print("PCA fraction of variance (LR): ", np.cumsum(self.x_model["pca"].explained_variance_ratio_)) print("Fitting PCA on high-resolution data.") # estimate number of PCA components pca_test = PCA(None, whiten=True) - pca_test.fit(high_res_data) + pca_test.fit(high_res_filter) n_components_hr = np.where(np.cumsum(pca_test.explained_variance_ratio_) > self.pca_threshold)[0] if len(n_components_hr) > 0: n_components_hr = n_components_hr[0] + n_components_hr = max(20, n_components_hr) print(f"Using {n_components_hr} comp. for grating spec. PCA.") self.y_model.set_params(pca__n_components=n_components_hr) - y_t = self.y_model.fit_transform(high_res_data, smoothen__energy=high_res_photon_energy) + y_t = self.y_model.fit_transform(high_res_filter, smoothen__energy=high_res_photon_energy) #print("PCA fraction of variance (HR): ", np.cumsum(self.y_model["pca"].explained_variance_ratio_)) @@ -894,13 +916,13 @@ class Model(TransformerMixin, BaseEstimator): inliers = self.ood['full'].predict(x_t) > 0.0 print("Fitting model.") if weights is not None: - weights = weights[inliers & filter_hr] - self.fit_model.fit(x_t[inliers & filter_hr], y_t[inliers & filter_hr], weights) - self.n_obs = len(x_t[inliers & filter_hr]) + weights = weights[inliers] + self.fit_model.fit(x_t[inliers], y_t[inliers], weights) + self.n_obs = len(x_t[inliers]) # calculate the effect of the PCA print("Calculate PCA unc. on high-resolution data.") - high_res = self.y_model['smoothen'].transform(high_res_data) + high_res = self.y_model['smoothen'].transform(high_res_filter) high_pca = self.y_model['pca'].transform(high_res) high_pca_rec = self.y_model['pca'].inverse_transform(high_pca) high_pca_unc = np.sqrt(np.mean((high_res - high_pca_rec)**2, axis=0, keepdims=True)) @@ -915,9 +937,9 @@ class Model(TransformerMixin, BaseEstimator): # n: noise (uncertainty) # e: energy # true signal (as far as we can get -- it is smoothened, but this is the model target) - y = high_res_data[inliers & filter_hr] + y = high_res_filter[inliers] - y_pred, n = self.fit_model.predict(x_t[inliers & filter_hr], return_std=True) + y_pred, n = self.fit_model.predict(x_t[inliers], return_std=True) y_hat = self.y_model['pca'].inverse_transform(y_pred) n = np.sqrt((self.y_model['pca'].inverse_transform(y_pred + n) - y_hat)**2 + high_pca_unc**2)