diff --git a/pes_to_spec/model.py b/pes_to_spec/model.py index 8f19fbf743ab39672f174034f56f9fa7c71deda2..fc087c4dc072435e9502accae00569565b44b91f 100644 --- a/pes_to_spec/model.py +++ b/pes_to_spec/model.py @@ -4,9 +4,8 @@ from autograd import grad import joblib import h5py from scipy.signal import fftconvolve -from scipy.signal.windows import gaussian as gaussian_window from scipy.optimize import fmin_l_bfgs_b -from sklearn.decomposition import PCA +from sklearn.decomposition import PCA, IncrementalPCA from sklearn.model_selection import train_test_split from typing import Any, Dict, List, Optional @@ -25,9 +24,9 @@ class Model(object): channels: Selected channels to use as an input for the low resolution data. n_pca_lr: Number of low-resolution data PCA components. n_pca_hr: Number of high-resolution data PCA components. - high_res_sigma: Resolution of the high-resolution spectrometer. - tof_start: Start looking at this index from the low-resolution spectrometer data. - delta_tof: Number of components to take from the low-resolution spectrometer. + high_res_sigma: Resolution of the high-resolution spectrometer in electron-Volts. + tof_start: Start looking at this index from the low-resolution spectrometer data. Set to None to perform no selection + delta_tof: Number of components to take from the low-resolution spectrometer. Set to None to perform no selection. validation_size: Fraction (number between 0 and 1) of the data to take for validation and systematic uncertainty estimate. """ @@ -41,15 +40,15 @@ class Model(object): n_pca_lr: int=400, n_pca_hr: int=20, high_res_sigma: float=0.2, - tof_start: int=31445, - delta_tof: int=200, + tof_start: Optional[int]=31445, + delta_tof: Optional[int]=200, validation_size: float=0.05): self.channels = channels self.n_pca_lr = n_pca_lr self.n_pca_hr = n_pca_hr # PCA models - self.lr_pca = PCA(n_pca_lr, whiten=True) + self.lr_pca = IncrementalPCA(n_pca_lr, whiten=True, batch_size=n_pca_lr) self.hr_pca = PCA(n_pca_hr, whiten=True) # PCA unc. in high resolution @@ -64,7 +63,6 @@ class Model(object): # where to cut on the ToF PES data self.tof_start = tof_start self.delta_tof = delta_tof - self.tof_end = self.tof_start + self.delta_tof # high-resolution photon energy axis self.high_res_photon_energy: Optional[np.ndarray] = None @@ -82,7 +80,10 @@ class Model(object): Returns: Concatenated and pre-processed low-resolution data of shape (train_id, features). """ - cat = np.concatenate([low_res_data[k][:, self.tof_start:self.tof_end] for k in self.channels], axis=1) + items = [low_res_data[k] for k in self.channels] + if self.tof_start is not None and self.delta_tof is not None: + items = [item[:, self.tof_start:(self.tof_start + self.delta_tof)] for item in items] + cat = np.concatenate(items, axis=1) return cat def preprocess_high_res(self, high_res_data: np.ndarray, high_res_photon_energy: np.ndarray) -> np.ndarray: @@ -117,19 +118,28 @@ class Model(object): self.high_res_photon_energy = high_res_photon_energy + print("Pre-processing low") low_res = self.preprocess_low_res(low_res_data) + print("Pre-processing high") high_res = self.preprocess_high_res(high_res_data, high_res_photon_energy) # fit PCA + print("PCA low") low_pca = self.lr_pca.fit_transform(low_res) + print("PCA high") high_pca = self.hr_pca.fit_transform(high_res) + print("Split") # split in train and test for PCA uncertainty evaluation low_pca_train, low_pca_test, high_pca_train, high_pca_test = train_test_split(low_pca, high_pca, test_size=self.validation_size, random_state=42) # fit the linear model + print("Fit") self.fit_model.fit(low_pca_train, high_pca_train, low_pca_test, high_pca_test) + print("PCA unc") high_pca_rec = self.hr_pca.inverse_transform(high_pca) self.high_pca_unc = np.sqrt(np.mean((high_res - high_pca_rec)**2, axis=0, keepdims=True)) + print("Done") + return high_res def predict(self, low_res_data: Dict[str, np.ndarray]) -> np.ndarray: diff --git a/scripts/test_analysis.py b/scripts/test_analysis.py index 438dedc01462cd88bb7f85886a6280e0146195b8..93351c36067affab19ab5b82190c98a95e8b4281 100755 --- a/scripts/test_analysis.py +++ b/scripts/test_analysis.py @@ -17,6 +17,26 @@ from matplotlib.gridspec import GridSpec from typing import Optional +def plot_pes(filename: str, pes_raw_int: np.ndarray): + """ + Plot low-resolution spectrum. + + Args: + filename: Output file name. + pes_raw_int: Low-resolution spectrum. + + """ + fig = plt.figure(figsize=(16, 8)) + gs = GridSpec(1, 1) + ax = fig.add_subplot(gs[0, 0]) + ax.plot(pes_raw_int, c='b', lw=3, label="Low-resolution measurement") + ax.legend() + ax.set(title=f"", + xlabel="ToF index", + ylabel="Intensity") + fig.savefig(filename) + plt.close(fig) + def plot_result(filename: str, spec_pred: np.ndarray, spec_smooth: np.ndarray, spec_raw_pe: np.ndarray, spec_raw_int: Optional[np.ndarray]=None): """ Plot result with uncertainty band. @@ -33,12 +53,12 @@ def plot_result(filename: str, spec_pred: np.ndarray, spec_smooth: np.ndarray, s gs = GridSpec(1, 1) ax = fig.add_subplot(gs[0, 0]) eps = np.mean(spec_pred[:, 1]) - ax.plot(spec_raw_pe, spec_smooth, c='b', lw=3, label="High resolution measurement (smoothened)") - ax.plot(spec_raw_pe, spec_pred[:, 0], c='r', lw=3, label="High resolution prediction") + ax.plot(spec_raw_pe, spec_smooth, c='b', lw=3, label="High-resolution measurement (smoothened)") + ax.plot(spec_raw_pe, spec_pred[:, 0], c='r', lw=3, label="High-resolution prediction") ax.fill_between(spec_raw_pe, spec_pred[:, 0] - spec_pred[:, 1], spec_pred[:, 0] + spec_pred[:, 1], facecolor='red', alpha=0.6, label="68% unc. (stat.)") ax.fill_between(spec_raw_pe, spec_pred[:, 0] - spec_pred[:, 2], spec_pred[:, 0] + spec_pred[:, 2], facecolor='magenta', alpha=0.6, label="68% unc. (syst., PCA)") if spec_raw_int is not None: - ax.plot(spec_raw_pe, spec_raw_int, c='b', lw=1, ls='--', label="High resolution measurement") + ax.plot(spec_raw_pe, spec_raw_int, c='b', lw=1, ls='--', label="High-resolution measurement") ax.legend() ax.set(title=f"avg(unc) = {eps}", xlabel="Photon energy [eV]", @@ -81,18 +101,34 @@ def main(): #retvol_raw = run["SA3_XTD10_PES/MDL/DAQ_MPOD", "u212.value"].select_trains(by_id[tids]).ndarray() #retvol_raw_timestamp = run["SA3_XTD10_PES/MDL/DAQ_MPOD", "u212.timestamp"].select_trains(by_id[tids]).ndarray() - model = Model() + model = Model(channels=["channel_1_D", + "channel_2_B", + "channel_3_A", + "channel_3_B", + "channel_4_C", + "channel_4_D"], + n_pca_lr=400, + n_pca_hr=20, + high_res_sigma=0.2, + tof_start=None, + delta_tof=None, + validation_size=0.05) + train_idx = np.isin(tids, train_tids) + print("Fitting") model.fit({k: v[train_idx, :] for k, v in pes_raw.items()}, spec_raw_int[train_idx, :], spec_raw_pe[train_idx, :]) spec_smooth = model.preprocess_high_res(spec_raw_int, spec_raw_pe) # test + print("Predict") spec_pred = model.predict(pes_raw) # plot for tid in test_tids: idx = np.where(tid==tids)[0][0] plot_result(f"test_{tid}.png", spec_pred[idx, :, :], spec_smooth[idx, :], spec_raw_pe[idx, :], spec_raw_int[idx, :]) + for ch in channels: + plot_pes(f"test_pes_{tid}_{ch}.png", pes_raw[ch][idx, :]) if __name__ == '__main__': main()