Switched to IncrementalPCA.

ff1a99d6 · Danilo Ferreira de Lima · 201a1568 · ff1a99d6 · ff1a99d6
Commit ff1a99d6 authored 2 years ago by Danilo Ferreira de Lima
--- a/pes_to_spec/model.py
+++ b/pes_to_spec/model.py
@@ -4,9 +4,8 @@ from autograd import grad
 import joblib
 import h5py
 from scipy.signal import fftconvolve
-from scipy.signal.windows import gaussian as gaussian_window
 from scipy.optimize import fmin_l_bfgs_b
-from sklearn.decomposition import PCA
+from sklearn.decomposition import PCA, IncrementalPCA
 from sklearn.model_selection import train_test_split

 from typing import Any, Dict, List, Optional
@@ -25,9 +24,9 @@ class Model(object):
      channels: Selected channels to use as an input for the low resolution data.
      n_pca_lr: Number of low-resolution data PCA components.
      n_pca_hr: Number of high-resolution data PCA components.
-      high_res_sigma: Resolution of the high-resolution spectrometer.
-      tof_start: Start looking at this index from the low-resolution spectrometer data.
-      delta_tof: Number of components to take from the low-resolution spectrometer.
+      high_res_sigma: Resolution of the high-resolution spectrometer in electron-Volts.
+      tof_start: Start looking at this index from the low-resolution spectrometer data. Set to None to perform no selection
+      delta_tof: Number of components to take from the low-resolution spectrometer. Set to None to perform no selection.
      validation_size: Fraction (number between 0 and 1) of the data to take for validation and systematic uncertainty estimate.

    """
@@ -41,15 +40,15 @@ class Model(object):
                 n_pca_lr: int=400,
                 n_pca_hr: int=20,
                 high_res_sigma: float=0.2,
-                 tof_start: int=31445,
-                 delta_tof: int=200,
+                 tof_start: Optional[int]=31445,
+                 delta_tof: Optional[int]=200,
                 validation_size: float=0.05):
        self.channels = channels
        self.n_pca_lr = n_pca_lr
        self.n_pca_hr = n_pca_hr

        # PCA models
-        self.lr_pca = PCA(n_pca_lr, whiten=True)
+        self.lr_pca = IncrementalPCA(n_pca_lr, whiten=True, batch_size=n_pca_lr)
        self.hr_pca = PCA(n_pca_hr, whiten=True)

        # PCA unc. in high resolution
@@ -64,7 +63,6 @@ class Model(object):
        # where to cut on the ToF PES data
        self.tof_start = tof_start
        self.delta_tof = delta_tof
-        self.tof_end = self.tof_start + self.delta_tof

        # high-resolution photon energy axis
        self.high_res_photon_energy: Optional[np.ndarray] = None
@@ -82,7 +80,10 @@ class Model(object):

        Returns: Concatenated and pre-processed low-resolution data of shape (train_id, features).
        """
-        cat = np.concatenate([low_res_data[k][:, self.tof_start:self.tof_end] for k in self.channels], axis=1)
+        items = [low_res_data[k] for k in self.channels]
+        if self.tof_start is not None and self.delta_tof is not None:
+            items = [item[:, self.tof_start:(self.tof_start + self.delta_tof)] for item in items]
+        cat = np.concatenate(items, axis=1)
        return cat

    def preprocess_high_res(self, high_res_data: np.ndarray, high_res_photon_energy: np.ndarray) -> np.ndarray:
@@ -117,19 +118,28 @@ class Model(object):

        self.high_res_photon_energy = high_res_photon_energy

+        print("Pre-processing low")
        low_res = self.preprocess_low_res(low_res_data)
+        print("Pre-processing high")
        high_res = self.preprocess_high_res(high_res_data, high_res_photon_energy)
        # fit PCA
+        print("PCA low")
        low_pca = self.lr_pca.fit_transform(low_res)
+        print("PCA high")
        high_pca = self.hr_pca.fit_transform(high_res)
+        print("Split")
        # split in train and test for PCA uncertainty evaluation
        low_pca_train, low_pca_test, high_pca_train, high_pca_test = train_test_split(low_pca, high_pca, test_size=self.validation_size, random_state=42)
        # fit the linear model
+        print("Fit")
        self.fit_model.fit(low_pca_train, high_pca_train, low_pca_test, high_pca_test)

+        print("PCA unc")
        high_pca_rec = self.hr_pca.inverse_transform(high_pca)
        self.high_pca_unc =  np.sqrt(np.mean((high_res - high_pca_rec)**2, axis=0, keepdims=True))

+        print("Done")
+
        return high_res

    def predict(self, low_res_data: Dict[str, np.ndarray]) -> np.ndarray:

--- a/scripts/test_analysis.py
+++ b/scripts/test_analysis.py
@@ -17,6 +17,26 @@ from matplotlib.gridspec import GridSpec

 from typing import Optional

+def plot_pes(filename: str, pes_raw_int: np.ndarray):
+    """
+    Plot low-resolution spectrum.
+
+    Args:
+      filename: Output file name.
+      pes_raw_int: Low-resolution spectrum.
+
+    """
+    fig = plt.figure(figsize=(16, 8))
+    gs = GridSpec(1, 1)
+    ax = fig.add_subplot(gs[0, 0])
+    ax.plot(pes_raw_int, c='b', lw=3, label="Low-resolution measurement")
+    ax.legend()
+    ax.set(title=f"",
+           xlabel="ToF index",
+           ylabel="Intensity")
+    fig.savefig(filename)
+    plt.close(fig)
+
 def plot_result(filename: str, spec_pred: np.ndarray, spec_smooth: np.ndarray, spec_raw_pe: np.ndarray, spec_raw_int: Optional[np.ndarray]=None):
    """
    Plot result with uncertainty band.
@@ -33,12 +53,12 @@ def plot_result(filename: str, spec_pred: np.ndarray, spec_smooth: np.ndarray, s
    gs = GridSpec(1, 1)
    ax = fig.add_subplot(gs[0, 0])
    eps = np.mean(spec_pred[:, 1])
-    ax.plot(spec_raw_pe, spec_smooth, c='b', lw=3, label="High resolution measurement (smoothened)")
-    ax.plot(spec_raw_pe, spec_pred[:, 0], c='r', lw=3, label="High resolution prediction")
+    ax.plot(spec_raw_pe, spec_smooth, c='b', lw=3, label="High-resolution measurement (smoothened)")
+    ax.plot(spec_raw_pe, spec_pred[:, 0], c='r', lw=3, label="High-resolution prediction")
    ax.fill_between(spec_raw_pe, spec_pred[:, 0] - spec_pred[:, 1], spec_pred[:, 0] + spec_pred[:, 1], facecolor='red', alpha=0.6, label="68% unc. (stat.)")
    ax.fill_between(spec_raw_pe, spec_pred[:, 0] - spec_pred[:, 2], spec_pred[:, 0] + spec_pred[:, 2], facecolor='magenta', alpha=0.6, label="68% unc. (syst., PCA)")
    if spec_raw_int is not None:
-        ax.plot(spec_raw_pe, spec_raw_int, c='b', lw=1, ls='--', label="High resolution measurement")
+        ax.plot(spec_raw_pe, spec_raw_int, c='b', lw=1, ls='--', label="High-resolution measurement")
    ax.legend()
    ax.set(title=f"avg(unc) = {eps}",
           xlabel="Photon energy [eV]",
@@ -81,18 +101,34 @@ def main():
    #retvol_raw = run["SA3_XTD10_PES/MDL/DAQ_MPOD", "u212.value"].select_trains(by_id[tids]).ndarray()
    #retvol_raw_timestamp = run["SA3_XTD10_PES/MDL/DAQ_MPOD", "u212.timestamp"].select_trains(by_id[tids]).ndarray()

-    model = Model()
+    model = Model(channels=["channel_1_D",
+                            "channel_2_B",
+                            "channel_3_A",
+                            "channel_3_B",
+                            "channel_4_C",
+                            "channel_4_D"],
+                 n_pca_lr=400,
+                 n_pca_hr=20,
+                 high_res_sigma=0.2,
+                 tof_start=None,
+                 delta_tof=None,
+                 validation_size=0.05)
+
    train_idx = np.isin(tids, train_tids)
+    print("Fitting")
    model.fit({k: v[train_idx, :] for k, v in pes_raw.items()}, spec_raw_int[train_idx, :], spec_raw_pe[train_idx, :])
    spec_smooth = model.preprocess_high_res(spec_raw_int, spec_raw_pe)

    # test
+    print("Predict")
    spec_pred = model.predict(pes_raw)

    # plot
    for tid in test_tids:
        idx = np.where(tid==tids)[0][0]
        plot_result(f"test_{tid}.png", spec_pred[idx, :, :], spec_smooth[idx, :], spec_raw_pe[idx, :], spec_raw_int[idx, :])
+        for ch in channels:
+            plot_pes(f"test_pes_{tid}_{ch}.png", pes_raw[ch][idx, :])

 if __name__ == '__main__':
    main()