From 3b1b5aa359614d4810f162c3e9de0b502362ac99 Mon Sep 17 00:00:00 2001
From: Danilo Ferreira de Lima <danilo.enoque.ferreira.de.lima@xfel.de>
Date: Fri, 13 Jan 2023 16:39:47 +0100
Subject: [PATCH] Added pre-PCA step before Nystroem to reduce the
 dimensionality in advance.

---
 pes_to_spec/model.py | 53 ++++++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 27 deletions(-)

diff --git a/pes_to_spec/model.py b/pes_to_spec/model.py
index 761e086..5200c68 100644
--- a/pes_to_spec/model.py
+++ b/pes_to_spec/model.py
@@ -502,31 +502,32 @@ class Model(TransformerMixin, BaseEstimator):
     def __init__(self,
                  channels:List[str]=[f"channel_{j}_{k}"
                                      for j, k in product(range(1, 5), ["A", "B", "C", "D"])],
-                 n_pca_lr: int=1000,
-                 n_pca_hr: int=40,
+                 n_pca_lr: int=600,
+                 n_pca_hr: int=20,
                  high_res_sigma: float=0.2,
                  tof_start: Optional[int]=None,
                  delta_tof: Optional[int]=300,
                  validation_size: float=0.05,
-                 n_nonlinear_kernel: int=10000):
+                 n_nonlinear_kernel: int=5000):
         # models
-        self.x_model = Pipeline([
-                                ('select', SelectRelevantLowResolution(channels, tof_start, delta_tof)),
-                                ('pca', PCA(n_pca_lr, whiten=True)),
-                                ('unc', UncertaintyHolder()),
-                               ])
+        x_model_steps = list()
+        x_model_steps += [('select', SelectRelevantLowResolution(channels, tof_start, delta_tof))]
+        if n_nonlinear_kernel > 0:
+            x_model_steps += [('fex', Pipeline([('prepca', PCA(n_pca_lr, whiten=True)),
+                                                ('nystroem', Nystroem(n_components=n_nonlinear_kernel, kernel='rbf', gamma=None, n_jobs=-1)),
+                                                ]))]
+        x_model_steps += [
+                          ('pca', PCA(n_pca_lr, whiten=True)),
+                          ('unc', UncertaintyHolder()),
+                          ]
+        self.x_model = Pipeline(x_model_steps)
         self.y_model = Pipeline([
                                 ('smoothen', HighResolutionSmoother(high_res_sigma)),
                                 ('pca', PCA(n_pca_hr, whiten=True)),
                                 ('unc', UncertaintyHolder()),
                                 ])
-        fit_steps = list()
-        if n_nonlinear_kernel > 0:
-            fit_steps += [('fex', Nystroem(n_components=n_nonlinear_kernel, kernel='rbf', gamma=None, n_jobs=-1))]
-        #fit_steps += [('regression', FitModel())]
-        fit_steps += [('regression', MultiOutputWithStd(ARDRegression(n_iter=30, verbose=True)))]
-        #fit_steps += [('regression', MultiOutputWithStd(LinearSVR(verbose=10, max_iter=2000, tol=1e-5)))]
-        self.fit_model = Pipeline(fit_steps)
+        #self.fit_model = FitModel()
+        self.fit_model = MultiOutputWithStd(ARDRegression(n_iter=30, verbose=True))
 
         # size of the test subset
         self.validation_size = validation_size
@@ -581,12 +582,11 @@ class Model(TransformerMixin, BaseEstimator):
         self.y_model['unc'].set_uncertainty(high_pca_unc)
 
         low_res = self.x_model['select'].transform(low_res_data)
-        low_pca = self.x_model['pca'].transform(low_res)
-        if isinstance(self.x_model['pca'], FeatureUnion):
-            n = self.x_model['pca'].transformer_list[0][1].n_components
-            low_pca_rec = self.x_model['pca'].transformer_list[0][1].inverse_transform(low_pca[:, :n])
-        else:
-            low_pca_rec = self.x_model['pca'].inverse_transform(low_pca)
+        pca_model = self.x_model['pca']
+        if 'fex' in self.x_model.named_steps:
+            pca_model = self.x_model['fex'].named_steps['prepca']
+        low_pca = pca_model.transform(low_res)
+        low_pca_rec = pca_model.inverse_transform(low_pca)
         low_pca_unc =  np.mean(np.sqrt(np.mean((low_res - low_pca_rec)**2, axis=1, keepdims=True)), axis=0, keepdims=True)
         self.x_model['unc'].set_uncertainty(low_pca_unc)
 
@@ -603,12 +603,11 @@ class Model(TransformerMixin, BaseEstimator):
         Returns: Ratio of root-mean-squared-error of the data reconstruction using the existing PCA model and the one from the original model.
         """
         low_res = self.x_model['select'].transform(low_res_data)
-        low_pca = self.x_model['pca'].transform(low_res)
-        if isinstance(self.x_model['pca'], FeatureUnion):
-            n = self.x_model['pca'].transformer_list[0][1].n_components
-            low_pca_rec = self.x_model['pca'].transformer_list[0][1].inverse_transform(low_pca[:, :n])
-        else:
-            low_pca_rec = self.x_model['pca'].inverse_transform(low_pca)
+        pca_model = self.x_model['pca']
+        if 'fex' in self.x_model.named_steps:
+            pca_model = self.x_model['fex'].named_steps['prepca']
+        low_pca = pca_model.transform(low_res)
+        low_pca_rec = pca_model.inverse_transform(low_pca)
         low_pca_unc = self.x_model['unc'].uncertainty()
 
         #fig = plt.figure(figsize=(8, 16))
-- 
GitLab