Started simplified setup and clean up.

c198865e · Danilo Ferreira de Lima · 04d7d131 · c198865e · c198865e · c198865e
Commit c198865e authored 2 years ago by Danilo Ferreira de Lima
--- a/pes_to_spec/model.py
+++ b/pes_to_spec/model.py
+import numpy as np
+from scipy.signal import fftconvolve
+from sklearn.decomposition import PCA
+from typing import Dict, List
+
+def matching_ids(a: np.ndarray, b: np.ndarray, c: np.ndarray) -> np.ndarray:
+    """
+    Return the train IDs that are present in all of `a`, `b` and `c`.
+
+    Args:
+      a: First array of train IDs.
+      b: Second array of train IDs.
+      c: Third array of train IDs.
+
+    Returns: Sorted 1D array with the unique train IDs common to the three inputs. The dtype follows the inputs, also when no ID is shared.
+    """
+    # np.intersect1d returns sorted unique values, so the order of the result is
+    # deterministic (a Python set has no defined iteration order) and an empty
+    # intersection keeps the integer dtype instead of degrading to float64.
+    return np.intersect1d(np.intersect1d(a, b), c)
+
+class Model(object):
+    """
+    Object representing a previous fit of the model to be used to predict high-resolution
+    spectrum from a low-resolution one.
+
+    The low- and high-resolution data are each compressed with a PCA and a linear map
+    between the two PCA spaces is obtained in `fit` and applied in `predict`.
+
+    Args:
+      channels: Selected channels to use as an input for the low resolution data.
+      n_pca_lr: Number of low-resolution data PCA components.
+      n_pca_hr: Number of high-resolution data PCA components.
+
+    """
+    def __init__(self,
+                 channels:List[str]=["channel_1_D",
+                                     "channel_2_B",
+                                     "channel_3_A",
+                                     "channel_3_B",
+                                     "channel_4_C",
+                                     "channel_4_D"],
+                 n_pca_lr: int=400,
+                 n_pca_hr: int=20):
+        # copy the list, so that the default (shared between all calls) is never
+        # aliased, and therefore never modified, through an instance
+        self.channels = list(channels)
+        self.n_pca_lr = n_pca_lr
+        self.n_pca_hr = n_pca_hr
+
+        # PCA models
+        self.lr_pca = PCA(n_pca_lr, whiten=True)
+        self.hr_pca = PCA(n_pca_hr, whiten=True)
+
+        # linear map from the low-resolution PCA space to the high-resolution one,
+        # with shape (n_pca_lr, n_pca_hr); it is only available after calling fit
+        self.pca_map = None
+
+        # where to cut on the ToF PES data
+        self.tof_start = 31445
+        self.delta_tof = 200
+        self.tof_end = self.tof_start + self.delta_tof
+
+        # smoothing of the SPEC data in eV
+        self.high_res_sigma = 0.2
+
+    def preprocess_low_res(self, low_res_data: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Get a dictionary with the channel names for the input low resolution data and output
+        only the relevant input data in an array.
+
+        Args:
+          low_res_data: Dictionary with keys named channel_{i}_{k}, where i is a number between 1 and 4 and k is a letter between A and D.
+
+        Returns: Concatenated and pre-processed low-resolution data of shape (train_id, features).
+        """
+        # keep only the [tof_start, tof_end) window of each selected channel and
+        # place the windows side by side, in the order given by self.channels
+        cat = np.concatenate([low_res_data[k][:, self.tof_start:self.tof_end] for k in self.channels], axis=1)
+        return cat
+
+    def preprocess_high_res(self, high_res_data: np.ndarray) -> np.ndarray:
+        """
+        Get the high resolution data and preprocess it.
+
+        Args:
+          high_res_data: High resolution data with shape (train_id, features).
+
+        Returns: Pre-processed high-resolution data of shape (train_id, features).
+        """
+        # Apply smoothing
+        # TODO: Why?!
+        # NOTE(review): mu and the kernel below are computed from the spectrum values
+        # themselves, while high_res_sigma is described as a width in eV. This looks
+        # like it was meant to use an energy axis, which is not available here -- confirm.
+        mu = high_res_data[0,high_res_data.shape[1]//2]
+        gaussian = np.exp(-((high_res_data - mu)/self.high_res_sigma)**2/2)/np.sqrt(2*np.pi*self.high_res_sigma**2)
+        # TODO: why 80?!
+        # mode="same" keeps the output with the same shape as high_res_data
+        high_res_gc = fftconvolve(high_res_data, gaussian, mode="same", axes=1)/80
+        return high_res_gc
+
+    def fit(self, low_res_data: Dict[str, np.ndarray], high_res_data: np.ndarray):
+        """
+        Train the model.
+
+        Both PCA models are fitted and a least-squares linear map from the low-resolution
+        PCA components to the high-resolution PCA components is stored for `predict`.
+
+        Args:
+          low_res_data: Low resolution data as a dictionary with the key set to `channel_{i}_{k}`, where i is a number between 1 and 4 and k is a letter between A and D. For each dictionary entry, a numpy array is expected with shape (train_id, ToF channel).
+          high_res_data: Reference high resolution data with a one-to-one match to the low resolution data in the train_id dimension. Shape (train_id, ToF channel).
+        """
+        low_res = self.preprocess_low_res(low_res_data)
+        high_res = self.preprocess_high_res(high_res_data)
+        # fit PCA
+        low_pca = self.lr_pca.fit_transform(low_res)
+        high_pca = self.hr_pca.fit_transform(high_res)
+        # fit the linear map between the two PCA spaces
+        # PCA centres the data, so the components have zero mean in the training
+        # sample and no intercept term is needed
+        # TODO: replace by a regression providing uncertainties
+        self.pca_map, _, _, _ = np.linalg.lstsq(low_pca, high_pca, rcond=None)
+
+    def predict(self, low_res_data: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Predict a high-resolution spectrum from a low resolution given one.
+        The uncertainties are not yet available in the output.
+
+        Args:
+          low_res_data: Low resolution data as in the fit step: a dictionary with the key set to `channel_{i}_{k}`, each entry with shape (train_id, ToF channel).
+
+        Returns: Predicted (pre-processed) high resolution data with shape (train_id, features).
+
+        Raises:
+          RuntimeError: If `fit` has not been called before.
+        """
+        if self.pca_map is None:
+            raise RuntimeError("The model must be trained with fit before calling predict.")
+        low_res = self.preprocess_low_res(low_res_data)
+        low_pca = self.lr_pca.transform(low_res)
+        # move from the low-resolution to the high-resolution PCA space
+        high_pca = np.dot(low_pca, self.pca_map)
+        high_res_predicted = self.hr_pca.inverse_transform(high_pca)
+        # TODO: Add uncertainties
+        return high_res_predicted
+
--- a/requirements.txt
+++ b/requirements.txt
+numpy
+scipy
+scikit-learn
+extra_data
+matplotlib
--- a/src/models/fit_methods/model.py
+++ b/src/models/fit_methods/model.py
@@ -161,4 +161,4 @@ if __name__ == '__main__':
    #model_instance.split(0.2)
    #model_instance.fit()    
    #print(model_instance.predict([.9, 1000]))
-    #print("Accuracy: ", model_instance.model.score(model_instance.X_test, model_instance.y_test))
\ No newline at end of file
+    #print("Accuracy: ", model_instance.model.score(model_instance.X_test, model_instance.y_test))