"""Predict a high-resolution spectrum from low-resolution PES data.

Contents of ``pes_to_spec/model.py``.  The change set that introduces this
module also declares the following requirements (``requirements.txt``):
numpy, scipy, scikit-learn, extra_data, matplotlib.
"""
from typing import Dict, List, Optional

import numpy as np
from scipy.signal import fftconvolve

# Channels used as low-resolution input when the caller does not choose any.
# Stored as a tuple so that instances can never share (and mutate) one list.
DEFAULT_CHANNELS = (
    "channel_1_D",
    "channel_2_B",
    "channel_3_A",
    "channel_3_B",
    "channel_4_C",
    "channel_4_D",
)


def matching_ids(a: np.ndarray, b: np.ndarray, c: np.ndarray) -> np.ndarray:
    """Return the train IDs common to ``a``, ``b`` and ``c``.

    Args:
      a: First array of train IDs.
      b: Second array of train IDs.
      c: Third array of train IDs.

    Returns: Sorted array of the unique IDs present in all three inputs.
    """
    # intersect1d yields sorted unique values, so the result is deterministic
    # (iterating over a set gives no ordering guarantee).
    return np.intersect1d(np.intersect1d(a, b), c)


class Model(object):
    """
    Object representing a previous fit of the model to be used to predict high-resolution
    spectrum from a low-resolution one.

    Args:
      channels: Selected channels to use as an input for the low resolution data.
                ``None`` selects ``DEFAULT_CHANNELS``.
      n_pca_lr: Number of low-resolution data PCA components.
      n_pca_hr: Number of high-resolution data PCA components.

    """
    def __init__(self,
                 channels: Optional[List[str]] = None,
                 n_pca_lr: int = 400,
                 n_pca_hr: int = 20):
        # Imported on construction so that the module (and helpers such as
        # matching_ids) stays importable when scikit-learn is not installed.
        from sklearn.decomposition import PCA

        # Always store a private copy: neither the default nor the caller's
        # list is shared with this instance.
        self.channels = list(DEFAULT_CHANNELS) if channels is None else list(channels)
        self.n_pca_lr = n_pca_lr
        self.n_pca_hr = n_pca_hr

        # PCA models
        self.lr_pca = PCA(n_pca_lr, whiten=True)
        self.hr_pca = PCA(n_pca_hr, whiten=True)

        # where to cut on the ToF PES data
        self.tof_start = 31445
        self.delta_tof = 200
        self.tof_end = self.tof_start + self.delta_tof

        # smoothing of the SPEC data in eV
        self.high_res_sigma = 0.2

        # Least-squares map (last row is the intercept) from low-resolution
        # PCA space to high-resolution PCA space; populated by fit().
        self._lr_to_hr = None

    @staticmethod
    def _with_intercept(features: np.ndarray) -> np.ndarray:
        """Append a column of ones so a linear map can carry an offset."""
        return np.hstack([features, np.ones((features.shape[0], 1))])

    def preprocess_low_res(self, low_res_data: Dict[str, np.ndarray]) -> np.ndarray:
        """
        Get a dictionary with the channel names for the input low resolution data and output
        only the relevant input data in an array.

        Args:
          low_res_data: Dictionary with keys named channel_{i}_{k}, where i is a number between 1 and 4 and k is a letter between A and D.

        Returns: Concatenated and pre-processed low-resolution data of shape (train_id, features).

        Raises:
          KeyError: If any of the selected channels is absent from ``low_res_data``.
        """
        missing = [k for k in self.channels if k not in low_res_data]
        if missing:
            raise KeyError(f"low_res_data is missing channels: {missing}")
        # Keep only the [tof_start, tof_end) window of every selected channel
        # and place the channels side by side along the feature axis.
        window = slice(self.tof_start, self.tof_end)
        return np.concatenate([low_res_data[k][:, window] for k in self.channels], axis=1)

    def preprocess_high_res(self,
                            high_res_data: np.ndarray,
                            high_res_photon_energy: Optional[np.ndarray] = None) -> np.ndarray:
        """
        Get the high resolution data and preprocess it (Gaussian smoothing).

        Args:
          high_res_data: High resolution data with shape (train_id, features).
          high_res_photon_energy: Optional energy axis matching the feature
              dimension, with shape (features,), (1, features) or
              (train_id, features).  ``high_res_sigma`` is expressed in eV, so
              the smoothing kernel is only a Gaussian in energy when this axis
              is supplied.

        Returns: Pre-processed high-resolution data of shape (train_id, features).

        Raises:
          ValueError: If ``high_res_photon_energy`` cannot be broadcast to the
              shape of ``high_res_data``.
        """
        n_features = high_res_data.shape[1]
        if high_res_photon_energy is None:
            # NOTE(review): legacy behaviour, kept for backward compatibility.
            # The kernel is evaluated on the intensities themselves, which is
            # not a smoothing kernel in eV; pass the energy axis instead.
            axis = high_res_data
        else:
            axis = np.broadcast_to(
                np.atleast_2d(np.asarray(high_res_photon_energy, dtype=float)),
                high_res_data.shape,
            )
        # Centre the kernel on the middle feature of the first row.
        mu = axis[0, n_features // 2]
        sigma = self.high_res_sigma
        gaussian = np.exp(-((axis - mu) / sigma) ** 2 / 2) / np.sqrt(2 * np.pi * sigma ** 2)
        # TODO: why 80?!
        return fftconvolve(high_res_data, gaussian, mode="same", axes=1) / 80

    def fit(self,
            low_res_data: Dict[str, np.ndarray],
            high_res_data: np.ndarray,
            high_res_photon_energy: Optional[np.ndarray] = None):
        """
        Train the model.

        Both inputs are pre-processed and reduced with PCA; an ordinary
        least-squares linear map (with intercept) from the low-resolution to
        the high-resolution PCA components is then fitted and stored.

        Args:
          low_res_data: Low resolution data as a dictionary with the key set to `channel_{i}_{k}`, where i is a number between 1 and 4 and k is a letter between A and D. For each dictionary entry, a numpy array is expected with shape (train_id, ToF channel).
          high_res_data: Reference high resolution data with a one-to-one match to the low resolution data in the train_id dimension. Shape (train_id, ToF channel).
          high_res_photon_energy: Optional energy axis forwarded to `preprocess_high_res`.

        Raises:
          ValueError: If the two inputs do not have the same number of rows.
        """
        low_res = self.preprocess_low_res(low_res_data)
        high_res = self.preprocess_high_res(high_res_data, high_res_photon_energy)
        if low_res.shape[0] != high_res.shape[0]:
            raise ValueError(
                "low_res_data and high_res_data must have the same number of "
                f"train IDs, got {low_res.shape[0]} and {high_res.shape[0]}."
            )
        # fit PCA
        low_pca = self.lr_pca.fit_transform(low_res)
        high_pca = self.hr_pca.fit_transform(high_res)
        # Linear regression between the two PCA spaces.
        self._lr_to_hr, *_ = np.linalg.lstsq(self._with_intercept(low_pca), high_pca, rcond=None)

    def predict(self, low_res_data: Dict[str, np.ndarray]) -> np.ndarray:
        """
        Predict a high-resolution spectrum from a low resolution given one.

        Args:
          low_res_data: Low resolution data as in the fit step: a dictionary of arrays with shape (train_id, ToF channel).

        Returns: Predicted (pre-processed) high resolution data with shape (train_id, features).
                 Uncertainties are not included yet.

        Raises:
          RuntimeError: If called before `fit`.
        """
        # getattr: also covers instances created without running __init__.
        if getattr(self, "_lr_to_hr", None) is None:
            raise RuntimeError("Model.predict() called before Model.fit().")
        low_res = self.preprocess_low_res(low_res_data)
        low_pca = self.lr_pca.transform(low_res)
        high_pca = self._with_intercept(low_pca) @ self._lr_to_hr
        high_res_predicted = self.hr_pca.inverse_transform(high_pca)
        # TODO: Add uncertainties
        return high_res_predicted