Skip to content
Snippets Groups Projects
Commit ff1a99d6 authored by Danilo Ferreira de Lima's avatar Danilo Ferreira de Lima
Browse files

Switched to IncrementalPCA.

parent 201a1568
No related branches found
No related tags found
No related merge requests found
......@@ -4,9 +4,8 @@ from autograd import grad
import joblib
import h5py
from scipy.signal import fftconvolve
from scipy.signal.windows import gaussian as gaussian_window
from scipy.optimize import fmin_l_bfgs_b
from sklearn.decomposition import PCA
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.model_selection import train_test_split
from typing import Any, Dict, List, Optional
......@@ -25,9 +24,9 @@ class Model(object):
channels: Selected channels to use as an input for the low resolution data.
n_pca_lr: Number of low-resolution data PCA components.
n_pca_hr: Number of high-resolution data PCA components.
high_res_sigma: Resolution of the high-resolution spectrometer.
tof_start: Start looking at this index from the low-resolution spectrometer data.
delta_tof: Number of components to take from the low-resolution spectrometer.
high_res_sigma: Resolution of the high-resolution spectrometer in electron-Volts.
tof_start: Start looking at this index from the low-resolution spectrometer data. Set to None to perform no selection
delta_tof: Number of components to take from the low-resolution spectrometer. Set to None to perform no selection.
validation_size: Fraction (number between 0 and 1) of the data to take for validation and systematic uncertainty estimate.
"""
......@@ -41,15 +40,15 @@ class Model(object):
n_pca_lr: int=400,
n_pca_hr: int=20,
high_res_sigma: float=0.2,
tof_start: int=31445,
delta_tof: int=200,
tof_start: Optional[int]=31445,
delta_tof: Optional[int]=200,
validation_size: float=0.05):
self.channels = channels
self.n_pca_lr = n_pca_lr
self.n_pca_hr = n_pca_hr
# PCA models
self.lr_pca = PCA(n_pca_lr, whiten=True)
self.lr_pca = IncrementalPCA(n_pca_lr, whiten=True, batch_size=n_pca_lr)
self.hr_pca = PCA(n_pca_hr, whiten=True)
# PCA unc. in high resolution
......@@ -64,7 +63,6 @@ class Model(object):
# where to cut on the ToF PES data
self.tof_start = tof_start
self.delta_tof = delta_tof
self.tof_end = self.tof_start + self.delta_tof
# high-resolution photon energy axis
self.high_res_photon_energy: Optional[np.ndarray] = None
......@@ -82,7 +80,10 @@ class Model(object):
Returns: Concatenated and pre-processed low-resolution data of shape (train_id, features).
"""
cat = np.concatenate([low_res_data[k][:, self.tof_start:self.tof_end] for k in self.channels], axis=1)
items = [low_res_data[k] for k in self.channels]
if self.tof_start is not None and self.delta_tof is not None:
items = [item[:, self.tof_start:(self.tof_start + self.delta_tof)] for item in items]
cat = np.concatenate(items, axis=1)
return cat
def preprocess_high_res(self, high_res_data: np.ndarray, high_res_photon_energy: np.ndarray) -> np.ndarray:
......@@ -117,19 +118,28 @@ class Model(object):
self.high_res_photon_energy = high_res_photon_energy
print("Pre-processing low")
low_res = self.preprocess_low_res(low_res_data)
print("Pre-processing high")
high_res = self.preprocess_high_res(high_res_data, high_res_photon_energy)
# fit PCA
print("PCA low")
low_pca = self.lr_pca.fit_transform(low_res)
print("PCA high")
high_pca = self.hr_pca.fit_transform(high_res)
print("Split")
# split in train and test for PCA uncertainty evaluation
low_pca_train, low_pca_test, high_pca_train, high_pca_test = train_test_split(low_pca, high_pca, test_size=self.validation_size, random_state=42)
# fit the linear model
print("Fit")
self.fit_model.fit(low_pca_train, high_pca_train, low_pca_test, high_pca_test)
print("PCA unc")
high_pca_rec = self.hr_pca.inverse_transform(high_pca)
self.high_pca_unc = np.sqrt(np.mean((high_res - high_pca_rec)**2, axis=0, keepdims=True))
print("Done")
return high_res
def predict(self, low_res_data: Dict[str, np.ndarray]) -> np.ndarray:
......
......@@ -17,6 +17,26 @@ from matplotlib.gridspec import GridSpec
from typing import Optional
def plot_pes(filename: str, pes_raw_int: np.ndarray):
"""
Plot low-resolution spectrum.
Args:
filename: Output file name.
pes_raw_int: Low-resolution spectrum.
"""
fig = plt.figure(figsize=(16, 8))
gs = GridSpec(1, 1)
ax = fig.add_subplot(gs[0, 0])
ax.plot(pes_raw_int, c='b', lw=3, label="Low-resolution measurement")
ax.legend()
ax.set(title=f"",
xlabel="ToF index",
ylabel="Intensity")
fig.savefig(filename)
plt.close(fig)
def plot_result(filename: str, spec_pred: np.ndarray, spec_smooth: np.ndarray, spec_raw_pe: np.ndarray, spec_raw_int: Optional[np.ndarray]=None):
"""
Plot result with uncertainty band.
......@@ -33,12 +53,12 @@ def plot_result(filename: str, spec_pred: np.ndarray, spec_smooth: np.ndarray, s
gs = GridSpec(1, 1)
ax = fig.add_subplot(gs[0, 0])
eps = np.mean(spec_pred[:, 1])
ax.plot(spec_raw_pe, spec_smooth, c='b', lw=3, label="High resolution measurement (smoothened)")
ax.plot(spec_raw_pe, spec_pred[:, 0], c='r', lw=3, label="High resolution prediction")
ax.plot(spec_raw_pe, spec_smooth, c='b', lw=3, label="High-resolution measurement (smoothened)")
ax.plot(spec_raw_pe, spec_pred[:, 0], c='r', lw=3, label="High-resolution prediction")
ax.fill_between(spec_raw_pe, spec_pred[:, 0] - spec_pred[:, 1], spec_pred[:, 0] + spec_pred[:, 1], facecolor='red', alpha=0.6, label="68% unc. (stat.)")
ax.fill_between(spec_raw_pe, spec_pred[:, 0] - spec_pred[:, 2], spec_pred[:, 0] + spec_pred[:, 2], facecolor='magenta', alpha=0.6, label="68% unc. (syst., PCA)")
if spec_raw_int is not None:
ax.plot(spec_raw_pe, spec_raw_int, c='b', lw=1, ls='--', label="High resolution measurement")
ax.plot(spec_raw_pe, spec_raw_int, c='b', lw=1, ls='--', label="High-resolution measurement")
ax.legend()
ax.set(title=f"avg(unc) = {eps}",
xlabel="Photon energy [eV]",
......@@ -81,18 +101,34 @@ def main():
#retvol_raw = run["SA3_XTD10_PES/MDL/DAQ_MPOD", "u212.value"].select_trains(by_id[tids]).ndarray()
#retvol_raw_timestamp = run["SA3_XTD10_PES/MDL/DAQ_MPOD", "u212.timestamp"].select_trains(by_id[tids]).ndarray()
model = Model()
model = Model(channels=["channel_1_D",
"channel_2_B",
"channel_3_A",
"channel_3_B",
"channel_4_C",
"channel_4_D"],
n_pca_lr=400,
n_pca_hr=20,
high_res_sigma=0.2,
tof_start=None,
delta_tof=None,
validation_size=0.05)
train_idx = np.isin(tids, train_tids)
print("Fitting")
model.fit({k: v[train_idx, :] for k, v in pes_raw.items()}, spec_raw_int[train_idx, :], spec_raw_pe[train_idx, :])
spec_smooth = model.preprocess_high_res(spec_raw_int, spec_raw_pe)
# test
print("Predict")
spec_pred = model.predict(pes_raw)
# plot
for tid in test_tids:
idx = np.where(tid==tids)[0][0]
plot_result(f"test_{tid}.png", spec_pred[idx, :, :], spec_smooth[idx, :], spec_raw_pe[idx, :], spec_raw_int[idx, :])
for ch in channels:
plot_pes(f"test_pes_{tid}_{ch}.png", pes_raw[ch][idx, :])
if __name__ == '__main__':
main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment