Commit fe96d70e authored by Danilo Ferreira de Lima

Fixed saving into file.

parent 25829f59
 import numpy as np
 from autograd import numpy as anp
 from autograd import grad
-#import joblib
 import h5py
 from scipy.signal import fftconvolve
 from scipy.signal import find_peaks_cwt
 from scipy.optimize import fmin_l_bfgs_b
 from sklearn.decomposition import PCA, IncrementalPCA
 from sklearn.model_selection import train_test_split
+from sklearn.base import TransformerMixin, BaseEstimator
 import matplotlib.pyplot as plt
@@ -18,13 +18,41 @@ def matching_ids(a: np.ndarray, b: np.ndarray, c: np.ndarray) -> np.ndarray:
     unique_ids = list(set(a).intersection(b).intersection(c))
     return np.array(unique_ids)
 
-def save_pca(pca_obj: Union[IncrementalPCA, PCA], pca_group: h5py.Group):
+def dump_in_group(content: Dict[str, Any], hf: h5py.Group):
     """
-    Save a PCA object to an H5 file.
+    Write dictionary as an HDF5 group.
+    Args:
+        content: The content to be written.
+        hf: The HDF5 group.
+    """
+    for key, value in content.items():
+        if isinstance(value, int) or isinstance(value, float):
+            hf.attrs[key] = value
+        elif isinstance(value, list):
+            hf.attrs[key] = value
+        else:
+            hf.create_dataset(key, data=value, compression="gzip")
+
+def read_from_group(hf: h5py.Group) -> Dict[str, Any]:
+    """
+    Read dictionary from an HDF5 group.
+    Args:
+        hf: The HDF5 group.
+    Returns: the content to be read.
+    """
+    d = {k: hf[k][()] for k in hf.keys() if not isinstance(hf[k], h5py.Group)}
+    d.update({k: hf.attrs[k] for k in hf.attrs})
+    return d
+
+def get_pca_props(pca_obj: Union[IncrementalPCA, PCA]):
+    """
+    Get PCA properties and attributes.
     Args:
         pca_obj: An instance where to load data to.
-        pca_group: An H5 file group.
     """
@@ -32,25 +60,23 @@ def save_pca(pca_obj: Union[IncrementalPCA, PCA], pca_group: h5py.Group):
              "explained_variance_",
              "explained_variance_ratio_",
              "singular_values_",
-             "mean_"]
-    attrs = ["n_components_",
-             #"n_features_",
-             #"n_samples_",
+             "mean_",
+             "n_components_",
+             "n_features_",
+             "n_samples_",
              "noise_variance_",
-             #"n_features_in_"
+             "n_features_in_"
              ]
-    for p in props:
-        pca_group.create_dataset(p, data=getattr(pca_obj, p))
-    for a in attrs:
-        pca_group.attrs[a] = getattr(pca_obj, a)
+    return {p: getattr(pca_obj, p)
+            for p in props}
 
-def load_pca(pca_obj: Union[IncrementalPCA, PCA], pca_group: h5py.Group) -> Union[IncrementalPCA, PCA]:
+def set_pca_props(pca_obj: Union[IncrementalPCA, PCA], pca_props: Dict[str, Any]) -> Union[IncrementalPCA, PCA]:
     """
-    Load a PCA object from an H5 file.
+    Load a PCA object from a dict.
     Args:
         pca_obj: An instance where to load data to.
-        pca_group: An H5 file group.
+        pca_props: A dictionary of properties read.
     Returns: Updated PCA instance.
     """
@@ -59,17 +85,15 @@ def load_pca(pca_obj: Union[IncrementalPCA, PCA], pca_group: h5py.Group) -> Union[IncrementalPCA, PCA]:
              "explained_variance_",
              "explained_variance_ratio_",
              "singular_values_",
-             "mean_"]
-    attrs = ["n_components_",
-             #"n_features_",
-             #"n_samples_",
+             "mean_",
+             "n_components_",
+             "n_features_",
+             "n_samples_",
              "noise_variance_",
-             #"n_features_in_"
+             "n_features_in_"
              ]
     for p in props:
-        setattr(pca_obj, p, pca_group[p][()])
-    for a in attrs:
-        setattr(pca_obj, a, pca_group.attrs[a])
+        setattr(pca_obj, p, pca_props[p])
     return pca_obj
 
 class PromptNotFoundError(Exception):
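For reference, the four new helpers compose as follows: scalars and lists end up as HDF5 attributes, arrays as gzip-compressed datasets, and read_from_group merges both back into a single dict. A minimal round-trip sketch (the file and key names here are only illustrative, not part of the repository):

import h5py
import numpy as np

content = {"n_example": 600,             # int -> stored as an HDF5 attribute
           "channels": ["channel_1_A"],  # list -> stored as an attribute
           "mean_": np.zeros(10)}        # array -> stored as a gzip'ed dataset

with h5py.File("example.h5", "w") as hf:
    dump_in_group(content, hf.create_group("demo"))

with h5py.File("example.h5", "r") as hf:
    restored = read_from_group(hf["demo"])

Note that h5py can hand string attributes back as bytes, which is why Model.load() further down decodes the channels entry.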
@@ -81,7 +105,7 @@ class PromptNotFoundError(Exception):
     def __str__(self) -> str:
         return "No prompt peak has been detected."
 
-class Model(object):
+class Model(TransformerMixin, BaseEstimator):
     """
     Object representing a previous fit of the model to be used to predict high-resolution
     spectrum from a low-resolution one.
@@ -91,9 +115,12 @@ class Model(object):
         n_pca_lr: Number of low-resolution data PCA components.
         n_pca_hr: Number of high-resolution data PCA components.
         high_res_sigma: Resolution of the high-resolution spectrometer in electron-Volts.
-        tof_start: Start looking at this index from the low-resolution spectrometer data. Set to None to perform no selection
-        delta_tof: Number of components to take from the low-resolution spectrometer. Set to None to perform no selection.
-        validation_size: Fraction (number between 0 and 1) of the data to take for validation and systematic uncertainty estimate.
+        tof_start: Start looking at this index from the low-resolution spectrometer data.
+          Set to None to perform no selection.
+        delta_tof: Number of components to take from the low-resolution spectrometer.
+          Set to None to perform no selection.
+        validation_size: Fraction (number between 0 and 1) of the data to take for
+          validation and systematic uncertainty estimate.
     """
 
     def __init__(self,
@@ -114,7 +141,7 @@ class Model(object):
         self.n_pca_hr = n_pca_hr
 
         # PCA models
-        self.lr_pca = IncrementalPCA(n_pca_lr, whiten=True)
+        self.lr_pca = PCA(n_pca_lr, whiten=True)
         self.hr_pca = PCA(n_pca_hr, whiten=True)
 
         # PCA unc. in high resolution
@@ -146,7 +173,10 @@ class Model(object):
                     high_res_sigma=self.high_res_sigma,
                     tof_start=self.tof_start,
                     delta_tof=self.delta_tof,
-                    validation_size=self.validation_size)
+                    validation_size=self.validation_size,
+                    high_pca_unc=self.high_pca_unc,
+                    high_res_photon_energy=self.high_res_photon_energy,
+                    )
 
     def preprocess_low_res(self, low_res_data: Dict[str, np.ndarray]) -> np.ndarray:
         """
@@ -154,7 +184,8 @@ class Model(object):
         only the relevant input data in an array.
 
         Args:
-            low_res_data: Dictionary with keys named channel_{i}_{k}, where i is a number between 1 and 4 and k is a letter between A and D.
+            low_res_data: Dictionary with keys named channel_{i}_{k},
+              where i is a number between 1 and 4 and k is a letter between A and D.
 
         Returns: Concatenated and pre-processed low-resolution data of shape (train_id, features).
         """
@@ -172,7 +203,9 @@ class Model(object):
         Args:
             high_res_data: High resolution data with shape (train_id, features).
-            high_res_photon_energy: High resolution photon energy values (the "x"-axis of the high resolution data) with shape (train_id, features).
+            high_res_photon_energy: High resolution photon energy values
+              (the "x"-axis of the high resolution data) with
+              shape (train_id, features).
 
         Returns: Pre-processed high-resolution data of shape (train_id, features) before.
         """
@@ -200,7 +233,14 @@ class Model(object):
         if len(peak_idx) < 1:
             raise PromptNotFoundError()
         peak_idx = sorted(peak_idx, key=lambda k: np.fabs(sum_low_res[k]), reverse=True)
-        return peak_idx[0]
+        best_guess = int(peak_idx[0])
+        # look around this estimate for the maximum
+        # this is probably not necessary
+        min_search = max(best_guess - 10, 0)
+        max_search = min(best_guess + 10, len(sum_low_res))
+        restricted_arr = sum_low_res[min_search:max_search]
+        improved_guess = min_search + int(np.argmax(restricted_arr))
+        return improved_guess
 
     def debug_peak_finding(self, low_res_data: Dict[str, np.ndarray], filename: str):
         """
@@ -236,8 +276,12 @@ class Model(object):
         Train the model.
 
         Args:
-            low_res_data: Low resolution data as a dictionary with the key set to `channel_{i}_{k}`, where i is a number between 1 and 4 and k is a letter between A and D. For each dictionary entry, a numpy array is expected with shape (train_id, ToF channel).
-            high_res_data: Reference high resolution data with a one-to-one match to the low resolution data in the train_id dimension. Shape (train_id, ToF channel).
+            low_res_data: Low resolution data as a dictionary with the key set to `channel_{i}_{k}`,
+              where i is a number between 1 and 4 and k is a letter between A and D.
+              For each dictionary entry, a numpy array is expected with shape
+              (train_id, ToF channel).
+            high_res_data: Reference high resolution data with a one-to-one match to the
+              low resolution data in the train_id dimension. Shape (train_id, ToF channel).
             high_res_photon_energy: Photon energy axis for the high-resolution data.
 
         Returns: Smoothened high resolution spectrum.
@@ -272,7 +316,9 @@ class Model(object):
         Args:
             low_res_data: Low resolution data as in the fit step with shape (train_id, channel, ToF channel).
 
-        Returns: High resolution data with shape (train_id, ToF channel, 3). The component 0 of the last dimension is the predicted spectrum. Components 1 and 2 correspond to two sources of uncertainty.
+        Returns: High resolution data with shape (train_id, ToF channel, 3).
+          The component 0 of the last dimension is the predicted spectrum.
+          Components 1 and 2 correspond to two sources of uncertainty.
         """
         low_res = self.preprocess_low_res(low_res_data)
         low_pca = self.lr_pca.transform(low_res)
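Given the documented output shape, a caller would unpack a prediction roughly as follows. This is only a sketch: the method name predict is assumed here (only its docstring appears in this hunk), model and pes_raw are as in the example script further down, and combining the two uncertainty components in quadrature is just one option.

import numpy as np

spec_pred = model.predict(pes_raw)         # shape (train_id, ToF channel, 3)
expected = spec_pred[:, :, 0]              # predicted spectrum
unc_1 = spec_pred[:, :, 1]                 # first uncertainty source
unc_2 = spec_pred[:, :, 2]                 # second uncertainty source
total_unc = np.sqrt(unc_1**2 + unc_2**2)   # assumes both behave like standard deviations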
@@ -297,23 +343,27 @@ class Model(object):
         Args:
             filename: H5 file name where to save this.
         """
-        #joblib.dump(self, filename)
         with h5py.File(filename, 'w') as hf:
+            # transform parameters into a dict
             d = self.fit_model.as_dict()
             d.update(self.parameters())
-            for key, value in d.items():
-                if isinstance(value, int):
-                    hf.attrs[key] = value
-                else:
-                    hf.create_dataset(key, data=value)
+            # dump them in the file
+            dump_in_group(d, hf)
             # this is not ideal, because it depends on the knowledge of the PCA
             # object structure, but saving to a joblib file would mean creating several
             # files
+            # create a group
             lr_pca = hf.create_group("lr_pca")
+            # get PCA properties
+            lr_pca_props = get_pca_props(self.lr_pca)
+            # create the HR group
             hr_pca = hf.create_group("hr_pca")
-            save_pca(self.lr_pca, lr_pca)
-            save_pca(self.hr_pca, hr_pca)
-            #joblib.dump(self.lr_pca, lr_pca_filename)
-            #joblib.dump(self.hr_pca, hr_pca_filename)
+            # get PCA properties
+            hr_pca_props = get_pca_props(self.hr_pca)
+            # dump them
+            dump_in_group(lr_pca_props, lr_pca)
+            dump_in_group(hr_pca_props, hr_pca)
 
     def load(self, filename: str):
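With the rewritten save(), the file holds the model and fit parameters at the top level plus the two PCA groups. A quick way to inspect the result (a sketch; the file name is arbitrary):

import h5py

with h5py.File("model.h5", "r") as hf:
    print(list(hf.keys()))            # top-level datasets plus 'lr_pca' and 'hr_pca'
    print(dict(hf["lr_pca"].attrs))   # scalar PCA attributes, e.g. n_components_
    print(list(hf["lr_pca"].keys()))  # array-valued PCA properties stored as datasets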
@@ -325,22 +375,31 @@
         """
         with h5py.File(filename, 'r') as hf:
-            d = {k: hf[k][()] for k in hf.keys() if not isinstance(hf[k], h5py.Group)}
-            d.update({k: hf.attrs[k] for k in hf.attrs})
+            # read from file
+            d = read_from_group(hf)
 
+            # load fit_model parameters
             self.fit_model.from_dict(d)
+            # load parameters of this class
             for key in self.parameters().keys():
-                setattr(self, key, d[key])
+                value = d[key]
+                if key == 'channels':
+                    value = [item.decode() if isinstance(item, bytes)
+                             else item
+                             for item in value]
+                setattr(self, key, value)
 
             # this is not ideal, because it depends on the knowledge of the PCA
             # object structure, but saving to a joblib file would mean creating several
             # files
             lr_pca = hf["/lr_pca/"]
             hr_pca = hf["/hr_pca/"]
-            self.lr_pca = IncrementalPCA(self.n_pca_lr, whiten=True)
+            self.lr_pca = PCA(self.n_pca_lr, whiten=True)
             self.hr_pca = PCA(self.n_pca_hr, whiten=True)
-            self.lr_pca = load_pca(self.lr_pca, lr_pca)
-            self.hr_pca = load_pca(self.hr_pca, hr_pca)
-            #self.lr_pca = joblib.load(lr_pca_filename)
-            #self.hr_pca = joblib.load(hr_pca_filename)
+            # read properties in dictionaries
+            lr_pca_props = read_from_group(lr_pca)
+            hr_pca_props = read_from_group(hr_pca)
+            # set them
+            self.lr_pca = set_pca_props(self.lr_pca, lr_pca_props)
+            self.hr_pca = set_pca_props(self.hr_pca, hr_pca_props)
 
 class FitModel(object):
     """
@@ -388,8 +447,8 @@ class FitModel(object):
         self.Y_test: np.ndarray = Y_test
 
         # model parameter sizes
-        self.Nx: int = self.X_train.shape[1]
-        self.Ny: int = self.Y_train.shape[1]
+        self.Nx: int = int(self.X_train.shape[1])
+        self.Ny: int = int(self.Y_train.shape[1])
 
         # initial parameter values
         A0: np.ndarray = np.eye(self.Nx, self.Ny).reshape(self.Nx*self.Ny)
@@ -424,9 +483,6 @@
             #log_unc = anp.log(anp.exp(log_unc) + anp.exp(log_eps))
             iunc2 = anp.exp(-2*log_unc)
-            #print("iunc2", iunc2)
-            #print("log_unc", log_unc)
-
             return anp.mean( (0.5*((X@ A + b - Y)**2)*iunc2 + log_unc).sum(axis=1), axis=0 )  # Put RELU on (XX@x) and introduce new matrix W
 
         def loss_history(x: np.ndarray) -> float:
...
@@ -78,9 +78,12 @@ def main():
     # get train IDs and match them, so we are sure to have information from all needed sources
     # in this example, there is an offset of -2 in the SPEC train ID, so correct for it
     spec_offset = -2
-    spec_tid = spec_offset + run['SA3_XTD10_SPECT/MDL/FEL_BEAM_SPECTROMETER_SQS1:output', f"data.trainId"].ndarray()
-    pes_tid = run['SA3_XTD10_PES/ADC/1:network', f"digitizers.trainId"].ndarray()
-    xgm_tid = run['SA3_XTD10_XGM/XGM/DOOCS:output', f"data.trainId"].ndarray()
+    spec_tid = spec_offset + run['SA3_XTD10_SPECT/MDL/FEL_BEAM_SPECTROMETER_SQS1:output',
+                                 "data.trainId"].ndarray()
+    pes_tid = run['SA3_XTD10_PES/ADC/1:network',
+                  "digitizers.trainId"].ndarray()
+    xgm_tid = run['SA3_XTD10_XGM/XGM/DOOCS:output',
+                  "data.trainId"].ndarray()
 
     # these are the train ID intersection
     # this could have been done by a select call in the RunDirectory, but it would not correct for the spec_offset
     tids = matching_ids(spec_tid, pes_tid, xgm_tid)
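matching_ids (shown near the top of this diff) simply intersects the three train-ID arrays; because it goes through a Python set, the result is not guaranteed to be sorted. A toy illustration:

import numpy as np

a = np.array([100, 101, 102, 103])
b = np.array([101, 102, 103, 104])
c = np.array([102, 103, 105])
print(matching_ids(a, b, c))  # contains 102 and 103, in arbitrary order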
@@ -88,16 +91,21 @@ def main():
     test_tids = tids[-10:]
 
     # read the spec photon energy and intensity
-    spec_raw_pe = run['SA3_XTD10_SPECT/MDL/FEL_BEAM_SPECTROMETER_SQS1:output', f"data.photonEnergy"].select_trains(by_id[tids - spec_offset]).ndarray()
-    spec_raw_int = run['SA3_XTD10_SPECT/MDL/FEL_BEAM_SPECTROMETER_SQS1:output', f"data.intensityDistribution"].select_trains(by_id[tids - spec_offset]).ndarray()
+    spec_raw_pe = run['SA3_XTD10_SPECT/MDL/FEL_BEAM_SPECTROMETER_SQS1:output',
+                      "data.photonEnergy"].select_trains(by_id[tids - spec_offset]).ndarray()
+    spec_raw_int = run['SA3_XTD10_SPECT/MDL/FEL_BEAM_SPECTROMETER_SQS1:output',
+                       "data.intensityDistribution"].select_trains(by_id[tids - spec_offset]).ndarray()
 
     # read the PES data for each channel
-    channels = [f"channel_{i}_{l}" for i, l in product(range(1, 5), ["A", "B", "C", "D"])]
-    pes_raw = {ch: run['SA3_XTD10_PES/ADC/1:network', f"digitizers.{ch}.raw.samples"].select_trains(by_id[tids]).ndarray() for ch in channels}
+    channels = [f"channel_{i}_{l}"
+                for i, l in product(range(1, 5), ["A", "B", "C", "D"])]
+    pes_raw = {ch: run['SA3_XTD10_PES/ADC/1:network',
+                       f"digitizers.{ch}.raw.samples"].select_trains(by_id[tids]).ndarray()
+               for ch in channels}
 
     # read the XGM information
-    #xgm_pressure = run['SA3_XTD10_XGM/XGM/DOOCS', f"pressure.pressureFiltered.value"].select_trains(by_id[tids]).ndarray()
-    #xgm_pe = run['SA3_XTD10_XGM/XGM/DOOCS:output', f"data.intensitySa3TD"].select_trains(by_id[tids]).ndarray()
+    #xgm_pressure = run['SA3_XTD10_XGM/XGM/DOOCS', "pressure.pressureFiltered.value"].select_trains(by_id[tids]).ndarray()
+    #xgm_pe = run['SA3_XTD10_XGM/XGM/DOOCS:output', "data.intensitySa3TD"].select_trains(by_id[tids]).ndarray()
     #retvol_raw = run["SA3_XTD10_PES/MDL/DAQ_MPOD", "u212.value"].select_trains(by_id[tids]).ndarray()
     #retvol_raw_timestamp = run["SA3_XTD10_PES/MDL/DAQ_MPOD", "u212.timestamp"].select_trains(by_id[tids]).ndarray()
@@ -118,7 +126,10 @@ def main():
     model.debug_peak_finding(pes_raw, "test_peak_finding.png")
 
     print("Fitting")
-    model.fit({k: v[train_idx, :] for k, v in pes_raw.items()}, spec_raw_int[train_idx, :], spec_raw_pe[train_idx, :])
+    model.fit({k: v[train_idx, :]
+               for k, v in pes_raw.items()},
+              spec_raw_int[train_idx, :],
+              spec_raw_pe[train_idx, :])
 
     spec_smooth = model.preprocess_high_res(spec_raw_int, spec_raw_pe)
 
     print("Saving the model")
@@ -136,7 +147,11 @@ def main():
     # plot
     for tid in test_tids:
         idx = np.where(tid==tids)[0][0]
-        plot_result(f"test_{tid}.png", spec_pred[idx, :, :], spec_smooth[idx, :], spec_raw_pe[idx, :], spec_raw_int[idx, :])
+        plot_result(f"test_{tid}.png",
+                    spec_pred[idx, :, :],
+                    spec_smooth[idx, :],
+                    spec_raw_pe[idx, :],
+                    spec_raw_int[idx, :])
         for ch in channels:
             plot_pes(f"test_pes_{tid}_{ch}.png", pes_raw[ch][idx, :])
...