Virtual spectrometer
The aim of this project is to read low-resolution photon spectrometer measurements and predict, with an uncertainty band, the result of a high-resolution, invasive photon spectrometer. This allows one to keep collecting low-resolution data without stopping the beam, while nevertheless obtaining high-resolution results.
The concept is to collect both measurements simultaneously during a training phase and use them to train a model that can be applied later, under the same conditions, without the high-resolution, invasive spectrometer. The method is designed to need minimal parameter tuning: if training data is available, no fine-tuning should be required.
Simple usage in Maxwell
The dependencies in the standard xfel (current) kernel are enough to run the example notebook. All one needs to do is copy the main code in this package to a given directory.
This can be done by typing these commands on a Maxwell machine:
git clone https://git.xfel.eu/machineLearning/pes_to_spec.git
cd pes_to_spec
Afterwards, open a Jupyter Lab instance (max-jhub.desy.de), go to the directory where the package has been copied, and open the example notebook in notebook/Example offline analysis.ipynb.
One should be able to run the example analysis in the notebook.
Full installation of dependencies from scratch
The dependencies can be installed in a separate environment and used from Jupyter Lab in Maxwell by executing the following commands on Maxwell:
# clone the repository
git clone https://git.xfel.eu/machineLearning/pes_to_spec.git
cd pes_to_spec
# create separate environment
module load exfel exfel-python
python -m venv env
source env/bin/activate
# install numpy and scipy (Intel-optimized builds first, without their
# dependencies; the second command then fills in anything missing)
pip install --force-reinstall --index-url https://pypi.anaconda.org/intel/simple --no-dependencies numpy scipy==1.7.3
pip install numpy scipy==1.7.3
pip install torch --index-url https://download.pytorch.org/whl/cpu
pip install joblib scikit-learn
# install dependencies for the analysis in the notebook
pip install matplotlib lmfit seaborn extra_data
# add it to the Jupyter Lab in Maxwell
pip install ipykernel
python -m ipykernel install --user --name pes_to_spec_env
After this, open the example notebook from the notebook directory in a Jupyter Lab instance (max-jhub.desy.de) and set the kernel to pes_to_spec_env.
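To verify the installation, one may run the following minimal check in a notebook cell using the pes_to_spec_env kernel (a sketch; it only confirms that the package and its dependencies import correctly):
# minimal installation check: should run without errors in the pes_to_spec_env kernel
from pes_to_spec.model import Model
model = Model()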
API usage
The API may be used as follows:
from pes_to_spec.model import Model
from itertools import product
# this is the main object holding all
# information needed for training and prediction
# the default parameters should be sufficient in most cases
# if some channels are dead, you may want to specify only the active channels
channels = [f"channel_{i}_{l}" for i, l in product(range(1, 5), ["A", "B", "C", "D"])]
model = Model(channels=channels)
# this trains the model
# low_resolution_raw_data is a dictionary with keys "channel_[1-4]_[A-D]" and values set to
# 2D numpy arrays of shape (number_of_train_IDs, features),
# holding the low-resolution spectra for each input channel
# high_resolution_intensity and high_resolution_photon_energy are estimates from the
# high-resolution, invasive spectrometer
model.fit(low_resolution_raw_data,
          high_resolution_intensity,
          high_resolution_photon_energy,
          pulse_energy=beam_intensity)
# save it for later usage:
model.save("model.joblib")
# when performing inference:
# load a model:
model = Model.load("model.joblib")
# and use it to map a low-resolution spectrum to a high-resolution one
# as before, low_resolution_raw_data is a dictionary mapping the channel name
# in the format "channel_[1-4]_[A-D]" to a 2D numpy array of shape (number_of_train_IDs, features)
# all names and shapes must match those used in training, except for number_of_train_IDs, which may vary
interpolated_high_resolution_data = model.predict(low_resolution_raw_data, pulse_energy=beam_intensity)
# extra useful functions for debugging:
# this may be used to debug the finding of the prompt peak in the low-resolution data
# ideally this finds the peak correctly in the sample low-resolution data used for training
model.debug_peak_finding(low_resolution_raw_data, "test_peak_finding.png")
# this provides a smoothed version of the high-resolution spectrum, filtering out noise
# caused by fluctuations below the spectrometer's resolution
# it may be useful for plotting and debugging
high_resolution_filtered = model.preprocess_high_res(high_resolution_intensity,
                                                     high_resolution_photon_energy)
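With these outputs in hand, one may, for example, compare the raw and filtered high-resolution spectra. This is a minimal sketch, assuming matplotlib is installed (as in the notebook dependencies above) and the array shapes described earlier:
import matplotlib.pyplot as plt
# plot the raw and filtered high-resolution spectra for the first train ID
plt.plot(high_resolution_photon_energy[0], high_resolution_intensity[0], label="raw")
plt.plot(high_resolution_photon_energy[0], high_resolution_filtered[0], label="filtered")
plt.xlabel("Photon energy [eV]")
plt.ylabel("Intensity [arb. units]")
plt.legend()
plt.savefig("test_high_res_filtering.png")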
The input data shown here may be retrieved using extra_data, as in the following example:
from extra_data import RunDirectory, by_id
from itertools import product
# matching_ids returns the train IDs common to all given arrays
from pes_to_spec.model import matching_ids
run = RunDirectory(f"/gpfs/exfel/exp/SA3/202121/p002935/raw/r0015")
# get train IDs and match them, so we are sure to have information from all needed sources
# in this example, there is an offset of -2 in the SPEC train ID, so correct for it
# (this is usually not necessary)
spec_offset = -2
spec_tid = spec_offset + run['SA3_XTD10_SPECT/MDL/FEL_BEAM_SPECTROMETER_SQS1:output',
                             "data.trainId"].ndarray()
pes_tid = run['SA3_XTD10_PES/ADC/1:network',
              "digitizers.trainId"].ndarray()
xgm_tid = run['SA3_XTD10_XGM/XGM/DOOCS:output',
              "data.trainId"].ndarray()
# take the intersection of the train IDs
# this could have been done by a select call in the RunDirectory,
# but it would not correct for the spec_offset
tids = matching_ids(spec_tid, pes_tid, xgm_tid)
train_tids = tids[:-10]
test_tids = tids[-10:]
# read the spec photon energy and intensity
high_resolution_photon_energy = run['SA3_XTD10_SPECT/MDL/FEL_BEAM_SPECTROMETER_SQS1:output',
                                    "data.photonEnergy"].select_trains(by_id[tids - spec_offset]).ndarray()
high_resolution_intensity = run['SA3_XTD10_SPECT/MDL/FEL_BEAM_SPECTROMETER_SQS1:output',
                                "data.intensityDistribution"].select_trains(by_id[tids - spec_offset]).ndarray()
# read the PES data for each channel
channels = [f"channel_{i}_{l}" for i, l in product(range(1, 5), ["A", "B", "C", "D"])]
low_resolution_raw_data = {ch: run['SA3_XTD10_PES/ADC/1:network',
                                   f"digitizers.{ch}.raw.samples"].select_trains(by_id[tids]).ndarray()
                           for ch in channels}
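The pulse_energy argument passed to fit and predict can be read from the XGM in the same way. The sketch below assumes the key data.intensitySa3TD, a common choice for SASE3 data, which should be verified against the keys available in the run:
# read the per-train pulse energy from the XGM
# (the key name is an assumption; check run.keys_for_source('SA3_XTD10_XGM/XGM/DOOCS:output'))
beam_intensity = run['SA3_XTD10_XGM/XGM/DOOCS:output',
                     "data.intensitySa3TD"].select_trains(by_id[tids]).ndarray()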
Offline analysis test
After installation, the command-line tool offline_analysis may be used to test the method on recorded datasets.