From 5e1c5100ae7c81c3a34d23bdaf82c10f255eeabe Mon Sep 17 00:00:00 2001
From: Danilo Enoque Ferreira de Lima <danilo.enoque.ferreira.de.lima@xfel.eu>
Date: Tue, 20 Dec 2022 11:33:55 +0100
Subject: [PATCH] Update README.md.

---
 README.md | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 89 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index f76d3e1..3deaa9c 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,94 @@
 # pes_to_spec
 
-Aim of the project to transform the data from PES to SPEC.
+The aim of this project is to read low-resolution photon spectrometer measurements and predict, with an uncertainty band, the result of a high-resolution invasive
+photon spectrometer, so that one may continue to collect low-resolution data without stopping the beam while nevertheless obtaining high-resolution
+results.
 
-# Usage
+The concept is to collect both results simultaneously during a training phase and use them to learn a model that may be used later, under the same conditions, but without the high-resolution, invasive spectrometer. The idea is that minimal tuning of the parameters of this method is needed, so that if the training data is available, no fine-tuning is required.
+
+## Installation
+
+One may install it simply with `pip install pes_to_spec`.
+
+## Usage
+
+The API may be used as follows:
+
+```
+from pes_to_spec.model import Model
+
+# this is the main object holding all
+# information needed for training and prediction
+# the default parameters should be sufficient in most cases
+model = Model()
+
+# this trains the model
+# low_resolution_raw_data is a dictionary with keys "channel_[1-4]_[A-D]" and values set to 2D-shaped
+# numpy arrays with shape (number_of_train_IDs, features),
+# indicating the low resolution spectra for each input channel
+# high_resolution_intensity and high_resolution_photon_energy are estimates from the high-resolution invasive spectrometer
+model.fit(low_resolution_raw_data, high_resolution_intensity, high_resolution_photon_energy)
+
+# save it for later usage:
+model.save("model.h5")
+
+# when performing inference:
+# load a model:
+model = Model()
+model.load("model.h5")
+
+# and use it to map a low-resolution spectrum to a high-resolution one
+# as before, the low_resolution_raw_data refers to a dictionary mapping the channel name
+# in the format "channel_[1-4]_[A-D]" to the 2D numpy array with shape (number_of_train_IDs, features)
+# all names and shapes must match the format in training, except for the number_of_train_IDs, which may vary
+interpolated_high_resolution_data = model.predict(low_resolution_raw_data)
+
+# extra useful functions for debugging:
+
+# this may be used to debug the finding of the prompt peak in the low-resolution data
+# ideally this finds the peak correctly in the sample low-resolution data used for training
+model.debug_peak_finding(low_resolution_raw_data, "test_peak_finding.png")
+
+# this provides a smoothed version of the high-resolution spectrum, filtering sources of noise
+# caused by fluctuations below the spectrometer's resolution
+# it may be useful for plotting and debugging
+high_resolution_filtered = model.preprocess_high_res(high_resolution_intensity, high_resolution_photon_energy)
+
+```
+
+The input data shown here may be retrieved using `extra_data` as in the following example:
+
+```
+from extra_data import RunDirectory
+from itertools import product
+
+run = RunDirectory(f"/gpfs/exfel/exp/SA3/202121/p002935/raw/r0015")
+
+# get train IDs and match them, so we are sure to have information from all needed sources
+# in this example, there is an offset of -2 in the SPEC train ID, so correct for it
+# this should not be necessary usually
+spec_offset = -2
+spec_tid = spec_offset + run['SA3_XTD10_SPECT/MDL/FEL_BEAM_SPECTROMETER_SQS1:output', f"data.trainId"].ndarray()
+pes_tid = run['SA3_XTD10_PES/ADC/1:network', f"digitizers.trainId"].ndarray()
+xgm_tid = run['SA3_XTD10_XGM/XGM/DOOCS:output', f"data.trainId"].ndarray()
+# this is the intersection of the train IDs
+# this could have been done by a select call in the RunDirectory, but it would not correct for the spec_offset
+tids = matching_ids(spec_tid, pes_tid, xgm_tid)
+train_tids = tids[:-10]
+test_tids = tids[-10:]
+
+# read the spec photon energy and intensity
+high_resolution_photon_energy = run['SA3_XTD10_SPECT/MDL/FEL_BEAM_SPECTROMETER_SQS1:output', f"data.photonEnergy"].select_trains(by_id[tids - spec_offset]).ndarray()
+high_resolution_intensity = run['SA3_XTD10_SPECT/MDL/FEL_BEAM_SPECTROMETER_SQS1:output', f"data.intensityDistribution"].select_trains(by_id[tids - spec_offset]).ndarray()
+
+# read the PES data for each channel
+channels = [f"channel_{i}_{l}" for i, l in product(range(1, 5), ["A", "B", "C", "D"])]
+low_resolution_raw_data = {ch: run['SA3_XTD10_PES/ADC/1:network', f"digitizers.{ch}.raw.samples"].select_trains(by_id[tids]).ndarray() for ch in channels}
+```
+
+## Initial exploration tests
+
+A first draft and exploratory code can be found in the `exploration` directory.
 
 1. inv_train.py -> Train a model on the specific RUN.
 Thish will save the pca model and fit model in experiments/YOUR_DIR/checkpoints and the data in 
@@ -12,4 +98,4 @@ experiments/YOUR_DIR/data.
 
 3. inv_inference -> Use trained model to do inference on new data point
 
-4. data_drift_check.py  -> Check data drift between two datasets
\ No newline at end of file
+4. data_drift_check.py  -> Check data drift between two datasets
-- 
GitLab