From 5393914ea8656eacb5dceaf122cda0a68a7cda58 Mon Sep 17 00:00:00 2001
From: Rafael Gort <rafael.gort@xfel.eu>
Date: Thu, 7 May 2020 16:14:28 +0200
Subject: [PATCH] Extend skeleton for DSSC class; restructure processing and
 routines modules to allow easy switching between joblib and multiprocessing

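A sketch of the intended per-module dispatch (the 16-module fan-out,
the 'jobs' list and the proposal/run values below are illustrative,
not yet part of the code):

    import multiprocessing
    import joblib
    import toolbox_scs.detectors as tbdet

    # one job dictionary per DSSC module
    jobs = [dict(proposal=2212, run_nr=89, module=m, chunksize=512,
                 binfile='tmp/binfile.h5') for m in range(16)]

    # option 1: multiprocessing - bin_data_multipr unpacks the dict
    with multiprocessing.Pool(16) as pool:
        module_data = pool.map(tbdet.bin_data_multipr, jobs)

    # option 2: joblib - bin_data takes the keyword arguments directly
    module_data = joblib.Parallel(n_jobs=16)(
        joblib.delayed(tbdet.bin_data)(**job) for job in jobs)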
---
 src/toolbox_scs/detectors/__init__.py         |  14 +-
 .../detectors/azimuthal_integrator.py         |   6 +-
 src/toolbox_scs/detectors/dssc.py             |   9 +-
 src/toolbox_scs/detectors/dssc_misc.py        | 117 +++++++++++++
 .../{dssc_routines.py => dssc_processing.py}  | 178 +++++++-----------
 5 files changed, 202 insertions(+), 122 deletions(-)
 create mode 100644 src/toolbox_scs/detectors/dssc_misc.py
 rename src/toolbox_scs/detectors/{dssc_routines.py => dssc_processing.py} (77%)

diff --git a/src/toolbox_scs/detectors/__init__.py b/src/toolbox_scs/detectors/__init__.py
index abb5916..d08e171 100644
--- a/src/toolbox_scs/detectors/__init__.py
+++ b/src/toolbox_scs/detectors/__init__.py
@@ -3,8 +3,9 @@ from .xgm import (
 from .tim import (
     load_TIM,)
-from .dssc_routines import (
-    load_dssc_info, calc_xgm_frame_indices, process_dssc_module, split_frames,
-    process_intra_train)
+from .dssc_misc import (
+    load_dssc_info, load_geom, calc_xgm_frame_indices)
+from .dssc_processing import (
+    bin_data_multipr, process_intra_train, bin_data, split_frames)
 from .dssc import DSSC
 
 __all__ = (
@@ -14,7 +15,9 @@ __all__ = (
     "matchXgmTimPulseId",
     "load_dssc_info",
     "calc_xgm_frame_indices",
-    "process_dssc_module",
+    "bin_data_multipr",
+    "bin_data",
     "process_intra_train",
     "split_frames",
     # Classes
@@ -36,6 +39,7 @@ clean_ns = [
     'DSSC1module',
     'dssc',
-    'dssc_routines',
+    'dssc_misc',
+    'dssc_processing',
     'dssc_data',
     'dssc_plot',
     'azimuthal_integrator',
diff --git a/src/toolbox_scs/detectors/azimuthal_integrator.py b/src/toolbox_scs/detectors/azimuthal_integrator.py
index 56759ac..e6b4ce3 100644
--- a/src/toolbox_scs/detectors/azimuthal_integrator.py
+++ b/src/toolbox_scs/detectors/azimuthal_integrator.py
@@ -1,5 +1,9 @@
+import logging
+
 import numpy as np
 
+log = logging.getLogger(__name__)
+
 class AzimutalIntegrator(object):
     def __init__(self, imageshape, center, polar_range, dr=2, aspect=204/236):
         '''
@@ -37,7 +41,7 @@ class AzimutalIntegrator(object):
         '''
         self.shape = imageshape
         cx, cy = center
-        print(f'azimuthal center: {center}')
+        log.info(f'azimuthal center: {center}')
         sx, sy = imageshape
         xcoord, ycoord = np.ogrid[:sx, :sy]
         xcoord -= cx
diff --git a/src/toolbox_scs/detectors/dssc.py b/src/toolbox_scs/detectors/dssc.py
index 09a608e..7ba4dba 100644
--- a/src/toolbox_scs/detectors/dssc.py
+++ b/src/toolbox_scs/detectors/dssc.py
@@ -62,7 +62,14 @@ class DSSC:
     # Data processing
     # -------------------------------------------------------------------------
     def process_bin(self):
-        pass
+        pass  # placeholder - one of the two options below will be chosen
+        # option 1: multiprocessing
+        # with multiprocessing.Pool(16) as pool:
+        #     module_data = pool.map(tbdet.bin_data_multipr, jobs)
+
+        # option 2: joblib
+        # module_data = joblib.Parallel(n_jobs=16) \
+        #     (joblib.delayed(tbdet.bin_data)(**jobs[i]) for i in range(16))
 
     def azimuthal_integration(self):
         pass
diff --git a/src/toolbox_scs/detectors/dssc_misc.py b/src/toolbox_scs/detectors/dssc_misc.py
new file mode 100644
index 0000000..013ada2
--- /dev/null
+++ b/src/toolbox_scs/detectors/dssc_misc.py
@@ -0,0 +1,117 @@
+""" 
+    DSSC-related sub-routines.
+
+    original-author: Michael Schneider 
+    authors: SCS-team members
+    
+    license: BSD 3-Clause License (see LICENSE_BSD for more info)
+
+    comment: contributions should comply with pep8 code structure guidelines.
+"""
+import logging
+
+import numpy as np
+
+import extra_data as ed
+from extra_geom import DSSC_1MGeometry
+
+
+log = logging.getLogger(__name__)
+
+
+def load_dssc_info(proposal, run_nr):
+    """
+    Loads the first data file for DSSC module 0 (this is hardcoded)
+    and returns the detector_info dictionary
+
+    Parameters
+    ----------
+    proposal: str, int
+        number of proposal
+    run_nr: str, int
+        number of run
+
+    Returns
+    -------
+    info : dictionary
+        {'dims': tuple, 'frames_per_train': int, 'total_frames': int}
+    """
+
+    module = ed.open_run(proposal, run_nr, include='*DSSC00*')
+    info = module.detector_info('SCS_DET_DSSC1M-1/DET/0CH0:xtdf')
+    log.debug("Fetched information for DSSC module nr. 0.")
+    return info
+
+
+def calc_xgm_frame_indices(nbunches, framepattern):
+    """
+    Returns a coordinate array for XGM data. The coordinates correspond to
+    DSSC frame numbers and depend on the number of FEL pulses per train
+    ("nbunches") and the framepattern. In framepattern, dark DSSC frame
+    names (i.e., without FEL pulse) _must_ include "dark" as a substring.
+    
+    Copyright (c) 2019, Michael Schneider
+    Copyright (c) 2020, SCS-team
+
+    Parameters
+    ----------
+    nbunches: int
+        number of bunches per train
+    framepattern: list
+        experimental pattern
+
+    Returns
+    -------
+    frame_indices: numpy.ndarray
+        coordinate array corresponding to DSSC frame numbers
+
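+    Examples
+    --------
+    For instance, nbunches=2 with framepattern=['pumped', 'unpumped',
+    'dark'] gives frame_max = 2 * 3 // 2 = 3 and frame_indices = [0, 1];
+    the 'dark' frame (index 2) has no corresponding XGM value.
+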
+    """
+
+    n_frames = len(framepattern)
+    n_data_frames = np.sum(['dark' not in p for p in framepattern])
+    frame_max = nbunches * n_frames // n_data_frames
+
+    frame_indices = []
+    for i, p in enumerate(framepattern):
+        if 'dark' not in p:
+            frame_indices.append(np.arange(i, frame_max, n_frames))
+
+    log.debug("Constructed coordinate array for XGM data.")
+    return np.sort(np.concatenate(frame_indices))
+
+
+def load_geom(geopath=None, quad_pos=None):
+    """ 
+    Loads and return the DSSC geometry.
+
+    Parameters
+    ----------
+    geopath: str 
+        path to the h5 geometry file. If None uses a default file.
+        quad_pos: list of quadrants tuple position. If None uses a default
+        position.
+
+    Returns
+    -------
+    geom: extra_geom.DSSC_1MGeometry
+        loaded geometry object
+        """
+    if quad_pos is None:
+        quad_pos = [(-124.100,    3.112),  # TR
+                    (-133.068, -110.604),  # BR
+                    (   0.988, -125.236),  # BL
+                    (   4.528,   -4.912)   # TL
+                    ]
+
+    if geopath is None:
+        geopath = '/gpfs/exfel/sw/software/git/EXtra-geom/' \
+                  'docs/dssc_geo_june19.h5'
+
+    geom = DSSC_1MGeometry.from_h5_file_and_quad_positions(geopath, quad_pos)
+    return geom
diff --git a/src/toolbox_scs/detectors/dssc_routines.py b/src/toolbox_scs/detectors/dssc_processing.py
similarity index 77%
rename from src/toolbox_scs/detectors/dssc_routines.py
rename to src/toolbox_scs/detectors/dssc_processing.py
index ded1405..0930849 100644
--- a/src/toolbox_scs/detectors/dssc_routines.py
+++ b/src/toolbox_scs/detectors/dssc_processing.py
@@ -9,6 +9,7 @@
     comment: contributions should comply with pep8 code structure guidelines.
 """
 import logging
+from joblib import Parallel, delayed
 from tqdm import tqdm
 
 import numpy as np
@@ -21,98 +22,6 @@ import extra_data as ed
 log = logging.getLogger(__name__)
 
 
-def load_dssc_info(proposal, run_nr):
-    """
-    Loads the first data file for DSSC module 0 (this is hardcoded)
-    and returns the detector_info dictionary
-
-    Parameters
-    ----------
-    proposal: str, int
-        number of proposal
-    run_nr: str, int
-        number of run
-
-    Returns
-    -------
-    info : dictionary
-        {'dims': tuple, 'frames_per_train': int, 'total_frames': int}
-    """
-
-    module = ed.open_run(proposal, run_nr, include='*DSSC00*')
-    info = module.detector_info('SCS_DET_DSSC1M-1/DET/0CH0:xtdf')
-    log.debug("Fetched information for DSSC module nr. 0.")
-    return info
-
-
-def load_geom(geopath=None, quad_pos=None):
-    """ 
-    Loads and return the DSSC geometry.
-
-    Parameters
-    ----------
-    geopath: str 
-        path to the h5 geometry file. If None uses a default file.
-        quad_pos: list of quadrants tuple position. If None uses a default
-        position.
-
-    Returns
-    -------
-    geom: extra_geom.DSSC_1MGeometry
-        loaded geometry object
-        """
-    if quad_pos is None:
-        quad_pos = [(-124.100,    3.112),  # TR
-                    (-133.068, -110.604),  # BR
-                    (   0.988, -125.236),  # BL
-                    (   4.528,   -4.912)   # TL
-                    ]
-
-    if geopath is None:
-        geopath = '/gpfs/exfel/sw/software/git/EXtra-geom/' \
-                  'docs/dssc_geo_june19.h5'
-
-    geom = DSSC_1MGeometry.from_h5_file_and_quad_positions(geopath, quad_pos)
-    return geom
-
-
-def calc_xgm_frame_indices(nbunches, framepattern):
-    """
-    Returns a coordinate array for XGM data. The coordinates correspond to
-    DSSC frame numbers and depend on the number of FEL pulses per train
-    ("nbunches") and the framepattern. In framepattern, dark DSSC frame
-    names (i.e., without FEL pulse) _must_ include "dark" as a substring.
-    
-    Copyright (c) 2019, Michael Schneider
-    Copyright (c) 2020, SCS-team
-
-    Parameters
-    ----------
-    nbunches: int
-        number of bunches per train
-    framepattern: list
-        experimental pattern
-
-    Returns
-    -------
-    frame_indices: numpy.ndarray
-        coordinate array corresponding to DSSC frame numbers
-
-    """
-
-    n_frames = len(framepattern)
-    n_data_frames = np.sum(['dark' not in p for p in framepattern])
-    frame_max = nbunches * n_frames // n_data_frames
-
-    frame_indices = []
-    for i, p in enumerate(framepattern):
-        if 'dark' not in p:
-            frame_indices.append(np.arange(i, frame_max, n_frames))
-
-    log.debug("Constructed coordinate array for XGM data.")
-    return np.sort(np.concatenate(frame_indices))
-
-
 def prepare_module_empty(scan_variable, framepattern):
     """
     Create empty (zero-valued) DataArray for a single DSSC module
@@ -345,30 +254,24 @@ def process_intra_train(job):
     return module_data
 
 
-def process_dssc_module(job):
+def bin_data_multipr(job):
     """
-    Aggregate DSSC data (chunked, to fit into memory) for a single module.
-    Groups by "scan_variable" in given scanfile - use dummy scan_variable to
-    average over all trains. This implies, that only trains found in the
-    scanfile are considered.
-    
-    Copyright (c) 2019, Michael Schneider
-    Copyright (c) 2020, SCS-team
+    Entry point for binning routines using the multiprocessing module.
 
     Parameters
     ----------
-    job: dictionary
+    job: dictionary
       Designed for the multiprocessing module - expects a job dictionary with
       the following keys:
         proposal : int
                    proposal number
-        run : int
+        run_nr : int
                 run number
         module : int
                 DSSC module to process
         chunksize : int
                 number of trains to process simultaneously
-        scanfile : str
+        binfile : str
                 name of hdf5 file with xarray.DataArray containing the
-                scan variable and trainIds
+                bin variable and trainIds
         framepattern : list of str
@@ -383,14 +286,59 @@
     module_data: xarray.Dataset
 
     """
+    params = {}
+    params['proposal'] = job['proposal']
+    params['run_nr'] = job['run_nr']
+    params['module'] = job['module']
+    params['chunksize'] = job['chunksize']
+    params['binfile'] = job['binfile']
+    params['framepattern'] = job.get('framepattern', ['image'])
+    params['maskfile'] = job.get('maskfile', None)
 
-    proposal = job['proposal']
-    run_nr = job['run_nr']
-    module = job['module']
-    chunksize = job['chunksize']
-    scanfile = job['scanfile']
-    framepattern = job.get('framepattern', ['image'])
-    maskfile = job.get('maskfile', None)
+    return bin_data(**params)
+
+
+def bin_data(proposal, run_nr, module, chunksize, binfile,
+             framepattern=['image'], maskfile=None):
+    """
+    Aggregate DSSC data (chunked, to fit into memory) for a single module.
+    Groups by "scan_variable" into the given bins - use a dummy scan_variable
+    to average over all trains. This implies that only trains found in the
+    bin file are considered.
+    
+    Copyright (c) 2019, Michael Schneider
+    Copyright (c) 2020, SCS-team
+
+    Parameters
+    ----------
+    proposal : int
+        proposal number
+    run_nr : int
+        run number
+    module : int
+        DSSC module to process
+    chunksize : int
+        number of trains to process simultaneously
+    binfile : str
+        name of hdf5 file with xarray.DataArray containing the
+        bin variable and trainIds
+    framepattern : list of str
+        names for the (possibly repeating) intra-train pulses. See
+        split_dssc_data
+    maskfile : str
+        name of hdf5 file with boolean xarray.DataArray to
+        select/reject trains and pulses
+
+    Returns
+    -------
+    module_data: xarray.Dataset
+        xarray data structure containing the binned data.
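+
+    Examples
+    --------
+    A minimal call (proposal, run and file name are illustrative):
+    >>> module_data = bin_data(2212, 89, 0, 512, 'tmp/binfile.h5')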
+    """
 
     sourcename = f'SCS_DET_DSSC1M-1/DET/{module}CH0:xtdf'
     collection = ed.open_run(proposal, run_nr,
@@ -399,8 +347,8 @@
 
     log.info(f"Processing dssc module {module}: start")
 
-    # read preprocessed scan variable from file
-    scan = xr.open_dataarray(scanfile, 'data', engine='h5netcdf')
+    # read preprocessed bins from file
+    bins = xr.open_dataarray(binfile, 'data', engine='h5netcdf')
 
     # read binary pulse/train mask - e.g. from XGM thresholding
     if maskfile is not None:
@@ -408,7 +356,7 @@
     else:
         pulsemask = None
 
-    module_data = prepare_module_empty(scan, framepattern)
+    module_data = prepare_module_empty(bins, framepattern)
     chunks = np.arange(ntrains, step=chunksize)
 
     # progress bar
@@ -436,7 +384,7 @@
             data = xr.merge([data, sum_count])
 
             # aligns on trainId, drops non-matching trains
-            data['scan_variable'] = scan
+            data['scan_variable'] = bins
             data = data.groupby('scan_variable').sum('trainId')
 
             log.debug(f"Module {module}: "
-- 
GitLab