From 489661ee9a75c4093e6b4ff8e5415826ec4b3a18 Mon Sep 17 00:00:00 2001 From: Egor Sobolev <egor.sobolev@xfel.eu> Date: Sat, 8 Mar 2025 00:23:52 +0100 Subject: [PATCH 1/8] Add reduction of frames by candidate list --- setup.py | 1 + src/exdf/data_reduction/builtins.py | 152 +++++++++++++++++++++++++++- 2 files changed, 152 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2dae071..6f173de 100644 --- a/setup.py +++ b/setup.py @@ -46,6 +46,7 @@ setup( 'AgipdGain = exdf.data_reduction.builtins:AgipdGain', 'LpdMini = exdf.data_reduction.builtins:LpdMini', 'LitFrames = exdf.data_reduction.builtins:LitFrames', + 'FrameList = exdf.data_reduction.builtins:FrameList', ] }, diff --git a/src/exdf/data_reduction/builtins.py b/src/exdf/data_reduction/builtins.py index 37bc292..942a91e 100644 --- a/src/exdf/data_reduction/builtins.py +++ b/src/exdf/data_reduction/builtins.py @@ -3,7 +3,7 @@ from logging import getLogger import numpy as np from exdf.data_reduction import ReductionMethod -from extra_data import SourceNameError, by_id +from extra_data import SourceNameError, by_id, RunDirectory def parse_slice(value): @@ -399,3 +399,153 @@ class LitFrames(ReductionMethod): ) if nrec > max_lines: self.log.info(f"... 
{nrec - max_lines + 1} more lines skipped") + + +class FrameList(ReductionMethod): + log = getLogger('exdf.data_reduction.builtins.FrameList') + + @staticmethod + def arguments(ap): + group = ap.add_argument_group( + 'Candidate frame selection', + 'Allows to filter frames by candidate list') + + group.add_argument( + '--framelist-det-sources', + action='store', type=str, + help='Detector sources to filter', + ) + group.add_argument( + '--framelist-file', + action='store', type=str, + help='Candidate list file, if none use data collection', + ) + group.add_argument( + '--framelist-filetype', + action='store', type=str, choices=['exdf', 'h5', 'csv'], default='exdf', + help='Candidate list file, if none use data collection', + ) + group.add_argument( + '--framelist-entry', + action='store', type=str, default='', + help='Source or dataset prefix', + ) + group.add_argument( + '--framelist-class-key', + action='store', type=str, + help='Key of frame class dataset', + ) + group.add_argument( + '--framelist-train-id-key', + action='store', type=str, + help='Key of frame class dataset', + ) + group.add_argument( + '--framelist-pulse-id-key', + action='store', type=str, + help='Key of frame class dataset', + ) + group.add_argument( + '--framelist-keep-missed', + action='store_true', + help='Key of frame class dataset', + ) + + def __init__(self, data, args): + if args.framelist_filetype == 'exdf': + filename = args.framelist_file + try: + clist = data if filename is None else RunDirectory(filename) + except FileNotFoundError: + self.log.error(f"Directory '{filename}' not found.") + return + + source = args.framelist_entry + class_key = args.framelist_class_key + if class_key is None: + self.log.error( + "The --framelist-class-key option is not specified.") + return + train_id_key = args.framelist_train_id_key + if train_id_key is None: + train_id_key = class_key.rpartition('.')[0] + '.trainId' + pulse_id_key = args.framelist_pulse_id_key + if pulse_id_key is None: + 
pulse_id_key = class_key.rpartition('.')[0] + '.pulseId' + + try: + group = clist[source] + cls = group[class_key].ndarray() + tid = group[train_id_key].ndarray() + pid = group[pulse_id_key].ndarray() + except (SourceNameError, KeyError): + self.log.error(f"No control source {source} or " + "required keys found in data.") + return + elif args.framelist_filetype == 'h5': + import h5py + try: + with h5py.File(filename) as f: + path = args.framelist_entry + tid = f[path + '/' + args.framelist_train_id_key][:] + pid = f[path + '/' + args.framelist_pulse_id_key][:] + cls = f[path + '/' + args.framelist_class_key][:] + except FileNotFoundError: + self.log.error(f"Directory '{filename}' not found.") + return + except KeyError: + self.log.error("Required keys are not found in data.") + return + elif args.framelist_filetype == 'csv': + try: + pass + except FileNotFoundError: + self.log.error(f"Directory '{filename}' not found.") + return + + # group by trains + trains, first, count = np.unique(tid, return_index=True, return_counts=True) + + # loop over sources + det = data.select(args.framelist_det_sources, "image.*") + for source_req in det.instrument_sources: + # resolve legacy source + src = data.legacy_sources.get(source_req, source_req) + self.log.info("select frames in " + src) + + num_drop = 0 + num_keep = 0 + num_missed_frames = 0 + num_missed_trains = 0 + + key = data[src, "image.pulseId"].drop_empty_trains() + for train_id, pulse_ids in key.trains(): + num_frames = len(pulse_ids) + if args.framelist_keep_missed: + mask = np.ones(num_frames, bool) + else: + mask = np.zeros(num_frames, bool) + i = np.argmax(trains == train_id) + if trains[i] == train_id: + f0 = first[i] + fN = f0 + count[i] + pulses = np.isin(pulse_ids, pid[f0:fN]) + mask[pulses] = cls[f0:fN] != 0 + num_missed_frames += abs(num_frames - count[i]) + else: + num_missed_trains += 1 + + reduced_num_frames = np.sum(mask) + num_keep += reduced_num_frames + num_drop += num_frames - reduced_num_frames + + 
if src.endswith(':xtdf'): + self.select_xtdf(src, by_id[[train_id]], mask) + else: + self.select_entries(src, 'image', by_id[[train_id]], mask) + + ratio = num_keep / (num_keep + num_drop) + self.log.info(f" - {num_keep} frames keep, {num_drop} frames drop, " + f"ratio: {ratio:.2%}") + self.log.info(f" - decision is missed for {num_missed_trains} trains, " + f"{num_missed_frames} frames") -- GitLab From 515d1d739d645dabaebe70e676e26a3db491db6a Mon Sep 17 00:00:00 2001 From: Egor Sobolev <egor.sobolev@xfel.eu> Date: Mon, 10 Mar 2025 15:52:23 +0100 Subject: [PATCH 2/8] Support csv format, isolate file reading --- src/exdf/data_reduction/builtins.py | 123 ++++++++++++++++------------ 1 file changed, 69 insertions(+), 54 deletions(-) diff --git a/src/exdf/data_reduction/builtins.py b/src/exdf/data_reduction/builtins.py index 942a91e..ce92362 100644 --- a/src/exdf/data_reduction/builtins.py +++ b/src/exdf/data_reduction/builtins.py @@ -404,6 +404,9 @@ class LitFrames(ReductionMethod): class FrameList(ReductionMethod): log = getLogger('exdf.data_reduction.builtins.FrameList') + event_type = np.dtype([ + ('tid', np.uint64), ('pid', np.uint64), ('cls', np.uint16)]) + @staticmethod def arguments(ap): group = ap.add_argument_group( @@ -428,7 +431,7 @@ class FrameList(ReductionMethod): group.add_argument( '--framelist-entry', action='store', type=str, default='', - help='Source or dataset prefix', + help='Source name in EXDF format or dataset prefix in other formats', ) group.add_argument( '--framelist-class-key', @@ -451,60 +454,72 @@ class FrameList(ReductionMethod): help='Key of frame class dataset', ) - def __init__(self, data, args): - if args.framelist_filetype == 'exdf': - filename = args.framelist_file - try: - clist = data if filename is None else RunDirectory(filename) - except FileNotFoundError: - self.log.error(f"Directory '{filename}' not found.") - return - - source = args.framelist_entry - class_key = args.framelist_class_key - if class_key is None: - 
self.log.error( - "The --framelist-class-key option is not specified.") - return - train_id_key = args.framelist_train_id_key - if train_id_key is None: - train_id_key = class_key.rpartition('.')[0] + '.trainId' - pulse_id_key = args.framelist_pulse_id_key - if pulse_id_key is None: - pulse_id_key = class_key.rpartition('.')[0] + '.pulseId' + def from_exdf(self, data, args): + clist = ( + data if args.framelist_file is None + else RunDirectory(args.framelist_file) + ) + source = args.framelist_entry + class_key = args.framelist_class_key + if class_key is None: + raise ValueError( + "The --framelist-class-key option is not specified.") + train_id_key = args.framelist_train_id_key + if train_id_key is None: + train_id_key = class_key.rpartition('.')[0] + '.trainId' + pulse_id_key = args.framelist_pulse_id_key + if pulse_id_key is None: + pulse_id_key = class_key.rpartition('.')[0] + '.pulseId' + + group = clist[source] + cls = group[class_key].ndarray() + tid = group[train_id_key].ndarray() + pid = group[pulse_id_key].ndarray() + return np.rec.fromarrays([tid, pid, cls], dtype=self.event_type) + + def from_h5(self, args): + import h5py + with h5py.File(args.framelist_file) as f: + entry = f[args.framelist_entry] + tid = entry[args.framelist_train_id_key][:] + pid = entry[args.framelist_pulse_id_key][:] + cls = entry[args.framelist_class_key][:] + return np.rec.fromarrays([tid, pid, cls], dtype=self.event_type) + + def from_csv(self, args): + import pandas as pd + ds = pd.read_csv(args.framelist_file) + return np.rec.fromarrays( + [ + ds[args.framelist_entry + args.framelist_train_id_key], + ds[args.framelist_entry + args.framelist_pulse_id_key], + ds[args.framelist_entry + args.framelist_class_key] + ], + dtype=self.event_type + ) - try: - group = clist[source] - cls = group[class_key].ndarray() - tid = group[train_id_key].ndarray() - pid = group[pulse_id_key].ndarray() - except (SourceNameError, KeyError): - self.log.error(f"No control source {source} or " - 
"required keys found in data.") - return - elif args.framelist_filetype == 'h5': - import h5py - try: - with h5py.File(filename) as f: - path = args.framelist_entry - tid = f[path + '/' + args.framelist_train_id_key][:] - pid = f[path + '/' + args.framelist_pulse_id_key][:] - cls = f[path + '/' + args.framelist_class_key][:] - except FileNotFoundError: - self.log.error(f"Directory '{filename}' not found.") - return - except KeyError: - self.log.error("Required keys are not found in data.") - return - elif args.framelist_filetype == 'csv': - try: - pass - except FileNotFoundError: - self.log.error(f"Directory '{filename}' not found.") - return + def __init__(self, data, args): + try: + if args.framelist_filetype == 'exdf': + evt = self.from_exdf(data, args) + elif args.framelist_filetype == 'h5': + evt = self.from_h5(args) + elif args.framelist_filetype == 'csv': + evt = self.from_csv(args) + else: + raise ValueError("Unknown filetype.") + except FileNotFoundError: + self.log.error(f"File '{args.framelist_file}' not found.") + return + except (SourceNameError, KeyError): + self.log.error("Required keys are not found in data.") + return + except Exception as e: + self.log.error(f"Exception when reading the candidate list: {e}.") + return # group by trains - trains, first, count = np.unique(tid, return_index=True, return_counts=True) + trains, first, count = np.unique(evt.tid, return_index=True, return_counts=True) # loop over sources det = data.select(args.framelist_det_sources, "image.*") @@ -529,8 +544,8 @@ class FrameList(ReductionMethod): if trains[i] == train_id: f0 = first[i] fN = f0 + count[i] - pulses = np.isin(pulse_ids, pid[f0:fN]) - mask[pulses] = cls[f0:fN] != 0 + pulses = np.isin(pulse_ids, evt.pid[f0:fN]) + mask[pulses] = evt.cls[f0:fN] != 0 num_missed_frames += abs(num_frames - count[i]) else: num_missed_trains += 1 -- GitLab From 5eea8ab0c6c105f7013d64df24354c12e70637ba Mon Sep 17 00:00:00 2001 From: Egor Sobolev <egor.sobolev@xfel.eu> Date: Mon, 10 
Mar 2025 16:03:51 +0100 Subject: [PATCH 3/8] Fix pulse alignment --- src/exdf/data_reduction/builtins.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/exdf/data_reduction/builtins.py b/src/exdf/data_reduction/builtins.py index ce92362..537f179 100644 --- a/src/exdf/data_reduction/builtins.py +++ b/src/exdf/data_reduction/builtins.py @@ -544,9 +544,10 @@ class FrameList(ReductionMethod): if trains[i] == train_id: f0 = first[i] fN = f0 + count[i] - pulses = np.isin(pulse_ids, evt.pid[f0:fN]) - mask[pulses] = evt.cls[f0:fN] != 0 - num_missed_frames += abs(num_frames - count[i]) + classified = np.isin(pulse_ids, evt.pid[f0:fN]) + available = np.isin(evt.pid[f0:fN], pulse_ids[classified]) + mask[classified] = evt.cls[f0:fN][available] != 0 + num_missed_frames += abs(num_frames - np.sum(classified)) else: num_missed_trains += 1 -- GitLab From 5b268025f14bd2f09646f4723e055911969930dd Mon Sep 17 00:00:00 2001 From: Egor Sobolev <egor.sobolev@xfel.eu> Date: Tue, 11 Mar 2025 09:36:35 +0100 Subject: [PATCH 4/8] Apply suggestions Co-authored-by: Oleksii Turkot <oleksii.turkot@xfel.eu> --- src/exdf/data_reduction/builtins.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/exdf/data_reduction/builtins.py b/src/exdf/data_reduction/builtins.py index 537f179..541563b 100644 --- a/src/exdf/data_reduction/builtins.py +++ b/src/exdf/data_reduction/builtins.py @@ -499,6 +499,9 @@ class FrameList(ReductionMethod): ) def __init__(self, data, args): + if not args.framelist_det_sources: + return + try: if args.framelist_filetype == 'exdf': evt = self.from_exdf(data, args) @@ -507,6 +510,9 @@ class FrameList(ReductionMethod): elif args.framelist_filetype == 'csv': evt = self.from_csv(args) else: + # This branch is for completeness here + # The value of the `--framelist-filetype` option + # should already be checked by `argparse` raise ValueError("Unknown filetype.") except FileNotFoundError: self.log.error(f"File 
'{args.framelist_file}' not found.") @@ -547,7 +553,7 @@ class FrameList(ReductionMethod): classified = np.isin(pulse_ids, evt.pid[f0:fN]) available = np.isin(evt.pid[f0:fN], pulse_ids[classified]) mask[classified] = evt.cls[f0:fN][available] != 0 - num_missed_frames += abs(num_frames - np.sum(classified)) + num_missed_frames += num_frames - np.sum(classified) else: num_missed_trains += 1 -- GitLab From 1fb1e9d8ea30df5c72918ee3c2ac0c1a098fdddd Mon Sep 17 00:00:00 2001 From: Egor Sobolev <egor.sobolev@xfel.eu> Date: Tue, 11 Mar 2025 14:21:47 +0100 Subject: [PATCH 5/8] Add option to choose which frame classes to drop --- src/exdf/data_reduction/builtins.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/exdf/data_reduction/builtins.py b/src/exdf/data_reduction/builtins.py index 541563b..b98afff 100644 --- a/src/exdf/data_reduction/builtins.py +++ b/src/exdf/data_reduction/builtins.py @@ -453,6 +453,11 @@ class FrameList(ReductionMethod): action='store_true', help='Key of frame class dataset', ) + group.add_argument( + '--framelist-drop-classes', + nargs='*', type=int, default=[0], + help='The list of class numbers to drop, default [0]', + ) def from_exdf(self, data, args): clist = ( @@ -526,6 +531,7 @@ class FrameList(ReductionMethod): # group by trains trains, first, count = np.unique(evt.tid, return_index=True, return_counts=True) + drop_classes = np.array(sorted(set(args.framelist_drop_classes))) # loop over sources det = data.select(args.framelist_det_sources, "image.*") @@ -552,7 +558,12 @@ class FrameList(ReductionMethod): fN = f0 + count[i] classified = np.isin(pulse_ids, evt.pid[f0:fN]) available = np.isin(evt.pid[f0:fN], pulse_ids[classified]) - mask[classified] = evt.cls[f0:fN][available] != 0 + mask[classified] = np.isin( + evt.cls[f0:fN][available], + drop_classes, + assume_unique=True, + invert=True, + ) num_missed_frames += num_frames - np.sum(classified) else: num_missed_trains += 1 -- GitLab From 
d2e0a17ffa9a004902d41f824aa40b55962d366a Mon Sep 17 00:00:00 2001 From: turkot <oleksii.turkot@xfel.eu> Date: Tue, 11 Mar 2025 15:37:41 +0100 Subject: [PATCH 6/8] Allow --framelist-train-id-key and --framelist-pulse-id-key to be optional also for h5 and csv file inputs. --- src/exdf/data_reduction/builtins.py | 47 +++++++++++++++++------------ 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/src/exdf/data_reduction/builtins.py b/src/exdf/data_reduction/builtins.py index b98afff..a5972f7 100644 --- a/src/exdf/data_reduction/builtins.py +++ b/src/exdf/data_reduction/builtins.py @@ -459,36 +459,42 @@ class FrameList(ReductionMethod): help='The list of class numbers to drop, default [0]', ) + def check_keys(self, args): + self.class_key = args.framelist_class_key + if self.class_key is None: + raise ValueError( + "The --framelist-class-key option is not specified.") + ckey_rpart = self.class_key.rpartition('.') + ckey_prefix = ckey_rpart[0] + ckey_rpart[1] + + self.train_id_key = args.framelist_train_id_key + if self.train_id_key is None: + self.train_id_key = ckey_prefix + 'trainId' + + self.pulse_id_key = args.framelist_pulse_id_key + if self.pulse_id_key is None: + self.pulse_id_key = ckey_prefix + 'pulseId' + + def from_exdf(self, data, args): clist = ( data if args.framelist_file is None else RunDirectory(args.framelist_file) ) source = args.framelist_entry - class_key = args.framelist_class_key - if class_key is None: - raise ValueError( - "The --framelist-class-key option is not specified.") - train_id_key = args.framelist_train_id_key - if train_id_key is None: - train_id_key = class_key.rpartition('.')[0] + '.trainId' - pulse_id_key = args.framelist_pulse_id_key - if pulse_id_key is None: - pulse_id_key = class_key.rpartition('.')[0] + '.pulseId' - group = clist[source] - cls = group[class_key].ndarray() - tid = group[train_id_key].ndarray() - pid = group[pulse_id_key].ndarray() + tid = group[self.train_id_key].ndarray() + pid = 
group[self.pulse_id_key].ndarray() + cls = group[self.class_key].ndarray() return np.rec.fromarrays([tid, pid, cls], dtype=self.event_type) def from_h5(self, args): import h5py with h5py.File(args.framelist_file) as f: entry = f[args.framelist_entry] - tid = entry[args.framelist_train_id_key][:] - pid = entry[args.framelist_pulse_id_key][:] - cls = entry[args.framelist_class_key][:] + tid = entry[self.train_id_key][:] + pid = entry[self.pulse_id_key][:] + cls = entry[self.class_key][:] return np.rec.fromarrays([tid, pid, cls], dtype=self.event_type) def from_csv(self, args): @@ -496,9 +502,9 @@ class FrameList(ReductionMethod): ds = pd.read_csv(args.framelist_file) return np.rec.fromarrays( [ - ds[args.framelist_entry + args.framelist_train_id_key], - ds[args.framelist_entry + args.framelist_pulse_id_key], - ds[args.framelist_entry + args.framelist_class_key] + ds[args.framelist_entry + self.train_id_key], + ds[args.framelist_entry + self.pulse_id_key], + ds[args.framelist_entry + self.class_key] ], dtype=self.event_type ) @@ -507,6 +513,7 @@ class FrameList(ReductionMethod): if not args.framelist_det_sources: return + self.check_keys(args) try: if args.framelist_filetype == 'exdf': evt = self.from_exdf(data, args) -- GitLab From f522c1c628313c7c76707f1ca4ab014dc8284581 Mon Sep 17 00:00:00 2001 From: Oleksii Turkot <oleksii.turkot@xfel.eu> Date: Tue, 11 Mar 2025 15:58:59 +0100 Subject: [PATCH 7/8] Disallow empty list in the --framelist-drop-classes option Co-authored-by: Egor Sobolev <egor.sobolev@xfel.eu> --- src/exdf/data_reduction/builtins.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/exdf/data_reduction/builtins.py b/src/exdf/data_reduction/builtins.py index a5972f7..20bfb51 100644 --- a/src/exdf/data_reduction/builtins.py +++ b/src/exdf/data_reduction/builtins.py @@ -455,7 +455,7 @@ class FrameList(ReductionMethod): ) group.add_argument( '--framelist-drop-classes', - nargs='*', type=int, default=[0], + nargs='+', type=int, 
default=[0], help='The list of class numbers to drop, default [0]', ) @@ -475,7 +475,6 @@ class FrameList(ReductionMethod): if self.pulse_id_key is None: self.pulse_id_key = ckey_prefix + 'pulseId' - def from_exdf(self, data, args): clist = ( data if args.framelist_file is None -- GitLab From ed8c063fbaa8a9ecff0cbf74fcf936e4a468e902 Mon Sep 17 00:00:00 2001 From: Egor Sobolev <egor.sobolev@xfel.eu> Date: Thu, 13 Mar 2025 13:03:40 +0100 Subject: [PATCH 8/8] Fix help strings of argument parser Co-authored-by: Oleksii Turkot <oleksii.turkot@xfel.eu> --- src/exdf/data_reduction/builtins.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/exdf/data_reduction/builtins.py b/src/exdf/data_reduction/builtins.py index 20bfb51..10d73f2 100644 --- a/src/exdf/data_reduction/builtins.py +++ b/src/exdf/data_reduction/builtins.py @@ -426,7 +426,7 @@ class FrameList(ReductionMethod): group.add_argument( '--framelist-filetype', action='store', type=str, choices=['exdf', 'h5', 'csv'], default='exdf', - help='Candidate list file, if none use data collection', + help='Type of the candidate list file', ) group.add_argument( '--framelist-entry', @@ -441,17 +441,17 @@ class FrameList(ReductionMethod): group.add_argument( '--framelist-train-id-key', action='store', type=str, - help='Key of frame class dataset', + help='Key of the train id dataset', ) group.add_argument( '--framelist-pulse-id-key', action='store', type=str, - help='Key of frame class dataset', + help='Key of the pulse id dataset', ) group.add_argument( '--framelist-keep-missed', action='store_true', - help='Key of frame class dataset', + help='Keep frames that are not covered by the candidate list file', ) group.add_argument( '--framelist-drop-classes', -- GitLab