Skip to content
Snippets Groups Projects

Sanitize cal_tools.tools.get_creation_date

Merged Cyril Danilevski requested to merge clean/get_dir_creation_date into master
1 unresolved thread
Files
2
+ 34
22
@@ -2,8 +2,9 @@ from collections import OrderedDict
@@ -2,8 +2,9 @@ from collections import OrderedDict
import datetime
import datetime
from glob import glob
from glob import glob
import json
import json
from os import environ, listdir, path, stat
from os import environ, listdir, path
from os.path import isfile, splitext
from os.path import isfile
 
from pathlib import Path
from queue import Queue
from queue import Queue
import re
import re
from time import sleep
from time import sleep
@@ -11,6 +12,7 @@ from typing import Optional
@@ -11,6 +12,7 @@ from typing import Optional
from urllib.parse import urljoin
from urllib.parse import urljoin
import dateutil.parser
import dateutil.parser
 
import h5py
import ipykernel
import ipykernel
from metadata_client.metadata_client import MetadataClient
from metadata_client.metadata_client import MetadataClient
from notebook.notebookapp import list_running_servers
from notebook.notebookapp import list_running_servers
@@ -229,47 +231,57 @@ def get_run_info(proposal, run):
@@ -229,47 +231,57 @@ def get_run_info(proposal, run):
def get_dir_creation_date(directory: str, run: int,
def get_dir_creation_date(directory: str, run: int,
tsdir: Optional[bool] = False,
verbosity: Optional[int] = 0) -> datetime.datetime:
verbosity: Optional[int] = 0):
"""
"""
Return run starting time from the MDC.
Return run start time from MyMDC.
If that fails, return the modification time of the oldest .h5 file.
If not available from MyMDC, retrieve the data from the dataset's metadata
in [directory]/[run]04.
in [directory]/[run] or, if the dataset is older than 2020, from the
 
directory's creation time.
 
 
If the data is not available from either source, this function will raise a
 
ValueError.
:param directory: path to directory which contains runs
:param directory: path to directory which contains runs
:param run: run number
:param run: run number
:param tsdir: to get modification time of [directory]/[run]04.
:param verbosity: Level of verbosity (0 - silent)
:param verbosity: Level of verbosity (0 - silent)
:return: (datetime) modification time
:return: (datetime) modification time
 
"""
"""
 
directory = Path(directory)
 
proposal = int(directory.parent.name[1:])
try:
try:
path_list = list(filter(None, directory.strip('/').split('/')))
proposal = int(path_list[-2][1:])
run_info = get_run_info(proposal, run)
run_info = get_run_info(proposal, run)
    • this uses MetadataClient. Do we also have begin_at information somewhere in the metadata of the run? We could then simply use extra_data

      • we have begin_at in the metadata saved at myMDC, yes.

        But this part only works in production as communication with MetadataClient needs the secrets written in the webservice yaml configurations.

        We could then simply use extra_data

        How?

        Edited by Karim Ahmed
      •  from extra_data import open_run
         r = open_run(proposal, run)
         r.info() # has some info about number of trains, start, stop time etc, sources, etc..```
      • the new HDF5 files (since ~February) have a few more metadata, they maybe contain all required information?

        example:

        [tmichela@max-exfl001]/gpfs/exfel/exp/SPB/202001/p002450/raw/r0122% h5glance RAW-R0122-DA04-S00000.h5 METADATA
        RAW-R0122-DA04-S00000.h5/METADATA (3 attributes)
        ├creationDate	[ASCII str: 1]
        ├daqLibrary	[ASCII str: 1]
        ├dataFormatVersion	[ASCII str: 1]
        ├dataSources
        │ ├dataSourceId	[ASCII str: 3]
        │ ├deviceId	[ASCII str: 3]
        │ └root	[ASCII str: 3]
        ├karaboFramework	[ASCII str: 1]
        ├proposalNumber	[uint32: 1]
        ├runNumber	[uint32: 1]
        ├sequenceNumber	[uint32: 1]
        └updateDate	[ASCII str: 1]
      • so this would have the same creationDate as MetadataClient, or does it just show the last time the file was modified?

        Edited by Karim Ahmed
      • the new HDF5 files (since ~February) have a few more metadata, they maybe contain all required information?

        Oh, I didn't know that. That is very useful.

        Edited by Karim Ahmed
      • I'd expect the creationDate field to match the information from the metadata, but I'd check before :D

      • Is there an extra_data way to retrieve it, or is using h5py directly okay?

      • EXtra-data doesn't read these yet (it's on the todo list :) so h5py would be the way to go at the moment

      • Ok, I'm cool to use the creationDate from the files, but there will still be a loop, as not all files are created at once:

        In [1]: import pathlib, h5py
        
        In [2]: p = pathlib.Path('/gpfs/exfel/d/raw/DETLAB/202031/p900172/r0072')
        
        In [3]: for f in p.iterdir():
           ...:     print(f)
           ...: 
        RAW-R0072-AGIPD07-S00000.h5
        RAW-R0072-AGIPD04-S00001.h5
        RAW-R0072-AGIPD05-S00001.h5
        RAW-R0072-AGIPD01-S00000.h5
        RAW-R0072-AGIPD05-S00000.h5
        RAW-R0072-AGIPD02-S00000.h5
        RAW-R0072-AGIPD06-S00000.h5
        RAW-R0072-AGIPD00-S00000.h5
        RAW-R0072-AGIPD04-S00000.h5
        RAW-R0072-AGIPD03-S00000.h5
        RAW-R0072-DA01-S00000.h5
        RAW-R0072-AGIPD00-S00001.h5
        RAW-R0072-AGIPD03-S00001.h5
        RAW-R0072-AGIPD07-S00001.h5
        RAW-R0072-AGIPD01-S00001.h5
        RAW-R0072-AGIPD06-S00001.h5
        RAW-R0072-AGIPD02-S00001.h5
        
        In [4]: dates = []
           ...: for f in p.iterdir():
           ...:     with h5py.File(f) as fin:
           ...:         dates.append(fin['METADATA/creationDate'][0])
        
        In [5]: dates
        Out[5]: 
        [b'20200904T101712Z',
         b'20200904T101737Z',
         b'20200904T101737Z',
         b'20200904T101712Z',
         b'20200904T101712Z',
         b'20200904T101712Z',
         b'20200904T101712Z',
         b'20200904T101712Z',
         b'20200904T101712Z',
         b'20200904T101712Z',
         b'20200904T101715Z',
         b'20200904T101737Z',
         b'20200904T101738Z',
         b'20200904T101737Z',
         b'20200904T101737Z',
         b'20200904T101737Z',
         b'20200904T101738Z']
      • Just to link things up, here's the issue for this in EXtra-data: https://github.com/European-XFEL/EXtra-data/issues/27

        It's also partly stuck on a similar question: how do we efficiently present metadata in 200 separate files as metadata for a run?

      • Please register or sign in to reply
Please register or sign in to reply
return dateutil.parser.parse(run_info['begin_at'])
return dateutil.parser.parse(run_info['begin_at'])
except Exception as e:
except Exception as e:
if verbosity > 0:
if verbosity > 0:
print(e)
print(e)
 
directory = directory / 'r{:04d}'.format(run)
 
 
# Loop a number of times to catch stale file handle errors, due to
 
# migration or gpfs sync.
ntries = 100
ntries = 100
while ntries > 0:
while ntries > 0:
try:
try:
if tsdir:
dates = []
creation_time = stat("{}/r{:04d}".format(directory, run)).st_mtime
for f in directory.glob('*.h5'):
else:
with h5py.File(f, 'r') as fin:
rfiles = glob("{}/r{:04d}/*.h5".format(directory, run))
cdate = fin['METADATA/creationDate'][0].decode()
rfiles.sort(key=path.getmtime)
cdate = datetime.datetime.strptime(cdate, "%Y%m%dT%H%M%SZ")
creation_time = stat(rfiles[0]).st_mtime
dates.append(cdate)
return min(dates)
creation_time = datetime.datetime.fromtimestamp(creation_time)
except (IOError, ValueError):
return creation_time
except: # catch stale file handle errors etc and try again
ntries -= 1
ntries -= 1
 
except KeyError: # The files are here, but it's an older dataset
 
return datetime.datetime.fromtimestamp(directory.stat().st_ctime)
 
 
msg = 'Could not get the creation time from the directory'
 
raise ValueError(msg, directory)
def save_const_to_h5(device, constant, condition, data,
def save_const_to_h5(device, constant, condition, data,
file_loc, creation_time, out_folder):
file_loc, creation_time, out_folder):
"""
"""
Save constant in h5 file with its metadata
Save constant in h5 file with its metadata
(e.g. db_module, condition, creation_time)
(e.g. db_module, condition, creation_time)
@@ -280,7 +292,7 @@ def save_const_to_h5(device, constant, condition, data,
@@ -280,7 +292,7 @@ def save_const_to_h5(device, constant, condition, data,
:type constant: iCalibrationDB.know_constants object
:type constant: iCalibrationDB.know_constants object
:param condition: Calibration condition
:param condition: Calibration condition
:type condition: iCalibrationDB.know_detector_conditions object
:type condition: iCalibrationDB.know_detector_conditions object
:param data: Constant data to save
:param data: Constant data to save
:type data: ndarray
:type data: ndarray
:param file_loc: Location of raw data "proposal:{} runs:{} {} {}"
:param file_loc: Location of raw data "proposal:{} runs:{} {} {}"
:type file_loc: str
:type file_loc: str
Loading