diff --git a/src/cal_tools/tools.py b/src/cal_tools/tools.py index 9a20cfe1dc5009ff96439b6d66fb7b1f8cc29d12..f8e316ccf55a90e5916811587b4eaa5305495cab 100644 --- a/src/cal_tools/tools.py +++ b/src/cal_tools/tools.py @@ -13,13 +13,13 @@ from time import sleep from typing import List, Optional, Tuple, Union from urllib.parse import urljoin -import dateutil.parser import h5py import ipykernel import numpy as np import requests import yaml import zmq +from extra_data import RunDirectory from iCalibrationDB import ConstantMetaData, Versions from metadata_client.metadata_client import MetadataClient from notebook.notebookapp import list_running_servers @@ -246,8 +246,7 @@ def get_notebook_name(): def get_run_info(proposal, run): - """ - Return information about run from the MDC + """Return information about run from the MDC :param proposal: proposal number :param run: run number @@ -265,66 +264,132 @@ def get_run_info(proposal, run): base_api_url=mdc_config['base-api-url'], ) - runs = mdc.get_proposal_runs(proposal_number=proposal, - run_number=run) - run_id = runs['data']['runs'][0]['id'] + mdc_response = mdc.get_proposal_runs( + proposal_number=proposal, run_number=run) + + if mdc_response["success"]: + return mdc_response + else: # empty dictionary for wrong proposal or run. + raise KeyError(mdc_response['app_info']) + + +def creation_date_metadata_client( + proposal: int, run: int +) -> datetime.datetime: + """Get run directory creation date from myMDC using the metadata client + method `get_proposal_runs`. + + :param proposal: proposal number e.g. 2656 or 900113. + :param run: run number. + :return datetime.datetime: Run creation date (timezone-aware, UTC). 
+ """ + + run_info = get_run_info(proposal, run) + return datetime.datetime.strptime( + run_info['data']['runs'][0]['begin_at'], + "%Y-%m-%dT%H:%M:%S.%f%z", + ).astimezone(datetime.timezone.utc) + + +def creation_date_file_metadata( + dc: RunDirectory +) -> Optional[datetime.datetime]: + """Get run directory creation date from + METADATA/creationDate of the oldest file using EXtra-data. + # TODO: update after DAQ stores the same date as myMDC. + + :param dc: EXtra-data DataCollection for the run directory. + :return Optional[datetime.datetime]: Run creation date. + """ + + md_dict = dc.run_metadata() + if md_dict["dataFormatVersion"] != "0.5": + oldest_file = sorted( + dc.files, key=lambda x: x.metadata()["creationDate"])[0] + return datetime.datetime.strptime( + oldest_file.metadata()["creationDate"], + "%Y%m%dT%H%M%S%z", + ) + else: + print("WARNING: input files contains old datasets. " + "No `METADATA/creationDate` to read.") + + +def creation_date_train_timestamp( + dc: RunDirectory +) -> Optional[datetime.datetime]: + """Get creation date from the timestamp of the first train. + + :param dc: EXtra-data DataCollection for the run directory. + :return Optional[datetime.datetime]: Run creation date. + """ - resp = mdc.get_run_by_id_api(run_id) - return resp.json() + creation_date = np.datetime64( + dc.select_trains(np.s_[0]).train_timestamps()[0], 'us').item() + if creation_date is None: + print("WARNING: input files contains old datasets without" + " trains timestamps.") + return None + return creation_date.replace(tzinfo=datetime.timezone.utc) def get_dir_creation_date(directory: Union[str, Path], run: int, verbosity: int = 0) -> datetime.datetime: - """ - Return run start time from MyDC. - If not available from MyMDC, retrieve the data from the dataset's metadata - in [directory]/[run] or, if the dataset is older than 2020, from the oldest - file's modified time. + """Get the directory creation date based on 3 different methods. 
+ + 1) Return run start time from myMDC. (creation_date_metadata_client) + 2) If myMDC connection is not set, + use the timestamp of the first train. (creation_date_train_timestamp) + 3) If data files are older than 2020 (dataformatversion == "0.5"), + get the date from the oldest file's modified time. If the data is not available from either source, - this function will raise a ValueError. + this function will raise a FileNotFoundError. - :param directory: path to directory which contains runs - :param run: run number + :param directory: path to a directory which contains runs + (e.g. /gpfs/exfel/data/exp/callab/202031/p900113/raw/). + :param run: run number. :param verbosity: Level of verbosity (0 - silent) - :return: (datetime) modification time + :return: creation datetime for the directory. """ directory = Path(directory) + proposal = int(directory.parent.name[1:]) + directory = directory / 'r{:04d}'.format(run) + + # Validate the availability of the input folder. + # And show a clear error message, if it was not found. + try: + dc = RunDirectory(directory) + except FileNotFoundError as e: + raise FileNotFoundError( + "- Failed to read creation time, wrong input folder", + directory) from e try: - run_info = get_run_info(proposal, run) - return dateutil.parser.parse(run_info['begin_at']) + return creation_date_metadata_client(proposal, run) except Exception as e: if verbosity > 0: print(e) - directory = directory / 'r{:04d}'.format(run) - - # Loop a number of times to catch stale file handle errors, due to - # migration or gpfs sync. - ntries = 100 - while ntries > 0: - try: - rfiles = list(directory.glob('*.h5')) - # get creation time for oldest file, - # as creation time between run files - # should differ by a few seconds only. 
- rfile = sorted(rfiles, key=path.getmtime)[0] - with h5py.File(rfile, 'r') as fin: - cdate = fin['METADATA/creationDate'][0].decode() - cdate = datetime.datetime.strptime( - cdate, - "%Y%m%dT%H%M%SZ").replace(tzinfo=datetime.timezone.utc) - return cdate - except (IndexError, IOError, ValueError): - ntries -= 1 - except KeyError: # The files are here, but it's an older dataset - return datetime.datetime.fromtimestamp(rfile.stat().st_mtime) - - msg = 'Could not get the creation time from the directory' - raise ValueError(msg, directory) + cdate = creation_date_train_timestamp(dc) + + if cdate is not None: + # Exposing the method used for reading the creation_date. + print("Reading creation_date from input files metadata" + " `METADATA/creationDate`") + else: # It's an older dataset. + print("Reading creation_date from last modification data " + "for the oldest input file.") + cdate = datetime.datetime.fromtimestamp( + sorted( + [Path(f.filename) for f in dc.files], + key=path.getmtime + )[0].stat().st_mtime, + tz=datetime.timezone.utc, + ) + return cdate def _init_metadata(constant: 'iCalibrationDB.calibration_constant', diff --git a/tests/conftest.py b/tests/conftest.py index a7cfeb0f1e81716dc083aeed46d6a3e45bb6179b..fd71bd602e4b7921a05c95639f2ddce0c797f558 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,6 +13,13 @@ def pytest_addoption(parser): help="Skips tests marked as requiring GPFS access", ) + parser.addoption( + "--no-mdc", + action="store_true", + default=True, + help="Skips tests marked as requiring myMDC access", + ) + def pytest_configure(config): config.addinivalue_line( @@ -25,6 +32,11 @@ def pytest_configure(config): "requires_caldb(): marks skips for tests that require calDB access", ) + config.addinivalue_line( + "markers", + "requires_mdc(): marks skips for tests that require myMDC access", + ) + @lru_cache() def server_reachable(server: str = "max-exfl017"): @@ -46,3 +58,9 @@ def pytest_runtest_setup(item): if 
list(item.iter_markers(name="requires_caldb")) and not server_reachable(): pytest.skip("caldb not available") + + if ( + list(item.iter_markers(name="requires_mdc")) and + item.config.getoption("--no-mdc") + ): + pytest.skip("myMDC not available") diff --git a/tests/test_cal_tools.py b/tests/test_cal_tools.py index 102f613b7b9f3b9bdb032f5ebe5ab218a8d1e098..6e7f44b279dbf59788acf1e1f73fd0e21e1cd7e9 100644 --- a/tests/test_cal_tools.py +++ b/tests/test_cal_tools.py @@ -5,11 +5,15 @@ from unittest.mock import patch import numpy as np import pytest import zmq +from extra_data import open_run from iCalibrationDB import Conditions, ConstantMetaData, Constants from cal_tools.agipdlib import AgipdCorrections, CellRange from cal_tools.plotting import show_processed_modules from cal_tools.tools import ( + creation_date_file_metadata, + creation_date_metadata_client, + creation_date_train_timestamp, get_dir_creation_date, get_from_db, get_pdu_from_db, @@ -31,6 +35,7 @@ WRONG_AGIPD_MODULE = "AGIPD_**" CAL_DB_INTERFACE = "tcp://max-exfl017:8020" WRONG_CAL_DB_INTERFACE = "tcp://max-exfl017:0000" +PROPOSAL = 900113 @pytest.fixture def _agipd_const_cond(): @@ -57,20 +62,63 @@ def test_show_processed_modules(): @pytest.mark.requires_gpfs def test_dir_creation_date(): + """This test is based on not connecting to MDC and failing to use + `creation_date_metadata_client()` + """ folder = '/gpfs/exfel/exp/CALLAB/202031/p900113/raw' date = get_dir_creation_date(folder, 9983) assert isinstance(date, datetime) - assert str(date) == '2020-09-23 13:30:50+00:00' + assert str(date) == '2020-09-23 13:30:45.821262+00:00' - with pytest.raises(ValueError) as e: + # The following data predates the addition of creation_time in metadata + date = get_dir_creation_date(folder, 9999) + assert isinstance(date, datetime) + assert str(date) == '2019-12-16 07:52:25.196603+00:00' + + +@pytest.mark.requires_gpfs +def test_raise_dir_creation_date(): + folder = '/gpfs/exfel/exp/CALLAB/202031/p900113/raw' + + 
with pytest.raises(FileNotFoundError) as e: get_dir_creation_date(folder, 4) assert e.value.args[1] == Path(folder) / 'r0004' - # The following data predates the addition of creation_time in metadata - date = get_dir_creation_date(folder, 9999) + +@pytest.mark.requires_mdc +def test_creation_date_metadata_client(): + + date = creation_date_metadata_client(PROPOSAL, 9983) + assert isinstance(date, datetime) + assert str(date) == '2020-09-23 13:30:00+00:00' + + +@pytest.mark.requires_gpfs +def test_creation_date_file_metadata(): + + date = creation_date_file_metadata(open_run(PROPOSAL, 9983)) assert isinstance(date, datetime) - assert str(date) == '2019-12-16 08:52:25.196603' + assert str(date) == '2020-09-23 13:30:50+00:00' + + # Old run without METADATA/CreationDate + date = creation_date_file_metadata(open_run(PROPOSAL, 9999)) + + assert date is None + + +@pytest.mark.requires_gpfs +def test_creation_date_train_timestamp(): + + date = creation_date_train_timestamp(open_run(PROPOSAL, 9983)) + + assert isinstance(date, datetime) + assert str(date) == '2020-09-23 13:30:45.821262+00:00' + + # Old run without trainId timestamps + date = creation_date_train_timestamp(open_run(PROPOSAL, 9999)) + + assert date is None def _call_get_from_db( @@ -132,7 +180,7 @@ def _call_send_to_db( return metadata -# TODO add a marker for accessing zmq end_point +@pytest.mark.requires_caldb @pytest.mark.requires_gpfs def test_get_from_db_load_data(_agipd_const_cond): """ Test retrieving calibration constants with get_from_db @@ -169,7 +217,7 @@ def test_get_from_db_load_data(_agipd_const_cond): assert isinstance(md, ConstantMetaData) -# TODO add a marker for accessing zmq end_point +@pytest.mark.requires_caldb @pytest.mark.requires_gpfs def test_raise_get_from_db(_agipd_const_cond): """ Test error raised scenarios for get_from_db:"""