diff --git a/webservice/job_monitor.py b/webservice/job_monitor.py index f86a6c7711eb7f82f9a2ea8b24611e00affb2ac3..207e4442d37a44d2c0be3d1ff85e02fac68b1199 100644 --- a/webservice/job_monitor.py +++ b/webservice/job_monitor.py @@ -23,6 +23,11 @@ except ImportError: log = logging.getLogger(__name__) +STATES_FINISHED = { # https://slurm.schedmd.com/squeue.html#lbAG + 'BOOT_FAIL', 'CANCELLED', 'COMPLETED', 'DEADLINE', 'FAILED', + 'NODE_FAIL', 'OUT_OF_MEMORY', 'PREEMPTED', 'SPECIAL_EXIT', 'TIMEOUT', +} + class NoOpProducer: """Fills in for Kafka producer object when setting that up fails""" @@ -53,7 +58,7 @@ def slurm_status(filter_user=True): cmd = ["squeue"] if filter_user: cmd += ["--me"] - res = run(cmd, stdout=PIPE) + res = run(cmd, stdout=PIPE, stderr=PIPE) if res.returncode == 0: rlines = res.stdout.decode().split("\n") statii = {} @@ -65,6 +70,10 @@ def slurm_status(filter_user=True): except ValueError: # not enough values to unpack in split pass return statii + else: + log.error("Running squeue failed. stdout: %r, stderr: %r", + res.stdout.decode(), res.stderr.decode()) + return None def slurm_job_status(jobid): @@ -148,15 +157,19 @@ class JobsMonitor: Newly completed executions are present with an empty list. """ + jobs_to_check = self.job_db.execute( + "SELECT job_id, exec_id FROM slurm_jobs WHERE finished = 0" + ).fetchall() + if not jobs_to_check: + log.debug("No unfinished jobs to check") + return {} + statii = slurm_status() # Check that slurm is giving proper feedback if statii is None: return {} log.debug(f"SLURM info {statii}") - jobs_to_check = self.job_db.execute( - "SELECT job_id, exec_id FROM slurm_jobs WHERE finished = 0" - ).fetchall() ongoing_jobs_by_exn = {} updates = [] for r in jobs_to_check: @@ -166,13 +179,13 @@ class JobsMonitor: if str(r['job_id']) in statii: # statii contains jobs which are still going (from squeue) slstatus, runtime = statii[str(r['job_id'])] - finished = False execn_ongoing_jobs.append(f"{slstatus}-{runtime}") else: # These jobs have finished (successfully or otherwise) _, runtime, slstatus = slurm_job_status(r['job_id']) - finished = True + + finished = slstatus in STATES_FINISHED updates.append((finished, runtime, slstatus, r['job_id']))