From 9c4e3a9ff2b3f04e15b82e1c3fb20a7f502c1789 Mon Sep 17 00:00:00 2001
From: Thomas Kluyver <thomas@kluyver.me.uk>
Date: Fri, 7 Jul 2023 15:58:34 +0100
Subject: [PATCH] Fix checking Slurm job states

---
 webservice/job_monitor.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/webservice/job_monitor.py b/webservice/job_monitor.py
index 8f911f7c5..cde702c69 100644
--- a/webservice/job_monitor.py
+++ b/webservice/job_monitor.py
@@ -27,6 +27,10 @@ STATES_FINISHED = { # https://slurm.schedmd.com/squeue.html#lbAG
     'BOOT_FAIL', 'CANCELLED', 'COMPLETED', 'DEADLINE', 'FAILED',
     'OUT_OF_MEMORY', 'SPECIAL_EXIT', 'TIMEOUT',
 }
+STATE_ABBREVS = {
+    'PENDING': 'PD',
+    'RUNNING': 'R',
+}
 
 
 class NoOpProducer:
@@ -55,7 +59,7 @@ def slurm_status(filter_user=True):
     :return: a dictionary indexed by slurm jobid and containing a tuple
              of (status, run time) as values.
     """
-    cmd = ["squeue", "--states=all"]
+    cmd = ["squeue", "--states=all", "--format=%i %T %M"]
     if filter_user:
         cmd += ["--me"]
     res = run(cmd, stdout=PIPE, stderr=PIPE)
@@ -64,7 +68,7 @@ def slurm_status(filter_user=True):
     statii = {}
     for r in rlines[1:]:
         try:
-            jobid, _, _, _, status, runtime, _, _ = r.split()
+            jobid, status, runtime = r.split()
             jobid = jobid.strip()
             statii[jobid] = status, runtime
         except ValueError: # not enough values to unpack in split
@@ -179,13 +183,14 @@ class JobsMonitor:
                 if str(r['job_id']) in statii:
                     # statii contains jobs which are still going (from squeue)
                     slstatus, runtime = statii[str(r['job_id'])]
-                    execn_ongoing_jobs.append(f"{slstatus}-{runtime}")
-
                 else:
                     # These jobs have finished (successfully or otherwise)
                     _, runtime, slstatus = slurm_job_status(r['job_id'])
 
                 finished = slstatus in STATES_FINISHED
+                if not finished:
+                    short_state = STATE_ABBREVS.get(slstatus, slstatus)
+                    execn_ongoing_jobs.append(f"{short_state}-{runtime}")
 
                 updates.append((finished, runtime, slstatus, r['job_id']))
 
-- 
GitLab