From 9c4e3a9ff2b3f04e15b82e1c3fb20a7f502c1789 Mon Sep 17 00:00:00 2001
From: Thomas Kluyver <thomas@kluyver.me.uk>
Date: Fri, 7 Jul 2023 15:58:34 +0100
Subject: [PATCH] Fix checking Slurm job states

---
 webservice/job_monitor.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/webservice/job_monitor.py b/webservice/job_monitor.py
index 8f911f7c5..cde702c69 100644
--- a/webservice/job_monitor.py
+++ b/webservice/job_monitor.py
@@ -27,6 +27,10 @@ STATES_FINISHED = {  # https://slurm.schedmd.com/squeue.html#lbAG
     'BOOT_FAIL',  'CANCELLED', 'COMPLETED',  'DEADLINE', 'FAILED',
     'OUT_OF_MEMORY', 'SPECIAL_EXIT', 'TIMEOUT',
 }
+STATE_ABBREVS = {  # full Slurm state name -> short display code
+    'PENDING': 'PD',
+    'RUNNING': 'R',
+}  # states not listed here are displayed under their full name
 
 
 class NoOpProducer:
@@ -55,7 +59,7 @@ def slurm_status(filter_user=True):
     :return: a dictionary indexed by slurm jobid and containing a tuple
              of (status, run time) as values.
     """
-    cmd = ["squeue", "--states=all"]
+    cmd = ["squeue", "--states=all", "--format=%i %T %M"]
     if filter_user:
         cmd += ["--me"]
     res = run(cmd, stdout=PIPE, stderr=PIPE)
@@ -64,7 +68,7 @@ def slurm_status(filter_user=True):
         statii = {}
         for r in rlines[1:]:
             try:
-                jobid, _, _, _, status, runtime, _, _ = r.split()
+                jobid, status, runtime = r.split()
                 jobid = jobid.strip()
                 statii[jobid] = status, runtime
             except ValueError:  # not enough values to unpack in split
@@ -179,13 +183,14 @@ class JobsMonitor:
             if str(r['job_id']) in statii:
                 # statii contains jobs which are still going (from squeue)
                 slstatus, runtime = statii[str(r['job_id'])]
-                execn_ongoing_jobs.append(f"{slstatus}-{runtime}")
-
             else:
                 # These jobs have finished (successfully or otherwise)
                 _, runtime, slstatus = slurm_job_status(r['job_id'])
 
             finished = slstatus in STATES_FINISHED
+            if not finished:
+                short_state = STATE_ABBREVS.get(slstatus, slstatus)
+                execn_ongoing_jobs.append(f"{short_state}-{runtime}")
 
             updates.append((finished, runtime, slstatus, r['job_id']))
 
-- 
GitLab