From 4f3ef7f813f4155200450f311604625056d042f0 Mon Sep 17 00:00:00 2001
From: Thomas Kluyver <thomas@kluyver.me.uk>
Date: Mon, 17 Apr 2023 17:24:41 +0100
Subject: [PATCH] Don't mark jobs as finished just because they disappear from
 squeue output

A job can disappear from squeue output without actually having finished.
Instead of assuming it has, look up its state with slurm_job_status() and
only mark it as finished when that state is one of the terminal states in
STATES_FINISHED. While at it, log stdout and stderr when running squeue
itself fails and return None so the caller skips that round of updates,
and query the job database first so squeue is not run at all when there
are no unfinished jobs to check.
---
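A quick sketch of the check this change boils down to for a job that has
vanished from squeue output; STATES_FINISHED and the three-element return
value of slurm_job_status() are taken from the diff below, while
classify_missing_job() is only an illustrative name, not part of the patch.

    STATES_FINISHED = {  # https://slurm.schedmd.com/squeue.html#lbAG
        'BOOT_FAIL', 'CANCELLED', 'COMPLETED', 'DEADLINE', 'FAILED',
        'NODE_FAIL', 'OUT_OF_MEMORY', 'PREEMPTED', 'SPECIAL_EXIT', 'TIMEOUT',
    }

    def classify_missing_job(job_id, slurm_job_status):
        """Decide whether a job missing from squeue output really finished."""
        # Ask Slurm for the job's current state rather than assuming it is done.
        _, runtime, state = slurm_job_status(job_id)
        return state in STATES_FINISHED, runtime, state
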
 webservice/job_monitor.py | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/webservice/job_monitor.py b/webservice/job_monitor.py
index f86a6c771..207e4442d 100644
--- a/webservice/job_monitor.py
+++ b/webservice/job_monitor.py
@@ -23,6 +23,11 @@ except ImportError:
 
 log = logging.getLogger(__name__)
 
+STATES_FINISHED = {  # https://slurm.schedmd.com/squeue.html#lbAG
+    'BOOT_FAIL', 'CANCELLED', 'COMPLETED', 'DEADLINE', 'FAILED',
+    'NODE_FAIL', 'OUT_OF_MEMORY', 'PREEMPTED', 'SPECIAL_EXIT', 'TIMEOUT',
+}
+
 
 class NoOpProducer:
     """Fills in for Kafka producer object when setting that up fails"""
@@ -53,7 +58,7 @@ def slurm_status(filter_user=True):
     cmd = ["squeue"]
     if filter_user:
         cmd += ["--me"]
-    res = run(cmd, stdout=PIPE)
+    res = run(cmd, stdout=PIPE, stderr=PIPE)
     if res.returncode == 0:
         rlines = res.stdout.decode().split("\n")
         statii = {}
@@ -65,6 +70,10 @@ def slurm_status(filter_user=True):
             except ValueError:  # not enough values to unpack in split
                 pass
         return statii
+    else:
+        log.error("Running squeue failed. stdout: %r, stderr: %r",
+                  res.stdout.decode(), res.stderr.decode())
+        return None
 
 
 def slurm_job_status(jobid):
@@ -148,15 +157,19 @@ class JobsMonitor:
 
         Newly completed executions are present with an empty list.
         """
+        jobs_to_check = self.job_db.execute(
+            "SELECT job_id, exec_id FROM slurm_jobs WHERE finished = 0"
+        ).fetchall()
+        if not jobs_to_check:
+            log.debug("No unfinished jobs to check")
+            return {}
+
         statii = slurm_status()
         # Check that slurm is giving proper feedback
         if statii is None:
             return {}
         log.debug(f"SLURM info {statii}")
 
-        jobs_to_check = self.job_db.execute(
-            "SELECT job_id, exec_id FROM slurm_jobs WHERE finished = 0"
-        ).fetchall()
         ongoing_jobs_by_exn = {}
         updates = []
         for r in jobs_to_check:
@@ -166,13 +179,13 @@ class JobsMonitor:
             if str(r['job_id']) in statii:
                 # statii contains jobs which are still going (from squeue)
                 slstatus, runtime = statii[str(r['job_id'])]
-                finished = False
                 execn_ongoing_jobs.append(f"{slstatus}-{runtime}")
 
             else:
-                # These jobs have finished (successfully or otherwise)
+                # These jobs are no longer in squeue; look up their real state
                 _, runtime, slstatus = slurm_job_status(r['job_id'])
-                finished = True
+
+            finished = slstatus in STATES_FINISHED
 
             updates.append((finished, runtime, slstatus, r['job_id']))
 
-- 
GitLab