Skip to content
Snippets Groups Projects
Commit 9c4e3a9f authored by Thomas Kluyver
Browse files

Fix checking Slurm job states

parent 45acad28
No related branches found
No related tags found
1 merge request: !875 "Use status AW in myMdC if correction failed for some detectors in a run"
......@@ -27,6 +27,10 @@ STATES_FINISHED = { # https://slurm.schedmd.com/squeue.html#lbAG
'BOOT_FAIL', 'CANCELLED', 'COMPLETED', 'DEADLINE', 'FAILED',
'OUT_OF_MEMORY', 'SPECIAL_EXIT', 'TIMEOUT',
}
# Short codes for long Slurm job state names. Lookups use
# STATE_ABBREVS.get(state, state), so a state without an entry here is
# reported under its full name.
STATE_ABBREVS = dict(
    PENDING='PD',
    RUNNING='R',
)
class NoOpProducer:
......@@ -55,7 +59,7 @@ def slurm_status(filter_user=True):
:return: a dictionary indexed by slurm jobid and containing a tuple
of (status, run time) as values.
"""
cmd = ["squeue", "--states=all"]
cmd = ["squeue", "--states=all", "--format=%i %T %M"]
if filter_user:
cmd += ["--me"]
res = run(cmd, stdout=PIPE, stderr=PIPE)
......@@ -64,7 +68,7 @@ def slurm_status(filter_user=True):
statii = {}
for r in rlines[1:]:
try:
jobid, _, _, _, status, runtime, _, _ = r.split()
jobid, status, runtime = r.split()
jobid = jobid.strip()
statii[jobid] = status, runtime
except ValueError: # not enough values to unpack in split
......@@ -179,13 +183,14 @@ class JobsMonitor:
if str(r['job_id']) in statii:
# statii contains jobs which are still going (from squeue)
slstatus, runtime = statii[str(r['job_id'])]
execn_ongoing_jobs.append(f"{slstatus}-{runtime}")
else:
# These jobs have finished (successfully or otherwise)
_, runtime, slstatus = slurm_job_status(r['job_id'])
finished = slstatus in STATES_FINISHED
if not finished:
short_state = STATE_ABBREVS.get(slstatus, slstatus)
execn_ongoing_jobs.append(f"{short_state}-{runtime}")
updates.append((finished, runtime, slstatus, r['job_id']))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment