Skip to content
Snippets Groups Projects
Commit 465194ca authored by Philipp Schmidt's avatar Philipp Schmidt
Browse files

Record and warn when jobs fail (almost) instantly

parent 153d7cb7
No related branches found
No related tags found
1 merge request: !1134 [job_monitor] Watch and warn of jobs failing (almost) instantly by host
......@@ -34,6 +34,7 @@ STATES_FINISHED = { # https://slurm.schedmd.com/squeue.html#lbAG
'OUT_OF_MEMORY', 'SPECIAL_EXIT', 'TIMEOUT',
'NA', # Unknown (used internally if job ID missing)
}
STATES_FAILED = {'FAILED'}
STATE_ABBREVS = {
'PENDING': 'PD',
'RUNNING': 'R',
......@@ -144,6 +145,7 @@ class JobsMonitor:
self.kafka_prod = init_kafka_producer(config)
self.kafka_topic = config['kafka']['topic']
self.time_interval = int(config['web-service']['job-update-interval'])
self.instant_fails_by_host = defauldict(ExpiringEvents)
# Context-manager entry hook: returns this same object, so the name bound by
# `with ... as x` is the instance the `with` statement was opened on.
def __enter__(self):
return self
......@@ -234,6 +236,12 @@ class JobsMonitor:
if not finished:
short_state = STATE_ABBREVS.get(slstatus, slstatus)
execn_ongoing_jobs.append(f"{short_state}-{runtime}")
elif slstatus in STATES_FAILED and elapsed_to_seconds(runtime) < 2:
# Specific branch to catch potentially broken nodes.
num_fails = self.instant_fails_by_host(hostname).add()
log.warning(f"Job {r['job_id']} failed instantly on "
f"{hostname}, {num_fails} on this host within "
f"the last 10 minutes")
updates.append((finished, runtime, slstatus, hostname, r['job_id']))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment