Unverified commit 6993dbb6, authored by Martin Cech, committed by GitHub
Browse files

Merge pull request #6705 from natefoo/slurm-cgroup-enospc

[18.05] Catch additional permutations of Slurm cgroup creation warnings
parents f513594c 3d508a01
......@@ -29,6 +29,13 @@ SLURM_MEMORY_LIMIT_EXCEEDED_PARTIAL_WARNINGS = [': Exceeded job memory limit at
': Exceeded step memory limit at some point.']
# When a job's error file is larger than this many bytes, SlurmJobRunner seeks
# to this distance before the end of the file (os.SEEK_END) rather than
# starting from the top.
SLURM_MEMORY_LIMIT_SCAN_SIZE = 16 * 1024 * 1024 # 16MB
# slurmstepd error lines that are treated as spurious warnings when they appear
# at the top of a job's error file. Each pattern is anchored with `$` and is
# applied with `.match()`, so it must match a line from its first character.
SLURM_UNABLE_TO_ADD_TASK_TO_MEMORY_CG_MSG_RE = re.compile(r"""slurmstepd: error: task/cgroup: unable to add task\[pid=\d+\] to memory cg '\(null\)'$""")
SLURM_UNABLE_TO_CREATE_CGROUP_MSG_RE = re.compile(r"""slurmstepd: error: xcgroup_instantiate: unable to create cgroup '[^']+' : No space left on device$""")
# NOTE(review): the "instanciate" spelling presumably mirrors the message text
# emitted by slurmstepd itself -- confirm against Slurm before "correcting" it.
SLURM_UNABLE_TO_INSTANCIATE_JOB_MSG_RE = re.compile(r"""slurmstepd: error: jobacct_gather/cgroup: unable to instanciate job \d+ memory cgroup$""")
# Patterns tried, in this order, against the leading lines of the error file by
# _remove_spurious_top_lines(); a leading line matching any of them is removed.
SLURM_TOP_WARNING_RES = (
SLURM_UNABLE_TO_ADD_TASK_TO_MEMORY_CG_MSG_RE,
SLURM_UNABLE_TO_CREATE_CGROUP_MSG_RE,
SLURM_UNABLE_TO_INSTANCIATE_JOB_MSG_RE
)
# These messages are returned to the user
OUT_OF_MEMORY_MSG = 'This job was terminated because it used more memory than it was allocated.'
......@@ -154,13 +161,7 @@ class SlurmJobRunner(DRMAAJobRunner):
return
if drmaa_state == self.drmaa_job_states.DONE:
with open(ajs.error_file, 'r') as rfh:
first_line = rfh.readline()
if SLURM_UNABLE_TO_ADD_TASK_TO_MEMORY_CG_MSG_RE.match(first_line):
with tempfile.NamedTemporaryFile('w', delete=False) as wfh:
shutil.copyfileobj(rfh, wfh)
wf_name = wfh.name
shutil.move(wf_name, ajs.error_file)
log.debug('(%s/%s) Job completed, removing SLURM spurious warning: "%s"', ajs.job_wrapper.get_id_tag(), ajs.job_id, first_line)
_remove_spurious_top_lines(rfh, ajs)
with open(ajs.error_file, 'r+') as f:
if os.path.getsize(ajs.error_file) > SLURM_MEMORY_LIMIT_SCAN_SIZE:
f.seek(-SLURM_MEMORY_LIMIT_SCAN_SIZE, os.SEEK_END)
......@@ -201,3 +202,32 @@ class SlurmJobRunner(DRMAAJobRunner):
log.exception('Error reading end of %s:', efile_path)
return False
def _remove_spurious_top_lines(rfh, ajs, maxlines=2):
    """Strip known spurious Slurm warnings from the top of a job's error file.

    Up to ``maxlines`` lines are read from ``rfh`` (starting at its current
    position) and compared against ``SLURM_TOP_WARNING_RES``. Scanning stops
    at the first line that matches none of the patterns. If at least one
    line matched, the rest of ``rfh`` is copied to a temporary file, which is
    then moved over ``ajs.error_file``, and every removed line is logged at
    debug level. If nothing matched, the error file is left untouched.

    :param rfh: readable text file object for the job's error file.
    :param ajs: job state object; ``error_file``, ``job_id`` and
        ``job_wrapper.get_id_tag()`` are used.
    :param maxlines: maximum number of leading lines to examine.
    :returns: None
    """
    spurious = []
    first_good = None
    for _ in range(maxlines):
        candidate = rfh.readline()
        log.trace('checking line: %s', candidate)
        if not any(regex.match(candidate) for regex in SLURM_TOP_WARNING_RES):
            # This line was already consumed from rfh. It only has to be
            # written back when earlier lines are being dropped, because
            # otherwise the file is not rewritten at all.
            if spurious:
                first_good = candidate
            break
        spurious.append(candidate)

    if not spurious:
        return

    # Rebuild the file without the spurious lines: the consumed good line
    # (if any) first, then everything rfh has not yet yielded.
    with tempfile.NamedTemporaryFile('w', delete=False) as wfh:
        if first_good is not None:
            wfh.write(first_good)
        shutil.copyfileobj(rfh, wfh)
    shutil.move(wfh.name, ajs.error_file)

    for removed in spurious:
        log.debug('(%s/%s) Job completed, removing SLURM spurious warning: "%s"', ajs.job_wrapper.get_id_tag(), ajs.job_id, removed)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment