We then use the get_jobscript script to actually query the job scripts.
-Paul Edmon- On 04/27/2016 03:41 AM, Lennart Karlsson wrote:
On 04/27/2016 08:46 AM, Miguel Gila wrote:
> Another option is to run (as SlurmUser, or root) when the job is still in the system (R or PD):
>   # scontrol show jobid=XXXX -dd
> and dump its output somewhere.
> Miguel

Hi,
We are using this one, run as slurmctld.prolog, saving the output for 30 days.
Cheers,
--
Lennart Karlsson, UPPMAX, Uppsala University, Sweden
http://www.uppmax.uu.se/
#!/usr/bin/python -tt
import logging
import os
import sys
import syslog
import traceback
from glob import glob
def get_env_vars(job_id, spool_dir='/slurm/spool'):
    """Return a job's saved environment as shell-quoted assignments.

    Looks for exactly one file matching
    ``<spool_dir>/hash.*/job.<job_id>/environment`` and parses it as a
    4-byte length prefix followed by NUL-separated ``NAME=value`` entries.

    Args:
        job_id: Job id; interpolated into the glob pattern with ``%s``.
        spool_dir: Directory containing the ``hash.*`` subdirectories.
            Defaults to the previously hard-coded ``/slurm/spool``.

    Returns:
        A list of ``NAME='value'`` strings in file order, with single
        quotes inside the value escaped as ``'\\''``. An empty list is
        returned when zero or more than one environment file matches.
    """
    pattern = os.path.join(
        spool_dir, 'hash.*', 'job.%s' % job_id, 'environment')
    paths = glob(pattern)
    # No match: nothing to report.  Several matches: not sure how this
    # would happen, but it would be weird, so give up rather than guess.
    if len(paths) != 1:
        return []

    # Read as bytes: the first four bytes are a uint32 holding the length
    # of the following data, not text, so they must be skipped before any
    # decoding takes place.
    with open(paths[0], 'rb') as env_file:
        env_raw = env_file.read()[4:]
    if not isinstance(env_raw, str):
        # Python 3 hands back bytes; undecodable bytes are replaced
        # instead of aborting the whole dump.  On Python 2 the data is
        # already a str and is used unchanged.
        env_raw = env_raw.decode('utf-8', 'replace')

    env_vars = []
    for entry in env_raw.split('\0'):
        if entry in ('', ';'):
            continue
        name, separator, value = entry.partition('=')
        if not separator:
            # Entry without '=': skip it rather than lose every other
            # variable to a ValueError.
            continue
        env_vars.append("%s='%s'" % (name, value.replace("'", r"'\''")))
    return env_vars
def get_job_script(job_id):
    """Return a job's batch script with its runtime environment appended.

    Looks for exactly one file matching
    ``/slurm/spool/hash.*/job.<job_id>/script``.  The result is laid out
    as: the first line of the script, the leading run of ``#SBATCH`` and
    blank lines (stripped of surrounding whitespace), an empty line, the
    remainder of the script, a ``#### BEGIN RUNTIME ENV ####`` marker
    line, and finally the output of ``get_env_vars(job_id)`` one entry
    per line.

    Args:
        job_id: Job id; interpolated into the glob pattern with ``%s``.

    Returns:
        The assembled text, or ``''`` when zero or more than one script
        file matches.

    Raises:
        Exception: If the script file contains no newline at all, so no
            first line can be separated from the rest.
    """
    paths = glob('/slurm/spool/hash.*/job.%s/script' % job_id)
    # No match: nothing to report.  Several matches: not sure how this
    # would happen, but it would be weird, so give up rather than guess.
    if len(paths) != 1:
        return ''

    with open(paths[0]) as script_file:
        # The job script has a trailing NULL; drop it.
        script_raw = script_file.read().rstrip('\0')

    shebang, newline, rest = script_raw.partition('\n')
    if not newline:
        raise Exception(
            'No lines in script file /slurm/spool/hash.*/job.%s/script'
            % job_id)

    # The header is the leading run of '#SBATCH' / blank lines; everything
    # from the first other line onwards is the script body.
    lines = rest.split('\n')
    body_start = len(lines)
    for index, line in enumerate(lines):
        if not (line.startswith('#SBATCH') or line.strip() == ''):
            body_start = index
            break
    sbatch_lines = lines[:body_start]
    rest_lines = lines[body_start:]

    return '%s\n%s\n\n%s\n#### BEGIN RUNTIME ENV ####\n%s' % (
        shebang,
        '\n'.join(sbatch_lines).strip(),
        '\n'.join(rest_lines),
        '\n'.join(get_env_vars(job_id)),
    )
def save_job_script(job_id):
    """Write the annotated script of a job into the job-script archive.

    The file is written to ``<home>/<int(job_id) % count>/<job_id>``,
    where both settings are read from the process environment:

    * ``SLURM_JOBSCRIPT_HOME``: root path of the job scripts
      (default ``/jobscripts``).
    * ``SLURM_JOBSCRIPT_SUBDIRCOUNT``: number of subdirectories the
      scripts are distributed into (default ``1000``); the job id modulo
      this count selects the parent directory.

    Args:
        job_id: Job id; must be convertible with ``int()``.

    Raises:
        ValueError: If ``job_id`` or the subdirectory count is not an
            integer.
        ZeroDivisionError: If the subdirectory count is ``0``.
        OSError: If the subdirectory cannot be created (``os.mkdir`` does
            not create a missing root directory).
    """
    subdir_count = int(os.environ.get("SLURM_JOBSCRIPT_SUBDIRCOUNT", "1000"))
    jobscript_home = os.environ.get("SLURM_JOBSCRIPT_HOME", "/jobscripts")

    subdir = str(int(job_id) % subdir_count)
    scriptdir = os.path.join(jobscript_home, subdir)

    # Assemble the content before touching the filesystem, so a failure
    # here does not leave an empty file behind.
    content = get_job_script(job_id)

    # Create the directory unconditionally and tolerate "already there":
    # checking for existence first would race with other jobs that map to
    # the same subdirectory and start at the same time.
    try:
        os.mkdir(scriptdir)
    except OSError:
        if not os.path.isdir(scriptdir):
            raise

    scriptfile = os.path.join(scriptdir, str(job_id))
    with open(scriptfile, 'w') as f:
        f.write(content)
if __name__ == '__main__':
    # Script entry point: archive the script of the job named by the
    # SLURM_JOB_ID environment variable.
    try:
        current_job = os.environ['SLURM_JOB_ID']
        save_job_script(current_job)
    except Exception:
        # Any failure (including a missing SLURM_JOB_ID) is reported to
        # syslog line by line and not re-raised.
        # NOTE(review): presumably so a prolog run never fails the job
        # because of archiving problems - confirm.
        program_name = os.path.basename(sys.argv[0])
        syslog.openlog(program_name, syslog.LOG_PID)
        report = traceback.format_exc()
        for report_line in report.split('\n'):
            syslog.syslog(syslog.LOG_ERR, report_line)
get_jobscript.sh
Description: Bourne shell script
