We use this Python script as a slurmctld prolog to save our job scripts. Basically, it pulls all the info from the Slurm hash files and copies it to a separate filesystem. We used to do it via MySQL, but the database got too large.

We then use the attached get_jobscript.sh script to actually query the saved job scripts.

-Paul Edmon-

On 04/27/2016 03:41 AM, Lennart Karlsson wrote:

On 04/27/2016 08:46 AM, Miguel Gila wrote:
Another option is to run (as SlurmUser, or root) when the job is still in the system (R or PD):

# scontrol show jobid=XXXX -dd

and dump its output somewhere.

Miguel

Hi,

We are using this one, run as slurmctld.prolog,
saving the output for 30 days.

Cheers,
-- Lennart Karlsson, UPPMAX, Uppsala University, Sweden
   http://www.uppmax.uu.se/

#!/usr/bin/python -tt

import logging
import os
import sys
import syslog
import traceback
from glob import glob


def get_env_vars(job_id, spool_dir='/slurm/spool'):
    """Return the saved environment of a job as quoted NAME='value' strings.

    Looks for exactly one ``hash.*/job.<job_id>/environment`` file below
    ``spool_dir`` and parses it as a sequence of NUL-separated ``NAME=value``
    entries that follows a four-byte length prefix.

    Args:
        job_id: Job id used to build the ``job.<job_id>`` directory name.
        spool_dir: Directory that contains the ``hash.*`` directories.

    Returns:
        A list of ``NAME='value'`` strings in file order.  Every single
        quote inside a value is rewritten so that the value remains valid
        inside single quotes.  The list is empty when no environment file,
        or more than one, matches.
    """
    pattern = os.path.join(
        spool_dir, 'hash.*', 'job.%s' % job_id, 'environment')
    paths = glob(pattern)

    # No match means nothing was saved; several matches would be ambiguous
    # (not sure how that would happen, but it would be weird).
    if len(paths) != 1:
        return []

    # Read as bytes so the binary length prefix is never run through a text
    # decoder, then skip the first four bytes, which are a uint32 indicating
    # the length of the following data.
    with open(paths[0], 'rb') as env_file:
        env_raw = env_file.read()[4:]
    if not isinstance(env_raw, str):
        # Python 3 hands back bytes here; Python 2 already returned str.
        env_raw = env_raw.decode('utf-8', 'replace')

    env_vars = []
    for env in env_raw.split('\0'):
        if env in ('', ';'):
            continue

        name, sep, value = env.partition('=')
        if not sep:
            # Not a NAME=value entry; skip it instead of failing the
            # whole job-script save.
            continue
        env_vars.append("%s='%s'" % (name, value.replace("'", r"'\''")))

    return env_vars

def get_job_script(job_id):
    """Return a job's batch script followed by its runtime environment.

    Reads the single ``/slurm/spool/hash.*/job.<job_id>/script`` file and
    rearranges it as: the first line (shebang), the leading run of
    ``#SBATCH`` and blank lines, the remainder of the script, a
    ``#### BEGIN RUNTIME ENV ####`` marker, and finally the output of
    ``get_env_vars(job_id)``, one entry per line.

    Args:
        job_id: Job id used to build the ``job.<job_id>`` directory name.

    Returns:
        The assembled text, or ``''`` when no script file, or more than
        one, matches.

    Raises:
        Exception: If the script file contains no newline at all.
    """
    paths = glob('/slurm/spool/hash.*/job.%s/script' % job_id)

    # No match means nothing to save; several matches would be ambiguous
    # (not sure how that would happen, but it would be weird).
    if len(paths) != 1:
        return ''

    # The job script has a trailing NULL, which is stripped here.
    with open(paths[0]) as script_file:
        script_raw = script_file.read().rstrip('\0')

    try:
        shebang, rest = script_raw.split('\n', 1)
    except ValueError:
        raise Exception('No lines in script file '
                        '/slurm/spool/hash.*/job.%s/script' % job_id)

    # The header is the leading run of #SBATCH and blank lines; everything
    # from the first other line onwards is the script body.
    lines = rest.split('\n')
    split_at = len(lines)
    for index, line in enumerate(lines):
        if not (line.startswith('#SBATCH') or line.strip() == ''):
            split_at = index
            break
    sbatch_lines = lines[:split_at]
    rest_lines = lines[split_at:]

    return '%s\n%s\n\n%s\n#### BEGIN RUNTIME ENV ####\n%s' % (
        shebang,
        '\n'.join(sbatch_lines).strip(),
        '\n'.join(rest_lines),
        '\n'.join(get_env_vars(job_id)),
    )

def save_job_script(job_id):
    """Write a job's script and environment to the job-script archive.

    The destination is ``<home>/<job_id % count>/<job_id>``, where ``home``
    comes from the ``SLURM_JOBSCRIPT_HOME`` environment variable (default
    ``/jobscripts``) and ``count`` from ``SLURM_JOBSCRIPT_SUBDIRCOUNT``
    (default ``1000``).  The subdirectory is created when missing; the
    home directory itself must already exist.

    Args:
        job_id: Job id; must be convertible with ``int()``.

    Raises:
        ValueError: If ``job_id`` or ``SLURM_JOBSCRIPT_SUBDIRCOUNT`` is not
            an integer.
        OSError: If the subdirectory cannot be created.
    """
    # The number of subdirectories the scripts will be distributed into;
    # job_id modulo this count determines the parent directory for the job.
    subdir_count = int(os.environ.get("SLURM_JOBSCRIPT_SUBDIRCOUNT", "1000"))

    # The root path of the jobscripts
    jobscript_home = os.environ.get("SLURM_JOBSCRIPT_HOME", "/jobscripts")

    subdir = str(int(job_id) % subdir_count)
    scriptdir = os.path.join(jobscript_home, subdir)
    try:
        os.mkdir(scriptdir)
    except OSError:
        # Another invocation may have created the directory in the
        # meantime; only a directory that is still missing is an error.
        if not os.path.isdir(scriptdir):
            raise

    # Build the content before opening the file so that a failure while
    # reading the spool does not leave an empty file behind.
    content = get_job_script(job_id)

    scriptfile = os.path.join(scriptdir, str(job_id))
    with open(scriptfile, 'w') as f:
        f.write(content)
        

if __name__ == '__main__':
    # Script entry point: save the script of the job named by the
    # SLURM_JOB_ID environment variable.  Any failure is written to syslog,
    # one message per traceback line, instead of being propagated.
    try:
        current_job = os.environ['SLURM_JOB_ID']
        save_job_script(current_job)
    except Exception:
        ident = os.path.basename(sys.argv[0])
        syslog.openlog(ident, syslog.LOG_PID)
        report = traceback.format_exc()
        for report_line in report.split('\n'):
            syslog.syslog(syslog.LOG_ERR, report_line)

Attachment: get_jobscript.sh
Description: Bourne shell script

Reply via email to