I'm having difficulty using the Slurm API, specifically the batch submit
call. A bare-bones script runs fine when submitted through sbatch, but the
same script fails with a NODE_FAIL status when submitted through the API.

slurm_submit_batch_job returns 0, and the response message contains a valid
job_id. A subsequent call to slurm_load_job() reveals that the job's state
is PENDING. However, that job ID does not appear in the output of squeue,
because by the time I look, the job has already failed. Using sacct shows
the job with a state of NODE_FAIL.
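
In case it's useful, this is roughly how I catch the state change through
the API instead of racing squeue (a minimal sketch, not my exact code;
wait_for_state_change is just an illustrative name):

// Poll slurm_load_job() until the job leaves PENDING, instead of racing
// squeue. Illustrative sketch only.
#include <slurm/slurm.h>
#include <unistd.h>
#include <cstdio>

static void wait_for_state_change(uint32_t job_id)
{
  for (;;)
  {
    job_info_msg_t* info;
    if (slurm_load_job(&info, job_id, SHOW_ALL) != 0)
      return;                                 // job gone or lookup failed

    uint32_t state = info->job_array[0].job_state & JOB_STATE_BASE;
    slurm_free_job_info_msg(info);

    if (state != JOB_PENDING)
    {
      // JOB_NODE_FAIL is the base state that sacct reports as NODE_FAIL
      printf("job %u left PENDING: base state %u\n", job_id, state);
      return;
    }
    sleep(1);
  }
}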

The requested MAIL_JOB_BEGIN and MAIL_JOB_FAIL messages indicate that the
job was successfully queued and then failed almost immediately with
NODE_FAIL, ExitCode 0. Below is my test code and its output:


#include <slurm/slurm.h>

#include <unistd.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>

using namespace std;

int main(int argc, char* argv[])
{
  // job_desc_msg_t takes non-const char*, so copy each std::string
  string wkdir("/home/stephenb/testing/slurm");
  char* cwkdir = new char[wkdir.size() + 1];
  strcpy(cwkdir, wkdir.c_str());

  string mail("stephen.barr...@redcedartech.com");
  char* cmail = new char[mail.size() + 1];
  strcpy(cmail, mail.c_str());

  string name("bbones");
  char* cname = new char[name.size() + 1];
  strcpy(cname, name.c_str());

  string script(wkdir + "/task.sh");
  char* cscript = new char[script.size() + 1];
  strcpy(cscript, script.c_str());

  // start from a default-initialized job description
  job_desc_msg_t desc;
  slurm_init_job_desc_msg(&desc);

  desc.mail_user = cmail;
  desc.mail_type = MAIL_JOB_END | MAIL_JOB_FAIL;
  desc.name = cname;
  desc.script = cscript;
  desc.user_id = getuid();
  desc.work_dir = cwkdir;

  cout << "--- job description ---" << endl;
  cout << "   name=" << desc.name << endl;
  cout << "   script=" << desc.script << endl;
  cout << "   user_id=" << desc.user_id << endl;
  cout << "   work_dir=" << desc.work_dir << endl;

  submit_response_msg_t* srsp;
  int rc = slurm_submit_batch_job(&desc, &srsp);

  if (rc == 0)
  {
    cout << "--- job response ---" << endl;
    cout << "   job_id=" << srsp->job_id << endl;
    cout << "   step_id=" << srsp->step_id << endl;
    cout << "   error_code=" << srsp->error_code << endl;

    job_info_msg_t* info;
    rc = slurm_load_job(&info, srsp->job_id, 0);

    if (rc == 0)
    {
      job_info_t* detail = info->job_array;

      // dump the controller's view of the job to stdout
      cout << "--- job detail ---" << endl << "   " << flush;
      FILE* flog = fopen("/dev/stdout", "w");
      slurm_print_job_info(flog, detail, 0);
      fclose(flog);

      slurm_free_job_info_msg(info);
    }

    slurm_free_submit_response_response_msg(srsp);
  }
  else
  {
    int errnum = slurm_get_errno();
    char* emsg = slurm_strerror(errnum);
    cout << "SUB ERROR: " << emsg << " (#" << errnum << ")\n";
  }

  delete[] cname;
  delete[] cmail;
  delete[] cscript;
  delete[] cwkdir;

  return EXIT_SUCCESS;
}



--- job description ---
   name=bbones
   script=/home/stephenb/testing/slurm/task.sh
   user_id=517
   work_dir=/home/stephenb/testing/slurm

--- job response ---
   job_id=298
   step_id=4294967294
   error_code=0

--- job detail ---
   JobId=298 JobName=bbones
   UserId=stephenb(517) GroupId=root(0) MCS_label=N/A
   Priority=1804 Nice=0 Account=totalcae QOS=normal
   JobState=PENDING Reason=None Dependency=(null)
   Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
   RunTime=00:00:00 TimeLimit=UNLIMITED TimeMin=N/A
   SubmitTime=2016-08-29T09:23:31 EligibleTime=2016-08-29T09:23:31
   StartTime=Unknown EndTime=Unknown Deadline=N/A
   PreemptTime=None SuspendTime=None SecsPreSuspend=0
   Partition=gen2-128-24 AllocNode:Sid=I96:8405
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList=(null)
   NumNodes=1 NumCPUs=1 NumTasks=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
   TRES=cpu=1,node=1
   Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
   MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0
   Features=(null) Gres=(null) Reservation=(null)
   OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
   Command=(null)
   WorkDir=/home/stephenb/testing/slurm
   StdErr=/home/stephenb/testing/slurm/slurm-298.out
   StdIn=/dev/null
   StdOut=/home/stephenb/testing/slurm/slurm-298.out
   Power=


The code appears to be doing its job, with the script failing *after* being
successfully queued. Does anyone have an idea as to what may be going on?
Many thanks!

-- 
*Steve Barrett*, Sr. Solver Development Engineer
www.redcedartech.com
