Hello there!
There is a possible situation where the TMPDIR variable on the srun/sbatch side
points to a valid directory, but that directory does not exist on the compute
node. The attached patch fixes the issue by rewriting TMPDIR to /tmp in those
cases. The user still gets an error message from each node involved, though; would
it be better to use an info() call instead of error() in that code?
Andriy.
diff -udpr slurm-2.2.4/src/slurmd/slurmstepd/task.c slurm-2.2.4.tmp/src/slurmd/slurmstepd/task.c
--- slurm-2.2.4/src/slurmd/slurmstepd/task.c 2010-11-24 00:05:15.000000000 +0200
+++ slurm-2.2.4.tmp/src/slurmd/slurmstepd/task.c 2011-05-23 15:54:14.000000000 +0300
@@ -515,13 +515,24 @@ exec_task(slurmd_job_t *job, int i, int
static void
_make_tmpdir(slurmd_job_t *job)
{
char *tmpdir;
if (!(tmpdir = getenvp(job->env, "TMPDIR")))
- return;
+ setenvf(&job->env, "TMPDIR", "/tmp"); /* task may want it set */
- if ((mkdir(tmpdir, 0700) < 0) && (errno != EEXIST))
+ else if (mkdir(tmpdir, 0700) < 0) {
+ if (errno == EEXIST) {
+ struct stat st;
+
+ if (stat(tmpdir, &st) == 0 && /* does user have access? */
+ S_ISDIR(st.st_mode) && /* is it a directory? */
+ ((st.st_mode & S_IWOTH) || /* can user write there? */
+ (st.st_uid == job->uid && (st.st_mode & S_IWUSR))))
+ return;
+ }
error ("Unable to create TMPDIR [%s]: %m", tmpdir);
+ setenvf(&job->env, "TMPDIR", "/tmp");
+ }
return;
}