Hello,
I've run into minor problem with cgroups when a task epilog script is specified in the config file. The issue appears to be that with the ProctrackType=proctrack/cgroup, and with the appropriate release files in place so that cgroups are cleaned up properly (CgroupReleaseAgentDir=/srv/slurm/sbin), the task epilog script occasionally gives an error in a job's output file: slurm_container_add: No such process.
It looks like the _run_script_as_user function forks and then execs the epilog script, while the parent tries to add the child process to the container, but sometimes (the timing has to be right), the script finishes (and the process exits), before the slurm_container_add function gets to writing the pid to the cgroup tasks file. This results in the ESRCH error.
I've attached a patch that I believe fixes this, by making the child process wait (just like in _fork_all_tasks) on a pipe for the parent to write a character. This allows the parent to call slurm_container_add before the child process exits, ensuring that the pid will exist.
This doesn't quite fix my problems though. In cgroups-fixes-with-sleep.patch, I've added a sleep(10); just after the fork in the parent process. With my patch and this sleep in place, the slurm_container_add now fails with ENOENT, because the entire cgroup freezer directory is missing. I think this is because the release files are removing the cgroup when the task completes. In the patch, I remove the check to create the container only if the cont_id == 0, but that still doesn't seem to resolve the issue, as the directory might exist between when slurm_container_create is called, and when the release script is called to remove the cgroup.
I'm not sure what the right fix is at this point, and any help is greatly appreciated. In summary, the three patches are basically the same, one without the sleep and debugging messages, one with the sleep, and one with the sleep and debugging messages.
Thanks, -sam
diff --git a/src/common/xcgroup.c b/src/common/xcgroup.c index e4f0eca..cf4159e 100644 --- a/src/common/xcgroup.c +++ b/src/common/xcgroup.c @@ -191,7 +191,7 @@ int xcgroup_ns_mount(xcgroup_ns_t* cgns) return XCGROUP_ERROR; } else - debug3("cgroup mount cmd line is '%s'", mount_cmd); + debug("cgroup mount cmd line is '%s'", mount_cmd); if (system(mount_cmd)) return XCGROUP_ERROR; diff --git a/src/slurmd/common/proctrack.c b/src/slurmd/common/proctrack.c index 5e007f3..22f8a7b 100644 --- a/src/slurmd/common/proctrack.c +++ b/src/slurmd/common/proctrack.c @@ -259,6 +259,7 @@ extern int slurm_proctrack_fini(void) */ extern int slurm_container_create(slurmd_job_t * job) { + debug3("slurm_container_create: job ptr: %p", job); if (slurm_proctrack_init() < 0) return 0; @@ -276,6 +277,8 @@ extern int slurm_container_create(slurmd_job_t * job) */ extern int slurm_container_add(slurmd_job_t * job, pid_t pid) { + debug3("slurm_container_add: job ptr: %p, pid: %d", job, pid); + if (slurm_proctrack_init() < 0) return SLURM_ERROR; diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index 9bb8271..8e48aa9 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -2124,6 +2124,9 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, { int status, rc, opt; pid_t cpid; + int fdpair[2]; + int writefd, readfd; + char c; xassert(env); if (path == NULL || path[0] == '\0') @@ -2136,10 +2139,20 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, return -1; } - if ((job->cont_id == 0) && - (slurm_container_create(job) != SLURM_SUCCESS)) + if (slurm_container_create(job) != SLURM_SUCCESS) error("slurm_container_create: %m"); + if(pipe (fdpair) < 0) { + error("__run_script_as_user: pipe: %m"); + return SLURM_ERROR; + } + debug("New fdpair[0] = %d, fdpair[1] = %d", + fdpair[0], fdpair[1]); + fd_set_close_on_exec(fdpair[0]); + fd_set_close_on_exec(fdpair[1]); + readfd = fdpair[0]; + writefd = fdpair[1]; + if ((cpid = fork()) < 0) { error ("executing %s: fork: %m", name); return -1; @@ -2151,6 +2164,17 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, argv[0] = (char *)xstrdup(path); argv[1] = NULL; + /* close write fd not needed by the child */ + close(writefd); + + /* wait till parent notifies us */ + if ((rc = read (readfd, &c, sizeof(c))) != 1) { + error("run_script_as_user failed, fd = %d, rc=%d: %m", readfd, rc); + /* child process, should not return */ + exit(127); + } + close(readfd); + if (_drop_privileges(job, true, &sprivs) < 0) { error("run_script_as_user _drop_privileges: %m"); /* child process, should not return */ @@ -2183,6 +2207,19 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, else opt = WNOHANG; + + /* + * Now it's ok to unblock the child process and call exec. + */ + debug("Unblocking process %d, writefd = %d", + cpid, writefd); + if(write (writefd, &c, sizeof(c)) != 1) { + error("write to unblock script exec %d failed, writefd = %d", + cpid, writefd); + } + + close(writefd); + while (1) { rc = waitpid(cpid, &status, opt); if (rc < 0) {
diff --git a/src/common/xcgroup.c b/src/common/xcgroup.c index e4f0eca..90e045e 100644 --- a/src/common/xcgroup.c +++ b/src/common/xcgroup.c @@ -191,7 +191,7 @@ int xcgroup_ns_mount(xcgroup_ns_t* cgns) return XCGROUP_ERROR; } else - debug3("cgroup mount cmd line is '%s'", mount_cmd); + debug("cgroup mount cmd line is '%s'", mount_cmd); if (system(mount_cmd)) return XCGROUP_ERROR; @@ -449,6 +449,7 @@ int xcgroup_instanciate(xcgroup_t* cg) return fstatus; } } + debug("created cgroup dir: %s", file_path); umask(omask); /* change cgroup ownership as requested */ @@ -457,6 +458,7 @@ int xcgroup_instanciate(xcgroup_t* cg) uid, gid, file_path); return fstatus; } + debug("changed cgroup dir ownership: %s to %d:%d", file_path, uid, gid); /* following operations failure might not result in a general * failure so set output status to success */ @@ -503,6 +505,7 @@ int xcgroup_load(xcgroup_ns_t* cgns, xcgroup_t* cg, char* uri) int xcgroup_delete(xcgroup_t* cg) { + debug("deleting cgroup: %s", cg->path); if (rmdir(cg->path)) return XCGROUP_ERROR; else @@ -521,6 +524,7 @@ int xcgroup_add_pids(xcgroup_t* cg, pid_t* pids, int npids) return fstatus; } + debug("adding pids to cgroup %s", file_path); fstatus = _file_write_uint32s(file_path, (uint32_t*)pids, npids); if (fstatus != XCGROUP_SUCCESS) debug2("unable to add pids to '%s'", cpath); diff --git a/src/slurmd/common/proctrack.c b/src/slurmd/common/proctrack.c index 5e007f3..22f8a7b 100644 --- a/src/slurmd/common/proctrack.c +++ b/src/slurmd/common/proctrack.c @@ -259,6 +259,7 @@ extern int slurm_proctrack_fini(void) */ extern int slurm_container_create(slurmd_job_t * job) { + debug3("slurm_container_create: job ptr: %p", job); if (slurm_proctrack_init() < 0) return 0; @@ -276,6 +277,8 @@ extern int slurm_container_create(slurmd_job_t * job) */ extern int slurm_container_add(slurmd_job_t * job, pid_t pid) { + debug3("slurm_container_add: job ptr: %p, pid: %d", job, pid); + if (slurm_proctrack_init() < 0) return SLURM_ERROR; diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index 9bb8271..5388bed 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -2124,6 +2124,9 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, { int status, rc, opt; pid_t cpid; + int fdpair[2]; + int writefd, readfd; + char c; xassert(env); if (path == NULL || path[0] == '\0') @@ -2136,10 +2139,21 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, return -1; } - if ((job->cont_id == 0) && - (slurm_container_create(job) != SLURM_SUCCESS)) + debug("[job %u] container id: %d", job->jobid, job->cont_id); + if (slurm_container_create(job) != SLURM_SUCCESS) error("slurm_container_create: %m"); + if(pipe (fdpair) < 0) { + error("__run_script_as_user: pipe: %m"); + return SLURM_ERROR; + } + debug("New fdpair[0] = %d, fdpair[1] = %d", + fdpair[0], fdpair[1]); + fd_set_close_on_exec(fdpair[0]); + fd_set_close_on_exec(fdpair[1]); + readfd = fdpair[0]; + writefd = fdpair[1]; + if ((cpid = fork()) < 0) { error ("executing %s: fork: %m", name); return -1; @@ -2151,6 +2165,17 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, argv[0] = (char *)xstrdup(path); argv[1] = NULL; + /* close write fd not needed by the child */ + close(writefd); + + /* wait till parent notifies us */ + if ((rc = read (readfd, &c, sizeof(c))) != 1) { + error("run_script_as_user failed, fd = %d, rc=%d: %m", readfd, rc); + /* child process, should not return */ + exit(127); + } + close(readfd); + if (_drop_privileges(job, true, &sprivs) < 0) { error("run_script_as_user _drop_privileges: %m"); /* child process, should not return */ @@ -2176,6 +2201,7 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, exit(127); } + sleep(10); if (slurm_container_add(job, cpid) != SLURM_SUCCESS) error("slurm_container_add: %m"); if (max_wait < 0) @@ -2183,6 +2209,19 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, else opt = WNOHANG; + + /* + * Now it's ok to unblock the child process and call exec. + */ + debug("Unblocking process %d, writefd = %d", + cpid, writefd); + if(write (writefd, &c, sizeof(c)) != 1) { + error("write to unblock script exec %d failed, writefd = %d", + cpid, writefd); + } + + close(writefd); + while (1) { rc = waitpid(cpid, &status, opt); if (rc < 0) {
diff --git a/src/common/xcgroup.c b/src/common/xcgroup.c index e4f0eca..cf4159e 100644 --- a/src/common/xcgroup.c +++ b/src/common/xcgroup.c @@ -191,7 +191,7 @@ int xcgroup_ns_mount(xcgroup_ns_t* cgns) return XCGROUP_ERROR; } else - debug3("cgroup mount cmd line is '%s'", mount_cmd); + debug("cgroup mount cmd line is '%s'", mount_cmd); if (system(mount_cmd)) return XCGROUP_ERROR; diff --git a/src/slurmd/common/proctrack.c b/src/slurmd/common/proctrack.c index 5e007f3..22f8a7b 100644 --- a/src/slurmd/common/proctrack.c +++ b/src/slurmd/common/proctrack.c @@ -259,6 +259,7 @@ extern int slurm_proctrack_fini(void) */ extern int slurm_container_create(slurmd_job_t * job) { + debug3("slurm_container_create: job ptr: %p", job); if (slurm_proctrack_init() < 0) return 0; @@ -276,6 +277,8 @@ extern int slurm_container_create(slurmd_job_t * job) */ extern int slurm_container_add(slurmd_job_t * job, pid_t pid) { + debug3("slurm_container_add: job ptr: %p, pid: %d", job, pid); + if (slurm_proctrack_init() < 0) return SLURM_ERROR; diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index 9bb8271..0d01dde 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -2124,6 +2124,9 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, { int status, rc, opt; pid_t cpid; + int fdpair[2]; + int writefd, readfd; + char c; xassert(env); if (path == NULL || path[0] == '\0') @@ -2136,10 +2139,20 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, return -1; } - if ((job->cont_id == 0) && - (slurm_container_create(job) != SLURM_SUCCESS)) + if (slurm_container_create(job) != SLURM_SUCCESS) error("slurm_container_create: %m"); + if(pipe (fdpair) < 0) { + error("__run_script_as_user: pipe: %m"); + return SLURM_ERROR; + } + debug("New fdpair[0] = %d, fdpair[1] = %d", + fdpair[0], fdpair[1]); + fd_set_close_on_exec(fdpair[0]); + fd_set_close_on_exec(fdpair[1]); + readfd = fdpair[0]; + writefd = fdpair[1]; + if ((cpid = fork()) < 0) { error ("executing %s: fork: %m", name); return -1; @@ -2151,6 +2164,17 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, argv[0] = (char *)xstrdup(path); argv[1] = NULL; + /* close write fd not needed by the child */ + close(writefd); + + /* wait till parent notifies us */ + if ((rc = read (readfd, &c, sizeof(c))) != 1) { + error("run_script_as_user failed, fd = %d, rc=%d: %m", readfd, rc); + /* child process, should not return */ + exit(127); + } + close(readfd); + if (_drop_privileges(job, true, &sprivs) < 0) { error("run_script_as_user _drop_privileges: %m"); /* child process, should not return */ @@ -2176,6 +2200,7 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, exit(127); } + sleep(10); if (slurm_container_add(job, cpid) != SLURM_SUCCESS) error("slurm_container_add: %m"); if (max_wait < 0) @@ -2183,6 +2208,19 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, else opt = WNOHANG; + + /* + * Now it's ok to unblock the child process and call exec. + */ + debug("Unblocking process %d, writefd = %d", + cpid, writefd); + if(write (writefd, &c, sizeof(c)) != 1) { + error("write to unblock script exec %d failed, writefd = %d", + cpid, writefd); + } + + close(writefd); + while (1) { rc = waitpid(cpid, &status, opt); if (rc < 0) {