This patch modifies s6-supervise to use the Linux-specific clone() system call so that the child process becomes pid 1 of a new pid namespace. To enable it, compile with -DWANT_CLONE_NEWPID and create a ./clone-newpid file in each service directory where this is wanted (s6-supervise only checks that the file exists, via access(F_OK)).
I ask that this be included in s6-supervise.c because doing unshare(CLONE_NEWPID) in the child process doesn't change that process's pid to 1; rather, the next child it spawns runs as pid 1. After spawning that first process, the parent is prevented from spawning any further children: subsequent attempts fail with ENOMEM. Changing s6-supervise to use clone() avoids these limitations, and it also avoids extending the supervision chain with an extra process, which would make exit/signal proxying necessary. To see correct ps output, /proc needs to be remounted. To avoid conflicts with the parent pid namespace's /proc, this is done in a new mount namespace. For example: #!/bin/execlineb -P unshare -m -- foreground { umount /proc } if -- { mount -t proc proc /proc } exec ... The functions added in this patch could be migrated into skalibs or libs6, but I wanted to start with this as a PoC without making API changes. Jesse --- src/supervision/s6-supervise.c | 87 +++++++++++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 22 deletions(-) diff --git a/src/supervision/s6-supervise.c b/src/supervision/s6-supervise.c index 2e8fa38..7605a82 100644 --- a/src/supervision/s6-supervise.c +++ b/src/supervision/s6-supervise.c @@ -9,6 +9,9 @@ #include <errno.h> #include <fcntl.h> #include <signal.h> +#ifdef WANT_CLONE_NEWPID +# include <sched.h> +#endif #include <skalibs/allreadwrite.h> #include <skalibs/bytestr.h> #include <skalibs/uint.h> @@ -203,6 +206,67 @@ static int maybesetsid (void) return 1 ; } +static void exec_run(int p[2], int notifyp[2], int fd) gccattr_noreturn ; +static void exec_run(int p[2], int notifyp[2], int fd) +{ + char const *cargv[2] = { "run", 0 } ; + PROG = "s6-supervise (child)" ; + selfpipe_finish() ; + if (notifyp[0] >= 0) close(notifyp[0]) ; + close(p[0]) ; + if (notifyp[1] >= 0 && fd_move(fd, notifyp[1]) < 0) + { + failcoe(p[1]) ; + strerr_diefu1sys(127, "move notification descriptor") ; + } + if (!maybesetsid()) + { + failcoe(p[1]) ; + 
strerr_diefu1sys(127, "access ./nosetsid") ; + } + execve("./run", (char *const *)cargv, (char *const *)environ) ; + failcoe(p[1]) ; + strerr_dieexec(127, "run") ; +} + +static pid_t spawn_run_fork(int p[2], int notifyp[2], int fd) +{ + pid_t pid = fork() ; + if (!pid) exec_run(p, notifyp, fd) ; + return pid ; +} + +#ifdef WANT_CLONE_NEWPID +typedef struct +{ + int p[2] ; + int notifyp[2] ; + int fd ; +} exec_run_t ; + +static int exec_run_shim(void *ctx) gccattr_noreturn ; +static int exec_run_shim(void *ctx) +{ + exec_run_t *er = (exec_run_t *) ctx ; + exec_run(er->p, er->notifyp, er->fd) ; +} + +static pid_t spawn_run(int p[2], int notifyp[2], int fd) +{ + exec_run_t arg = { { p[0], p[1] }, { notifyp[0], notifyp[1] }, fd } ; + char child_stack[SIGSTKSZ] ; + if (access("clone-newpid", F_OK) < 0 && errno == ENOENT) + return spawn_run_fork(p, notifyp, fd) ; + return (pid_t) clone(&exec_run_shim, child_stack + sizeof(child_stack), + CLONE_NEWPID | SIGCHLD, &arg) ; +} +#else /* if !defined(WANT_CLONE_NEWPID) */ +static pid_t spawn_run(int p[2], int notifyp[2], int fd) +{ + return spawn_run_fork(p, notifyp, fd) ; +} +#endif /* defined(WANT_CLONE_NEWPID) */ + static void trystart (void) { int p[2] ; @@ -222,7 +286,7 @@ static void trystart (void) fd_close(p[1]) ; fd_close(p[0]) ; return ; } - pid = fork() ; + pid = spawn_run(p, notifyp, (int)fd) ; if (pid < 0) { settimeout(60) ; @@ -232,27 +296,6 @@ static void trystart (void) fd_close(p[1]) ; fd_close(p[0]) ; return ; } - else if (!pid) - { - char const *cargv[2] = { "run", 0 } ; - PROG = "s6-supervise (child)" ; - selfpipe_finish() ; - if (notifyp[0] >= 0) close(notifyp[0]) ; - close(p[0]) ; - if (notifyp[1] >= 0 && fd_move((int)fd, notifyp[1]) < 0) - { - failcoe(p[1]) ; - strerr_diefu1sys(127, "move notification descriptor") ; - } - if (!maybesetsid()) - { - failcoe(p[1]) ; - strerr_diefu1sys(127, "access ./nosetsid") ; - } - execve("./run", (char *const *)cargv, (char *const *)environ) ; - failcoe(p[1]) ; - 
strerr_dieexec(127, "run") ; - } if (notifyp[1] >= 0) fd_close(notifyp[1]) ; fd_close(p[1]) ; { -- 2.13.1