[PATCH] gccattributes.h: define gccattr_returns_twice
Signed-off-by: Jesse Young --- For some out-of-tree code that wraps clone(), using setjmp()/longjmp() making it behave more like fork(). src/include/skalibs/gccattributes.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/include/skalibs/gccattributes.h b/src/include/skalibs/gccattributes.h index 0584480..7892687 100644 --- a/src/include/skalibs/gccattributes.h +++ b/src/include/skalibs/gccattributes.h @@ -6,6 +6,7 @@ #ifdef __GNUC__ #define gccattr_noreturn __attribute__((__noreturn__)) +#define gccattr_returns_twice __attribute__((__returns_twice__)) #define gccattr_noinline __attribute__((__noinline__)) #define gccattr_inline __attribute__((__always_inline__)) #define gccattr_const __attribute__((__const__)) @@ -31,6 +32,7 @@ #else #define gccattr_noreturn +#define gccattr_returns_twice #define gccattr_noinline #define gccattr_inline #define gccattr_const @@ -46,6 +48,7 @@ #ifdef GCCATTR_COMPAT_0_22 #define _a_noreturn gccattr_noreturn +#define _a_returns_twice gccattr_returns_twice #define _a_noinline gccattr_noinline #define _a_inline gccattr_inline #define _a_const gccattr_const -- 2.32.0
[PATCH 2/2] doc: allreadwrite: document scatter/gatter functions
--- Document scatter/gatter functions as well as unsanitize_read() in the allreadwrite.h header. doc/libstddjb/allreadwrite.html| 53 ++ src/include/skalibs/allreadwrite.h | 5 ++- 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/doc/libstddjb/allreadwrite.html b/doc/libstddjb/allreadwrite.html index 2619153..1c31022 100644 --- a/doc/libstddjb/allreadwrite.html +++ b/doc/libstddjb/allreadwrite.html @@ -51,6 +51,18 @@ and https://www.opengroup.org/onlinepubs/9699919799/functions/write.html;>write(). + + typedef ssize_t iovfunc_t (int fd, struct iovec const *v, unsigned int vlen) +This is the simplified type of IO functions such as +https://www.opengroup.org/onlinepubs/9699919799/functions/readv.html;>readv() +and +https://www.opengroup.org/onlinepubs/9699919799/functions/writev.html;>writev(), +where the content to perform IO on is specified as a +scatter/gather array of vlen +elements instead of a single string. + + + typedef size_t alliofunc_t (int fd, char *buf, size_t len) This is the type of an IO operation that expects all of its @@ -82,6 +94,11 @@ semantics are appropriate, so EPIPE is a good candidate to signal EOF on reading.) + + ssize_t unsanitize_read (ssize_t r) +Returns the inverse of sanitize_read. + + size_t allreadwrite (iofunc_t *f, int fd, char *s, size_t len) *f must be a basic reading or writing function such as @@ -94,6 +111,14 @@ blocking mode; if fd is in non-blocking mode, it might set errno to EWOULDBLOCK or EAGAIN. + + size_t allreadwritev (iovfunc_t *f, int fd, struct iovec const *v, unsigned int vlen) +Like allreadwrite +but the content to perform IO on is specified as a +scatter/gather array of vlen +elements instead of a single string. + + ssize_t fd_read (int fd, char *s, size_t len) Safe wrapper around the @@ -107,6 +132,20 @@ function. https://www.opengroup.org/onlinepubs/9699919799/functions/write.html;>write() function. + + + ssize_t fd_readv (int fd, struct iovec const *v, unsigned int vlen) +Safe wrapper around the +https://www.opengroup.org/onlinepubs/9699919799/functions/readv.html;>readv() +function. + + + + ssize_t fd_writev (int fd, struct iovec const *v, unsigned int vlen) +Safe wrapper around the +https://www.opengroup.org/onlinepubs/9699919799/functions/writev.html;>writev() +function. + ssize_t fd_recv (int fd, char *s, size_t len, unsigned int flags) @@ -138,5 +177,19 @@ around fd_write() if necessary, until either len bytes are written or an error occurs. + + size_t allreadv (int fd, struct iovec *v, unsigned int vlen) +Like allread, but the bytes from fd are read into a +scatter/gather array of vlen +elements instead of a single string. + + + + size_t allwritev (int fd, struct iovec const *v, unsigned int vlen) +Like allwrite, but the content to write is taken from a +scatter/gather array of vlen +elements instead of a single string. + + diff --git a/src/include/skalibs/allreadwrite.h b/src/include/skalibs/allreadwrite.h index 254d766..3df1ed1 100644 --- a/src/include/skalibs/allreadwrite.h +++ b/src/include/skalibs/allreadwrite.h @@ -14,6 +14,8 @@ extern size_t allreadwritev (iovfunc_t_ref, int, struct iovec const *, unsigned extern ssize_t fd_read (int, char *, size_t) ; extern ssize_t fd_write (int, char const *, size_t) ; +extern ssize_t fd_readv (int, struct iovec const *, unsigned int) ; +extern ssize_t fd_writev (int, struct iovec const *, unsigned int) ; extern ssize_t fd_recv (int, char *, size_t, unsigned int) ; extern ssize_t fd_send (int, char const *, size_t, unsigned int) ; @@ -23,7 +25,4 @@ extern size_t allwrite (int, char const *, size_t) ; extern size_t allreadv (int, struct iovec const *, unsigned int) ; extern size_t allwritev (int, struct iovec const *, unsigned int) ; -extern ssize_t fd_readv (int, struct iovec const *, unsigned int) ; -extern ssize_t fd_writev (int, struct iovec const *, unsigned int) ; - #endif -- 2.32.0
[PATCH 1/2] doc: reconcile openwrite{,v}nclose_suffix{,_devino}{,_sync} with djbunix.h
--- Hello, I've been working with the some of the scatter/gather I/O functions and noticed the documentation duplicated the non-vectored names. On closer inspection, I also discovered the devino variants have their arguments rearranged (except for *_internal). Cheers, Jesse doc/libstddjb/djbunix.html | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/libstddjb/djbunix.html b/doc/libstddjb/djbunix.html index 95f3777..ce34d25 100644 --- a/doc/libstddjb/djbunix.html +++ b/doc/libstddjb/djbunix.html @@ -560,8 +560,8 @@ The function returns 1 if it succeeds, or 0 (and sets errno) if it fails. int openwritenclose_suffix (char const *file, char const *s, size_t len, char const *suffix) int openwritenclose_suffix_sync (char const *file, char const *s, size_t len, char const *suffix) -int openwritenclose_suffix_devino (char const *file, char const *s, size_t len, dev_t *dev, ino_t *ino, char const *suffix) -int openwritenclose_suffix_devino_sync (char const *file, char const *s, size_t len, dev_t *dev, ino_t *ino, char const *suffix) +int openwritenclose_suffix_devino (char const *file, char const *s, size_t len, char const *suffix, dev_t *dev, ino_t *ino) +int openwritenclose_suffix_devino_sync (char const *file, char const *s, size_t len, char const *suffix, dev_t *dev, ino_t *ino) Trivial shortcuts around openwritenclose_suffix_internal(). The reader can easily figure out what they do. @@ -591,10 +591,10 @@ elements instead of a single string. - int openwritenclose_suffix (char const *file, char const *s, size_t len, char const *suffix) -int openwritenclose_suffix_sync (char const *file, char const *s, size_t len, char const *suffix) -int openwritenclose_suffix_devino (char const *file, char const *s, size_t len, dev_t *dev, ino_t *ino, char const *suffix) -int openwritenclose_suffix_devino_sync (char const *file, char const *s, size_t len, dev_t *dev, ino_t *ino, char const *suffix) + int openwritevnclose_suffix (char const *file, struct iovec const *v, unsigned int vlen, char const *suffix) +int openwritevnclose_suffix_sync (char const *file, struct iovec const *v, unsigned int vlen, char const *suffix) +int openwritevnclose_suffix_devino (char const *file, struct iovec const *v, unsigned int vlen, char const *suffix, dev_t *dev, ino_t *ino) +int openwritevnclose_suffix_devino_sync (char const *file, struct iovec const *v, unsigned int vlen, char const *suffix, dev_t *dev, ino_t *ino) Trivial wrappers around openwritevnclose_suffix_internal(). -- 2.32.0
Re: [PATCH] s6-supervise: Optionally run child in a new pid namespace
On Sun, 16 Jul 2017 08:41:03 + "Laurent Bercot"wrote: > As I told Jesse on IRC, the patch isn't going in. I'm not including > OS-specific code into s6, even with a compile-time option. The main > reason for it is that it changes the API: the choice to spawn the > service in a new namespace or not should be made at run time, so > it would introduce a new file in the service directory that would only > be valid under Linux, and the file would need to be supported by > s6-rc and friends even on other systems, etc. This is exactly the kind > of complexity created by OS divergences that plagues the Unix world > and that I very much want to avoid. This change itself looks quite > simple, but it would be a precedent and the slope is extremely > slippery. > > > >Though as Jesse explained, this requires some sort of exit/signal > >proxing, which isn't the case here. Here the direct child of > >s6-supervise remains the daemon itself - in its own pid ns - which is > >much better. > It would unarguably be more elegant, yes, but since there's a way to > do without it, it's only about elegance, not feasability - and I > really think the cost for elegance is too high. > > execline's 'trap' binary can adequately perform the needed proxying > at low resource cost. There's a reliability issue as well, one thing trap won't be able to do is relay KILL and other uncatchable signals. The proxy and the service aren't completely sharing the same fate as each other, which may break users expectations when they run, e.g. s6-svc -k. This fate sharing mismatch is similar to how s6-sudod may keep running despite s6-sudoc's death. To reliably deliver a KILL signal to the service without re-introducing pid races, the proxy would need a non-fatal way to know to send the signal, requiring a new API. Improving on execline's trap program to handle this would probably transform it into a version of daemontools supervise that exits rather than respawns. If there were no interposing process, an added benefit is that killing the pid 1 in a namespace also kills all the other processes in that namespace. This is a secondary thought, but we get it for free. Using pid namespaces isn't necessary to reliably kill a set of processes on Linux, another technique is to use a cgroup freezer, which essentially puts a write lock on a set of processes, so that an unrelated process can send signals without the TOCTOU races. > If more various namespace feature requests come in, at some point I > will look for a way to integrate some namespace functions into > skalibs, with proper compile-time guards and stubs, and then > reconsider; but as long as there are ways to achieve the desired > outcome with external tools, it's not a priority. Unsharing everything, except for the pid namespace, should take effect immediately on the calling process. CLONE_NEWNS is special since the pid 1 effect gets applied to the process' first child, rather than the process itself. This means the unshare tool from util-linux should be able to do everything else in the usual exec chainloading style. Alternative to patching s6-supervise, what do you think about adding an option to spawn a different supervisor from s6-svscan? An example API could be, s6-svscan will spawn "./supervisor" for each service dir it tracks, if it doesn't exist, it just spawns s6-supervise as usual. Note that having ./supervisor consist of "unshare -p s6-supervise" will prevent s6-supervise from re-spawning its supervised child. fork() will succeed the first time, but error with ENOMEM on subsequent calls unless unshare(CLONE_NEWPID) is called again. In fact, the patch could probably avoid refactoring the child into a continuation by doing unshare() ; fork() ; rather than clone().
[PATCH] s6-supervise: Optionally run child in a new pid namespace
This patch modifies s6-supervise to use the Linux specific clone() system call to enable the child process to become the pid 1 of a new pid namespace. To enable it, compile with -DWANT_CLONE_NEWPID and make the ./clone-newpid file readable to s6-supervise in the desired service directories. I ask that this be included in s6-supervise.c because doing unshare(CLONE_NEWPID) in the child process doesn't change the process's pid to 1. Rather, it runs the next spawned child as pid 1. After spawning that first process, the parent is prevented from spawning any future children, subsequent attempts will fail with ENOMEM. Changing s6-supervise to use clone() avoids these limitations as well as avoiding extending the supervision chain, which would make exit/signal proxying necessary. To see correct ps output, /proc needs to be remounted. To avoid conflicts with the parent pid namespace's /proc, this is done in a new mount namespace. For example: #!/bin/execlineb -P unshare -m -- foreground { umount /proc } if -- { mount -t proc proc /proc } exec ... The functions added in this patch could be migrated into skalibs or libs6, but I wanted to start with this as a PoC without making API changes. Jesse --- src/supervision/s6-supervise.c | 87 +++--- 1 file changed, 65 insertions(+), 22 deletions(-) diff --git a/src/supervision/s6-supervise.c b/src/supervision/s6-supervise.c index 2e8fa38..7605a82 100644 --- a/src/supervision/s6-supervise.c +++ b/src/supervision/s6-supervise.c @@ -9,6 +9,9 @@ #include #include #include +#ifdef WANT_CLONE_NEWPID +# include +#endif #include #include #include @@ -203,6 +206,67 @@ static int maybesetsid (void) return 1 ; } +static void exec_run(int p[2], int notifyp[2], int fd) gccattr_noreturn ; +static void exec_run(int p[2], int notifyp[2], int fd) +{ + char const *cargv[2] = { "run", 0 } ; + PROG = "s6-supervise (child)" ; + selfpipe_finish() ; + if (notifyp[0] >= 0) close(notifyp[0]) ; + close(p[0]) ; + if (notifyp[1] >= 0 && fd_move(fd, notifyp[1]) < 0) + { +failcoe(p[1]) ; +strerr_diefu1sys(127, "move notification descriptor") ; + } + if (!maybesetsid()) + { +failcoe(p[1]) ; +strerr_diefu1sys(127, "access ./nosetsid") ; + } + execve("./run", (char *const *)cargv, (char *const *)environ) ; + failcoe(p[1]) ; + strerr_dieexec(127, "run") ; +} + +static pid_t spawn_run_fork(int p[2], int notifyp[2], int fd) +{ + pid_t pid = fork() ; + if (!pid) exec_run(p, notifyp, fd) ; + return pid ; +} + +#ifdef WANT_CLONE_NEWPID +typedef struct +{ + int p[2] ; + int notifyp[2] ; + int fd ; +} exec_run_t ; + +static int exec_run_shim(void *ctx) gccattr_noreturn ; +static int exec_run_shim(void *ctx) +{ + exec_run_t *er = (exec_run_t *) ctx ; + exec_run(er->p, er->notifyp, er->fd) ; +} + +static pid_t spawn_run(int p[2], int notifyp[2], int fd) +{ + exec_run_t arg = { { p[0], p[1] }, { notifyp[0], notifyp[1] }, fd } ; + char child_stack[SIGSTKSZ] ; + if (access("clone-newpid", F_OK) < 0 && errno == ENOENT) +return spawn_run_fork(p, notifyp, fd) ; + return (pid_t) clone(_run_shim, child_stack + sizeof(child_stack), + CLONE_NEWPID | SIGCHLD, ) ; +} +#else /* if !defined(WANT_CLONE_NEWPID) */ +static pid_t spawn_run(int p[2], int notifyp[2], int fd) +{ + return spawn_run_fork(p, notifyp, fd) ; +} +#endif /* defined(WANT_CLONE_NEWPID) */ + static void trystart (void) { int p[2] ; @@ -222,7 +286,7 @@ static void trystart (void) fd_close(p[1]) ; fd_close(p[0]) ; return ; } - pid = fork() ; + pid = spawn_run(p, notifyp, (int)fd) ; if (pid < 0) { settimeout(60) ; @@ -232,27 +296,6 @@ static void trystart (void) fd_close(p[1]) ; fd_close(p[0]) ; return ; } - else if (!pid) - { -char const *cargv[2] = { "run", 0 } ; -PROG = "s6-supervise (child)" ; -selfpipe_finish() ; -if (notifyp[0] >= 0) close(notifyp[0]) ; -close(p[0]) ; -if (notifyp[1] >= 0 && fd_move((int)fd, notifyp[1]) < 0) -{ - failcoe(p[1]) ; - strerr_diefu1sys(127, "move notification descriptor") ; -} -if (!maybesetsid()) -{ - failcoe(p[1]) ; - strerr_diefu1sys(127, "access ./nosetsid") ; -} -execve("./run", (char *const *)cargv, (char *const *)environ) ; -failcoe(p[1]) ; -strerr_dieexec(127, "run") ; - } if (notifyp[1] >= 0) fd_close(notifyp[1]) ; fd_close(p[1]) ; { -- 2.13.1
Re: [PATCH] socket_local46: Make room on the stack for IPv6 sockaddrs
On Sun, 26 Jul 2015 21:29:31 +0200 Laurent Bercot ska-skaw...@skarnet.org wrote: On 26/07/2015 21:15, Jesse Young wrote: byte_copy() reads past the end of the sockaddr structure because it isn't sufficiently large enough to handle sockaddr_in6 addresses resulting in undefined behavior. armv6-alpine-linux-muslgnueabihf-gcc 5.1.0-r0 generates the UDF instruction in this case, causing programs to SIGILL. Ah, good catch. sockaddr used to work, but it's stricto sensu incorrect - and musl is very touchy with standards. I believe the right fix is to use sockaddr_storage, though. I'll commit a fix tomorrow. Thanks ! I don't think sockaddr_storage is strictly necessary in this instance. getsockname() will truncate its result, but socket_local46() won't use that result unless sa_family is AF_INET or AF_INET6. That being said, the only penalty to using sockaddr_storage is the extra space, and it'd be more robust in cases of buggy implementations of getsockname(). Jesse
[PATCH] socket_local46: Make room on the stack for IPv6 sockaddrs
byte_copy() reads past the end of the sockaddr structure because it isn't sufficiently large enough to handle sockaddr_in6 addresses resulting in undefined behavior. armv6-alpine-linux-muslgnueabihf-gcc 5.1.0-r0 generates the UDF instruction in this case, causing programs to SIGILL. --- src/libstddjb/socket_local46.c | 22 -- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/libstddjb/socket_local46.c b/src/libstddjb/socket_local46.c index 448160c..611eb37 100644 --- a/src/libstddjb/socket_local46.c +++ b/src/libstddjb/socket_local46.c @@ -12,21 +12,23 @@ int socket_local46 (int s, ip46_t *ip, uint16 *port) { - struct sockaddr sa ; + union { +struct sockaddr_in6 sa6 ; +struct sockaddr_in sa4 ; +struct sockaddr sa ; + } sa ; socklen_t dummy = sizeof sa ; - if (getsockname(s, sa, dummy) 0) return -1 ; - if (sa.sa_family == AF_INET6) + if (getsockname(s, sa.sa, dummy) 0) return -1 ; + if (sa.sa.sa_family == AF_INET6) { -register struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa ; -byte_copy(ip-ip, 16, (char const *)sa6-sin6_addr.s6_addr) ; -uint16_unpack_big((char *)sa6-sin6_port, port) ; +byte_copy(ip-ip, 16, (char const *)sa.sa6.sin6_addr.s6_addr) ; +uint16_unpack_big((char *)sa.sa6.sin6_port, port) ; ip-is6 = 1 ; } - else if (sa.sa_family == AF_INET) + else if (sa.sa.sa_family == AF_INET) { -register struct sockaddr_in *sa4 = (struct sockaddr_in *)sa ; -byte_copy(ip-ip, 4, (char const *)sa4-sin_addr.s_addr) ; -uint16_unpack_big((char *)sa4-sin_port, port) ; +byte_copy(ip-ip, 4, (char const *)sa.sa4.sin_addr.s_addr) ; +uint16_unpack_big((char *)sa.sa4.sin_port, port) ; ip-is6 = 0 ; } else return (errno = EAFNOSUPPORT, -1) ; -- 2.4.6