The following pull request was submitted through GitHub. It can be accessed and reviewed at: https://github.com/lxc/lxc/pull/3412
This e-mail was sent by the LXC bot; direct replies will not reach the author unless they happen to be subscribed to this list. === Description (from pull-request) === Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com>
From 78ae61d86cd502df5f757ba4ba5cf5304aad55d2 Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian.brau...@ubuntu.com> Date: Fri, 15 May 2020 11:44:46 +0200 Subject: [PATCH 1/5] syscall_numbers: handle ia64 syscall numbers correctly They are offset by 1024. Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- src/lxc/syscall_numbers.h | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/src/lxc/syscall_numbers.h b/src/lxc/syscall_numbers.h index e7a9dd9a4f..c4be407a47 100644 --- a/src/lxc/syscall_numbers.h +++ b/src/lxc/syscall_numbers.h @@ -40,7 +40,7 @@ #elif defined __sparc__ #define __NR_keyctl 283 #elif defined __ia64__ - #define __NR_keyctl 249 + #define __NR_keyctl (249 + 1024) #elif defined _MIPS_SIM #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ #define __NR_keyctl 4282 @@ -112,7 +112,7 @@ #elif defined __sparc__ #define __NR_pivot_root 146 #elif defined __ia64__ - #define __NR_pivot_root 183 + #define __NR_pivot_root (183 + 1024) #elif defined _MIPS_SIM #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ #define __NR_pivot_root 4216 @@ -147,7 +147,7 @@ #elif defined __sparc__ #define __NR_setns 337 #elif defined __ia64__ - #define __NR_setns 306 + #define __NR_setns (306 + 1024) #elif defined _MIPS_SIM #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ #define __NR_setns 4344 @@ -182,7 +182,7 @@ #elif defined __sparc__ #define __NR_sethostname 88 #elif defined __ia64__ - #define __NR_sethostname 59 + #define __NR_sethostname (59 + 1024) #elif defined _MIPS_SIM #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ #define __NR_sethostname 474 @@ -217,7 +217,7 @@ #elif defined __sparc__ #define __NR_signalfd 311 #elif defined __ia64__ - #define __NR_signalfd 283 + #define __NR_signalfd (283 + 1024) #elif defined _MIPS_SIM #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ #define __NR_signalfd 4317 @@ -252,7 +252,7 @@ #elif defined __sparc__ #define __NR_signalfd4 317 #elif defined __ia64__ - #define __NR_signalfd4 289 + #define 
__NR_signalfd4 (289 + 1024) #elif defined _MIPS_SIM #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ #define __NR_signalfd4 4324 @@ -287,7 +287,7 @@ #elif defined __sparc__ #define __NR_unshare 299 #elif defined __ia64__ - #define __NR_unshare 272 + #define __NR_unshare (272 + 1024) #elif defined _MIPS_SIM #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ #define __NR_unshare 4303 @@ -322,7 +322,7 @@ #elif defined __sparc__ #define __NR_bpf 349 #elif defined __ia64__ - #define __NR_bpf 317 + #define __NR_bpf (317 + 1024) #elif defined _MIPS_SIM #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ #define __NR_bpf 4355 @@ -357,7 +357,7 @@ #elif defined __sparc__ #define __NR_faccessat 296 #elif defined __ia64__ - #define __NR_faccessat 269 + #define __NR_faccessat (269 + 1024) #elif defined _MIPS_SIM #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ #define __NR_faccessat 4300 @@ -387,6 +387,8 @@ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ #define __NR_pidfd_send_signal 5424 #endif + #elif defined __ia64__ + #define __NR_pidfd_send_signal (424 + 1024) #else #define __NR_pidfd_send_signal 424 #endif @@ -410,7 +412,7 @@ #elif defined __sparc__ #define __NR_seccomp 346 #elif defined __ia64__ - #define __NR_seccomp 329 + #define __NR_seccomp (329 + 1024) #elif defined _MIPS_SIM #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ #define __NR_seccomp 4352 @@ -445,7 +447,7 @@ #elif defined __sparc__ #define __NR_gettid 143 #elif defined __ia64__ - #define __NR_gettid 81 + #define __NR_gettid (81 + 1024) #elif defined _MIPS_SIM #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ #define __NR_gettid 4222 @@ -484,7 +486,7 @@ #elif defined __sparc__ #define __NR_execveat 350 #elif defined __ia64__ - #define __NR_execveat 318 + #define __NR_execveat (318 + 1024) #elif defined _MIPS_SIM #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ #define __NR_execveat 4356 @@ -514,6 +516,8 @@ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ #define __NR_move_mount 5429 #endif + #elif defined __ia64__ + #define __NR_move_mount (429 + 1024) #else 
#define __NR_move_mount 429 #endif @@ -532,6 +536,8 @@ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ #define __NR_open_tree 5428 #endif + #elif defined __ia64__ + #define __NR_open_tree (428 + 1024) #else #define __NR_open_tree 428 #endif From 923d3a2dba12dee9a543af4757f1c37f83007a00 Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian.brau...@ubuntu.com> Date: Fri, 15 May 2020 11:48:25 +0200 Subject: [PATCH 2/5] syscall_numbers: add clone3() Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- src/lxc/syscall_numbers.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/lxc/syscall_numbers.h b/src/lxc/syscall_numbers.h index c4be407a47..bfd0e57ab9 100644 --- a/src/lxc/syscall_numbers.h +++ b/src/lxc/syscall_numbers.h @@ -543,4 +543,24 @@ #endif #endif +#ifndef __NR_clone3 + #if defined __alpha__ + #define __NR_clone3 545 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_clone3 4435 + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_clone3 6435 + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_clone3 5435 + #endif + #elif defined __ia64__ + #define __NR_clone3 (435 + 1024) + #else + #define __NR_clone3 435 + #endif +#endif + #endif /* __LXC_SYSCALL_NUMBERS_H */ From f40988c7736333b67f55dcc9e3a3340f7793a16f Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian.brau...@ubuntu.com> Date: Fri, 15 May 2020 12:32:28 +0200 Subject: [PATCH 3/5] process_utils: introduce new process_utils.{c,h} This will be the central place for all process management helpers. This also removes raw_syscalls.{c,h}. 
Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- configure.ac | 2 +- src/include/fexecve.c | 2 +- src/lxc/Makefile.am | 10 +- src/lxc/af_unix.c | 2 +- src/lxc/attach.c | 2 +- src/lxc/cmd/lxc_init.c | 2 +- src/lxc/cmd/lxc_monitord.c | 2 +- src/lxc/cmd/lxc_user_nic.c | 2 +- src/lxc/conf.c | 4 +- src/lxc/execute.c | 4 +- src/lxc/lsm/apparmor.c | 2 +- src/lxc/lxccontainer.c | 2 +- src/lxc/namespace.c | 27 --- src/lxc/namespace.h | 90 ------- src/lxc/network.c | 2 +- src/lxc/{raw_syscalls.c => process_utils.c} | 36 ++- src/lxc/process_utils.h | 248 ++++++++++++++++++++ src/lxc/raw_syscalls.h | 94 -------- src/lxc/rexec.c | 2 +- src/lxc/start.c | 2 +- src/lxc/utils.c | 2 +- src/lxc/utils.h | 2 +- src/tests/Makefile.am | 2 +- src/tests/lxc_raw_clone.c | 2 +- 24 files changed, 303 insertions(+), 242 deletions(-) rename src/lxc/{raw_syscalls.c => process_utils.c} (82%) create mode 100644 src/lxc/process_utils.h delete mode 100644 src/lxc/raw_syscalls.h diff --git a/configure.ac b/configure.ac index c55810831e..7f589f9405 100644 --- a/configure.ac +++ b/configure.ac @@ -622,7 +622,7 @@ AC_CHECK_HEADER([ifaddrs.h], AC_HEADER_MAJOR # Check for some syscalls functions -AC_CHECK_FUNCS([setns pivot_root sethostname unshare rand_r confstr faccessat gettid memfd_create move_mount open_tree]) +AC_CHECK_FUNCS([setns pivot_root sethostname unshare rand_r confstr faccessat gettid memfd_create move_mount open_tree execveat]) # Check for strerror_r() support. 
Defines: # - HAVE_STRERROR_R if available diff --git a/src/include/fexecve.c b/src/include/fexecve.c index 123f273098..0627cc802d 100644 --- a/src/include/fexecve.c +++ b/src/include/fexecve.c @@ -29,7 +29,7 @@ #include <fcntl.h> #include "config.h" #include "macro.h" -#include "raw_syscalls.h" +#include "process_utils.h" int fexecve(int fd, char *const argv[], char *const envp[]) { diff --git a/src/lxc/Makefile.am b/src/lxc/Makefile.am index c374c2d0ac..d1e23647e0 100644 --- a/src/lxc/Makefile.am +++ b/src/lxc/Makefile.am @@ -27,7 +27,7 @@ noinst_HEADERS = api_extensions.h \ memory_utils.h \ monitor.h \ namespace.h \ - raw_syscalls.h \ + process_utils.h \ rexec.h \ start.h \ state.h \ @@ -128,7 +128,7 @@ liblxc_la_SOURCES = af_unix.c af_unix.h \ network.c network.h \ monitor.c monitor.h \ parse.c parse.h \ - raw_syscalls.c raw_syscalls.h \ + process_utils.c process_utils.h \ ringbuf.c ringbuf.h \ rtnl.c rtnl.h \ state.c state.h \ @@ -384,7 +384,7 @@ init_lxc_SOURCES = cmd/lxc_init.c \ initutils.c initutils.h \ memory_utils.h \ parse.c parse.h \ - raw_syscalls.c raw_syscalls.h \ + process_utils.c process_utils.h \ syscall_numbers.h \ string_utils.c string_utils.h @@ -395,7 +395,7 @@ lxc_monitord_SOURCES = cmd/lxc_monitord.c \ log.c log.h \ mainloop.c mainloop.h \ monitor.c monitor.h \ - raw_syscalls.c raw_syscalls.h \ + process_utils.c process_utils.h \ syscall_numbers.h \ utils.c utils.h lxc_user_nic_SOURCES = cmd/lxc_user_nic.c \ @@ -404,7 +404,7 @@ lxc_user_nic_SOURCES = cmd/lxc_user_nic.c \ memory_utils.h \ network.c network.h \ parse.c parse.h \ - raw_syscalls.c raw_syscalls.h \ + process_utils.c process_utils.h \ syscall_numbers.h \ file_utils.c file_utils.h \ string_utils.c string_utils.h \ diff --git a/src/lxc/af_unix.c b/src/lxc/af_unix.c index bf626a109e..5cf54917f1 100644 --- a/src/lxc/af_unix.c +++ b/src/lxc/af_unix.c @@ -18,7 +18,7 @@ #include "log.h" #include "macro.h" #include "memory_utils.h" -#include "raw_syscalls.h" +#include "process_utils.h" 
#include "utils.h" #ifndef HAVE_STRLCPY diff --git a/src/lxc/attach.c b/src/lxc/attach.c index bbf95bd5b6..57c7f46170 100644 --- a/src/lxc/attach.c +++ b/src/lxc/attach.c @@ -40,7 +40,7 @@ #include "mainloop.h" #include "memory_utils.h" #include "namespace.h" -#include "raw_syscalls.h" +#include "process_utils.h" #include "syscall_wrappers.h" #include "terminal.h" #include "utils.h" diff --git a/src/lxc/cmd/lxc_init.c b/src/lxc/cmd/lxc_init.c index a52793343a..a03631f1a4 100644 --- a/src/lxc/cmd/lxc_init.c +++ b/src/lxc/cmd/lxc_init.c @@ -28,7 +28,7 @@ #include "initutils.h" #include "memory_utils.h" #include "parse.h" -#include "raw_syscalls.h" +#include "process_utils.h" #include "string_utils.h" /* option keys for long only options */ diff --git a/src/lxc/cmd/lxc_monitord.c b/src/lxc/cmd/lxc_monitord.c index 7318df9542..da7db28207 100644 --- a/src/lxc/cmd/lxc_monitord.c +++ b/src/lxc/cmd/lxc_monitord.c @@ -28,7 +28,7 @@ #include "log.h" #include "mainloop.h" #include "monitor.h" -#include "raw_syscalls.h" +#include "process_utils.h" #include "utils.h" #define CLIENTFDS_CHUNK 64 diff --git a/src/lxc/cmd/lxc_user_nic.c b/src/lxc/cmd/lxc_user_nic.c index edb2d8f03f..4160565f36 100644 --- a/src/lxc/cmd/lxc_user_nic.c +++ b/src/lxc/cmd/lxc_user_nic.c @@ -36,7 +36,7 @@ #include "memory_utils.h" #include "network.h" #include "parse.h" -#include "raw_syscalls.h" +#include "process_utils.h" #include "string_utils.h" #include "syscall_wrappers.h" #include "utils.h" diff --git a/src/lxc/conf.c b/src/lxc/conf.c index e2e2f9e97a..5cbca60006 100644 --- a/src/lxc/conf.c +++ b/src/lxc/conf.c @@ -51,7 +51,7 @@ #include "namespace.h" #include "network.h" #include "parse.h" -#include "raw_syscalls.h" +#include "process_utils.h" #include "ringbuf.h" #include "start.h" #include "storage.h" @@ -3245,7 +3245,7 @@ static bool verify_start_hooks(struct lxc_conf *conf) static bool execveat_supported(void) { - lxc_raw_execveat(-1, "", NULL, NULL, AT_EMPTY_PATH); + execveat(-1, "", NULL, 
NULL, AT_EMPTY_PATH); if (errno == ENOSYS) return false; diff --git a/src/lxc/execute.c b/src/lxc/execute.c index 7dd835862f..7175ef2cf2 100644 --- a/src/lxc/execute.c +++ b/src/lxc/execute.c @@ -14,7 +14,7 @@ #include "config.h" #include "log.h" #include "start.h" -#include "raw_syscalls.h" +#include "process_utils.h" #include "utils.h" lxc_log_define(execute, start); @@ -66,7 +66,7 @@ static int execute_start(struct lxc_handler *handler, void* data) NOTICE("Exec'ing \"%s\"", my_args->argv[0]); if (my_args->init_fd >= 0) - lxc_raw_execveat(my_args->init_fd, "", argv, environ, AT_EMPTY_PATH); + execveat(my_args->init_fd, "", argv, environ, AT_EMPTY_PATH); else execvp(argv[0], argv); SYSERROR("Failed to exec %s", argv[0]); diff --git a/src/lxc/lsm/apparmor.c b/src/lxc/lsm/apparmor.c index 4fc18eb438..cef95c8542 100644 --- a/src/lxc/lsm/apparmor.c +++ b/src/lxc/lsm/apparmor.c @@ -19,7 +19,7 @@ #include "log.h" #include "lsm.h" #include "parse.h" -#include "raw_syscalls.h" +#include "process_utils.h" #include "utils.h" lxc_log_define(apparmor, lsm); diff --git a/src/lxc/lxccontainer.c b/src/lxc/lxccontainer.c index 48018fe329..9c9d023b87 100644 --- a/src/lxc/lxccontainer.c +++ b/src/lxc/lxccontainer.c @@ -49,7 +49,7 @@ #include "namespace.h" #include "network.h" #include "parse.h" -#include "raw_syscalls.h" +#include "process_utils.h" #include "start.h" #include "state.h" #include "storage.h" diff --git a/src/lxc/namespace.c b/src/lxc/namespace.c index 38d2ae5d71..f2e0175630 100644 --- a/src/lxc/namespace.c +++ b/src/lxc/namespace.c @@ -21,33 +21,6 @@ lxc_log_define(namespace, lxc); -/* - * Let's use the "standard stack limit" (i.e. glibc thread size default) for - * stack sizes: 8MB. 
- */ -#define __LXC_STACK_SIZE (8 * 1024 * 1024) -pid_t lxc_clone(int (*fn)(void *), void *arg, int flags, int *pidfd) -{ - pid_t ret; - void *stack; - - stack = malloc(__LXC_STACK_SIZE); - if (!stack) { - SYSERROR("Failed to allocate clone stack"); - return -ENOMEM; - } - -#ifdef __ia64__ - ret = __clone2(fn, stack, __LXC_STACK_SIZE, flags | SIGCHLD, arg, pidfd); -#else - ret = clone(fn, stack + __LXC_STACK_SIZE, flags | SIGCHLD, arg, pidfd); -#endif - if (ret < 0) - SYSERROR("Failed to clone (%#x)", flags); - - return ret; -} - /* Leave the user namespace at the first position in the array of structs so * that we always attach to it first when iterating over the struct and using * setns() to switch namespaces. This especially affects lxc_attach(): Suppose diff --git a/src/lxc/namespace.h b/src/lxc/namespace.h index a8fda783c3..84976f60f2 100644 --- a/src/lxc/namespace.h +++ b/src/lxc/namespace.h @@ -7,63 +7,6 @@ #include <unistd.h> #include <sys/syscall.h> -#ifndef CLONE_PARENT_SETTID -#define CLONE_PARENT_SETTID 0x00100000 -#endif - -#ifndef CLONE_CHILD_CLEARTID -#define CLONE_CHILD_CLEARTID 0x00200000 -#endif - -#ifndef CLONE_CHILD_SETTID -#define CLONE_CHILD_SETTID 0x01000000 -#endif - -#ifndef CLONE_VFORK -#define CLONE_VFORK 0x00004000 -#endif - -#ifndef CLONE_THREAD -#define CLONE_THREAD 0x00010000 -#endif - -#ifndef CLONE_SETTLS -#define CLONE_SETTLS 0x00080000 -#endif - -#ifndef CLONE_VM -#define CLONE_VM 0x00000100 -#endif - -#ifndef CLONE_FILES -#define CLONE_FILES 0x00000400 -#endif - -#ifndef CLONE_FS -# define CLONE_FS 0x00000200 -#endif -#ifndef CLONE_NEWNS -# define CLONE_NEWNS 0x00020000 -#endif -#ifndef CLONE_NEWCGROUP -# define CLONE_NEWCGROUP 0x02000000 -#endif -#ifndef CLONE_NEWUTS -# define CLONE_NEWUTS 0x04000000 -#endif -#ifndef CLONE_NEWIPC -# define CLONE_NEWIPC 0x08000000 -#endif -#ifndef CLONE_NEWUSER -# define CLONE_NEWUSER 0x10000000 -#endif -#ifndef CLONE_NEWPID -# define CLONE_NEWPID 0x20000000 -#endif -#ifndef CLONE_NEWNET -# 
define CLONE_NEWNET 0x40000000 -#endif - enum { LXC_NS_USER, LXC_NS_MNT, @@ -82,39 +25,6 @@ extern const struct ns_info { const char *env_name; } ns_info[LXC_NS_MAX]; -#if defined(__ia64__) -int __clone2(int (*__fn) (void *__arg), void *__child_stack_base, - size_t __child_stack_size, int __flags, void *__arg, ...); -#else -int clone(int (*fn)(void *), void *child_stack, - int flags, void *arg, ... - /* pid_t *ptid, struct user_desc *tls, pid_t *ctid */ ); -#endif - -/** - * lxc_clone() - create a new process - * - * - allocate stack: - * This function allocates a new stack the size of page and passes it to the - * kernel. - * - * - support all CLONE_*flags: - * This function supports all CLONE_* flags. If in doubt or not sufficiently - * familiar with process creation in the kernel and interactions with libcs - * this function should be used. - * - * - pthread_atfork() handlers depending on libc: - * Whether this function runs pthread_atfork() handlers depends on the - * corresponding libc wrapper. glibc currently does not run pthread_atfork() - * handlers but does not guarantee that they are not. Other libcs might or - * might not run pthread_atfork() handlers. If you require guarantees please - * refer to the lxc_raw_clone*() functions in raw_syscalls.{c,h}. - * - * - should call lxc_raw_getpid(): - * The child should use lxc_raw_getpid() to retrieve its pid. 
- */ -extern pid_t lxc_clone(int (*fn)(void *), void *arg, int flags, int *pidfd); - extern int lxc_namespace_2_cloneflag(const char *namespace); extern int lxc_namespace_2_ns_idx(const char *namespace); extern int lxc_namespace_2_std_identifiers(char *namespaces); diff --git a/src/lxc/network.c b/src/lxc/network.c index a825180cf7..eaab9eccfe 100644 --- a/src/lxc/network.c +++ b/src/lxc/network.c @@ -36,7 +36,7 @@ #include "memory_utils.h" #include "network.h" #include "nl.h" -#include "raw_syscalls.h" +#include "process_utils.h" #include "syscall_wrappers.h" #include "utils.h" diff --git a/src/lxc/raw_syscalls.c b/src/lxc/process_utils.c similarity index 82% rename from src/lxc/raw_syscalls.c rename to src/lxc/process_utils.c index 3c6bd2506d..89abddec54 100644 --- a/src/lxc/raw_syscalls.c +++ b/src/lxc/process_utils.c @@ -13,15 +13,12 @@ #include "compiler.h" #include "config.h" +#include "log.h" #include "macro.h" -#include "raw_syscalls.h" +#include "process_utils.h" #include "syscall_numbers.h" -int lxc_raw_execveat(int dirfd, const char *pathname, char *const argv[], - char *const envp[], int flags) -{ - return syscall(__NR_execveat, dirfd, pathname, argv, envp, flags); -} +lxc_log_define(process_utils, lxc); /* * This is based on raw_clone in systemd but adapted to our needs. This uses @@ -124,3 +121,30 @@ int lxc_raw_pidfd_send_signal(int pidfd, int sig, siginfo_t *info, { return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); } + +/* + * Let's use the "standard stack limit" (i.e. glibc thread size default) for + * stack sizes: 8MB. 
+ */ +#define __LXC_STACK_SIZE (8 * 1024 * 1024) +pid_t lxc_clone(int (*fn)(void *), void *arg, int flags, int *pidfd) +{ + pid_t ret; + void *stack; + + stack = malloc(__LXC_STACK_SIZE); + if (!stack) { + SYSERROR("Failed to allocate clone stack"); + return -ENOMEM; + } + +#ifdef __ia64__ + ret = __clone2(fn, stack, __LXC_STACK_SIZE, flags | SIGCHLD, arg, pidfd); +#else + ret = clone(fn, stack + __LXC_STACK_SIZE, flags | SIGCHLD, arg, pidfd); +#endif + if (ret < 0) + SYSERROR("Failed to clone (%#x)", flags); + + return ret; +} diff --git a/src/lxc/process_utils.h b/src/lxc/process_utils.h new file mode 100644 index 0000000000..6016f792ef --- /dev/null +++ b/src/lxc/process_utils.h @@ -0,0 +1,248 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#ifndef __LXC_PROCESS_UTILS_H +#define __LXC_PROCESS_UTILS_H + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif +#include <sched.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/syscall.h> +#include <unistd.h> + +#ifndef CSIGNAL +#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ +#endif + +#ifndef CLONE_VM +#define CLONE_VM 0x00000100 /* set if VM shared between processes */ +#endif + +#ifndef CLONE_FS +#define CLONE_FS 0x00000200 /* set if fs info shared between processes */ +#endif + +#ifndef CLONE_FILES +#define CLONE_FILES 0x00000400 /* set if open files shared between processes */ +#endif + +#ifndef CLONE_SIGHAND +#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ +#endif + +#ifndef CLONE_PIDFD +#define CLONE_PIDFD 0x00001000 /* set if a pidfd should be placed in parent */ +#endif + +#ifndef CLONE_PTRACE +#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ +#endif + +#ifndef CLONE_VFORK +#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ +#endif + +#ifndef CLONE_PARENT +#define CLONE_PARENT 0x00008000 /* set if we want to have 
the same parent as the cloner */ +#endif + +#ifndef CLONE_THREAD +#define CLONE_THREAD 0x00010000 /* Same thread group? */ +#endif + +#ifndef CLONE_NEWNS +#define CLONE_NEWNS 0x00020000 /* New mount namespace group */ +#endif + +#ifndef CLONE_SYSVSEM +#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ +#endif + +#ifndef CLONE_SETTLS +#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ +#endif + +#ifndef CLONE_PARENT_SETTID +#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ +#endif + +#ifndef CLONE_CHILD_CLEARTID +#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ +#endif + +#ifndef CLONE_DETACHED +#define CLONE_DETACHED 0x00400000 /* Unused, ignored */ +#endif + +#ifndef CLONE_UNTRACED +#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ +#endif + +#ifndef CLONE_CHILD_SETTID +#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ +#endif + +#ifndef CLONE_NEWCGROUP +#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */ +#endif + +#ifndef CLONE_NEWUTS +#define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ +#endif + +#ifndef CLONE_NEWIPC +#define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ +#endif + +#ifndef CLONE_NEWUSER +#define CLONE_NEWUSER 0x10000000 /* New user namespace */ +#endif + +#ifndef CLONE_NEWPID +#define CLONE_NEWPID 0x20000000 /* New pid namespace */ +#endif + +#ifndef CLONE_NEWNET +#define CLONE_NEWNET 0x40000000 /* New network namespace */ +#endif + +#ifndef CLONE_IO +#define CLONE_IO 0x80000000 /* Clone io context */ +#endif + +/* Flags for the clone3() syscall. */ +#ifndef CLONE_CLEAR_SIGHAND +#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ +#endif + +#ifndef CLONE_INTO_CGROUP +#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. 
*/ +#endif + +/* + * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 + * syscalls only: + */ +#ifndef CLONE_NEWTIME +#define CLONE_NEWTIME 0x00000080 /* New time namespace */ +#endif + +/* waitid */ +#ifndef P_PIDFD +#define P_PIDFD 3 +#endif + + +#if defined(__ia64__) +int __clone2(int (*__fn)(void *__arg), void *__child_stack_base, + size_t __child_stack_size, int __flags, void *__arg, ...); +#else +int clone(int (*fn)(void *), void *child_stack, int flags, void *arg, ... + /* pid_t *ptid, struct user_desc *tls, pid_t *ctid */); +#endif + +/** + * lxc_clone() - create a new process + * + * - allocate stack: + * This function allocates a new 8MB stack and passes it to the + * kernel. + * + * - support all CLONE_* flags: + * This function supports all CLONE_* flags. If in doubt or not sufficiently + * familiar with process creation in the kernel and interactions with libcs + * this function should be used. + * + * - pthread_atfork() handlers depending on libc: + * Whether this function runs pthread_atfork() handlers depends on the + * corresponding libc wrapper. glibc currently does not run pthread_atfork() + * handlers but does not guarantee that they are not. Other libcs might or + * might not run pthread_atfork() handlers. If you require guarantees please + * refer to the lxc_raw_clone*() functions in process_utils.{c,h}. + * + * - should call lxc_raw_getpid(): + * The child should use lxc_raw_getpid() to retrieve its pid. + */ +extern pid_t lxc_clone(int (*fn)(void *), void *arg, int flags, int *pidfd); + + +/* + * lxc_raw_clone() - create a new process + * + * - fork() behavior: + * This function returns 0 in the child and > 0 in the parent. + * + * - copy-on-write: + * This function does not allocate a new stack and relies on copy-on-write + * semantics. + * + * - supports subset of CLONE_* flags: + * lxc_raw_clone() intentionally only supports a subset of the flags available + * to the actual system call. 
Please refer to the implementation for which flags + * cannot be used. Also, please don't assume that just because a flag isn't + * explicitly checked for as being unsupported that it is supported. If in + * doubt or not sufficiently familiar with process creation in the kernel and + * interactions with libcs this function should be used. + * + * - no pthread_atfork() handlers: + * This function circumvents - as much as this is possible - any libc + * wrappers and thus does not run any pthread_atfork() handlers. Make sure + * that this is safe to do in the context you are trying to call this + * function. + * + * - must call lxc_raw_getpid(): + * The child must use lxc_raw_getpid() to retrieve its pid. + */ +extern pid_t lxc_raw_clone(unsigned long flags, int *pidfd); + +/* + * lxc_raw_clone_cb() - create a new process + * + * - non-fork() behavior: + * Function does return pid of the child or -1 on error. Pass in a callback + * function via the "fn" argument that gets executed in the child process. + * The "args" argument is passed to "fn". + * + * All other comments that apply to lxc_raw_clone() apply to lxc_raw_clone_cb() + * as well. + */ +extern pid_t lxc_raw_clone_cb(int (*fn)(void *), void *args, + unsigned long flags, int *pidfd); + +#ifndef HAVE_EXECVEAT +static inline int execveat(int dirfd, const char *pathname, char *const argv[], + char *const envp[], int flags) +{ + return syscall(__NR_execveat, dirfd, pathname, argv, envp, flags); +} +#else +extern int execveat(int dirfd, const char *pathname, char *const argv[], + char *const envp[], int flags); +#endif + +/* + * Because of older glibc's pid cache (up to 2.25) whenever clone() is called + * the child must retrieve its own pid via lxc_raw_getpid(). 
+ */ +static inline pid_t lxc_raw_getpid(void) +{ + return (pid_t)syscall(SYS_getpid); +} + +static inline pid_t lxc_raw_gettid(void) +{ +#if __NR_gettid > 0 + return syscall(__NR_gettid); +#else + return lxc_raw_getpid(); +#endif +} + +extern int lxc_raw_pidfd_send_signal(int pidfd, int sig, siginfo_t *info, + unsigned int flags); + +#endif /* __LXC_PROCESS_UTILS_H */ diff --git a/src/lxc/raw_syscalls.h b/src/lxc/raw_syscalls.h deleted file mode 100644 index 1219f28f43..0000000000 --- a/src/lxc/raw_syscalls.h +++ /dev/null @@ -1,94 +0,0 @@ -/* SPDX-License-Identifier: LGPL-2.1+ */ - -#ifndef __LXC_RAW_SYSCALL_H -#define __LXC_RAW_SYSCALL_H - -#ifndef _GNU_SOURCE -#define _GNU_SOURCE 1 -#endif -#include <sched.h> -#include <stdbool.h> -#include <stdio.h> -#include <stdlib.h> -#include <signal.h> -#include <sys/syscall.h> -#include <unistd.h> - -/* clone */ -#ifndef CLONE_PIDFD -#define CLONE_PIDFD 0x00001000 -#endif - -/* waitid */ -#ifndef P_PIDFD -#define P_PIDFD 3 -#endif - -/* - * lxc_raw_clone() - create a new process - * - * - fork() behavior: - * This function returns 0 in the child and > 0 in the parent. - * - * - copy-on-write: - * This function does not allocate a new stack and relies on copy-on-write - * semantics. - * - * - supports subset of ClONE_* flags: - * lxc_raw_clone() intentionally only supports a subset of the flags available - * to the actual system call. Please refer to the implementation what flags - * cannot be used. Also, please don't assume that just because a flag isn't - * explicitly checked for as being unsupported that it is supported. If in - * doubt or not sufficiently familiar with process creation in the kernel and - * interactions with libcs this function should be used. - * - * - no pthread_atfork() handlers: - * This function circumvents - as much as this this is possible - any libc - * wrappers and thus does not run any pthread_atfork() handlers. 
Make sure - * that this is safe to do in the context you are trying to call this - * function. - * - * - must call lxc_raw_getpid(): - * The child must use lxc_raw_getpid() to retrieve its pid. - */ -extern pid_t lxc_raw_clone(unsigned long flags, int *pidfd); - -/* - * lxc_raw_clone_cb() - create a new process - * - * - non-fork() behavior: - * Function does return pid of the child or -1 on error. Pass in a callback - * function via the "fn" argument that gets executed in the child process. - * The "args" argument is passed to "fn". - * - * All other comments that apply to lxc_raw_clone() apply to lxc_raw_clone_cb() - * as well. - */ -extern pid_t lxc_raw_clone_cb(int (*fn)(void *), void *args, - unsigned long flags, int *pidfd); - -extern int lxc_raw_execveat(int dirfd, const char *pathname, char *const argv[], - char *const envp[], int flags); - -/* - * Because of older glibc's pid cache (up to 2.25) whenever clone() is called - * the child must must retrieve it's own pid via lxc_raw_getpid(). 
- */ -static inline pid_t lxc_raw_getpid(void) -{ - return (pid_t)syscall(SYS_getpid); -} - -static inline pid_t lxc_raw_gettid(void) -{ -#if __NR_gettid > 0 - return syscall(__NR_gettid); -#else - return lxc_raw_getpid(); -#endif -} - -extern int lxc_raw_pidfd_send_signal(int pidfd, int sig, siginfo_t *info, - unsigned int flags); - -#endif /* __LXC_RAW_SYSCALL_H */ diff --git a/src/lxc/rexec.c b/src/lxc/rexec.c index cd76efb3c4..cf198c0211 100644 --- a/src/lxc/rexec.c +++ b/src/lxc/rexec.c @@ -13,7 +13,7 @@ #include "file_utils.h" #include "macro.h" #include "memory_utils.h" -#include "raw_syscalls.h" +#include "process_utils.h" #include "string_utils.h" #include "syscall_wrappers.h" diff --git a/src/lxc/start.c b/src/lxc/start.c index ba92393ebf..fa84461c30 100644 --- a/src/lxc/start.c +++ b/src/lxc/start.c @@ -47,7 +47,7 @@ #include "monitor.h" #include "namespace.h" #include "network.h" -#include "raw_syscalls.h" +#include "process_utils.h" #include "start.h" #include "storage/storage.h" #include "storage/storage_utils.h" diff --git a/src/lxc/utils.c b/src/lxc/utils.c index 2cf99945fb..88d0f85ee5 100644 --- a/src/lxc/utils.c +++ b/src/lxc/utils.c @@ -35,7 +35,7 @@ #include "memory_utils.h" #include "namespace.h" #include "parse.h" -#include "raw_syscalls.h" +#include "process_utils.h" #include "syscall_wrappers.h" #include "utils.h" diff --git a/src/lxc/utils.h b/src/lxc/utils.h index 45ca5270de..cf2c04251b 100644 --- a/src/lxc/utils.h +++ b/src/lxc/utils.h @@ -25,7 +25,7 @@ #include "initutils.h" #include "macro.h" #include "memory_utils.h" -#include "raw_syscalls.h" +#include "process_utils.h" #include "string_utils.h" /* returns 1 on success, 0 if there were any failures */ diff --git a/src/tests/Makefile.am b/src/tests/Makefile.am index 493b33c9d8..69b8f30fbc 100644 --- a/src/tests/Makefile.am +++ b/src/tests/Makefile.am @@ -30,7 +30,7 @@ lxc_test_parse_config_file_SOURCES = parse_config_file.c \ lxc_test_raw_clone_SOURCES = lxc_raw_clone.c \ lxctest.h \ 
../lxc/namespace.c ../lxc/namespace.h \ - ../lxc/raw_syscalls.c ../lxc/raw_syscalls.h + ../lxc/process_utils.c ../lxc/process_utils.h ../lxc/utils.c ../lxc/utils.h lxc_test_reboot_SOURCES = reboot.c lxc_test_saveconfig_SOURCES = saveconfig.c diff --git a/src/tests/lxc_raw_clone.c b/src/tests/lxc_raw_clone.c index 655454f395..f72e20cccd 100644 --- a/src/tests/lxc_raw_clone.c +++ b/src/tests/lxc_raw_clone.c @@ -39,7 +39,7 @@ #include "lxctest.h" #include "namespace.h" -#include "raw_syscalls.h" +#include "process_utils.h" #include "utils.h" int main(int argc, char *argv[]) From 96086a6b7b4f62c4f397de146b11456efa5327f7 Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian.brau...@ubuntu.com> Date: Fri, 15 May 2020 13:42:56 +0200 Subject: [PATCH 4/5] process_utils: add clone3() support Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- configure.ac | 8 +++++-- src/lxc/process_utils.h | 47 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index 7f589f9405..4e11254b5b 100644 --- a/configure.ac +++ b/configure.ac @@ -622,7 +622,11 @@ AC_CHECK_HEADER([ifaddrs.h], AC_HEADER_MAJOR # Check for some syscalls functions -AC_CHECK_FUNCS([setns pivot_root sethostname unshare rand_r confstr faccessat gettid memfd_create move_mount open_tree execveat]) +AC_CHECK_FUNCS([setns pivot_root sethostname unshare rand_r confstr faccessat gettid memfd_create move_mount open_tree execveat clone3]) +# HAVE_STRUCT_CLONE_ARGS={0,1} +AC_CHECK_TYPES([struct clone_args], [], [], [[#include <linux/sched.h>]]) +AC_CHECK_MEMBERS([struct clone_args.set_tid],[],[],[[#include <linux/sched.h>]]) +AC_CHECK_MEMBERS([struct clone_args.cgroup],[],[],[[#include <linux/sched.h>]]) # Check for strerror_r() support. 
Defines: # - HAVE_STRERROR_R if available @@ -761,7 +765,7 @@ AX_CHECK_COMPILE_FLAG([-Wstringop-overflow], [CFLAGS="$CFLAGS -Wstringop-overflo AX_CHECK_LINK_FLAG([-z relro], [LDFLAGS="$LDFLAGS -z relro"],,[]) AX_CHECK_LINK_FLAG([-z now], [LDFLAGS="$LDFLAGS -z now"],,[]) -CFLAGS="$CFLAGS -Wvla -std=gnu11" +CFLAGS="$CFLAGS -Wvla -std=gnu11 -fms-extensions" if test "x$enable_werror" = "xyes"; then CFLAGS="$CFLAGS -Werror" fi diff --git a/src/lxc/process_utils.h b/src/lxc/process_utils.h index 6016f792ef..8795247596 100644 --- a/src/lxc/process_utils.h +++ b/src/lxc/process_utils.h @@ -6,6 +6,7 @@ #ifndef _GNU_SOURCE #define _GNU_SOURCE 1 #endif +#include <linux/sched.h> #include <sched.h> #include <signal.h> #include <stdbool.h> @@ -14,6 +15,9 @@ #include <sys/syscall.h> #include <unistd.h> +#include "config.h" +#include "syscall_numbers.h" + #ifndef CSIGNAL #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ #endif @@ -136,6 +140,49 @@ #define P_PIDFD 3 #endif +#ifndef CLONE_ARGS_SIZE_VER0 +#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ +#endif + +#ifndef CLONE_ARGS_SIZE_VER1 +#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ +#endif + +#ifndef CLONE_ARGS_SIZE_VER2 +#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ +#endif + +#ifndef HAVE_STRUCT_CLONE_ARGS +struct clone_args { + __aligned_u64 flags; + __aligned_u64 pidfd; + __aligned_u64 child_tid; + __aligned_u64 parent_tid; + __aligned_u64 exit_signal; + __aligned_u64 stack; + __aligned_u64 stack_size; + __aligned_u64 tls; + __aligned_u64 set_tid; + __aligned_u64 set_tid_size; + __aligned_u64 cgroup; +}; +#endif + +struct lxc_clone_args { + struct clone_args; +#ifndef HAVE_STRUCT_CLONE_ARGS_SET_TID + __aligned_u64 set_tid; + __aligned_u64 set_tid_size; +#endif +#ifndef HAVE_STRUCT_CLONE_ARGS_CGROUP + __aligned_u64 cgroup; +#endif +}; + +static inline pid_t lxc_clone3(struct lxc_clone_args *args, size_t size) +{ + return syscall(__NR_clone3, 
(struct clone_args *)args, size); +} #if defined(__ia64__) int __clone2(int (*__fn)(void *__arg), void *__child_stack_base, From 26d02c9ca2fb95d9f233eb2147f667803d34e439 Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian.brau...@ubuntu.com> Date: Fri, 15 May 2020 14:13:07 +0200 Subject: [PATCH 5/5] process_utils: make lxc use clone3() whenever possible No more weird api quirks between architectures and cool new features. Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- src/lxc/process_utils.c | 20 +++++++++++++++++++- src/lxc/process_utils.h | 10 +++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/lxc/process_utils.c b/src/lxc/process_utils.c index 89abddec54..5be89532a3 100644 --- a/src/lxc/process_utils.c +++ b/src/lxc/process_utils.c @@ -28,7 +28,7 @@ lxc_log_define(process_utils, lxc); * The nice thing about this is that we get fork() behavior. That is * lxc_raw_clone() returns 0 in the child and the child pid in the parent. */ -__returns_twice pid_t lxc_raw_clone(unsigned long flags, int *pidfd) +__returns_twice static pid_t __lxc_raw_clone(unsigned long flags, int *pidfd) { /* * These flags don't interest at all so we don't jump through any hoops @@ -97,6 +97,24 @@ __returns_twice pid_t lxc_raw_clone(unsigned long flags, int *pidfd) #endif } +__returns_twice pid_t lxc_raw_clone(unsigned long flags, int *pidfd) +{ + pid_t pid; + struct lxc_clone_args args = { + .flags = flags, + .pidfd = ptr_to_u64(pidfd), + .exit_signal = SIGCHLD, + }; + + pid = lxc_clone3(&args, CLONE_ARGS_SIZE_VER0); + if (pid < 0 && errno == ENOSYS) { + SYSTRACE("Falling back to legacy clone"); + return __lxc_raw_clone(flags, pidfd); + } + + return pid; +} + pid_t lxc_raw_clone_cb(int (*fn)(void *), void *args, unsigned long flags, int *pidfd) { diff --git a/src/lxc/process_utils.h b/src/lxc/process_utils.h index 8795247596..48f3c96f95 100644 --- a/src/lxc/process_utils.h +++ b/src/lxc/process_utils.h @@ -15,6 +15,7 @@ #include 
<sys/syscall.h> #include <unistd.h> +#include "compiler.h" #include "config.h" #include "syscall_numbers.h" @@ -168,6 +169,13 @@ struct clone_args { }; #endif +#ifndef ptr_to_u64 +#define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr))) +#endif +#ifndef u64_to_ptr +#define u64_to_ptr(x) ((void *)(uintptr_t)x) +#endif + struct lxc_clone_args { struct clone_args; #ifndef HAVE_STRUCT_CLONE_ARGS_SET_TID @@ -179,7 +187,7 @@ struct lxc_clone_args { #endif }; -static inline pid_t lxc_clone3(struct lxc_clone_args *args, size_t size) +__returns_twice static inline pid_t lxc_clone3(struct lxc_clone_args *args, size_t size) { return syscall(__NR_clone3, (struct clone_args *)args, size); }
_______________________________________________ lxc-devel mailing list lxc-devel@lists.linuxcontainers.org http://lists.linuxcontainers.org/listinfo/lxc-devel