The following pull request was submitted through GitHub. It can be accessed and reviewed at: https://github.com/lxc/lxd/pull/7735
This e-mail was sent by the LXC bot; direct replies will not reach the author unless they happen to be subscribed to this list. === Description (from pull-request) === Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com>
From 94bcac3febf6790aa9a292ee1bb91d12579e73e1 Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian.brau...@ubuntu.com> Date: Thu, 6 Aug 2020 11:16:19 +0200 Subject: [PATCH] [RFC]: seccomp: enable unpriviled bpf through syscall interception Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- lxd/daemon.go | 8 + lxd/include/lxd_seccomp.h | 23 ++ lxd/include/syscall_numbers.h | 35 +++ lxd/instance/drivers/driver_lxc.go | 11 + lxd/instance/instance_interface.go | 1 + lxd/main_checkfeature.go | 128 ++++++++++- lxd/seccomp/seccomp.go | 336 ++++++++++++++++++++++++----- lxd/sys/os.go | 1 + shared/instance.go | 30 +-- 9 files changed, 502 insertions(+), 71 deletions(-) diff --git a/lxd/daemon.go b/lxd/daemon.go index 6ce21a1c19..2d62378164 100644 --- a/lxd/daemon.go +++ b/lxd/daemon.go @@ -631,6 +631,7 @@ func (d *Daemon) init() error { "pidfd", "seccomp_allow_deny_syntax", "devpts_fd", + "seccomp_proxy_send_notify_fd", } for _, extension := range lxcExtensions { d.os.LXCFeatures[extension] = liblxc.HasApiExtension(extension) @@ -675,6 +676,13 @@ func (d *Daemon) init() error { logger.Infof(" - seccomp listener continue syscalls: no") } + if canUseSeccompListenerAddfd() && d.os.LXCFeatures["seccomp_proxy_send_notify_fd"] { + d.os.SeccompListenerAddfd = true + logger.Infof(" - seccomp listener add file descriptors: yes") + } else { + logger.Infof(" - seccomp listener add file descriptors: no") + } + if d.os.LXCFeatures["devpts_fd"] && canUseNativeTerminals() { d.os.NativeTerminals = true logger.Infof(" - safe native terminal allocation : yes") diff --git a/lxd/include/lxd_seccomp.h b/lxd/include/lxd_seccomp.h index 242347e3e1..976947e4bc 100644 --- a/lxd/include/lxd_seccomp.h +++ b/lxd/include/lxd_seccomp.h @@ -65,4 +65,27 @@ struct seccomp_notif_sizes { struct seccomp_notif_resp) #define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64) #endif + +#ifndef SECCOMP_IOCTL_NOTIF_ADDFD +#define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, struct 
seccomp_notif_addfd) + +/* valid flags for seccomp_notif_addfd */ +#define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */ + +/** + * struct seccomp_notif_addfd + * @id: The ID of the seccomp notification + * @flags: SECCOMP_ADDFD_FLAG_* + * @srcfd: The local fd number + * @newfd: Optional remote FD number if SETFD option is set, otherwise 0. + * @newfd_flags: The O_* flags the remote FD should have applied + */ +struct seccomp_notif_addfd { + __u64 id; + __u32 flags; + __u32 srcfd; + __u32 newfd; + __u32 newfd_flags; +}; +#endif #endif /* LXD_SECCOMP_H */ diff --git a/lxd/include/syscall_numbers.h b/lxd/include/syscall_numbers.h index f953a26911..269b8c795b 100644 --- a/lxd/include/syscall_numbers.h +++ b/lxd/include/syscall_numbers.h @@ -74,4 +74,39 @@ #endif #endif +#ifndef __NR_bpf + #if defined __i386__ + #define __NR_bpf 357 + #elif defined __x86_64__ + #define __NR_bpf 321 + #elif defined __arm__ + #define __NR_bpf 386 + #elif defined __aarch64__ + #define __NR_bpf 386 + #elif defined __s390__ + #define __NR_bpf 351 + #elif defined __powerpc__ + #define __NR_bpf 361 + #elif defined __riscv + #define __NR_bpf 280 + #elif defined __sparc__ + #define __NR_bpf 349 + #elif defined __ia64__ + #define __NR_bpf (317 + 1024) + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_bpf 4355 + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_bpf 6319 + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_bpf 5315 + #endif + #else + #define -1 + #warning "__NR_bpf not defined for your architecture" + #endif +#endif + #endif /* __LXD_SYSCALL_NUMBERS_H */ diff --git a/lxd/instance/drivers/driver_lxc.go b/lxd/instance/drivers/driver_lxc.go index 6b3de9a40e..aa1f0f52e0 100644 --- a/lxd/instance/drivers/driver_lxc.go +++ b/lxd/instance/drivers/driver_lxc.go @@ -6665,6 +6665,17 @@ func (c *lxc) DevptsFd() (*os.File, error) { return c.c.DevptsFd() } +// SeccompNotifyFd returns seccomp notify fd of the 
container. +func (c *lxc) SeccompNotifyFd() (*os.File, error) { + // Load the go-lxc struct + err := c.initLXC(false) + if err != nil { + return nil, err + } + + return c.c.SeccompNotifyFd() +} + // LocalConfig returns local config. func (c *lxc) LocalConfig() map[string]string { return c.localConfig diff --git a/lxd/instance/instance_interface.go b/lxd/instance/instance_interface.go index e03db25bfb..c8dbf1cd5d 100644 --- a/lxd/instance/instance_interface.go +++ b/lxd/instance/instance_interface.go @@ -148,6 +148,7 @@ type Container interface { ConsoleLog(opts liblxc.ConsoleLogOptions) (string, error) InsertSeccompUnixDevice(prefix string, m deviceConfig.Device, pid int) error DevptsFd() (*os.File, error) + SeccompNotifyFd() (*os.File, error) } // CriuMigrationArgs arguments for CRIU migration. diff --git a/lxd/main_checkfeature.go b/lxd/main_checkfeature.go index 01995462b9..77a7f53738 100644 --- a/lxd/main_checkfeature.go +++ b/lxd/main_checkfeature.go @@ -293,6 +293,127 @@ static void is_user_notification_continue_aware(void) seccomp_notify_aware = 2; } +__noreturn static void __do_user_notification_addfd(void) +{ + __do_close int listener = -EBADF; + pid_t pid; + int ret; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + struct seccomp_notif_addfd addfd = {}; + struct pollfd pollfd; + + listener = user_trap_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER); + if (listener < 0) + _exit(EXIT_FAILURE); + + pid = fork(); + if (pid < 0) + _exit(EXIT_FAILURE); + + if (pid == 0) { + int dup_fd, pipe_fds[2]; + pid_t self; + + // Don't bother cleaning up. On child exit all of those + // will be closed anyway. + ret = pipe(pipe_fds); + if (ret < 0) + _exit(EXIT_FAILURE); + + // O_CLOEXEC doesn't matter as we're in the child and we're + // not going to exec. 
+ dup_fd = dup(pipe_fds[0]); + if (dup_fd < 0) + _exit(EXIT_FAILURE); + + self = getpid(); + + ret = filecmp(self, self, pipe_fds[0], dup_fd); + if (ret) + _exit(EXIT_FAILURE); + + _exit(EXIT_SUCCESS); + } + + pollfd.fd = listener; + pollfd.events = POLLIN | POLLOUT; + + ret = poll(&pollfd, 1, 5000); + if (ret <= 0) + goto cleanup_sigkill; + + if (!(pollfd.revents & POLLIN)) + goto cleanup_sigkill; + + ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req); + if (ret) + goto cleanup_sigkill; + + pollfd.fd = listener; + pollfd.events = POLLIN | POLLOUT; + + ret = poll(&pollfd, 1, 5000); + if (ret <= 0) + goto cleanup_sigkill; + + if (!(pollfd.revents & POLLOUT)) + goto cleanup_sigkill; + + if (req.data.nr != __NR_dup) + goto cleanup_sigkill; + + addfd.srcfd = 3; + addfd.id = req.id; + addfd.flags = 0; + + // Inject the fd into the task. + ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd); + if (ret < 0) + goto cleanup_sigkill; + close(ret); + + resp.id = req.id; + resp.flags |= SECCOMP_USER_NOTIF_FLAG_CONTINUE; + ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp); + resp.error = -EPERM; + resp.flags = 0; + if (ret) { + ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp); + goto cleanup_sigkill; + } + +cleanup_wait: + ret = wait_for_pid(pid); + if (ret) + _exit(EXIT_FAILURE); + _exit(EXIT_SUCCESS); + +cleanup_sigkill: + kill(pid, SIGKILL); + goto cleanup_wait; +} + +static void is_user_notification_addfd_aware(void) +{ + int ret; + pid_t pid; + + pid = fork(); + if (pid < 0) + return; + + if (pid == 0) { + __do_user_notification_addfd(); + // Should not be reached. 
+ _exit(EXIT_FAILURE); + } + + ret = wait_for_pid(pid); + if (!ret) + seccomp_notify_aware = 3; +} + static void is_seccomp_notify_aware(void) { __u32 action[] = { SECCOMP_RET_USER_NOTIF }; @@ -300,6 +421,8 @@ static void is_seccomp_notify_aware(void) if (syscall(__NR_seccomp, SECCOMP_GET_ACTION_AVAIL, 0, &action[0]) == 0) { seccomp_notify_aware = 1; is_user_notification_continue_aware(); + if (seccomp_notify_aware == 2) + is_user_notification_addfd_aware(); } } @@ -403,9 +526,12 @@ func canUseSeccompListener() bool { } func canUseSeccompListenerContinue() bool { - return bool(C.seccomp_notify_aware == 2) + return bool(C.seccomp_notify_aware >= 2) } +func canUseSeccompListenerAddfd() bool { + return bool(C.seccomp_notify_aware == 3) +} func canUsePidFds() bool { return bool(C.pidfd_aware) } diff --git a/lxd/seccomp/seccomp.go b/lxd/seccomp/seccomp.go index 52e88049a1..d1d6e40dfe 100644 --- a/lxd/seccomp/seccomp.go +++ b/lxd/seccomp/seccomp.go @@ -15,6 +15,8 @@ import ( "strings" "unsafe" + "github.com/pkg/errors" + "golang.org/x/sys/unix" liblxc "gopkg.in/lxc/go-lxc.v2" @@ -41,6 +43,7 @@ import ( #include <elf.h> #include <errno.h> #include <fcntl.h> +#include <linux/bpf.h> #include <linux/seccomp.h> #include <linux/types.h> #include <linux/kdev_t.h> @@ -58,6 +61,8 @@ import ( #include <unistd.h> #include "../include/lxd_seccomp.h" +#include "../include/memory_utils.h" +#include "../include/process_utils.h" struct seccomp_notif_sizes expected_sizes; @@ -123,69 +128,71 @@ struct lxd_seccomp_data_arch { int nr_mknodat; int nr_setxattr; int nr_mount; + int nr_bpf; }; #define LXD_SECCOMP_NOTIFY_MKNOD 0 #define LXD_SECCOMP_NOTIFY_MKNODAT 1 #define LXD_SECCOMP_NOTIFY_SETXATTR 2 #define LXD_SECCOMP_NOTIFY_MOUNT 3 +#define LXD_SECCOMP_NOTIFY_BPF 4 // ordered by likelihood of usage... 
static const struct lxd_seccomp_data_arch seccomp_notify_syscall_table[] = { - { -1, LXD_SECCOMP_NOTIFY_MKNOD, LXD_SECCOMP_NOTIFY_MKNODAT, LXD_SECCOMP_NOTIFY_SETXATTR, LXD_SECCOMP_NOTIFY_MOUNT }, + { -1, LXD_SECCOMP_NOTIFY_MKNOD, LXD_SECCOMP_NOTIFY_MKNODAT, LXD_SECCOMP_NOTIFY_SETXATTR, LXD_SECCOMP_NOTIFY_MOUNT, LXD_SECCOMP_NOTIFY_BPF }, #ifdef AUDIT_ARCH_X86_64 - { AUDIT_ARCH_X86_64, 133, 259, 188, 165 }, + { AUDIT_ARCH_X86_64, 133, 259, 188, 165, 321 }, #endif #ifdef AUDIT_ARCH_I386 - { AUDIT_ARCH_I386, 14, 297, 226, 21 }, + { AUDIT_ARCH_I386, 14, 297, 226, 21, 357 }, #endif #ifdef AUDIT_ARCH_AARCH64 - { AUDIT_ARCH_AARCH64, -1, 33, 5, 21 }, + { AUDIT_ARCH_AARCH64, -1, 33, 5, 21, 386 }, #endif #ifdef AUDIT_ARCH_ARM - { AUDIT_ARCH_ARM, 14, 324, 226, 21 }, + { AUDIT_ARCH_ARM, 14, 324, 226, 21, 386 }, #endif #ifdef AUDIT_ARCH_ARMEB - { AUDIT_ARCH_ARMEB, 14, 324, 226, 21 }, + { AUDIT_ARCH_ARMEB, 14, 324, 226, 21, 386 }, #endif #ifdef AUDIT_ARCH_S390 - { AUDIT_ARCH_S390, 14, 290, 224, 21 }, + { AUDIT_ARCH_S390, 14, 290, 224, 21, 386 }, #endif #ifdef AUDIT_ARCH_S390X - { AUDIT_ARCH_S390X, 14, 290, 224, 21 }, + { AUDIT_ARCH_S390X, 14, 290, 224, 21, 351 }, #endif #ifdef AUDIT_ARCH_PPC - { AUDIT_ARCH_PPC, 14, 288, 209, 21 }, + { AUDIT_ARCH_PPC, 14, 288, 209, 21, 361 }, #endif #ifdef AUDIT_ARCH_PPC64 - { AUDIT_ARCH_PPC64, 14, 288, 209, 21 }, + { AUDIT_ARCH_PPC64, 14, 288, 209, 21, 361 }, #endif #ifdef AUDIT_ARCH_PPC64LE - { AUDIT_ARCH_PPC64LE, 14, 288, 209, 21 }, + { AUDIT_ARCH_PPC64LE, 14, 288, 209, 21, 361 }, #endif #ifdef AUDIT_ARCH_SPARC - { AUDIT_ARCH_SPARC, 14, 286, 169, 167 }, + { AUDIT_ARCH_SPARC, 14, 286, 169, 167, 349 }, #endif #ifdef AUDIT_ARCH_SPARC64 - { AUDIT_ARCH_SPARC64, 14, 286, 169, 167 }, + { AUDIT_ARCH_SPARC64, 14, 286, 169, 167, 349 }, #endif #ifdef AUDIT_ARCH_MIPS - { AUDIT_ARCH_MIPS, 14, 290, 224, 21 }, + { AUDIT_ARCH_MIPS, 14, 290, 224, 21, -1 }, #endif #ifdef AUDIT_ARCH_MIPSEL - { AUDIT_ARCH_MIPSEL, 14, 290, 224, 21 }, + { AUDIT_ARCH_MIPSEL, 14, 290, 
224, 21, -1 }, #endif #ifdef AUDIT_ARCH_MIPS64 - { AUDIT_ARCH_MIPS64, 131, 249, 180, 160 }, + { AUDIT_ARCH_MIPS64, 131, 249, 180, 160, -1 }, #endif #ifdef AUDIT_ARCH_MIPS64N32 - { AUDIT_ARCH_MIPS64N32, 131, 253, 180, 160 }, + { AUDIT_ARCH_MIPS64N32, 131, 253, 180, 160, -1 }, #endif #ifdef AUDIT_ARCH_MIPSEL64 - { AUDIT_ARCH_MIPSEL64, 131, 249, 180, 160 }, + { AUDIT_ARCH_MIPSEL64, 131, 249, 180, 160, -1 }, #endif #ifdef AUDIT_ARCH_MIPSEL64N32 - { AUDIT_ARCH_MIPSEL64N32, 131, 253, 180, 160 }, + { AUDIT_ARCH_MIPSEL64N32, 131, 253, 180, 160, -1 }, #endif }; @@ -217,6 +224,9 @@ static int seccomp_notify_get_syscall(struct seccomp_notif *req, if (entry->nr_mount == req->data.nr) return LXD_SECCOMP_NOTIFY_MOUNT; + if (entry->nr_bpf == req->data.nr) + return LXD_SECCOMP_NOTIFY_BPF; + break; } @@ -249,6 +259,109 @@ static void prepare_seccomp_iovec(struct iovec *iov, iov[3].iov_len = SECCOMP_COOKIE_SIZE; } +static inline int pidfd_getfd(int pidfd, int fd, int flags) +{ + return syscall(__NR_pidfd_getfd, pidfd, fd, flags); +} + +static int handleBpfSyscall(int notify_fd, int mem_fd, + struct seccomp_notify_proxy_msg *msg, + struct seccomp_notif *req, + struct seccomp_notif_resp *resp, + char *buf, size_t *buf_size) +{ + __do_close int pidfd = -EBADF, bpf_target_fd = -EBADF, + bpf_attach_fd = -EBADF, bpf_prog_fd = -EBADF; + union bpf_attr attr = {}; + unsigned int attr_len = sizeof(attr); + struct seccomp_notif_addfd addfd = {}; + int ret; + int cmd; + + if (attr_len < req->data.args[2]) + return -1; + attr_len = req->data.args[2]; + + switch (req->data.args[0]) { + case BPF_PROG_LOAD: + cmd = BPF_PROG_LOAD; + break; + case BPF_PROG_ATTACH: + cmd = BPF_PROG_ATTACH; + break; + case BPF_PROG_DETACH: + cmd = BPF_PROG_DETACH; + break; + default: + return -EINVAL; + } + + ret = pread(mem_fd, &attr, attr_len, (off_t)req->data.args[1]); + if (ret < 0) + return -1; + + switch (attr.prog_type) { + case BPF_PROG_TYPE_CGROUP_DEVICE: + break; + default: + return -EINVAL; + } + + pidfd = 
pidfd_open(req->pid, 0); + if (pidfd < 0) + return -errno; + + switch (cmd) { + case BPF_PROG_LOAD: + bpf_prog_fd = syscall(__NR_bpf, cmd, &attr, attr_len); + if (ret < 0) + return -errno; + + addfd.srcfd = bpf_prog_fd; + addfd.id = req->id; + addfd.flags = 0; + + // Inject the fd into the task. + ret = ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd); + if (ret < 0) + return -errno; + + // Tell the caller what fd it got. + // Let me tell you, coding this is absurdly exciting. :D + resp->val = ret; + ret = 0; + break; + case BPF_PROG_ATTACH: + bpf_target_fd = pidfd_getfd(pidfd, attr.target_fd, 0); + if (bpf_target_fd < 0) + return -errno; + + bpf_attach_fd = pidfd_getfd(pidfd, attr.attach_bpf_fd, 0); + if (bpf_attach_fd < 0) + return -errno; + + attr.target_fd = bpf_target_fd; + attr.attach_bpf_fd = bpf_attach_fd; + ret = syscall(__NR_bpf, cmd, &attr, attr_len); + break; + case BPF_PROG_DETACH: + bpf_target_fd = pidfd_getfd(pidfd, attr.target_fd, 0); + if (bpf_target_fd < 0) + return -10; + + bpf_attach_fd = pidfd_getfd(pidfd, attr.attach_bpf_fd, 0); + if (bpf_attach_fd < 0) + return -11; + + attr.target_fd = bpf_target_fd; + attr.attach_bpf_fd = bpf_attach_fd; + ret = syscall(__NR_bpf, cmd, &attr, attr_len); + break; + } + + return ret; +} + #ifndef MS_LAZYTIME #define MS_LAZYTIME (1<<25) #endif @@ -259,6 +372,7 @@ const lxdSeccompNotifyMknod = C.LXD_SECCOMP_NOTIFY_MKNOD const lxdSeccompNotifyMknodat = C.LXD_SECCOMP_NOTIFY_MKNODAT const lxdSeccompNotifySetxattr = C.LXD_SECCOMP_NOTIFY_SETXATTR const lxdSeccompNotifyMount = C.LXD_SECCOMP_NOTIFY_MOUNT +const lxdSeccompNotifyBpf = C.LXD_SECCOMP_NOTIFY_BPF const seccompHeader = `2 ` @@ -327,6 +441,14 @@ move_mount errno 38 const seccompNotifyMount = `mount notify [3,0,SCMP_CMP_MASKED_EQ,18446744070422410016] ` +// 5 == BPF_PROG_LOAD +// 8 == BPF_PROG_ATTACH +// 9 == BPF_PROG_DETACH +const seccompNotifyBpf = `bpf notify [0,5,SCMP_CMP_EQ] +bpf notify [0,8,SCMP_CMP_EQ] +bpf notify [0,9,SCMP_CMP_EQ] +` + const 
compatBlockingPolicy = `[%s] compat_sys_rt_sigaction errno 38 stub_x32_rt_sigreturn errno 38 @@ -412,6 +534,7 @@ func InstanceNeedsPolicy(c Instance) bool { "security.syscalls.intercept.mknod", "security.syscalls.intercept.setxattr", "security.syscalls.intercept.mount", + "security.syscalls.intercept.bpf", } for _, k := range keys { @@ -446,20 +569,22 @@ func InstanceNeedsIntercept(s *state.State, c Instance) (bool, error) { config := c.ExpandedConfig() - var keys = map[string]func(state *state.State) bool{ + var keys = map[string]func(state *state.State) error{ "security.syscalls.intercept.mknod": lxcSupportSeccompNotify, "security.syscalls.intercept.setxattr": lxcSupportSeccompNotify, "security.syscalls.intercept.mount": lxcSupportSeccompNotifyContinue, + "security.syscalls.intercept.bpf": lxcSupportSeccompNotifyAddfd, } needed := false - for key, isSupported := range keys { + for key, check := range keys { if !shared.IsTrue(config[key]) { continue } - if !isSupported(s) { - return needed, fmt.Errorf("System doesn't support syscall interception") + err := check(s) + if err != nil { + return needed, err } needed = true @@ -546,6 +671,11 @@ func seccompGetPolicyContent(s *state.State, c Instance) (string, error) { // multiple syscalls. policy += seccompBlockNewMountAPI } + + if shared.IsTrue(config["security.syscalls.intercept.bpf"]) && + shared.IsTrue(config["security.syscalls.intercept.bpf.prog.type.device"]) { + policy += seccompNotifyBpf + } } if allowlist != "" { @@ -617,14 +747,15 @@ type Server struct { // Iovec defines an iovec to move data between kernel and userspace. 
type Iovec struct { - ucred *unix.Ucred - memFd int - procFd int - msg *C.struct_seccomp_notify_proxy_msg - req *C.struct_seccomp_notif - resp *C.struct_seccomp_notif_resp - cookie *C.char - iov *C.struct_iovec + ucred *unix.Ucred + memFd int + procFd int + notifyFd int + msg *C.struct_seccomp_notify_proxy_msg + req *C.struct_seccomp_notif + resp *C.struct_seccomp_notif_resp + cookie *C.char + iov *C.struct_iovec } // NewSeccompIovec creates a new seccomp iovec. @@ -652,14 +783,15 @@ func NewSeccompIovec(ucred *unix.Ucred) *Iovec { C.prepare_seccomp_iovec(iov, msg, req, resp, cookie) return &Iovec{ - memFd: -1, - procFd: -1, - msg: msg, - req: req, - resp: resp, - cookie: cookie, - iov: iov, - ucred: ucred, + memFd: -1, + procFd: -1, + notifyFd: -1, + msg: msg, + req: req, + resp: resp, + cookie: cookie, + iov: iov, + ucred: ucred, } } @@ -671,6 +803,9 @@ func (siov *Iovec) PutSeccompIovec() { if siov.procFd >= 0 { unix.Close(siov.procFd) } + if siov.notifyFd >= 0 { + unix.Close(siov.notifyFd) + } C.free(unsafe.Pointer(siov.msg)) C.free(unsafe.Pointer(siov.req)) C.free(unsafe.Pointer(siov.resp)) @@ -678,20 +813,30 @@ func (siov *Iovec) PutSeccompIovec() { C.free(unsafe.Pointer(siov.iov)) } -// ReceiveSeccompIovec receives a seccomp iovec. -func (siov *Iovec) ReceiveSeccompIovec(fd int) (uint64, error) { +// ReceiveSeccompIovecV1 receives a v1 seccomp iovec. +func (siov *Iovec) ReceiveSeccompIovecV1(fd int) (uint64, error) { bytes, fds, err := netutils.AbstractUnixReceiveFdData(fd, 2, unsafe.Pointer(siov.iov), 4) if err != nil || err == io.EOF { return 0, err } - if len(fds) == 2 { - siov.procFd = int(fds[0]) - siov.memFd = int(fds[1]) - } else { - siov.memFd = int(fds[0]) + siov.procFd = int(fds[0]) + siov.memFd = int(fds[1]) + + return bytes, nil +} + +// ReceiveSeccompIovecV2 receives a v2 seccomp iovec. 
+func (siov *Iovec) ReceiveSeccompIovecV2(fd int) (uint64, error) { + bytes, fds, err := netutils.AbstractUnixReceiveFdData(fd, 3, unsafe.Pointer(siov.iov), 4) + if err != nil || err == io.EOF { + return 0, err } + siov.procFd = int(fds[0]) + siov.memFd = int(fds[1]) + siov.notifyFd = int(fds[2]) + return bytes, nil } @@ -810,8 +955,15 @@ func NewSeccompServer(s *state.State, path string, findPID func(pid int32, state } for { + var bytes uint64 + var err error + siov := NewSeccompIovec(ucred) - bytes, err := siov.ReceiveSeccompIovec(int(unixFile.Fd())) + if lxcSupportSeccompV2(server.s) { + bytes, err = siov.ReceiveSeccompIovecV2(int(unixFile.Fd())) + } else { + bytes, err = siov.ReceiveSeccompIovecV1(int(unixFile.Fd())) + } if err != nil { logger.Debugf("Disconnected from seccomp socket after failed receive: pid=%v, err=%s", ucred.Pid, err) c.Close() @@ -1601,6 +1753,45 @@ func (s *Server) HandleMountSyscall(c Instance, siov *Iovec) int { return 0 } +// BpfArgs arguments for mount. +type BpfArgs struct { + pid int +} + +// HandleBpfSyscall handles mount syscalls. 
+func (s *Server) HandleBpfSyscall(c Instance, siov *Iovec) int { + ctx := log.Ctx{"container": c.Name(), + "project": c.Project(), + "syscall_number": siov.req.data.nr, + "audit_architecture": siov.req.data.arch, + "seccomp_notify_id": siov.req.id, + "seccomp_notify_flags": siov.req.flags, + } + + defer logger.Debug("Handling bpf syscall", ctx) + + args := BpfArgs{ + pid: int(siov.req.pid), + } + + pidFdNr, pidFd := inheritPidFd(args.pid, s.s) + if pidFdNr >= 0 { + defer pidFd.Close() + } + + cBpfAttrBuf := [4096]C.char{} + cBpfAttrSize := C.size_t(len(cBpfAttrBuf)) + + ret := C.handleBpfSyscall(C.int(siov.notifyFd), C.int(siov.memFd), siov.msg, siov.req, siov.resp, &cBpfAttrBuf[0], &cBpfAttrSize) + if ret < 0 { + ctx["syscall_continue"] = "true" + C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(seccompUserNotifFlagContinue)) + return 0 + } + + return 0 +} + func (s *Server) handleSyscall(c Instance, siov *Iovec) int { switch int(C.seccomp_notify_get_syscall(siov.req, siov.resp)) { case lxdSeccompNotifyMknod: @@ -1611,6 +1802,8 @@ func (s *Server) handleSyscall(c Instance, siov *Iovec) int { return s.HandleSetxattrSyscall(c, siov) case lxdSeccompNotifyMount: return s.HandleMountSyscall(c, siov) + case lxdSeccompNotifyBpf: + return s.HandleBpfSyscall(c, siov) } return int(-C.EINVAL) @@ -1649,39 +1842,70 @@ func (s *Server) Stop() error { return s.l.Close() } -func lxcSupportSeccompNotifyContinue(state *state.State) bool { - if !lxcSupportSeccompNotify(state) { +func lxcSupportSeccompV2(state *state.State) bool { + err := lxcSupportSeccompNotify(state) + if err != nil { return false } - if !state.OS.SeccompListenerContinue { + if !state.OS.LXCFeatures["seccomp_proxy_send_notify_fd"] { return false } return true } -func lxcSupportSeccompNotify(state *state.State) bool { +func lxcSupportSeccompNotifyContinue(state *state.State) error { + err := lxcSupportSeccompNotify(state) + if err != nil { + return err + } + + if !state.OS.SeccompListenerContinue { + 
return fmt.Errorf("Seccomp notify doesn't support continuing syscalls") + } + + return nil +} + +func lxcSupportSeccompNotifyAddfd(state *state.State) error { + err := lxcSupportSeccompNotify(state) + if err != nil { + return err + } + + if !state.OS.SeccompListenerContinue { + return fmt.Errorf("Seccomp notify doesn't support continuing syscalls") + } + + if !state.OS.SeccompListenerAddfd { + return fmt.Errorf("Seccomp notify doesn't support adding file descriptors") + } + + return nil +} + +func lxcSupportSeccompNotify(state *state.State) error { if !state.OS.SeccompListener { - return false + return fmt.Errorf("Seccomp notify not supported") } if !state.OS.LXCFeatures["seccomp_notify"] { - return false + return fmt.Errorf("LXC doesn't support seccomp notify") } c, err := liblxc.NewContainer("test-seccomp", state.OS.LxcPath) if err != nil { - return false + return fmt.Errorf("Failed to load seccomp notify test container") } err = c.SetConfigItem("lxc.seccomp.notify.proxy", fmt.Sprintf("unix:%s", shared.VarPath("seccomp.socket"))) if err != nil { - return false + return errors.Wrap(err, "LXC doesn't support notify proxy") } c.Release() - return true + return nil } // MountSyscallFilter creates a mount syscall filter from the config. 
diff --git a/lxd/sys/os.go b/lxd/sys/os.go index d20b6aef85..b7faa7d4e3 100644 --- a/lxd/sys/os.go +++ b/lxd/sys/os.go @@ -67,6 +67,7 @@ type OS struct { NetnsGetifaddrs bool PidFds bool SeccompListener bool + SeccompListenerAddfd bool SeccompListenerContinue bool Shiftfs bool UeventInjection bool diff --git a/shared/instance.go b/shared/instance.go index 42eba631c1..b8072a53b3 100644 --- a/shared/instance.go +++ b/shared/instance.go @@ -199,20 +199,22 @@ var KnownInstanceConfigKeys = map[string]func(value string) error{ "security.secureboot": validate.Optional(validate.IsBool), - "security.syscalls.allow": validate.IsAny, - "security.syscalls.blacklist_default": validate.Optional(validate.IsBool), - "security.syscalls.blacklist_compat": validate.Optional(validate.IsBool), - "security.syscalls.blacklist": validate.IsAny, - "security.syscalls.deny_default": validate.Optional(validate.IsBool), - "security.syscalls.deny_compat": validate.Optional(validate.IsBool), - "security.syscalls.deny": validate.IsAny, - "security.syscalls.intercept.mknod": validate.Optional(validate.IsBool), - "security.syscalls.intercept.mount": validate.Optional(validate.IsBool), - "security.syscalls.intercept.mount.allowed": validate.IsAny, - "security.syscalls.intercept.mount.fuse": validate.IsAny, - "security.syscalls.intercept.mount.shift": validate.Optional(validate.IsBool), - "security.syscalls.intercept.setxattr": validate.Optional(validate.IsBool), - "security.syscalls.whitelist": validate.IsAny, + "security.syscalls.allow": validate.IsAny, + "security.syscalls.blacklist_default": validate.Optional(validate.IsBool), + "security.syscalls.blacklist_compat": validate.Optional(validate.IsBool), + "security.syscalls.blacklist": validate.IsAny, + "security.syscalls.deny_default": validate.Optional(validate.IsBool), + "security.syscalls.deny_compat": validate.Optional(validate.IsBool), + "security.syscalls.deny": validate.IsAny, + "security.syscalls.intercept.bpf": 
validate.Optional(validate.IsBool), + "security.syscalls.intercept.bpf.prog.type.device": validate.Optional(validate.IsBool), + "security.syscalls.intercept.mknod": validate.Optional(validate.IsBool), + "security.syscalls.intercept.mount": validate.Optional(validate.IsBool), + "security.syscalls.intercept.mount.allowed": validate.IsAny, + "security.syscalls.intercept.mount.fuse": validate.IsAny, + "security.syscalls.intercept.mount.shift": validate.Optional(validate.IsBool), + "security.syscalls.intercept.setxattr": validate.Optional(validate.IsBool), + "security.syscalls.whitelist": validate.IsAny, "snapshots.schedule": func(value string) error { if value == "" {
_______________________________________________ lxc-devel mailing list lxc-devel@lists.linuxcontainers.org http://lists.linuxcontainers.org/listinfo/lxc-devel