The following pull request was submitted through Github.
It can be accessed and reviewed at: https://github.com/lxc/lxd/pull/7735

This e-mail was sent by the LXC bot, direct replies will not reach the author
unless they happen to be subscribed to this list.

=== Description (from pull-request) ===
Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com>
From 94bcac3febf6790aa9a292ee1bb91d12579e73e1 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brau...@ubuntu.com>
Date: Thu, 6 Aug 2020 11:16:19 +0200
Subject: [PATCH] [RFC]: seccomp: enable unprivileged bpf through syscall
 interception

Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com>
---
 lxd/daemon.go                      |   8 +
 lxd/include/lxd_seccomp.h          |  23 ++
 lxd/include/syscall_numbers.h      |  35 +++
 lxd/instance/drivers/driver_lxc.go |  11 +
 lxd/instance/instance_interface.go |   1 +
 lxd/main_checkfeature.go           | 128 ++++++++++-
 lxd/seccomp/seccomp.go             | 336 ++++++++++++++++++++++++-----
 lxd/sys/os.go                      |   1 +
 shared/instance.go                 |  30 +--
 9 files changed, 502 insertions(+), 71 deletions(-)

diff --git a/lxd/daemon.go b/lxd/daemon.go
index 6ce21a1c19..2d62378164 100644
--- a/lxd/daemon.go
+++ b/lxd/daemon.go
@@ -631,6 +631,7 @@ func (d *Daemon) init() error {
                "pidfd",
                "seccomp_allow_deny_syntax",
                "devpts_fd",
+               "seccomp_proxy_send_notify_fd",
        }
        for _, extension := range lxcExtensions {
                d.os.LXCFeatures[extension] = liblxc.HasApiExtension(extension)
@@ -675,6 +676,13 @@ func (d *Daemon) init() error {
                logger.Infof(" - seccomp listener continue syscalls: no")
        }
 
+       if canUseSeccompListenerAddfd() && 
d.os.LXCFeatures["seccomp_proxy_send_notify_fd"] {
+               d.os.SeccompListenerAddfd = true
+               logger.Infof(" - seccomp listener add file descriptors: yes")
+       } else {
+               logger.Infof(" - seccomp listener add file descriptors: no")
+       }
+
        if d.os.LXCFeatures["devpts_fd"] && canUseNativeTerminals() {
                d.os.NativeTerminals = true
                logger.Infof(" - safe native terminal allocation : yes")
diff --git a/lxd/include/lxd_seccomp.h b/lxd/include/lxd_seccomp.h
index 242347e3e1..976947e4bc 100644
--- a/lxd/include/lxd_seccomp.h
+++ b/lxd/include/lxd_seccomp.h
@@ -65,4 +65,27 @@ struct seccomp_notif_sizes {
                                                struct seccomp_notif_resp)
 #define SECCOMP_IOCTL_NOTIF_ID_VALID   SECCOMP_IOR(2, __u64)
 #endif
+
+#ifndef SECCOMP_IOCTL_NOTIF_ADDFD
+#define SECCOMP_IOCTL_NOTIF_ADDFD      SECCOMP_IOW(3, struct seccomp_notif_addfd)
+
+/* valid flags for seccomp_notif_addfd */
+#define SECCOMP_ADDFD_FLAG_SETFD       (1UL << 0) /* Specify remote fd */
+
+/**
+ * struct seccomp_notif_addfd
+ * @id: The ID of the seccomp notification
+ * @flags: SECCOMP_ADDFD_FLAG_*
+ * @srcfd: The local fd number
+ * @newfd: Optional remote FD number if SETFD option is set, otherwise 0.
+ * @newfd_flags: The O_* flags the remote FD should have applied
+ */
+struct seccomp_notif_addfd {
+       __u64 id;
+       __u32 flags;
+       __u32 srcfd;
+       __u32 newfd;
+       __u32 newfd_flags;
+};
+#endif
 #endif /* LXD_SECCOMP_H */
diff --git a/lxd/include/syscall_numbers.h b/lxd/include/syscall_numbers.h
index f953a26911..269b8c795b 100644
--- a/lxd/include/syscall_numbers.h
+++ b/lxd/include/syscall_numbers.h
@@ -74,4 +74,39 @@
        #endif
 #endif
 
+#ifndef __NR_bpf /* fallback numbers for headers that do not define __NR_bpf */
+       #if defined __i386__
+               #define __NR_bpf 357
+       #elif defined __x86_64__
+               #define __NR_bpf 321
+       #elif defined __arm__
+               #define __NR_bpf 386
+       #elif defined __aarch64__ /* arm64 uses the asm-generic syscall table */
+               #define __NR_bpf 280
+       #elif defined __s390__
+               #define __NR_bpf 351
+       #elif defined __powerpc__
+               #define __NR_bpf 361
+       #elif defined __riscv
+               #define __NR_bpf 280
+       #elif defined __sparc__
+               #define __NR_bpf 349
+       #elif defined __ia64__
+               #define __NR_bpf (317 + 1024)
+       #elif defined _MIPS_SIM
+               #if _MIPS_SIM == _MIPS_SIM_ABI32        /* o32 */
+                       #define __NR_bpf 4355
+               #endif
+               #if _MIPS_SIM == _MIPS_SIM_NABI32       /* n32 */
+                       #define __NR_bpf 6319
+               #endif
+               #if _MIPS_SIM == _MIPS_SIM_ABI64        /* n64 */
+                       #define __NR_bpf 5315
+               #endif
+       #else
+               #define __NR_bpf -1 /* unknown architecture: make the syscall fail */
+               #warning "__NR_bpf not defined for your architecture"
+       #endif
+#endif
+
 #endif /* __LXD_SYSCALL_NUMBERS_H */
diff --git a/lxd/instance/drivers/driver_lxc.go b/lxd/instance/drivers/driver_lxc.go
index 6b3de9a40e..aa1f0f52e0 100644
--- a/lxd/instance/drivers/driver_lxc.go
+++ b/lxd/instance/drivers/driver_lxc.go
@@ -6665,6 +6665,17 @@ func (c *lxc) DevptsFd() (*os.File, error) {
        return c.c.DevptsFd()
 }
 
+// SeccompNotifyFd hands back the container's seccomp notify file descriptor
+// as reported by the underlying go-lxc container.
+func (c *lxc) SeccompNotifyFd() (*os.File, error) {
+       // The go-lxc struct has to be loaded before it can be queried.
+       if err := c.initLXC(false); err != nil {
+               return nil, err
+       }
+
+       return c.c.SeccompNotifyFd()
+}
+
 // LocalConfig returns local config.
 func (c *lxc) LocalConfig() map[string]string {
        return c.localConfig
diff --git a/lxd/instance/instance_interface.go b/lxd/instance/instance_interface.go
index e03db25bfb..c8dbf1cd5d 100644
--- a/lxd/instance/instance_interface.go
+++ b/lxd/instance/instance_interface.go
@@ -148,6 +148,7 @@ type Container interface {
        ConsoleLog(opts liblxc.ConsoleLogOptions) (string, error)
        InsertSeccompUnixDevice(prefix string, m deviceConfig.Device, pid int) 
error
        DevptsFd() (*os.File, error)
+       SeccompNotifyFd() (*os.File, error)
 }
 
 // CriuMigrationArgs arguments for CRIU migration.
diff --git a/lxd/main_checkfeature.go b/lxd/main_checkfeature.go
index 01995462b9..77a7f53738 100644
--- a/lxd/main_checkfeature.go
+++ b/lxd/main_checkfeature.go
@@ -293,6 +293,127 @@ static void is_user_notification_continue_aware(void)
                seccomp_notify_aware = 2;
 }
 
+__noreturn static void __do_user_notification_addfd(void) // never returns: every path ends in _exit()
+{
+       __do_close int listener = -EBADF;
+       pid_t pid;
+       int ret;
+       struct seccomp_notif req = {};
+       struct seccomp_notif_resp resp = {};
+       struct seccomp_notif_addfd addfd = {};
+       struct pollfd pollfd;
+
+       listener = user_trap_syscall(__NR_dup, 
SECCOMP_FILTER_FLAG_NEW_LISTENER); // presumably traps dup() and returns the notify fd -- confirm in user_trap_syscall()
+       if (listener < 0)
+               _exit(EXIT_FAILURE);
+
+       pid = fork();
+       if (pid < 0)
+               _exit(EXIT_FAILURE);
+
+       if (pid == 0) {
+               int dup_fd, pipe_fds[2];
+               pid_t self;
+
+               // No cleanup needed here: the child exits right away, which
+               // closes every fd it holds.
+               ret = pipe(pipe_fds);
+               if (ret < 0)
+                       _exit(EXIT_FAILURE);
+
+               // O_CLOEXEC is irrelevant because the child never calls
+               // exec.
+               dup_fd = dup(pipe_fds[0]); // dup() is the syscall the listener above was registered for
+               if (dup_fd < 0)
+                       _exit(EXIT_FAILURE);
+
+               self = getpid();
+
+               ret = filecmp(self, self, pipe_fds[0], dup_fd); // a nonzero result counts as failure
+               if (ret)
+                       _exit(EXIT_FAILURE);
+
+               _exit(EXIT_SUCCESS);
+       }
+
+       pollfd.fd = listener;
+       pollfd.events = POLLIN | POLLOUT;
+
+       ret = poll(&pollfd, 1, 5000); // wait at most 5000 ms for the notification
+       if (ret <= 0)
+               goto cleanup_sigkill;
+
+       if (!(pollfd.revents & POLLIN))
+               goto cleanup_sigkill;
+
+       ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req); // fetch the pending notification into req
+       if (ret)
+               goto cleanup_sigkill;
+
+       pollfd.fd = listener;
+       pollfd.events = POLLIN | POLLOUT;
+
+       ret = poll(&pollfd, 1, 5000); // second wait: the listener must be writable before responding
+       if (ret <= 0)
+               goto cleanup_sigkill;
+
+       if (!(pollfd.revents & POLLOUT))
+               goto cleanup_sigkill;
+
+       if (req.data.nr != __NR_dup) // only the trapped dup() is expected here
+               goto cleanup_sigkill;
+
+       addfd.srcfd     = 3; // NOTE(review): hard-coded fd number of this process -- confirm which fd is meant
+       addfd.id        = req.id;
+       addfd.flags     = 0;
+
+       // Inject the fd into the task.
+       ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
+       if (ret < 0)
+               goto cleanup_sigkill;
+       close(ret); // NOTE(review): ret is the ioctl result, presumably the fd number installed in the child; closing it acts on this process's fd table -- confirm intent
+
+       resp.id = req.id;
+       resp.flags |= SECCOMP_USER_NOTIF_FLAG_CONTINUE; // let the child's dup() proceed
+       ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
+       resp.error = -EPERM; // fallback response, only sent below when the first send failed
+       resp.flags = 0;
+       if (ret) {
+               ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
+               goto cleanup_sigkill;
+       }
+
+cleanup_wait:
+       ret = wait_for_pid(pid); // the exit status below mirrors wait_for_pid()'s result for the child
+       if (ret)
+               _exit(EXIT_FAILURE);
+       _exit(EXIT_SUCCESS);
+
+cleanup_sigkill:
+       kill(pid, SIGKILL); // any failure above: kill the child, then reap it via cleanup_wait
+       goto cleanup_wait;
+}
+
+// Runs the SECCOMP_IOCTL_NOTIF_ADDFD probe in a forked child; when
+// wait_for_pid() reports 0 for it, seccomp_notify_aware is set to 3.
+static void is_user_notification_addfd_aware(void)
+{
+       pid_t child;
+
+       child = fork();
+       if (child < 0)
+               return;
+
+       if (child == 0) {
+               __do_user_notification_addfd();
+               // Should not be reached.
+               _exit(EXIT_FAILURE);
+       }
+
+       if (wait_for_pid(child) == 0)
+               seccomp_notify_aware = 3;
+}
+
 static void is_seccomp_notify_aware(void)
 {
        __u32 action[] = { SECCOMP_RET_USER_NOTIF };
@@ -300,6 +421,8 @@ static void is_seccomp_notify_aware(void)
        if (syscall(__NR_seccomp, SECCOMP_GET_ACTION_AVAIL, 0, &action[0]) == 
0) {
                seccomp_notify_aware = 1;
                is_user_notification_continue_aware();
+               if (seccomp_notify_aware == 2)
+                       is_user_notification_addfd_aware();
        }
 
 }
@@ -403,9 +526,12 @@ func canUseSeccompListener() bool {
 }
 
 func canUseSeccompListenerContinue() bool {
-       return bool(C.seccomp_notify_aware == 2)
+       return bool(C.seccomp_notify_aware >= 2)
 }
 
+func canUseSeccompListenerAddfd() bool {
+       return bool(C.seccomp_notify_aware == 3) // 3 is the value is_user_notification_addfd_aware() stores on success
+}
 func canUsePidFds() bool {
        return bool(C.pidfd_aware)
 }
diff --git a/lxd/seccomp/seccomp.go b/lxd/seccomp/seccomp.go
index 52e88049a1..d1d6e40dfe 100644
--- a/lxd/seccomp/seccomp.go
+++ b/lxd/seccomp/seccomp.go
@@ -15,6 +15,8 @@ import (
        "strings"
        "unsafe"
 
+       "github.com/pkg/errors"
+
        "golang.org/x/sys/unix"
        liblxc "gopkg.in/lxc/go-lxc.v2"
 
@@ -41,6 +43,7 @@ import (
 #include <elf.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <linux/bpf.h>
 #include <linux/seccomp.h>
 #include <linux/types.h>
 #include <linux/kdev_t.h>
@@ -58,6 +61,8 @@ import (
 #include <unistd.h>
 
 #include "../include/lxd_seccomp.h"
+#include "../include/memory_utils.h"
+#include "../include/process_utils.h"
 
 struct seccomp_notif_sizes expected_sizes;
 
@@ -123,69 +128,71 @@ struct lxd_seccomp_data_arch {
        int nr_mknodat;
        int nr_setxattr;
        int nr_mount;
+       int nr_bpf;
 };
 
 #define LXD_SECCOMP_NOTIFY_MKNOD    0
 #define LXD_SECCOMP_NOTIFY_MKNODAT  1
 #define LXD_SECCOMP_NOTIFY_SETXATTR 2
 #define LXD_SECCOMP_NOTIFY_MOUNT 3
+#define LXD_SECCOMP_NOTIFY_BPF 4
 
 // ordered by likelihood of usage...
 static const struct lxd_seccomp_data_arch seccomp_notify_syscall_table[] = {
-       { -1, LXD_SECCOMP_NOTIFY_MKNOD, LXD_SECCOMP_NOTIFY_MKNODAT, LXD_SECCOMP_NOTIFY_SETXATTR, LXD_SECCOMP_NOTIFY_MOUNT },
+       { -1, LXD_SECCOMP_NOTIFY_MKNOD, LXD_SECCOMP_NOTIFY_MKNODAT, LXD_SECCOMP_NOTIFY_SETXATTR, LXD_SECCOMP_NOTIFY_MOUNT, LXD_SECCOMP_NOTIFY_BPF },
 #ifdef AUDIT_ARCH_X86_64
-       { AUDIT_ARCH_X86_64,      133, 259, 188, 165 },
+       { AUDIT_ARCH_X86_64,      133, 259, 188, 165, 321 },
 #endif
 #ifdef AUDIT_ARCH_I386
-       { AUDIT_ARCH_I386,         14, 297, 226,  21 },
+       { AUDIT_ARCH_I386,         14, 297, 226,  21, 357 },
 #endif
 #ifdef AUDIT_ARCH_AARCH64
-       { AUDIT_ARCH_AARCH64,      -1,  33,   5,  21 },
+       { AUDIT_ARCH_AARCH64,      -1,  33,   5,  21, 280 }, /* bpf: asm-generic syscall number */
 #endif
 #ifdef AUDIT_ARCH_ARM
-       { AUDIT_ARCH_ARM,          14, 324, 226,  21 },
+       { AUDIT_ARCH_ARM,          14, 324, 226,  21, 386 },
 #endif
 #ifdef AUDIT_ARCH_ARMEB
-       { AUDIT_ARCH_ARMEB,        14, 324, 226,  21 },
+       { AUDIT_ARCH_ARMEB,        14, 324, 226,  21, 386 },
 #endif
 #ifdef AUDIT_ARCH_S390
-       { AUDIT_ARCH_S390,         14, 290, 224,  21 },
+       { AUDIT_ARCH_S390,         14, 290, 224,  21, 351 }, /* bpf: same number as s390x */
 #endif
 #ifdef AUDIT_ARCH_S390X
-       { AUDIT_ARCH_S390X,        14, 290, 224,  21 },
+       { AUDIT_ARCH_S390X,        14, 290, 224,  21, 351 },
 #endif
 #ifdef AUDIT_ARCH_PPC
-       { AUDIT_ARCH_PPC,          14, 288, 209,  21 },
+       { AUDIT_ARCH_PPC,          14, 288, 209,  21, 361 },
 #endif
 #ifdef AUDIT_ARCH_PPC64
-       { AUDIT_ARCH_PPC64,        14, 288, 209,  21 },
+       { AUDIT_ARCH_PPC64,        14, 288, 209,  21, 361 },
 #endif
 #ifdef AUDIT_ARCH_PPC64LE
-       { AUDIT_ARCH_PPC64LE,      14, 288, 209,  21 },
+       { AUDIT_ARCH_PPC64LE,      14, 288, 209,  21, 361 },
 #endif
 #ifdef AUDIT_ARCH_SPARC
-       { AUDIT_ARCH_SPARC,        14, 286, 169, 167 },
+       { AUDIT_ARCH_SPARC,        14, 286, 169, 167, 349 },
 #endif
 #ifdef AUDIT_ARCH_SPARC64
-       { AUDIT_ARCH_SPARC64,      14, 286, 169, 167 },
+       { AUDIT_ARCH_SPARC64,      14, 286, 169, 167, 349 },
 #endif
 #ifdef AUDIT_ARCH_MIPS
-       { AUDIT_ARCH_MIPS,         14, 290, 224,  21 },
+       { AUDIT_ARCH_MIPS,         14, 290, 224,  21,  -1 },
 #endif
 #ifdef AUDIT_ARCH_MIPSEL
-       { AUDIT_ARCH_MIPSEL,       14, 290, 224,  21 },
+       { AUDIT_ARCH_MIPSEL,       14, 290, 224,  21,  -1 },
 #endif
 #ifdef AUDIT_ARCH_MIPS64
-       { AUDIT_ARCH_MIPS64,      131, 249, 180, 160 },
+       { AUDIT_ARCH_MIPS64,      131, 249, 180, 160,  -1 },
 #endif
 #ifdef AUDIT_ARCH_MIPS64N32
-       { AUDIT_ARCH_MIPS64N32,   131, 253, 180, 160 },
+       { AUDIT_ARCH_MIPS64N32,   131, 253, 180, 160,  -1 },
 #endif
 #ifdef AUDIT_ARCH_MIPSEL64
-       { AUDIT_ARCH_MIPSEL64,    131, 249, 180, 160 },
+       { AUDIT_ARCH_MIPSEL64,    131, 249, 180, 160,  -1 },
 #endif
 #ifdef AUDIT_ARCH_MIPSEL64N32
-       { AUDIT_ARCH_MIPSEL64N32, 131, 253, 180, 160 },
+       { AUDIT_ARCH_MIPSEL64N32, 131, 253, 180, 160,  -1 },
 #endif
 };
 
@@ -217,6 +224,9 @@ static int seccomp_notify_get_syscall(struct seccomp_notif 
*req,
                if (entry->nr_mount == req->data.nr)
                        return LXD_SECCOMP_NOTIFY_MOUNT;
 
+               if (entry->nr_bpf == req->data.nr)
+                       return LXD_SECCOMP_NOTIFY_BPF;
+
                break;
        }
 
@@ -249,6 +259,121 @@ static void prepare_seccomp_iovec(struct iovec *iov,
        iov[3].iov_len = SECCOMP_COOKIE_SIZE;
 }
 
+// Thin wrapper around the raw pidfd_getfd syscall.
+static inline int pidfd_getfd(int pidfd, int fd, int flags)
+{
+       return syscall(__NR_pidfd_getfd, pidfd, fd, flags);
+}
+
+// handleBpfSyscall services an intercepted bpf() syscall for the task in @req.
+// Only BPF_PROG_LOAD, BPF_PROG_ATTACH and BPF_PROG_DETACH restricted to the
+// cgroup device program/attach type are handled; other requests get -EINVAL.
+// @notify_fd is the seccomp notify fd used to inject fds into the task and
+// @mem_fd is read at the address in args[1] to fetch the task's bpf_attr.
+// Returns a negative value on failure. On BPF_PROG_LOAD success returns 0 and
+// stores the fd number installed in the task in @resp->val.
+// NOTE(review): pointer members of the copied bpf_attr still refer to the
+// task's address space, not this process's -- confirm before relying on LOAD.
+static int handleBpfSyscall(int notify_fd, int mem_fd,
+                           struct seccomp_notify_proxy_msg *msg,
+                           struct seccomp_notif *req,
+                           struct seccomp_notif_resp *resp,
+                           char *buf, size_t *buf_size)
+{
+       __do_close int pidfd = -EBADF, bpf_target_fd = -EBADF,
+                      bpf_attach_fd = -EBADF, bpf_prog_fd = -EBADF;
+       union bpf_attr attr = {};
+       unsigned int attr_len = sizeof(attr);
+       struct seccomp_notif_addfd addfd = {};
+       int ret;
+       int cmd;
+
+       if (attr_len < req->data.args[2])
+               return -1;
+       attr_len = req->data.args[2];
+
+       switch (req->data.args[0]) {
+       case BPF_PROG_LOAD:
+               cmd = BPF_PROG_LOAD;
+               break;
+       case BPF_PROG_ATTACH:
+               cmd = BPF_PROG_ATTACH;
+               break;
+       case BPF_PROG_DETACH:
+               cmd = BPF_PROG_DETACH;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       ret = pread(mem_fd, &attr, attr_len, (off_t)req->data.args[1]);
+       if (ret < 0)
+               return -errno;
+
+       // prog_type and attach_type live in different members of the bpf_attr
+       // union, so pick the field that matches the command.
+       if (cmd == BPF_PROG_LOAD) {
+               if (attr.prog_type != BPF_PROG_TYPE_CGROUP_DEVICE)
+                       return -EINVAL;
+       } else if (attr.attach_type != BPF_CGROUP_DEVICE) {
+               return -EINVAL;
+       }
+
+       pidfd = pidfd_open(req->pid, 0);
+       if (pidfd < 0)
+               return -errno;
+
+       switch (cmd) {
+       case BPF_PROG_LOAD:
+               bpf_prog_fd = syscall(__NR_bpf, cmd, &attr, attr_len);
+               if (bpf_prog_fd < 0)
+                       return -errno;
+
+               addfd.srcfd     = bpf_prog_fd;
+               addfd.id        = req->id;
+               addfd.flags     = 0;
+
+               // Inject the fd into the task.
+               ret = ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
+               if (ret < 0)
+                       return -errno;
+
+               // Tell the caller what fd it got.
+               // Let me tell you, coding this is absurdly exciting. :D
+               resp->val = ret;
+               ret = 0;
+               break;
+       case BPF_PROG_ATTACH:
+               bpf_target_fd = pidfd_getfd(pidfd, attr.target_fd, 0);
+               if (bpf_target_fd < 0)
+                       return -errno;
+
+               bpf_attach_fd = pidfd_getfd(pidfd, attr.attach_bpf_fd, 0);
+               if (bpf_attach_fd < 0)
+                       return -errno;
+
+               attr.target_fd = bpf_target_fd;
+               attr.attach_bpf_fd = bpf_attach_fd;
+               ret = syscall(__NR_bpf, cmd, &attr, attr_len);
+               break;
+       case BPF_PROG_DETACH:
+               bpf_target_fd = pidfd_getfd(pidfd, attr.target_fd, 0);
+               if (bpf_target_fd < 0)
+                       return -errno;
+
+               bpf_attach_fd = pidfd_getfd(pidfd, attr.attach_bpf_fd, 0);
+               if (bpf_attach_fd < 0)
+                       return -errno;
+
+               attr.target_fd = bpf_target_fd;
+               attr.attach_bpf_fd = bpf_attach_fd;
+               ret = syscall(__NR_bpf, cmd, &attr, attr_len);
+               break;
+       }
+
+       return ret;
+}
+
 #ifndef MS_LAZYTIME
 #define MS_LAZYTIME (1<<25)
 #endif
@@ -259,6 +372,7 @@ const lxdSeccompNotifyMknod = C.LXD_SECCOMP_NOTIFY_MKNOD
 const lxdSeccompNotifyMknodat = C.LXD_SECCOMP_NOTIFY_MKNODAT
 const lxdSeccompNotifySetxattr = C.LXD_SECCOMP_NOTIFY_SETXATTR
 const lxdSeccompNotifyMount = C.LXD_SECCOMP_NOTIFY_MOUNT
+const lxdSeccompNotifyBpf = C.LXD_SECCOMP_NOTIFY_BPF
 
 const seccompHeader = `2
 `
@@ -327,6 +441,14 @@ move_mount errno 38
 const seccompNotifyMount = `mount notify 
[3,0,SCMP_CMP_MASKED_EQ,18446744070422410016]
 `
 
+// 5 == BPF_PROG_LOAD
+// 8 == BPF_PROG_ATTACH
+// 9 == BPF_PROG_DETACH
+const seccompNotifyBpf = `bpf notify [0,5,SCMP_CMP_EQ]
+bpf notify [0,8,SCMP_CMP_EQ]
+bpf notify [0,9,SCMP_CMP_EQ]
+`
+
 const compatBlockingPolicy = `[%s]
 compat_sys_rt_sigaction errno 38
 stub_x32_rt_sigreturn errno 38
@@ -412,6 +534,7 @@ func InstanceNeedsPolicy(c Instance) bool {
                "security.syscalls.intercept.mknod",
                "security.syscalls.intercept.setxattr",
                "security.syscalls.intercept.mount",
+               "security.syscalls.intercept.bpf",
        }
 
        for _, k := range keys {
@@ -446,20 +569,22 @@ func InstanceNeedsIntercept(s *state.State, c Instance) 
(bool, error) {
 
        config := c.ExpandedConfig()
 
-       var keys = map[string]func(state *state.State) bool{
+       var keys = map[string]func(state *state.State) error{
                "security.syscalls.intercept.mknod":    lxcSupportSeccompNotify,
                "security.syscalls.intercept.setxattr": lxcSupportSeccompNotify,
                "security.syscalls.intercept.mount":    
lxcSupportSeccompNotifyContinue,
+               "security.syscalls.intercept.bpf":      
lxcSupportSeccompNotifyAddfd,
        }
 
        needed := false
-       for key, isSupported := range keys {
+       for key, check := range keys {
                if !shared.IsTrue(config[key]) {
                        continue
                }
 
-               if !isSupported(s) {
-                       return needed, fmt.Errorf("System doesn't support 
syscall interception")
+               err := check(s)
+               if err != nil {
+                       return needed, err
                }
 
                needed = true
@@ -546,6 +671,11 @@ func seccompGetPolicyContent(s *state.State, c Instance) 
(string, error) {
                        // multiple syscalls.
                        policy += seccompBlockNewMountAPI
                }
+
+               if shared.IsTrue(config["security.syscalls.intercept.bpf"]) &&
+                       
shared.IsTrue(config["security.syscalls.intercept.bpf.prog.type.device"]) {
+                       policy += seccompNotifyBpf
+               }
        }
 
        if allowlist != "" {
@@ -617,14 +747,15 @@ type Server struct {
 
 // Iovec defines an iovec to move data between kernel and userspace.
 type Iovec struct {
-       ucred  *unix.Ucred
-       memFd  int
-       procFd int
-       msg    *C.struct_seccomp_notify_proxy_msg
-       req    *C.struct_seccomp_notif
-       resp   *C.struct_seccomp_notif_resp
-       cookie *C.char
-       iov    *C.struct_iovec
+       ucred    *unix.Ucred
+       memFd    int
+       procFd   int
+       notifyFd int // seccomp notify fd filled in by ReceiveSeccompIovecV2; stays -1 when none was received
+       msg      *C.struct_seccomp_notify_proxy_msg
+       req      *C.struct_seccomp_notif
+       resp     *C.struct_seccomp_notif_resp
+       cookie   *C.char
+       iov      *C.struct_iovec
 }
 
 // NewSeccompIovec creates a new seccomp iovec.
@@ -652,14 +783,15 @@ func NewSeccompIovec(ucred *unix.Ucred) *Iovec {
        C.prepare_seccomp_iovec(iov, msg, req, resp, cookie)
 
        return &Iovec{
-               memFd:  -1,
-               procFd: -1,
-               msg:    msg,
-               req:    req,
-               resp:   resp,
-               cookie: cookie,
-               iov:    iov,
-               ucred:  ucred,
+               memFd:    -1,
+               procFd:   -1,
+               notifyFd: -1,
+               msg:      msg,
+               req:      req,
+               resp:     resp,
+               cookie:   cookie,
+               iov:      iov,
+               ucred:    ucred,
        }
 }
 
@@ -671,6 +803,9 @@ func (siov *Iovec) PutSeccompIovec() {
        if siov.procFd >= 0 {
                unix.Close(siov.procFd)
        }
+       if siov.notifyFd >= 0 {
+               unix.Close(siov.notifyFd)
+       }
        C.free(unsafe.Pointer(siov.msg))
        C.free(unsafe.Pointer(siov.req))
        C.free(unsafe.Pointer(siov.resp))
@@ -678,20 +813,46 @@ func (siov *Iovec) PutSeccompIovec() {
        C.free(unsafe.Pointer(siov.iov))
 }
 
-// ReceiveSeccompIovec receives a seccomp iovec.
-func (siov *Iovec) ReceiveSeccompIovec(fd int) (uint64, error) {
+// ReceiveSeccompIovecV1 receives a v1 seccomp iovec, which carries up to two
+// file descriptors (proc fd and mem fd, or the mem fd alone).
+func (siov *Iovec) ReceiveSeccompIovecV1(fd int) (uint64, error) {
        bytes, fds, err := netutils.AbstractUnixReceiveFdData(fd, 2, unsafe.Pointer(siov.iov), 4)
        if err != nil || err == io.EOF {
                return 0, err
        }
 
        if len(fds) == 2 {
                siov.procFd = int(fds[0])
                siov.memFd = int(fds[1])
        } else {
                siov.memFd = int(fds[0])
        }
 
        return bytes, nil
 }
+
+// ReceiveSeccompIovecV2 receives a v2 seccomp iovec, which carries the proc
+// fd, the mem fd and the seccomp notify fd, in that order.
+func (siov *Iovec) ReceiveSeccompIovecV2(fd int) (uint64, error) {
+       bytes, fds, err := netutils.AbstractUnixReceiveFdData(fd, 3, unsafe.Pointer(siov.iov), 4)
+       if err != nil || err == io.EOF {
+               return 0, err
+       }
+
+       // Indexing a short slice would panic, so reject the message instead
+       // and close whatever descriptors did arrive so they don't leak.
+       if len(fds) != 3 {
+               for _, f := range fds {
+                       unix.Close(int(f))
+               }
+
+               return 0, fmt.Errorf("Expected 3 file descriptors from seccomp proxy, got %d", len(fds))
+       }
+
+       siov.procFd = int(fds[0])
+       siov.memFd = int(fds[1])
+       siov.notifyFd = int(fds[2])
+
+       return bytes, nil
+}
 
@@ -810,8 +955,15 @@ func NewSeccompServer(s *state.State, path string, findPID 
func(pid int32, state
                                }
 
                                for {
+                                       var bytes uint64
+                                       var err error
+
                                        siov := NewSeccompIovec(ucred)
-                                       bytes, err := 
siov.ReceiveSeccompIovec(int(unixFile.Fd()))
+                                       if lxcSupportSeccompV2(server.s) {
+                                               bytes, err = 
siov.ReceiveSeccompIovecV2(int(unixFile.Fd()))
+                                       } else {
+                                               bytes, err = 
siov.ReceiveSeccompIovecV1(int(unixFile.Fd()))
+                                       }
                                        if err != nil {
                                                logger.Debugf("Disconnected 
from seccomp socket after failed receive: pid=%v, err=%s", ucred.Pid, err)
                                                c.Close()
@@ -1601,6 +1753,45 @@ func (s *Server) HandleMountSyscall(c Instance, siov 
*Iovec) int {
        return 0
 }
 
+// BpfArgs arguments for bpf.
+type BpfArgs struct {
+       pid int // pid of the task that issued the intercepted syscall (taken from siov.req.pid)
+}
+
+// HandleBpfSyscall handles bpf syscalls; it always returns 0 and asks the kernel to continue the syscall when the C helper fails.
+func (s *Server) HandleBpfSyscall(c Instance, siov *Iovec) int {
+       ctx := log.Ctx{"container": c.Name(),
+               "project":              c.Project(),
+               "syscall_number":       siov.req.data.nr,
+               "audit_architecture":   siov.req.data.arch,
+               "seccomp_notify_id":    siov.req.id,
+               "seccomp_notify_flags": siov.req.flags,
+       }
+
+       defer logger.Debug("Handling bpf syscall", ctx) // deferred so keys added to ctx below are part of the log entry
+
+       args := BpfArgs{
+               pid: int(siov.req.pid),
+       }
+
+       pidFdNr, pidFd := inheritPidFd(args.pid, s.s) // NOTE(review): the pidfd is only closed here, never passed on -- confirm it is needed
+       if pidFdNr >= 0 {
+               defer pidFd.Close()
+       }
+
+       cBpfAttrBuf := [4096]C.char{} // scratch buffer handed to the C helper together with its size
+       cBpfAttrSize := C.size_t(len(cBpfAttrBuf))
+
+       ret := C.handleBpfSyscall(C.int(siov.notifyFd), C.int(siov.memFd), 
siov.msg, siov.req, siov.resp, &cBpfAttrBuf[0], &cBpfAttrSize)
+       if ret < 0 {
+               ctx["syscall_continue"] = "true" // helper failed: respond with the continue flag instead of an emulated result
+               C.seccomp_notify_update_response(siov.resp, 0, 
C.uint32_t(seccompUserNotifFlagContinue))
+               return 0
+       }
+
+       return 0
+}
+
 func (s *Server) handleSyscall(c Instance, siov *Iovec) int {
        switch int(C.seccomp_notify_get_syscall(siov.req, siov.resp)) {
        case lxdSeccompNotifyMknod:
@@ -1611,6 +1802,8 @@ func (s *Server) handleSyscall(c Instance, siov *Iovec) 
int {
                return s.HandleSetxattrSyscall(c, siov)
        case lxdSeccompNotifyMount:
                return s.HandleMountSyscall(c, siov)
+       case lxdSeccompNotifyBpf:
+               return s.HandleBpfSyscall(c, siov)
        }
 
        return int(-C.EINVAL)
@@ -1649,39 +1842,74 @@ func (s *Server) Stop() error {
        return s.l.Close()
 }
 
-func lxcSupportSeccompNotifyContinue(state *state.State) bool {
-       if !lxcSupportSeccompNotify(state) {
+// lxcSupportSeccompV2 reports whether seccomp notify works and liblxc advertises "seccomp_proxy_send_notify_fd".
+func lxcSupportSeccompV2(state *state.State) bool {
+       err := lxcSupportSeccompNotify(state)
+       if err != nil {
                return false
        }
 
-       if !state.OS.SeccompListenerContinue {
+       if !state.OS.LXCFeatures["seccomp_proxy_send_notify_fd"] {
                return false
        }
 
        return true
 }
 
-func lxcSupportSeccompNotify(state *state.State) bool {
+// lxcSupportSeccompNotifyContinue returns nil when seccomp notify works and the listener can continue syscalls.
+func lxcSupportSeccompNotifyContinue(state *state.State) error {
+       err := lxcSupportSeccompNotify(state)
+       if err != nil {
+               return err
+       }
+
+       if !state.OS.SeccompListenerContinue {
+               return fmt.Errorf("Seccomp notify doesn't support continuing syscalls")
+       }
+
+       return nil
+}
+
+// lxcSupportSeccompNotifyAddfd returns nil when the listener can both continue syscalls and add file descriptors.
+func lxcSupportSeccompNotifyAddfd(state *state.State) error {
+       err := lxcSupportSeccompNotify(state)
+       if err != nil {
+               return err
+       }
+
+       if !state.OS.SeccompListenerContinue {
+               return fmt.Errorf("Seccomp notify doesn't support continuing syscalls")
+       }
+
+       if !state.OS.SeccompListenerAddfd {
+               return fmt.Errorf("Seccomp notify doesn't support adding file descriptors")
+       }
+
+       return nil
+}
+
+// lxcSupportSeccompNotify returns nil when seccomp notify is available and liblxc accepts the lxc.seccomp.notify.proxy setting.
+func lxcSupportSeccompNotify(state *state.State) error {
        if !state.OS.SeccompListener {
-               return false
+               return fmt.Errorf("Seccomp notify not supported")
        }
 
        if !state.OS.LXCFeatures["seccomp_notify"] {
-               return false
+               return fmt.Errorf("LXC doesn't support seccomp notify")
        }
 
        c, err := liblxc.NewContainer("test-seccomp", state.OS.LxcPath)
        if err != nil {
-               return false
+               return errors.Wrap(err, "Failed to load seccomp notify test container")
        }
+       defer c.Release()
 
        err = c.SetConfigItem("lxc.seccomp.notify.proxy", fmt.Sprintf("unix:%s", shared.VarPath("seccomp.socket")))
        if err != nil {
-               return false
+               return errors.Wrap(err, "LXC doesn't support notify proxy")
        }
 
-       c.Release()
-       return true
+       return nil
 }
 
 // MountSyscallFilter creates a mount syscall filter from the config.
diff --git a/lxd/sys/os.go b/lxd/sys/os.go
index d20b6aef85..b7faa7d4e3 100644
--- a/lxd/sys/os.go
+++ b/lxd/sys/os.go
@@ -67,6 +67,7 @@ type OS struct {
        NetnsGetifaddrs         bool
        PidFds                  bool
        SeccompListener         bool
+       SeccompListenerAddfd    bool
        SeccompListenerContinue bool
        Shiftfs                 bool
        UeventInjection         bool
diff --git a/shared/instance.go b/shared/instance.go
index 42eba631c1..b8072a53b3 100644
--- a/shared/instance.go
+++ b/shared/instance.go
@@ -199,20 +199,22 @@ var KnownInstanceConfigKeys = map[string]func(value 
string) error{
 
        "security.secureboot": validate.Optional(validate.IsBool),
 
-       "security.syscalls.allow":                   validate.IsAny,
-       "security.syscalls.blacklist_default":       
validate.Optional(validate.IsBool),
-       "security.syscalls.blacklist_compat":        
validate.Optional(validate.IsBool),
-       "security.syscalls.blacklist":               validate.IsAny,
-       "security.syscalls.deny_default":            
validate.Optional(validate.IsBool),
-       "security.syscalls.deny_compat":             
validate.Optional(validate.IsBool),
-       "security.syscalls.deny":                    validate.IsAny,
-       "security.syscalls.intercept.mknod":         
validate.Optional(validate.IsBool),
-       "security.syscalls.intercept.mount":         
validate.Optional(validate.IsBool),
-       "security.syscalls.intercept.mount.allowed": validate.IsAny,
-       "security.syscalls.intercept.mount.fuse":    validate.IsAny,
-       "security.syscalls.intercept.mount.shift":   
validate.Optional(validate.IsBool),
-       "security.syscalls.intercept.setxattr":      
validate.Optional(validate.IsBool),
-       "security.syscalls.whitelist":               validate.IsAny,
+       "security.syscalls.allow":                          validate.IsAny,
+       "security.syscalls.blacklist_default":              
validate.Optional(validate.IsBool),
+       "security.syscalls.blacklist_compat":               
validate.Optional(validate.IsBool),
+       "security.syscalls.blacklist":                      validate.IsAny,
+       "security.syscalls.deny_default":                   
validate.Optional(validate.IsBool),
+       "security.syscalls.deny_compat":                    
validate.Optional(validate.IsBool),
+       "security.syscalls.deny":                           validate.IsAny,
+       "security.syscalls.intercept.bpf":                  
validate.Optional(validate.IsBool),
+       "security.syscalls.intercept.bpf.prog.type.device": 
validate.Optional(validate.IsBool),
+       "security.syscalls.intercept.mknod":                
validate.Optional(validate.IsBool),
+       "security.syscalls.intercept.mount":                
validate.Optional(validate.IsBool),
+       "security.syscalls.intercept.mount.allowed":        validate.IsAny,
+       "security.syscalls.intercept.mount.fuse":           validate.IsAny,
+       "security.syscalls.intercept.mount.shift":          
validate.Optional(validate.IsBool),
+       "security.syscalls.intercept.setxattr":             
validate.Optional(validate.IsBool),
+       "security.syscalls.whitelist":                      validate.IsAny,
 
        "snapshots.schedule": func(value string) error {
                if value == "" {
_______________________________________________
lxc-devel mailing list
lxc-devel@lists.linuxcontainers.org
http://lists.linuxcontainers.org/listinfo/lxc-devel

Reply via email to