Without a per-VE cap, a single container could exhaust the system-wide
bpf JIT memory budget by loading excessive numbers of CGROUP_DEVICE
programs via the VE_FEATURE_BPF path.

Add bpf_prog_avail_nr / bpf_prog_max_nr counters to ve_struct and
enforce them in bpf_prog_load() for non-bpf-capable callers loading
CGROUP_DEVICE programs.

Lifetime note: A BPF program loaded in a VE takes a reference to the VE.
When the container is stopped, all open fds to the BPF program are
closed, and when the container manager removes the container cgroups the
BPF program is released, which in turn drops the reference to the VE.

Default max number note: This is somewhat similar to ve.netif_max_nr:
there, each docker container creates two veths, and I also observe that
each docker container loads two bpf programs (one by dockerd, one by
systemd). So let's use the same number.

https://virtuozzo.atlassian.net/browse/VSTOR-131947
Signed-off-by: Pavel Tikhomirov <[email protected]>
Feature: ve: allow BPF in Containers
---
 include/linux/bpf.h  |  8 ++++++++
 include/linux/ve.h   |  4 ++++
 kernel/bpf/core.c    |  8 ++++++++
 kernel/bpf/syscall.c | 35 +++++++++++++++++++++++++++++++++++
 kernel/ve/ve.c       |  5 +++++
 5 files changed, 60 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 80175c7a21c27..0212806d5efc2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,6 +56,7 @@ struct cgroup;
 struct bpf_token;
 struct user_namespace;
 struct super_block;
+struct ve_struct;
 struct inode;
 
 extern struct idr btf_idr;
@@ -1522,6 +1523,13 @@ struct bpf_prog_aux {
        void *security;
 #endif
        struct bpf_token *token;
+#ifdef CONFIG_VE
+       /* VE that loaded the program via VE_FEATURE_BPF path and against whose
+        * bpf_prog_avail_nr counter the program is accounted. NULL for programs
+        * loaded through the regular (non VE-restricted) path.
+        */
+       struct ve_struct *owner_ve;
+#endif
        struct bpf_prog_offload *offload;
        struct btf *btf;
        struct bpf_func_info *func_info;
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 224acf012821f..88b4d531c466e 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -76,6 +76,9 @@ struct ve_struct {
        atomic_t                netif_avail_nr;
        int                     netif_max_nr;
 
+       atomic_t                bpf_prog_avail_nr;
+       int                     bpf_prog_max_nr;
+
        atomic64_t              _uevent_seqnum;
 
        int                     _randomize_va_space;
@@ -149,6 +152,7 @@ extern int nr_ve;
 
 #define NETNS_MAX_NR_DEFAULT   256     /* number of net-namespaces per-VE */
 #define NETIF_MAX_NR_DEFAULT   256     /* number of net-interfaces per-VE */
+#define BPF_PROG_MAX_NR_DEFAULT        256     /* number of loaded BPF progs 
per-VE */
 
 extern unsigned int sysctl_ve_mount_nr;
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 4de8774458aca..7aaf73180fcdc 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -38,6 +38,7 @@
 #include <linux/bpf_mem_alloc.h>
 #include <linux/memcontrol.h>
 #include <linux/execmem.h>
+#include <linux/ve.h>
 
 #include <asm/barrier.h>
 #include <linux/unaligned.h>
@@ -2828,6 +2829,13 @@ void bpf_prog_free(struct bpf_prog *fp)
        if (aux->dst_prog)
                bpf_prog_put(aux->dst_prog);
        bpf_token_put(aux->token);
+#ifdef CONFIG_VE
+       if (aux->owner_ve) {
+               atomic_inc(&aux->owner_ve->bpf_prog_avail_nr);
+               put_ve(aux->owner_ve);
+               aux->owner_ve = NULL;
+       }
+#endif
        INIT_WORK(&aux->work, bpf_prog_free_deferred);
        schedule_work(&aux->work);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0475a72c93c06..8bfea71716de9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2663,6 +2663,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t 
uattr, u32 uattr_size)
        struct bpf_prog *prog, *dst_prog = NULL;
        struct btf *attach_btf = NULL;
        struct bpf_token *token = NULL;
+       struct ve_struct *load_ve = NULL;
        bool bpf_cap;
        int err;
        char license[128];
@@ -2744,6 +2745,22 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t 
uattr, u32 uattr_size)
        if (is_perfmon_prog_type(type) && !bpf_token_capable(token, 
CAP_PERFMON))
                goto put_token;
 
+#ifdef CONFIG_VE
+       /* Restrict the number of BPF programs that can be loaded via the
+        * VE-allowed path. Without this, a single container could exhaust
+        * the system-wide bpf JIT memory budget by loading excessive
+        * numbers of CGROUP_DEVICE programs.
+        */
+       if (!bpf_cap && type == BPF_PROG_TYPE_CGROUP_DEVICE) {
+               load_ve = get_exec_env();
+               if (atomic_dec_if_positive(&load_ve->bpf_prog_avail_nr) < 0) {
+                       load_ve = NULL;
+                       err = -ENOSPC;
+                       goto put_token;
+               }
+       }
+#endif
+
        /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
         * or btf, we need to check which one it is
         */
@@ -2809,6 +2826,16 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t 
uattr, u32 uattr_size)
        prog->aux->dev_bound = !!attr->prog_ifindex;
        prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
 
+#ifdef CONFIG_VE
+       /* Hand the avail_nr slot reservation over to the prog. bpf_prog_free()
+        * will release it via put_ve + counter increment.
+        */
+       if (load_ve) {
+               prog->aux->owner_ve = get_ve(load_ve);
+               load_ve = NULL;
+       }
+#endif
+
        /* move token into prog->aux, reuse taken refcnt */
        prog->aux->token = token;
        token = NULL;
@@ -2932,6 +2959,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t 
uattr, u32 uattr_size)
                btf_put(prog->aux->attach_btf);
        bpf_prog_free(prog);
 put_token:
+#ifdef CONFIG_VE
+       /* The load_ve is non-NULL only if we decremented bpf_prog_avail_nr
+        * but did not hand the reservation off to the prog yet (i.e. failure
+        * happened before bpf_prog_alloc()). Roll back the counter.
+        */
+       if (load_ve)
+               atomic_inc(&load_ve->bpf_prog_avail_nr);
+#endif
        bpf_token_put(token);
        return err;
 }
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 198c82f010cc1..48da546117bb7 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -76,6 +76,8 @@ struct ve_struct ve0 = {
        .netns_max_nr           = INT_MAX,
        .netif_avail_nr         = ATOMIC_INIT(INT_MAX),
        .netif_max_nr           = INT_MAX,
+       .bpf_prog_avail_nr      = ATOMIC_INIT(INT_MAX),
+       .bpf_prog_max_nr        = INT_MAX,
        .fsync_enable           = FSYNC_FILTERED,
        ._randomize_va_space    =
 #ifdef CONFIG_COMPAT_BRK
@@ -983,6 +985,9 @@ static struct cgroup_subsys_state *ve_create(struct 
cgroup_subsys_state *parent_
        atomic_set(&ve->netif_avail_nr, NETIF_MAX_NR_DEFAULT);
        ve->netif_max_nr = NETIF_MAX_NR_DEFAULT;
 
+       atomic_set(&ve->bpf_prog_avail_nr, BPF_PROG_MAX_NR_DEFAULT);
+       ve->bpf_prog_max_nr = BPF_PROG_MAX_NR_DEFAULT;
+
        err = ve_log_init(ve);
        if (err)
                goto err_log;
-- 
2.54.0

_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to