Test BPF device cgroup program operations inside a VE, with and without VE_FEATURE_BPF enabled:
- prog_load: BPF_PROG_LOAD(BPF_PROG_TYPE_CGROUP_DEVICE) succeeds with
  VE_FEATURE_BPF
- prog_load_denied: BPF_PROG_LOAD(BPF_PROG_TYPE_CGROUP_DEVICE) returns
  EPERM without VE_FEATURE_BPF
- prog_query: BPF_PROG_QUERY(BPF_CGROUP_DEVICE) succeeds on a descendant
  cgroup with VE_FEATURE_BPF
- prog_query_effective_denied: BPF_PROG_QUERY with BPF_F_QUERY_EFFECTIVE
  returns EPERM (prevents peeking into ancestor cgroup programs)
- prog_query_root_denied: BPF_PROG_QUERY on VE root cgroup returns EPERM
- prog_attach_query: Full Docker-like workflow of load, attach to a
  descendant cgroup, query back and verify prog_cnt
- prog_load_oversized_denied: With VE_FEATURE_BPF, loading an oversized
  program (more than BPF_MAXINSNS instructions) fails with E2BIG.

https://virtuozzo.atlassian.net/browse/VSTOR-126504
Signed-off-by: Pavel Tikhomirov <[email protected]>
---
 .../testing/selftests/ve_devcg_bpf/.gitignore |   1 +
 tools/testing/selftests/ve_devcg_bpf/Makefile |   7 +
 .../ve_devcg_bpf/ve_devcg_bpf_test.c          | 700 ++++++++++++++++++
 3 files changed, 708 insertions(+)
 create mode 100644 tools/testing/selftests/ve_devcg_bpf/.gitignore
 create mode 100644 tools/testing/selftests/ve_devcg_bpf/Makefile
 create mode 100644 tools/testing/selftests/ve_devcg_bpf/ve_devcg_bpf_test.c

diff --git a/tools/testing/selftests/ve_devcg_bpf/.gitignore b/tools/testing/selftests/ve_devcg_bpf/.gitignore
new file mode 100644
index 000000000000..a1f032ce2428
--- /dev/null
+++ b/tools/testing/selftests/ve_devcg_bpf/.gitignore
@@ -0,0 +1 @@
+ve_devcg_bpf_test
diff --git a/tools/testing/selftests/ve_devcg_bpf/Makefile b/tools/testing/selftests/ve_devcg_bpf/Makefile
new file mode 100644
index 000000000000..57c37d8ee876
--- /dev/null
+++ b/tools/testing/selftests/ve_devcg_bpf/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for ve_devcg_bpf selftests.
+CFLAGS += -g -I../../../../usr/include/ -I../../../../tools/include/ -Wall -O2
+
+TEST_GEN_PROGS += ve_devcg_bpf_test
+
+include ../lib.mk
diff --git a/tools/testing/selftests/ve_devcg_bpf/ve_devcg_bpf_test.c b/tools/testing/selftests/ve_devcg_bpf/ve_devcg_bpf_test.c
new file mode 100644
index 000000000000..cb91541893d7
--- /dev/null
+++ b/tools/testing/selftests/ve_devcg_bpf/ve_devcg_bpf_test.c
@@ -0,0 +1,700 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ve_devcg_bpf selftests
+ *
+ * Tests for the VE_FEATURE_BPF which allows bpf device cgroup programs
+ * inside VE (containers). Verifies that:
+ *
+ * - BPF_PROG_TYPE_CGROUP_DEVICE programs can be loaded inside VE when
+ *   VE_FEATURE_BPF is enabled, and cannot be loaded when it is not.
+ * - BPF_PROG_QUERY(BPF_CGROUP_DEVICE) works on descendant cgroups inside
+ *   VE with VE_FEATURE_BPF, but is denied with BPF_F_QUERY_EFFECTIVE or
+ *   on the VE root cgroup.
+ * - BPF_PROG_TYPE_CGROUP_DEVICE programs can be attached to and queried
+ *   on descendant cgroups inside VE.
+ */
+#define _GNU_SOURCE
+#include <linux/sched.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/mount.h>
+#include <sched.h>
+#include <sys/wait.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <asm/unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <linux/limits.h>
+#include <errno.h>
+
+#include "../../../../include/uapi/linux/vzcalluser.h"
+
+#include "../kselftest_harness.h"
+
+/* Range of container IDs probed for a free VE cgroup name. */
+#define CTID_MIN 108
+#define CTID_MAX 200
+
+/* Scratch cgroup created below the VE root cgroup by the query tests. */
+#define SUBCG_NAME "subcg"
+
+#ifndef CLONE_NEWVE
+#define CLONE_NEWVE 0x00000040
+#endif
+
+/*
+ * Write the string @val to the file @path, resolved relative to @dirfd.
+ *
+ * Returns 0 when the whole string went out in a single write(), -1 when the
+ * file could not be opened or the write failed or was short.
+ */
+static int write_file_at(int dirfd, const char *path, const char *val)
+{
+	size_t len = strlen(val);
+	ssize_t ret;
+	int fd;
+
+	fd = openat(dirfd, path, O_WRONLY);
+	if (fd < 0)
+		return -1;
+
+	ret = write(fd, val, len);
+	close(fd);
+
+	/* Test for failure first so that -1 is never compared as unsigned. */
+	return (ret >= 0 && (size_t)ret == len) ? 0 : -1;
+}
+
+/* Scenario that child_func() runs inside the VE. */
+enum {
+	TEST_PROG_LOAD,
+	TEST_PROG_LOAD_DENIED,
+	TEST_PROG_QUERY,
+	TEST_PROG_QUERY_EFFECTIVE_DENIED,
+	TEST_PROG_QUERY_ROOT_DENIED,
+	TEST_PROG_ATTACH_QUERY,
+	TEST_PROG_LOAD_OVERSIZED_DENIED,
+};
+
+/* Arguments handed from run_vzct() to child_func(). */
+struct fargs {
+	int cgv2_fd;	/* fd of the cgroup2 mount root */
+	int ctid;	/* container ID, also the VE cgroup directory name */
+	int test;	/* one of the TEST_* scenarios */
+};
+
+/*
+ * Load a minimal BPF_PROG_TYPE_CGROUP_DEVICE program.
+ *
+ * Returns the program fd, or -1 with errno set by bpf(2).
+ */
+static int load_devcg_prog(void)
+{
+	/*
+	 * Minimal BPF_PROG_TYPE_CGROUP_DEVICE program:
+	 *   r0 = 1  (allow device access)
+	 *   exit
+	 */
+	struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 1),
+		BPF_EXIT_INSN(),
+	};
+	union bpf_attr attr;
+	static char log_buf[4096];
+
+	memset(&attr, 0, sizeof(attr));
+	attr.prog_type = BPF_PROG_TYPE_CGROUP_DEVICE;
+	attr.insn_cnt = ARRAY_SIZE(insns);
+	attr.insns = (unsigned long)insns;
+	attr.license = (unsigned long)"GPL";
+	attr.log_buf = (unsigned long)log_buf;
+	attr.log_size = sizeof(log_buf);
+	attr.log_level = 1;
+
+	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+}
+
+/*
+ * Mount cgroup2 with fsopen()/fsconfig()/fsmount(); no move_mount() follows.
+ *
+ * Returns the fd obtained from fsmount(), or -1 on failure.
+ */
+static int mount_cg2_fd(void)
+{
+	int fs_fd, mnt_fd;
+
+	fs_fd = syscall(__NR_fsopen, "cgroup2", 0);
+	if (fs_fd < 0)
+		return -1;
+
+	if (syscall(__NR_fsconfig, fs_fd, FSCONFIG_CMD_CREATE,
+		    NULL, NULL, 0) < 0) {
+		close(fs_fd);
+		return -1;
+	}
+
+	mnt_fd = syscall(__NR_fsmount, fs_fd, 0, 0);
+	close(fs_fd);
+	return mnt_fd;
+}
+
+/*
+ * Create the scratch cgroup SUBCG_NAME below @ve_cg_fd and open it.
+ *
+ * Returns an fd for the new cgroup directory, -1 when mkdirat() failed, or
+ * -2 when the directory was created but could not be opened (in which case
+ * it is removed again).
+ */
+static int subcg_create(int ve_cg_fd)
+{
+	int cg_fd;
+
+	if (mkdirat(ve_cg_fd, SUBCG_NAME, 0755))
+		return -1;
+
+	cg_fd = openat(ve_cg_fd, SUBCG_NAME, O_RDONLY | O_DIRECTORY);
+	if (cg_fd < 0) {
+		unlinkat(ve_cg_fd, SUBCG_NAME, AT_REMOVEDIR);
+		return -2;
+	}
+
+	return cg_fd;
+}
+
+/* Close @cg_fd and remove the scratch cgroup made by subcg_create(). */
+static void subcg_destroy(int ve_cg_fd, int cg_fd)
+{
+	close(cg_fd);
+	unlinkat(ve_cg_fd, SUBCG_NAME, AT_REMOVEDIR);
+}
+
+/* Returns 0 when the device cgroup program loads, 1 otherwise. */
+static int test_prog_load(void)
+{
+	int fd;
+
+	fd = load_devcg_prog();
+	if (fd < 0)
+		return 1;
+
+	close(fd);
+	return 0;
+}
+
+/*
+ * Returns 0 when the load fails with EPERM, 1 when it unexpectedly
+ * succeeds and 2 when it fails with any other errno.
+ */
+static int test_prog_load_denied(void)
+{
+	int fd;
+
+	fd = load_devcg_prog();
+	if (fd >= 0) {
+		close(fd);
+		return 1;
+	}
+
+	return (errno == EPERM) ? 0 : 2;
+}
+
+/*
+ * Try to load a program of BPF_MAXINSNS + 1 instructions.
+ *
+ * Returns 0 when the load fails with E2BIG, 1 when it unexpectedly
+ * succeeds and 2 when it fails with any other errno.
+ */
+static int test_prog_load_oversized_denied(void)
+{
+	struct bpf_insn insns[BPF_MAXINSNS + 1];
+	union bpf_attr attr;
+	int i, fd;
+
+	for (i = 0; i < BPF_MAXINSNS; i++)
+		insns[i] = (struct bpf_insn)BPF_MOV64_IMM(BPF_REG_0, 1);
+	insns[BPF_MAXINSNS] = (struct bpf_insn)BPF_EXIT_INSN();
+
+	memset(&attr, 0, sizeof(attr));
+	attr.prog_type = BPF_PROG_TYPE_CGROUP_DEVICE;
+	attr.insn_cnt = BPF_MAXINSNS + 1;
+	attr.insns = (unsigned long)insns;
+	attr.license = (unsigned long)"GPL";
+
+	fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+	if (fd >= 0) {
+		close(fd);
+		return 1;
+	}
+
+	return (errno == E2BIG) ? 0 : 2;
+}
+
+/*
+ * Query BPF_CGROUP_DEVICE programs on a scratch cgroup below the VE root.
+ *
+ * Returns 0 on success, 1/2 when the scratch cgroup could not be
+ * created/opened and 3 when BPF_PROG_QUERY fails.
+ */
+static int test_prog_query(int ve_cg_fd)
+{
+	union bpf_attr attr;
+	int cg_fd, ret;
+
+	cg_fd = subcg_create(ve_cg_fd);
+	if (cg_fd < 0)
+		return -cg_fd;	/* -1 -> 1 (mkdir), -2 -> 2 (open) */
+
+	memset(&attr, 0, sizeof(attr));
+	attr.query.target_fd = cg_fd;
+	attr.query.attach_type = BPF_CGROUP_DEVICE;
+
+	ret = syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr));
+	subcg_destroy(ve_cg_fd, cg_fd);
+
+	return (ret < 0) ? 3 : 0;
+}
+
+/*
+ * Query a scratch cgroup with BPF_F_QUERY_EFFECTIVE set.
+ *
+ * Returns 0 when the query fails with EPERM, 1/2 when the scratch cgroup
+ * could not be created/opened, 3 when the query unexpectedly succeeds and
+ * 4 when it fails with any other errno.
+ */
+static int test_prog_query_effective_denied(int ve_cg_fd)
+{
+	union bpf_attr attr;
+	int cg_fd, ret, saved_errno;
+
+	cg_fd = subcg_create(ve_cg_fd);
+	if (cg_fd < 0)
+		return -cg_fd;	/* -1 -> 1 (mkdir), -2 -> 2 (open) */
+
+	memset(&attr, 0, sizeof(attr));
+	attr.query.target_fd = cg_fd;
+	attr.query.attach_type = BPF_CGROUP_DEVICE;
+	attr.query.query_flags = BPF_F_QUERY_EFFECTIVE;
+
+	ret = syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr));
+	/* The cleanup below may clobber errno */
+	saved_errno = errno;
+
+	subcg_destroy(ve_cg_fd, cg_fd);
+
+	if (ret == 0)
+		return 3;
+
+	return (saved_errno == EPERM) ? 0 : 4;
+}
+
+/*
+ * Query the VE root cgroup itself.
+ *
+ * Returns 0 when the query fails with EPERM, 1 when it unexpectedly
+ * succeeds and 2 when it fails with any other errno.
+ */
+static int test_prog_query_root_denied(int ve_cg_fd)
+{
+	union bpf_attr attr;
+	int ret;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.query.target_fd = ve_cg_fd;
+	attr.query.attach_type = BPF_CGROUP_DEVICE;
+
+	ret = syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr));
+	if (ret == 0)
+		return 1;
+
+	return (errno == EPERM) ? 0 : 2;
+}
+
+/*
+ * Load a device cgroup program, attach it to a scratch cgroup and query
+ * it back.
+ *
+ * Returns 0 on success, otherwise the number of the failed step:
+ * 1 load, 2 mkdir, 3 open, 4 attach, 5 query, 6 unexpected prog_cnt.
+ */
+static int test_prog_attach_query(int ve_cg_fd)
+{
+	union bpf_attr attr;
+	int cg_fd, prog_fd, ret;
+	__u32 prog_ids[1];
+
+	prog_fd = load_devcg_prog();
+	if (prog_fd < 0)
+		return 1;
+
+	cg_fd = subcg_create(ve_cg_fd);
+	if (cg_fd < 0) {
+		ret = (cg_fd == -1) ? 2 : 3;
+		goto out_close_prog;
+	}
+
+	/* Attach the loaded program to the sub-cgroup */
+	memset(&attr, 0, sizeof(attr));
+	attr.target_fd = cg_fd;
+	attr.attach_bpf_fd = prog_fd;
+	attr.attach_type = BPF_CGROUP_DEVICE;
+
+	if (syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
+		ret = 4;
+		goto out_destroy_cg;
+	}
+
+	/* Query the attached program */
+	memset(&attr, 0, sizeof(attr));
+	attr.query.target_fd = cg_fd;
+	attr.query.attach_type = BPF_CGROUP_DEVICE;
+	attr.query.prog_ids = (unsigned long)prog_ids;
+	attr.query.prog_cnt = 1;
+
+	if (syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr)) < 0)
+		ret = 5;
+	else if (attr.query.prog_cnt != 1)
+		ret = 6;
+	else
+		ret = 0;
+
+	/* Detach before cleanup, also when the query step failed */
+	memset(&attr, 0, sizeof(attr));
+	attr.target_fd = cg_fd;
+	attr.attach_bpf_fd = prog_fd;
+	attr.attach_type = BPF_CGROUP_DEVICE;
+	syscall(__NR_bpf, BPF_PROG_DETACH, &attr, sizeof(attr));
+
+out_destroy_cg:
+	subcg_destroy(ve_cg_fd, cg_fd);
+out_close_prog:
+	close(prog_fd);
+	return ret;
+}
+
+/*
+ * Unshare a time namespace and enter it through time_for_children.
+ *
+ * Returns 0 on success or when /proc/self/timens_offsets is not accessible,
+ * -1 on failure.
+ */
+static int setup_timens(void)
+{
+	int ret, fd;
+
+	if (access("/proc/self/timens_offsets", F_OK))
+		return 0;
+
+	if (unshare(CLONE_NEWTIME))
+		return -1;
+
+	fd = open("/proc/self/ns/time_for_children", O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	ret = setns(fd, CLONE_NEWTIME);
+
+	close(fd);
+	return ret;
+}
+
+/*
+ * Body of the process cloned into the VE by run_vzct().
+ *
+ * Remounts /proc, maps uid 0, starts the VE by writing "START" to its
+ * ve.state file and then runs the scenario selected by @arg (a struct
+ * fargs). The return value becomes the exit status of the child: 0 on
+ * success, non-zero on a setup or scenario failure.
+ */
+static int child_func(void *arg)
+{
+	int ret;
+	int fd, ve_cg_fd;
+	struct fargs *args = (struct fargs *)arg;
+	char path[64];
+
+	ret = setup_timens();
+	if (ret)
+		return ret;
+
+	(void)umount2("/proc", MNT_DETACH);
+	ret = mount("proc", "/proc", "proc", 0, NULL);
+	if (ret < 0)
+		return 1;
+
+	fd = open("/proc/self/uid_map", O_WRONLY);
+	if (fd < 0)
+		return 1;
+
+	ret = write(fd, "0 0 1\n", 6);
+	close(fd);
+	if (ret != 6)
+		return 1;
+
+	snprintf(path, sizeof(path), "%d", args->ctid);
+	ve_cg_fd = openat(args->cgv2_fd, path, O_RDONLY | O_DIRECTORY);
+	if (ve_cg_fd < 0)
+		return 1;
+
+	ret = write_file_at(ve_cg_fd, "ve.state", "START");
+	if (ret < 0) {
+		close(ve_cg_fd);
+		return 1;
+	}
+
+	switch (args->test) {
+	case TEST_PROG_LOAD:
+		ret = test_prog_load();
+		break;
+	case TEST_PROG_LOAD_DENIED:
+		ret = test_prog_load_denied();
+		break;
+	case TEST_PROG_QUERY:
+		ret = test_prog_query(ve_cg_fd);
+		break;
+	case TEST_PROG_QUERY_EFFECTIVE_DENIED:
+		ret = test_prog_query_effective_denied(ve_cg_fd);
+		break;
+	case TEST_PROG_QUERY_ROOT_DENIED:
+		ret = test_prog_query_root_denied(ve_cg_fd);
+		break;
+	case TEST_PROG_ATTACH_QUERY:
+		ret = test_prog_attach_query(ve_cg_fd);
+		break;
+	case TEST_PROG_LOAD_OVERSIZED_DENIED:
+		ret = test_prog_load_oversized_denied();
+		break;
+	default:
+		ret = 1;
+	}
+	close(ve_cg_fd);
+	return ret;
+}
+
+/*
+ * Move the calling process into the VE cgroup @ctid, or into the root of
+ * the cgroup2 mount when @ctid is 0.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int enter_cgroup(int cgv2_fd, int ctid)
+{
+	char cg_path[64];
+	char pid_str[64];
+
+	if (ctid)
+		snprintf(cg_path, sizeof(cg_path), "%d/cgroup.procs", ctid);
+	else
+		snprintf(cg_path, sizeof(cg_path), "cgroup.procs");
+
+	snprintf(pid_str, sizeof(pid_str), "%d", getpid());
+	return write_file_at(cgv2_fd, cg_path, pid_str);
+}
+
+/*
+ * Run scenario @testid in a child process cloned into the VE @ctid.
+ *
+ * Returns the exit status of the child (0 means the scenario passed).
+ */
+static int run_vzct(struct __test_metadata *_metadata, int cgv2_fd, int ctid,
+		    int testid)
+{
+	struct fargs args = {
+		.cgv2_fd = cgv2_fd,
+		.ctid = ctid,
+		.test = testid,
+	};
+	struct clone_args cargs = {
+		.flags = CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+			 CLONE_NEWPID | CLONE_NEWUSER | CLONE_NEWNET |
+			 CLONE_NEWCGROUP | CLONE_NEWVE,
+		.exit_signal = SIGCHLD,
+	};
+	pid_t pid, waited = -1;
+	int status = 0;
+
+	/* The child is cloned from inside the VE cgroup */
+	ASSERT_GE(enter_cgroup(cgv2_fd, ctid), 0);
+
+	pid = syscall(__NR_clone3, &cargs, sizeof(cargs));
+	if (pid == 0)
+		_exit(child_func((void *)&args));
+	if (pid > 0)
+		waited = waitpid(pid, &status, 0);
+
+	/*
+	 * Leave the VE cgroup before any assertion can bail out, so that a
+	 * failure here does not keep the VE cgroup busy at teardown.
+	 */
+	enter_cgroup(cgv2_fd, 0);
+
+	ASSERT_GE(pid, 0);
+	ASSERT_GE(waited, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	return WEXITSTATUS(status);
+}
+
+FIXTURE(ve_devcg_bpf)
+{
+	int cgv2_fd;	/* fd of the cgroup2 mount root */
+	int ctid;	/* container ID picked in FIXTURE_SETUP */
+};
+
+/*
+ * Mount cgroup2, enable the controllers, set the default sysfs
+ * permissions and create a VE cgroup under the first free container ID.
+ */
+FIXTURE_SETUP(ve_devcg_bpf)
+{
+	char ctid_str[16];
+	char path[64];
+
+	self->cgv2_fd = mount_cg2_fd();
+	ASSERT_GE(self->cgv2_fd, 0);
+
+	ASSERT_EQ(write_file_at(self->cgv2_fd, "cgroup.subtree_control",
+		"+cpuset +cpu +cpuacct +io +memory +hugetlb +pids +rdma +misc +ve"), 0);
+
+	ASSERT_EQ(write_file_at(self->cgv2_fd,
+		"ve.default_sysfs_permissions", "/ rx"), 0);
+	ASSERT_EQ(write_file_at(self->cgv2_fd,
+		"ve.default_sysfs_permissions", "fs rx"), 0);
+	ASSERT_EQ(write_file_at(self->cgv2_fd,
+		"ve.default_sysfs_permissions", "fs/cgroup rw"), 0);
+
+	self->ctid = CTID_MIN;
+	while (self->ctid < CTID_MAX) {
+		snprintf(ctid_str, sizeof(ctid_str), "%d", self->ctid);
+		if (faccessat(self->cgv2_fd, ctid_str, F_OK, 0) != 0 &&
+		    errno == ENOENT)
+			break;
+		self->ctid++;
+	}
+	ASSERT_LT(self->ctid, CTID_MAX);
+
+	ASSERT_EQ(mkdirat(self->cgv2_fd, ctid_str, 0755), 0);
+
+	snprintf(path, sizeof(path), "%d/cgroup.controllers_hidden", self->ctid);
+	ASSERT_EQ(write_file_at(self->cgv2_fd, path, "-ve"), 0);
+
+	snprintf(path, sizeof(path), "%d/ve.veid", self->ctid);
+	ASSERT_EQ(write_file_at(self->cgv2_fd, path, ctid_str), 0);
+}
+
+/* Remove the VE cgroup made by FIXTURE_SETUP and close the mount fd. */
+FIXTURE_TEARDOWN(ve_devcg_bpf)
+{
+	char path[64];
+
+	/* Best-effort removal; the results are deliberately not checked */
+	snprintf(path, sizeof(path), "%d/vz.slice", self->ctid);
+	unlinkat(self->cgv2_fd, path, AT_REMOVEDIR);
+	snprintf(path, sizeof(path), "%d", self->ctid);
+	unlinkat(self->cgv2_fd, path, AT_REMOVEDIR);
+	close(self->cgv2_fd);
+}
+
+/* Write @features in decimal to the ve.features file of VE cgroup @ctid. */
+static void set_ve_features(struct __test_metadata *_metadata,
+			    int cgv2_fd, int ctid,
+			    unsigned long long features)
+{
+	char path[64];
+	char val[32];
+
+	snprintf(path, sizeof(path), "%d/ve.features", ctid);
+	snprintf(val, sizeof(val), "%llu", features);
+	ASSERT_EQ(write_file_at(cgv2_fd, path, val), 0);
+}
+
+/*
+ * With VE_FEATURE_BPF enabled, loading a BPF_PROG_TYPE_CGROUP_DEVICE
+ * program inside the VE should succeed.
+ */
+TEST_F(ve_devcg_bpf, prog_load)
+{
+	set_ve_features(_metadata, self->cgv2_fd, self->ctid,
+			VE_FEATURES_DEF | VE_FEATURE_BPF);
+	ASSERT_EQ(run_vzct(_metadata, self->cgv2_fd, self->ctid,
+			   TEST_PROG_LOAD), 0);
+}
+
+/*
+ * Without VE_FEATURE_BPF, loading a BPF_PROG_TYPE_CGROUP_DEVICE program
+ * inside the VE should fail with EPERM.
+ */
+TEST_F(ve_devcg_bpf, prog_load_denied)
+{
+	/* ve.features is left untouched, i.e. at whatever the VE defaults to */
+	ASSERT_EQ(run_vzct(_metadata, self->cgv2_fd, self->ctid,
+			   TEST_PROG_LOAD_DENIED), 0);
+}
+
+/*
+ * With VE_FEATURE_BPF, querying BPF_CGROUP_DEVICE programs on a
+ * descendant (non-root) cgroup inside VE should succeed.
+ */
+TEST_F(ve_devcg_bpf, prog_query)
+{
+	set_ve_features(_metadata, self->cgv2_fd, self->ctid,
+			VE_FEATURES_DEF | VE_FEATURE_BPF);
+	ASSERT_EQ(run_vzct(_metadata, self->cgv2_fd, self->ctid,
+			   TEST_PROG_QUERY), 0);
+}
+
+/*
+ * With VE_FEATURE_BPF, querying with BPF_F_QUERY_EFFECTIVE should be
+ * denied (EPERM) to prevent VE processes from peeking into host programs
+ * attached to ancestor cgroups.
+ */
+TEST_F(ve_devcg_bpf, prog_query_effective_denied)
+{
+	set_ve_features(_metadata, self->cgv2_fd, self->ctid,
+			VE_FEATURES_DEF | VE_FEATURE_BPF);
+	ASSERT_EQ(run_vzct(_metadata, self->cgv2_fd, self->ctid,
+			   TEST_PROG_QUERY_EFFECTIVE_DENIED), 0);
+}
+
+/*
+ * With VE_FEATURE_BPF, querying on the VE root cgroup itself should be
+ * denied (EPERM).
+ */
+TEST_F(ve_devcg_bpf, prog_query_root_denied)
+{
+	set_ve_features(_metadata, self->cgv2_fd, self->ctid,
+			VE_FEATURES_DEF | VE_FEATURE_BPF);
+	ASSERT_EQ(run_vzct(_metadata, self->cgv2_fd, self->ctid,
+			   TEST_PROG_QUERY_ROOT_DENIED), 0);
+}
+
+/*
+ * With VE_FEATURE_BPF, loading a BPF_PROG_TYPE_CGROUP_DEVICE program,
+ * attaching it to a descendant cgroup, and querying it back should all
+ * succeed. Verifies the full Docker-like workflow.
+ */
+TEST_F(ve_devcg_bpf, prog_attach_query)
+{
+	set_ve_features(_metadata, self->cgv2_fd, self->ctid,
+			VE_FEATURES_DEF | VE_FEATURE_BPF);
+	ASSERT_EQ(run_vzct(_metadata, self->cgv2_fd, self->ctid,
+			   TEST_PROG_ATTACH_QUERY), 0);
+}
+
+/*
+ * With VE_FEATURE_BPF, loading a program exceeding BPF_MAXINSNS (4096)
+ * instructions should fail with E2BIG.
+ */
+TEST_F(ve_devcg_bpf, prog_load_oversized_denied)
+{
+	set_ve_features(_metadata, self->cgv2_fd, self->ctid,
+			VE_FEATURES_DEF | VE_FEATURE_BPF);
+	ASSERT_EQ(run_vzct(_metadata, self->cgv2_fd, self->ctid,
+			   TEST_PROG_LOAD_OVERSIZED_DENIED), 0);
+}
+
+TEST_HARNESS_MAIN
-- 
2.53.0

_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel
