This is an automated email from the ASF dual-hosted git repository.
bmahler pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git
The following commit(s) were added to refs/heads/master by this push:
new 5882d4148 [cgroups2] Support device access permissions via ebpf.
5882d4148 is described below
commit 5882d41485a839449a3183984d2d111304fb961a
Author: Devin Leamy <[email protected]>
AuthorDate: Fri Mar 22 17:55:15 2024 -0400
[cgroups2] Support device access permissions via ebpf.
Cgroups v2 uses eBPF to manage device access permissions.
For each cgroup, we attach a program of type `BPF_PROG_TYPE_CGROUP_DEVICE` to
the `BPF_CGROUP_DEVICE` eBPF kernel hook. The attached program will be run for
every device access made from within the cgroup. If the program returns a `0`
exit code the device access is denied, otherwise it is allowed.
For more information about the Device Controller in cgroups v2:
https://docs.kernel.org/admin-guide/cgroup-v2.html#device-controller
This closes #519
---
src/linux/cgroups2.cpp | 190 +++++++++++++++++++++++++++++++++++++++++++++++++
src/linux/cgroups2.hpp | 16 +++++
2 files changed, 206 insertions(+)
diff --git a/src/linux/cgroups2.cpp b/src/linux/cgroups2.cpp
index de79c7fc1..13b6e7dff 100644
--- a/src/linux/cgroups2.cpp
+++ b/src/linux/cgroups2.cpp
@@ -25,9 +25,11 @@
#include <stout/numify.hpp>
#include <stout/os.hpp>
#include <stout/path.hpp>
+#include <stout/unreachable.hpp>
#include <stout/stringify.hpp>
#include <stout/try.hpp>
+#include "linux/ebpf.hpp"
#include "linux/fs.hpp"
using std::ostream;
@@ -486,4 +488,192 @@ Try<uint64_t> weight(const string& cgroup)
} // namespace cpu {
+namespace devices {
+
+// Utility class to construct an eBPF program (of type
+// `BPF_PROG_TYPE_CGROUP_DEVICE`) that whitelists or blacklists select
+// device accesses.
+//
+// Entries are compiled into consecutive blocks of bytecode in the order in
+// which `allow()` / `deny()` are called. At run time the first block whose
+// checks all match the device access decides the outcome; if no block
+// matches, the access is denied (see `build()`).
+class DeviceProgram
+{
+public:
+  DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+  {
+    // The BPF_PROG_TYPE_CGROUP_DEVICE program takes in
+    // `struct bpf_cgroup_dev_ctx*` as input (in r1). We extract the fields
+    // into registers r2-5 once, so that every entry block can compare
+    // against them.
+    //
+    // The device type is encoded in the low 16 bits of `access_type` and
+    // the access type is encoded in the high 16 bits of `access_type`.
+    program.append({
+      // r2: Type ('c', 'b', '?')
+      BPF_LDX_MEM(
+          BPF_W, BPF_REG_2, BPF_REG_1,
+          offsetof(bpf_cgroup_dev_ctx, access_type)),
+      BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+      // r3: Access ('r', 'w', 'm')
+      BPF_LDX_MEM(
+          BPF_W, BPF_REG_3, BPF_REG_1,
+          offsetof(bpf_cgroup_dev_ctx, access_type)),
+      BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+      // r4: Major device number
+      BPF_LDX_MEM(
+          BPF_W, BPF_REG_4, BPF_REG_1,
+          offsetof(bpf_cgroup_dev_ctx, major)),
+      // r5: Minor device number
+      BPF_LDX_MEM(
+          BPF_W, BPF_REG_5, BPF_REG_1,
+          offsetof(bpf_cgroup_dev_ctx, minor)),
+    });
+  }
+
+  // Appends a block that allows accesses matched by `entry`.
+  Try<Nothing> allow(const Entry& entry) { return addDevice(entry, true); }
+
+  // Appends a block that denies accesses matched by `entry`.
+  Try<Nothing> deny(const Entry& entry) { return addDevice(entry, false); }
+
+  // Returns the assembled program. Unless a catch-all entry already
+  // terminated the program, a trailing "deny" exit is appended so that any
+  // access not matched by an entry is denied.
+  ebpf::Program build()
+  {
+    if (!hasCatchAll) {
+      // Exit instructions.
+      // If no entry granted access, then deny the access.
+      program.append({
+        BPF_MOV64_IMM(BPF_REG_0, DENY_ACCESS),
+        BPF_EXIT_INSN(),
+      });
+    }
+    return program;
+  }
+
+private:
+  // Appends the bytecode block for `entry`; `allow` selects whether a
+  // matching access is allowed or denied.
+  //
+  // Once a catch-all entry (no major, minor, type or access restriction)
+  // has been added, any further entries are ignored and `Nothing()` is
+  // returned, because they could never be reached.
+  Try<Nothing> addDevice(const Entry& entry, bool allow)
+  {
+    if (hasCatchAll) {
+      return Nothing();
+    }
+
+    // We create a block of bytecode with the format:
+    // 1. Major Number Check
+    // 2. Minor Number Check
+    // 3. Type Check
+    // 4. Access Check
+    // 5. Allow/Deny Access
+    //
+    // 6. NEXT BLOCK
+    //
+    // Either:
+    // 1. The device access is matched by (1,2,3,4) and the Allow/Deny access
+    //    block (5) is executed.
+    // 2. One of (1,2,3,4) does not match the requested access and we skip
+    //    to the next block (6).
+
+    const Entry::Selector& selector = entry.selector;
+    const Entry::Access& access = entry.access;
+
+    // A check is only emitted when the entry actually restricts that field.
+    const bool check_major = selector.major.isSome();
+    const bool check_minor = selector.minor.isSome();
+    const bool check_type = selector.type != Entry::Selector::Type::ALL;
+    const bool check_access = !access.mknod || !access.read || !access.write;
+
+    // Number of instructions to the [NEXT BLOCK]. This is used if a check
+    // fails (meaning this entry does not apply) and we want to skip the
+    // subsequent checks. Jump offsets are relative to the instruction that
+    // follows the jump, hence the count covers the remaining checks plus
+    // the two Allow/Deny instructions, minus the jump itself. It is
+    // decremented as instructions are emitted.
+    short jmp_size = 1 + (check_major ? 1 : 0) + (check_minor ? 1 : 0) +
+                     (check_access ? 3 : 0) + (check_type ? 1 : 0);
+
+    // Check major number (r4) against entry.
+    if (check_major) {
+      program.append({
+        BPF_JMP_IMM(
+            BPF_JNE, BPF_REG_4, static_cast<int>(selector.major.get()),
+            jmp_size),
+      });
+      --jmp_size;
+    }
+
+    // Check minor number (r5) against entry.
+    if (check_minor) {
+      program.append({
+        BPF_JMP_IMM(
+            BPF_JNE, BPF_REG_5, static_cast<int>(selector.minor.get()),
+            jmp_size),
+      });
+      --jmp_size;
+    }
+
+    // Check type (r2) against entry.
+    if (check_type) {
+      const int bpf_type = [&selector]() -> int {
+        switch (selector.type) {
+          case Entry::Selector::Type::BLOCK: return BPF_DEVCG_DEV_BLOCK;
+          case Entry::Selector::Type::CHARACTER: return BPF_DEVCG_DEV_CHAR;
+          case Entry::Selector::Type::ALL: UNREACHABLE();
+        }
+        // Guards against falling off the end of a value-returning lambda
+        // should `type` ever hold a value outside of the enumerators.
+        UNREACHABLE();
+      }();
+
+      program.append({
+        BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, jmp_size),
+      });
+      --jmp_size;
+    }
+
+    // Check access (r3) against entry. The entry matches only if every
+    // requested access bit is part of the entry's access set, i.e.
+    // `(requested & entry) == requested`. r1 is used as scratch space; the
+    // context pointer it held is no longer needed after the constructor's
+    // loads.
+    if (check_access) {
+      int bpf_access = 0;
+      bpf_access |= access.read ? BPF_DEVCG_ACC_READ : 0;
+      bpf_access |= access.write ? BPF_DEVCG_ACC_WRITE : 0;
+      bpf_access |= access.mknod ? BPF_DEVCG_ACC_MKNOD : 0;
+
+      program.append({
+        BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+        BPF_ALU32_IMM(BPF_AND, BPF_REG_1, bpf_access),
+        // The jump is the third instruction of this check, so two of the
+        // three instructions counted in `jmp_size` are already behind it.
+        BPF_JMP_REG(
+            BPF_JNE, BPF_REG_1, BPF_REG_3, static_cast<short>(jmp_size - 2)),
+      });
+      jmp_size -= 3;
+    }
+
+    if (!check_major && !check_minor && !check_type && !check_access) {
+      // The exit instructions as well as any additional device entries would
+      // generate unreachable blocks.
+      hasCatchAll = true;
+    }
+
+    // Allow/Deny access block.
+    program.append({
+      BPF_MOV64_IMM(BPF_REG_0, allow ? ALLOW_ACCESS : DENY_ACCESS),
+      BPF_EXIT_INSN(),
+    });
+
+    return Nothing();
+  }
+
+  ebpf::Program program;
+
+  // Whether the program has a device entry that allows or denies ALL accesses.
+  // Such cases need to be specially handled because any instructions added
+  // after it will be unreachable, and thus will cause the eBPF verifier to
+  // reject the program.
+  bool hasCatchAll = false;
+
+  // Exit codes of the program: a `0` exit code denies the device access,
+  // any other exit code allows it.
+  static const int ALLOW_ACCESS = 1;
+  static const int DENY_ACCESS = 0;
+};
+
+
+// Builds a device eBPF program from the `allow` and `deny` entries and
+// attaches it to `cgroup`.
+//
+// Entries are added to the program in the order: all of `allow`, then all
+// of `deny`. Returns an Error if an entry cannot be added to the program or
+// if attaching the program to the cgroup fails.
+//
+// NOTE(review): The program evaluates entries in insertion order and the
+// first match wins, and accesses matched by no entry are denied by default.
+// Consequently an access matched by an `allow` entry is never checked
+// against `deny`, so `deny` entries currently cannot revoke anything that an
+// `allow` entry grants. Confirm that this precedence is intended.
+Try<Nothing> configure(
+    const string& cgroup,
+    const vector<Entry>& allow,
+    const vector<Entry>& deny)
+{
+  DeviceProgram program;
+
+  foreach (const Entry& entry, allow) {
+    Try<Nothing> added = program.allow(entry);
+    if (added.isError()) {
+      return Error("Failed to add an allow entry to the device program: " +
+                   added.error());
+    }
+  }
+
+  foreach (const Entry& entry, deny) {
+    Try<Nothing> added = program.deny(entry);
+    if (added.isError()) {
+      return Error("Failed to add a deny entry to the device program: " +
+                   added.error());
+    }
+  }
+
+  Try<Nothing> attach = ebpf::cgroups2::attach(
+      cgroups2::path(cgroup),
+      program.build());
+
+  if (attach.isError()) {
+    return Error("Failed to attach BPF_PROG_TYPE_CGROUP_DEVICE program: " +
+                 attach.error());
+  }
+
+  return Nothing();
+}
+
+} // namespace devices {
+
} // namespace cgroups2 {
diff --git a/src/linux/cgroups2.hpp b/src/linux/cgroups2.hpp
index 1913a750a..d3c829ea5 100644
--- a/src/linux/cgroups2.hpp
+++ b/src/linux/cgroups2.hpp
@@ -24,6 +24,8 @@
#include <stout/nothing.hpp>
#include <stout/try.hpp>
+#include "linux/cgroups.hpp"
+
namespace cgroups2 {
// Root cgroup in the cgroup v2 hierarchy. Since the root cgroup has the same
@@ -116,6 +118,20 @@ Try<uint64_t> weight(const std::string& cgroup);
} // namespace cpu {
+namespace devices {
+
+using cgroups::devices::Entry;
+
+// Configure the device access permissions for the cgroup. These permissions
+// are hierarchical. I.e. if a parent cgroup does not allow an access then
+// 'this' cgroup will be denied access.
+//
+// The permissions are enforced by an eBPF program that is built from the
+// entries and attached to the cgroup. The `allow` entries are added to the
+// program before the `deny` entries, and a device access that is matched by
+// no entry is denied.
+//
+// Returns an Error if the eBPF program could not be attached to the cgroup.
+Try<Nothing> configure(
+ const std::string& cgroup,
+ const std::vector<Entry>& allow,
+ const std::vector<Entry>& deny);
+
+} // namespace devices {
+
} // namespace cgroups2 {
#endif // __CGROUPS_V2_HPP__