This is an automated email from the ASF dual-hosted git repository.

bmahler pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
     new 5882d4148 [cgroups2] Support device access permissions via ebpf.
5882d4148 is described below

commit 5882d41485a839449a3183984d2d111304fb961a
Author: Devin Leamy <[email protected]>
AuthorDate: Fri Mar 22 17:55:15 2024 -0400

    [cgroups2] Support device access permissions via ebpf.
    
    Cgroups v2 uses eBPF to manage device access permissions.
    
    For each cgroup, we attach a `BPF_PROG_TYPE_CGROUP_DEVICE` program to the
    `BPF_CGROUP_DEVICE` eBPF kernel hook. The attached program will be run for
    every device access made from within the cgroup. If the program returns `0`,
    the device access is denied; otherwise it is allowed.
    
    For more information about the Device Controller in cgroups v2:
    https://docs.kernel.org/admin-guide/cgroup-v2.html#device-controller
    
    This closes #519
---
 src/linux/cgroups2.cpp | 190 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/linux/cgroups2.hpp |  16 +++++
 2 files changed, 206 insertions(+)

diff --git a/src/linux/cgroups2.cpp b/src/linux/cgroups2.cpp
index de79c7fc1..13b6e7dff 100644
--- a/src/linux/cgroups2.cpp
+++ b/src/linux/cgroups2.cpp
@@ -25,9 +25,11 @@
 #include <stout/numify.hpp>
 #include <stout/os.hpp>
 #include <stout/path.hpp>
+#include <stout/unreachable.hpp>
 #include <stout/stringify.hpp>
 #include <stout/try.hpp>
 
+#include "linux/ebpf.hpp"
 #include "linux/fs.hpp"
 
 using std::ostream;
@@ -486,4 +488,192 @@ Try<uint64_t> weight(const string& cgroup)
 
 } // namespace cpu {
 
+namespace devices {
+
+// Utility class to construct an eBPF program of type
+// `BPF_PROG_TYPE_CGROUP_DEVICE` that allows or denies select device
+// accesses.
+//
+// The generated program consists of a prologue that unpacks the access
+// request into registers, followed by one block of bytecode per entry, in
+// the order in which `allow()` / `deny()` were called. The first block whose
+// entry matches the request decides the outcome. A request that matches no
+// block is denied.
+class DeviceProgram
+{
+public:
+  DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+  {
+    // The BPF_PROG_TYPE_CGROUP_DEVICE program takes in
+    // `struct bpf_cgroup_dev_ctx*` (in r1) as input. We extract its fields
+    // into registers r2-r5.
+    //
+    // The device type is encoded in the low 16 bits of `access_type` and
+    // the requested access is encoded in the high 16 bits of `access_type`.
+    program.append({
+      // r2: Device type (BPF_DEVCG_DEV_*).
+      BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+        offsetof(bpf_cgroup_dev_ctx, access_type)),
+      BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+      // r3: Requested access (bitmask of BPF_DEVCG_ACC_*).
+      BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+        offsetof(bpf_cgroup_dev_ctx, access_type)),
+      BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+      // r4: Device major number.
+      BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+        offsetof(bpf_cgroup_dev_ctx, major)),
+      // r5: Device minor number.
+      BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+        offsetof(bpf_cgroup_dev_ctx, minor)),
+    });
+  }
+
+  // Appends a block that allows a request whose device matches `entry` and
+  // whose requested access is fully covered by the access of `entry`.
+  Try<Nothing> allow(const Entry& entry) { return addDevice(entry, true); }
+
+  // Appends a block that denies a request whose device matches `entry` and
+  // whose requested access overlaps the access of `entry`.
+  Try<Nothing> deny(const Entry& entry) { return addDevice(entry, false); }
+
+  // Returns the program. Unless a catch-all entry already terminates it, a
+  // trailing "deny" is appended so that unmatched requests are denied.
+  ebpf::Program build()
+  {
+    if (!hasCatchAll) {
+      // Exit instructions.
+      // If no entry decided the access, then deny the access.
+      program.append({
+        BPF_MOV64_IMM(BPF_REG_0, DENY_ACCESS),
+        BPF_EXIT_INSN(),
+      });
+    }
+    return program;
+  }
+
+private:
+  // Appends the block of bytecode for `entry`. Entries added after a
+  // catch-all entry are ignored, see `hasCatchAll`.
+  Try<Nothing> addDevice(const Entry& entry, bool allow)
+  {
+    if (hasCatchAll) {
+      return Nothing();
+    }
+
+    // We create a block of bytecode with the format:
+    // 1. Major Number Check
+    // 2. Minor Number Check
+    // 3. Type Check
+    // 4. Access Check
+    // 5. Allow/Deny Access
+    //
+    // 6. NEXT BLOCK
+    //
+    // Either:
+    // 1. The device access is matched by (1,2,3,4) and the Allow/Deny access
+    //    block (5) is executed.
+    // 2. One of (1,2,3,4) does not match the requested access and we skip
+    //    to the next block (6).
+
+    const Entry::Selector& selector = entry.selector;
+    const Entry::Access& access = entry.access;
+
+    // A check is only emitted for the fields that the entry constrains.
+    const bool check_major = selector.major.isSome();
+    const bool check_minor = selector.minor.isSome();
+    const bool check_type = selector.type != Entry::Selector::Type::ALL;
+    const bool check_access =
+      !access.mknod || !access.read || !access.write;
+
+    // Number of instructions to the [NEXT BLOCK], relative to the
+    // instruction that follows the jump being emitted. This is used if a
+    // check fails (meaning this entry does not apply) and we want to skip
+    // the subsequent checks. The leading 1 accounts for the two Allow/Deny
+    // instructions minus the jump instruction itself; the access check
+    // occupies three instructions, every other check occupies one.
+    short jmp_size = static_cast<short>(
+        1 + (check_major ? 1 : 0) + (check_minor ? 1 : 0) +
+        (check_type ? 1 : 0) + (check_access ? 3 : 0));
+
+    // Check major number (r4) against entry.
+    if (check_major) {
+      program.append({
+        BPF_JMP_IMM(
+          BPF_JNE,
+          BPF_REG_4,
+          static_cast<int>(selector.major.get()),
+          jmp_size),
+      });
+      --jmp_size;
+    }
+
+    // Check minor number (r5) against entry.
+    if (check_minor) {
+      program.append({
+        BPF_JMP_IMM(
+          BPF_JNE,
+          BPF_REG_5,
+          static_cast<int>(selector.minor.get()),
+          jmp_size),
+      });
+      --jmp_size;
+    }
+
+    // Check type (r2) against entry.
+    if (check_type) {
+      const int bpf_type = [&selector]() -> int {
+        switch (selector.type) {
+          case Entry::Selector::Type::BLOCK:     return BPF_DEVCG_DEV_BLOCK;
+          case Entry::Selector::Type::CHARACTER: return BPF_DEVCG_DEV_CHAR;
+          case Entry::Selector::Type::ALL:       UNREACHABLE();
+        }
+        UNREACHABLE();
+      }();
+
+      program.append({
+        BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, jmp_size),
+      });
+      --jmp_size;
+    }
+
+    // Check access (r3) against entry.
+    if (check_access) {
+      int bpf_access = 0;
+      bpf_access |= access.read ? BPF_DEVCG_ACC_READ : 0;
+      bpf_access |= access.write ? BPF_DEVCG_ACC_WRITE : 0;
+      bpf_access |= access.mknod ? BPF_DEVCG_ACC_MKNOD : 0;
+
+      // The jump is the last of the three access check instructions, so
+      // two fewer instructions have to be skipped than `jmp_size` says.
+      const short skip = static_cast<short>(jmp_size - 2);
+
+      // r1 = requested access & entry access. The context pointer that r1
+      // originally held is no longer needed at this point.
+      program.append({
+        BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+        BPF_ALU32_IMM(BPF_AND, BPF_REG_1, bpf_access),
+      });
+
+      if (allow) {
+        // An allow entry only applies if it covers everything requested,
+        // i.e. the requested access is a subset of the entry's access.
+        program.append({
+          BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, skip),
+        });
+      } else {
+        // A deny entry applies as soon as any requested access is denied,
+        // i.e. the requested access overlaps the entry's access. Otherwise
+        // a request for 'rw' would slip past an entry that denies 'r'.
+        program.append({
+          BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, skip),
+        });
+      }
+      jmp_size -= 3;
+    }
+
+    if (!check_major && !check_minor && !check_type && !check_access) {
+      // The exit instructions as well as any additional device entries would
+      // generate unreachable blocks.
+      hasCatchAll = true;
+    }
+
+    // Allow/Deny access block.
+    program.append({
+      BPF_MOV64_IMM(BPF_REG_0, allow ? ALLOW_ACCESS : DENY_ACCESS),
+      BPF_EXIT_INSN(),
+    });
+
+    return Nothing();
+  }
+
+  ebpf::Program program;
+
+  // Whether the program has a device entry that allows or denies ALL accesses.
+  // Such cases need to be specially handled because any instructions added
+  // after it will be unreachable, and thus will cause the eBPF verifier to
+  // reject the program.
+  bool hasCatchAll = false;
+
+  // Exit codes of the program, stored in r0 before exiting.
+  static const int ALLOW_ACCESS = 1;
+  static const int DENY_ACCESS = 0;
+};
+
+
+// Builds a `BPF_PROG_TYPE_CGROUP_DEVICE` program from the provided entries
+// and attaches it to the cgroup.
+//
+// Deny entries are added to the program before allow entries. The program
+// evaluates entries in the order they were added and the first matching
+// entry decides the outcome, so this gives deny entries precedence over
+// allow entries. If the allow entries came first, a deny entry could only
+// be reached by accesses that no allow entry matched, and those are denied
+// by default anyway.
+//
+// Returns an error if an entry cannot be added to the program or if the
+// program cannot be attached to the cgroup.
+Try<Nothing> configure(
+    const string& cgroup,
+    const vector<Entry>& allow,
+    const vector<Entry>& deny)
+{
+  DeviceProgram program;
+
+  foreach (const Entry& entry, deny) {
+    Try<Nothing> denied = program.deny(entry);
+    if (denied.isError()) {
+      return Error("Failed to add deny entry to device program: " +
+                   denied.error());
+    }
+  }
+
+  foreach (const Entry& entry, allow) {
+    Try<Nothing> allowed = program.allow(entry);
+    if (allowed.isError()) {
+      return Error("Failed to add allow entry to device program: " +
+                   allowed.error());
+    }
+  }
+
+  Try<Nothing> attach = ebpf::cgroups2::attach(
+      cgroups2::path(cgroup),
+      program.build());
+
+  if (attach.isError()) {
+    return Error("Failed to attach BPF_PROG_TYPE_CGROUP_DEVICE program: " +
+                 attach.error());
+  }
+
+  return Nothing();
+}
+
+} // namespace devices {
+
 } // namespace cgroups2 {
diff --git a/src/linux/cgroups2.hpp b/src/linux/cgroups2.hpp
index 1913a750a..d3c829ea5 100644
--- a/src/linux/cgroups2.hpp
+++ b/src/linux/cgroups2.hpp
@@ -24,6 +24,8 @@
 #include <stout/nothing.hpp>
 #include <stout/try.hpp>
 
+#include "linux/cgroups.hpp"
+
 namespace cgroups2 {
 
 // Root cgroup in the cgroup v2 hierarchy. Since the root cgroup has the same
@@ -116,6 +118,20 @@ Try<uint64_t> weight(const std::string& cgroup);
 
 } // namespace cpu {
 
+namespace devices {
+
+using cgroups::devices::Entry;
+
+// Configure the device access permissions for the cgroup. These permissions
+// are hierarchical. I.e. if a parent cgroup does not allow an access then
+// 'this' cgroup will be denied access.
+//
+// The permissions are enforced by an eBPF program of type
+// `BPF_PROG_TYPE_CGROUP_DEVICE` that is generated from the `allow` and
+// `deny` entries and attached to the cgroup. An access that matches none of
+// the entries is denied.
+//
+// Returns an error if the program cannot be attached to the cgroup.
+Try<Nothing> configure(
+    const std::string& cgroup,
+    const std::vector<Entry>& allow,
+    const std::vector<Entry>& deny);
+
+} // namespace devices {
+
 } // namespace cgroups2 {
 
 #endif // __CGROUPS_V2_HPP__

Reply via email to