This is an automated email from the ASF dual-hosted git repository.

bmahler pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
     new 3990bf745 [ebpf] Implement atomic replacement of cgroup device 
programs.
3990bf745 is described below

commit 3990bf745ba95f8f9a9bafc9c3a983f191b4c12d
Author: Jason Zhou <[email protected]>
AuthorDate: Sun Jul 14 21:02:52 2024 -0400

    [ebpf] Implement atomic replacement of cgroup device programs.
    
    Currently, if we try to attach device ebpf files to the same cgroup
    multiple times, they will all be attached, and they will all be run
    when a device requests access. This conflicts with our design to have
    one ebpf file per cgroup that represents all the files they want to
    allow or deny, where that file is updated when the cgroup adds or
    removes a device. So we add a patch to atomically replace any existing
    ebpf file already attached to our target cgroup using our new ebpf file.
    
    Review: https://reviews.apache.org/r/75080/
---
 src/linux/cgroups2.cpp                     |  2 +-
 src/linux/ebpf.cpp                         | 67 ++++++++++++++++++------
 src/tests/containerizer/cgroups2_tests.cpp | 81 ++++++++++++++++++++++++++++++
 3 files changed, 135 insertions(+), 15 deletions(-)

diff --git a/src/linux/cgroups2.cpp b/src/linux/cgroups2.cpp
index 133eec1b2..cb3c425a4 100644
--- a/src/linux/cgroups2.cpp
+++ b/src/linux/cgroups2.cpp
@@ -1440,7 +1440,7 @@ Try<Nothing> configure(
   }
 
   Try<Nothing> attach = ebpf::cgroups2::attach(
-      cgroups2::path(cgroup),
+      cgroup,
       *program);
 
   if (attach.isError()) {
diff --git a/src/linux/ebpf.cpp b/src/linux/ebpf.cpp
index 50f2c6743..e74768e80 100644
--- a/src/linux/ebpf.cpp
+++ b/src/linux/ebpf.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "linux/ebpf.hpp"
+#include "linux/cgroups2.hpp"
 
 #include <linux/bpf.h>
 #include <sys/syscall.h>
@@ -25,9 +26,11 @@
 
 #include "stout/check.hpp"
 #include "stout/error.hpp"
+#include "stout/none.hpp"
 #include "stout/nothing.hpp"
 #include "stout/os/close.hpp"
 #include "stout/os/open.hpp"
+#include "stout/stringify.hpp"
 #include "stout/try.hpp"
 
 using std::string;
@@ -125,25 +128,44 @@ Try<int> bpf_get_fd_by_id(uint32_t prog_id)
 
 
 // Attaches the eBPF program identified by the provided fd to a cgroup.
-//
-// TODO(dleamy): This currently does not replace existing programs attached
-// to the cgroup, we will need to add replacement to support adding / removing
-// device access dynamically.
-Try<Nothing> attach(const string& cgroup, int fd)
+// If there is an existing ebpf program at the cgroup, we will replace
+// it atomically with the provided ebpf program.
+Try<Nothing> attach(const string& cgroup, int new_program_fd)
 {
-  Try<int> cgroup_fd = os::open(cgroup, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+  string cgroup_path = ::cgroups2::path(cgroup);
+
+  Try<int> cgroup_fd =
+    os::open(cgroup_path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
   if (cgroup_fd.isError()) {
-    return Error("Failed to open '" + cgroup + "': " + cgroup_fd.error());
+    return Error("Failed to open cgroup: " + cgroup_fd.error());
+  }
+
+  Try<vector<uint32_t>> attached = ebpf::cgroups2::attached(cgroup_path);
+  if (attached.isError()) {
+    return Error("Failed to retrieve attached bpf device programs: "
+                 + attached.error());
+  } else if (attached->size() > 1) {
+    return Error("Detected multiple (" + stringify(attached->size()) + ")"
+                 " BPF_CGROUP_DEVICE attached to cgroup");
+  }
+
+  Result<int> old_program_fd = None();
+
+  if (attached->size() == 1) {
+    uint32_t old_program_id = attached->at(0);
+    old_program_fd = bpf_get_fd_by_id(old_program_id);
+    if (old_program_fd.isError()) {
+      return Error("Failed to open existing program fd for id "
+                   + stringify(old_program_id) + ": " + 
old_program_fd.error());
+    }
   }
 
   bpf_attr attr;
   memset(&attr, 0, sizeof(attr));
   attr.attach_type = BPF_CGROUP_DEVICE;
   attr.target_fd = *cgroup_fd;
-  attr.attach_bpf_fd = fd;
+  attr.attach_bpf_fd = new_program_fd;
 
-  // TODO(dleamy): Replace any existing attached programs here!
-  //
   // BPF_F_ALLOW_MULTI allows multiple eBPF programs to be attached to a single
   // cgroup and determines how the programs up and down the hierarchy will run.
   //
@@ -163,13 +185,24 @@ Try<Nothing> attach(const string& cgroup, int fd)
   // cgroup4 run order: F, D, E, A, B
   // cgroup2 run order: C, A, B
   //
+  // If any program in the run order returns a non-successful code, the device
+  // access will be denied. Thus, we opt to atomically replace the existing
+  // ebpf device file so that we have exactly one file as our source of truth
+  // for device access.
   // For full details, see:
   // 
https://elixir.bootlin.com/linux/v6.7.9/source/include/uapi/linux/bpf.h#L1090
   attr.attach_flags = BPF_F_ALLOW_MULTI;
+  if (old_program_fd.isSome()) {
+    attr.attach_flags |= BPF_F_REPLACE;
+    attr.replace_bpf_fd = *old_program_fd;
+  }
 
   Try<int, ErrnoError> result = bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
 
   os::close(*cgroup_fd);
+  if (old_program_fd.isSome()) {
+    os::close(*old_program_fd);
+  }
 
   if (result.isError()) {
     return Error("BPF program attach syscall failed: "
@@ -199,9 +232,12 @@ Try<Nothing> attach(const string& cgroup, const Program& 
program)
 
 Try<vector<uint32_t>> attached(const string& cgroup)
 {
-  Try<int> cgroup_fd = os::open(cgroup, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+  string cgroup_path = ::cgroups2::path(cgroup);
+
+  Try<int> cgroup_fd =
+    os::open(cgroup_path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
   if (cgroup_fd.isError()) {
-    return Error("Failed to open '" + cgroup + "': " + cgroup_fd.error());
+    return Error("Failed to open '" + cgroup_path + "': " + cgroup_fd.error());
   }
 
   // Program ids are unsigned 32-bit integers. We assume that a maximum
@@ -238,9 +274,12 @@ Try<vector<uint32_t>> attached(const string& cgroup)
 // and program id. Returns Nothing() on success or if no program is found.
 Try<Nothing> detach(const string& cgroup, uint32_t program_id)
 {
-  Try<int> cgroup_fd = os::open(cgroup, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+  string cgroup_path = ::cgroups2::path(cgroup);
+
+  Try<int> cgroup_fd =
+    os::open(cgroup_path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
   if (cgroup_fd.isError()) {
-    return Error("Failed to open '" + cgroup + "': " + cgroup_fd.error());
+    return Error("Failed to open '" + cgroup_path + "': " + cgroup_fd.error());
   }
 
   Try<int> program_fd = bpf_get_fd_by_id(program_id);
diff --git a/src/tests/containerizer/cgroups2_tests.cpp 
b/src/tests/containerizer/cgroups2_tests.cpp
index 2ee39b342..3982e2598 100644
--- a/src/tests/containerizer/cgroups2_tests.cpp
+++ b/src/tests/containerizer/cgroups2_tests.cpp
@@ -760,6 +760,87 @@ INSTANTIATE_TEST_CASE_P(
       ));
 
 
+TEST_F(Cgroups2Test, ROOT_CGROUPS2_AtomicReplace)
+{
+  const string& cgroup = TEST_CGROUP;
+  const vector<devices::Entry>& allow = {*devices::Entry::parse("c 1:3 w")};
+  const vector<devices::Entry>& deny = {};
+  const vector<OpenArgs>& allowed_accesses = {
+      {os::DEV_NULL, O_WRONLY}
+  };
+  const vector<OpenArgs>& blocked_accesses = {
+      {os::DEV_NULL, O_RDWR},
+      {os::DEV_NULL, O_RDONLY}
+  };
+  const vector<devices::Entry>& to_be_replaced_allow = {};
+  const vector<devices::Entry>& to_be_replaced_deny = {
+      *devices::Entry::parse("c 1:3 w")
+  };
+
+  ASSERT_SOME(cgroups2::create(cgroup));
+  string path = cgroups2::path(cgroup);
+
+  Try<vector<uint32_t>> attached = ebpf::cgroups2::attached(path);
+  ASSERT_SOME(attached);
+  ASSERT_EQ(0u, attached->size());
+
+  ASSERT_SOME(
+    devices::configure(cgroup, to_be_replaced_allow, to_be_replaced_deny));
+  attached = ebpf::cgroups2::attached(path);
+  ASSERT_SOME(attached);
+  ASSERT_EQ(1u, attached->size());
+
+  ASSERT_SOME(devices::configure(cgroup, allow, deny));
+  attached = ebpf::cgroups2::attached(path);
+  ASSERT_SOME(attached);
+  ASSERT_EQ(1u, attached->size());
+
+  pid_t pid = ::fork();
+  ASSERT_NE(-1, pid);
+
+  if (pid == 0) {
+    // Move the child process into the newly created cgroup.
+    Try<Nothing> assign = cgroups2::assign(cgroup, ::getpid());
+    if (assign.isError()) {
+      SAFE_EXIT(EXIT_FAILURE, "Failed to assign child process to cgroup");
+    }
+
+    // Check that we can only do the "allowed_accesses".
+    foreach(const OpenArgs& args, allowed_accesses) {
+      if (os::open(args.first, args.second).isError()) {
+        SAFE_EXIT(EXIT_FAILURE, "Expected allowed read to succeed");
+      }
+    }
+    foreach(const OpenArgs& args, blocked_accesses) {
+      if (os::open(args.first, args.second).isSome()) {
+        SAFE_EXIT(EXIT_FAILURE, "Expected blocked read to fail");
+      }
+    }
+
+    ASSERT_SOME(ebpf::cgroups2::detach(path, attached->at(0)));
+
+    // Check that we can do both the "allowed_accesses" and "blocked_accesses".
+    foreach(const OpenArgs& args, allowed_accesses) {
+      if (os::open(args.first, args.second).isError()) {
+        SAFE_EXIT(EXIT_FAILURE, "Expected successful read after detaching"
+                                " device controller program");
+      }
+    }
+    foreach(const OpenArgs& args, blocked_accesses) {
+      if (os::open(args.first, args.second).isError()) {
+        SAFE_EXIT(EXIT_FAILURE, "Expected successful read after detaching"
+                                " device controller program");
+      }
+    }
+
+    ::_exit(EXIT_SUCCESS);
+  }
+
+  AWAIT_EXPECT_WEXITSTATUS_EQ(EXIT_SUCCESS, process::reap(pid));
+}
+
+
+
 TEST_F(Cgroups2Test, ROOT_CGROUPS2_GetBpfFdById)
 {
   const string& cgroup = TEST_CGROUP;

Reply via email to