This is an automated email from the ASF dual-hosted git repository.

bmahler pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
     new 8d9048a4c [cgroups2] Add device manager recovery support.
8d9048a4c is described below

commit 8d9048a4c27aad4e343d7d49d2539e6dec895e68
Author: Jason Zhou <[email protected]>
AuthorDate: Mon Aug 12 17:04:09 2024 -0400

    [cgroups2] Add device manager recovery support.
    
    We currently do not have any method of recovering the device access
    states when the cgroups2 isolator is atempting to recover containers.
    
    We add a recovery state here that makes use of the protobuf checkpoint
    files to ensure that the previous device accesses of cgroups can be
    restored. It will be used by the cgroups2 isolator.
    
    Review: https://reviews.apache.org/r/75145/
---
 .../device_manager/device_manager.cpp              | 132 ++++++++++++++++++++-
 .../device_manager/device_manager.hpp              |   7 ++
 src/tests/device_manager_tests.cpp                 |  66 +++++++++++
 3 files changed, 202 insertions(+), 3 deletions(-)

diff --git a/src/slave/containerizer/device_manager/device_manager.cpp 
b/src/slave/containerizer/device_manager/device_manager.cpp
index 8ffbf5ad1..c245e4104 100644
--- a/src/slave/containerizer/device_manager/device_manager.cpp
+++ b/src/slave/containerizer/device_manager/device_manager.cpp
@@ -23,15 +23,20 @@
 #include <process/process.hpp>
 
 #include <stout/foreach.hpp>
+#include <stout/hashset.hpp>
+#include <stout/os/exists.hpp>
 #include <stout/stringify.hpp>
 #include <stout/unreachable.hpp>
 
 #include "slave/containerizer/device_manager/device_manager.hpp"
 #include "slave/containerizer/device_manager/state.hpp"
+#include "slave/containerizer/mesos/paths.hpp"
 #include "slave/paths.hpp"
 #include "slave/state.hpp"
 #include "linux/cgroups2.hpp"
 
+using google::protobuf::RepeatedPtrField;
+
 using std::string;
 using std::vector;
 
@@ -42,6 +47,8 @@ using process::Owned;
 
 using cgroups::devices::Entry;
 
+using mesos::slave::ContainerState;
+
 namespace mesos {
 namespace internal {
 namespace slave {
@@ -114,9 +121,10 @@ Try<vector<DeviceManager::NonWildcardEntry>>
 class DeviceManagerProcess : public process::Process<DeviceManagerProcess>
 {
 public:
-  DeviceManagerProcess(const string& work_dir)
+  DeviceManagerProcess(const Flags& flags)
     : ProcessBase(process::ID::generate("device-manager")),
-      meta_dir(paths::getMetaRootDir(work_dir)) {}
+      meta_dir(paths::getMetaRootDir(flags.work_dir)),
+      cgroups_root(flags.cgroups_root) {}
 
   Future<Nothing> configure(
       const string& cgroup,
@@ -224,9 +232,121 @@ public:
     return Nothing();
   }
 
+  Future<Nothing> recover(const vector<ContainerState>& states)
+  {
+    hashset<string> cgroups_to_recover;
+    foreach(const ContainerState& state, states) {
+      cgroups_to_recover.insert(containerizer::paths::cgroups2::container(
+          cgroups_root, state.container_id(), false));
+    }
+
+    const string checkpoint_path = paths::getDevicesStatePath(meta_dir);
+    if (!os::exists(checkpoint_path)) {
+      return Nothing(); // This happens on the first run.
+    }
+
+    Result<CgroupDeviceAccessStates> device_states =
+      state::read<CgroupDeviceAccessStates>(checkpoint_path);
+
+    if (device_states.isError()) {
+      return Failure("Failed to read device configuration info from"
+                     " '" + checkpoint_path + "': " + device_states.error());
+    } else if (device_states.isNone()) {
+      LOG(WARNING) << "The device info file at '" << checkpoint_path << "'"
+                   << " is empty";
+      return Nothing();
+    }
+    CHECK_SOME(device_states);
+
+    vector<string> recovered_cgroups = {};
+    foreach (const auto& entry, device_states->device_access_per_cgroup()) {
+      const string& cgroup = entry.first;
+      const CgroupDeviceAccessState& state = entry.second;
+
+      if (!cgroups_to_recover.contains(cgroup)) {
+        LOG(WARNING)
+          << "The cgroup '" << cgroup << "' from the device manager's"
+             " checkpointed state is not present in the expected cgroups of"
+             " the containerizer";
+        continue;
+      }
+
+      auto parse = [&](const RepeatedPtrField<string>& list)
+          -> Try<vector<Entry>>
+      {
+        vector<Entry> parsed_entries;
+        foreach (const string& entry, list) {
+          Try<Entry> parsed_entry = Entry::parse(entry);
+          if (parsed_entry.isError()) {
+            return Error("Failed to parse entry " + entry + " during recover"
+                         " for cgroup " + cgroup + ": " + 
parsed_entry.error());
+          }
+          parsed_entries.push_back(*parsed_entry);
+        }
+        return parsed_entries;
+      };
+
+      // We return failure because we expect all data in the checkpoint file
+      // to be valid.
+      Try<vector<Entry>> allow_entries = parse(state.allow_list());
+      if (allow_entries.isError()) {
+        return Failure(allow_entries.error());
+      }
+
+      Try<vector<Entry>> deny_entries = parse(state.deny_list());
+      if (deny_entries.isError()) {
+        return Failure(deny_entries.error());
+      }
+
+      auto result = device_access_per_cgroup.emplace(
+          cgroup,
+          CHECK_NOTERROR(DeviceManager::CgroupDeviceAccess::create(
+              *allow_entries, *deny_entries)));
+
+      CHECK(result.second); // There should be a single insertion per cgroup.
+
+      recovered_cgroups.push_back(cgroup);
+    }
+
+    foreach (const string& cgroup, recovered_cgroups) {
+      // Commit with checkpoint = false, since there's no need to 
re-checkpoint.
+      Try<Nothing> commit = commit_device_access_changes(cgroup, false);
+      if (commit.isError()) {
+        // Return failure as the checkpointed state should be valid, allowing 
us
+        // to generate and attach BPF programs. This is because the cgroup
+        // previously succeeded in doing so.
+        return Failure(
+            "Failed to perform configuration of ebpf file for cgroup"
+            " '" + cgroup + "': " + commit.error());
+      }
+    }
+
+    // Checkpoint only after all cgroups are recovered to avoid deleting states
+    // of unrecovered cgroups.
+    Try<Nothing> status = checkpoint();
+
+    if (status.isError()) {
+      return Failure(
+          "Failed to checkpoint device access state: " + status.error());
+    }
+
+    foreach(const string& cgroup, cgroups_to_recover) {
+      if (!device_access_per_cgroup.contains(cgroup)) {
+        LOG(WARNING)
+          << "Unable to recover state for cgroup '" + cgroup + "' as requested"
+             " by the containerizer, because it was missing in the device"
+             " manager's checkpointed state";
+      }
+    }
+
+    return Nothing();
+  }
+
 private:
   const string meta_dir;
 
+  const string cgroups_root;
+
   hashmap<string, DeviceManager::CgroupDeviceAccess> device_access_per_cgroup;
 
   Try<Nothing> checkpoint() const
@@ -286,7 +406,7 @@ private:
 Try<DeviceManager*> DeviceManager::create(const Flags& flags)
 {
   return new DeviceManager(
-      Owned<DeviceManagerProcess>(new DeviceManagerProcess(flags.work_dir)));
+      Owned<DeviceManagerProcess>(new DeviceManagerProcess(flags)));
 }
 
 
@@ -495,6 +615,12 @@ DeviceManager::CgroupDeviceAccess::create(
 }
 
 
+Future<Nothing> DeviceManager::recover(const vector<ContainerState>& states)
+{
+  return dispatch(*process, &DeviceManagerProcess::recover, states);
+}
+
+
 Future<Nothing> DeviceManager::remove(const std::string& cgroup)
 {
   return dispatch(
diff --git a/src/slave/containerizer/device_manager/device_manager.hpp 
b/src/slave/containerizer/device_manager/device_manager.hpp
index 853350f70..0a8ea4f93 100644
--- a/src/slave/containerizer/device_manager/device_manager.hpp
+++ b/src/slave/containerizer/device_manager/device_manager.hpp
@@ -23,6 +23,7 @@
 #include <stout/try.hpp>
 
 #include "linux/cgroups.hpp"
+#include "slave/containerizer/containerizer.hpp"
 #include "slave/flags.hpp"
 
 namespace mesos {
@@ -130,6 +131,12 @@ public:
   // Remove the cgroup from the DeviceManager state if the state contains it.
   process::Future<Nothing> remove(const std::string& cgroup);
 
+  // Recover the cgroup device access state stored in the checkpointing file.
+  // We will only recover cgroups that belong to containers in the passed in
+  // container states.
+  process::Future<Nothing> recover(
+      const std::vector<mesos::slave::ContainerState>& states);
+
 private:
   explicit DeviceManager(const process::Owned<DeviceManagerProcess>& process);
   process::Owned<DeviceManagerProcess> process;
diff --git a/src/tests/device_manager_tests.cpp 
b/src/tests/device_manager_tests.cpp
index c4e9b8c58..b6235e9e5 100644
--- a/src/tests/device_manager_tests.cpp
+++ b/src/tests/device_manager_tests.cpp
@@ -14,6 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "common/protobuf_utils.hpp"
+
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
@@ -28,6 +30,7 @@
 
 #include "linux/cgroups2.hpp"
 #include "slave/containerizer/device_manager/device_manager.hpp"
+#include "slave/containerizer/mesos/paths.hpp"
 
 using mesos::internal::slave::DeviceManager;
 using mesos::internal::slave::DeviceManagerProcess;
@@ -47,6 +50,10 @@ namespace tests {
 
 const string TEST_CGROUP = "test";
 
+const string TEST_CGROUPS_ROOT = "mesos_test";
+
+const string TEST_CGROUP_WITH_ROOT = path::join(TEST_CGROUPS_ROOT, 
TEST_CGROUP);
+
 
 class DeviceManagerTest : public TemporaryDirectoryTest
 {
@@ -59,6 +66,10 @@ class DeviceManagerTest : public TemporaryDirectoryTest
     if (cgroups2::exists(TEST_CGROUP)) {
       AWAIT_READY(cgroups2::destroy(TEST_CGROUP));
     }
+
+    if (cgroups2::exists(TEST_CGROUP_WITH_ROOT)) {
+      AWAIT_READY(cgroups2::destroy(TEST_CGROUP_WITH_ROOT));
+    }
   }
 
   void TearDown() override
@@ -67,6 +78,10 @@ class DeviceManagerTest : public TemporaryDirectoryTest
       AWAIT_READY(cgroups2::destroy(TEST_CGROUP));
     }
 
+    if (cgroups2::exists(TEST_CGROUP_WITH_ROOT)) {
+      AWAIT_READY(cgroups2::destroy(TEST_CGROUP_WITH_ROOT));
+    }
+
     TemporaryDirectoryTest::TearDown();
   }
 };
@@ -488,6 +503,57 @@ TEST(DeviceManagerCgroupDeviceAccessTest, 
IsAccessGrantedTest)
   );
 }
 
+
+TEST_F(DeviceManagerTest, ROOT_Recover)
+{
+  slave::Flags flags;
+  flags.work_dir = *sandbox;
+  flags.cgroups_root = TEST_CGROUPS_ROOT;
+  ASSERT_SOME(cgroups2::create(TEST_CGROUP_WITH_ROOT, true));
+  Owned<DeviceManager> dm =
+    Owned<DeviceManager>(CHECK_NOTERROR(DeviceManager::create(flags)));
+
+  vector<devices::Entry> allow_list = {*devices::Entry::parse("c 1:3 w")};
+
+  Future<Nothing> setup = dm->configure(
+      TEST_CGROUP_WITH_ROOT,
+      allow_list,
+      {});
+
+  AWAIT_ASSERT_READY(setup);
+
+  Future<DeviceManager::CgroupDeviceAccess> dm_state =
+    dm->state(TEST_CGROUP_WITH_ROOT);
+
+  AWAIT_ASSERT_READY(dm_state);
+
+  EXPECT_EQ(dm_state->allow_list, allow_list);
+  EXPECT_EQ(dm_state->deny_list, vector<devices::Entry>{});
+
+  dm = Owned<DeviceManager>(CHECK_NOTERROR(DeviceManager::create(flags)));
+
+  dm_state = dm->state(TEST_CGROUP_WITH_ROOT);
+  AWAIT_ASSERT_READY(dm_state);
+  EXPECT_EQ(dm_state->allow_list, vector<devices::Entry>{});
+  EXPECT_EQ(dm_state->deny_list, vector<devices::Entry>{});
+
+  Option<ContainerID> container_id =
+    slave::containerizer::paths::cgroups2::containerId(
+        flags.cgroups_root, TEST_CGROUP);
+  ASSERT_SOME(container_id);
+
+
+  Future<Nothing> recover = dm->recover({protobuf::slave::createContainerState(
+      None(), None(), *container_id, -1, *sandbox)});
+
+  AWAIT_ASSERT_READY(recover);
+
+  dm_state = dm->state(TEST_CGROUP_WITH_ROOT);
+  AWAIT_ASSERT_READY(dm_state);
+  EXPECT_EQ(dm_state->allow_list, allow_list);
+  EXPECT_EQ(dm_state->deny_list, vector<devices::Entry>{});
+}
+
 } // namespace tests {
 } // namespace internal {
 } // namespace mesos {

Reply via email to