This is an automated email from the ASF dual-hosted git repository.

bmahler pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
     new 14d701331 [cgroups2] Implement Cgroups 2 isolator w/o nested containers and systemd.
14d701331 is described below

commit 14d7013319eca952cdf09e6029ba5a4d4278a85a
Author: Devin Leamy <[email protected]>
AuthorDate: Mon Apr 22 12:31:52 2024 -0400

    [cgroups2] Implement Cgroups 2 isolator w/o nested containers and systemd.
    
    Updates the cgroups v2 isolator to include initialization, cleanup,
    update, and recovery logic.
    
    Unlike the cgroups v1 isolator:
    - We create a new cgroup namespace during isolation, by introducing a new
      clone namespace flag. As a result, the contained process only has
      access to the cgroups in its own cgroup subtree.
    - We only need to recover two cgroups (the non-leaf and leaf cgroups [1])
      for each container, rather than one cgroup for each controller the
      container used.
    - We do not yet support nested containers.
    - We do not yet have a systemd integration. Since the cgroups v1
      isolator's integration with systemd was largely to extend process
      lifetimes, the cgroups v2 isolator will still function on
      systemd-managed machines, despite not having a first-class integration.
      A systemd integration will be added later.
    
    Using the cgroups v2 isolator requires Mesos to be compiled with
    `--enable-cgroups-v2` and the host to have the cgroup2 filesystem mounted
    at /sys/fs/cgroup. Selecting the correct isolator version (v1 or v2)
    is done automatically: v2 is used if the host supports cgroups v2
    and is correctly configured.
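
    As an illustration, a minimal sketch of how such a host check can look,
    using the standard statfs(2) probe against /sys/fs/cgroup; the helper
    name here is an assumption, not the isolator's actual code:

        #include <sys/statfs.h>
        #include <linux/magic.h> // Defines CGROUP2_SUPER_MAGIC.

        // Returns true if a cgroup2 filesystem is mounted at /sys/fs/cgroup.
        static bool cgroups2Mounted()
        {
          struct statfs buf;
          if (::statfs("/sys/fs/cgroup", &buf) != 0) {
            return false; // Mount point missing or inaccessible.
          }
          return buf.f_type == CGROUP2_SUPER_MAGIC;
        }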
    
    [1] The "non-leaf" cgroup is the cgroup for a container where resource
        constraints are imposed. The "leaf" cgroup, which has the path
        <non-leaf cgroup>/leaf, is where the container PID is put.
        Container PIDs are only put in leaf cgroups.
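
    For illustration only, a minimal sketch of the two-cgroup layout above,
    with a hypothetical helper and path (the real paths come from the
    containerizer's path helpers); error handling is omitted:

        #include <fstream>
        #include <string>

        #include <sys/types.h> // pid_t

        // Hypothetical helper: place a container PID into the leaf cgroup,
        // e.g. <non-leaf cgroup>/leaf/cgroup.procs. The non-leaf cgroup
        // above it only carries the resource controls and never holds
        // processes (cgroups v2 "no internal process" constraint).
        static void assignToLeaf(const std::string& nonLeafCgroup, pid_t pid)
        {
          std::ofstream procs(nonLeafCgroup + "/leaf/cgroup.procs");
          procs << pid << std::endl;
        }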
    
    This closes #556
---
 .../mesos/isolators/cgroups2/cgroups2.cpp          | 666 ++++++++++++++++++++-
 .../mesos/isolators/cgroups2/cgroups2.hpp          | 114 +++-
 2 files changed, 777 insertions(+), 3 deletions(-)

diff --git a/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.cpp b/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.cpp
index dfb4e0b3a..911eadb1e 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.cpp
@@ -14,33 +14,56 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "linux/cgroups2.hpp"
+#include "common/protobuf_utils.hpp"
 
+#include "slave/containerizer/mesos/paths.hpp"
 #include "slave/containerizer/mesos/isolators/cgroups2/cgroups2.hpp"
 #include "slave/containerizer/mesos/isolators/cgroups2/controllers/core.hpp"
 #include "slave/containerizer/mesos/isolators/cgroups2/controllers/cpu.hpp"
 
 #include <set>
 #include <string>
+#include <vector>
 
+#include <process/collect.hpp>
+#include <process/defer.hpp>
 #include <process/id.hpp>
+#include <process/pid.hpp>
 
 #include <stout/foreach.hpp>
+#include <stout/strings.hpp>
+
+#include "linux/cgroups2.hpp"
+#include "linux/fs.hpp"
+#include "linux/ns.hpp"
+#include "linux/systemd.hpp"
 
+using mesos::slave::ContainerClass;
+using mesos::slave::ContainerConfig;
+using mesos::slave::ContainerLaunchInfo;
+using mesos::slave::ContainerState;
 using mesos::slave::Isolator;
 
+using process::Failure;
+using process::Future;
 using process::Owned;
+using process::PID;
 
 using std::set;
 using std::string;
+using std::vector;
 
 namespace mesos {
 namespace internal {
 namespace slave {
 
+namespace cgroups2_paths = containerizer::paths::cgroups2;
+
 Cgroups2IsolatorProcess::Cgroups2IsolatorProcess(
+    const Flags& _flags,
     const hashmap<string, Owned<Controller>>& _controllers)
     : ProcessBase(process::ID::generate("cgroups2-isolator")),
+    flags(_flags),
     controllers(_controllers) {}
 
 
@@ -103,7 +126,8 @@ Try<Isolator*> Cgroups2IsolatorProcess::create(const Flags& flags)
   }
 
 
-  Owned<MesosIsolatorProcess> process(new Cgroups2IsolatorProcess(controllers));
+  Owned<MesosIsolatorProcess> process(
+      new Cgroups2IsolatorProcess(flags, controllers));
   return new MesosIsolator(process);
 }
 
@@ -120,6 +144,644 @@ bool Cgroups2IsolatorProcess::supportsStandalone()
   return true;
 }
 
+
+Future<Option<ContainerLaunchInfo>> Cgroups2IsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ContainerConfig& containerConfig)
+{
+  if (containerId.has_parent()) {
+    return Failure("cgroups v2 does not support nested containers");
+  }
+
+  if (infos.contains(containerId)) {
+    return Failure("Container with id '" + stringify(containerId) + "'"
+                   " has already been prepared");
+  }
+
+  CHECK(containerConfig.container_class() != ContainerClass::DEBUG);
+
+  // Create the non-leaf and leaf cgroups for the container, enable
+  // controllers in the non-leaf cgroup, and `prepare` each of the controllers.
+  const string nonLeafCgroup = cgroups2_paths::container(
+      flags.cgroups_root, containerId);
+  if (cgroups2::exists(nonLeafCgroup)) {
+    return Failure("Cgroup '" + nonLeafCgroup + "' already exists");
+  }
+
+  Try<Nothing> create = cgroups2::create(nonLeafCgroup);
+  if (create.isError()) {
+    return Failure("Failed to create cgroup '" + nonLeafCgroup + "': "
+                   + create.error());
+  }
+
+  const string leafCgroup = cgroups2_paths::container(
+      flags.cgroups_root, containerId, true);
+  if (cgroups2::exists(leafCgroup)) {
+    return Failure("Cgroup '" + leafCgroup + "' already exists");
+  }
+
+  create = cgroups2::create(leafCgroup);
+  if (create.isError()) {
+    return Failure("Failed to create cgroup '" + leafCgroup + "': "
+                   + create.error());
+  }
+
+  LOG(INFO) << "Created cgroups '" << nonLeafCgroup << "'"
+            << " and '" << leafCgroup << "'";
+
+  infos[containerId] = Owned<Info>(
+      new Info(containerId, nonLeafCgroup, leafCgroup));
+
+  vector<Future<Nothing>> prepares;
+  foreachvalue (const Owned<Controller>& controller, controllers) {
+    if (controller->name() == "core") {
+      // The "core" controller is always enabled because the "cgroup.*" control
+      // files exist for all cgroups. Additionally, since "core" isn't a
+      // valid controller name (i.e. it doesn't exist in "cgroup.controllers"),
+      // calling `cgroups2::controllers::enable` with "core" as the controller
+      // name would fail with "Invalid argument".
+      //
+      // For that reason, we skip enabling the "core" controller here.
+      continue;
+    }
+
+    Try<Nothing> enable =
+      cgroups2::controllers::enable(nonLeafCgroup, {controller->name()});
+    if (enable.isError()) {
+      return Failure("Failed to enable controller '" + controller->name() + "'"
+                     " in cgroup '" + nonLeafCgroup + "': " + enable.error());
+    }
+
+    // We enable controllers in the leaf cgroup to allow the container
+    // process to manage its own cgroups, if it chooses to.
+    enable = cgroups2::controllers::enable(leafCgroup, {controller->name()});
+    if (enable.isError()) {
+      return Failure("Failed to enable controllers in cgroup"
+                     " '" + nonLeafCgroup + "': " + enable.error());
+    }
+
+    infos[containerId]->controllers.insert(controller->name());
+    prepares.push_back(
+        controller->prepare(containerId, nonLeafCgroup, containerConfig));
+  }
+
+  return await(prepares)
+    .then(defer(
+        PID<Cgroups2IsolatorProcess>(this),
+        &Cgroups2IsolatorProcess::_prepare,
+        containerId,
+        containerConfig,
+        lambda::_1));
+}
+
+
+Future<Option<ContainerLaunchInfo>> Cgroups2IsolatorProcess::_prepare(
+    const ContainerID& containerId,
+    const ContainerConfig& containerConfig,
+    const vector<Future<Nothing>>& futures)
+{
+  vector<string> errors;
+  foreach (const Future<Nothing>& future, futures) {
+    if (!future.isReady()) {
+      errors.push_back(future.isFailed() ? future.failure() : "discarded");
+    }
+  }
+
+  if (!errors.empty()) {
+    return Failure("Failed to prepare controllers: "
+                   + strings::join(", ", errors));
+  }
+
+  return update(
+      containerId,
+      containerConfig.resources(),
+      containerConfig.limits())
+    .then(defer(
+        PID<Cgroups2IsolatorProcess>(this),
+        &Cgroups2IsolatorProcess::__prepare,
+        containerId,
+        containerConfig));
+}
+
+
+Future<Option<ContainerLaunchInfo>> Cgroups2IsolatorProcess::__prepare(
+    const ContainerID& containerId,
+    const ContainerConfig& containerConfig)
+{
+  // Only create cgroup mounts for containers with rootfs.
+  //
+  // TODO(bmahler): Consider adding cgroup namespace isolation for containers
+  // without a rootfs, which seems to be a useful feature?
+  if (!containerConfig.has_rootfs()) {
+    return None();
+  }
+
+  Owned<Info> info = cgroupInfo(containerId);
+  if (!info.get()) {
+    return Failure("Failed to get cgroup for container"
+                   " '" + stringify(containerId) + "'");
+  }
+
+  ContainerLaunchInfo launchInfo;
+
+  // Create a new cgroup namespace. The child process will only be able to
+  // see the cgroups that are in its cgroup subtree.
+  launchInfo.add_clone_namespaces(CLONE_NEWCGROUP);
+
+  // Create a new mount namespace and mount the root cgroup at /sys/fs/cgroup.
+  // TODO(bmahler): Is this the right way to mount?
+  launchInfo.add_clone_namespaces(CLONE_NEWNS);
+  *launchInfo.add_mounts() = protobuf::slave::createContainerMount(
+      cgroups2::path(info->cgroup_leaf),
+      path::join(containerConfig.rootfs(), "/sys/fs/cgroup"),
+      MS_BIND | MS_REC);
+
+  // TODO(qianzhang): This is a hack to pass the container-specific cgroups
+  // mounts and the symbolic links to the command executor to do for the
+  // command task. The reasons that we do it in this way are:
+  //   1. We need to ensure the container-specific cgroups mounts are done
+  //      only in the command task's mount namespace but not in the command
+  //      executor's mount namespace.
+    //   2. Even if it's acceptable to do the container-specific cgroups
+    //      mounts in the command executor's mount namespace and have the
+    //      command task inherit them from there (i.e., here we just return
+    //      `launchInfo` rather than passing it via `--task_launch_info`),
+    //      the container-specific cgroups mounts will be hidden by the
+    //      `sysfs` mounts done in `mountSpecialFilesystems()` when the
+    //      command executor launches the command task.
+  if (containerConfig.has_task_info()) {
+    ContainerLaunchInfo _launchInfo;
+
+    _launchInfo.mutable_command()->add_arguments(
+        "--task_launch_info=" +
+        stringify(JSON::protobuf(launchInfo)));
+
+    return _launchInfo;
+  }
+
+  return launchInfo;
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::recover(
+    const vector<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  // Recover containers from checkpointed data:
+  vector<Future<Nothing>> recovers;
+  foreach (const ContainerState& state, states) {
+    recovers.push_back(___recover(state.container_id()));
+  }
+
+  // Then recover containers we find in the cgroups hierarchy:
+  return await(recovers)
+    .then(defer(
+        PID<Cgroups2IsolatorProcess>(this),
+        &Cgroups2IsolatorProcess::_recover,
+        orphans,
+        lambda::_1));
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::_recover(
+  const hashset<ContainerID>& orphans,
+  const vector<Future<Nothing>>& futures)
+{
+  vector<string> errors;
+  foreach (const Future<Nothing>& future, futures) {
+    if (!future.isReady()) {
+      errors.push_back(future.isFailed() ? future.failure() : "discarded");
+    }
+  }
+
+  if (!errors.empty()) {
+    return Failure("Failed to recover active containers: "
+                   + strings::join(", ", errors));
+  }
+
+  hashset<ContainerID> knownOrphans;
+  hashset<ContainerID> unknownOrphans;
+
+  Try<set<string>> cgroups = cgroups2::get(flags.cgroups_root);
+  if (cgroups.isError()) {
+    return Failure("Failed to get cgroups under '" + flags.cgroups_root + "': "
+                   + cgroups.error());
+  }
+
+  foreach (const string& cgroup, *cgroups) {
+    if (cgroup == cgroups2_paths::agent(flags.cgroups_root)) {
+      continue;
+    }
+
+    Option<ContainerID> containerId = cgroups2_paths::containerId(
+        flags.cgroups_root, cgroup);
+    if (containerId.isNone()) {
+      LOG(INFO) << "Cgroup '" << cgroup << "' does not correspond to a"
+                << " container id and will not be recovered";
+      continue;
+    }
+
+    if (infos.contains(*containerId)) {
+      // Container has already been recovered.
+      continue;
+    }
+
+    orphans.contains(*containerId) ?
+        knownOrphans.insert(*containerId) :
+        unknownOrphans.insert(*containerId);
+  }
+
+  vector<Future<Nothing>> recovers;
+  foreach (const ContainerID& containerId, knownOrphans) {
+    recovers.push_back(___recover(containerId));
+  }
+
+  foreach (const ContainerID& containerId, unknownOrphans) {
+    recovers.push_back(___recover(containerId));
+  }
+
+  return await(recovers)
+    .then(defer(
+        PID<Cgroups2IsolatorProcess>(this),
+        &Cgroups2IsolatorProcess::__recover,
+        unknownOrphans,
+        lambda::_1));
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::__recover(
+    const hashset<ContainerID>& unknownOrphans,
+    const vector<Future<Nothing>>& futures)
+{
+  vector<string> errors;
+  foreach (const Future<Nothing>& future, futures) {
+    if (!future.isReady()) {
+      errors.push_back(future.isFailed() ? future.failure() : "discarded");
+    }
+  }
+  if (!errors.empty()) {
+    return Failure("Failed to recover orphan containers: "
+                   + strings::join(", ", errors));
+  }
+
+  // Known orphan cgroups will be destroyed by the containerizer using
+  // the normal cleanup path, but for unknown orphans we need to clean
+  // them up here:
+  foreach (const ContainerID& containerId, unknownOrphans) {
+    LOG(INFO) << "Cleaning up unknown orphaned container " << containerId;
+    cleanup(containerId);
+  }
+
+  return Nothing();
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::___recover(
+    const ContainerID& containerId)
+{
+  // Detect and handle invalid container states, and recover the enabled
+  // controllers.
+  //
+  // Invalid container states:
+  // 1. Missing non-leaf cgroup            => Log and create cgroup
+  // 2. Missing leaf cgroup                => Log and create cgroup
+  // 3. Some controllers are not enabled   => Log
+  //
+  // Failure modes that can lead to an invalid container state:
+  //
+  // 1. Mesos agent is restarted during launch.
+  //    This can happen if the launcher fails to `fork`, 'this' isolator fails
+  //    to `prepare` or `isolate`, among other reasons. Cgroups may be
+  //    improperly configured meaning there may be missing cgroups or cgroup
+  //    control files that have the wrong values.
+  // 2. Mesos agent is restarted during destroy.
+  //    The container fails to be destroyed so cgroups may not have been
+  //    cleaned up correctly. This can result in orphan cgroups.
+  // 3. Mesos agent is restarted with different flags.
+  //    If the agent is started with new isolators the cgroups for the existing
+  //    containers, from a previous run, won't have all the requested
+  //    controllers enabled.
+  //
+  // If a container is missing a cgroup, we create the missing cgroup. This
+  // is done exclusively so that the container can be cleanup()ed by 'this'
+  // isolator and destroy()ed by the launcher like other containers.
+  // The alternative would be to break the invariant that each container has
+  // a leaf and non-leaf cgroup but that requires more special-case handling.
+  const string nonLeafCgroup =
+    cgroups2_paths::container(flags.cgroups_root, containerId, false);
+  const string leafCgroup =
+    cgroups2_paths::container(flags.cgroups_root, containerId, true);
+
+  if (!cgroups2::exists(nonLeafCgroup)) {
+    LOG(WARNING) << "Container '" << stringify(containerId) << "'"
+                 << " is missing the cgroup '" << nonLeafCgroup << "';"
+                 << " creating missing cgroup";
+
+    Try<Nothing> create = cgroups2::create(nonLeafCgroup);
+    if (create.isError()) {
+      return Failure("Failed to create cgroup '" + nonLeafCgroup + "': "
+                     + create.error());
+    }
+  }
+
+  if (!cgroups2::exists(leafCgroup)) {
+    LOG(WARNING) << "Container '" << stringify(containerId) << "'"
+                 << " is missing the cgroup '" << leafCgroup << "';"
+                 << " creating missing cgroup";
+
+    Try<Nothing> create = cgroups2::create(leafCgroup);
+    if (create.isError()) {
+      return Failure("Failed to create cgroup '" + leafCgroup + "': "
+                     + create.error());
+    }
+  }
+
+  Try<set<string>> enabled = cgroups2::controllers::enabled(nonLeafCgroup);
+  if (enabled.isError()) {
+    return Failure("Failed to get the enabled controllers for container"
+                   " '" + stringify(containerId) + "': " + enabled.error());
+  }
+
+  vector<Future<Nothing>> recovers;
+  hashset<string> recoveredControllers;
+  foreachvalue (const Owned<Controller>& controller, controllers) {
+    if (enabled->count(controller->name()) == 0) {
+      // Controller is expected to be enabled but isn't.
+      LOG(WARNING) << "Controller '" << controller->name() << "' is not 
enabled"
+                   << " for container '" << stringify(containerId) << "'";
+
+      continue;
+    }
+
+    recovers.push_back(controller->recover(containerId, nonLeafCgroup));
+    recoveredControllers.insert(controller->name());
+  }
+
+  return await(recovers)
+    .then(defer(
+        PID<Cgroups2IsolatorProcess>(this),
+        &Cgroups2IsolatorProcess::____recover,
+        containerId,
+        recoveredControllers,
+        lambda::_1));
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::____recover(
+    const ContainerID& containerId,
+    const hashset<string>& recoveredControllers,
+    const vector<Future<Nothing>>& futures)
+{
+  CHECK(!infos.contains(containerId));
+
+  vector<string> errors;
+  foreach (const Future<Nothing>& future, futures) {
+    if (!future.isReady()) {
+      errors.push_back(future.isFailed() ? future.failure() : "discarded");
+    }
+  }
+
+  if (!errors.empty()) {
+    return Failure("Failed to recover controllers: "
+                   + strings::join(", ", errors));
+  }
+
+  infos[containerId] = Owned<Info>(new Info(
+      containerId,
+      cgroups2_paths::container(flags.cgroups_root, containerId, false),
+      cgroups2_paths::container(flags.cgroups_root, containerId, true)));
+
+  infos[containerId]->controllers = recoveredControllers;
+
+  return Nothing();
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::isolate(
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  vector<Future<Nothing>> isolates;
+
+  // Move the process into the container's cgroup.
+  if (infos.contains(containerId)) {
+    foreachvalue (const Owned<Controller>& controller, controllers) {
+      isolates.push_back(controller->isolate(
+          containerId,
+          infos[containerId]->cgroup,
+          pid));
+    }
+  }
+
+  return await(isolates)
+    .then(defer(
+        PID<Cgroups2IsolatorProcess>(this),
+        &Cgroups2IsolatorProcess::_isolate,
+        lambda::_1,
+        containerId,
+        pid));
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::_isolate(
+    const vector<Future<Nothing>>& futures,
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  vector<string> errors;
+  foreach (const Future<Nothing>& future, futures) {
+    if (!future.isReady()) {
+      errors.push_back(future.isFailed() ? future.failure() : "discarded");
+    }
+  }
+
+  if (!errors.empty()) {
+    return Failure("Failed to prepare controllers: "
+                   + strings::join(", ", errors));
+  }
+
+  Owned<Info> info = cgroupInfo(containerId);
+  if (!info.get()) {
+    return Failure(
+        "Failed to find cgroup for container '" + stringify(containerId) + 
"'");
+  }
+
+  // Move the process into its leaf cgroup.
+  Try<Nothing> assign = cgroups2::assign(info->cgroup_leaf, pid);
+  if (assign.isError()) {
+    return Failure("Failed to assign container '" + stringify(containerId) + 
"'"
+                   " to cgroup '" + info->cgroup + "': " + assign.error());
+  }
+
+  return Nothing();
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::update(
+    const ContainerID& containerId,
+    const Resources& resourceRequests,
+    const google::protobuf::Map<string, Value::Scalar>& resourceLimits)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container '" + stringify(containerId) + "'");
+  }
+
+  vector<Future<Nothing>> updates;
+
+  LOG(INFO) << "Updating controllers for cgroup"
+            << " '" << infos[containerId]->cgroup << "'";
+
+  foreachvalue (const Owned<Controller>& controller, controllers) {
+    if (infos[containerId]->controllers.contains(controller->name())) {
+      updates.push_back(controller->update(
+          containerId,
+          infos[containerId]->cgroup,
+          resourceRequests,
+          resourceLimits));
+    }
+  }
+
+  return await(updates)
+    .then(defer(
+        PID<Cgroups2IsolatorProcess>(this),
+        &Cgroups2IsolatorProcess::_update,
+        lambda::_1));
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::_update(
+    const vector<Future<Nothing>>& futures)
+{
+  vector<string> errors;
+  foreach (const Future<Nothing>& future, futures) {
+    if (!future.isReady()) {
+      errors.push_back(future.isFailed() ? future.failure() : "discarded");
+    }
+  }
+
+  if (!errors.empty()) {
+    return Failure("Failed to update controllers: "
+                   + strings::join(", ", errors));
+  }
+
+  return Nothing();
+}
+
+
+Future<ContainerStatus> Cgroups2IsolatorProcess::status(
+    const ContainerID& containerId)
+{
+  CHECK(infos.contains(containerId));
+
+  vector<Future<ContainerStatus>> statuses;
+  foreachvalue (const Owned<Controller>& controller, controllers) {
+    if (infos[containerId]->controllers.contains(controller->name())) {
+      statuses.push_back(controller->status(
+          containerId,
+          infos[containerId]->cgroup));
+    }
+  }
+
+  return await(statuses)
+    .then([containerId](const vector<Future<ContainerStatus>>& _statuses) {
+      ContainerStatus result;
+
+      foreach (const Future<ContainerStatus>& status, _statuses) {
+        if (status.isReady()) {
+          result.MergeFrom(status.get());
+        } else {
+          LOG(WARNING) << "Skipping status for container " << containerId
+                       << " because: "
+                       << (status.isFailed() ? status.failure() : "discarded");
+        }
+      }
+
+      return result;
+    });
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::cleanup(
+    const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    VLOG(1) << "Ignoring cleanup request for unknown container " << 
containerId;
+    return Nothing();
+  }
+
+  vector<Future<Nothing>> cleanups;
+  foreachvalue (const Owned<Controller>& controller, controllers) {
+    if (infos[containerId]->controllers.contains(controller->name())) {
+      cleanups.push_back(controller->cleanup(
+          containerId,
+          infos[containerId]->cgroup));
+    }
+  }
+
+  return await(cleanups)
+    .then(defer(
+        PID<Cgroups2IsolatorProcess>(this),
+        &Cgroups2IsolatorProcess::_cleanup,
+        containerId,
+        lambda::_1));
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::_cleanup(
+    const ContainerID& containerId,
+    const vector<Future<Nothing>>& futures)
+{
+  CHECK(infos.contains(containerId));
+
+  vector<string> errors;
+  foreach (const Future<Nothing>& future, futures) {
+    if (!future.isReady()) {
+      errors.push_back(future.isFailed() ? future.failure() : "discarded");
+    }
+  }
+
+  if (!errors.empty()) {
+    return Failure("Failed to cleanup subsystems: "
+                   + strings::join(", ", errors));
+  }
+
+  if (cgroups2::exists(infos[containerId]->cgroup)) {
+    Try<Nothing> destroy = cgroups2::destroy(infos[containerId]->cgroup);
+    if (destroy.isError()) {
+      return Failure(
+          "Failed to destroy cgroup '" + infos[containerId]->cgroup + "': "
+          + destroy.error());
+    }
+  }
+
+  infos.erase(containerId);
+
+  return Nothing();
+}
+
+
+Owned<Cgroups2IsolatorProcess::Info> Cgroups2IsolatorProcess::cgroupInfo(
+    const ContainerID& containerId) const
+{
+  // `ContainerID`s are hierarchical, where each container id potentially has a
+  // parent container id. Here we walk up the hierarchy until we find a
+  // container id that has a corresponding info.
+
+  Option<ContainerID> current = containerId;
+  while (current.isSome()) {
+    Option<Owned<Info>> info = infos.get(*current);
+    if (info.isSome()) {
+      return *info;
+    }
+
+    if (!current->has_parent()) {
+      break;
+    }
+    current = current->parent();
+  }
+
+  return nullptr;
+}
+
 } // namespace slave {
 } // namespace internal {
 } // namespace mesos {
diff --git a/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.hpp b/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.hpp
index c67cf777d..211b9a1df 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.hpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.hpp
@@ -18,9 +18,12 @@
 #define __CGROUPS_V2_ISOLATOR_HPP__
 
 #include <string>
+#include <vector>
 
+#include <process/future.hpp>
 #include <process/owned.hpp>
 
+#include <stout/nothing.hpp>
 #include <stout/hashmap.hpp>
 #include <stout/try.hpp>
 
@@ -32,6 +35,24 @@ namespace mesos {
 namespace internal {
 namespace slave {
 
+// Cgroups v2 Mesos isolator.
+//
+// Manages the cgroup v2 controllers that are used by containers. Each
+// container is associated with two cgroups: a non-leaf cgroup whose control
+// files are updated and a leaf cgroup where the container's processes live.
+// The container pid cannot live in the non-leaf cgroup because of the cgroups
+// v2 internal process constraint:
+//
+// https://docs.kernel.org/admin-guide/cgroup-v2.html#no-internal-process-constraint // NOLINT
+//
+// Example cgroups:
+//     containerA                       non-leaf cgroup
+//     /      \                         /            \
+// processes  containerB           leaf cgroup   non-leaf child cgroup
+//             |                                      |
+//            processes                          leaf cgroup
+//
+// TODO(dleamy): Nested containers are not yet supported.
 class Cgroups2IsolatorProcess : public MesosIsolatorProcess
 {
 public:
@@ -43,12 +64,103 @@ public:
 
   bool supportsStandalone() override;
 
+  process::Future<Option<mesos::slave::ContainerLaunchInfo>> prepare(
+      const ContainerID& containerId,
+      const mesos::slave::ContainerConfig& containerConfig) override;
+
+  process::Future<Nothing> recover(
+      const std::vector<mesos::slave::ContainerState>& states,
+      const hashset<ContainerID>& orphans) override;
+
+  process::Future<Nothing> isolate(
+      const ContainerID& containerId,
+      pid_t pid) override;
+
+  process::Future<Nothing> update(
+      const ContainerID& containerId,
+      const Resources& resourceRequests,
+      const google::protobuf::Map<
+          std::string, Value::Scalar>& resourceLimits = {}) override;
+
+  process::Future<ContainerStatus> status(
+      const ContainerID& containerId) override;
+
+  process::Future<Nothing> cleanup(const ContainerID& containerId) override;
 private:
+  struct Info
+  {
+    Info(const ContainerID& containerId,
+         const std::string& cgroup,
+         const std::string& cgroup_leaf)
+      : containerId(containerId), cgroup(cgroup), cgroup_leaf(cgroup_leaf) {}
+
+    const ContainerID containerId;
+
+    // Non-leaf cgroup for the container. Control files in this cgroup are
+    // updated to set resource constraints on this and descendant
+    // containers. Processes should not be assigned to this cgroup.
+    const std::string cgroup;
+    const std::string cgroup_leaf;
+
+    // Names of the controllers which are prepared for the container.
+    hashset<std::string> controllers;
+  };
+
   Cgroups2IsolatorProcess(
+      const Flags& flags,
       const hashmap<std::string, process::Owned<Controller>>& controllers);
 
+  process::Future<Option<mesos::slave::ContainerLaunchInfo>> _prepare(
+    const ContainerID& containerId,
+    const mesos::slave::ContainerConfig& containerConfig,
+    const std::vector<process::Future<Nothing>>& futures);
+
+  process::Future<Option<mesos::slave::ContainerLaunchInfo>> __prepare(
+      const ContainerID& containerId,
+      const mesos::slave::ContainerConfig& containerConfig);
+
+  process::Future<Nothing> _recover(
+    const hashset<ContainerID>& orphans,
+    const std::vector<process::Future<Nothing>>& futures);
+
+  process::Future<Nothing> __recover(
+      const hashset<ContainerID>& unknownOrphans,
+      const std::vector<process::Future<Nothing>>& futures);
+
+  process::Future<Nothing> ___recover(
+      const ContainerID& containerId);
+
+  process::Future<Nothing> ____recover(
+      const ContainerID& containerId,
+      const hashset<std::string>& recoveredControllers,
+      const std::vector<process::Future<Nothing>>& futures);
+
+  process::Future<Nothing> _isolate(
+      const std::vector<process::Future<Nothing>>& futures,
+      const ContainerID& containerId,
+      pid_t pid);
+
+  process::Future<Nothing> _update(
+      const std::vector<process::Future<Nothing>>& futures);
+
+  process::Future<Nothing> _cleanup(
+      const ContainerID& containerId,
+      const std::vector<process::Future<Nothing>>& futures);
+
+  process::Future<Nothing> __cleanup(
+      const ContainerID& containerId,
+      const std::vector<process::Future<Nothing>>& futures);
+
+  process::Owned<Cgroups2IsolatorProcess::Info> cgroupInfo(
+      const ContainerID& containerId) const;
+
+  Flags flags;
+
   // Maps each controller to the `Controller` isolator that manages it.
-  const hashmap<std::string, process::Owned<Controller>> controllers;
+  hashmap<std::string, process::Owned<Controller>> controllers;
+
+  // Associates a container with the information to access its controllers.
+  hashmap<ContainerID, process::Owned<Info>> infos;
 };
 
 } // namespace slave {
