This is an automated email from the ASF dual-hosted git repository.
bmahler pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git
The following commit(s) were added to refs/heads/master by this push:
new 14d701331 [cgroups2] Implement Cgroups 2 isolator w/o nested containers and systemd.
14d701331 is described below
commit 14d7013319eca952cdf09e6029ba5a4d4278a85a
Author: Devin Leamy <[email protected]>
AuthorDate: Mon Apr 22 12:31:52 2024 -0400
[cgroups2] Implement Cgroups 2 isolator w/o nested containers and systemd.
Updates the cgroups v2 isolator to include initialization, cleanup,
update, and recovery logic.
Unlike the cgroups v1 isolator:
- We create a new cgroup namespace during isolation, by introducing a new
clone namespace flag. This implies that the contained process will only
have access to the cgroups in its own cgroup subtree.
- We only need to recover two cgroups (the non-leaf and leaf cgroups [1])
for each container, rather than having to recover one cgroup for each
controller the container used.
- We do not yet support nested containers.
- We do not yet have a systemd integration. Since the cgroups v1
isolator's integration with systemd was largely to extend process
lifetimes, the cgroups v2 isolator will function on systemd-managed
machines, despite not having a first-class integration.
A systemd integration will be added.
Using the cgroups v2 isolator requires Mesos to be compiled with
`--enable-cgroups-v2` and to have the cgroup2 filesystem mounted
at /sys/fs/cgroup. Selecting the correct isolator version (v1 or v2)
is done automatically: v2 is used if the host supports cgroups v2 and
is correctly configured.
[1] The "non-leaf" cgroup is the cgroup for a container where resource
constraints are imposed. The "leaf" cgroup, which has the path
<non-leaf cgroup>/leaf, is where the container PID is put.
Container PIDs are only put in leaf cgroups.
This closes #556
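[Editorial note, not part of this commit: the mount requirement above can be checked with statfs(2) against CGROUP2_SUPER_MAGIC. A minimal sketch, assuming a Linux host with the usual kernel headers:]

    // Sketch only: verifies that /sys/fs/cgroup is a cgroup2 (unified) mount,
    // which is one of the preconditions for the cgroups v2 isolator above.
    #include <sys/statfs.h>
    #include <linux/magic.h>   // CGROUP2_SUPER_MAGIC

    #include <iostream>

    int main()
    {
      struct statfs buf;
      if (::statfs("/sys/fs/cgroup", &buf) != 0) {
        std::cerr << "statfs failed; /sys/fs/cgroup is not accessible" << std::endl;
        return 1;
      }

      if (buf.f_type != CGROUP2_SUPER_MAGIC) {
        std::cerr << "/sys/fs/cgroup is not a cgroup2 mount" << std::endl;
        return 1;
      }

      std::cout << "cgroup2 is mounted at /sys/fs/cgroup" << std::endl;
      return 0;
    }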
---
.../mesos/isolators/cgroups2/cgroups2.cpp | 666 ++++++++++++++++++++-
.../mesos/isolators/cgroups2/cgroups2.hpp | 114 +++-
2 files changed, 777 insertions(+), 3 deletions(-)
diff --git a/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.cpp b/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.cpp
index dfb4e0b3a..911eadb1e 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.cpp
@@ -14,33 +14,56 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "linux/cgroups2.hpp"
+#include "common/protobuf_utils.hpp"
+#include "slave/containerizer/mesos/paths.hpp"
#include "slave/containerizer/mesos/isolators/cgroups2/cgroups2.hpp"
#include "slave/containerizer/mesos/isolators/cgroups2/controllers/core.hpp"
#include "slave/containerizer/mesos/isolators/cgroups2/controllers/cpu.hpp"
#include <set>
#include <string>
+#include <vector>
+#include <process/collect.hpp>
+#include <process/defer.hpp>
#include <process/id.hpp>
+#include <process/pid.hpp>
#include <stout/foreach.hpp>
+#include <stout/strings.hpp>
+
+#include "linux/cgroups2.hpp"
+#include "linux/fs.hpp"
+#include "linux/ns.hpp"
+#include "linux/systemd.hpp"
+using mesos::slave::ContainerClass;
+using mesos::slave::ContainerConfig;
+using mesos::slave::ContainerLaunchInfo;
+using mesos::slave::ContainerState;
using mesos::slave::Isolator;
+using process::Failure;
+using process::Future;
using process::Owned;
+using process::PID;
using std::set;
using std::string;
+using std::vector;
namespace mesos {
namespace internal {
namespace slave {
+namespace cgroups2_paths = containerizer::paths::cgroups2;
+
Cgroups2IsolatorProcess::Cgroups2IsolatorProcess(
+ const Flags& _flags,
const hashmap<string, Owned<Controller>>& _controllers)
: ProcessBase(process::ID::generate("cgroups2-isolator")),
+ flags(_flags),
controllers(_controllers) {}
@@ -103,7 +126,8 @@ Try<Isolator*> Cgroups2IsolatorProcess::create(const Flags& flags)
}
- Owned<MesosIsolatorProcess> process(new Cgroups2IsolatorProcess(controllers));
+ Owned<MesosIsolatorProcess> process(
+ new Cgroups2IsolatorProcess(flags, controllers));
return new MesosIsolator(process);
}
@@ -120,6 +144,644 @@ bool Cgroups2IsolatorProcess::supportsStandalone()
return true;
}
+
+Future<Option<ContainerLaunchInfo>> Cgroups2IsolatorProcess::prepare(
+ const ContainerID& containerId,
+ const ContainerConfig& containerConfig)
+{
+ if (containerId.has_parent()) {
+ return Failure("cgroups v2 does not support nested containers");
+ }
+
+ if (infos.contains(containerId)) {
+ return Failure("Container with id '" + stringify(containerId) + "'"
+ " has already been prepared");
+ }
+
+ CHECK(containerConfig.container_class() != ContainerClass::DEBUG);
+
+ // Create the non-leaf and leaf cgroups for the container, enable
+ // controllers in the non-leaf cgroup, and `prepare` each of the controllers.
+ const string nonLeafCgroup = cgroups2_paths::container(
+ flags.cgroups_root, containerId);
+ if (cgroups2::exists(nonLeafCgroup)) {
+ return Failure("Cgroup '" + nonLeafCgroup + "' already exists");
+ }
+
+ Try<Nothing> create = cgroups2::create(nonLeafCgroup);
+ if (create.isError()) {
+ return Failure("Failed to create cgroup '" + nonLeafCgroup + "': "
+ + create.error());
+ }
+
+ const string leafCgroup = cgroups2_paths::container(
+ flags.cgroups_root, containerId, true);
+ if (cgroups2::exists(leafCgroup)) {
+ return Failure("Cgroup '" + leafCgroup + "' already exists");
+ }
+
+ create = cgroups2::create(leafCgroup);
+ if (create.isError()) {
+ return Failure("Failed to create cgroup '" + leafCgroup + "': "
+ + create.error());
+ }
+
+ LOG(INFO) << "Created cgroups '" << nonLeafCgroup << "'"
+ << " and '" << leafCgroup << "'";
+
+ infos[containerId] = Owned<Info>(
+ new Info(containerId, nonLeafCgroup, leafCgroup));
+
+ vector<Future<Nothing>> prepares;
+ foreachvalue (const Owned<Controller>& controller, controllers) {
+ if (controller->name() == "core") {
+ // The "core" controller is always enabled because the "cgroup.*" control
+ // files exist for all cgroups. Additionally, since "core" isn't a
+ // valid controller name (i.e. it doesn't exist in "cgroup.controllers"),
+ // calling `cgroups2::controllers::enable` with the "core" controller will
+ // fail with "Invalid argument".
+ //
+ // For that reason, we skip enabling the "core" controller here.
+ continue;
+ }
+
+ Try<Nothing> enable =
+ cgroups2::controllers::enable(nonLeafCgroup, {controller->name()});
+ if (enable.isError()) {
+ return Failure("Failed to enable controller '" + controller->name() + "'"
+ " in cgroup '" + nonLeafCgroup + "': " + enable.error());
+ }
+
+ // We enable controllers in the leaf cgroup to allow the container's
+ // processes to manage their own cgroups, if they choose.
+ enable = cgroups2::controllers::enable(leafCgroup, {controller->name()});
+ if (enable.isError()) {
+ return Failure("Failed to enable controllers in cgroup"
+ " '" + nonLeafCgroup + "': " + enable.error());
+ }
+
+ infos[containerId]->controllers.insert(controller->name());
+ prepares.push_back(
+ controller->prepare(containerId, nonLeafCgroup, containerConfig));
+ }
+
+ return await(prepares)
+ .then(defer(
+ PID<Cgroups2IsolatorProcess>(this),
+ &Cgroups2IsolatorProcess::_prepare,
+ containerId,
+ containerConfig,
+ lambda::_1));
+}
+
+
+Future<Option<ContainerLaunchInfo>> Cgroups2IsolatorProcess::_prepare(
+ const ContainerID& containerId,
+ const ContainerConfig& containerConfig,
+ const vector<Future<Nothing>>& futures)
+{
+ vector<string> errors;
+ foreach (const Future<Nothing>& future, futures) {
+ if (!future.isReady()) {
+ errors.push_back(future.isFailed() ? future.failure() : "discarded");
+ }
+ }
+
+ if (!errors.empty()) {
+ return Failure("Failed to prepare controllers: "
+ + strings::join(", ", errors));
+ }
+
+ return update(
+ containerId,
+ containerConfig.resources(),
+ containerConfig.limits())
+ .then(defer(
+ PID<Cgroups2IsolatorProcess>(this),
+ &Cgroups2IsolatorProcess::__prepare,
+ containerId,
+ containerConfig));
+}
+
+
+Future<Option<ContainerLaunchInfo>> Cgroups2IsolatorProcess::__prepare(
+ const ContainerID& containerId,
+ const ContainerConfig& containerConfig)
+{
+ // Only create cgroup mounts for containers with rootfs.
+ //
+ // TODO(bmahler): Consider adding cgroup namespace isolation for containers
+ // without a rootfs, which seems to be a useful feature?
+ if (!containerConfig.has_rootfs()) {
+ return None();
+ }
+
+ Owned<Info> info = cgroupInfo(containerId);
+ if (!info.get()) {
+ return Failure("Failed to get cgroup for container"
+ " '" + stringify(containerId) + "'");
+ }
+
+ ContainerLaunchInfo launchInfo;
+
+ // Create a new cgroup namespace. The child process will only be able to
+ // see the cgroups that are in its cgroup subtree.
+ launchInfo.add_clone_namespaces(CLONE_NEWCGROUP);
+
+ // Create a new mount namespace and mount the root cgroup at /sys/fs/cgroup.
+ // TODO(bmahler): Is this the right way to mount?
+ launchInfo.add_clone_namespaces(CLONE_NEWNS);
+ *launchInfo.add_mounts() = protobuf::slave::createContainerMount(
+ cgroups2::path(info->cgroup_leaf),
+ path::join(containerConfig.rootfs(), "/sys/fs/cgroup"),
+ MS_BIND | MS_REC);
+
+ // TODO(qianzhang): This is a hack to pass the container-specific cgroups
+ // mounts and the symbolic links to the command executor to do for the
+ // command task. The reasons that we do it in this way are:
+ // 1. We need to ensure the container-specific cgroups mounts are done
+ // only in the command task's mount namespace but not in the command
+ // executor's mount namespace.
+ // 2. Even if it's acceptable to do the container-specific cgroups mounts
+ // in the command executor's mount namespace and have the command task
+ // inherit them from there (i.e., here we just return `launchInfo`
+ // rather than passing it via `--task_launch_info`), the container
+ // specific cgroups mounts will be hidden by the `sysfs` mounts done in
+ // `mountSpecialFilesystems()` when the command executor launches the
+ // command task.
+ if (containerConfig.has_task_info()) {
+ ContainerLaunchInfo _launchInfo;
+
+ _launchInfo.mutable_command()->add_arguments(
+ "--task_launch_info=" +
+ stringify(JSON::protobuf(launchInfo)));
+
+ return _launchInfo;
+ }
+
+ return launchInfo;
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::recover(
+ const vector<ContainerState>& states,
+ const hashset<ContainerID>& orphans)
+{
+ // Recover containers from checkpointed data:
+ vector<Future<Nothing>> recovers;
+ foreach (const ContainerState& state, states) {
+ recovers.push_back(___recover(state.container_id()));
+ }
+
+ // Then recover containers we find in the cgroups hierarchy:
+ return await(recovers)
+ .then(defer(
+ PID<Cgroups2IsolatorProcess>(this),
+ &Cgroups2IsolatorProcess::_recover,
+ orphans,
+ lambda::_1));
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::_recover(
+ const hashset<ContainerID>& orphans,
+ const vector<Future<Nothing>>& futures)
+{
+ vector<string> errors;
+ foreach (const Future<Nothing>& future, futures) {
+ if (!future.isReady()) {
+ errors.push_back(future.isFailed() ? future.failure() : "discarded");
+ }
+ }
+
+ if (!errors.empty()) {
+ return Failure("Failed to recover active containers: "
+ + strings::join(", ", errors));
+ }
+
+ hashset<ContainerID> knownOrphans;
+ hashset<ContainerID> unknownOrphans;
+
+ Try<set<string>> cgroups = cgroups2::get(flags.cgroups_root);
+ if (cgroups.isError()) {
+ return Failure("Failed to get cgroups under '" + flags.cgroups_root + "': "
+ + cgroups.error());
+ }
+
+ foreach (const string& cgroup, *cgroups) {
+ if (cgroup == cgroups2_paths::agent(flags.cgroups_root)) {
+ continue;
+ }
+
+ Option<ContainerID> containerId = cgroups2_paths::containerId(
+ flags.cgroups_root, cgroup);
+ if (containerId.isNone()) {
+ LOG(INFO) << "Cgroup '" << cgroup << "' does not correspond to a"
+ << " container id and will not be recovered";
+ continue;
+ }
+
+ if (infos.contains(*containerId)) {
+ // Container has already been recovered.
+ continue;
+ }
+
+ orphans.contains(*containerId) ?
+ knownOrphans.insert(*containerId) :
+ unknownOrphans.insert(*containerId);
+ }
+
+ vector<Future<Nothing>> recovers;
+ foreach (const ContainerID& containerId, knownOrphans) {
+ recovers.push_back(___recover(containerId));
+ }
+
+ foreach (const ContainerID& containerId, unknownOrphans) {
+ recovers.push_back(___recover(containerId));
+ }
+
+ return await(recovers)
+ .then(defer(
+ PID<Cgroups2IsolatorProcess>(this),
+ &Cgroups2IsolatorProcess::__recover,
+ unknownOrphans,
+ lambda::_1));
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::__recover(
+ const hashset<ContainerID>& unknownOrphans,
+ const vector<Future<Nothing>>& futures)
+{
+ vector<string> errors;
+ foreach (const Future<Nothing>& future, futures) {
+ if (!future.isReady()) {
+ errors.push_back(future.isFailed() ? future.failure() : "discarded");
+ }
+ }
+ if (!errors.empty()) {
+ return Failure("Failed to recover orphan containers: "
+ + strings::join(", ", errors));
+ }
+
+ // Known orphan cgroups will be destroyed by the containerizer using
+ // the normal cleanup path, but for unknown orphans we need to clean
+ // them up here:
+ foreach (const ContainerID& containerId, unknownOrphans) {
+ LOG(INFO) << "Cleaning up unknown orphaned container " << containerId;
+ cleanup(containerId);
+ }
+
+ return Nothing();
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::___recover(
+ const ContainerID& containerId)
+{
+ // Detect and handle invalid container states, and recover enabled controllers.
+ //
+ // Invalid container states:
+ // 1. Missing non-leaf cgroup => Log and create cgroup
+ // 2. Missing leaf cgroup => Log and create cgroup
+ // 3. Some controllers are not enabled => Log
+ //
+ // Failure modes that can lead to an invalid container state:
+ //
+ // 1. Mesos agent is restarted during launch.
+ // This can happen if the launcher fails to `fork`, 'this' isolator fails
+ // to `prepare` or `isolate`, among other reasons. Cgroups may be
+ // improperly configured meaning there may be missing cgroups or cgroup
+ // control files that have the wrong values.
+ // 2. Mesos agent is restarted during destroy.
+ // The container fails to be destroyed so cgroups may not have been
+ // cleaned up correctly. This can result in orphan cgroups.
+ // 3. Mesos agent is restarted with different flags.
+ // If the agent is started with new isolators, the cgroups for the existing
+ // containers, from a previous run, won't have all the requested
+ // controllers enabled.
+ //
+ // If a container is missing a cgroup, we create the missing cgroup. This
+ // is done exclusively so that the container can be cleanup()'d by 'this'
+ // isolator and destroy()ed by the launcher like other containers.
+ // The alternative would be to break the invariant that each container has
+ // a leaf and non-leaf cgroup but that requires more special-case handling.
+ const string nonLeafCgroup =
+ cgroups2_paths::container(flags.cgroups_root, containerId, false);
+ const string leafCgroup =
+ cgroups2_paths::container(flags.cgroups_root, containerId, true);
+
+ if (!cgroups2::exists(nonLeafCgroup)) {
+ LOG(WARNING) << "Container '" << stringify(containerId) << "'"
+ << " is missing the cgroup '" << nonLeafCgroup << "';"
+ << " creating missing cgroup";
+
+ Try<Nothing> create = cgroups2::create(nonLeafCgroup);
+ if (create.isError()) {
+ return Failure("Failed to create cgroup '" + nonLeafCgroup + "': "
+ + create.error());
+ }
+ }
+
+ if (!cgroups2::exists(leafCgroup)) {
+ LOG(WARNING) << "Container '" << stringify(containerId) << "'"
+ << " is missing the cgroup '" << leafCgroup << "';"
+ << " creating missing cgroup";
+
+ Try<Nothing> create = cgroups2::create(leafCgroup);
+ if (create.isError()) {
+ return Failure("Failed to create cgroup '" + leafCgroup + "': "
+ + create.error());
+ }
+ }
+
+ Try<set<string>> enabled = cgroups2::controllers::enabled(nonLeafCgroup);
+ if (enabled.isError()) {
+ return Failure("Failed to get the enabled controllers for container"
+ " '" + stringify(containerId) + "': " + enabled.error());
+ }
+
+ vector<Future<Nothing>> recovers;
+ hashset<string> recoveredControllers;
+ foreachvalue (const Owned<Controller>& controller, controllers) {
+ if (enabled->count(controller->name()) == 0) {
+ // Controller is expected to be enabled but isn't.
+ LOG(WARNING) << "Controller '" << controller->name() << "' is not
enabled"
+ << " for container '" << stringify(containerId) << "'";
+
+ continue;
+ }
+
+ recovers.push_back(controller->recover(containerId, nonLeafCgroup));
+ recoveredControllers.insert(controller->name());
+ }
+
+ return await(recovers)
+ .then(defer(
+ PID<Cgroups2IsolatorProcess>(this),
+ &Cgroups2IsolatorProcess::____recover,
+ containerId,
+ recoveredControllers,
+ lambda::_1));
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::____recover(
+ const ContainerID& containerId,
+ const hashset<string>& recoveredControllers,
+ const vector<Future<Nothing>>& futures)
+{
+ CHECK(!infos.contains(containerId));
+
+ vector<string> errors;
+ foreach (const Future<Nothing>& future, futures) {
+ if (!future.isReady()) {
+ errors.push_back(future.isFailed() ? future.failure() : "discarded");
+ }
+ }
+
+ if (!errors.empty()) {
+ return Failure("Failed to recover controllers: "
+ + strings::join(", ", errors));
+ }
+
+ infos[containerId] = Owned<Info>(new Info(
+ containerId,
+ cgroups2_paths::container(flags.cgroups_root, containerId, false),
+ cgroups2_paths::container(flags.cgroups_root, containerId, true)));
+
+ infos[containerId]->controllers = recoveredControllers;
+
+ return Nothing();
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::isolate(
+ const ContainerID& containerId,
+ pid_t pid)
+{
+ vector<Future<Nothing>> isolates;
+
+ // Move the process into the container's cgroup.
+ if (infos.contains(containerId)) {
+ foreachvalue (const Owned<Controller>& controller, controllers) {
+ isolates.push_back(controller->isolate(
+ containerId,
+ infos[containerId]->cgroup,
+ pid));
+ }
+ }
+
+ return await(isolates)
+ .then(defer(
+ PID<Cgroups2IsolatorProcess>(this),
+ &Cgroups2IsolatorProcess::_isolate,
+ lambda::_1,
+ containerId,
+ pid));
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::_isolate(
+ const vector<Future<Nothing>>& futures,
+ const ContainerID& containerId,
+ pid_t pid)
+{
+ vector<string> errors;
+ foreach (const Future<Nothing>& future, futures) {
+ if (!future.isReady()) {
+ errors.push_back(future.isFailed() ? future.failure() : "discarded");
+ }
+ }
+
+ if (!errors.empty()) {
+ return Failure("Failed to prepare controllers: "
+ + strings::join(", ", errors));
+ }
+
+ Owned<Info> info = cgroupInfo(containerId);
+ if (!info.get()) {
+ return Failure(
+ "Failed to find cgroup for container '" + stringify(containerId) +
"'");
+ }
+
+ // Move the process into its leaf cgroup.
+ Try<Nothing> assign = cgroups2::assign(info->cgroup_leaf, pid);
+ if (assign.isError()) {
+ return Failure("Failed to assign container '" + stringify(containerId) +
"'"
+ " to cgroup '" + info->cgroup + "': " + assign.error());
+ }
+
+ return Nothing();
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::update(
+ const ContainerID& containerId,
+ const Resources& resourceRequests,
+ const google::protobuf::Map<string, Value::Scalar>& resourceLimits)
+{
+ if (!infos.contains(containerId)) {
+ return Failure("Unknown container '" + stringify(containerId) + "'");
+ }
+
+ vector<Future<Nothing>> updates;
+
+ LOG(INFO) << "Updating controllers for cgroup"
+ << " '" << infos[containerId]->cgroup << "'";
+
+ foreachvalue (const Owned<Controller>& controller, controllers) {
+ if (infos[containerId]->controllers.contains(controller->name())) {
+ updates.push_back(controller->update(
+ containerId,
+ infos[containerId]->cgroup,
+ resourceRequests,
+ resourceLimits));
+ }
+ }
+
+ return await(updates)
+ .then(defer(
+ PID<Cgroups2IsolatorProcess>(this),
+ &Cgroups2IsolatorProcess::_update,
+ lambda::_1));
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::_update(
+ const vector<Future<Nothing>>& futures)
+{
+ vector<string> errors;
+ foreach (const Future<Nothing>& future, futures) {
+ if (!future.isReady()) {
+ errors.push_back(future.isFailed() ? future.failure() : "discarded");
+ }
+ }
+
+ if (!errors.empty()) {
+ return Failure("Failed to update controllers: "
+ + strings::join(", ", errors));
+ }
+
+ return Nothing();
+}
+
+
+Future<ContainerStatus> Cgroups2IsolatorProcess::status(
+ const ContainerID& containerId)
+{
+ CHECK(infos.contains(containerId));
+
+ vector<Future<ContainerStatus>> statuses;
+ foreachvalue (const Owned<Controller>& controller, controllers) {
+ if (infos[containerId]->controllers.contains(controller->name())) {
+ statuses.push_back(controller->status(
+ containerId,
+ infos[containerId]->cgroup));
+ }
+ }
+
+ return await(statuses)
+ .then([containerId](const vector<Future<ContainerStatus>>& _statuses) {
+ ContainerStatus result;
+
+ foreach (const Future<ContainerStatus>& status, _statuses) {
+ if (status.isReady()) {
+ result.MergeFrom(status.get());
+ } else {
+ LOG(WARNING) << "Skipping status for container " << containerId
+ << " because: "
+ << (status.isFailed() ? status.failure() : "discarded");
+ }
+ }
+
+ return result;
+ });
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::cleanup(
+ const ContainerID& containerId)
+{
+ if (!infos.contains(containerId)) {
+ VLOG(1) << "Ignoring cleanup request for unknown container " << containerId;
+ return Nothing();
+ }
+
+ vector<Future<Nothing>> cleanups;
+ foreachvalue (const Owned<Controller>& controller, controllers) {
+ if (infos[containerId]->controllers.contains(controller->name())) {
+ cleanups.push_back(controller->cleanup(
+ containerId,
+ infos[containerId]->cgroup));
+ }
+ }
+
+ return await(cleanups)
+ .then(defer(
+ PID<Cgroups2IsolatorProcess>(this),
+ &Cgroups2IsolatorProcess::_cleanup,
+ containerId,
+ lambda::_1));
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::_cleanup(
+ const ContainerID& containerId,
+ const vector<Future<Nothing>>& futures)
+{
+ CHECK(infos.contains(containerId));
+
+ vector<string> errors;
+ foreach (const Future<Nothing>& future, futures) {
+ if (!future.isReady()) {
+ errors.push_back(future.isFailed() ? future.failure() : "discarded");
+ }
+ }
+
+ if (!errors.empty()) {
+ return Failure("Failed to cleanup subsystems: "
+ + strings::join(", ", errors));
+ }
+
+ if (cgroups2::exists(infos[containerId]->cgroup)) {
+ Try<Nothing> destroy = cgroups2::destroy(infos[containerId]->cgroup);
+ if (destroy.isError()) {
+ return Failure(
+ "Failed to destroy cgroup '" + infos[containerId]->cgroup + "': "
+ + destroy.error());
+ }
+ }
+
+ infos.erase(containerId);
+
+ return Nothing();
+}
+
+
+Owned<Cgroups2IsolatorProcess::Info> Cgroups2IsolatorProcess::cgroupInfo(
+ const ContainerID& containerId) const
+{
+ // `ContainerID`s are hierarchical, where each container id potentially has a
+ // parent container id. Here we walk up the hierarchy until we find a
+ // container id that has a corresponding info.
+
+ Option<ContainerID> current = containerId;
+ while (current.isSome()) {
+ Option<Owned<Info>> info = infos.get(*current);
+ if (info.isSome()) {
+ return *info;
+ }
+
+ if (!current->has_parent()) {
+ break;
+ }
+ current = current->parent();
+ }
+
+ return nullptr;
+}
+
} // namespace slave {
} // namespace internal {
} // namespace mesos {
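[Editorial note: to summarize the per-container flow implemented in prepare() and isolate() above, here is a condensed, hypothetical sketch built on the cgroups2 helpers that appear in this diff (cgroups2::create, cgroups2::controllers::enable, cgroups2::assign). The cgroup names and controller list are illustrative, and the controllers are assumed to already be enabled in the parent cgroup's cgroup.subtree_control; this is not code from the commit.]

    // Sketch only: mirrors the two-cgroup-per-container pattern used above.
    #include <sys/types.h>

    #include <string>

    #include <stout/error.hpp>
    #include <stout/nothing.hpp>
    #include <stout/try.hpp>

    #include "linux/cgroups2.hpp"

    // Hypothetical helper: place `pid` into a freshly created container cgroup
    // pair rooted at `root` (e.g. the agent's cgroups root).
    Try<Nothing> prepareAndIsolate(const std::string& root, pid_t pid)
    {
      const std::string nonLeaf = root + "/example-container"; // illustrative
      const std::string leaf = nonLeaf + "/leaf";

      // 1. Create the non-leaf and leaf cgroups.
      Try<Nothing> create = cgroups2::create(nonLeaf);
      if (create.isError()) {
        return Error("Failed to create '" + nonLeaf + "': " + create.error());
      }

      create = cgroups2::create(leaf);
      if (create.isError()) {
        return Error("Failed to create '" + leaf + "': " + create.error());
      }

      // 2. Enable controllers in the non-leaf cgroup; this is where the
      //    resource limits are applied (see `update()` above).
      Try<Nothing> enable =
        cgroups2::controllers::enable(nonLeaf, {"cpu", "memory"});
      if (enable.isError()) {
        return Error("Failed to enable controllers: " + enable.error());
      }

      // 3. Put the process into the leaf cgroup only; the non-leaf cgroup
      //    stays process-free, per the "no internal process" constraint.
      Try<Nothing> assign = cgroups2::assign(leaf, pid);
      if (assign.isError()) {
        return Error("Failed to assign pid: " + assign.error());
      }

      return Nothing();
    }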
diff --git a/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.hpp b/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.hpp
index c67cf777d..211b9a1df 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.hpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.hpp
@@ -18,9 +18,12 @@
#define __CGROUPS_V2_ISOLATOR_HPP__
#include <string>
+#include <vector>
+#include <process/future.hpp>
#include <process/owned.hpp>
+#include <stout/nothing.hpp>
#include <stout/hashmap.hpp>
#include <stout/try.hpp>
@@ -32,6 +35,24 @@ namespace mesos {
namespace internal {
namespace slave {
+// Cgroups v2 Mesos isolator.
+//
+// Manages the cgroup v2 controllers that are used by containers. Each
+// container is associated with two cgroups: a non-leaf cgroup whose control
+// files are updated and a leaf cgroup where the container's processes live.
+// The container pid cannot live in the non-leaf cgroup because of the cgroups
+// v2 internal process constraint:
+//
+// https://docs.kernel.org/admin-guide/cgroup-v2.html#no-internal-process-constraint // NOLINT
+//
+// Example cgroups:
+//      containerA                    non-leaf cgroup
+//      /       \                     /          \
+//  processes  containerB      leaf cgroup   non-leaf child cgroup
+//                 |                               |
+//             processes                       leaf-cgroup
+//
+// TODO(dleamy): Nested containers are not yet supported.
class Cgroups2IsolatorProcess : public MesosIsolatorProcess
{
public:
@@ -43,12 +64,103 @@ public:
bool supportsStandalone() override;
+ process::Future<Option<mesos::slave::ContainerLaunchInfo>> prepare(
+ const ContainerID& containerId,
+ const mesos::slave::ContainerConfig& containerConfig) override;
+
+ process::Future<Nothing> recover(
+ const std::vector<mesos::slave::ContainerState>& states,
+ const hashset<ContainerID>& orphans) override;
+
+ process::Future<Nothing> isolate(
+ const ContainerID& containerId,
+ pid_t pid) override;
+
+ process::Future<Nothing> update(
+ const ContainerID& containerId,
+ const Resources& resourceRequests,
+ const google::protobuf::Map<
+ std::string, Value::Scalar>& resourceLimits = {}) override;
+
+ process::Future<ContainerStatus> status(
+ const ContainerID& containerId) override;
+
+ process::Future<Nothing> cleanup(const ContainerID& containerId) override;
private:
+ struct Info
+ {
+ Info(const ContainerID& containerId,
+ const std::string& cgroup,
+ const std::string& cgroup_leaf)
+ : containerId(containerId), cgroup(cgroup), cgroup_leaf(cgroup_leaf) {}
+
+ const ContainerID containerId;
+
+ // Non-leaf cgroup for the container. Control files in this cgroup are
+ // updated to set resource constraints on this and descendant
+ // containers. Processes should not be assigned to this cgroup.
+ const std::string cgroup;
+ const std::string cgroup_leaf;
+
+ // Names of the controllers which are prepared for the container.
+ hashset<std::string> controllers;
+ };
+
Cgroups2IsolatorProcess(
+ const Flags& flags,
const hashmap<std::string, process::Owned<Controller>>& controllers);
+ process::Future<Option<mesos::slave::ContainerLaunchInfo>> _prepare(
+ const ContainerID& containerId,
+ const mesos::slave::ContainerConfig& containerConfig,
+ const std::vector<process::Future<Nothing>>& futures);
+
+ process::Future<Option<mesos::slave::ContainerLaunchInfo>> __prepare(
+ const ContainerID& containerId,
+ const mesos::slave::ContainerConfig& containerConfig);
+
+ process::Future<Nothing> _recover(
+ const hashset<ContainerID>& orphans,
+ const std::vector<process::Future<Nothing>>& futures);
+
+ process::Future<Nothing> __recover(
+ const hashset<ContainerID>& unknownOrphans,
+ const std::vector<process::Future<Nothing>>& futures);
+
+ process::Future<Nothing> ___recover(
+ const ContainerID& containerId);
+
+ process::Future<Nothing> ____recover(
+ const ContainerID& containerId,
+ const hashset<std::string>& recoveredControllers,
+ const std::vector<process::Future<Nothing>>& futures);
+
+ process::Future<Nothing> _isolate(
+ const std::vector<process::Future<Nothing>>& futures,
+ const ContainerID& containerId,
+ pid_t pid);
+
+ process::Future<Nothing> _update(
+ const std::vector<process::Future<Nothing>>& futures);
+
+ process::Future<Nothing> _cleanup(
+ const ContainerID& containerId,
+ const std::vector<process::Future<Nothing>>& futures);
+
+ process::Future<Nothing> __cleanup(
+ const ContainerID& containerId,
+ const std::vector<process::Future<Nothing>>& futures);
+
+ process::Owned<Cgroups2IsolatorProcess::Info> cgroupInfo(
+ const ContainerID& containerId) const;
+
+ Flags flags;
+
// Maps each controller to the `Controller` isolator that manages it.
- const hashmap<std::string, process::Owned<Controller>> controllers;
+ hashmap<std::string, process::Owned<Controller>> controllers;
+
+ // Associates a container with the information to access its controllers.
+ hashmap<ContainerID, process::Owned<Info>> infos;
};
} // namespace slave {
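[Editorial note: a small standalone sketch of the "no internal process" constraint referenced in the header comment above, which motivates the leaf/non-leaf split. The cgroup path is hypothetical and assumed to already exist; the exact errno is deliberately left unchecked.]

    // Sketch only: shows why container processes live in a separate leaf
    // cgroup. Once a (non-root) cgroup contains a process, the kernel refuses
    // to enable domain controllers in its cgroup.subtree_control.
    #include <unistd.h>

    #include <fstream>
    #include <iostream>
    #include <string>

    int main()
    {
      const std::string cgroup = "/sys/fs/cgroup/example"; // assumed to exist

      // Move this process into the cgroup.
      {
        std::ofstream procs(cgroup + "/cgroup.procs");
        procs << getpid() << std::flush;
      }

      // Now try to enable a controller for the cgroup's children. The write
      // is rejected because the cgroup has member processes, which is why the
      // isolator applies controls to an empty non-leaf cgroup and keeps the
      // container's processes in its "/leaf" child.
      std::ofstream subtree(cgroup + "/cgroup.subtree_control");
      subtree << "+memory" << std::flush;

      if (subtree.fail()) {
        std::cout << "Enabling controllers was rejected, as expected."
                  << std::endl;
      }

      return 0;
    }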