This is an automated email from the ASF dual-hosted git repository.
jpeach pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git
The following commit(s) were added to refs/heads/master by this push:
new a535849 Assign cgroup processes after configuring the subsystem.
a535849 is described below
commit a535849448209356b14f18958b1738fa7d4459a9
Author: James Peach <[email protected]>
AuthorDate: Tue Jun 18 13:44:45 2019 -0700
Assign cgroup processes after configuring the subsystem.
Currently, the PID targeted by the cgroups isolator is moved into
the cgroup before the subsystem runs to apply any type-specific
cgroup configuration. We should reverse the order of this so that
the PID is only moved once the cgroup is fully configured by the
subsystem.
A specific case that can happen is where a PID is assigned to a net_cls
cgroup before that cgroup has its class ID set. This intermediate
process state can be observed by system monitoring process, causing
confusion that is hard to debug.
Review: https://reviews.apache.org/r/70863/
---
.../mesos/isolators/cgroups/cgroups.cpp | 114 +++++++++++----------
.../mesos/isolators/cgroups/cgroups.hpp | 4 +-
2 files changed, 64 insertions(+), 54 deletions(-)
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
index e7819d7..b12b73d 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
@@ -663,48 +663,6 @@ Future<Nothing> CgroupsIsolatorProcess::isolate(
const ContainerID& containerId,
pid_t pid)
{
- // If we are a nested container, we inherit
- // the cgroup from our root ancestor.
- ContainerID rootContainerId = protobuf::getRootContainerId(containerId);
-
- if (!infos.contains(rootContainerId)) {
- return Failure("Failed to isolate the container: Unknown root container");
- }
-
- const string& cgroup = infos[rootContainerId]->cgroup;
-
- // TODO(haosdent): Use foreachkey once MESOS-5037 is resolved.
- foreach (const string& hierarchy, subsystems.keys()) {
- // If new cgroup subsystems are added after the agent
- // upgrade, the newly added cgroup subsystems do not
- // exist on old container's cgroup hierarchy. So skip
- // assigning the pid to this cgroup subsystem.
- if (containerId.has_parent() && !cgroups::exists(hierarchy, cgroup)) {
- LOG(INFO) << "Skipping assigning pid " << stringify(pid)
- << " to cgroup at '" << path::join(hierarchy, cgroup)
- << "' for container " << containerId
- << " because its parent container " << containerId.parent()
- << " does not have this cgroup hierarchy";
- continue;
- }
-
- Try<Nothing> assign = cgroups::assign(
- hierarchy,
- cgroup,
- pid);
-
- if (assign.isError()) {
- string message =
- "Failed to assign container " + stringify(containerId) +
- " pid " + stringify(pid) + " to cgroup at '" +
- path::join(hierarchy, cgroup) + "': " + assign.error();
-
- LOG(ERROR) << message;
-
- return Failure(message);
- }
- }
-
// We currently can't call `subsystem->isolate()` on nested
// containers, because we don't call `prepare()`, `recover()`, or
// `cleanup()` on them either. If we were to call `isolate()` on
@@ -717,30 +675,39 @@ Future<Nothing> CgroupsIsolatorProcess::isolate(
// TODO(klueska): In the future we should revisit this to make
// sure that doing things this way is sufficient (or otherwise
// update our invariants to allow us to call this here).
- if (containerId.has_parent()) {
- return Nothing();
- }
vector<Future<Nothing>> isolates;
- foreachvalue (const Owned<Subsystem>& subsystem, subsystems) {
- isolates.push_back(subsystem->isolate(
- containerId,
- infos[containerId]->cgroup,
- pid));
+
+ if (!containerId.has_parent()) {
+ foreachvalue (const Owned<Subsystem>& subsystem, subsystems) {
+ isolates.push_back(subsystem->isolate(
+ containerId,
+ infos[containerId]->cgroup,
+ pid));
+ }
}
+ // We isolate all the subsystems first so that the subsystem is
+ // fully configured before assigning the process. This improves
+ // the atomicity of the assignment for any system processes that
+ // observe it.
return await(isolates)
.then(defer(
PID<CgroupsIsolatorProcess>(this),
&CgroupsIsolatorProcess::_isolate,
- lambda::_1));
+ lambda::_1,
+ containerId,
+ pid));
}
Future<Nothing> CgroupsIsolatorProcess::_isolate(
- const vector<Future<Nothing>>& futures)
+ const vector<Future<Nothing>>& futures,
+ const ContainerID& containerId,
+ pid_t pid)
{
vector<string> errors;
+
foreach (const Future<Nothing>& future, futures) {
if (!future.isReady()) {
errors.push_back((future.isFailed()
@@ -749,12 +716,53 @@ Future<Nothing> CgroupsIsolatorProcess::_isolate(
}
}
- if (errors.size() > 0) {
+ if (!errors.empty()) {
return Failure(
"Failed to isolate subsystems: " +
strings::join(";", errors));
}
+ // If we are a nested container, we inherit the cgroup from our parent.
+ const ContainerID rootContainerId =
protobuf::getRootContainerId(containerId);
+
+ if (!infos.contains(rootContainerId)) {
+ return Failure("Failed to isolate the container: Unknown root container");
+ }
+
+ const string& cgroup = infos[rootContainerId]->cgroup;
+
+ // TODO(haosdent): Use foreachkey once MESOS-5037 is resolved.
+ foreach (const string& hierarchy, subsystems.keys()) {
+ // If new cgroup subsystems are added after the agent
+ // upgrade, the newly added cgroup subsystems do not
+ // exist on old container's cgroup hierarchy. So skip
+ // assigning the pid to this cgroup subsystem.
+ if (containerId.has_parent() && !cgroups::exists(hierarchy, cgroup)) {
+ LOG(INFO) << "Skipping assigning pid " << stringify(pid)
+ << " to cgroup at '" << path::join(hierarchy, cgroup)
+ << "' for container " << containerId
+ << " because its parent container " << containerId.parent()
+ << " does not have this cgroup hierarchy";
+ continue;
+ }
+
+ Try<Nothing> assign = cgroups::assign(
+ hierarchy,
+ cgroup,
+ pid);
+
+ if (assign.isError()) {
+ string message =
+ "Failed to assign container " + stringify(containerId) +
+ " pid " + stringify(pid) + " to cgroup at '" +
+ path::join(hierarchy, cgroup) + "': " + assign.error();
+
+ LOG(ERROR) << message;
+
+ return Failure(message);
+ }
+ }
+
return Nothing();
}
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.hpp
b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.hpp
index 4a1871b..4bd3d6d 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.hpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.hpp
@@ -130,7 +130,9 @@ private:
const mesos::slave::ContainerConfig& containerConfig);
process::Future<Nothing> _isolate(
- const std::vector<process::Future<Nothing>>& futures);
+ const std::vector<process::Future<Nothing>>& futures,
+ const ContainerID& containerId,
+ pid_t pid);
void _watch(
const ContainerID& containerId,