This is an automated email from the ASF dual-hosted git repository.
qianzhang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git
The following commit(s) were added to refs/heads/master by this push:
new 8a344da Fixed a bug where the cgroup task killer leaves the cgroup
frozen.
8a344da is described below
commit 8a344da266bdded895f73d7ab189e868d326f1e5
Author: Charles-Francois Natali <[email protected]>
AuthorDate: Wed Jul 21 21:13:55 2021 +0800
Fixed a bug where the cgroup task killer leaves the cgroup frozen.
This closes #388
---
src/linux/cgroups.cpp | 28 ++++++++++++++++------------
1 file changed, 16 insertions(+), 12 deletions(-)
diff --git a/src/linux/cgroups.cpp b/src/linux/cgroups.cpp
index 43d8ac0..11626cf 100644
--- a/src/linux/cgroups.cpp
+++ b/src/linux/cgroups.cpp
@@ -1398,15 +1398,14 @@ public:
// Return a future indicating the state of the killer.
// Failure occurs if any process in the cgroup is unable to be
// killed.
+ // Discarding the future will cause this process to stop the next time it
+ // calls `freeze`: we don't want to stop at an arbitrary point since it might
+ // leave the cgroup frozen.
Future<Nothing> future() { return promise.future(); }
protected:
void initialize() override
{
- // Stop when no one cares.
- promise.future().onDiscard(lambda::bind(
- static_cast<void (*)(const UPID&, bool)>(terminate), self(), true));
-
killTasks();
}
@@ -1425,16 +1424,15 @@ private:
const PID<TasksKiller>& pid)
{
// Cancel the freeze operation.
- // TODO(jieyu): Wait until 'future' is in DISCARDED state before
- // starting retry.
future.discard();
- // We attempt to kill the processes before we thaw again,
- // due to a bug in the kernel. See MESOS-1758 for more details.
- // We thaw the cgroup before trying to freeze again to allow any
- // pending signals to be delivered. See MESOS-1689 for details.
- // This is a short term hack until we have PID namespace support.
- return Future<bool>(true)
+ // Wait until the freeze is cancelled, and then attempt to kill the
+ // processes before we thaw again, due to a bug in the kernel. See
+ // MESOS-1758 for more details. We thaw the cgroup before trying to freeze
+ // again to allow any pending signals to be delivered. See MESOS-1689 for
+ // details. This is a short term hack until we have PID namespace support.
+ return future
+ .recover([](const Future<Nothing>&){ return Future<Nothing>(Nothing());
})
.then(defer(pid, &Self::kill))
.then(defer(pid, &Self::thaw))
.then(defer(pid, &Self::freeze));
@@ -1452,6 +1450,12 @@ private:
Future<Nothing> freeze()
{
+ // Don't start another `killTasks` cycle if we've been asked to stop.
+ if (promise.future().hasDiscard()) {
+ terminate(self());
+ return Nothing();
+ }
+
// TODO(jieyu): This is a workaround for MESOS-1689. We will move
// away from freezer once we have pid namespace support.
return cgroups::freezer::freeze(hierarchy, cgroup).after(