This is an automated email from the ASF dual-hosted git repository.

qianzhang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
     new 8a344da  Fixed a bug where the cgroup task killer leaves the cgroup 
frozen.
8a344da is described below

commit 8a344da266bdded895f73d7ab189e868d326f1e5
Author: Charles-Francois Natali <cf.nat...@gmail.com>
AuthorDate: Wed Jul 21 21:13:55 2021 +0800

    Fixed a bug where the cgroup task killer leaves the cgroup frozen.
    
    This closes #388
---
 src/linux/cgroups.cpp | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/linux/cgroups.cpp b/src/linux/cgroups.cpp
index 43d8ac0..11626cf 100644
--- a/src/linux/cgroups.cpp
+++ b/src/linux/cgroups.cpp
@@ -1398,15 +1398,14 @@ public:
   // Return a future indicating the state of the killer.
   // Failure occurs if any process in the cgroup is unable to be
   // killed.
+  // Discarding the future will cause this process to stop the next time it
+  // calls `freeze`: we don't want to stop at an arbitrary point since it might
+  // leave the cgroup frozen.
   Future<Nothing> future() { return promise.future(); }
 
 protected:
   void initialize() override
   {
-    // Stop when no one cares.
-    promise.future().onDiscard(lambda::bind(
-        static_cast<void (*)(const UPID&, bool)>(terminate), self(), true));
-
     killTasks();
   }
 
@@ -1425,16 +1424,15 @@ private:
       const PID<TasksKiller>& pid)
   {
     // Cancel the freeze operation.
-    // TODO(jieyu): Wait until 'future' is in DISCARDED state before
-    // starting retry.
     future.discard();
 
-    // We attempt to kill the processes before we thaw again,
-    // due to a bug in the kernel. See MESOS-1758 for more details.
-    // We thaw the cgroup before trying to freeze again to allow any
-    // pending signals to be delivered. See MESOS-1689 for details.
-    // This is a short term hack until we have PID namespace support.
-    return Future<bool>(true)
+    // Wait until the freeze is cancelled, and then attempt to kill the
+    // processes before we thaw again, due to a bug in the kernel. See
+    // MESOS-1758 for more details.  We thaw the cgroup before trying to freeze
+    // again to allow any pending signals to be delivered. See MESOS-1689 for
+    // details.  This is a short term hack until we have PID namespace support.
+    return future
+      .recover([](const Future<Nothing>&){ return Future<Nothing>(Nothing()); 
})
       .then(defer(pid, &Self::kill))
       .then(defer(pid, &Self::thaw))
       .then(defer(pid, &Self::freeze));
@@ -1452,6 +1450,12 @@ private:
 
   Future<Nothing> freeze()
   {
+    // Don't start another `killTasks` cycle if we've been asked to stop.
+    if (promise.future().hasDiscard()) {
+        terminate(self());
+        return Nothing();
+    }
+
     // TODO(jieyu): This is a workaround for MESOS-1689. We will move
     // away from freezer once we have pid namespace support.
     return cgroups::freezer::freeze(hierarchy, cgroup).after(

Reply via email to