Enabled retries for `killTask` in docker executor.

Previously, after `docker stop` command failure, docker executor
neither allowed a scheduler to retry `killTask` command, nor retried
`killTask` when task kill was triggered by a failed health check.

Review: https://reviews.apache.org/r/61530/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/67723f01
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/67723f01
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/67723f01

Branch: refs/heads/1.3.x
Commit: 67723f01b5ca3583725011c691e8ffd6b89a02e5
Parents: 5852756
Author: Andrei Budnik <abud...@mesosphere.com>
Authored: Thu Aug 10 18:53:03 2017 +0200
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Aug 11 00:09:04 2017 +0200

----------------------------------------------------------------------
 src/docker/executor.cpp | 80 ++++++++++++++++++++++++++++++++------------
 1 file changed, 59 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/67723f01/src/docker/executor.cpp
----------------------------------------------------------------------
diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index 875b43c..c6f50fa 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -22,6 +22,7 @@
 #include <mesos/executor.hpp>
 #include <mesos/mesos.hpp>
 
+#include <process/delay.hpp>
 #include <process/id.hpp>
 #include <process/owned.hpp>
 #include <process/process.hpp>
@@ -70,6 +71,7 @@ namespace docker {
 
 const Duration DOCKER_INSPECT_DELAY = Milliseconds(500);
 const Duration DOCKER_INSPECT_TIMEOUT = Seconds(5);
+const Duration KILL_RETRY_INTERVAL = Seconds(5);
 
 // Executor that is responsible to execute a docker container and
 // redirect log output to configured stdout and stderr files. Similar
@@ -90,8 +92,9 @@ public:
       bool cgroupsEnableCfs)
     : ProcessBase(ID::generate("docker-executor")),
       killed(false),
-      killedByHealthCheck(false),
       terminated(false),
+      killedByHealthCheck(false),
+      killingInProgress(false),
       launcherDir(launcherDir),
       docker(docker),
       containerName(containerName),
@@ -355,8 +358,8 @@ private:
     // the grace period if a new one is provided.
 
     // Issue the kill signal if the container is running
-    // and we haven't killed it yet.
-    if (run.isSome() && !killed) {
+    // and kill attempt is not in progress.
+    if (run.isSome() && !killingInProgress) {
       // We have to issue the kill after 'docker inspect' has
       // completed, otherwise we may race with 'docker run'
       // and docker may not know about the container. Note
@@ -375,7 +378,15 @@ private:
     CHECK_SOME(taskId);
     CHECK_EQ(taskId_, taskId.get());
 
-    if (!terminated && !killed) {
+    if (!terminated && !killingInProgress) {
+      killingInProgress = true;
+
+      // Once the task has been transitioned to `killed`,
+      // there is no way back, even if the kill attempt
+      // failed. This also allows us to send TASK_KILLING
+      // only once, regardless of how many kill attempts
+      // have been made.
+      //
       // Because we rely on `killed` to determine whether
       // to send TASK_KILLED, we set `killed` only once the
       // kill is issued. If we set it earlier we're more
@@ -383,23 +394,25 @@ private:
       // signaled the container. Note that in general it's
       // a race between signaling and the container
       // terminating with a non-zero exit status.
-      killed = true;
-
-      // Send TASK_KILLING if the framework can handle it.
-      if (protobuf::frameworkHasCapability(
-              frameworkInfo.get(),
-              FrameworkInfo::Capability::TASK_KILLING_STATE)) {
-        // TODO(alexr): Use `protobuf::createTaskStatus()`
-        // instead of manually setting fields.
-        TaskStatus status;
-        status.mutable_task_id()->CopyFrom(taskId.get());
-        status.set_state(TASK_KILLING);
-        driver.get()->sendStatusUpdate(status);
-      }
+      if (!killed) {
+        killed = true;
 
-      // Stop health checking the task.
-      if (checker.get() != nullptr) {
-        checker->pause();
+        // Send TASK_KILLING if the framework can handle it.
+        if (protobuf::frameworkHasCapability(
+                frameworkInfo.get(),
+                FrameworkInfo::Capability::TASK_KILLING_STATE)) {
+          // TODO(alexr): Use `protobuf::createTaskStatus()`
+          // instead of manually setting fields.
+          TaskStatus status;
+          status.mutable_task_id()->CopyFrom(taskId.get());
+          status.set_state(TASK_KILLING);
+          driver.get()->sendStatusUpdate(status);
+        }
+
+        // Stop health checking the task.
+        if (checker.get() != nullptr) {
+          checker->pause();
+        }
       }
 
       // TODO(bmahler): Replace this with 'docker kill' so
@@ -407,9 +420,32 @@ private:
       // a `KillPolicy` override.
       stop = docker->stop(containerName, gracePeriod);
 
+      // Invoking `docker stop` might be unsuccessful, in which case the
+      // container most probably does not receive the signal. In this case we
+      // should allow schedulers to retry the kill operation or, if the kill
+      // was initiated by a failing health check, retry ourselves. We do not
+      // bail out nor stop retrying to avoid sending a terminal status update
+      // while the container might still be running.
+      //
+      // NOTE: `docker stop` might also hang. We do not address this for now,
+      // because there is no evidence that in this case docker daemon might
+      // function properly, i.e., it's only the docker cli command that hangs,
+      // and hence there is not so much we can do. See MESOS-6743.
       stop.onFailed(defer(self(), [=](const string& failure) {
         LOG(ERROR) << "Failed to stop container '" << containerName << "'"
                    << ": " << failure;
+
+        killingInProgress = false;
+
+        if (killedByHealthCheck) {
+          LOG(INFO) << "Retrying to kill task in " << KILL_RETRY_INTERVAL;
+          delay(
+              KILL_RETRY_INTERVAL,
+              self(),
+              &Self::_killTask,
+              taskId_,
+              gracePeriod);
+        }
       }));
     }
   }
@@ -597,9 +633,11 @@ private:
   // TODO(alexr): Introduce a state enum and document transitions,
   // see MESOS-5252.
   bool killed;
-  bool killedByHealthCheck;
   bool terminated;
 
+  bool killedByHealthCheck;
+  bool killingInProgress; // Guard against simultaneous kill attempts.
+
   string launcherDir;
   Owned<Docker> docker;
   string containerName;

Reply via email to