Enabled retries for `killTask` in docker executor.

Previously, after `docker stop` command failure, docker executor
neither allowed a scheduler to retry `killTask` command, nor retried
`killTask` when task kill was triggered by a failed health check.

Review: https://reviews.apache.org/r/61530/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/8440065c
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/8440065c
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/8440065c

Branch: refs/heads/1.2.x
Commit: 8440065c5c4ce1d9c026ae225bed984e855d080b
Parents: 89747f4
Author: Andrei Budnik <abud...@mesosphere.com>
Authored: Thu Aug 10 18:53:03 2017 +0200
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Aug 11 01:29:48 2017 +0200

----------------------------------------------------------------------
 src/docker/executor.cpp | 76 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 57 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/8440065c/src/docker/executor.cpp
----------------------------------------------------------------------
diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index e638451..d7f2134 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -22,6 +22,7 @@
 #include <mesos/executor.hpp>
 #include <mesos/mesos.hpp>
 
+#include <process/delay.hpp>
 #include <process/id.hpp>
 #include <process/owned.hpp>
 #include <process/process.hpp>
@@ -70,6 +71,7 @@ namespace docker {
 
 const Duration DOCKER_INSPECT_DELAY = Milliseconds(500);
 const Duration DOCKER_INSPECT_TIMEOUT = Seconds(5);
+const Duration KILL_RETRY_INTERVAL = Seconds(5);
 
 // Executor that is responsible to execute a docker container and
 // redirect log output to configured stdout and stderr files. Similar
@@ -89,8 +91,9 @@ public:
       const map<string, string>& taskEnvironment)
     : ProcessBase(ID::generate("docker-executor")),
       killed(false),
-      killedByHealthCheck(false),
       terminated(false),
+      killedByHealthCheck(false),
+      killingInProgress(false),
       launcherDir(launcherDir),
       docker(docker),
       containerName(containerName),
@@ -325,8 +328,8 @@ private:
     // the grace period if a new one is provided.
 
     // Issue the kill signal if the container is running
-    // and we haven't killed it yet.
-    if (run.isSome() && !killed) {
+    // and kill attempt is not in progress.
+    if (run.isSome() && !killingInProgress) {
       // We have to issue the kill after 'docker inspect' has
       // completed, otherwise we may race with 'docker run'
       // and docker may not know about the container. Note
@@ -345,7 +348,15 @@ private:
     CHECK_SOME(taskId);
     CHECK_EQ(taskId_, taskId.get());
 
-    if (!terminated && !killed) {
+    if (!terminated && !killingInProgress) {
+      killingInProgress = true;
+
+      // Once the task has been transitioned to `killed`,
+      // there is no way back, even if the kill attempt
+      // failed. This also allows us to send TASK_KILLING
+      // only once, regardless of how many kill attempts
+      // have been made.
+      //
       // Because we rely on `killed` to determine whether
       // to send TASK_KILLED, we set `killed` only once the
       // kill is issued. If we set it earlier we're more
@@ -353,21 +364,23 @@ private:
       // signaled the container. Note that in general it's
       // a race between signaling and the container
       // terminating with a non-zero exit status.
-      killed = true;
-
-      // Send TASK_KILLING if the framework can handle it.
-      if (protobuf::frameworkHasCapability(
-              frameworkInfo.get(),
-              FrameworkInfo::Capability::TASK_KILLING_STATE)) {
-        TaskStatus status;
-        status.mutable_task_id()->CopyFrom(taskId.get());
-        status.set_state(TASK_KILLING);
-        driver.get()->sendStatusUpdate(status);
-      }
+      if (!killed) {
+        killed = true;
+
+        // Send TASK_KILLING if the framework can handle it.
+        if (protobuf::frameworkHasCapability(
+                frameworkInfo.get(),
+                FrameworkInfo::Capability::TASK_KILLING_STATE)) {
+          TaskStatus status;
+          status.mutable_task_id()->CopyFrom(taskId.get());
+          status.set_state(TASK_KILLING);
+          driver.get()->sendStatusUpdate(status);
+        }
 
-      // Stop health checking the task.
-      if (checker.get() != nullptr) {
-        checker->stop();
+        // Stop health checking the task.
+        if (checker.get() != nullptr) {
+          checker->stop();
+        }
       }
 
       // TODO(bmahler): Replace this with 'docker kill' so
@@ -375,9 +388,32 @@ private:
       // a `KillPolicy` override.
       stop = docker->stop(containerName, gracePeriod);
 
+      // Invoking `docker stop` might be unsuccessful, in which case the
+      // container most probably does not receive the signal. In this case we
+      // should allow schedulers to retry the kill operation or, if the kill
+      // was initiated by a failing health check, retry ourselves. We do not
+      // bail out nor stop retrying to avoid sending a terminal status update
+      // while the container might still be running.
+      //
+      // NOTE: `docker stop` might also hang. We do not address this for now,
+      // because there is no evidence that in this case docker daemon might
+      // function properly, i.e., it's only the docker cli command that hangs,
+      // and hence there is not so much we can do. See MESOS-6743.
       stop.onFailed(defer(self(), [=](const string& failure) {
         LOG(ERROR) << "Failed to stop container '" << containerName << "'"
                    << ": " << failure;
+
+        killingInProgress = false;
+
+        if (killedByHealthCheck) {
+          LOG(INFO) << "Retrying to kill task in " << KILL_RETRY_INTERVAL;
+          delay(
+              KILL_RETRY_INTERVAL,
+              self(),
+              &Self::_killTask,
+              taskId_,
+              gracePeriod);
+        }
       }));
     }
   }
@@ -552,9 +588,11 @@ private:
   // TODO(alexr): Introduce a state enum and document transitions,
   // see MESOS-5252.
   bool killed;
-  bool killedByHealthCheck;
   bool terminated;
 
+  bool killedByHealthCheck;
+  bool killingInProgress; // Guard against simultaneous kill attempts.
+
   string launcherDir;
   Owned<Docker> docker;
   string containerName;

Reply via email to