Added inspect retries to the docker containerizer in `update` method.

This patch fixes the bug when a terminal status update is never sent
after termination of the docker executor, when the Docker daemon hangs
for `inspect` command. A terminal status update is postponed until
containerizer `update` completes that uses `inspect` command to get
a PID of the docker container. To address the issue, we retry `inspect`
in the loop.

Review: https://reviews.apache.org/r/65868/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/84c97007
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/84c97007
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/84c97007

Branch: refs/heads/1.4.x
Commit: 84c97007ca01b4e3d3851a899a38ae2e13493e5f
Parents: 513c8dd
Author: Andrei Budnik <abud...@mesosphere.com>
Authored: Fri Mar 2 15:39:09 2018 -0800
Committer: Gilbert Song <songzihao1...@gmail.com>
Committed: Mon Mar 5 18:11:13 2018 -0800

----------------------------------------------------------------------
 src/slave/constants.hpp            |  3 +++
 src/slave/containerizer/docker.cpp | 42 ++++++++++++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/84c97007/src/slave/constants.hpp
----------------------------------------------------------------------
diff --git a/src/slave/constants.hpp b/src/slave/constants.hpp
index fac831c..71837e7 100644
--- a/src/slave/constants.hpp
+++ b/src/slave/constants.hpp
@@ -133,6 +133,9 @@ constexpr Duration DOCKER_REMOVE_DELAY = Hours(6);
 // container.
 constexpr Duration DOCKER_INSPECT_DELAY = Seconds(1);
 
+// Default duration to wait for `inspect` command completion.
+constexpr Duration DOCKER_INSPECT_TIMEOUT = Seconds(5);
+
 // Default maximum number of docker inspect calls docker ps will invoke
 // in parallel to prevent hitting system's open file descriptor limit.
 constexpr size_t DOCKER_PS_MAX_INSPECT_CALLS = 100;

http://git-wip-us.apache.org/repos/asf/mesos/blob/84c97007/src/slave/containerizer/docker.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/docker.cpp 
b/src/slave/containerizer/docker.cpp
index e261a42..79d1caa 100644
--- a/src/slave/containerizer/docker.cpp
+++ b/src/slave/containerizer/docker.cpp
@@ -1681,7 +1681,47 @@ Future<Nothing> DockerContainerizerProcess::update(
     return __update(containerId, _resources, container->pid.get());
   }
 
-  return docker->inspect(containers_.at(containerId)->containerName)
+  string containerName = containers_.at(containerId)->containerName;
+
+  // Since the Docker daemon might hang, we have to retry the inspect command.
+  //
+  // NOTE: This code is duplicated from the built-in docker executor, but
+  // the retry interval is not passed to `inspect`, because the container might
+  // be terminated.
+  // TODO(abudnik): Consider using a class helper for retrying docker commands.
+  auto inspectLoop = loop(
+      self(),
+      [=]() {
+        return await(
+            docker->inspect(containerName)
+              .after(
+                  slave::DOCKER_INSPECT_TIMEOUT,
+                  [=](Future<Docker::Container> future) {
+                    LOG(WARNING) << "Docker inspect timed out after "
+                                 << slave::DOCKER_INSPECT_TIMEOUT
+                                 << " for container "
+                                 << "'" << containerName << "'";
+
+                    // We need to clean up the hanging Docker CLI process.
+                    // Discarding the inspect future triggers a callback in
+                    // the Docker library that kills the subprocess and
+                    // transitions the future.
+                    future.discard();
+                    return future;
+                  }));
+      },
+      [](const Future<Docker::Container>& future)
+          -> Future<ControlFlow<Docker::Container>> {
+        if (future.isReady()) {
+          return Break(future.get());
+        }
+        if (future.isFailed()) {
+          return Failure(future.failure());
+        }
+        return Continue();
+      });
+
+  return inspectLoop
     .then(defer(self(), &Self::_update, containerId, _resources, lambda::_1));
 #else
   return Nothing();

Reply via email to