Added inspect retries to the Docker executor. This patch adds retries for the `inspect` command to work around Docker daemon hangs. We assume that the Docker daemon can be temporarily unresponsive. While it is unresponsive, any Docker CLI command that is started hangs. To address this issue, we retry `inspect` in a loop.
Review: https://reviews.apache.org/r/65759/ Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/84c3b4c4 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/84c3b4c4 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/84c3b4c4 Branch: refs/heads/master Commit: 84c3b4c4b22b100644f46b561772732e942ceb49 Parents: 32fe390 Author: Andrei Budnik <abud...@mesosphere.com> Authored: Fri Mar 2 15:39:05 2018 -0800 Committer: Gilbert Song <songzihao1...@gmail.com> Committed: Fri Mar 2 15:40:31 2018 -0800 ---------------------------------------------------------------------- src/docker/executor.cpp | 46 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/84c3b4c4/src/docker/executor.cpp ---------------------------------------------------------------------- diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp index 8fe8a7c..1d67211 100644 --- a/src/docker/executor.cpp +++ b/src/docker/executor.cpp @@ -22,8 +22,10 @@ #include <mesos/executor.hpp> #include <mesos/mesos.hpp> +#include <process/collect.hpp> #include <process/delay.hpp> #include <process/id.hpp> +#include <process/loop.hpp> #include <process/owned.hpp> #include <process/process.hpp> #include <process/protobuf.hpp> @@ -215,13 +217,46 @@ public: run->onAny(defer(self(), &Self::reaped, lambda::_1)); + // Since the Docker daemon might hang, we have to retry the inspect command. + auto inspectLoop = loop( + self(), + [=]() { + return await( + docker->inspect(containerName, DOCKER_INSPECT_DELAY) + .after( + DOCKER_INSPECT_TIMEOUT, + [=](Future<Docker::Container> future) { + LOG(WARNING) << "Docker inspect timed out after " + << DOCKER_INSPECT_TIMEOUT + << " for container " + << "'" << containerName << "'"; + + // We need to clean up the hanging Docker CLI process. 
+ // Discarding the inspect future triggers a callback in + // the Docker library that kills the subprocess and + // transitions the future. + future.discard(); + return future; + })); + }, + [](const Future<Docker::Container>& future) + -> Future<ControlFlow<Docker::Container>> { + if (future.isReady()) { + return Break(future.get()); + } + if (future.isFailed()) { + return Failure(future.failure()); + } + return Continue(); + }); + // Delay sending TASK_RUNNING status update until we receive // inspect output. Note that we store a future that completes // after the sending of the running update. This allows us to // ensure that the terminal update is sent after the running // update (see `reaped()`). - inspect = docker->inspect(containerName, DOCKER_INSPECT_DELAY) - .then(defer(self(), [=](const Docker::Container& container) { + inspect = + inspectLoop.then(defer(self(), [=](const Docker::Container& container) { if (!killed) { containerPid = container.pid; @@ -322,13 +357,6 @@ public: return Nothing(); })); - inspect - .after(DOCKER_INSPECT_TIMEOUT, [=](const Future<Nothing>&) { - LOG(WARNING) << "Docker inspect has not finished after " - << DOCKER_INSPECT_TIMEOUT; - return inspect; - }); - inspect.onFailed(defer(self(), [=](const string& failure) { LOG(ERROR) << "Failed to inspect container '" << containerName << "'" << ": " << failure;