Added inspect retries to the Docker executor. This patch adds retries for the `inspect` command to work around Docker daemon hangs. We assume that the Docker daemon can be temporarily unresponsive. While it is unresponsive, any Docker CLI command that is started hangs. To address the issue, we retry `inspect` in a loop.
Review: https://reviews.apache.org/r/65759/ Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/513c8dd2 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/513c8dd2 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/513c8dd2 Branch: refs/heads/1.4.x Commit: 513c8dd2c18911ec1090a67193faf1d28a1b2a1f Parents: 9018409 Author: Andrei Budnik <abud...@mesosphere.com> Authored: Fri Mar 2 15:39:05 2018 -0800 Committer: Gilbert Song <songzihao1...@gmail.com> Committed: Mon Mar 5 18:11:12 2018 -0800 ---------------------------------------------------------------------- src/docker/executor.cpp | 46 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/513c8dd2/src/docker/executor.cpp ---------------------------------------------------------------------- diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp index 5df8707..4b5f257 100644 --- a/src/docker/executor.cpp +++ b/src/docker/executor.cpp @@ -22,8 +22,10 @@ #include <mesos/executor.hpp> #include <mesos/mesos.hpp> +#include <process/collect.hpp> #include <process/delay.hpp> #include <process/id.hpp> +#include <process/loop.hpp> #include <process/owned.hpp> #include <process/process.hpp> #include <process/protobuf.hpp> @@ -204,13 +206,46 @@ public: run->onAny(defer(self(), &Self::reaped, lambda::_1)); + // Since the Docker daemon might hang, we have to retry the inspect command. + auto inspectLoop = loop( + self(), + [=]() { + return await( + docker->inspect(containerName, DOCKER_INSPECT_DELAY) + .after( + DOCKER_INSPECT_TIMEOUT, + [=](Future<Docker::Container> future) { + LOG(WARNING) << "Docker inspect timed out after " + << DOCKER_INSPECT_TIMEOUT + << " for container " + << "'" << containerName << "'"; + + // We need to clean up the hanging Docker CLI process. 
+ // Discarding the inspect future triggers a callback in + // the Docker library that kills the subprocess and + // transitions the future. + future.discard(); + return future; + })); + }, + [](const Future<Docker::Container>& future) + -> Future<ControlFlow<Docker::Container>> { + if (future.isReady()) { + return Break(future.get()); + } + if (future.isFailed()) { + return Failure(future.failure()); + } + return Continue(); + }); + // Delay sending TASK_RUNNING status update until we receive // inspect output. Note that we store a future that completes // after the sending of the running update. This allows us to // ensure that the terminal update is sent after the running // update (see `reaped()`). - inspect = docker->inspect(containerName, DOCKER_INSPECT_DELAY) - .then(defer(self(), [=](const Docker::Container& container) { + inspect = + inspectLoop.then(defer(self(), [=](const Docker::Container& container) { if (!killed) { containerPid = container.pid; @@ -297,13 +332,6 @@ public: return Nothing(); })); - inspect - .after(DOCKER_INSPECT_TIMEOUT, [=](const Future<Nothing>&) { - LOG(WARNING) << "Docker inspect has not finished after " - << DOCKER_INSPECT_TIMEOUT; - return inspect; - }); - inspect.onFailed(defer(self(), [=](const string& failure) { LOG(ERROR) << "Failed to inspect container '" << containerName << "'" << ": " << failure;