Repository: mesos Updated Branches: refs/heads/master b0a269691 -> 76c38f9d0
Handled hanging docker `stop`, `inspect` commands in docker executor. Previosly, if `docker inspect` command hanged, the docker container ended up in an unkillable state. This patch adds a timeout for inspect command after receiving `killTask` analogically to `reaped` handler. In addition we've added a timeout for `docker stop` command. If docker `stop` or `inspect` command times out, we discard the related future, thus the docker library kills previously spawned docker cli subprocess. As a result, a scheduler can retry `killTask` operation to handle nasty docker bugs that lead to hanging docker cli. Review: https://reviews.apache.org/r/65713/ Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/32fe3905 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/32fe3905 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/32fe3905 Branch: refs/heads/master Commit: 32fe39055545a6511c1613be9907cbb3357d86a4 Parents: 8346ab0 Author: Andrei Budnik <abud...@mesosphere.com> Authored: Fri Mar 2 15:38:59 2018 -0800 Committer: Gilbert Song <songzihao1...@gmail.com> Committed: Fri Mar 2 15:40:30 2018 -0800 ---------------------------------------------------------------------- src/docker/executor.cpp | 68 ++++++++++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 21 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/32fe3905/src/docker/executor.cpp ---------------------------------------------------------------------- diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp index 93c3e1d..8fe8a7c 100644 --- a/src/docker/executor.cpp +++ b/src/docker/executor.cpp @@ -97,7 +97,6 @@ public: killed(false), terminated(false), killedByHealthCheck(false), - killingInProgress(false), launcherDir(launcherDir), docker(docker), containerName(containerName), @@ -323,6 +322,13 @@ public: return Nothing(); })); + inspect + .after(DOCKER_INSPECT_TIMEOUT, [=](const Future<Nothing>&) { + LOG(WARNING) << "Docker inspect has not finished after " + << DOCKER_INSPECT_TIMEOUT; + return inspect; + }); + inspect.onFailed(defer(self(), [=](const string& failure) { LOG(ERROR) << "Failed to inspect container '" << containerName << "'" << ": " << failure; @@ -442,9 +448,8 @@ private: // TODO(alexr): If a kill is in progress, consider adjusting // the grace period if a new one is provided. - // Issue the kill signal if the container is running - // and kill attempt is not in progress. - if (run.isSome() && !killingInProgress) { + // Issue the kill signal if there was an attempt to launch the container. + if (run.isSome()) { // We have to issue the kill after 'docker inspect' has // completed, otherwise we may race with 'docker run' // and docker may not know about the container. Note @@ -453,6 +458,15 @@ private: // issued the kill. inspect .onAny(defer(self(), &Self::_killTask, _taskId, gracePeriod)); + + // If the inspect takes too long we discard it to ensure we + // don't wait forever, however in this case there may be no + // TASK_RUNNING update. + inspect + .after(DOCKER_INSPECT_TIMEOUT, [=](const Future<Nothing>&) { + inspect.discard(); + return inspect; + }); } } @@ -463,9 +477,7 @@ private: CHECK_SOME(taskId); CHECK_EQ(taskId_, taskId.get()); - if (!terminated && !killingInProgress) { - killingInProgress = true; - + if (!terminated) { // Once the task has been transitioned to `killed`, // there is no way back, even if the kill attempt // failed. This also allows us to send TASK_KILLING @@ -500,28 +512,43 @@ private: } } + // If a previous attempt to stop a Docker container is still in progress, + // we need to kill the hanging Docker CLI subprocess. Discarding this + // future triggers a callback in the Docker library that kills the + // subprocess. + if (stop.isPending()) { + LOG(WARNING) << "Previous docker stop has not terminated yet" + << " for container '" << containerName << "'"; + stop.discard(); + } + // TODO(bmahler): Replace this with 'docker kill' so // that we can adjust the grace period in the case of // a `KillPolicy` override. + // + // NOTE: `docker stop` may or may not finish. Our behaviour is to give + // the subprocess a chance to finish until next time `_killtask` is + // invoked. Also, invoking `docker stop` might be unsuccessful, in which + // case the container most probably does not receive the signal. + // In any case we should allow schedulers to retry the kill operation or, + // if the kill was initiated by a failing health check, retry ourselves. + // We do not bail out nor stop retrying to avoid sending a terminal + // status update while the container might still be running. stop = docker->stop(containerName, gracePeriod); - // Invoking `docker stop` might be unsuccessful, in which case the - // container most probably does not receive the signal. In this case we - // should allow schedulers to retry the kill operation or, if the kill - // was initiated by a failing health check, retry ourselves. We do not - // bail out nor stop retrying to avoid sending a terminal status update - // while the container might still be running. - // - // NOTE: `docker stop` might also hang. We do not address this for now, - // because there is no evidence that in this case docker daemon might - // function properly, i.e., it's only the docker cli command that hangs, - // and hence there is not so much we can do. See MESOS-6743. + if (killedByHealthCheck) { + stop + .after(KILL_RETRY_INTERVAL, defer(self(), [=](Future<Nothing>) { + LOG(INFO) << "Retrying to kill task"; + _killTask(taskId_, gracePeriod); + return stop; + })); + } + stop.onFailed(defer(self(), [=](const string& failure) { LOG(ERROR) << "Failed to stop container '" << containerName << "'" << ": " << failure; - killingInProgress = false; - if (killedByHealthCheck) { LOG(INFO) << "Retrying to kill task in " << KILL_RETRY_INTERVAL; delay( @@ -724,7 +751,6 @@ private: bool terminated; bool killedByHealthCheck; - bool killingInProgress; // Guard against simultaneous kill attempts. string launcherDir; Owned<Docker> docker;