This is an automated email from the ASF dual-hosted git repository. abudnik pushed a commit to branch 1.9.x in repository https://gitbox.apache.org/repos/asf/mesos.git
commit 3d60cba39d0377a7dc19b4c47f3bb0807418fe50 Author: Andrei Budnik <abud...@apache.org> AuthorDate: Wed Jan 29 13:35:02 2020 +0100 Changed termination logic of the Docker executor. Previously, the Docker executor terminated itself after a task's container had terminated. This could lead to termination of the executor before processing of a terminal status update by the agent. In order to mitigate this issue, the executor slept for one second to give a chance to send all status updates and receive all status update acknowledgments before terminating itself. This might have led to various race conditions in some circumstances (e.g., on a slow host). This patch terminates the Docker executor after receiving a terminal status update acknowledgment. Also, this patch increases the timeout from one second to one minute for fail-safety. Review: https://reviews.apache.org/r/72055 --- src/docker/executor.cpp | 14 ++++++-------- src/exec/exec.cpp | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp index 132f42b..ebbbc0d 100644 --- a/src/docker/executor.cpp +++ b/src/docker/executor.cpp @@ -224,7 +224,9 @@ public: driver->sendStatusUpdate(status); - _stop(); + // This is a fail safe in case the agent doesn't send an ACK for + // the terminal update for some reason. + delay(Seconds(60), self(), &Self::_stop); return; } @@ -773,17 +775,13 @@ private: CHECK_SOME(driver); driver.get()->sendStatusUpdate(taskStatus); - _stop(); + // This is a fail safe in case the agent doesn't send an ACK for + // the terminal update for some reason. + delay(Seconds(60), self(), &Self::_stop); } void _stop() { - // A hack for now ... but we need to wait until the status update - // is sent to the slave before we shut ourselves down. - // TODO(tnachen): Remove this hack and also the same hack in the - // command executor when we have the new HTTP APIs to wait until - // an ack. - os::sleep(Seconds(1)); driver.get()->stop(); } diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp index 67e082e..8ed77f2 100644 --- a/src/exec/exec.cpp +++ b/src/exec/exec.cpp @@ -409,10 +409,29 @@ protected: return; } + if (!updates.contains(uuid_.get())) { + LOG(WARNING) << "Ignoring unknown status update acknowledgement " + << uuid_.get() << " for task " << taskId + << " of framework " << frameworkId; + return; + } + VLOG(1) << "Executor received status update acknowledgement " << uuid_.get() << " for task " << taskId << " of framework " << frameworkId; + // If this is a terminal status update acknowledgment for the Docker + // executor, stop the driver to terminate the executor. + // + // TODO(abudnik): This is a workaround for MESOS-9847. A better solution + // is to update supported API for the Docker executor from V0 to V1. It + // will allow the executor to handle status update acknowledgments itself. + if (mesos::internal::protobuf::isTerminalState( + updates[uuid_.get()].status().state()) && + dynamic_cast<docker::DockerExecutor*>(executor)) { + driver->stop(); + } + // Remove the corresponding update. updates.erase(uuid_.get());