Made the default executor treat agent disconnections more gracefully. This patch makes the default executor not shut down while there are still active child containers when it fails to connect to the agent, or is not subscribed to it, at the time it starts to launch a task group.
Review: https://reviews.apache.org/r/65556/ Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/54b6c5b9 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/54b6c5b9 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/54b6c5b9 Branch: refs/heads/master Commit: 54b6c5b9c7cb059ebd87ee0f9927cfa6ff73129d Parents: 656196e Author: Gaston Kleiman <gas...@mesosphere.io> Authored: Wed Feb 14 14:35:22 2018 +0800 Committer: Qian Zhang <zhq527...@gmail.com> Committed: Wed Feb 14 20:37:31 2018 +0800 ---------------------------------------------------------------------- src/launcher/default_executor.cpp | 43 +++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/54b6c5b9/src/launcher/default_executor.cpp ---------------------------------------------------------------------- diff --git a/src/launcher/default_executor.cpp b/src/launcher/default_executor.cpp index 16977b5..ff437bc 100644 --- a/src/launcher/default_executor.cpp +++ b/src/launcher/default_executor.cpp @@ -366,19 +366,33 @@ protected: } if (!connection.isReady()) { - LOG(ERROR) - << "Unable to establish connection with the agent: " - << (connection.isFailed() ? connection.failure() : "discarded"); - _shutdown(); + LOG(WARNING) << "Unable to establish connection with the agent to " + << "complete the launch group operation: " + << (connection.isFailed() ? connection.failure() + : "discarded"); + dropTaskGroup(taskGroup); + + // Shutdown the executor if all the active child containers have + // terminated. + if (containers.empty()) { + _shutdown(); + } + return; } // It is possible that the agent process failed after the connection was - // established. Shutdown the executor if this happens. + // established. Drop the task group if this happens. 
if (state == DISCONNECTED || state == CONNECTED) { - LOG(ERROR) << "Unable to complete the launch group operation " - << "as the executor is in state " << state; - _shutdown(); + LOG(WARNING) << "Unable to complete the launch group operation " + << "as the executor is in state " << state; + dropTaskGroup(taskGroup); + + // Shutdown the executor if all the active child containers have + // terminated. + if (containers.empty()) { + _shutdown(); + } return; } @@ -1515,6 +1529,19 @@ private: taskId); } + void dropTaskGroup(const TaskGroupInfo& taskGroup) + { + TaskState taskState = + protobuf::frameworkHasCapability( + frameworkInfo.get(), FrameworkInfo::Capability::PARTITION_AWARE) + ? TASK_DROPPED + : TASK_LOST; + + foreach (const TaskInfo& task, taskGroup.tasks()) { + forward(createTaskStatus(task.task_id(), taskState)); + } + } + enum State { CONNECTED,