Made default executor not shut down if unsubscribed during task launch. The default executor would unnecessarily shut down if, while launching a task group, it got unsubscribed after having successfully launched the task group's containers.
Review: https://reviews.apache.org/r/65550/ Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/24b8b43d Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/24b8b43d Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/24b8b43d Branch: refs/heads/1.5.x Commit: 24b8b43d9599b3cff58b7153d9b5e854d7bbb811 Parents: 2459ea1 Author: Gaston Kleiman <gas...@mesosphere.io> Authored: Wed Feb 14 14:35:01 2018 +0800 Committer: Qian Zhang <zhq527...@gmail.com> Committed: Wed Feb 14 20:58:10 2018 +0800 ---------------------------------------------------------------------- src/launcher/default_executor.cpp | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/24b8b43d/src/launcher/default_executor.cpp ---------------------------------------------------------------------- diff --git a/src/launcher/default_executor.cpp b/src/launcher/default_executor.cpp index 9aab90d..ea9b2d4 100644 --- a/src/launcher/default_executor.cpp +++ b/src/launcher/default_executor.cpp @@ -534,16 +534,6 @@ protected: } } - // This could happen if the agent process failed after the child - // containers were launched. Shutdown the executor if this happens. - if (state == DISCONNECTED || state == CONNECTED) { - LOG(ERROR) << "Unable to complete the operation of launching child " - << "containers as the executor is in state " << state; - _shutdown(); - return; - } - - CHECK_EQ(SUBSCRIBED, state); CHECK(launched); CHECK_EQ(containerIds.size(), (size_t) taskGroup.tasks().size()); @@ -643,7 +633,17 @@ protected: << stringify(taskIds()) << " in child containers " << stringify(containerIds); - wait(taskIds()); + if (state == SUBSCRIBED) { + // `wait()` requires the executor to be subscribed. 
+ // + // Upon subscription, `received()` will call `wait()` on all containers, + // so it is safe to skip it here if we are not subscribed. + wait(taskIds()); + } else { + LOG(INFO) << "Skipped waiting on child containers of tasks " + << stringify(taskIds()) << " until the connection " + << "to the agent is reestablished"; + } } void wait(const list<TaskID>& taskIds) @@ -652,6 +652,8 @@ protected: CHECK(launched); CHECK_SOME(connectionId); + LOG(INFO) << "Waiting on child containers of tasks " << stringify(taskIds); + list<Future<Connection>> connections; for (size_t i = 0; i < taskIds.size(); i++) { connections.push_back(process::http::connect(agent));