Made default executor not shut down if unsubscribed during task launch.

The default executor would unnecessarily shut down if, while launching a
task group, it got unsubscribed after having successfully launched the
task group's containers.

Review: https://reviews.apache.org/r/65550/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/24b8b43d
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/24b8b43d
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/24b8b43d

Branch: refs/heads/1.5.x
Commit: 24b8b43d9599b3cff58b7153d9b5e854d7bbb811
Parents: 2459ea1
Author: Gaston Kleiman <gas...@mesosphere.io>
Authored: Wed Feb 14 14:35:01 2018 +0800
Committer: Qian Zhang <zhq527...@gmail.com>
Committed: Wed Feb 14 20:58:10 2018 +0800

----------------------------------------------------------------------
 src/launcher/default_executor.cpp | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/24b8b43d/src/launcher/default_executor.cpp
----------------------------------------------------------------------
diff --git a/src/launcher/default_executor.cpp b/src/launcher/default_executor.cpp
index 9aab90d..ea9b2d4 100644
--- a/src/launcher/default_executor.cpp
+++ b/src/launcher/default_executor.cpp
@@ -534,16 +534,6 @@ protected:
       }
     }
 
-    // This could happen if the agent process failed after the child
-    // containers were launched. Shutdown the executor if this happens.
-    if (state == DISCONNECTED || state == CONNECTED) {
-      LOG(ERROR) << "Unable to complete the operation of launching child "
-                 << "containers as the executor is in state " << state;
-      _shutdown();
-      return;
-    }
-
-    CHECK_EQ(SUBSCRIBED, state);
     CHECK(launched);
     CHECK_EQ(containerIds.size(), (size_t) taskGroup.tasks().size());
 
@@ -643,7 +633,17 @@ protected:
       << stringify(taskIds()) << " in child containers "
       << stringify(containerIds);
 
-    wait(taskIds());
+    if (state == SUBSCRIBED) {
+      // `wait()` requires the executor to be subscribed.
+      //
+      // Upon subscription, `received()` will call `wait()` on all containers,
+      // so it is safe to skip it here if we are not subscribed.
+      wait(taskIds());
+    } else {
+      LOG(INFO) << "Skipped waiting on child containers of tasks "
+                << stringify(taskIds()) << " until the connection "
+                << "to the agent is reestablished";
+    }
   }
 
   void wait(const list<TaskID>& taskIds)
@@ -652,6 +652,8 @@ protected:
     CHECK(launched);
     CHECK_SOME(connectionId);
 
+    LOG(INFO) << "Waiting on child containers of tasks " << stringify(taskIds);
+
     list<Future<Connection>> connections;
     for (size_t i = 0; i < taskIds.size(); i++) {
       connections.push_back(process::http::connect(agent));

Reply via email to