Made the default executor treat agent disconnections more gracefully.

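This patch makes the default executor drop the task group, instead of
unconditionally shutting down, when it fails to connect to the agent or
is not subscribed to it when starting to launch a task group. In that
case the executor now shuts down only if it has no active child
containers.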

Review: https://reviews.apache.org/r/65556/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f9f8d6b8
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f9f8d6b8
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f9f8d6b8

Branch: refs/heads/1.5.x
Commit: f9f8d6b86ffecc216fc37b6191a1b786ab8842c4
Parents: e5afcbe
Author: Gaston Kleiman <gas...@mesosphere.io>
Authored: Wed Feb 14 14:35:22 2018 +0800
Committer: Qian Zhang <zhq527...@gmail.com>
Committed: Wed Feb 14 21:06:40 2018 +0800

----------------------------------------------------------------------
 src/launcher/default_executor.cpp | 43 +++++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/f9f8d6b8/src/launcher/default_executor.cpp
----------------------------------------------------------------------
diff --git a/src/launcher/default_executor.cpp b/src/launcher/default_executor.cpp
index 16977b5..ff437bc 100644
--- a/src/launcher/default_executor.cpp
+++ b/src/launcher/default_executor.cpp
@@ -366,19 +366,33 @@ protected:
     }
 
     if (!connection.isReady()) {
-      LOG(ERROR)
-        << "Unable to establish connection with the agent: "
-        << (connection.isFailed() ? connection.failure() : "discarded");
-      _shutdown();
+      LOG(WARNING) << "Unable to establish connection with the agent to "
+                   << "complete the launch group operation: "
+                   << (connection.isFailed() ? connection.failure()
+                                             : "discarded");
+      dropTaskGroup(taskGroup);
+
+      // Shutdown the executor if all the active child containers have
+      // terminated.
+      if (containers.empty()) {
+        _shutdown();
+      }
+
       return;
     }
 
     // It is possible that the agent process failed after the connection was
-    // established. Shutdown the executor if this happens.
+    // established. Drop the task group if this happens.
     if (state == DISCONNECTED || state == CONNECTED) {
-      LOG(ERROR) << "Unable to complete the launch group operation "
-                 << "as the executor is in state " << state;
-      _shutdown();
+      LOG(WARNING) << "Unable to complete the launch group operation "
+                   << "as the executor is in state " << state;
+      dropTaskGroup(taskGroup);
+
+      // Shutdown the executor if all the active child containers have
+      // terminated.
+      if (containers.empty()) {
+        _shutdown();
+      }
       return;
     }
 
@@ -1515,6 +1529,19 @@ private:
         taskId);
   }
 
+  void dropTaskGroup(const TaskGroupInfo& taskGroup)
+  {
+    TaskState taskState =
+      protobuf::frameworkHasCapability(
+          frameworkInfo.get(), FrameworkInfo::Capability::PARTITION_AWARE)
+        ? TASK_DROPPED
+        : TASK_LOST;
+
+    foreach (const TaskInfo& task, taskGroup.tasks()) {
+      forward(createTaskStatus(task.task_id(), taskState));
+    }
+  }
+
   enum State
   {
     CONNECTED,

Reply via email to