This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a commit to branch 1.9.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit f37ae68a8f0d23a2e0f31812b8fe4494109769c6
Author: Andrei Budnik <abud...@apache.org>
AuthorDate: Wed Jan 29 19:07:50 2020 +0100

    Changed termination logic of the default executor.
    
    Previously, the default executor terminated itself after all containers
    had terminated. This could lead to termination of the executor before
    processing of a terminal status update by the agent. In order
    to mitigate this issue, the executor slept for one second to give a
    chance to send all status updates and receive all status update
    acknowledgements before terminating itself. This might have led to
    various race conditions in some circumstances (e.g., on a slow host).
    This patch terminates the default executor once all status updates have
    been acknowledged by the agent and no running containers are left.
    Also, this patch increases the fallback timeout from one second to one
    minute, as a fail-safe in case the agent never acknowledges an update.
    
    Review: https://reviews.apache.org/r/72029
---
 src/launcher/default_executor.cpp | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/launcher/default_executor.cpp b/src/launcher/default_executor.cpp
index 521494a..7781382 100644
--- a/src/launcher/default_executor.cpp
+++ b/src/launcher/default_executor.cpp
@@ -279,6 +279,12 @@ public:
           containers.at(taskId)->acknowledged = true;
         }
 
+        // Terminate the executor if all status updates have been acknowledged
+        // by the agent and no running containers left.
+        if (containers.empty() && unacknowledgedUpdates.empty()) {
+          terminate(self());
+        }
+
         break;
       }
 
@@ -1088,14 +1094,21 @@ protected:
 
   void _shutdown()
   {
-    const Duration duration = Seconds(1);
+    if (unacknowledgedUpdates.empty()) {
+      terminate(self());
+    } else {
+      // This is a fail safe in case the agent doesn't send an ACK for
+      // a status update for some reason.
+      const Duration duration = Seconds(60);
 
-    LOG(INFO) << "Terminating after " << duration;
+      LOG(INFO) << "Terminating after " << duration;
 
-    // TODO(qianzhang): Remove this hack since the executor now receives
-    // acknowledgements for status updates. The executor can terminate
-    // after it receives an ACK for a terminal status update.
-    os::sleep(duration);
+      delay(duration, self(), &Self::__shutdown);
+    }
+  }
+
+  void __shutdown()
+  {
     terminate(self());
   }
 

Reply via email to