This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a commit to branch 1.7.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 6a7da284d1b89f8a144ed2f896f005a5ee9d4aea
Author: Andrei Budnik <abud...@apache.org>
AuthorDate: Wed Jan 29 13:35:02 2020 +0100

    Changed termination logic of the Docker executor.
    
    Previously, the Docker executor terminated itself after a task's
    container had terminated. This could lead to termination of the
    executor before processing of a terminal status update by the agent.
    In order to mitigate this issue, the executor slept for one second to
    give a chance to send all status updates and receive all status update
    acknowledgments before terminating itself. This might have led to
    various race conditions in some circumstances (e.g., on a slow host).
    This patch terminates the Docker executor after receiving a terminal
    status update acknowledgment. Also, this patch increases the timeout
    from one second to one minute for fail-safety.
    
    Review: https://reviews.apache.org/r/72055
---
 src/docker/executor.cpp | 14 ++++++--------
 src/exec/exec.cpp       | 19 +++++++++++++++++++
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index f638e4b..ed1b718 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -224,7 +224,9 @@ public:
 
       driver->sendStatusUpdate(status);
 
-      _stop();
+      // This is a fail safe in case the agent doesn't send an ACK for
+      // the terminal update for some reason.
+      delay(Seconds(60), self(), &Self::_stop);
       return;
     }
 
@@ -757,17 +759,13 @@ private:
     CHECK_SOME(driver);
     driver.get()->sendStatusUpdate(taskStatus);
 
-    _stop();
+    // This is a fail safe in case the agent doesn't send an ACK for
+    // the terminal update for some reason.
+    delay(Seconds(60), self(), &Self::_stop);
   }
 
   void _stop()
   {
-    // A hack for now ... but we need to wait until the status update
-    // is sent to the slave before we shut ourselves down.
-    // TODO(tnachen): Remove this hack and also the same hack in the
-    // command executor when we have the new HTTP APIs to wait until
-    // an ack.
-    os::sleep(Seconds(1));
     driver.get()->stop();
   }
 
diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp
index c0fa3b6..dee2074 100644
--- a/src/exec/exec.cpp
+++ b/src/exec/exec.cpp
@@ -395,10 +395,29 @@ protected:
       return;
     }
 
+    if (!updates.contains(uuid_.get())) {
+      LOG(WARNING) << "Ignoring unknown status update acknowledgement "
+                   << uuid_.get() << " for task " << taskId
+                   << " of framework " << frameworkId;
+      return;
+    }
+
     VLOG(1) << "Executor received status update acknowledgement "
             << uuid_.get() << " for task " << taskId
             << " of framework " << frameworkId;
 
+    // If this is a terminal status update acknowledgment for the Docker
+    // executor, stop the driver to terminate the executor.
+    //
+    // TODO(abudnik): This is a workaround for MESOS-9847. A better solution
+    // is to update supported API for the Docker executor from V0 to V1. It
+    // will allow the executor to handle status update acknowledgments itself.
+    if (mesos::internal::protobuf::isTerminalState(
+            updates[uuid_.get()].status().state()) &&
+        dynamic_cast<docker::DockerExecutor*>(executor)) {
+      driver->stop();
+    }
+
     // Remove the corresponding update.
     updates.erase(uuid_.get());
 

Reply via email to