[mesos] 01/02: Changed termination logic of the Docker executor.

2020-02-03 Thread abudnik
This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a commit to branch 1.5.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit ff98f12a50a56c13688b87068a116d1d08142f49
Author: Andrei Budnik 
AuthorDate: Wed Jan 29 13:35:02 2020 +0100

Changed termination logic of the Docker executor.

Previously, the Docker executor terminated itself after a task's
container had terminated. This could lead to termination of the
executor before processing of a terminal status update by the agent.
In order to mitigate this issue, the executor slept for one second to
give a chance to send all status updates and receive all status update
acknowledgments before terminating itself. This might have led to
various race conditions in some circumstances (e.g., on a slow host).
This patch terminates the Docker executor after receiving a terminal
status update acknowledgment. Also, this patch increases the timeout
from one second to one minute for fail-safety.

Review: https://reviews.apache.org/r/72055
---
 src/docker/executor.cpp | 14 ++
 src/exec/exec.cpp   | 19 +++
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index 7b48f84..0d75801 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -199,7 +199,9 @@ public:
 
   driver->sendStatusUpdate(status);
 
-  _stop();
+  // This is a fail safe in case the agent doesn't send an ACK for
+  // the terminal update for some reason.
+  delay(Seconds(60), self(), ::_stop);
   return;
 }
 
@@ -691,17 +693,13 @@ private:
 CHECK_SOME(driver);
 driver.get()->sendStatusUpdate(taskStatus);
 
-_stop();
+// This is a fail safe in case the agent doesn't send an ACK for
+// the terminal update for some reason.
+delay(Seconds(60), self(), ::_stop);
   }
 
   void _stop()
   {
-// A hack for now ... but we need to wait until the status update
-// is sent to the slave before we shut ourselves down.
-// TODO(tnachen): Remove this hack and also the same hack in the
-// command executor when we have the new HTTP APIs to wait until
-// an ack.
-os::sleep(Seconds(1));
 driver.get()->stop();
   }
 
diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp
index 0c76a3f..e964a11 100644
--- a/src/exec/exec.cpp
+++ b/src/exec/exec.cpp
@@ -395,10 +395,29 @@ protected:
   return;
 }
 
+if (!updates.contains(uuid_.get())) {
+  LOG(WARNING) << "Ignoring unknown status update acknowledgement "
+   << uuid_.get() << " for task " << taskId
+   << " of framework " << frameworkId;
+  return;
+}
+
 VLOG(1) << "Executor received status update acknowledgement "
 << uuid_.get() << " for task " << taskId
 << " of framework " << frameworkId;
 
+// If this is a terminal status update acknowledgment for the Docker
+// executor, stop the driver to terminate the executor.
+//
+// TODO(abudnik): This is a workaround for MESOS-9847. A better solution
+// is to update supported API for the Docker executor from V0 to V1. It
+// will allow the executor to handle status update acknowledgments itself.
+if (mesos::internal::protobuf::isTerminalState(
+updates[uuid_.get()].status().state()) &&
+dynamic_cast(executor)) {
+  driver->stop();
+}
+
 // Remove the corresponding update.
 updates.erase(uuid_.get());
 



[mesos] branch 1.5.x updated (2f2146a -> 84b7af3)

2020-02-03 Thread abudnik
This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a change to branch 1.5.x
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from 2f2146a  Remembered whether an executor was agent-generated.
 new ff98f12  Changed termination logic of the Docker executor.
 new 84b7af3  Changed termination logic of the default executor.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/docker/executor.cpp   | 14 ++
 src/exec/exec.cpp | 19 +++
 src/launcher/default_executor.cpp | 25 +++--
 3 files changed, 44 insertions(+), 14 deletions(-)



[mesos] 02/02: Changed termination logic of the default executor.

2020-02-03 Thread abudnik
This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a commit to branch 1.5.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 84b7af3409d8af343da0f0420e168a42de4b110f
Author: Andrei Budnik 
AuthorDate: Wed Jan 29 19:07:50 2020 +0100

Changed termination logic of the default executor.

Previously, the default executor terminated itself after all containers
had terminated. This could lead to termination of the executor before
processing of a terminal status update by the agent. In order
to mitigate this issue, the executor slept for one second to give a
chance to send all status updates and receive all status update
acknowledgements before terminating itself. This might have led to
various race conditions in some circumstances (e.g., on a slow host).
This patch terminates the default executor if all status updates have
been acknowledged by the agent and no running containers left.
Also, this patch increases the timeout from one second to one minute
for fail-safety.

Review: https://reviews.apache.org/r/72029
---
 src/launcher/default_executor.cpp | 25 +++--
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/launcher/default_executor.cpp 
b/src/launcher/default_executor.cpp
index 8720dad..c6b13e9 100644
--- a/src/launcher/default_executor.cpp
+++ b/src/launcher/default_executor.cpp
@@ -267,6 +267,12 @@ public:
   containers.at(taskId)->acknowledged = true;
 }
 
+// Terminate the executor if all status updates have been acknowledged
+// by the agent and no running containers left.
+if (containers.empty() && unacknowledgedUpdates.empty()) {
+  terminate(self());
+}
+
 break;
   }
 
@@ -1060,14 +1066,21 @@ protected:
 
   void _shutdown()
   {
-const Duration duration = Seconds(1);
+if (unacknowledgedUpdates.empty()) {
+  terminate(self());
+} else {
+  // This is a fail safe in case the agent doesn't send an ACK for
+  // a status update for some reason.
+  const Duration duration = Seconds(60);
 
-LOG(INFO) << "Terminating after " << duration;
+  LOG(INFO) << "Terminating after " << duration;
 
-// TODO(qianzhang): Remove this hack since the executor now receives
-// acknowledgements for status updates. The executor can terminate
-// after it receives an ACK for a terminal status update.
-os::sleep(duration);
+  delay(duration, self(), ::__shutdown);
+}
+  }
+
+  void __shutdown()
+  {
 terminate(self());
   }
 



[mesos] 02/02: Changed termination logic of the default executor.

2020-02-03 Thread abudnik
This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a commit to branch 1.6.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 205525eb56a33e58bed1fc38e0b32189b19d3fbc
Author: Andrei Budnik 
AuthorDate: Wed Jan 29 19:07:50 2020 +0100

Changed termination logic of the default executor.

Previously, the default executor terminated itself after all containers
had terminated. This could lead to termination of the executor before
processing of a terminal status update by the agent. In order
to mitigate this issue, the executor slept for one second to give a
chance to send all status updates and receive all status update
acknowledgements before terminating itself. This might have led to
various race conditions in some circumstances (e.g., on a slow host).
This patch terminates the default executor if all status updates have
been acknowledged by the agent and no running containers left.
Also, this patch increases the timeout from one second to one minute
for fail-safety.

Review: https://reviews.apache.org/r/72029
---
 src/launcher/default_executor.cpp | 25 +++--
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/launcher/default_executor.cpp 
b/src/launcher/default_executor.cpp
index 76c6106..ca17c70 100644
--- a/src/launcher/default_executor.cpp
+++ b/src/launcher/default_executor.cpp
@@ -276,6 +276,12 @@ public:
   containers.at(taskId)->acknowledged = true;
 }
 
+// Terminate the executor if all status updates have been acknowledged
+// by the agent and no running containers left.
+if (containers.empty() && unacknowledgedUpdates.empty()) {
+  terminate(self());
+}
+
 break;
   }
 
@@ -1070,14 +1076,21 @@ protected:
 
   void _shutdown()
   {
-const Duration duration = Seconds(1);
+if (unacknowledgedUpdates.empty()) {
+  terminate(self());
+} else {
+  // This is a fail safe in case the agent doesn't send an ACK for
+  // a status update for some reason.
+  const Duration duration = Seconds(60);
 
-LOG(INFO) << "Terminating after " << duration;
+  LOG(INFO) << "Terminating after " << duration;
 
-// TODO(qianzhang): Remove this hack since the executor now receives
-// acknowledgements for status updates. The executor can terminate
-// after it receives an ACK for a terminal status update.
-os::sleep(duration);
+  delay(duration, self(), ::__shutdown);
+}
+  }
+
+  void __shutdown()
+  {
 terminate(self());
   }
 



[mesos] branch 1.6.x updated (7e4d380 -> 205525e)

2020-02-03 Thread abudnik
This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a change to branch 1.6.x
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from 7e4d380  Remembered whether an executor was agent-generated.
 new f511f25  Changed termination logic of the Docker executor.
 new 205525e  Changed termination logic of the default executor.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/docker/executor.cpp   | 14 ++
 src/exec/exec.cpp | 19 +++
 src/launcher/default_executor.cpp | 25 +++--
 3 files changed, 44 insertions(+), 14 deletions(-)



[mesos] 01/02: Changed termination logic of the Docker executor.

2020-02-03 Thread abudnik
This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a commit to branch 1.6.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit f511f25be9d850ee9b65fc3ec5f54d149beb2f19
Author: Andrei Budnik 
AuthorDate: Wed Jan 29 13:35:02 2020 +0100

Changed termination logic of the Docker executor.

Previously, the Docker executor terminated itself after a task's
container had terminated. This could lead to termination of the
executor before processing of a terminal status update by the agent.
In order to mitigate this issue, the executor slept for one second to
give a chance to send all status updates and receive all status update
acknowledgments before terminating itself. This might have led to
various race conditions in some circumstances (e.g., on a slow host).
This patch terminates the Docker executor after receiving a terminal
status update acknowledgment. Also, this patch increases the timeout
from one second to one minute for fail-safety.

Review: https://reviews.apache.org/r/72055
---
 src/docker/executor.cpp | 14 ++
 src/exec/exec.cpp   | 19 +++
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index 57cfbcd..d5c905d 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -217,7 +217,9 @@ public:
 
   driver->sendStatusUpdate(status);
 
-  _stop();
+  // This is a fail safe in case the agent doesn't send an ACK for
+  // the terminal update for some reason.
+  delay(Seconds(60), self(), ::_stop);
   return;
 }
 
@@ -750,17 +752,13 @@ private:
 CHECK_SOME(driver);
 driver.get()->sendStatusUpdate(taskStatus);
 
-_stop();
+// This is a fail safe in case the agent doesn't send an ACK for
+// the terminal update for some reason.
+delay(Seconds(60), self(), ::_stop);
   }
 
   void _stop()
   {
-// A hack for now ... but we need to wait until the status update
-// is sent to the slave before we shut ourselves down.
-// TODO(tnachen): Remove this hack and also the same hack in the
-// command executor when we have the new HTTP APIs to wait until
-// an ack.
-os::sleep(Seconds(1));
 driver.get()->stop();
   }
 
diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp
index 65a671d..adeb413 100644
--- a/src/exec/exec.cpp
+++ b/src/exec/exec.cpp
@@ -395,10 +395,29 @@ protected:
   return;
 }
 
+if (!updates.contains(uuid_.get())) {
+  LOG(WARNING) << "Ignoring unknown status update acknowledgement "
+   << uuid_.get() << " for task " << taskId
+   << " of framework " << frameworkId;
+  return;
+}
+
 VLOG(1) << "Executor received status update acknowledgement "
 << uuid_.get() << " for task " << taskId
 << " of framework " << frameworkId;
 
+// If this is a terminal status update acknowledgment for the Docker
+// executor, stop the driver to terminate the executor.
+//
+// TODO(abudnik): This is a workaround for MESOS-9847. A better solution
+// is to update supported API for the Docker executor from V0 to V1. It
+// will allow the executor to handle status update acknowledgments itself.
+if (mesos::internal::protobuf::isTerminalState(
+updates[uuid_.get()].status().state()) &&
+dynamic_cast(executor)) {
+  driver->stop();
+}
+
 // Remove the corresponding update.
 updates.erase(uuid_.get());
 



[mesos] branch 1.7.x updated (47f2a11 -> 5b39908)

2020-02-03 Thread abudnik
This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a change to branch 1.7.x
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from 47f2a11  Remembered whether an executor was agent-generated.
 new 6a7da28  Changed termination logic of the Docker executor.
 new 5b39908  Changed termination logic of the default executor.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/docker/executor.cpp   | 14 ++
 src/exec/exec.cpp | 19 +++
 src/launcher/default_executor.cpp | 25 +++--
 3 files changed, 44 insertions(+), 14 deletions(-)



[mesos] 02/02: Changed termination logic of the default executor.

2020-02-03 Thread abudnik
This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a commit to branch 1.7.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 5b399080eee11ee03f4bc6c09b791c24670da6c1
Author: Andrei Budnik 
AuthorDate: Wed Jan 29 19:07:50 2020 +0100

Changed termination logic of the default executor.

Previously, the default executor terminated itself after all containers
had terminated. This could lead to termination of the executor before
processing of a terminal status update by the agent. In order
to mitigate this issue, the executor slept for one second to give a
chance to send all status updates and receive all status update
acknowledgements before terminating itself. This might have led to
various race conditions in some circumstances (e.g., on a slow host).
This patch terminates the default executor if all status updates have
been acknowledged by the agent and no running containers left.
Also, this patch increases the timeout from one second to one minute
for fail-safety.

Review: https://reviews.apache.org/r/72029
---
 src/launcher/default_executor.cpp | 25 +++--
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/launcher/default_executor.cpp 
b/src/launcher/default_executor.cpp
index b89b036..6a049b3 100644
--- a/src/launcher/default_executor.cpp
+++ b/src/launcher/default_executor.cpp
@@ -278,6 +278,12 @@ public:
   containers.at(taskId)->acknowledged = true;
 }
 
+// Terminate the executor if all status updates have been acknowledged
+// by the agent and no running containers left.
+if (containers.empty() && unacknowledgedUpdates.empty()) {
+  terminate(self());
+}
+
 break;
   }
 
@@ -1072,14 +1078,21 @@ protected:
 
   void _shutdown()
   {
-const Duration duration = Seconds(1);
+if (unacknowledgedUpdates.empty()) {
+  terminate(self());
+} else {
+  // This is a fail safe in case the agent doesn't send an ACK for
+  // a status update for some reason.
+  const Duration duration = Seconds(60);
 
-LOG(INFO) << "Terminating after " << duration;
+  LOG(INFO) << "Terminating after " << duration;
 
-// TODO(qianzhang): Remove this hack since the executor now receives
-// acknowledgements for status updates. The executor can terminate
-// after it receives an ACK for a terminal status update.
-os::sleep(duration);
+  delay(duration, self(), ::__shutdown);
+}
+  }
+
+  void __shutdown()
+  {
 terminate(self());
   }
 



[mesos] 01/02: Changed termination logic of the Docker executor.

2020-02-03 Thread abudnik
This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a commit to branch 1.7.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 6a7da284d1b89f8a144ed2f896f005a5ee9d4aea
Author: Andrei Budnik 
AuthorDate: Wed Jan 29 13:35:02 2020 +0100

Changed termination logic of the Docker executor.

Previously, the Docker executor terminated itself after a task's
container had terminated. This could lead to termination of the
executor before processing of a terminal status update by the agent.
In order to mitigate this issue, the executor slept for one second to
give a chance to send all status updates and receive all status update
acknowledgments before terminating itself. This might have led to
various race conditions in some circumstances (e.g., on a slow host).
This patch terminates the Docker executor after receiving a terminal
status update acknowledgment. Also, this patch increases the timeout
from one second to one minute for fail-safety.

Review: https://reviews.apache.org/r/72055
---
 src/docker/executor.cpp | 14 ++
 src/exec/exec.cpp   | 19 +++
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index f638e4b..ed1b718 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -224,7 +224,9 @@ public:
 
   driver->sendStatusUpdate(status);
 
-  _stop();
+  // This is a fail safe in case the agent doesn't send an ACK for
+  // the terminal update for some reason.
+  delay(Seconds(60), self(), ::_stop);
   return;
 }
 
@@ -757,17 +759,13 @@ private:
 CHECK_SOME(driver);
 driver.get()->sendStatusUpdate(taskStatus);
 
-_stop();
+// This is a fail safe in case the agent doesn't send an ACK for
+// the terminal update for some reason.
+delay(Seconds(60), self(), ::_stop);
   }
 
   void _stop()
   {
-// A hack for now ... but we need to wait until the status update
-// is sent to the slave before we shut ourselves down.
-// TODO(tnachen): Remove this hack and also the same hack in the
-// command executor when we have the new HTTP APIs to wait until
-// an ack.
-os::sleep(Seconds(1));
 driver.get()->stop();
   }
 
diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp
index c0fa3b6..dee2074 100644
--- a/src/exec/exec.cpp
+++ b/src/exec/exec.cpp
@@ -395,10 +395,29 @@ protected:
   return;
 }
 
+if (!updates.contains(uuid_.get())) {
+  LOG(WARNING) << "Ignoring unknown status update acknowledgement "
+   << uuid_.get() << " for task " << taskId
+   << " of framework " << frameworkId;
+  return;
+}
+
 VLOG(1) << "Executor received status update acknowledgement "
 << uuid_.get() << " for task " << taskId
 << " of framework " << frameworkId;
 
+// If this is a terminal status update acknowledgment for the Docker
+// executor, stop the driver to terminate the executor.
+//
+// TODO(abudnik): This is a workaround for MESOS-9847. A better solution
+// is to update supported API for the Docker executor from V0 to V1. It
+// will allow the executor to handle status update acknowledgments itself.
+if (mesos::internal::protobuf::isTerminalState(
+updates[uuid_.get()].status().state()) &&
+dynamic_cast(executor)) {
+  driver->stop();
+}
+
 // Remove the corresponding update.
 updates.erase(uuid_.get());
 



[mesos] 02/02: Changed termination logic of the default executor.

2020-02-03 Thread abudnik
This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a commit to branch 1.8.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit a2ca451aab4625e126b9e7b470eb9f7c232dd746
Author: Andrei Budnik 
AuthorDate: Wed Jan 29 19:07:50 2020 +0100

Changed termination logic of the default executor.

Previously, the default executor terminated itself after all containers
had terminated. This could lead to termination of the executor before
processing of a terminal status update by the agent. In order
to mitigate this issue, the executor slept for one second to give a
chance to send all status updates and receive all status update
acknowledgements before terminating itself. This might have led to
various race conditions in some circumstances (e.g., on a slow host).
This patch terminates the default executor if all status updates have
been acknowledged by the agent and no running containers left.
Also, this patch increases the timeout from one second to one minute
for fail-safety.

Review: https://reviews.apache.org/r/72029
---
 src/launcher/default_executor.cpp | 25 +++--
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/launcher/default_executor.cpp 
b/src/launcher/default_executor.cpp
index 5837cfa..af520f7 100644
--- a/src/launcher/default_executor.cpp
+++ b/src/launcher/default_executor.cpp
@@ -278,6 +278,12 @@ public:
   containers.at(taskId)->acknowledged = true;
 }
 
+// Terminate the executor if all status updates have been acknowledged
+// by the agent and no running containers left.
+if (containers.empty() && unacknowledgedUpdates.empty()) {
+  terminate(self());
+}
+
 break;
   }
 
@@ -1076,14 +1082,21 @@ protected:
 
   void _shutdown()
   {
-const Duration duration = Seconds(1);
+if (unacknowledgedUpdates.empty()) {
+  terminate(self());
+} else {
+  // This is a fail safe in case the agent doesn't send an ACK for
+  // a status update for some reason.
+  const Duration duration = Seconds(60);
 
-LOG(INFO) << "Terminating after " << duration;
+  LOG(INFO) << "Terminating after " << duration;
 
-// TODO(qianzhang): Remove this hack since the executor now receives
-// acknowledgements for status updates. The executor can terminate
-// after it receives an ACK for a terminal status update.
-os::sleep(duration);
+  delay(duration, self(), ::__shutdown);
+}
+  }
+
+  void __shutdown()
+  {
 terminate(self());
   }
 



[mesos] 02/02: Changed termination logic of the default executor.

2020-02-03 Thread abudnik
This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a commit to branch 1.9.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit f37ae68a8f0d23a2e0f31812b8fe4494109769c6
Author: Andrei Budnik 
AuthorDate: Wed Jan 29 19:07:50 2020 +0100

Changed termination logic of the default executor.

Previously, the default executor terminated itself after all containers
had terminated. This could lead to termination of the executor before
processing of a terminal status update by the agent. In order
to mitigate this issue, the executor slept for one second to give a
chance to send all status updates and receive all status update
acknowledgements before terminating itself. This might have led to
various race conditions in some circumstances (e.g., on a slow host).
This patch terminates the default executor if all status updates have
been acknowledged by the agent and no running containers left.
Also, this patch increases the timeout from one second to one minute
for fail-safety.

Review: https://reviews.apache.org/r/72029
---
 src/launcher/default_executor.cpp | 25 +++--
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/launcher/default_executor.cpp 
b/src/launcher/default_executor.cpp
index 521494a..7781382 100644
--- a/src/launcher/default_executor.cpp
+++ b/src/launcher/default_executor.cpp
@@ -279,6 +279,12 @@ public:
   containers.at(taskId)->acknowledged = true;
 }
 
+// Terminate the executor if all status updates have been acknowledged
+// by the agent and no running containers left.
+if (containers.empty() && unacknowledgedUpdates.empty()) {
+  terminate(self());
+}
+
 break;
   }
 
@@ -1088,14 +1094,21 @@ protected:
 
   void _shutdown()
   {
-const Duration duration = Seconds(1);
+if (unacknowledgedUpdates.empty()) {
+  terminate(self());
+} else {
+  // This is a fail safe in case the agent doesn't send an ACK for
+  // a status update for some reason.
+  const Duration duration = Seconds(60);
 
-LOG(INFO) << "Terminating after " << duration;
+  LOG(INFO) << "Terminating after " << duration;
 
-// TODO(qianzhang): Remove this hack since the executor now receives
-// acknowledgements for status updates. The executor can terminate
-// after it receives an ACK for a terminal status update.
-os::sleep(duration);
+  delay(duration, self(), ::__shutdown);
+}
+  }
+
+  void __shutdown()
+  {
 terminate(self());
   }
 



[mesos] 01/02: Changed termination logic of the Docker executor.

2020-02-03 Thread abudnik
This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a commit to branch 1.8.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 1bd0b37a7e522d63319db426dae7068b901eaea6
Author: Andrei Budnik 
AuthorDate: Wed Jan 29 13:35:02 2020 +0100

Changed termination logic of the Docker executor.

Previously, the Docker executor terminated itself after a task's
container had terminated. This could lead to termination of the
executor before processing of a terminal status update by the agent.
In order to mitigate this issue, the executor slept for one second to
give a chance to send all status updates and receive all status update
acknowledgments before terminating itself. This might have led to
various race conditions in some circumstances (e.g., on a slow host).
This patch terminates the Docker executor after receiving a terminal
status update acknowledgment. Also, this patch increases the timeout
from one second to one minute for fail-safety.

Review: https://reviews.apache.org/r/72055
---
 src/docker/executor.cpp | 14 ++
 src/exec/exec.cpp   | 19 +++
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index f638e4b..ed1b718 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -224,7 +224,9 @@ public:
 
   driver->sendStatusUpdate(status);
 
-  _stop();
+  // This is a fail safe in case the agent doesn't send an ACK for
+  // the terminal update for some reason.
+  delay(Seconds(60), self(), ::_stop);
   return;
 }
 
@@ -757,17 +759,13 @@ private:
 CHECK_SOME(driver);
 driver.get()->sendStatusUpdate(taskStatus);
 
-_stop();
+// This is a fail safe in case the agent doesn't send an ACK for
+// the terminal update for some reason.
+delay(Seconds(60), self(), ::_stop);
   }
 
   void _stop()
   {
-// A hack for now ... but we need to wait until the status update
-// is sent to the slave before we shut ourselves down.
-// TODO(tnachen): Remove this hack and also the same hack in the
-// command executor when we have the new HTTP APIs to wait until
-// an ack.
-os::sleep(Seconds(1));
 driver.get()->stop();
   }
 
diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp
index c0fa3b6..dee2074 100644
--- a/src/exec/exec.cpp
+++ b/src/exec/exec.cpp
@@ -395,10 +395,29 @@ protected:
   return;
 }
 
+if (!updates.contains(uuid_.get())) {
+  LOG(WARNING) << "Ignoring unknown status update acknowledgement "
+   << uuid_.get() << " for task " << taskId
+   << " of framework " << frameworkId;
+  return;
+}
+
 VLOG(1) << "Executor received status update acknowledgement "
 << uuid_.get() << " for task " << taskId
 << " of framework " << frameworkId;
 
+// If this is a terminal status update acknowledgment for the Docker
+// executor, stop the driver to terminate the executor.
+//
+// TODO(abudnik): This is a workaround for MESOS-9847. A better solution
+// is to update supported API for the Docker executor from V0 to V1. It
+// will allow the executor to handle status update acknowledgments itself.
+if (mesos::internal::protobuf::isTerminalState(
+updates[uuid_.get()].status().state()) &&
+dynamic_cast(executor)) {
+  driver->stop();
+}
+
 // Remove the corresponding update.
 updates.erase(uuid_.get());
 



[mesos] branch 1.8.x updated (a66419b -> a2ca451)

2020-02-03 Thread abudnik
This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a change to branch 1.8.x
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from a66419b  Remembered whether an executor was agent-generated.
 new 1bd0b37  Changed termination logic of the Docker executor.
 new a2ca451  Changed termination logic of the default executor.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/docker/executor.cpp   | 14 ++
 src/exec/exec.cpp | 19 +++
 src/launcher/default_executor.cpp | 25 +++--
 3 files changed, 44 insertions(+), 14 deletions(-)



[mesos] branch 1.9.x updated (298d1d7 -> f37ae68)

2020-02-03 Thread abudnik
This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a change to branch 1.9.x
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from 298d1d7  Remembered whether an executor was agent-generated.
 new 3d60cba  Changed termination logic of the Docker executor.
 new f37ae68  Changed termination logic of the default executor.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/docker/executor.cpp   | 14 ++
 src/exec/exec.cpp | 19 +++
 src/launcher/default_executor.cpp | 25 +++--
 3 files changed, 44 insertions(+), 14 deletions(-)



[mesos] 01/02: Changed termination logic of the Docker executor.

2020-02-03 Thread abudnik
This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a commit to branch 1.9.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 3d60cba39d0377a7dc19b4c47f3bb0807418fe50
Author: Andrei Budnik 
AuthorDate: Wed Jan 29 13:35:02 2020 +0100

Changed termination logic of the Docker executor.

Previously, the Docker executor terminated itself after a task's
container had terminated. This could lead to termination of the
executor before processing of a terminal status update by the agent.
In order to mitigate this issue, the executor slept for one second to
give a chance to send all status updates and receive all status update
acknowledgments before terminating itself. This might have led to
various race conditions in some circumstances (e.g., on a slow host).
This patch terminates the Docker executor after receiving a terminal
status update acknowledgment. Also, this patch increases the timeout
from one second to one minute for fail-safety.

Review: https://reviews.apache.org/r/72055
---
 src/docker/executor.cpp | 14 ++
 src/exec/exec.cpp   | 19 +++
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index 132f42b..ebbbc0d 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -224,7 +224,9 @@ public:
 
   driver->sendStatusUpdate(status);
 
-  _stop();
+  // This is a fail safe in case the agent doesn't send an ACK for
+  // the terminal update for some reason.
+  delay(Seconds(60), self(), ::_stop);
   return;
 }
 
@@ -773,17 +775,13 @@ private:
 CHECK_SOME(driver);
 driver.get()->sendStatusUpdate(taskStatus);
 
-_stop();
+// This is a fail safe in case the agent doesn't send an ACK for
+// the terminal update for some reason.
+delay(Seconds(60), self(), ::_stop);
   }
 
   void _stop()
   {
-// A hack for now ... but we need to wait until the status update
-// is sent to the slave before we shut ourselves down.
-// TODO(tnachen): Remove this hack and also the same hack in the
-// command executor when we have the new HTTP APIs to wait until
-// an ack.
-os::sleep(Seconds(1));
 driver.get()->stop();
   }
 
diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp
index 67e082e..8ed77f2 100644
--- a/src/exec/exec.cpp
+++ b/src/exec/exec.cpp
@@ -409,10 +409,29 @@ protected:
   return;
 }
 
+if (!updates.contains(uuid_.get())) {
+  LOG(WARNING) << "Ignoring unknown status update acknowledgement "
+   << uuid_.get() << " for task " << taskId
+   << " of framework " << frameworkId;
+  return;
+}
+
 VLOG(1) << "Executor received status update acknowledgement "
 << uuid_.get() << " for task " << taskId
 << " of framework " << frameworkId;
 
+// If this is a terminal status update acknowledgment for the Docker
+// executor, stop the driver to terminate the executor.
+//
+// TODO(abudnik): This is a workaround for MESOS-9847. A better solution
+// is to update supported API for the Docker executor from V0 to V1. It
+// will allow the executor to handle status update acknowledgments itself.
+if (mesos::internal::protobuf::isTerminalState(
+updates[uuid_.get()].status().state()) &&
+dynamic_cast(executor)) {
+  driver->stop();
+}
+
 // Remove the corresponding update.
 updates.erase(uuid_.get());
 



[mesos] branch master updated (3d197d8 -> 683dfc1)

2020-02-03 Thread abudnik
This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from 3d197d8  Fixed the build on Windows.
 new 457c389  Changed termination logic of the Docker executor.
 new 683dfc1  Changed termination logic of the default executor.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/docker/executor.cpp   | 14 ++
 src/exec/exec.cpp | 19 +++
 src/launcher/default_executor.cpp | 25 +++--
 3 files changed, 44 insertions(+), 14 deletions(-)



[mesos] 02/02: Changed termination logic of the default executor.

2020-02-03 Thread abudnik
This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 683dfc1ffb0b1ca758a07d19ab3badd8cac62dc7
Author: Andrei Budnik 
AuthorDate: Wed Jan 29 19:07:50 2020 +0100

Changed termination logic of the default executor.

Previously, the default executor terminated itself after all containers
had terminated. This could lead to termination of the executor before
processing of a terminal status update by the agent. In order
to mitigate this issue, the executor slept for one second to give a
chance to send all status updates and receive all status update
acknowledgements before terminating itself. This might have led to
various race conditions in some circumstances (e.g., on a slow host).
This patch terminates the default executor if all status updates have
been acknowledged by the agent and no running containers left.
Also, this patch increases the timeout from one second to one minute
for fail-safety.

Review: https://reviews.apache.org/r/72029
---
 src/launcher/default_executor.cpp | 25 +++--
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/launcher/default_executor.cpp 
b/src/launcher/default_executor.cpp
index 7997003..b431d8f 100644
--- a/src/launcher/default_executor.cpp
+++ b/src/launcher/default_executor.cpp
@@ -279,6 +279,12 @@ public:
   containers.at(taskId)->acknowledged = true;
 }
 
+// Terminate the executor if all status updates have been acknowledged
+// by the agent and no running containers left.
+if (containers.empty() && unacknowledgedUpdates.empty()) {
+  terminate(self());
+}
+
 break;
   }
 
@@ -1088,14 +1094,21 @@ protected:
 
   void _shutdown()
   {
-const Duration duration = Seconds(1);
+if (unacknowledgedUpdates.empty()) {
+  terminate(self());
+} else {
+  // This is a fail safe in case the agent doesn't send an ACK for
+  // a status update for some reason.
+  const Duration duration = Seconds(60);
 
-LOG(INFO) << "Terminating after " << duration;
+  LOG(INFO) << "Terminating after " << duration;
 
-// TODO(qianzhang): Remove this hack since the executor now receives
-// acknowledgements for status updates. The executor can terminate
-// after it receives an ACK for a terminal status update.
-os::sleep(duration);
+  delay(duration, self(), ::__shutdown);
+}
+  }
+
+  void __shutdown()
+  {
 terminate(self());
   }
 



[mesos] 01/02: Changed termination logic of the Docker executor.

2020-02-03 Thread abudnik
This is an automated email from the ASF dual-hosted git repository.

abudnik pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 457c38967bf9a53c1c5cd2743385937a26f413f6
Author: Andrei Budnik 
AuthorDate: Wed Jan 29 13:35:02 2020 +0100

Changed termination logic of the Docker executor.

Previously, the Docker executor terminated itself after a task's
container had terminated. This could lead to termination of the
executor before processing of a terminal status update by the agent.
In order to mitigate this issue, the executor slept for one second to
give a chance to send all status updates and receive all status update
acknowledgments before terminating itself. This might have led to
various race conditions in some circumstances (e.g., on a slow host).
This patch terminates the Docker executor after receiving a terminal
status update acknowledgment. Also, this patch increases the timeout
from one second to one minute for fail-safety.

Review: https://reviews.apache.org/r/72055
---
 src/docker/executor.cpp | 14 ++
 src/exec/exec.cpp   | 19 +++
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index 132f42b..ebbbc0d 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -224,7 +224,9 @@ public:
 
   driver->sendStatusUpdate(status);
 
-  _stop();
+  // This is a fail safe in case the agent doesn't send an ACK for
+  // the terminal update for some reason.
+  delay(Seconds(60), self(), ::_stop);
   return;
 }
 
@@ -773,17 +775,13 @@ private:
 CHECK_SOME(driver);
 driver.get()->sendStatusUpdate(taskStatus);
 
-_stop();
+// This is a fail safe in case the agent doesn't send an ACK for
+// the terminal update for some reason.
+delay(Seconds(60), self(), ::_stop);
   }
 
   void _stop()
   {
-// A hack for now ... but we need to wait until the status update
-// is sent to the slave before we shut ourselves down.
-// TODO(tnachen): Remove this hack and also the same hack in the
-// command executor when we have the new HTTP APIs to wait until
-// an ack.
-os::sleep(Seconds(1));
 driver.get()->stop();
   }
 
diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp
index 69e5e24..0fba5e0 100644
--- a/src/exec/exec.cpp
+++ b/src/exec/exec.cpp
@@ -409,10 +409,29 @@ protected:
   return;
 }
 
+if (!updates.contains(uuid_.get())) {
+  LOG(WARNING) << "Ignoring unknown status update acknowledgement "
+   << uuid_.get() << " for task " << taskId
+   << " of framework " << frameworkId;
+  return;
+}
+
 VLOG(1) << "Executor received status update acknowledgement "
 << uuid_.get() << " for task " << taskId
 << " of framework " << frameworkId;
 
+// If this is a terminal status update acknowledgment for the Docker
+// executor, stop the driver to terminate the executor.
+//
+// TODO(abudnik): This is a workaround for MESOS-9847. A better solution
+// is to update supported API for the Docker executor from V0 to V1. It
+// will allow the executor to handle status update acknowledgments itself.
+if (mesos::internal::protobuf::isTerminalState(
+updates[uuid_.get()].status().state()) &&
+dynamic_cast(executor)) {
+  driver->stop();
+}
+
 // Remove the corresponding update.
 updates.erase(uuid_.get());
 



[mesos] branch master updated: Fixed the build on Windows.

2020-02-03 Thread bbannier
This is an automated email from the ASF dual-hosted git repository.

bbannier pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new 3d197d8  Fixed the build on Windows.
3d197d8 is described below

commit 3d197d8e815f1f2de0565ce64547f409997c5e82
Author: Benjamin Bannier 
AuthorDate: Mon Feb 3 09:30:25 2020 +0100

Fixed the build on Windows.

The addition of executor domain socket support broke the build on
Windows. This patch updates preprocessor directives in order to fix
the build.

Review: https://reviews.apache.org/r/72070/
---
 src/executor/executor.cpp |  8 +++-
 src/launcher/default_executor.cpp |  6 --
 src/local/local.cpp   |  2 ++
 src/slave/flags.cpp   |  7 +++
 src/slave/main.cpp| 12 
 src/slave/slave.cpp   |  6 ++
 src/slave/slave.hpp   |  5 +
 src/tests/cluster.cpp | 10 ++
 src/tests/mock_slave.cpp  |  2 ++
 9 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/src/executor/executor.cpp b/src/executor/executor.cpp
index 7cff258..213f38e 100644
--- a/src/executor/executor.cpp
+++ b/src/executor/executor.cpp
@@ -43,7 +43,10 @@
 #include 
 #include 
 
+#ifndef __WINDOWS__
 #include "common/domain_sockets.hpp"
+#endif // __WINDOWS__
+
 #include "common/http.hpp"
 #include "common/recordio.hpp"
 #include "common/validation.hpp"
@@ -233,6 +236,7 @@ public:
 }
 #endif // USE_SSL_SOCKET
 
+#ifndef __WINDOWS__
 value = env.get("MESOS_DOMAIN_SOCKET");
 if (value.isSome()) {
   string scheme = "http+unix";
@@ -269,7 +273,9 @@ public:
   scheme,
   path,
   upid.id + "/api/v1/executor");
-} else {
+} else
+#endif
+{
   agent = ::URL(
   scheme,
   upid.address.ip,
diff --git a/src/launcher/default_executor.cpp 
b/src/launcher/default_executor.cpp
index 4369fd0..7997003 100644
--- a/src/launcher/default_executor.cpp
+++ b/src/launcher/default_executor.cpp
@@ -1730,6 +1730,7 @@ int main(int argc, char** argv)
   UPID upid(value.get());
   CHECK(upid) << "Failed to parse MESOS_SLAVE_PID '" << value.get() << "'";
 
+#ifndef __WINDOWS__
   value = os::getenv("MESOS_DOMAIN_SOCKET");
   if (value.isSome()) {
 // The previous value of `scheme` can be ignored here, since we do not
@@ -1764,8 +1765,9 @@ int main(int argc, char** argv)
 scheme,
 path,
 upid.id + "/api/v1");
-
-  } else {
+  } else
+#endif // __WINDOWS__
+  {
 agent = ::URL(
 scheme,
 upid.address.ip,
diff --git a/src/local/local.cpp b/src/local/local.cpp
index 6a7709d..8950570 100644
--- a/src/local/local.cpp
+++ b/src/local/local.cpp
@@ -535,7 +535,9 @@ PID launch(const Flags& flags, Allocator* 
_allocator)
 secretGenerators->back(),
 nullptr,
 nullptr,
+#ifndef __WINDOWS__
 None(),
+#endif // __WINDOWS__
 authorizer_); // Same authorizer as master.
 
 slaves[containerizer.get()] = slave;
diff --git a/src/slave/flags.cpp b/src/slave/flags.cpp
index 7653c39..0f159a3 100644
--- a/src/slave/flags.cpp
+++ b/src/slave/flags.cpp
@@ -25,7 +25,10 @@
 
 #include 
 
+#ifndef __WINDOWS__
 #include "common/domain_sockets.hpp"
+#endif // __WINDOWS__
+
 #include "common/http.hpp"
 #include "common/parse.hpp"
 #include "common/protobuf_utils.hpp"
@@ -985,6 +988,7 @@ mesos::internal::slave::Flags::Flags()
 #endif // __WINDOWS__
   );
 
+#ifndef __WINDOWS__
   add(::domain_socket_location,
   "domain_socket_location",
   "Location on the host filesystem of the domain socket used for\n"
@@ -1003,6 +1007,7 @@ mesos::internal::slave::Flags::Flags()
 
 return None();
   });
+#endif // __WINDOWS__
 
   add(::default_container_dns,
   "default_container_dns",
@@ -1401,12 +1406,14 @@ mesos::internal::slave::Flags::Flags()
   false);
 #endif // USE_SSL_SOCKET
 
+#ifndef __WINDOWS__
   add(::http_executor_domain_sockets,
   "http_executor_domain_sockets",
   "If true, the agent will provide a unix domain sockets that the\n"
   "executor can use to connect to the agent, instead of relying on\n"
   "a TCP connection.",
   false);
+#endif // __WINDOWS__
 
   add(::http_credentials,
   "http_credentials",
diff --git a/src/slave/main.cpp b/src/slave/main.cpp
index b5715d9..c1e6551 100644
--- a/src/slave/main.cpp
+++ b/src/slave/main.cpp
@@ -62,7 +62,11 @@
 
 #include "common/authorization.hpp"
 #include "common/build.hpp"
+
+#ifndef __WINDOWS__
 #include "common/domain_sockets.hpp"
+#endif // __WINDOWS__
+
 #include "common/http.hpp"
 
 #include "hook/manager.hpp"
@@ -112,7 +116,9 @@ using process::Owned;
 using process::firewall::DisabledEndpointsFirewallRule;
 using process::firewall::FirewallRule;
 
+#ifndef __WINDOWS__
 using process::network::unix::Socket;
+#endif //