[mesos] 01/02: Changed termination logic of the Docker executor.
This is an automated email from the ASF dual-hosted git repository. abudnik pushed a commit to branch 1.5.x in repository https://gitbox.apache.org/repos/asf/mesos.git commit ff98f12a50a56c13688b87068a116d1d08142f49 Author: Andrei Budnik AuthorDate: Wed Jan 29 13:35:02 2020 +0100 Changed termination logic of the Docker executor. Previously, the Docker executor terminated itself after a task's container had terminated. This could lead to termination of the executor before processing of a terminal status update by the agent. In order to mitigate this issue, the executor slept for one second to give a chance to send all status updates and receive all status update acknowledgments before terminating itself. This might have led to various race conditions in some circumstances (e.g., on a slow host). This patch terminates the Docker executor after receiving a terminal status update acknowledgment. Also, this patch increases the timeout from one second to one minute for fail-safety. Review: https://reviews.apache.org/r/72055 --- src/docker/executor.cpp | 14 ++ src/exec/exec.cpp | 19 +++ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp index 7b48f84..0d75801 100644 --- a/src/docker/executor.cpp +++ b/src/docker/executor.cpp @@ -199,7 +199,9 @@ public: driver->sendStatusUpdate(status); - _stop(); + // This is a fail safe in case the agent doesn't send an ACK for + // the terminal update for some reason. + delay(Seconds(60), self(), ::_stop); return; } @@ -691,17 +693,13 @@ private: CHECK_SOME(driver); driver.get()->sendStatusUpdate(taskStatus); -_stop(); +// This is a fail safe in case the agent doesn't send an ACK for +// the terminal update for some reason. +delay(Seconds(60), self(), ::_stop); } void _stop() { -// A hack for now ... but we need to wait until the status update -// is sent to the slave before we shut ourselves down. -// TODO(tnachen): Remove this hack and also the same hack in the -// command executor when we have the new HTTP APIs to wait until -// an ack. -os::sleep(Seconds(1)); driver.get()->stop(); } diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp index 0c76a3f..e964a11 100644 --- a/src/exec/exec.cpp +++ b/src/exec/exec.cpp @@ -395,10 +395,29 @@ protected: return; } +if (!updates.contains(uuid_.get())) { + LOG(WARNING) << "Ignoring unknown status update acknowledgement " + << uuid_.get() << " for task " << taskId + << " of framework " << frameworkId; + return; +} + VLOG(1) << "Executor received status update acknowledgement " << uuid_.get() << " for task " << taskId << " of framework " << frameworkId; +// If this is a terminal status update acknowledgment for the Docker +// executor, stop the driver to terminate the executor. +// +// TODO(abudnik): This is a workaround for MESOS-9847. A better solution +// is to update supported API for the Docker executor from V0 to V1. It +// will allow the executor to handle status update acknowledgments itself. +if (mesos::internal::protobuf::isTerminalState( +updates[uuid_.get()].status().state()) && +dynamic_cast(executor)) { + driver->stop(); +} + // Remove the corresponding update. updates.erase(uuid_.get());
[mesos] 01/02: Changed termination logic of the Docker executor.
This is an automated email from the ASF dual-hosted git repository. abudnik pushed a commit to branch 1.6.x in repository https://gitbox.apache.org/repos/asf/mesos.git commit f511f25be9d850ee9b65fc3ec5f54d149beb2f19 Author: Andrei Budnik AuthorDate: Wed Jan 29 13:35:02 2020 +0100 Changed termination logic of the Docker executor. Previously, the Docker executor terminated itself after a task's container had terminated. This could lead to termination of the executor before processing of a terminal status update by the agent. In order to mitigate this issue, the executor slept for one second to give a chance to send all status updates and receive all status update acknowledgments before terminating itself. This might have led to various race conditions in some circumstances (e.g., on a slow host). This patch terminates the Docker executor after receiving a terminal status update acknowledgment. Also, this patch increases the timeout from one second to one minute for fail-safety. Review: https://reviews.apache.org/r/72055 --- src/docker/executor.cpp | 14 ++ src/exec/exec.cpp | 19 +++ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp index 57cfbcd..d5c905d 100644 --- a/src/docker/executor.cpp +++ b/src/docker/executor.cpp @@ -217,7 +217,9 @@ public: driver->sendStatusUpdate(status); - _stop(); + // This is a fail safe in case the agent doesn't send an ACK for + // the terminal update for some reason. + delay(Seconds(60), self(), ::_stop); return; } @@ -750,17 +752,13 @@ private: CHECK_SOME(driver); driver.get()->sendStatusUpdate(taskStatus); -_stop(); +// This is a fail safe in case the agent doesn't send an ACK for +// the terminal update for some reason. +delay(Seconds(60), self(), ::_stop); } void _stop() { -// A hack for now ... but we need to wait until the status update -// is sent to the slave before we shut ourselves down. -// TODO(tnachen): Remove this hack and also the same hack in the -// command executor when we have the new HTTP APIs to wait until -// an ack. -os::sleep(Seconds(1)); driver.get()->stop(); } diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp index 65a671d..adeb413 100644 --- a/src/exec/exec.cpp +++ b/src/exec/exec.cpp @@ -395,10 +395,29 @@ protected: return; } +if (!updates.contains(uuid_.get())) { + LOG(WARNING) << "Ignoring unknown status update acknowledgement " + << uuid_.get() << " for task " << taskId + << " of framework " << frameworkId; + return; +} + VLOG(1) << "Executor received status update acknowledgement " << uuid_.get() << " for task " << taskId << " of framework " << frameworkId; +// If this is a terminal status update acknowledgment for the Docker +// executor, stop the driver to terminate the executor. +// +// TODO(abudnik): This is a workaround for MESOS-9847. A better solution +// is to update supported API for the Docker executor from V0 to V1. It +// will allow the executor to handle status update acknowledgments itself. +if (mesos::internal::protobuf::isTerminalState( +updates[uuid_.get()].status().state()) && +dynamic_cast(executor)) { + driver->stop(); +} + // Remove the corresponding update. updates.erase(uuid_.get());
[mesos] 01/02: Changed termination logic of the Docker executor.
This is an automated email from the ASF dual-hosted git repository. abudnik pushed a commit to branch 1.7.x in repository https://gitbox.apache.org/repos/asf/mesos.git commit 6a7da284d1b89f8a144ed2f896f005a5ee9d4aea Author: Andrei Budnik AuthorDate: Wed Jan 29 13:35:02 2020 +0100 Changed termination logic of the Docker executor. Previously, the Docker executor terminated itself after a task's container had terminated. This could lead to termination of the executor before processing of a terminal status update by the agent. In order to mitigate this issue, the executor slept for one second to give a chance to send all status updates and receive all status update acknowledgments before terminating itself. This might have led to various race conditions in some circumstances (e.g., on a slow host). This patch terminates the Docker executor after receiving a terminal status update acknowledgment. Also, this patch increases the timeout from one second to one minute for fail-safety. Review: https://reviews.apache.org/r/72055 --- src/docker/executor.cpp | 14 ++ src/exec/exec.cpp | 19 +++ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp index f638e4b..ed1b718 100644 --- a/src/docker/executor.cpp +++ b/src/docker/executor.cpp @@ -224,7 +224,9 @@ public: driver->sendStatusUpdate(status); - _stop(); + // This is a fail safe in case the agent doesn't send an ACK for + // the terminal update for some reason. + delay(Seconds(60), self(), ::_stop); return; } @@ -757,17 +759,13 @@ private: CHECK_SOME(driver); driver.get()->sendStatusUpdate(taskStatus); -_stop(); +// This is a fail safe in case the agent doesn't send an ACK for +// the terminal update for some reason. +delay(Seconds(60), self(), ::_stop); } void _stop() { -// A hack for now ... but we need to wait until the status update -// is sent to the slave before we shut ourselves down. -// TODO(tnachen): Remove this hack and also the same hack in the -// command executor when we have the new HTTP APIs to wait until -// an ack. -os::sleep(Seconds(1)); driver.get()->stop(); } diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp index c0fa3b6..dee2074 100644 --- a/src/exec/exec.cpp +++ b/src/exec/exec.cpp @@ -395,10 +395,29 @@ protected: return; } +if (!updates.contains(uuid_.get())) { + LOG(WARNING) << "Ignoring unknown status update acknowledgement " + << uuid_.get() << " for task " << taskId + << " of framework " << frameworkId; + return; +} + VLOG(1) << "Executor received status update acknowledgement " << uuid_.get() << " for task " << taskId << " of framework " << frameworkId; +// If this is a terminal status update acknowledgment for the Docker +// executor, stop the driver to terminate the executor. +// +// TODO(abudnik): This is a workaround for MESOS-9847. A better solution +// is to update supported API for the Docker executor from V0 to V1. It +// will allow the executor to handle status update acknowledgments itself. +if (mesos::internal::protobuf::isTerminalState( +updates[uuid_.get()].status().state()) && +dynamic_cast(executor)) { + driver->stop(); +} + // Remove the corresponding update. updates.erase(uuid_.get());
[mesos] 01/02: Changed termination logic of the Docker executor.
This is an automated email from the ASF dual-hosted git repository. abudnik pushed a commit to branch 1.8.x in repository https://gitbox.apache.org/repos/asf/mesos.git commit 1bd0b37a7e522d63319db426dae7068b901eaea6 Author: Andrei Budnik AuthorDate: Wed Jan 29 13:35:02 2020 +0100 Changed termination logic of the Docker executor. Previously, the Docker executor terminated itself after a task's container had terminated. This could lead to termination of the executor before processing of a terminal status update by the agent. In order to mitigate this issue, the executor slept for one second to give a chance to send all status updates and receive all status update acknowledgments before terminating itself. This might have led to various race conditions in some circumstances (e.g., on a slow host). This patch terminates the Docker executor after receiving a terminal status update acknowledgment. Also, this patch increases the timeout from one second to one minute for fail-safety. Review: https://reviews.apache.org/r/72055 --- src/docker/executor.cpp | 14 ++ src/exec/exec.cpp | 19 +++ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp index f638e4b..ed1b718 100644 --- a/src/docker/executor.cpp +++ b/src/docker/executor.cpp @@ -224,7 +224,9 @@ public: driver->sendStatusUpdate(status); - _stop(); + // This is a fail safe in case the agent doesn't send an ACK for + // the terminal update for some reason. + delay(Seconds(60), self(), ::_stop); return; } @@ -757,17 +759,13 @@ private: CHECK_SOME(driver); driver.get()->sendStatusUpdate(taskStatus); -_stop(); +// This is a fail safe in case the agent doesn't send an ACK for +// the terminal update for some reason. +delay(Seconds(60), self(), ::_stop); } void _stop() { -// A hack for now ... but we need to wait until the status update -// is sent to the slave before we shut ourselves down. -// TODO(tnachen): Remove this hack and also the same hack in the -// command executor when we have the new HTTP APIs to wait until -// an ack. -os::sleep(Seconds(1)); driver.get()->stop(); } diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp index c0fa3b6..dee2074 100644 --- a/src/exec/exec.cpp +++ b/src/exec/exec.cpp @@ -395,10 +395,29 @@ protected: return; } +if (!updates.contains(uuid_.get())) { + LOG(WARNING) << "Ignoring unknown status update acknowledgement " + << uuid_.get() << " for task " << taskId + << " of framework " << frameworkId; + return; +} + VLOG(1) << "Executor received status update acknowledgement " << uuid_.get() << " for task " << taskId << " of framework " << frameworkId; +// If this is a terminal status update acknowledgment for the Docker +// executor, stop the driver to terminate the executor. +// +// TODO(abudnik): This is a workaround for MESOS-9847. A better solution +// is to update supported API for the Docker executor from V0 to V1. It +// will allow the executor to handle status update acknowledgments itself. +if (mesos::internal::protobuf::isTerminalState( +updates[uuid_.get()].status().state()) && +dynamic_cast(executor)) { + driver->stop(); +} + // Remove the corresponding update. updates.erase(uuid_.get());
[mesos] 01/02: Changed termination logic of the Docker executor.
This is an automated email from the ASF dual-hosted git repository. abudnik pushed a commit to branch 1.9.x in repository https://gitbox.apache.org/repos/asf/mesos.git commit 3d60cba39d0377a7dc19b4c47f3bb0807418fe50 Author: Andrei Budnik AuthorDate: Wed Jan 29 13:35:02 2020 +0100 Changed termination logic of the Docker executor. Previously, the Docker executor terminated itself after a task's container had terminated. This could lead to termination of the executor before processing of a terminal status update by the agent. In order to mitigate this issue, the executor slept for one second to give a chance to send all status updates and receive all status update acknowledgments before terminating itself. This might have led to various race conditions in some circumstances (e.g., on a slow host). This patch terminates the Docker executor after receiving a terminal status update acknowledgment. Also, this patch increases the timeout from one second to one minute for fail-safety. Review: https://reviews.apache.org/r/72055 --- src/docker/executor.cpp | 14 ++ src/exec/exec.cpp | 19 +++ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp index 132f42b..ebbbc0d 100644 --- a/src/docker/executor.cpp +++ b/src/docker/executor.cpp @@ -224,7 +224,9 @@ public: driver->sendStatusUpdate(status); - _stop(); + // This is a fail safe in case the agent doesn't send an ACK for + // the terminal update for some reason. + delay(Seconds(60), self(), ::_stop); return; } @@ -773,17 +775,13 @@ private: CHECK_SOME(driver); driver.get()->sendStatusUpdate(taskStatus); -_stop(); +// This is a fail safe in case the agent doesn't send an ACK for +// the terminal update for some reason. +delay(Seconds(60), self(), ::_stop); } void _stop() { -// A hack for now ... but we need to wait until the status update -// is sent to the slave before we shut ourselves down. -// TODO(tnachen): Remove this hack and also the same hack in the -// command executor when we have the new HTTP APIs to wait until -// an ack. -os::sleep(Seconds(1)); driver.get()->stop(); } diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp index 67e082e..8ed77f2 100644 --- a/src/exec/exec.cpp +++ b/src/exec/exec.cpp @@ -409,10 +409,29 @@ protected: return; } +if (!updates.contains(uuid_.get())) { + LOG(WARNING) << "Ignoring unknown status update acknowledgement " + << uuid_.get() << " for task " << taskId + << " of framework " << frameworkId; + return; +} + VLOG(1) << "Executor received status update acknowledgement " << uuid_.get() << " for task " << taskId << " of framework " << frameworkId; +// If this is a terminal status update acknowledgment for the Docker +// executor, stop the driver to terminate the executor. +// +// TODO(abudnik): This is a workaround for MESOS-9847. A better solution +// is to update supported API for the Docker executor from V0 to V1. It +// will allow the executor to handle status update acknowledgments itself. +if (mesos::internal::protobuf::isTerminalState( +updates[uuid_.get()].status().state()) && +dynamic_cast(executor)) { + driver->stop(); +} + // Remove the corresponding update. updates.erase(uuid_.get());
[mesos] 01/02: Changed termination logic of the Docker executor.
This is an automated email from the ASF dual-hosted git repository. abudnik pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 457c38967bf9a53c1c5cd2743385937a26f413f6 Author: Andrei Budnik AuthorDate: Wed Jan 29 13:35:02 2020 +0100 Changed termination logic of the Docker executor. Previously, the Docker executor terminated itself after a task's container had terminated. This could lead to termination of the executor before processing of a terminal status update by the agent. In order to mitigate this issue, the executor slept for one second to give a chance to send all status updates and receive all status update acknowledgments before terminating itself. This might have led to various race conditions in some circumstances (e.g., on a slow host). This patch terminates the Docker executor after receiving a terminal status update acknowledgment. Also, this patch increases the timeout from one second to one minute for fail-safety. Review: https://reviews.apache.org/r/72055 --- src/docker/executor.cpp | 14 ++ src/exec/exec.cpp | 19 +++ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp index 132f42b..ebbbc0d 100644 --- a/src/docker/executor.cpp +++ b/src/docker/executor.cpp @@ -224,7 +224,9 @@ public: driver->sendStatusUpdate(status); - _stop(); + // This is a fail safe in case the agent doesn't send an ACK for + // the terminal update for some reason. + delay(Seconds(60), self(), ::_stop); return; } @@ -773,17 +775,13 @@ private: CHECK_SOME(driver); driver.get()->sendStatusUpdate(taskStatus); -_stop(); +// This is a fail safe in case the agent doesn't send an ACK for +// the terminal update for some reason. +delay(Seconds(60), self(), ::_stop); } void _stop() { -// A hack for now ... but we need to wait until the status update -// is sent to the slave before we shut ourselves down. -// TODO(tnachen): Remove this hack and also the same hack in the -// command executor when we have the new HTTP APIs to wait until -// an ack. -os::sleep(Seconds(1)); driver.get()->stop(); } diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp index 69e5e24..0fba5e0 100644 --- a/src/exec/exec.cpp +++ b/src/exec/exec.cpp @@ -409,10 +409,29 @@ protected: return; } +if (!updates.contains(uuid_.get())) { + LOG(WARNING) << "Ignoring unknown status update acknowledgement " + << uuid_.get() << " for task " << taskId + << " of framework " << frameworkId; + return; +} + VLOG(1) << "Executor received status update acknowledgement " << uuid_.get() << " for task " << taskId << " of framework " << frameworkId; +// If this is a terminal status update acknowledgment for the Docker +// executor, stop the driver to terminate the executor. +// +// TODO(abudnik): This is a workaround for MESOS-9847. A better solution +// is to update supported API for the Docker executor from V0 to V1. It +// will allow the executor to handle status update acknowledgments itself. +if (mesos::internal::protobuf::isTerminalState( +updates[uuid_.get()].status().state()) && +dynamic_cast(executor)) { + driver->stop(); +} + // Remove the corresponding update. updates.erase(uuid_.get());