This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git
commit cbb5694f00717f8d8a1b760622e9fe41b066be83 Author: Greg Mann <[email protected]> AuthorDate: Fri Jun 28 12:56:08 2019 -0700 Added common protobufs for agent draining. This patch makes protobuf message updates which will be used by both the master and the agent to facilitate automatic draining of agents. Review: https://reviews.apache.org/r/70822/ --- include/mesos/mesos.proto | 50 ++++++++++++++++++++++++++++++++++++++++++++ include/mesos/type_utils.hpp | 20 ++++++++++++++++++ include/mesos/v1/mesos.proto | 50 ++++++++++++++++++++++++++++++++++++++++++++ src/common/type_utils.cpp | 12 +++++++++++ src/internal/devolve.cpp | 12 +++++++++++ src/internal/devolve.hpp | 2 ++ src/internal/evolve.cpp | 6 ++++++ src/internal/evolve.hpp | 1 + src/messages/messages.proto | 8 +++++++ 9 files changed, 161 insertions(+) diff --git a/include/mesos/mesos.proto b/include/mesos/mesos.proto index eb1b09c..e3fbe1c 100644 --- a/include/mesos/mesos.proto +++ b/include/mesos/mesos.proto @@ -2626,6 +2626,7 @@ message TaskStatus { REASON_RECONCILIATION = 9; REASON_RESOURCES_UNKNOWN = 18; REASON_SLAVE_DISCONNECTED = 10; + REASON_SLAVE_DRAINING = 34; REASON_SLAVE_REMOVED = 11; REASON_SLAVE_REMOVED_BY_OPERATOR = 31; REASON_SLAVE_REREGISTERED = 32; @@ -3758,3 +3759,52 @@ message DeviceAccess { message DeviceWhitelist { repeated DeviceAccess allowed_devices = 1; } + + +enum DrainState { + UNKNOWN = 0; + + // The agent is currently draining. + DRAINING = 1; + + // The agent has been drained: all tasks have terminated, all terminal + // task status updates have been acknowledged by the frameworks, and all + // operations have finished and had their terminal updates acknowledged. + DRAINED = 2; +} + + +message DrainConfig { + // An upper bound for tasks with a KillPolicy. + // If a task has a KillPolicy grace period greater than this value, this value + // will be used instead. This allows the operator to limit the maximum time it + // will take the agent to drain. If this field is unset, the task's KillPolicy + // or the executor's default grace period is used. + // + // NOTE: Grace periods start when the executor receives the associated kill. + // If, for example, the agent is unreachable when this call is made, + // tasks will still receive their full grace period to kill gracefully. + optional DurationInfo max_grace_period = 1; + + // Whether or not this agent will be removed permanently from the cluster when + // draining is complete. This transition is automatic and does **NOT** require + // a separate call to `MarkAgentGone`. If this field is unset, then the + // default value of `false` is used. + // + // Compared to `MarkAgentGone`, which is used for unreachable agents, + // marking agents gone after draining will respect kill policies. + // To notify frameworks, tasks terminated during draining will return + // a `TASK_GONE_BY_OPERATOR` status update instead of any other terminal + // status. Executors will not need to account for this case, because + // the terminal status update will be intercepted and modified by the agent. + optional bool mark_gone = 2 [default = false]; +} + + +message DrainInfo { + // The drain state of the agent. + required DrainState state = 1; + + // The configuration used to drain the agent. + required DrainConfig config = 2; +} diff --git a/include/mesos/type_utils.hpp b/include/mesos/type_utils.hpp index 57b1893..ed9190b 100644 --- a/include/mesos/type_utils.hpp +++ b/include/mesos/type_utils.hpp @@ -26,6 +26,8 @@ #include <google/protobuf/map.h> #include <google/protobuf/repeated_field.h> +#include <google/protobuf/util/message_differencer.h> + #include <mesos/mesos.hpp> #include <stout/stringify.hpp> @@ -221,6 +223,18 @@ inline bool operator==(const DomainInfo& left, const DomainInfo& right) } +inline bool operator==(const DrainInfo& left, const DrainInfo& right) +{ + return google::protobuf::util::MessageDifferencer::Equals(left, right); +} + + +inline bool operator==(const DrainConfig& left, const DrainConfig& right) +{ + return google::protobuf::util::MessageDifferencer::Equals(left, right); +} + + /** * For machines to match, both the `hostname` and `ip` must be equivalent. * Hostname is not case sensitive, so it is lowercased before comparison. @@ -381,6 +395,12 @@ std::ostream& operator<<( std::ostream& operator<<(std::ostream& stream, const DomainInfo& domainInfo); +std::ostream& operator<<(std::ostream& stream, const DrainConfig& drainConfig); + + +std::ostream& operator<<(std::ostream& stream, const DrainState& state); + + std::ostream& operator<<(std::ostream& stream, const Environment& environment); diff --git a/include/mesos/v1/mesos.proto b/include/mesos/v1/mesos.proto index 3343177..e198813 100644 --- a/include/mesos/v1/mesos.proto +++ b/include/mesos/v1/mesos.proto @@ -2619,6 +2619,7 @@ message TaskStatus { REASON_RECONCILIATION = 9; REASON_RESOURCES_UNKNOWN = 18; REASON_AGENT_DISCONNECTED = 10; + REASON_AGENT_DRAINING = 34; REASON_AGENT_REMOVED = 11; REASON_AGENT_REMOVED_BY_OPERATOR = 31; REASON_AGENT_REREGISTERED = 32; @@ -3751,3 +3752,52 @@ message DeviceAccess { message DeviceWhitelist { repeated DeviceAccess allowed_devices = 1; } + + +enum DrainState { + UNKNOWN = 0; + + // The agent is currently draining. + DRAINING = 1; + + // The agent has been drained: all tasks have terminated, all terminal + // task status updates have been acknowledged by the frameworks, and all + // operations have finished and had their terminal updates acknowledged. + DRAINED = 2; +} + + +message DrainConfig { + // An upper bound for tasks with a KillPolicy. + // If a task has a KillPolicy grace period greater than this value, this value + // will be used instead. This allows the operator to limit the maximum time it + // will take the agent to drain. If this field is unset, the task's KillPolicy + // or the executor's default grace period is used. + // + // NOTE: Grace periods start when the executor receives the associated kill. + // If, for example, the agent is unreachable when this call is made, + // tasks will still receive their full grace period to kill gracefully. + optional DurationInfo max_grace_period = 1; + + // Whether or not this agent will be removed permanently from the cluster when + // draining is complete. This transition is automatic and does **NOT** require + // a separate call to `MarkAgentGone`. If this field is unset, then the + // default value of `false` is used. + // + // Compared to `MarkAgentGone`, which is used for unreachable agents, + // marking agents gone after draining will respect kill policies. + // To notify frameworks, tasks terminated during draining will return + // a `TASK_GONE_BY_OPERATOR` status update instead of any other terminal + // status. Executors will not need to account for this case, because + // the terminal status update will be intercepted and modified by the agent. + optional bool mark_gone = 2 [default = false]; +} + + +message DrainInfo { + // The drain state of the agent. + required DrainState state = 1; + + // The configuration used to drain the agent. + required DrainConfig config = 2; +} diff --git a/src/common/type_utils.cpp b/src/common/type_utils.cpp index ef1b3ea..a7eb0e9 100644 --- a/src/common/type_utils.cpp +++ b/src/common/type_utils.cpp @@ -657,6 +657,18 @@ ostream& operator<<(ostream& stream, const DeviceWhitelist& deviceWhitelist) } +ostream& operator<<(ostream& stream, const DrainConfig& drainConfig) +{ + return stream << JSON::protobuf(drainConfig); +} + + +ostream& operator<<(ostream& stream, const DrainState& state) +{ + return stream << DrainState_Name(state); +} + + ostream& operator<<(ostream& stream, const CheckStatusInfo& checkStatusInfo) { switch (checkStatusInfo.type()) { diff --git a/src/internal/devolve.cpp b/src/internal/devolve.cpp index 1d300b4..d4a8bb5 100644 --- a/src/internal/devolve.cpp +++ b/src/internal/devolve.cpp @@ -68,6 +68,18 @@ Credential devolve(const v1::Credential& credential) } +DrainConfig devolve(const v1::DrainConfig& drainConfig) +{ + return devolve<DrainConfig>(drainConfig); +} + + +DrainInfo devolve(const v1::DrainInfo& drainInfo) +{ + return devolve<DrainInfo>(drainInfo); +} + + ExecutorID devolve(const v1::ExecutorID& executorId) { return devolve<ExecutorID>(executorId); diff --git a/src/internal/devolve.hpp b/src/internal/devolve.hpp index fefe86e..5b57032 100644 --- a/src/internal/devolve.hpp +++ b/src/internal/devolve.hpp @@ -54,6 +54,8 @@ namespace internal { CommandInfo devolve(const v1::CommandInfo& command); ContainerID devolve(const v1::ContainerID& containerId); Credential devolve(const v1::Credential& credential); +DrainConfig devolve(const v1::DrainConfig& drainConfig); +DrainInfo devolve(const v1::DrainInfo& drainInfo); ExecutorID devolve(const v1::ExecutorID& executorId); FrameworkID devolve(const v1::FrameworkID& frameworkId); FrameworkInfo devolve(const v1::FrameworkInfo& frameworkInfo); diff --git a/src/internal/evolve.cpp b/src/internal/evolve.cpp index 19c1559..81de15e 100644 --- a/src/internal/evolve.cpp +++ b/src/internal/evolve.cpp @@ -92,6 +92,12 @@ v1::DomainInfo evolve(const DomainInfo& domainInfo) } +v1::DrainInfo evolve(const DrainInfo& drainInfo) +{ + return evolve<v1::DrainInfo>(drainInfo); +} + + v1::ExecutorID evolve(const ExecutorID& executorId) { return evolve<v1::ExecutorID>(executorId); diff --git a/src/internal/evolve.hpp b/src/internal/evolve.hpp index 1044d9d..ffbb342 100644 --- a/src/internal/evolve.hpp +++ b/src/internal/evolve.hpp @@ -63,6 +63,7 @@ namespace internal { v1::AgentID evolve(const SlaveID& slaveId); v1::AgentInfo evolve(const SlaveInfo& slaveInfo); v1::DomainInfo evolve(const DomainInfo& domainInfo); +v1::DrainInfo evolve(const DrainInfo& drainInfo); v1::ExecutorID evolve(const ExecutorID& executorId); v1::ExecutorInfo evolve(const ExecutorInfo& executorInfo); v1::FileInfo evolve(const FileInfo& fileInfo); diff --git a/src/messages/messages.proto b/src/messages/messages.proto index e30ad34..f5ea038 100644 --- a/src/messages/messages.proto +++ b/src/messages/messages.proto @@ -980,6 +980,14 @@ message ReregisterExecutorMessage { /** + * Instructs the agent to begin draining tasks. + */ +message DrainSlaveMessage { + required DrainConfig config = 1; +} + + +/** * Sends a free-form message from the master to an agent. * The agent should gracefully terminate in response, which includes * shutting down all executors and tasks on the agent.
