This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git
commit 8a85efb877f50c793954e06b2a6db3f4d3049050 Author: Joseph Wu <[email protected]> AuthorDate: Fri Jun 28 12:56:15 2019 -0700 Added master endpoints for agent draining. This adds three master calls: * DRAIN_AGENT * DEACTIVATE_AGENT * REACTIVATE_AGENT DRAIN_AGENT starts automated draining of tasks on the specified agent. When marked for draining, the agent's resources will not be offered and all the agent's tasks will be gracefully killed. DEACTIVATE_AGENT stops offers for a specific agent. This can be used for manual draining of tasks, but is not limited to draining. For example, the operator could deactivate an agent prior to reserving resources. REACTIVATE_AGENT restarts offers for a specific agent, that had been drained or deactivated previously. Review: https://reviews.apache.org/r/70911/ --- include/mesos/master/master.proto | 56 +++++++++++++++++ include/mesos/v1/master/master.proto | 59 ++++++++++++++++++ src/internal/devolve.cpp | 12 ++++ src/internal/devolve.hpp | 1 + src/master/http.cpp | 114 +++++++++++++++++++++++++++++++++++ src/master/master.hpp | 29 +++++++++ src/master/validation.cpp | 18 ++++++ 7 files changed, 289 insertions(+) diff --git a/include/mesos/master/master.proto b/include/mesos/master/master.proto index 4a653ce..399e27e 100644 --- a/include/mesos/master/master.proto +++ b/include/mesos/master/master.proto @@ -90,6 +90,10 @@ message Call { START_MAINTENANCE = 26; // See 'StartMaintenance' below. STOP_MAINTENANCE = 27; // See 'StopMaintenance' below. + DRAIN_AGENT = 37; // See 'DrainAgent' below. + DEACTIVATE_AGENT = 38; // See 'DeactivateAgent' below. + REACTIVATE_AGENT = 39; // See 'ReactivateAgent' below. + GET_QUOTA = 28; // EXPERIMENTAL DO NOT USE. @@ -225,6 +229,55 @@ message Call { repeated MachineID machines = 1; } + // EXPERIMENTAL. + // + // Marks an agent for automated draining of tasks. + // This prevents further tasks from being launched on the agent, by preventing + // offers from being sent for the agent (see 'DeactivateAgent'), and also + // begins killing tasks running on the agent. + message DrainAgent { + required SlaveID slave_id = 1; + + // An upper bound for tasks with a KillPolicy. + // If a task has a KillPolicy grace period greater than this value, + // this value will be used instead. This allows the operator to limit + // the maximum time it will take the agent to drain. + // + // NOTE: Grace periods start when the executor receives the associated kill. + // If, for example, the agent is unreachable when this call is made, + // tasks will still receive their full grace period to kill gracefully. + optional DurationInfo max_grace_period = 2; + + // Whether or not this agent will be removed permanently from the + // cluster when draining is complete. This transition is automatic + // and does **NOT** require a separate call to `MarkAgentGone`. + // + // Compared to `MarkAgentGone`, which is used for unreachable agents, + // marking agents gone after draining will respect kill policies. + // To notify frameworks, tasks terminated during draining will return + // a `TASK_GONE_BY_OPERATOR` status update instead of any other terminal + // status. Executors will not need to account for this case, because + // the terminal status update will be intercepted and modified by the agent. + optional bool mark_gone = 3 [default = false]; + } + + // EXPERIMENTAL. + // + // Turns off offers for a specific agent. + // A deactivated agent will continue to run tasks and communicate statuses + // with the master. + message DeactivateAgent { + required SlaveID slave_id = 1; + } + + // EXPERIMENTAL. + // + // Turns on offers for a specific agent, which was previously drained or + // deactivated. + message ReactivateAgent { + required SlaveID slave_id = 1; + } + // EXPERIMENTAL DO NOT USE. // // This feature is not implementation complete. @@ -288,6 +341,9 @@ message Call { optional UpdateMaintenanceSchedule update_maintenance_schedule = 11; optional StartMaintenance start_maintenance = 12; optional StopMaintenance stop_maintenance = 13; + optional DrainAgent drain_agent = 21; + optional DeactivateAgent deactivate_agent = 22; + optional ReactivateAgent reactivate_agent = 23; optional UpdateQuota update_quota = 20; optional Teardown teardown = 16; optional MarkAgentGone mark_agent_gone = 17; diff --git a/include/mesos/v1/master/master.proto b/include/mesos/v1/master/master.proto index d4dd3fc..62bcee5 100644 --- a/include/mesos/v1/master/master.proto +++ b/include/mesos/v1/master/master.proto @@ -16,6 +16,9 @@ syntax = "proto2"; +// NOTE: This should be removed upon switching to Protobuf3. +import "google/protobuf/duration.proto"; + import "mesos/v1/mesos.proto"; import "mesos/v1/maintenance/maintenance.proto"; @@ -88,6 +91,10 @@ message Call { START_MAINTENANCE = 26; // See 'StartMaintenance' below. STOP_MAINTENANCE = 27; // See 'StopMaintenance' below. + DRAIN_AGENT = 37; // See 'DrainAgent' below. + DEACTIVATE_AGENT = 38; // See 'DeactivateAgent' below. + REACTIVATE_AGENT = 39; // See 'ReactivateAgent' below. + GET_QUOTA = 28; // EXPERIMENTAL DO NOT USE. @@ -223,6 +230,55 @@ message Call { repeated MachineID machines = 1; } + // EXPERIMENTAL. + // + // Marks an agent for automated draining of tasks. + // This prevents further tasks from being launched on the agent, by preventing + // offers from being sent for the agent (see 'DeactivateAgent'), and also + // begins killing tasks running on the agent. + message DrainAgent { + required AgentID agent_id = 1; + + // An upper bound for tasks with a KillPolicy. + // If a task has a KillPolicy grace period greater than this value, + // this value will be used instead. This allows the operator to limit + // the maximum time it will take the agent to drain. + // + // NOTE: Grace periods start when the executor receives the associated kill. + // If, for example, the agent is unreachable when this call is made, + // tasks will still receive their full grace period to kill gracefully. + optional google.protobuf.Duration max_grace_period = 2; + + // Whether or not this agent will be removed permanently from the + // cluster when draining is complete. This transition is automatic + // and does **NOT** require a separate call to `MarkAgentGone`. + // + // Compared to `MarkAgentGone`, which is used for unreachable agents, + // marking agents gone after draining will respect kill policies. + // To notify frameworks, tasks terminated during draining will return + // a `TASK_GONE_BY_OPERATOR` status update instead of any other terminal + // status. Executors will not need to account for this case, because + // the terminal status update will be intercepted and modified by the agent. + optional bool mark_gone = 3 [default = false]; + } + + // EXPERIMENTAL. + // + // Turns off offers for a specific agent. + // A deactivated agent will continue to run tasks and communicate statuses + // with the master. + message DeactivateAgent { + required AgentID agent_id = 1; + } + + // EXPERIMENTAL. + // + // Turns on offers for a specific agent, which was previously drained or + // deactivated. + message ReactivateAgent { + required AgentID agent_id = 1; + } + // EXPERIMENTAL DO NOT USE. // // This feature is not implementation complete. @@ -286,6 +342,9 @@ message Call { optional UpdateMaintenanceSchedule update_maintenance_schedule = 11; optional StartMaintenance start_maintenance = 12; optional StopMaintenance stop_maintenance = 13; + optional DrainAgent drain_agent = 21; + optional DeactivateAgent deactivate_agent = 22; + optional ReactivateAgent reactivate_agent = 23; optional UpdateQuota update_quota = 20; optional Teardown teardown = 16; optional MarkAgentGone mark_agent_gone = 17; diff --git a/src/internal/devolve.cpp b/src/internal/devolve.cpp index d4a8bb5..2809c25 100644 --- a/src/internal/devolve.cpp +++ b/src/internal/devolve.cpp @@ -80,6 +80,18 @@ DrainInfo devolve(const v1::DrainInfo& drainInfo) } +DurationInfo devolve(const google::protobuf::Duration& duration) +{ + DurationInfo durationInfo; + + // NOTE: If not specified, the fields of Duration default to zero. + durationInfo.set_nanoseconds( + duration.seconds() * 1000000000 + duration.nanos()); + + return durationInfo; +} + + ExecutorID devolve(const v1::ExecutorID& executorId) { return devolve<ExecutorID>(executorId); diff --git a/src/internal/devolve.hpp b/src/internal/devolve.hpp index 5b57032..d32a6eb 100644 --- a/src/internal/devolve.hpp +++ b/src/internal/devolve.hpp @@ -56,6 +56,7 @@ ContainerID devolve(const v1::ContainerID& containerId); Credential devolve(const v1::Credential& credential); DrainConfig devolve(const v1::DrainConfig& drainConfig); DrainInfo devolve(const v1::DrainInfo& drainInfo); +DurationInfo devolve(const google::protobuf::Duration& duration); ExecutorID devolve(const v1::ExecutorID& executorId); FrameworkID devolve(const v1::FrameworkID& frameworkId); FrameworkInfo devolve(const v1::FrameworkInfo& frameworkInfo); diff --git a/src/master/http.cpp b/src/master/http.cpp index 3cd7df2..b42ebb9 100644 --- a/src/master/http.cpp +++ b/src/master/http.cpp @@ -129,9 +129,12 @@ using std::tuple; using std::vector; using mesos::authorization::createSubject; +using mesos::authorization::DEACTIVATE_AGENT; +using mesos::authorization::DRAIN_AGENT; using mesos::authorization::GET_MAINTENANCE_SCHEDULE; using mesos::authorization::GET_MAINTENANCE_STATUS; using mesos::authorization::MARK_AGENT_GONE; +using mesos::authorization::REACTIVATE_AGENT; using mesos::authorization::SET_LOG_LEVEL; using mesos::authorization::START_MAINTENANCE; using mesos::authorization::STOP_MAINTENANCE; @@ -364,6 +367,15 @@ Future<Response> Master::Http::api( case mesos::master::Call::STOP_MAINTENANCE: return stopMaintenance(call, principal, acceptType); + case mesos::master::Call::DRAIN_AGENT: + return drainAgent(call, principal, acceptType); + + case mesos::master::Call::DEACTIVATE_AGENT: + return deactivateAgent(call, principal, acceptType); + + case mesos::master::Call::REACTIVATE_AGENT: + return reactivateAgent(call, principal, acceptType); + case mesos::master::Call::GET_QUOTA: return quotaHandler.status(call, principal, acceptType); @@ -3849,6 +3861,108 @@ Future<Response> Master::Http::getMaintenanceStatus( } +Future<Response> Master::Http::_drainAgent( + const SlaveID& slaveId, + const Option<DurationInfo>& maxGracePeriod, + const bool markGone, + const Owned<ObjectApprovers>& approvers) const +{ + return NotImplemented(); +} + + +Future<Response> Master::Http::drainAgent( + const mesos::master::Call& call, + const Option<Principal>& principal, + ContentType /*contentType*/) const +{ + CHECK_EQ(mesos::master::Call::DRAIN_AGENT, call.type()); + CHECK(call.has_drain_agent()); + + SlaveID slaveId = call.drain_agent().slave_id(); + + Option<DurationInfo> maxGracePeriod; + if (call.drain_agent().has_max_grace_period()) { + maxGracePeriod = call.drain_agent().max_grace_period(); + } + + bool markGone = call.drain_agent().mark_gone(); + + return ObjectApprovers::create( + master->authorizer, + principal, + {DRAIN_AGENT, MARK_AGENT_GONE}) + .then(defer( + master->self(), + [this, slaveId, maxGracePeriod, markGone]( + const Owned<ObjectApprovers>& approvers) { + return _drainAgent(slaveId, maxGracePeriod, markGone, approvers); + })); +} + + +Future<Response> Master::Http::_deactivateAgent( + const SlaveID& slaveId, + const Owned<ObjectApprovers>& approvers) const +{ + return NotImplemented(); +} + + +Future<Response> Master::Http::deactivateAgent( + const mesos::master::Call& call, + const Option<Principal>& principal, + ContentType /*contentType*/) const +{ + CHECK_EQ(mesos::master::Call::DEACTIVATE_AGENT, call.type()); + CHECK(call.has_deactivate_agent()); + + SlaveID slaveId = call.deactivate_agent().slave_id(); + + return ObjectApprovers::create( + master->authorizer, + principal, + {DEACTIVATE_AGENT}) + .then(defer( + master->self(), + [this, slaveId]( + const Owned<ObjectApprovers>& approvers) { + return _deactivateAgent(slaveId, approvers); + })); +} + + +Future<Response> Master::Http::_reactivateAgent( + const SlaveID& slaveId, + const Owned<ObjectApprovers>& approvers) const +{ + return NotImplemented(); +} + + +Future<Response> Master::Http::reactivateAgent( + const mesos::master::Call& call, + const Option<Principal>& principal, + ContentType /*contentType*/) const +{ + CHECK_EQ(mesos::master::Call::REACTIVATE_AGENT, call.type()); + CHECK(call.has_reactivate_agent()); + + SlaveID slaveId = call.reactivate_agent().slave_id(); + + return ObjectApprovers::create( + master->authorizer, + principal, + {REACTIVATE_AGENT}) + .then(defer( + master->self(), + [this, slaveId]( + const Owned<ObjectApprovers>& approvers) { + return _reactivateAgent(slaveId, approvers); + })); +} + + string Master::Http::UNRESERVE_HELP() { return HELP( diff --git a/src/master/master.hpp b/src/master/master.hpp index 7acaa82..23dafe7 100644 --- a/src/master/master.hpp +++ b/src/master/master.hpp @@ -1654,6 +1654,20 @@ private: const google::protobuf::RepeatedPtrField<MachineID>& machineIds, const process::Owned<ObjectApprovers>& approvers) const; + process::Future<process::http::Response> _drainAgent( + const SlaveID& slaveId, + const Option<DurationInfo>& maxGracePeriod, + const bool markGone, + const process::Owned<ObjectApprovers>& approvers) const; + + process::Future<process::http::Response> _deactivateAgent( + const SlaveID& slaveId, + const process::Owned<ObjectApprovers>& approvers) const; + + process::Future<process::http::Response> _reactivateAgent( + const SlaveID& slaveId, + const process::Owned<ObjectApprovers>& approvers) const; + process::Future<process::http::Response> _reserve( const SlaveID& slaveId, const google::protobuf::RepeatedPtrField<Resource>& resources, @@ -1778,6 +1792,21 @@ private: const Option<process::http::authentication::Principal>& principal, ContentType contentType) const; + process::Future<process::http::Response> drainAgent( + const mesos::master::Call& call, + const Option<process::http::authentication::Principal>& principal, + ContentType contentType) const; + + process::Future<process::http::Response> deactivateAgent( + const mesos::master::Call& call, + const Option<process::http::authentication::Principal>& principal, + ContentType contentType) const; + + process::Future<process::http::Response> reactivateAgent( + const mesos::master::Call& call, + const Option<process::http::authentication::Principal>& principal, + ContentType contentType) const; + process::Future<process::http::Response> getOperations( const mesos::master::Call& call, const Option<process::http::authentication::Principal>& principal, diff --git a/src/master/validation.cpp b/src/master/validation.cpp index af2d04a..a7ecefb 100644 --- a/src/master/validation.cpp +++ b/src/master/validation.cpp @@ -239,6 +239,24 @@ Option<Error> validate(const mesos::master::Call& call) } return None(); + case mesos::master::Call::DRAIN_AGENT: + if (!call.has_drain_agent()) { + return Error("Expecting 'drain_agent' to be present"); + } + return None(); + + case mesos::master::Call::DEACTIVATE_AGENT: + if (!call.has_deactivate_agent()) { + return Error("Expecting 'deactivate_agent' to be present"); + } + return None(); + + case mesos::master::Call::REACTIVATE_AGENT: + if (!call.has_reactivate_agent()) { + return Error("Expecting 'reactivate_agent' to be present"); + } + return None(); + case mesos::master::Call::GET_QUOTA: return None();
