Maintenance Primitives: Added machine DOWN endpoint. Endpoint: /machine/down, which transitions agents into the DOWN mode.
Registry operation = maintenance::StartMaintenance Sets the list of machines as DOWN. Review: https://reviews.apache.org/r/37358 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/de231ed5 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/de231ed5 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/de231ed5 Branch: refs/heads/master Commit: de231ed590809d85c9e47071f5ce899035a15dc4 Parents: 3b0fe5c Author: Joseph Wu <[email protected]> Authored: Sun Aug 30 13:56:21 2015 -0400 Committer: Joris Van Remoortere <[email protected]> Committed: Mon Aug 31 13:09:54 2015 -0400 ---------------------------------------------------------------------- src/master/http.cpp | 73 ++++++++++++++++++++++ src/master/maintenance.cpp | 30 +++++++++ src/master/maintenance.hpp | 24 ++++++++ src/master/master.cpp | 6 ++ src/master/master.hpp | 5 ++ src/tests/master_maintenance_tests.cpp | 95 +++++++++++++++++++++++++++++ src/tests/registrar_tests.cpp | 87 +++++++++++++++++++++++++- 7 files changed, 319 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/de231ed5/src/master/http.cpp ---------------------------------------------------------------------- diff --git a/src/master/http.cpp b/src/master/http.cpp index 44178d8..11e786d 100644 --- a/src/master/http.cpp +++ b/src/master/http.cpp @@ -1475,6 +1475,79 @@ Future<Response> Master::Http::maintenanceSchedule(const Request& request) const } +// /master/machine/down endpoint help. +const string Master::Http::MACHINE_DOWN_HELP = HELP( + TLDR( + "Brings a set of machines down."), + USAGE( + "/master/machine/down"), + DESCRIPTION( + "POST: Validates the request body as JSON and transitions", + " the list of machines into DOWN mode. Currently, only", + " machines in DRAINING mode are allowed to be brought down.")); + + +// /master/machine/down endpoint handler. 
+Future<Response> Master::Http::machineDown(const Request& request) const +{ + if (request.method != "POST") { + return BadRequest("Expecting POST, got '" + request.method + "'"); + } + + // Parse the POST body as JSON. + Try<JSON::Object> jsonIds = JSON::parse<JSON::Object>(request.body); + if (jsonIds.isError()) { + return BadRequest(jsonIds.error()); + } + + // Convert the machines to a protobuf. + Try<MachineIDs> protoIds = + ::protobuf::parse<MachineIDs>(jsonIds.get()); + + if (protoIds.isError()) { + return BadRequest(protoIds.error()); + } + + // Validate every machine in the list. + MachineIDs ids = protoIds.get(); + Try<Nothing> isValid = maintenance::validation::machines(ids); + if (isValid.isError()) { + return BadRequest(isValid.error()); + } + + // Check that all machines are part of a maintenance schedule. + // TODO(josephw): Allow a transition from `UP` to `DOWN`. + foreach (const MachineID& id, ids.values()) { + if (!master->machineInfos.contains(id)) { + return BadRequest( + "Machine '" + id.DebugString() + + "' is not part of a maintenance schedule"); + } + + if (master->machineInfos[id].mode() != MachineInfo::DRAINING) { + return BadRequest( + "Machine '" + id.DebugString() + + "' is not in DRAINING mode and cannot be brought down"); + } + } + + return master->registrar->apply(Owned<Operation>( + new maintenance::StartMaintenance(ids))) + .then(defer(master->self(), [=](bool result) -> Future<Response> { + // See the top comment in "master/maintenance.hpp" for why this check + // is here, and is appropriate. + CHECK(result); + + // Update the master's local state with the downed machines. 
+ foreach (const MachineID& id, ids.values()) { + master->machineInfos[id].set_mode(MachineInfo::DOWN); + } + + return OK(); + })); +} + + Result<Credential> Master::Http::authenticate(const Request& request) const { // By default, assume everyone is authenticated if no credentials http://git-wip-us.apache.org/repos/asf/mesos/blob/de231ed5/src/master/maintenance.cpp ---------------------------------------------------------------------- diff --git a/src/master/maintenance.cpp b/src/master/maintenance.cpp index 798c026..859cef9 100644 --- a/src/master/maintenance.cpp +++ b/src/master/maintenance.cpp @@ -112,6 +112,36 @@ Try<bool> UpdateSchedule::perform( } +StartMaintenance::StartMaintenance( + const MachineIDs& _ids) +{ + foreach (const MachineID& id, _ids.values()) { + ids.insert(id); + } +} + + +Try<bool> StartMaintenance::perform( + Registry* registry, + hashset<SlaveID>* slaveIDs, + bool strict) +{ + // Flip the mode of all targeted machines. + bool changed = false; + for (int i = 0; i < registry->machines().machines().size(); i++) { + if (ids.contains(registry->machines().machines(i).info().id())) { + // Flip the mode. + registry->mutable_machines()->mutable_machines(i) + ->mutable_info()->set_mode(MachineInfo::DOWN); + + changed = true; // Mutation. + } + } + + return changed; +} + + namespace validation { Try<Nothing> schedule( http://git-wip-us.apache.org/repos/asf/mesos/blob/de231ed5/src/master/maintenance.hpp ---------------------------------------------------------------------- diff --git a/src/master/maintenance.hpp b/src/master/maintenance.hpp index 42b5f9e..8e6cb9c 100644 --- a/src/master/maintenance.hpp +++ b/src/master/maintenance.hpp @@ -65,6 +65,30 @@ private: }; +/** + * Transitions a group of machines from `DRAINING` mode into + * `DOWN` mode. All machines must be part of a maintenance + * schedule prior to executing this operation. + * + * TODO(josephw): Allow a transition from `UP` to `DOWN`. 
+ */ +class StartMaintenance : public Operation +{ +public: + explicit StartMaintenance( + const MachineIDs& _ids); + +protected: + Try<bool> perform( + Registry* registry, + hashset<SlaveID>* slaveIDs, + bool strict); + +private: + hashset<MachineID> ids; +}; + + namespace validation { /** http://git-wip-us.apache.org/repos/asf/mesos/blob/de231ed5/src/master/master.cpp ---------------------------------------------------------------------- diff --git a/src/master/master.cpp b/src/master/master.cpp index ea556f9..06e283d 100644 --- a/src/master/master.cpp +++ b/src/master/master.cpp @@ -821,6 +821,12 @@ void Master::initialize() Http::log(request); return http.maintenanceSchedule(request); }); + route("/machine/down", + Http::MACHINE_DOWN_HELP, + [http](const process::http::Request& request) { + Http::log(request); + return http.machineDown(request); + }); // Provide HTTP assets from a "webui" directory. This is either // specified via flags (which is necessary for running out of the http://git-wip-us.apache.org/repos/asf/mesos/blob/de231ed5/src/master/master.hpp ---------------------------------------------------------------------- diff --git a/src/master/master.hpp b/src/master/master.hpp index 175e623..c7e96db 100644 --- a/src/master/master.hpp +++ b/src/master/master.hpp @@ -836,6 +836,10 @@ private: process::Future<process::http::Response> maintenanceSchedule( const process::http::Request& request) const; + // /master/machine/down + process::Future<process::http::Response> machineDown( + const process::http::Request& request) const; + const static std::string SCHEDULER_HELP; const static std::string HEALTH_HELP; const static std::string OBSERVE_HELP; @@ -847,6 +851,7 @@ private: const static std::string STATESUMMARY_HELP; const static std::string TASKS_HELP; const static std::string MAINTENANCE_SCHEDULE_HELP; + const static std::string MACHINE_DOWN_HELP; private: // Helper for doing authentication, returns the credential used if 
http://git-wip-us.apache.org/repos/asf/mesos/blob/de231ed5/src/tests/master_maintenance_tests.cpp ---------------------------------------------------------------------- diff --git a/src/tests/master_maintenance_tests.cpp b/src/tests/master_maintenance_tests.cpp index 1258ecc..67301cc 100644 --- a/src/tests/master_maintenance_tests.cpp +++ b/src/tests/master_maintenance_tests.cpp @@ -54,6 +54,7 @@ using process::http::BadRequest; using process::http::OK; using process::http::Response; +using mesos::internal::protobuf::maintenance::createMachineList; using mesos::internal::protobuf::maintenance::createSchedule; using mesos::internal::protobuf::maintenance::createUnavailability; using mesos::internal::protobuf::maintenance::createWindow; @@ -234,6 +235,100 @@ TEST_F(MasterMaintenanceTest, UpdateSchedule) AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response); } + +// Posts valid and invalid machines to the maintenance start endpoint. +TEST_F(MasterMaintenanceTest, BringDownMachines) +{ + // Set up a master. + Try<PID<Master>> master = StartMaster(); + ASSERT_SOME(master); + + // Extra machine used in this test. + // It isn't filled in, so it's incorrect. + MachineID badMachine; + + // Try to start maintenance on an unscheduled machine. + MachineIDs machines = createMachineList({machine1, machine2}); + Future<Response> response = process::http::post( + master.get(), + "machine/down", + headers, + stringify(JSON::Protobuf(machines))); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(BadRequest().status, response); + + // Try an empty list. + machines = createMachineList({}); + response = process::http::post( + master.get(), + "machine/down", + headers, + stringify(JSON::Protobuf(machines))); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(BadRequest().status, response); + + // Try an empty machine. 
+ machines = createMachineList({badMachine}); + response = process::http::post( + master.get(), + "machine/down", + headers, + stringify(JSON::Protobuf(machines))); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(BadRequest().status, response); + + // Post a valid schedule with two machines. + maintenance::Schedule schedule = createSchedule( + {createWindow({machine1, machine2}, unavailability)}); + + response = process::http::post( + master.get(), + "maintenance/schedule", + headers, + stringify(JSON::Protobuf(schedule))); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response); + + // Down machine1. + machines = createMachineList({machine1}); + response = process::http::post( + master.get(), + "machine/down", + headers, + stringify(JSON::Protobuf(machines))); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response); + + // Fail to down machine1 again. + response = process::http::post( + master.get(), + "machine/down", + headers, + stringify(JSON::Protobuf(machines))); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(BadRequest().status, response); + + // Fail to down machine1 and machine2. + machines = createMachineList({machine1, machine2}); + response = process::http::post( + master.get(), + "machine/down", + headers, + stringify(JSON::Protobuf(machines))); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(BadRequest().status, response); + + // Down machine2. 
+ machines = createMachineList({machine2}); + response = process::http::post( + master.get(), + "machine/down", + headers, + stringify(JSON::Protobuf(machines))); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response); +} + } // namespace tests { } // namespace internal { } // namespace mesos { http://git-wip-us.apache.org/repos/asf/mesos/blob/de231ed5/src/tests/registrar_tests.cpp ---------------------------------------------------------------------- diff --git a/src/tests/registrar_tests.cpp b/src/tests/registrar_tests.cpp index 567934c..733a2cd 100644 --- a/src/tests/registrar_tests.cpp +++ b/src/tests/registrar_tests.cpp @@ -72,6 +72,7 @@ using std::vector; using process::Clock; +using mesos::internal::protobuf::maintenance::createMachineList; using mesos::internal::protobuf::maintenance::createSchedule; using mesos::internal::protobuf::maintenance::createUnavailability; using mesos::internal::protobuf::maintenance::createWindow; @@ -492,7 +493,7 @@ TEST_P(RegistrarTest, UpdateMaintenanceSchedule) Registrar registrar(flags, state); Future<Registry> registry = registrar.recover(master); AWAIT_READY(registry); - + EXPECT_EQ(1, registry.get().schedules().size()); EXPECT_EQ(0, registry.get().schedules(0).windows().size()); EXPECT_EQ(0, registry.get().machines().machines().size()); @@ -500,6 +501,90 @@ TEST_P(RegistrarTest, UpdateMaintenanceSchedule) } +// Creates a schedule and properly starts maintenance. +TEST_P(RegistrarTest, StartMaintenance) +{ + // Machine definitions used in this test. + MachineID machine1; + machine1.set_ip("0.0.0.1"); + + MachineID machine2; + machine2.set_hostname("2"); + + MachineID machine3; + machine3.set_hostname("3"); + machine3.set_ip("0.0.0.3"); + + Unavailability unavailability = createUnavailability(Clock::now()); + + { + // Prepare the registrar. + Registrar registrar(flags, state); + AWAIT_READY(registrar.recover(master)); + + // Schedule two machines for maintenance. 
+ maintenance::Schedule schedule = createSchedule( + {createWindow({machine1, machine2}, unavailability)}); + + AWAIT_READY(registrar.apply( + Owned<Operation>(new UpdateSchedule(schedule)))); + + // Transition machine two into `DOWN` mode. + MachineIDs machines = createMachineList({machine2}); + AWAIT_READY(registrar.apply( + Owned<Operation>(new StartMaintenance(machines)))); + } + + { + // Check that machine two is down. + Registrar registrar(flags, state); + Future<Registry> registry = registrar.recover(master); + AWAIT_READY(registry); + + EXPECT_EQ(2, registry.get().machines().machines().size()); + EXPECT_EQ( + MachineInfo::DRAINING, + registry.get().machines().machines(0).info().mode()); + + EXPECT_EQ( + MachineInfo::DOWN, + registry.get().machines().machines(1).info().mode()); + + // Schedule three machines for maintenance. + maintenance::Schedule schedule = createSchedule( + {createWindow({machine1, machine2, machine3}, unavailability)}); + + AWAIT_READY(registrar.apply( + Owned<Operation>(new UpdateSchedule(schedule)))); + + // Deactivate the two `DRAINING` machines. + MachineIDs machines = createMachineList({machine1, machine3}); + AWAIT_READY(registrar.apply( + Owned<Operation>(new StartMaintenance(machines)))); + } + + { + // Check that all machines are down. + Registrar registrar(flags, state); + Future<Registry> registry = registrar.recover(master); + AWAIT_READY(registry); + + EXPECT_EQ(3, registry.get().machines().machines().size()); + EXPECT_EQ( + MachineInfo::DOWN, + registry.get().machines().machines(0).info().mode()); + + EXPECT_EQ( + MachineInfo::DOWN, + registry.get().machines().machines(1).info().mode()); + + EXPECT_EQ( + MachineInfo::DOWN, + registry.get().machines().machines(2).info().mode()); + } +} + + TEST_P(RegistrarTest, Bootstrap) { // Run 1 readmits a slave that is not present.
