Maintenance Primitives: Added machine UP endpoint. Endpoint: /machine/up — transitions agents back into UP mode.
Registry operation = maintenance::StopMaintenance — sets the list of machines back to UP mode. Removes those machines from the maintenance schedule. Review: https://reviews.apache.org/r/37362 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/bf4ca549 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/bf4ca549 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/bf4ca549 Branch: refs/heads/master Commit: bf4ca5497fbed88ec54fd25f962c5eb1cee3b48c Parents: de231ed Author: Joseph Wu <[email protected]> Authored: Sun Aug 30 13:56:33 2015 -0400 Committer: Joris Van Remoortere <[email protected]> Committed: Mon Aug 31 13:15:16 2015 -0400 ---------------------------------------------------------------------- src/master/http.cpp | 103 +++++++++++++++++++++++ src/master/maintenance.cpp | 55 +++++++++++++ src/master/maintenance.hpp | 23 ++++++ src/master/master.cpp | 6 ++ src/master/master.hpp | 5 ++ src/tests/master_maintenance_tests.cpp | 121 +++++++++++++++++++++++++++- src/tests/registrar_tests.cpp | 79 ++++++++++++++++++ 7 files changed, 391 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/bf4ca549/src/master/http.cpp ---------------------------------------------------------------------- diff --git a/src/master/http.cpp b/src/master/http.cpp index 11e786d..6fad959 100644 --- a/src/master/http.cpp +++ b/src/master/http.cpp @@ -93,6 +93,7 @@ using process::http::UnsupportedMediaType; using process::metrics::internal::MetricsProcess; +using std::list; using std::map; using std::string; using std::vector; @@ -1548,6 +1549,108 @@ Future<Response> Master::Http::machineDown(const Request& request) const } +// /master/machine/up endpoint help. 
+const string Master::Http::MACHINE_UP_HELP = HELP( + TLDR( + "Brings a set of machines back up."), + USAGE( + "/master/machine/up"), + DESCRIPTION( + "POST: Validates the request body as JSON and transitions", + " the list of machines into UP mode. This also removes", + " the list of machines from the maintenance schedule.")); + + +// /master/machine/up endpoint handler. +Future<Response> Master::Http::machineUp(const Request& request) const +{ + if (request.method != "POST") { + return BadRequest("Expecting POST, got '" + request.method + "'"); + } + + // Parse the POST body as JSON. + Try<JSON::Object> jsonIds = JSON::parse<JSON::Object>(request.body); + if (jsonIds.isError()) { + return BadRequest(jsonIds.error()); + } + + // Convert the machines to a protobuf. + Try<MachineIDs> protoIds = + ::protobuf::parse<MachineIDs>(jsonIds.get()); + + if (protoIds.isError()) { + return BadRequest(protoIds.error()); + } + + // Validate every machine in the list. + MachineIDs ids = protoIds.get(); + Try<Nothing> isValid = maintenance::validation::machines(ids); + if (isValid.isError()) { + return BadRequest(isValid.error()); + } + + // Check that all machines are part of a maintenance schedule. + foreach (const MachineID& id, ids.values()) { + if (!master->machineInfos.contains(id)) { + return BadRequest( + "Machine '" + id.DebugString() + + "' is not part of a maintenance schedule"); + } + + if (master->machineInfos[id].mode() != MachineInfo::DOWN) { + return BadRequest( + "Machine '" + id.DebugString() + + "' is not in DOWN mode and cannot be brought up"); + } + } + + return master->registrar->apply(Owned<Operation>( + new maintenance::StopMaintenance(ids))) + .then(defer(master->self(), [=](bool result) -> Future<Response> { + // See the top comment in "master/maintenance.hpp" for why this check + // is here, and is appropriate. + CHECK(result); + + // Update the master's local state with the reactivated machines. 
+ hashset<MachineID> updated; + foreach (const MachineID& id, ids.values()) { + master->machineInfos.erase(id); + updated.insert(id); + } + + // Delete the machines from the schedule. + for (list<mesos::maintenance::Schedule>::iterator schedule = + master->maintenance.schedules.begin(); + schedule != master->maintenance.schedules.end();) { + for (int j = schedule->windows().size() - 1; j >= 0; j--) { + mesos::maintenance::Window* window = schedule->mutable_windows(j); + + // Delete individual machines. + for (int k = window->machine_ids().size() - 1; k >= 0; k--) { + if (updated.contains(window->machine_ids(k))) { + window->mutable_machine_ids()->DeleteSubrange(k, 1); + } + } + + // If the resulting window is empty, delete it. + if (window->machine_ids().size() == 0) { + schedule->mutable_windows()->DeleteSubrange(j, 1); + } + } + + // If the resulting schedule is empty, delete it. + if (schedule->windows().size() == 0) { + schedule = master->maintenance.schedules.erase(schedule); + } else { + ++schedule; + } + } + + return OK(); + })); +} + + Result<Credential> Master::Http::authenticate(const Request& request) const { // By default, assume everyone is authenticated if no credentials http://git-wip-us.apache.org/repos/asf/mesos/blob/bf4ca549/src/master/maintenance.cpp ---------------------------------------------------------------------- diff --git a/src/master/maintenance.cpp b/src/master/maintenance.cpp index 859cef9..277dd82 100644 --- a/src/master/maintenance.cpp +++ b/src/master/maintenance.cpp @@ -142,6 +142,61 @@ Try<bool> StartMaintenance::perform( } +StopMaintenance::StopMaintenance( + const MachineIDs& _ids) +{ + foreach (const MachineID& id, _ids.values()) { + ids.insert(id); + } +} + + +Try<bool> StopMaintenance::perform( + Registry* registry, + hashset<SlaveID>* slaveIDs, + bool strict) +{ + // Delete the machine info entry of all targeted machines. + // i.e. Transition them into `UP` mode. 
+ bool changed = false; + for (int i = registry->machines().machines().size() - 1; i >= 0; i--) { + if (ids.contains(registry->machines().machines(i).info().id())) { + registry->mutable_machines()->mutable_machines()->DeleteSubrange(i, 1); + + changed = true; // Mutation. + } + } + + // Delete the machines from the schedule. + for (int i = registry->schedules().size() - 1; i >= 0; i--) { + maintenance::Schedule* schedule = registry->mutable_schedules(i); + + for (int j = schedule->windows().size() - 1; j >= 0; j--) { + maintenance::Window* window = schedule->mutable_windows(j); + + // Delete individual machines. + for (int k = window->machine_ids().size() - 1; k >= 0; k--) { + if (ids.contains(window->machine_ids(k))) { + window->mutable_machine_ids()->DeleteSubrange(k, 1); + } + } + + // If the resulting window is empty, delete it. + if (window->machine_ids().size() == 0) { + schedule->mutable_windows()->DeleteSubrange(j, 1); + } + } + + // If the resulting schedule is empty, delete it. + if (schedule->windows().size() == 0) { + registry->mutable_schedules()->DeleteSubrange(i, 1); + } + } + + return changed; +} + + namespace validation { Try<Nothing> schedule( http://git-wip-us.apache.org/repos/asf/mesos/blob/bf4ca549/src/master/maintenance.hpp ---------------------------------------------------------------------- diff --git a/src/master/maintenance.hpp b/src/master/maintenance.hpp index 8e6cb9c..bebaeb2 100644 --- a/src/master/maintenance.hpp +++ b/src/master/maintenance.hpp @@ -89,6 +89,29 @@ private: }; +/** + * Transitions a group of machines from `DOWN` mode into `UP` mode. + * All machines must be in `DOWN` mode and must be part of a maintenance + * schedule prior to executing this operation. The machines will be + * removed from the maintenance schedule. 
+ */ +class StopMaintenance : public Operation +{ +public: + explicit StopMaintenance( + const MachineIDs& _ids); + +protected: + Try<bool> perform( + Registry* registry, + hashset<SlaveID>* slaveIDs, + bool strict); + +private: + hashset<MachineID> ids; +}; + + namespace validation { /** http://git-wip-us.apache.org/repos/asf/mesos/blob/bf4ca549/src/master/master.cpp ---------------------------------------------------------------------- diff --git a/src/master/master.cpp b/src/master/master.cpp index 06e283d..cd1b386 100644 --- a/src/master/master.cpp +++ b/src/master/master.cpp @@ -827,6 +827,12 @@ void Master::initialize() Http::log(request); return http.machineDown(request); }); + route("/machine/up", + Http::MACHINE_UP_HELP, + [http](const process::http::Request& request) { + Http::log(request); + return http.machineUp(request); + }); // Provide HTTP assets from a "webui" directory. This is either // specified via flags (which is necessary for running out of the http://git-wip-us.apache.org/repos/asf/mesos/blob/bf4ca549/src/master/master.hpp ---------------------------------------------------------------------- diff --git a/src/master/master.hpp b/src/master/master.hpp index c7e96db..68be718 100644 --- a/src/master/master.hpp +++ b/src/master/master.hpp @@ -840,6 +840,10 @@ private: process::Future<process::http::Response> machineDown( const process::http::Request& request) const; + // /master/machine/up + process::Future<process::http::Response> machineUp( + const process::http::Request& request) const; + const static std::string SCHEDULER_HELP; const static std::string HEALTH_HELP; const static std::string OBSERVE_HELP; @@ -852,6 +856,7 @@ private: const static std::string TASKS_HELP; const static std::string MAINTENANCE_SCHEDULE_HELP; const static std::string MACHINE_DOWN_HELP; + const static std::string MACHINE_UP_HELP; private: // Helper for doing authentication, returns the credential used if 
http://git-wip-us.apache.org/repos/asf/mesos/blob/bf4ca549/src/tests/master_maintenance_tests.cpp ---------------------------------------------------------------------- diff --git a/src/tests/master_maintenance_tests.cpp b/src/tests/master_maintenance_tests.cpp index 67301cc..c3acd7a 100644 --- a/src/tests/master_maintenance_tests.cpp +++ b/src/tests/master_maintenance_tests.cpp @@ -18,6 +18,8 @@ #include <string> +#include <mesos/maintenance/maintenance.hpp> + #include <process/clock.hpp> #include <process/future.hpp> #include <process/http.hpp> @@ -28,6 +30,7 @@ #include <stout/json.hpp> #include <stout/net.hpp> #include <stout/option.hpp> +#include <stout/protobuf.hpp> #include <stout/strings.hpp> #include <stout/stringify.hpp> #include <stout/try.hpp> @@ -325,8 +328,124 @@ TEST_F(MasterMaintenanceTest, BringDownMachines) "machine/down", headers, stringify(JSON::Protobuf(machines))); - + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response); +} + + +// Posts valid and invalid machines to the maintenance stop endpoint. +TEST_F(MasterMaintenanceTest, BringUpMachines) +{ + // Set up a master. + Try<PID<Master>> master = StartMaster(); + ASSERT_SOME(master); + + // Try to bring up an unscheduled machine. + MachineIDs machines = createMachineList({machine1, machine2}); + Future<Response> response = process::http::post( + master.get(), + "machine/up", + headers, + stringify(JSON::Protobuf(machines))); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(BadRequest().status, response); + + // Post a valid schedule with three machines. + maintenance::Schedule schedule = createSchedule({ + createWindow({machine1, machine2}, unavailability), + createWindow({machine3}, unavailability)}); + + response = process::http::post( + master.get(), + "maintenance/schedule", + headers, + stringify(JSON::Protobuf(schedule))); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response); + + // Try to bring up a non-down machine. 
+ machines = createMachineList({machine1, machine2}); + response = process::http::post( + master.get(), + "machine/up", + headers, + stringify(JSON::Protobuf(machines))); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(BadRequest().status, response); + + // Down machine3. + machines = createMachineList({machine3}); + response = process::http::post( + master.get(), + "machine/down", + headers, + stringify(JSON::Protobuf(machines))); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response); + + // Up machine3. + response = process::http::post( + master.get(), + "machine/up", + headers, + stringify(JSON::Protobuf(machines))); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response); + + // Get the maintenance schedule. + response = process::http::get( + master.get(), + "maintenance/schedule"); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response); + + // Check that only one maintenance window remains. + Try<JSON::Object> masterSchedule_ = + JSON::parse<JSON::Object>(response.get().body); + + ASSERT_SOME(masterSchedule_); + Try<mesos::maintenance::Schedule> masterSchedule = + ::protobuf::parse<mesos::maintenance::Schedule>(masterSchedule_.get()); + + ASSERT_SOME(masterSchedule); + ASSERT_EQ(1, masterSchedule.get().windows().size()); + ASSERT_EQ(2, masterSchedule.get().windows(0).machine_ids().size()); + + // Down the other machines. + machines = createMachineList({machine1, machine2}); + response = process::http::post( + master.get(), + "machine/down", + headers, + stringify(JSON::Protobuf(machines))); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response); + + // Up the other machines. + response = process::http::post( + master.get(), + "machine/up", + headers, + stringify(JSON::Protobuf(machines))); + AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response); + + // Get the maintenance schedule again. 
+ response = process::http::get( + master.get(), + "maintenance/schedule"); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response); + + // Check that the schedule is empty. + masterSchedule_ = JSON::parse<JSON::Object>(response.get().body); + + ASSERT_SOME(masterSchedule_); + masterSchedule = + ::protobuf::parse<mesos::maintenance::Schedule>(masterSchedule_.get()); + + ASSERT_SOME(masterSchedule); + ASSERT_EQ(0, masterSchedule.get().windows().size()); } } // namespace tests { http://git-wip-us.apache.org/repos/asf/mesos/blob/bf4ca549/src/tests/registrar_tests.cpp ---------------------------------------------------------------------- diff --git a/src/tests/registrar_tests.cpp b/src/tests/registrar_tests.cpp index 733a2cd..aa49c86 100644 --- a/src/tests/registrar_tests.cpp +++ b/src/tests/registrar_tests.cpp @@ -585,6 +585,85 @@ TEST_P(RegistrarTest, StartMaintenance) } +// Creates a schedule and properly starts and stops maintenance. +TEST_P(RegistrarTest, StopMaintenance) +{ + // Machine definitions used in this test. + MachineID machine1; + machine1.set_ip("0.0.0.1"); + + MachineID machine2; + machine2.set_hostname("2"); + + MachineID machine3; + machine3.set_hostname("3"); + machine3.set_ip("0.0.0.3"); + + Unavailability unavailability = createUnavailability(Clock::now()); + + { + // Prepare the registrar. + Registrar registrar(flags, state); + AWAIT_READY(registrar.recover(master)); + + // Schedule three machines for maintenance. + maintenance::Schedule schedule = createSchedule({ + createWindow({machine1, machine2}, unavailability), + createWindow({machine3}, unavailability)}); + + AWAIT_READY(registrar.apply( + Owned<Operation>(new UpdateSchedule(schedule)))); + + // Transition machine three into `DOWN` mode. + MachineIDs machines = createMachineList({machine3}); + AWAIT_READY(registrar.apply( + Owned<Operation>(new StartMaintenance(machines)))); + + // Transition machine three into `UP` mode. 
+ AWAIT_READY(registrar.apply( + Owned<Operation>(new StopMaintenance(machines)))); + } + + { + // Check that machine three and the window were removed. + Registrar registrar(flags, state); + Future<Registry> registry = registrar.recover(master); + AWAIT_READY(registry); + + EXPECT_EQ(1, registry.get().schedules().size()); + EXPECT_EQ(1, registry.get().schedules(0).windows().size()); + EXPECT_EQ(2, registry.get().schedules(0).windows(0).machine_ids().size()); + EXPECT_EQ(2, registry.get().machines().machines().size()); + EXPECT_EQ( + MachineInfo::DRAINING, + registry.get().machines().machines(0).info().mode()); + + EXPECT_EQ( + MachineInfo::DRAINING, + registry.get().machines().machines(1).info().mode()); + + // Transition machine one and two into `DOWN` mode. + MachineIDs machines = createMachineList({machine1, machine2}); + AWAIT_READY(registrar.apply( + Owned<Operation>(new StartMaintenance(machines)))); + + // Transition all machines into `UP` mode. + AWAIT_READY(registrar.apply( + Owned<Operation>(new StopMaintenance(machines)))); + } + + { + // Check that the schedule is now empty. + Registrar registrar(flags, state); + Future<Registry> registry = registrar.recover(master); + AWAIT_READY(registry); + + EXPECT_EQ(0, registry.get().schedules().size()); + EXPECT_EQ(0, registry.get().machines().machines().size()); + } +} + + TEST_P(RegistrarTest, Bootstrap) { // Run 1 readmits a slave that is not present.
