Maintenance Primitives: Shut down slave when maintenance is started. Review: https://reviews.apache.org/r/37622
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/147420e3 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/147420e3 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/147420e3 Branch: refs/heads/master Commit: 147420e3e591c4b2674d3f84252066bc5d4b660c Parents: ea96190 Author: Joris Van Remoortere <[email protected]> Authored: Tue Aug 25 18:55:25 2015 -0400 Committer: Joris Van Remoortere <[email protected]> Committed: Mon Sep 14 13:58:37 2015 -0400 ---------------------------------------------------------------------- src/master/http.cpp | 31 ++++++++ src/tests/master_maintenance_tests.cpp | 114 ++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/147420e3/src/master/http.cpp ---------------------------------------------------------------------- diff --git a/src/master/http.cpp b/src/master/http.cpp index 05b590e..f7ce9aa 100644 --- a/src/master/http.cpp +++ b/src/master/http.cpp @@ -1593,6 +1593,37 @@ Future<Response> Master::Http::machineDown(const Request& request) const // is here, and is appropriate. CHECK(result); + // We currently send a `ShutdownMessage` to each slave. This terminates + // all the executors for all the frameworks running on that slave. + // We also manually remove the slave to force sending TASK_LOST updates + // for all the tasks that were running on the slave and `LostSlaveMessage` + // messages to the framework. This guards against the slave having dropped + // the `ShutdownMessage`. + foreach (const MachineID& machineId, ids.values()) { + // The machine may not be in machines. This means no slaves are + // currently registered on that machine so this is a no-op. + if (master->machines.contains(machineId)) { + // NOTE: Copies are needed because removeSlave modifies + // master->machines. 
+ foreach ( + const SlaveID& slaveId, + utils::copy(master->machines[machineId].slaves)) { + Slave* slave = master->slaves.registered.get(slaveId); + CHECK_NOTNULL(slave); + + // Tell the slave to shut down. + ShutdownMessage shutdownMessage; + shutdownMessage.set_message("Operator initiated 'Machine DOWN'"); + master->send(slave->pid, shutdownMessage); + + // Immediately remove the slave to force sending `TASK_LOST` status + // updates as well as `LostSlaveMessage` messages to the frameworks. + // See comment above. + master->removeSlave(slave, "Operator initiated 'Machine DOWN'"); + } + } + } + // Update the master's local state with the downed machines. foreach (const MachineID& id, ids.values()) { master->machines[id].info.set_mode(MachineInfo::DOWN); http://git-wip-us.apache.org/repos/asf/mesos/blob/147420e3/src/tests/master_maintenance_tests.cpp ---------------------------------------------------------------------- diff --git a/src/tests/master_maintenance_tests.cpp b/src/tests/master_maintenance_tests.cpp index 4a59389..6ae502d 100644 --- a/src/tests/master_maintenance_tests.cpp +++ b/src/tests/master_maintenance_tests.cpp @@ -564,6 +564,120 @@ TEST_F(MasterMaintenanceTest, PreV1SchedulerSupport) } +// Test ensures that slaves receive a shutdown message from the master when +// maintenance is started, and frameworks receive a task lost message. +TEST_F(MasterMaintenanceTest, EnterMaintenanceMode) +{ + Try<PID<Master>> master = StartMaster(); + ASSERT_SOME(master); + + MockExecutor exec(DEFAULT_EXECUTOR_ID); + + Try<PID<Slave>> slave = StartSlave(&exec); + ASSERT_SOME(slave); + + MockScheduler sched; + MesosSchedulerDriver driver( + &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL); + + EXPECT_CALL(sched, registered(&driver, _, _)) + .Times(1); + + // Launch a task. + EXPECT_CALL(sched, resourceOffers(&driver, _)) + .WillOnce(LaunchTasks(DEFAULT_EXECUTOR_INFO, 1, 1, 64, "*")) + .WillRepeatedly(Return()); // Ignore subsequent offers. 
+ + EXPECT_CALL(exec, registered(_, _, _, _)) + .Times(1); + + EXPECT_CALL(exec, launchTask(_, _)) + .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING)); + + EXPECT_CALL(exec, shutdown(_)) + .Times(AtMost(1)); + + EXPECT_CALL(sched, offerRescinded(&driver, _)) + .WillRepeatedly(Return()); // Ignore rescinds. + + // Collect the status updates to verify the task is running and then lost. + Future<TaskStatus> startStatus, lostStatus; + EXPECT_CALL(sched, statusUpdate(&driver, _)) + .WillOnce(FutureArg<1>(&startStatus)) + .WillOnce(FutureArg<1>(&lostStatus)); + + // Start the test. + driver.start(); + + // Wait till the task is running to schedule the maintenance. + AWAIT_READY(startStatus); + EXPECT_EQ(TASK_RUNNING, startStatus.get().state()); + + // Schedule this slave for maintenance. + MachineID machine; + machine.set_hostname(maintenanceHostname); + machine.set_ip(stringify(slave.get().address.ip)); + + // TODO(jmlvanre): Replace Time(0.0) with `Clock::now()` once JSON double + // conversion is fixed. For now using a rounded time avoids the issue. + const Time start = Time::create(0.0).get() + Seconds(60); + const Duration duration = Seconds(120); + const Unavailability unavailability = createUnavailability(start, duration); + + // Post a valid schedule with one machine. + maintenance::Schedule schedule = createSchedule( + {createWindow({machine}, unavailability)}); + + // We have a few seconds between the first set of offers and the next + // allocation of offers. This should be enough time to perform a maintenance + // schedule update. This update will also trigger the rescinding of offers + // from the scheduled slave. + Future<Response> response = + process::http::post( + master.get(), + "maintenance/schedule", + headers, + stringify(JSON::Protobuf(schedule))); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response); + + // Verify that the master forces the slave to be shut down after the + // maintenance is started. 
+ Future<ShutdownMessage> shutdownMessage = + FUTURE_PROTOBUF(ShutdownMessage(), master.get(), slave.get()); + + // Verify that the framework will be informed that the slave is lost. + Future<Nothing> slaveLost; + EXPECT_CALL(sched, slaveLost(&driver, _)) + .WillOnce(FutureSatisfy(&slaveLost)); + + // Start the maintenance. + response = + process::http::post( + master.get(), + "machine/down", + headers, + stringify(JSON::Protobuf(createMachineList({machine})))); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response); + + // Wait for the slave to be shut down. + AWAIT_READY(shutdownMessage); + + // Verify that we received a TASK_LOST. + AWAIT_READY(lostStatus); + EXPECT_EQ(TASK_LOST, lostStatus.get().state()); + + // Verify that the framework received the slave lost message. + AWAIT_READY(slaveLost); + + driver.stop(); + driver.join(); + + Shutdown(); // Must shutdown before 'containerizer' gets deallocated. +} + + // Posts valid and invalid machines to the maintenance start endpoint. TEST_F(MasterMaintenanceTest, BringDownMachines) {
