Repository: mesos Updated Branches: refs/heads/master 748a60e00 -> 83b43d91a
Returned "ServiceUnavailable" for slave's /state during recovery. This will help operators hitting the /state endpoint by not returning incomplete data when slave is still recoverying. Review: https://reviews.apache.org/r/43267 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/83b43d91 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/83b43d91 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/83b43d91 Branch: refs/heads/master Commit: 83b43d91adb720f0b7d25816155f5c71240c4164 Parents: 748a60e Author: Vinod Kone <[email protected]> Authored: Fri Feb 5 12:07:20 2016 -0800 Committer: Vinod Kone <[email protected]> Committed: Mon Feb 8 16:52:27 2016 -0800 ---------------------------------------------------------------------- CHANGELOG | 9 +++++ src/slave/http.cpp | 4 ++ src/tests/slave_tests.cpp | 89 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/83b43d91/CHANGELOG ---------------------------------------------------------------------- diff --git a/CHANGELOG b/CHANGELOG index 82c1be6..fc387dc 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,12 @@ + +Release Notes - Mesos - Version 0.28.0 (WIP) +-------------------------------------------- + +** API Changes: + * [MESOS-4066] - Agent should not return partial state when a request is made + to /state endpoint during recovery. + + Release Notes - Mesos - Version 0.27.0 -------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/83b43d91/src/slave/http.cpp ---------------------------------------------------------------------- diff --git a/src/slave/http.cpp b/src/slave/http.cpp index 9167030..523e8dc 100644 --- a/src/slave/http.cpp +++ b/src/slave/http.cpp @@ -399,6 +399,10 @@ string Slave::Http::STATE_HELP() { Future<Response> Slave::Http::state(const Request& request) const { + if (slave->state == Slave::RECOVERING) { + return ServiceUnavailable("Agent has not finished recovery"); + } + JSON::Object object; object.values["version"] = MESOS_VERSION; http://git-wip-us.apache.org/repos/asf/mesos/blob/83b43d91/src/tests/slave_tests.cpp ---------------------------------------------------------------------- diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp index 2a34d54..0884ee5 100644 --- a/src/tests/slave_tests.cpp +++ b/src/tests/slave_tests.cpp @@ -75,6 +75,7 @@ using process::UPID; using process::http::OK; using process::http::Response; +using process::http::ServiceUnavailable; using std::map; using std::shared_ptr; @@ -1184,9 +1185,15 @@ TEST_F(SlaveTest, StateEndpoint) // Capture the start time deterministically. Clock::pause(); + Future<Nothing> __recover = FUTURE_DISPATCH(_, &Slave::__recover); + Try<PID<Slave>> slave = StartSlave(&containerizer, flags); ASSERT_SOME(slave); + // Ensure slave has finished recovery. + AWAIT_READY(__recover); + Clock::settle(); + Future<Response> response = process::http::get(slave.get(), "state"); AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response); @@ -1320,6 +1327,88 @@ TEST_F(SlaveTest, StateEndpoint) } +// This test checks that when a slave is in RECOVERING state it responds +// to HTTP requests for "/state" endpoint with ServiceUnavailable. +TEST_F(SlaveTest, StateEndpointUnavailableDuringRecovery) +{ + Try<PID<Master>> master = StartMaster(); + ASSERT_SOME(master); + + MockExecutor exec(DEFAULT_EXECUTOR_ID); + + TestContainerizer* containerizer1 = new TestContainerizer(&exec); + + slave::Flags flags = CreateSlaveFlags(); + + Try<PID<Slave>> slave = StartSlave(containerizer1, flags); + ASSERT_SOME(slave); + + // Launch a task so that slave has something to recover after restart. + MockScheduler sched; + + // Enable checkpointing for the framework. + FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO; + frameworkInfo.set_checkpoint(true); + + MesosSchedulerDriver driver( + &sched, frameworkInfo, master.get(), DEFAULT_CREDENTIAL); + + EXPECT_CALL(sched, registered(&driver, _, _)) + .Times(1); + + EXPECT_CALL(sched, resourceOffers(&driver, _)) + .WillOnce(LaunchTasks(DEFAULT_EXECUTOR_INFO, 1, 1, 512, "*")) + .WillRepeatedly(Return()); // Ignore subsequent offers. + + EXPECT_CALL(exec, registered(_, _, _, _)); + + EXPECT_CALL(exec, launchTask(_, _)) + .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING)); + + Future<TaskStatus> status; + EXPECT_CALL(sched, statusUpdate(&driver, _)) + .WillOnce(FutureArg<1>(&status)) + .WillRepeatedly(Return()); // Ignore subsequent updates. + + driver.start(); + + AWAIT_READY(status); + EXPECT_EQ(TASK_RUNNING, status.get().state()); + + // Need this expectation here because `TestContainerizer` doesn't do recovery + // and hence sets `MESOS_RECOVERY_TIMEOUT` as '0s' causing the executor driver + // to exit immediately after slave exit. + EXPECT_CALL(exec, shutdown(_)) + .Times(AtMost(1)); + + // Restart the slave. + Stop(slave.get()); + delete containerizer1; + + // Pause the clock to keep slave in RECOVERING state. + Clock::pause(); + + Future<Nothing> _recover = FUTURE_DISPATCH(_, &Slave::_recover); + + TestContainerizer containerizer2; + + slave = StartSlave(&containerizer2, flags); + ASSERT_SOME(slave); + + // Ensure slave has setup the route for "/state". + AWAIT_READY(_recover); + + Future<Response> response = process::http::get(slave.get(), "state"); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(ServiceUnavailable().status, response); + + driver.stop(); + driver.join(); + + Shutdown(); +} + + // This test ensures that when a slave is shutting down, it will not // try to re-register with the master. TEST_F(SlaveTest, DISABLED_TerminatingSlaveDoesNotReregister)
