Repository: mesos Updated Branches: refs/heads/master 98557a7cf -> f66289831
Task health status change notifications The reusable health check program added in #22579 emits TaskStatus messages when the task under supervision first becomes viable (when the task passes its first health check). It also emits a message when a task changes state from healthy to unhealthy. However, the scheduler should be notified for _every_ observed change in health status. It's easy to imagine cases where the scheduler wants to wait a while before killing an unhealthy task, but still be notified of status changes so that load balancers may be updated, etc. This patch therefore causes the scheduler to also be notified when an unhealthy task becomes healthy again. Review: https://reviews.apache.org/r/23966 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f6628983 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f6628983 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f6628983 Branch: refs/heads/master Commit: f6628983165e4b0a2f44bb288ff87041f9e5e1bb Parents: 98557a7 Author: Connor Doyle <[email protected]> Authored: Tue Jul 29 15:46:45 2014 -0700 Committer: Niklas Q. Nielsen <[email protected]> Committed: Tue Jul 29 15:46:45 2014 -0700 ---------------------------------------------------------------------- src/health-check/main.cpp | 5 +- src/tests/health_check_tests.cpp | 87 +++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/f6628983/src/health-check/main.cpp ---------------------------------------------------------------------- diff --git a/src/health-check/main.cpp b/src/health-check/main.cpp index 95d881e..10d57a0 100644 --- a/src/health-check/main.cpp +++ b/src/health-check/main.cpp @@ -121,7 +121,10 @@ private: void success() { VLOG(1) << "Check passed"; - if (initializing) { + + // Send a healthy status update on the first success, + // and on the first success following failure(s). + if (initializing || consecutiveFailures > 0) { TaskHealthStatus taskHealthStatus; taskHealthStatus.set_healthy(true); taskHealthStatus.mutable_task_id()->CopyFrom(taskID); http://git-wip-us.apache.org/repos/asf/mesos/blob/f6628983/src/tests/health_check_tests.cpp ---------------------------------------------------------------------- diff --git a/src/tests/health_check_tests.cpp b/src/tests/health_check_tests.cpp index aa5b78b..6c54ea8 100644 --- a/src/tests/health_check_tests.cpp +++ b/src/tests/health_check_tests.cpp @@ -174,6 +174,93 @@ TEST_F(HealthCheckTest, HealthyTask) Shutdown(); } +// Testing health status change reporting to scheduler. +TEST_F(HealthCheckTest, HealthStatusChange) +{ + Try<PID<Master> > master = StartMaster(); + ASSERT_SOME(master); + + slave::Flags flags = CreateSlaveFlags(); + flags.isolation = "posix/cpu,posix/mem"; + + Try<MesosContainerizer*> containerizer = + MesosContainerizer::create(flags, false); + CHECK_SOME(containerizer); + + Try<PID<Slave> > slave = StartSlave(containerizer.get()); + ASSERT_SOME(slave); + + MockScheduler sched; + MesosSchedulerDriver driver( + &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL); + + EXPECT_CALL(sched, registered(&driver, _, _)); + + Future<vector<Offer> > offers; + EXPECT_CALL(sched, resourceOffers(&driver, _)) + .WillOnce(FutureArg<1>(&offers)) + .WillRepeatedly(Return()); // Ignore subsequent offers. + + driver.start(); + + AWAIT_READY(offers); + EXPECT_NE(0u, offers.get().size()); + + // Create a temporary file. + Try<string> temporaryPath = os::mktemp(); + ASSERT_SOME(temporaryPath); + string tmpPath = temporaryPath.get(); + + // This command fails every other invocation. + // For all runs i in Nat0, the following case i % 2 applies: + // + // Case 0: + // - Remove the temporary file. + // + // Case 1: + // - Attempt to remove the nonexistent temporary file. + // - Create the temporary file. + // - Exit with a non-zero status. + string alt = "rm " + tmpPath + " || (touch " + tmpPath + " && exit 1)"; + + vector<TaskInfo> tasks = populateTasks( + "sleep 20", alt, offers.get()[0], 0, 3); + + Future<TaskStatus> statusRunning; + Future<TaskStatus> statusHealth1; + Future<TaskStatus> statusHealth2; + Future<TaskStatus> statusHealth3; + + EXPECT_CALL(sched, statusUpdate(&driver, _)) + .WillOnce(FutureArg<1>(&statusRunning)) + .WillOnce(FutureArg<1>(&statusHealth1)) + .WillOnce(FutureArg<1>(&statusHealth2)) + .WillOnce(FutureArg<1>(&statusHealth3)); + + driver.launchTasks(offers.get()[0].id(), tasks); + + AWAIT_READY(statusRunning); + EXPECT_EQ(TASK_RUNNING, statusRunning.get().state()); + + AWAIT_READY(statusHealth1); + EXPECT_EQ(TASK_RUNNING, statusHealth1.get().state()); + EXPECT_TRUE(statusHealth1.get().healthy()); + + AWAIT_READY(statusHealth2); + EXPECT_EQ(TASK_RUNNING, statusHealth2.get().state()); + EXPECT_FALSE(statusHealth2.get().healthy()); + + AWAIT_READY(statusHealth3); + EXPECT_EQ(TASK_RUNNING, statusHealth3.get().state()); + EXPECT_TRUE(statusHealth3.get().healthy()); + + os::rm(tmpPath); // Clean up the temporary file. + + driver.stop(); + driver.join(); + + Shutdown(); +} // Testing killing task after number of consecutive failures. // Temporarily disabled due to MESOS-1613.
