Hi Ben, Sorry about that; I am on it with Connor and followed up with a couple of questions in the ticket.
Niklas On 29 July 2014 18:36, Benjamin Mahler <[email protected]> wrote: > Not sure if it's related to this commit, but seems the GracePeriod test is > flaky now: > > https://issues.apache.org/jira/browse/MESOS-1653 > > Could you help triage this ticket? > > > On Tue, Jul 29, 2014 at 3:47 PM, <[email protected]> wrote: > > > Repository: mesos > > Updated Branches: > > refs/heads/master 98557a7cf -> f66289831 > > > > > > Task health status change notifications > > > > The reusable health check program added in #22579 emits TaskStatus > > messages when the task under supervision first becomes viable (when the > > task passes its first health check). It also emits a message when a > > task changes state from healthy to unhealthy. > > > > However, the scheduler should be notified for _every_ observed change in > > health status. It's easy to imagine cases where the scheduler wants to > > wait a while before killing an unhealthy task, but still be notified of > > status changes so that load balancers may be updated, etc. This patch > > therefore causes the scheduler to also be notified when an unhealthy > > task becomes healthy again. > > > > Review: https://reviews.apache.org/r/23966 > > > > > > Project: http://git-wip-us.apache.org/repos/asf/mesos/repo > > Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f6628983 > > Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f6628983 > > Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f6628983 > > > > Branch: refs/heads/master > > Commit: f6628983165e4b0a2f44bb288ff87041f9e5e1bb > > Parents: 98557a7 > > Author: Connor Doyle <[email protected]> > > Authored: Tue Jul 29 15:46:45 2014 -0700 > > Committer: Niklas Q. Nielsen <[email protected]> > > Committed: Tue Jul 29 15:46:45 2014 -0700 > > > > ---------------------------------------------------------------------- > > src/health-check/main.cpp | 5 +- > > src/tests/health_check_tests.cpp | 87 > +++++++++++++++++++++++++++++++++++ > > 2 files changed, 91 insertions(+), 1 deletion(-) > > ---------------------------------------------------------------------- > > > > > > > > > http://git-wip-us.apache.org/repos/asf/mesos/blob/f6628983/src/health-check/main.cpp > > ---------------------------------------------------------------------- > > diff --git a/src/health-check/main.cpp b/src/health-check/main.cpp > > index 95d881e..10d57a0 100644 > > --- a/src/health-check/main.cpp > > +++ b/src/health-check/main.cpp > > @@ -121,7 +121,10 @@ private: > > void success() > > { > > VLOG(1) << "Check passed"; > > - if (initializing) { > > + > > + // Send a healthy status update on the first success, > > + // and on the first success following failure(s). > > + if (initializing || consecutiveFailures > 0) { > > TaskHealthStatus taskHealthStatus; > > taskHealthStatus.set_healthy(true); > > taskHealthStatus.mutable_task_id()->CopyFrom(taskID); > > > > > > > http://git-wip-us.apache.org/repos/asf/mesos/blob/f6628983/src/tests/health_check_tests.cpp > > ---------------------------------------------------------------------- > > diff --git a/src/tests/health_check_tests.cpp > > b/src/tests/health_check_tests.cpp > > index aa5b78b..6c54ea8 100644 > > --- a/src/tests/health_check_tests.cpp > > +++ b/src/tests/health_check_tests.cpp > > @@ -174,6 +174,93 @@ TEST_F(HealthCheckTest, HealthyTask) > > Shutdown(); > > } > > > > +// Testing health status change reporting to scheduler. > > +TEST_F(HealthCheckTest, HealthStatusChange) > > +{ > > + Try<PID<Master> > master = StartMaster(); > > + ASSERT_SOME(master); > > + > > + slave::Flags flags = CreateSlaveFlags(); > > + flags.isolation = "posix/cpu,posix/mem"; > > + > > + Try<MesosContainerizer*> containerizer = > > + MesosContainerizer::create(flags, false); > > + CHECK_SOME(containerizer); > > + > > + Try<PID<Slave> > slave = StartSlave(containerizer.get()); > > + ASSERT_SOME(slave); > > + > > + MockScheduler sched; > > + MesosSchedulerDriver driver( > > + &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL); > > + > > + EXPECT_CALL(sched, registered(&driver, _, _)); > > + > > + Future<vector<Offer> > offers; > > + EXPECT_CALL(sched, resourceOffers(&driver, _)) > > + .WillOnce(FutureArg<1>(&offers)) > > + .WillRepeatedly(Return()); // Ignore subsequent offers. > > + > > + driver.start(); > > + > > + AWAIT_READY(offers); > > + EXPECT_NE(0u, offers.get().size()); > > + > > + // Create a temporary file. > > + Try<string> temporaryPath = os::mktemp(); > > + ASSERT_SOME(temporaryPath); > > + string tmpPath = temporaryPath.get(); > > + > > + // This command fails every other invocation. > > + // For all runs i in Nat0, the following case i % 2 applies: > > + // > > + // Case 0: > > + // - Remove the temporary file. > > + // > > + // Case 1: > > + // - Attempt to remove the nonexistent temporary file. > > + // - Create the temporary file. > > + // - Exit with a non-zero status. > > + string alt = "rm " + tmpPath + " || (touch " + tmpPath + " && exit > 1)"; > > + > > + vector<TaskInfo> tasks = populateTasks( > > + "sleep 20", alt, offers.get()[0], 0, 3); > > + > > + Future<TaskStatus> statusRunning; > > + Future<TaskStatus> statusHealth1; > > + Future<TaskStatus> statusHealth2; > > + Future<TaskStatus> statusHealth3; > > + > > + EXPECT_CALL(sched, statusUpdate(&driver, _)) > > + .WillOnce(FutureArg<1>(&statusRunning)) > > + .WillOnce(FutureArg<1>(&statusHealth1)) > > + .WillOnce(FutureArg<1>(&statusHealth2)) > > + .WillOnce(FutureArg<1>(&statusHealth3)); > > + > > + driver.launchTasks(offers.get()[0].id(), tasks); > > + > > + AWAIT_READY(statusRunning); > > + EXPECT_EQ(TASK_RUNNING, statusRunning.get().state()); > > + > > + AWAIT_READY(statusHealth1); > > + EXPECT_EQ(TASK_RUNNING, statusHealth1.get().state()); > > + EXPECT_TRUE(statusHealth1.get().healthy()); > > + > > + AWAIT_READY(statusHealth2); > > + EXPECT_EQ(TASK_RUNNING, statusHealth2.get().state()); > > + EXPECT_FALSE(statusHealth2.get().healthy()); > > + > > + AWAIT_READY(statusHealth3); > > + EXPECT_EQ(TASK_RUNNING, statusHealth3.get().state()); > > + EXPECT_TRUE(statusHealth3.get().healthy()); > > + > > + os::rm(tmpPath); // Clean up the temporary file. > > + > > + driver.stop(); > > + driver.join(); > > + > > + Shutdown(); > > +} > > > > // Testing killing task after number of consecutive failures. > > // Temporarily disabled due to MESOS-1613. > > > > >
