Repository: mesos
Updated Branches:
  refs/heads/master 98557a7cf -> f66289831


Task health status change notifications

The reusable health check program added in #22579 emits TaskStatus
messages when the task under supervision first becomes viable (when the
task passes its first health check).  It also emits a message when a
task changes state from healthy to unhealthy.

However, the scheduler should be notified for _every_ observed change in
health status.  It's easy to imagine cases where the scheduler wants to
wait a while before killing an unhealthy task, but still be notified of
status changes so that load balancers may be updated, etc.  This patch
therefore causes the scheduler to also be notified when an unhealthy
task becomes healthy again.

Review: https://reviews.apache.org/r/23966


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f6628983
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f6628983
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f6628983

Branch: refs/heads/master
Commit: f6628983165e4b0a2f44bb288ff87041f9e5e1bb
Parents: 98557a7
Author: Connor Doyle <[email protected]>
Authored: Tue Jul 29 15:46:45 2014 -0700
Committer: Niklas Q. Nielsen <[email protected]>
Committed: Tue Jul 29 15:46:45 2014 -0700

----------------------------------------------------------------------
 src/health-check/main.cpp        |  5 +-
 src/tests/health_check_tests.cpp | 87 +++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/f6628983/src/health-check/main.cpp
----------------------------------------------------------------------
diff --git a/src/health-check/main.cpp b/src/health-check/main.cpp
index 95d881e..10d57a0 100644
--- a/src/health-check/main.cpp
+++ b/src/health-check/main.cpp
@@ -121,7 +121,10 @@ private:
   void success()
   {
     VLOG(1) << "Check passed";
-    if (initializing) {
+
+    // Send a healthy status update on the first success,
+    // and on the first success following failure(s).
+    if (initializing || consecutiveFailures > 0) {
       TaskHealthStatus taskHealthStatus;
       taskHealthStatus.set_healthy(true);
       taskHealthStatus.mutable_task_id()->CopyFrom(taskID);

http://git-wip-us.apache.org/repos/asf/mesos/blob/f6628983/src/tests/health_check_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/health_check_tests.cpp b/src/tests/health_check_tests.cpp
index aa5b78b..6c54ea8 100644
--- a/src/tests/health_check_tests.cpp
+++ b/src/tests/health_check_tests.cpp
@@ -174,6 +174,93 @@ TEST_F(HealthCheckTest, HealthyTask)
   Shutdown();
 }
 
+// Testing health status change reporting to scheduler.
+TEST_F(HealthCheckTest, HealthStatusChange)
+{
+  Try<PID<Master> > master = StartMaster();
+  ASSERT_SOME(master);
+
+  slave::Flags flags = CreateSlaveFlags();
+  flags.isolation = "posix/cpu,posix/mem";
+
+  Try<MesosContainerizer*> containerizer =
+    MesosContainerizer::create(flags, false);
+  CHECK_SOME(containerizer);
+
+  Try<PID<Slave> > slave = StartSlave(containerizer.get());
+  ASSERT_SOME(slave);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(&driver, _, _));
+
+  Future<vector<Offer> > offers;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  driver.start();
+
+  AWAIT_READY(offers);
+  EXPECT_NE(0u, offers.get().size());
+
+  // Create a temporary file.
+  Try<string> temporaryPath = os::mktemp();
+  ASSERT_SOME(temporaryPath);
+  string tmpPath = temporaryPath.get();
+
+  // This command fails every other invocation.
+  // For all runs i in Nat0, the following case i % 2 applies:
+  //
+  // Case 0:
+  //   - Remove the temporary file.
+  //
+  // Case 1:
+  //   - Attempt to remove the nonexistent temporary file.
+  //   - Create the temporary file.
+  //   - Exit with a non-zero status.
+  string alt = "rm " + tmpPath + " || (touch " + tmpPath + " && exit 1)";
+
+  vector<TaskInfo> tasks = populateTasks(
+      "sleep 20", alt, offers.get()[0], 0, 3);
+
+  Future<TaskStatus> statusRunning;
+  Future<TaskStatus> statusHealth1;
+  Future<TaskStatus> statusHealth2;
+  Future<TaskStatus> statusHealth3;
+
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&statusRunning))
+    .WillOnce(FutureArg<1>(&statusHealth1))
+    .WillOnce(FutureArg<1>(&statusHealth2))
+    .WillOnce(FutureArg<1>(&statusHealth3));
+
+  driver.launchTasks(offers.get()[0].id(), tasks);
+
+  AWAIT_READY(statusRunning);
+  EXPECT_EQ(TASK_RUNNING, statusRunning.get().state());
+
+  AWAIT_READY(statusHealth1);
+  EXPECT_EQ(TASK_RUNNING, statusHealth1.get().state());
+  EXPECT_TRUE(statusHealth1.get().healthy());
+
+  AWAIT_READY(statusHealth2);
+  EXPECT_EQ(TASK_RUNNING, statusHealth2.get().state());
+  EXPECT_FALSE(statusHealth2.get().healthy());
+
+  AWAIT_READY(statusHealth3);
+  EXPECT_EQ(TASK_RUNNING, statusHealth3.get().state());
+  EXPECT_TRUE(statusHealth3.get().healthy());
+
+  os::rm(tmpPath); // Clean up the temporary file.
+
+  driver.stop();
+  driver.join();
+
+  Shutdown();
+}
 
 // Testing killing task after number of consecutive failures.
 // Temporarily disabled due to MESOS-1613.

Reply via email to