Handled agents failing health checks multiple times.

Now that we wait for the agent to be removed from the registry before
stopping the SlaveObserver, it is possible for an agent to fail health
checks multiple times if the registry operation takes longer than
`agent_ping_timeout`.

This commit updates the master logic to handle this by ignoring health
check failures while the registry operation to mark the agent
unreachable is still in progress.

Review: https://reviews.apache.org/r/51653/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/5e66002c
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/5e66002c
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/5e66002c

Branch: refs/heads/master
Commit: 5e66002cec7855055245e328c34a783a55c542f6
Parents: 0d85e7e
Author: Neil Conway <neil.con...@gmail.com>
Authored: Mon Sep 19 15:48:30 2016 -0700
Committer: Vinod Kone <vinodk...@gmail.com>
Committed: Mon Sep 19 15:48:30 2016 -0700

----------------------------------------------------------------------
 src/master/master.cpp         |  12 ++++
 src/tests/partition_tests.cpp | 124 +++++++++++++++++++++++++++++++++++++
 2 files changed, 136 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/5e66002c/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 4e0df8a..3095f8a 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -5827,6 +5827,18 @@ void Master::markUnreachable(const SlaveID& slaveId)
     return;
   }
 
+  if (slaves.markingUnreachable.contains(slaveId)) {
+    // Possible if marking the slave unreachable in the registry takes
+    // a long time. While the registry operation is in progress, the
+    // `SlaveObserver` will continue to ping the slave; if the slave
+    // fails another health check, the `SlaveObserver` will trigger
+    // another attempt to mark it unreachable.
+    LOG(WARNING) << "Not marking agent " << slaveId
+                 << " unreachable because another unreachable"
+                 << " transition is already in progress";
+    return;
+  }
+
   LOG(INFO) << "Marking agent " << *slave
             << " unreachable: health check timed out";
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/5e66002c/src/tests/partition_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/partition_tests.cpp b/src/tests/partition_tests.cpp
index 54d07b7..d2fce57 100644
--- a/src/tests/partition_tests.cpp
+++ b/src/tests/partition_tests.cpp
@@ -58,6 +58,7 @@ using process::Future;
 using process::Message;
 using process::Owned;
 using process::PID;
+using process::Promise;
 using process::Time;
 
 using process::http::OK;
@@ -67,6 +68,7 @@ using std::vector;
 
 using testing::_;
 using testing::AtMost;
+using testing::DoAll;
 using testing::Eq;
 using testing::Return;
 
@@ -1869,6 +1871,128 @@ TEST_P(PartitionTest, RegistryGcByAge)
 }
 
 
+// This test checks that the master behaves correctly if a slave fails
+// health checks twice. At present, this can only occur if the registry
+// operation to mark the slave unreachable takes so long that the
+// slave fails an additional health check in the mean time.
+TEST_P(PartitionTest, FailHealthChecksTwice)
+{
+  master::Flags masterFlags = CreateMasterFlags();
+  masterFlags.registry_strict = GetParam();
+
+  Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
+  ASSERT_SOME(master);
+
+  // Set these expectations up before we spawn the slave so that we
+  // don't miss the first PING.
+  Future<Message> ping = FUTURE_MESSAGE(
+      Eq(PingSlaveMessage().GetTypeName()), _, _);
+
+  // Drop all the PONGs to simulate slave partition.
+  DROP_PROTOBUFS(PongSlaveMessage(), _, _);
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get());
+  ASSERT_SOME(slave);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(&driver, _, _));
+
+  Future<Nothing> resourceOffers;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureSatisfy(&resourceOffers))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  driver.start();
+
+  // Need to make sure the framework AND slave have registered with
+  // master. Waiting for resource offers should accomplish both.
+  AWAIT_READY(resourceOffers);
+
+  Clock::pause();
+
+  EXPECT_CALL(sched, offerRescinded(&driver, _))
+    .Times(AtMost(1));
+
+  Future<Nothing> slaveLost;
+  EXPECT_CALL(sched, slaveLost(&driver, _))
+    .WillOnce(FutureSatisfy(&slaveLost));
+
+  // Now advance through the PINGs.
+  size_t pings = 0;
+  while (true) {
+    AWAIT_READY(ping);
+    pings++;
+    if (pings == masterFlags.max_agent_ping_timeouts) {
+      break;
+    }
+    ping = FUTURE_MESSAGE(Eq(PingSlaveMessage().GetTypeName()), _, _);
+    Clock::advance(masterFlags.agent_ping_timeout);
+  }
+
+  // The slave observer should dispatch a message to the master. We
+  // intercept the next registrar operation; this should be the
+  // registry operation marking the slave unreachable.
+  Future<Nothing> unreachableDispatch1 =
+    FUTURE_DISPATCH(master.get()->pid, &Master::markUnreachable);
+
+  Future<Owned<master::Operation>> markUnreachable;
+  Promise<bool> markUnreachableContinue;
+  EXPECT_CALL(*master.get()->registrar.get(), apply(_))
+    .WillOnce(DoAll(FutureArg<0>(&markUnreachable),
+                    Return(markUnreachableContinue.future())));
+
+  Clock::advance(masterFlags.agent_ping_timeout);
+
+  AWAIT_READY(unreachableDispatch1);
+  AWAIT_READY(markUnreachable);
+  EXPECT_NE(
+      nullptr,
+      dynamic_cast<master::MarkSlaveUnreachable*>(
+          markUnreachable.get().get()));
+
+  // Cause the slave to fail another health check. This is possible
+  // because we don't shutdown the SlaveObserver until we have marked
+  // the slave as unreachable in the registry. The second health check
+  // failure should dispatch to the master but this should NOT result
+  // in another registry operation.
+  Future<Nothing> unreachableDispatch2 =
+    FUTURE_DISPATCH(master.get()->pid, &Master::markUnreachable);
+
+  EXPECT_CALL(*master.get()->registrar.get(), apply(_))
+    .Times(0);
+
+  Clock::advance(masterFlags.agent_ping_timeout);
+
+  AWAIT_READY(unreachableDispatch2);
+
+  // Apply the registry operation, then pass the result back to the
+  // master to allow it to continue.
+  Future<bool> applyUnreachable =
+    master.get()->registrar->unmocked_apply(markUnreachable.get());
+
+  AWAIT_READY(applyUnreachable);
+  markUnreachableContinue.set(applyUnreachable.get());
+
+  AWAIT_READY(slaveLost);
+
+  JSON::Object stats = Metrics();
+  EXPECT_EQ(2, stats.values["master/slave_unreachable_scheduled"]);
+  EXPECT_EQ(2, stats.values["master/slave_unreachable_completed"]);
+  EXPECT_EQ(1, stats.values["master/slave_removals"]);
+  EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);
+  EXPECT_EQ(0, stats.values["master/slave_removals/reason_unregistered"]);
+
+  driver.stop();
+  driver.join();
+
+  Clock::resume();
+}
+
+
 class OneWayPartitionTest : public MesosTest {};
 
 

Reply via email to