Added test for race between health checks and agent unregistration.

This test tries to simulate a race between marking an agent unhealthy
and receiving an `UnregisterSlaveMessage` for that agent.

Unfortunately, this test is a little fragile (we need to manually
dispatch an event to the master process to simulate the action that
would be taken by the slave observer).

Review: https://reviews.apache.org/r/50707/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/504e05ee
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/504e05ee
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/504e05ee

Branch: refs/heads/master
Commit: 504e05ee60c3648a4bf2d7433f1380731db8999f
Parents: 9ba79da
Author: Neil Conway <neil.con...@gmail.com>
Authored: Mon Sep 19 15:47:14 2016 -0700
Committer: Vinod Kone <vinodk...@gmail.com>
Committed: Mon Sep 19 15:47:14 2016 -0700

----------------------------------------------------------------------
 src/tests/slave_tests.cpp | 72 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/504e05ee/src/tests/slave_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index 696e0f7..2e7accc 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -2813,6 +2813,78 @@ TEST_F(SlaveTest, CancelSlaveRemoval)
 }
 
 
+// This test checks that the master behaves correctly when a slave
+// fails health checks while concurrently unregistering: the slave
+// unregisters first, then `markUnreachable` is dispatched for it.
+TEST_F(SlaveTest, HealthCheckUnregisterRace)
+{
+  // Start a master.
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  // Start a slave.
+  Owned<MasterDetector> detector = master.get()->createDetector();
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get());
+  ASSERT_SOME(slave);
+
+  // Start a scheduler.
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(&driver, _, _));
+
+  Future<vector<Offer>> offers;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  driver.start();
+
+  // Need to make sure the framework AND the slave have registered
+  // with the master. Waiting for resource offers should do both.
+  AWAIT_READY(offers);
+
+  SlaveID slaveId = offers.get()[0].slave_id();
+
+  EXPECT_CALL(sched, offerRescinded(&driver, _))
+    .Times(1); // Expect a single offer to be rescinded.
+
+  Future<Nothing> slaveLost;
+  EXPECT_CALL(sched, slaveLost(&driver, _))
+    .WillOnce(FutureSatisfy(&slaveLost));
+
+  // Cause the slave to shut down gracefully by sending SIGUSR1 to
+  // this test process (`getpid()`). This should result in the slave
+  // sending `UnregisterSlaveMessage` to the master.
+  Future<UnregisterSlaveMessage> unregisterSlaveMessage =
+    FUTURE_PROTOBUF(
+        UnregisterSlaveMessage(),
+        slave.get()->pid,
+        master.get()->pid);
+
+  kill(getpid(), SIGUSR1);
+
+  AWAIT_READY(unregisterSlaveMessage);
+  AWAIT_READY(slaveLost);
+
+  Clock::pause();
+  Clock::settle();
+
+  // We now want to arrange for the agent to fail health checks. We
+  // can't do that directly, because the `SlaveObserver` for this
+  // agent has already been removed. Instead, we dispatch to the
+  // master's `markUnreachable` method, as the observer would have.
+  process::dispatch(master.get()->pid, &Master::markUnreachable, slaveId);
+
+  Clock::settle();
+  Clock::resume();
+
+  driver.stop();
+  driver.join();
+}
+
+
 // This test ensures that a killTask() can happen between runTask()
 // and _run() and then gets "handled properly". This means that
 // the task never gets started, but also does not get lost. The end

Reply via email to