Added a test for the race between health checks and agent unregistration. This test tries to simulate a race between the master marking an agent unreachable (because it failed health checks) and the master receiving an `UnregisterSlaveMessage` for that agent.
Unfortunately, this test is a little fragile (we need to manually dispatch an event to the master process to simulate the action that would be taken by the slave observer). Review: https://reviews.apache.org/r/50707/ Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/504e05ee Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/504e05ee Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/504e05ee Branch: refs/heads/master Commit: 504e05ee60c3648a4bf2d7433f1380731db8999f Parents: 9ba79da Author: Neil Conway <[email protected]> Authored: Mon Sep 19 15:47:14 2016 -0700 Committer: Vinod Kone <[email protected]> Committed: Mon Sep 19 15:47:14 2016 -0700 ---------------------------------------------------------------------- src/tests/slave_tests.cpp | 72 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/504e05ee/src/tests/slave_tests.cpp ---------------------------------------------------------------------- diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp index 696e0f7..2e7accc 100644 --- a/src/tests/slave_tests.cpp +++ b/src/tests/slave_tests.cpp @@ -2813,6 +2813,78 @@ TEST_F(SlaveTest, CancelSlaveRemoval) } +// This test checks that the master behaves correctly when a slave +// fails health checks, but concurrently the slave unregisters from +// the master. +TEST_F(SlaveTest, HealthCheckUnregisterRace) +{ + // Start a master. + Try<Owned<cluster::Master>> master = StartMaster(); + ASSERT_SOME(master); + + // Start a slave. + Owned<MasterDetector> detector = master.get()->createDetector(); + Try<Owned<cluster::Slave>> slave = StartSlave(detector.get()); + ASSERT_SOME(slave); + + // Start a scheduler. 
+ MockScheduler sched; + MesosSchedulerDriver driver( + &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL); + + EXPECT_CALL(sched, registered(&driver, _, _)); + + Future<vector<Offer>> offers; + EXPECT_CALL(sched, resourceOffers(&driver, _)) + .WillOnce(FutureArg<1>(&offers)) + .WillRepeatedly(Return()); // Ignore subsequent offers. + + driver.start(); + + // Need to make sure the framework AND slave have registered with + // master. Waiting for resource offers should accomplish both. + AWAIT_READY(offers); + + SlaveID slaveId = offers.get()[0].slave_id(); + + EXPECT_CALL(sched, offerRescinded(&driver, _)) + .Times(1); // Expect a single offer to be rescinded. + + Future<Nothing> slaveLost; + EXPECT_CALL(sched, slaveLost(&driver, _)) + .WillOnce(FutureSatisfy(&slaveLost)); + + // Cause the slave to shutdown gracefully by sending it SIGUSR1. + // This should result in the slave sending `UnregisterSlaveMessage` + // to the master. + Future<UnregisterSlaveMessage> unregisterSlaveMessage = + FUTURE_PROTOBUF( + UnregisterSlaveMessage(), + slave.get()->pid, + master.get()->pid); + + kill(getpid(), SIGUSR1); + + AWAIT_READY(unregisterSlaveMessage); + AWAIT_READY(slaveLost); + + Clock::pause(); + Clock::settle(); + + // We now want to arrange for the agent to fail health checks. We + // can't do that directly, because the `SlaveObserver` for this + // agent has already been removed. Instead, we dispatch to the + // master's `markUnreachable` method directly. + process::dispatch(master.get()->pid, &Master::markUnreachable, slaveId); + + Clock::settle(); + Clock::resume(); + + driver.stop(); + driver.join(); +} + + // This test ensures that a killTask() can happen between runTask() // and _run() and then gets "handled properly". This means that // the task never gets started, but also does not get lost. The end
