Removed dependency on "registry_strict" in master failover.

When the master fails over, agents have `agent_reregister_timeout` to
reregister with the new master. Any agents that fail to reregister
within the timeout will be marked unreachable in the registry.
Previously, frameworks would only receive a `slaveLost` callback for
such agents if the master was running in "registry_strict" mode. This
commit changes the master to always inform frameworks about lost agents,
regardless of the "registry_strict" flag.

Review: https://reviews.apache.org/r/51955/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/905204e5
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/905204e5
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/905204e5

Branch: refs/heads/master
Commit: 905204e54748c5842f96997db41f9ff14d9246ab
Parents: ef9211f
Author: Neil Conway <neil.con...@gmail.com>
Authored: Mon Sep 19 15:49:23 2016 -0700
Committer: Vinod Kone <vinodk...@gmail.com>
Committed: Mon Sep 19 15:49:23 2016 -0700

----------------------------------------------------------------------
 src/master/master.cpp      | 27 ++++----------
 src/tests/master_tests.cpp | 81 -----------------------------------------
 2 files changed, 7 insertions(+), 101 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/905204e5/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 763c5e7..66a672f 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -1880,27 +1880,14 @@ Nothing Master::markUnreachableAfterFailover(const Registry::Slave& slave)
 
   TimeInfo unreachableTime = protobuf::getCurrentTime();
 
-  if (flags.registry_strict) {
-    slaves.markingUnreachable.insert(slave.info().id());
+  slaves.markingUnreachable.insert(slave.info().id());
 
-    registrar->apply(Owned<Operation>(
-        new MarkSlaveUnreachable(slave.info(), unreachableTime)))
-      .onAny(defer(self(),
-                   &Self::_markUnreachableAfterFailover,
-                   slave.info(),
-                   lambda::_1));
-  } else {
-    // When a non-strict registry is in use, we want to ensure the
-    // registry is used in a write-only manner. Therefore we remove
-    // the slave from the registry but we do not inform the
-    // framework.
-    const string& message =
-      "Failed to mark agent " + stringify(slave.info().id()) + " unreachable";
-
-    registrar->apply(Owned<Operation>(
-        new MarkSlaveUnreachable(slave.info(), unreachableTime)))
-      .onFailed(lambda::bind(fail, message, lambda::_1));
-  }
+  registrar->apply(Owned<Operation>(
+          new MarkSlaveUnreachable(slave.info(), unreachableTime)))
+    .onAny(defer(self(),
+                 &Self::_markUnreachableAfterFailover,
+                 slave.info(),
+                 lambda::_1));
 
   return Nothing();
 }

http://git-wip-us.apache.org/repos/asf/mesos/blob/905204e5/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 6c49ab3..a32ac12 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -1967,87 +1967,6 @@ TEST_F(MasterTest, RecoveredSlaveCanReregister)
 }
 
 
-// This test ensures that a non-strict registry is write-only by
-// inducing a slave removal during recovery. After which, we expect
-// that the framework is *not* informed, and we expect that the
-// slave can re-register successfully.
-TEST_F(MasterTest, NonStrictRegistryWriteOnly)
-{
-  // Step 1: Start a master.
-  master::Flags masterFlags = CreateMasterFlags();
-  masterFlags.registry_strict = false;
-
-  Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
-  ASSERT_SOME(master);
-
-  // Step 2: Start a slave.
-  Future<SlaveRegisteredMessage> slaveRegisteredMessage =
-    FUTURE_PROTOBUF(SlaveRegisteredMessage(), master.get()->pid, _);
-
-  // Reuse slaveFlags so both StartSlave() use the same work_dir.
-  slave::Flags slaveFlags = this->CreateSlaveFlags();
-
-  Owned<MasterDetector> detector = master.get()->createDetector();
-  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), slaveFlags);
-  ASSERT_SOME(slave);
-
-  AWAIT_READY(slaveRegisteredMessage);
-
-  // Step 3: Stop the slave while the master is down.
-  master->reset();
-  slave.get()->terminate();
-  slave->reset();
-
-  // Step 4: Restart the master.
-  master = StartMaster(masterFlags);
-  ASSERT_SOME(master);
-
-  // Step 5: Start a scheduler.
-  MockScheduler sched;
-  MesosSchedulerDriver driver(
-      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);
-
-  Future<Nothing> registered;
-  EXPECT_CALL(sched, registered(&driver, _, _))
-    .WillOnce(FutureSatisfy(&registered));
-
-  EXPECT_CALL(sched, resourceOffers(&driver, _))
-    .WillRepeatedly(Return()); // Ignore offers.
-
-  driver.start();
-
-  AWAIT_READY(registered);
-
-  // Step 6: Advance the clock and make sure the slave is not
-  // removed!
-  Future<Nothing> slaveLost;
-  EXPECT_CALL(sched, slaveLost(&driver, _))
-    .WillRepeatedly(FutureSatisfy(&slaveLost));
-
-  Clock::pause();
-  Clock::advance(masterFlags.agent_reregister_timeout);
-  Clock::settle();
-
-  ASSERT_TRUE(slaveLost.isPending());
-
-  Clock::resume();
-
-  // Step 7: Now expect the slave to be able to re-register,
-  // according to the non-strict semantics.
-  Future<SlaveReregisteredMessage> slaveReregisteredMessage =
-    FUTURE_PROTOBUF(SlaveReregisteredMessage(), master.get()->pid, _);
-
-  detector = master.get()->createDetector();
-  slave = StartSlave(detector.get(), slaveFlags);
-  ASSERT_SOME(slave);
-
-  AWAIT_READY(slaveReregisteredMessage);
-
-  driver.stop();
-  driver.join();
-}
-
-
 // This test ensures that slave removals during master recovery
 // are rate limited.
 TEST_F(MasterTest, RateLimitRecoveredSlaveRemoval)

Reply via email to