This is an automated email from the ASF dual-hosted git repository.

bmahler pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
     new 288b6a3  Fixed agent reregistration and marking as unreachable race.
288b6a3 is described below

commit 288b6a38717a150538b4e402810bd43c55d512df
Author: Ilya Pronin <[email protected]>
AuthorDate: Tue Jan 12 18:09:21 2021 -0500

    Fixed agent reregistration and marking as unreachable race.
    
    During master failover if agent reregistration runs concurrently with
    marking the agent as unreachable and finishes before the MarkUnreachable
    operation is complete, the assertion that the agent is in the recovered
    set in Master::_markUnreachable() doesn't hold. The reason for this is
    because after readmitting the agent the master removes it from the
    recovered set in Master::__reregisterSlave().
    
    We can fix this by ignoring agent reregistration requests while a
    marking unreachable operation is in progress, similarly to how we do it
    for marking gone. Once the marking operation is complete, the agent will
    be able to reregister as usual.
    
    Review: https://reviews.apache.org/r/73131/
---
 src/master/master.cpp      | 10 +++++++
 src/tests/master_tests.cpp | 72 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+)

diff --git a/src/master/master.cpp b/src/master/master.cpp
index 164720a..4a601c9 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -6582,6 +6582,16 @@ void Master::reregisterSlave(
     return;
   }
 
+  if (slaves.markingUnreachable.contains(slaveInfo.id())) {
+    LOG(INFO)
+      << "Ignoring reregister agent message from agent "
+      << slaveInfo.id() << " at " << from << " ("
+      << slaveInfo.hostname() << ") as a mark unreachable operation is "
+      << "already in progress";
+
+    return;
+  }
+
   if (slaves.markingGone.contains(slaveInfo.id())) {
     LOG(INFO)
       << "Ignoring reregister agent message from agent "
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index cd0973e..3a89d7d 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -11226,6 +11226,78 @@ TEST_F(MasterTest, LostTaskCleanup) {
   EXPECT_TRUE(completedTasks.empty()) << JSON::protobuf(completedTasks);
 }
 
+
+// This regression test verifies that an agent reregistration request
+// made while the agent is being marked as unreachable after a master
+// failover is ignored until the marking is complete. See MESOS-10209.
+TEST_F(MasterTest, AgentReregistrationDuringMarkingUnreachable)
+{
+  // TODO(ipronin): We could use in-memory registry if it was injectable
+  // in StartMaster().
+  master::Flags masterFlags = CreateMasterFlags();
+  masterFlags.registry = "replicated_log";
+
+  Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
+  ASSERT_SOME(master);
+
+  StandaloneMasterDetector detector(master.get()->pid);
+
+  Future<SlaveRegisteredMessage> slaveRegisteredMessage =
+    FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _);
+
+  // Start an agent and wait for it to be registered.
+  Try<Owned<cluster::Slave>> slave = StartSlave(&detector);
+  ASSERT_SOME(slave);
+  AWAIT_READY(slaveRegisteredMessage);
+
+  // Fail over the master.
+  master->reset();
+  master = StartMaster(masterFlags);
+  ASSERT_SOME(master);
+
+  // Intercept the attempted registrar operation. It should be the
+  // operation that marks the agent as unreachable.
+  Promise<bool> promise;
+  Future<Owned<master::RegistryOperation>> mark;
+  EXPECT_CALL(*master.get()->registrar, apply(_))
+    .WillOnce(DoAll(FutureArg<0>(&mark), Return(promise.future())));
+
+  // Trigger agent reregistration timeout to begin marking the agent as
+  // unreachable.
+  Clock::pause();
+  Clock::advance(masterFlags.agent_reregister_timeout);
+  Clock::resume();
+
+  // Wait for the master to attempt to update the registry.
+  AWAIT_READY(mark);
+  EXPECT_NE(nullptr, dynamic_cast<master::MarkSlaveUnreachable*>(mark->get()));
+
+  Future<ReregisterSlaveMessage> reregisterSlaveMessage =
+    FUTURE_PROTOBUF(ReregisterSlaveMessage(), _, _);
+
+  // Detect the new master to start agent reregistration attempts. The
+  // master should drop reregistration messages because agent marking is
+  // in progress.
+  detector.appoint(master.get()->pid);
+  AWAIT_READY(reregisterSlaveMessage);
+
+  // Wait until the agent reregistration message processing is done.
+  Clock::pause();
+  Clock::settle();
+  Clock::resume();
+
+  EXPECT_CALL(*master.get()->registrar, apply(_));
+
+  Future<SlaveReregisteredMessage> slaveReregisteredMessage =
+    FUTURE_PROTOBUF(SlaveReregisteredMessage(), _, _);
+
+  // Allow the registry operation to return success. Note that we don't
+  // actually update the registry here. The agent should be able to
+  // reregister after this.
+  promise.set(true);
+  AWAIT_READY(slaveReregisteredMessage);
+}
+
 } // namespace tests {
 } // namespace internal {
 } // namespace mesos {

Reply via email to