This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch feature/merge_geode_3780
in repository https://gitbox.apache.org/repos/asf/geode.git

commit 9975d1e10a905b040edeefa0ecb2210d1a1c1525
Author: Bruce Schuchardt <bschucha...@pivotal.io>
AuthorDate: Thu Aug 15 13:55:46 2019 -0700

    GEODE-3780 suspected member is never watched again after passing final 
check (#3917)
    
    * GEODE-3780 suspected member is never watched again after passing final 
check
    
    After passing a "final check" a member will be subject to suspect
    processing again but we weren't processing the suspect message locally.
    This caused JoinLeave to never be notified of the suspect so that
    removal could be initiated.
    
    I also noticed that a method in HealthMonitor was misnamed.  It claimed
    to return the set of members that had failed availability checks but
    instead it was returning a set of members currently under suspicion.  I
    renamed the method for clarity.
    
    * empty commit
    
    * removing getSuspectMembers - it could kick out a suspect member too easily
    
    * removing unused method and commented-out code
    
    * revising test
    
    (cherry picked from commit 8e9b04470264983d0aa1c7900f6e9be2374549d9)
---
 .../gms/fd/GMSHealthMonitorJUnitTest.java          | 25 ++++++++++++
 .../gms/membership/GMSJoinLeaveJUnitTest.java      | 44 ++++++++++++++++------
 .../membership/gms/fd/GMSHealthMonitor.java        |  8 ++--
 .../membership/gms/interfaces/HealthMonitor.java   |  6 ---
 .../membership/gms/membership/GMSJoinLeave.java    | 20 +++++-----
 5 files changed, 70 insertions(+), 33 deletions(-)

diff --git 
a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
 
b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
index 2fb067f..a750fa9 100644
--- 
a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
+++ 
b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
@@ -500,6 +500,31 @@ public class GMSHealthMonitorJUnitTest {
     }
   }
 
+  @Test
+  public void testMemberIsExaminedAgainAfterPassingAvailabilityCheck() {
+    // use the test health monitor's availability check for the first round of 
suspect processing
+    // but then turn it off so that a subsequent round is performed and fails 
to get a heartbeat
+    useGMSHealthMonitorTestClass = true;
+
+    try {
+      NetView v = installAView();
+
+      setFailureDetectionPorts(v);
+
+      InternalDistributedMember memberToCheck = mockMembers.get(1);
+
+      boolean retVal = gmsHealthMonitor.checkIfAvailable(memberToCheck, "Not 
responding", true);
+      assertTrue("CheckIfAvailable should have return true", retVal);
+
+      // memberToCheck should be suspected again since it's not sending 
heartbeats and then
+      // it should fail an availability check and cause removal of the member
+      useGMSHealthMonitorTestClass = false;
+      await().untilAsserted(() -> verify(joinLeave, 
atLeastOnce()).remove(memberToCheck,
+          "Member isn't responding to heartbeat requests"));
+    } finally {
+      useGMSHealthMonitorTestClass = false;
+    }
+  }
 
   @Test
   public void testNeighborRemainsSameAfterSuccessfulFinalCheck() {
diff --git 
a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
 
b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
index cb605ea..2fe1a3e 100644
--- 
a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
+++ 
b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
@@ -716,12 +716,13 @@ public class GMSJoinLeaveJUnitTest {
         D = mockMembers[2],
         E = mockMembers[3];
     prepareAndInstallView(C, createMemberList(A, B, C, D, E));
-    
when(healthMonitor.getMembersFailingAvailabilityCheck()).thenReturn(Collections.singleton(A));
     LeaveRequestMessage msg = new LeaveRequestMessage(B, C, "leaving for 
test");
     msg.setSender(C);
     gmsJoinLeave.processMessage(msg);
+    RemoveMemberMessage removeMemberMessage = new RemoveMemberMessage(B, A, 
"removing for test");
+    removeMemberMessage.setSender(B);
+    gmsJoinLeave.processMessage(removeMemberMessage);
     assertTrue("Expected becomeCoordinator to be invoked", 
gmsJoinLeave.isCoordinator());
-    await().until(() -> gmsJoinLeave.getView().size() == 1);
   }
 
   /**
@@ -738,18 +739,39 @@ public class GMSJoinLeaveJUnitTest {
         C = mockMembers[1],
         D = mockMembers[2],
         E = mockMembers[3];
+
     prepareAndInstallView(C, createMemberList(A, B, C, D));
-    
when(healthMonitor.getMembersFailingAvailabilityCheck()).thenReturn(Collections.singleton(A));
-    E.setVmViewId(1);
+
+    // have the Messenger acknowledge all membership view messages so no-one 
is kicked out for
+    // failure to respond
+    when(messenger.send(isA(InstallViewMessage.class), isA(NetView.class)))
+        .thenAnswer((request) -> {
+          InstallViewMessage installViewMessage = request.getArgument(0);
+          for (InternalDistributedMember recipient : 
installViewMessage.getRecipients()) {
+            ViewAckMessage viewAckMessage =
+                new ViewAckMessage(gmsJoinLeaveMemberId, 
installViewMessage.getView().getViewId(),
+                    installViewMessage.isPreparing());
+            viewAckMessage.setSender(recipient);
+            gmsJoinLeave.processMessage(viewAckMessage);
+          }
+          return null;
+        });
+
+    E.setVmViewId(2);
+
+    gmsJoinLeave.recordViewRequest(new LeaveRequestMessage(B, C, "removing for 
test"));
+
     gmsJoinLeave.processMessage(new JoinRequestMessage(B, E, null, 1, 1));
-    LeaveRequestMessage msg = new LeaveRequestMessage(B, C, "leaving for 
test");
-    msg.setSender(C);
+
+    RemoveMemberMessage msg = new RemoveMemberMessage(B, A, "crashed for 
test");
+    msg.setSender(D);
     gmsJoinLeave.processMessage(msg);
-    assertTrue("Expected becomeCoordinator to be invoked", 
gmsJoinLeave.isCoordinator());
-    await().until(() -> {
-      NetView preparedView = gmsJoinLeave.getPreparedView();
-      return preparedView != null && preparedView.contains(E);
-    });
+
+    await().until(() -> gmsJoinLeave.isCoordinator() && 
gmsJoinLeave.getViewRequests().isEmpty());
+
+    // E should have joined and retained its view ID of 2
+    await().until(() -> gmsJoinLeave.getView().contains(E));
+    assertEquals(2, E.getVmViewId());
   }
 
   @Test
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index 9481e9b..2e22894 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -1412,11 +1412,6 @@ public class GMSHealthMonitor implements HealthMonitor {
     return this.socketPort;
   }
 
-  @Override
-  public Collection<InternalDistributedMember> 
getMembersFailingAvailabilityCheck() {
-    return 
Collections.unmodifiableCollection(this.suspectedMemberIds.keySet());
-  }
-
   private void sendSuspectRequest(final List<SuspectRequest> requests) {
     logger.debug("Sending suspect request for members {}", requests);
     List<InternalDistributedMember> recipients;
@@ -1437,6 +1432,7 @@ public class GMSHealthMonitor implements HealthMonitor {
 
     logger.trace("Sending suspect messages to {}", recipients);
     SuspectMembersMessage smm = new SuspectMembersMessage(recipients, 
requests);
+    smm.setSender(localAddress);
     Set<InternalDistributedMember> failedRecipients;
     try {
       failedRecipients = services.getMessenger().send(smm);
@@ -1448,6 +1444,8 @@ public class GMSHealthMonitor implements HealthMonitor {
     if (failedRecipients != null && failedRecipients.size() > 0) {
       logger.trace("Unable to send suspect message to {}", failedRecipients);
     }
+    logger.trace("Processing suspect message locally");
+    processMessage(smm);
   }
 
   private static class ConnectTimeoutTask extends TimerTask implements 
ConnectionWatcher {
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java
index 9ea751c..e0e0fbd 100755
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java
@@ -14,7 +14,6 @@
  */
 package org.apache.geode.distributed.internal.membership.gms.interfaces;
 
-import java.util.Collection;
 
 import org.apache.geode.distributed.DistributedMember;
 import 
org.apache.geode.distributed.internal.membership.InternalDistributedMember;
@@ -53,9 +52,4 @@ public interface HealthMonitor extends Service {
    */
   int getFailureDetectionPort();
 
-  /**
-   * Returns the set of members declared dead by the health monitor
-   */
-  Collection<InternalDistributedMember> getMembersFailingAvailabilityCheck();
-
 }
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index 32452c3..8600aa0 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -45,6 +45,7 @@ import org.apache.logging.log4j.Logger;
 
 import org.apache.geode.GemFireConfigException;
 import org.apache.geode.SystemConnectException;
+import org.apache.geode.annotations.VisibleForTesting;
 import org.apache.geode.distributed.DistributedMember;
 import org.apache.geode.distributed.DistributedSystemDisconnectedException;
 import org.apache.geode.distributed.Locator;
@@ -612,7 +613,8 @@ public class GMSJoinLeave implements JoinLeave {
     }
 
     if (!isCoordinator && !isStopping && 
!services.getCancelCriterion().isCancelInProgress()) {
-      logger.info("Checking to see if I should become coordinator");
+      logger.info("Checking to see if I should become coordinator.  My address 
is {}",
+          localAddress);
       NetView check = new NetView(v, v.getViewId() + 1);
       check.remove(mbr);
       synchronized (removedMembers) {
@@ -623,15 +625,10 @@ public class GMSJoinLeave implements JoinLeave {
         leftMembers.add(mbr);
         check.removeAll(leftMembers);
       }
-      Collection<InternalDistributedMember> suspectMembers =
-          services.getHealthMonitor().getMembersFailingAvailabilityCheck();
-      check.removeAll(suspectMembers);
-      logger.info("View with removed and left members removed is {}", check);
-      if (check.getCoordinator().equals(localAddress)) {
-        for (InternalDistributedMember suspect : suspectMembers) {
-          recordViewRequest(
-              new RemoveMemberMessage(localAddress, suspect, "Failed 
availability check"));
-        }
+      DistributedMember coordinator = check.getCoordinator();
+      logger.info("View with removed and left members removed is {} and 
coordinator would be {}",
+          check, coordinator);
+      if (coordinator.equals(localAddress)) {
         synchronized (viewInstallationLock) {
           becomeCoordinator(mbr);
         }
@@ -727,7 +724,8 @@ public class GMSJoinLeave implements JoinLeave {
     }
   }
 
-  private void recordViewRequest(DistributionMessage request) {
+  @VisibleForTesting
+  void recordViewRequest(DistributionMessage request) {
     try {
       synchronized (viewRequests) {
         if (request instanceof JoinRequestMessage) {

Reply via email to