This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch feature/GEODE-3780
in repository https://gitbox.apache.org/repos/asf/geode.git

commit 50bcc90c779c5d74c20b75d84ebc4522dd421caa
Author: Bruce Schuchardt <bschucha...@pivotal.io>
AuthorDate: Tue Aug 13 09:47:52 2019 -0700

    GEODE-3780 suspected member is never watched again after passing final check
    
    After passing a "final check" a member will be subject to suspect
    processing again but we weren't processing the suspect message locally.
    This caused JoinLeave to never be notified of the suspect so that
    removal could be initiated.
    
    I also noticed that a method in HealthMonitor was misnamed.  It claimed
    to return the set of members that had failed availability checks but
    instead it was returning a set of members currently under suspicion.  I
    renamed the method for clarity.
---
 .../gms/fd/GMSHealthMonitorJUnitTest.java          | 25 ++++++++++++++++++++++
 .../gms/membership/GMSJoinLeaveJUnitTest.java      |  4 ++--
 .../membership/gms/fd/GMSHealthMonitor.java        |  9 +++++---
 .../membership/gms/interfaces/HealthMonitor.java   |  4 ++--
 .../membership/gms/membership/GMSJoinLeave.java    |  2 +-
 5 files changed, 36 insertions(+), 8 deletions(-)

diff --git 
a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
 
b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
index 264ac62..59d2c0e 100644
--- 
a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
+++ 
b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
@@ -498,6 +498,31 @@ public class GMSHealthMonitorJUnitTest {
     }
   }
 
+  @Test
+  public void testMemberIsExaminedAgainAfterPassingAvailabilityCheck() {
+    // use the test health monitor's availability check for the first round of 
suspect processing
+    // but then turn it off so that a subsequent round is performed and fails 
to get a heartbeat
+    useGMSHealthMonitorTestClass = true;
+
+    try {
+      GMSMembershipView v = installAView();
+
+      setFailureDetectionPorts(v);
+
+      GMSMember memberToCheck = mockMembers.get(1);
+
+      boolean retVal = gmsHealthMonitor.checkIfAvailable(memberToCheck, "Not 
responding", true);
+      assertTrue("CheckIfAvailable should have return true", retVal);
+
+      // memberToCheck should be suspected again since it's not sending 
heartbeats and then
+      // it should fail an availability check and cause removal of the member
+      useGMSHealthMonitorTestClass = false;
+      await().untilAsserted(() -> verify(joinLeave, 
atLeastOnce()).remove(memberToCheck,
+          "Member isn't responding to heartbeat requests"));
+    } finally {
+      useGMSHealthMonitorTestClass = false;
+    }
+  }
 
   @Test
   public void testNeighborRemainsSameAfterSuccessfulFinalCheck() {
diff --git 
a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
 
b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
index 8f18895..dcefbf3 100644
--- 
a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
+++ 
b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
@@ -707,7 +707,7 @@ public class GMSJoinLeaveJUnitTest {
         D = mockMembers[2],
         E = mockMembers[3];
     prepareAndInstallView(C, createMemberList(A, B, C, D, E));
-    
when(healthMonitor.getMembersFailingAvailabilityCheck()).thenReturn(Collections.singleton(A));
+    
when(healthMonitor.getSuspectedMembers()).thenReturn(Collections.singleton(A));
     LeaveRequestMessage msg = new LeaveRequestMessage(B, C, "leaving for 
test");
     msg.setSender(C);
     gmsJoinLeave.processMessage(msg);
@@ -730,7 +730,7 @@ public class GMSJoinLeaveJUnitTest {
         D = mockMembers[2],
         E = mockMembers[3];
     prepareAndInstallView(C, createMemberList(A, B, C, D));
-    
when(healthMonitor.getMembersFailingAvailabilityCheck()).thenReturn(Collections.singleton(A));
+    
when(healthMonitor.getSuspectedMembers()).thenReturn(Collections.singleton(A));
     E.setVmViewId(1);
     gmsJoinLeave.processMessage(new JoinRequestMessage(B, E, null, 1, 1));
     LeaveRequestMessage msg = new LeaveRequestMessage(B, C, "leaving for 
test");
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index d085cd2..36827e3 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -1414,7 +1414,7 @@ public class GMSHealthMonitor implements HealthMonitor {
   }
 
   @Override
-  public Collection<GMSMember> getMembersFailingAvailabilityCheck() {
+  public Collection<GMSMember> getSuspectedMembers() {
     return 
Collections.unmodifiableCollection(this.suspectedMemberIds.keySet());
   }
 
@@ -1436,8 +1436,9 @@ public class GMSHealthMonitor implements HealthMonitor {
       recipients = currentView.getMembers();
     }
 
-    logger.trace("Sending suspect messages to {}", recipients);
+    logger.debug("Sending suspect messages to {}", recipients);
     SuspectMembersMessage smm = new SuspectMembersMessage(recipients, 
requests);
+    smm.setSender(localAddress);
     Set<GMSMember> failedRecipients;
     try {
       failedRecipients = services.getMessenger().send(smm);
@@ -1447,8 +1448,10 @@ public class GMSHealthMonitor implements HealthMonitor {
     }
 
     if (failedRecipients != null && failedRecipients.size() > 0) {
-      logger.trace("Unable to send suspect message to {}", failedRecipients);
+      logger.debug("Unable to send suspect message to {}", failedRecipients);
     }
+    logger.debug("Processing suspect message locally");
+    processMessage(smm);
   }
 
   private static class ConnectTimeoutTask extends TimerTask implements 
ConnectionWatcher {
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java
index c134088..64acf0c 100755
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java
@@ -53,8 +53,8 @@ public interface HealthMonitor extends Service {
   int getFailureDetectionPort();
 
   /**
-   * Returns the set of members declared dead by the health monitor
+   * Returns the set of members under suspicion by the health monitor
    */
-  Collection<GMSMember> getMembersFailingAvailabilityCheck();
+  Collection<GMSMember> getSuspectedMembers();
 
 }
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index 3ce099e..f9b9aac 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -621,7 +621,7 @@ public class GMSJoinLeave implements JoinLeave {
         check.removeAll(leftMembers);
       }
       Collection<GMSMember> suspectMembers =
-          services.getHealthMonitor().getMembersFailingAvailabilityCheck();
+          services.getHealthMonitor().getSuspectedMembers();
       check.removeAll(suspectMembers);
       logger.info("View with removed and left members removed is {}", check);
       if (check.getCoordinator().equals(localAddress)) {

Reply via email to