This is an automated email from the ASF dual-hosted git repository. bschuchardt pushed a commit to branch feature/GEODE-6732 in repository https://gitbox.apache.org/repos/asf/geode.git
commit c65b63303d297c502d9ae52d1cace1c5dceae15f Author: Bruce Schuchardt <bschucha...@pivotal.io> AuthorDate: Wed May 22 14:42:37 2019 -0700 GEODE-6732 GMSHealthMonitor reports member is not available when self-health check fails If a self-health check fails we should give a member that's under suspicion a break and stop suspecting it for the moment. If it's really not there anymore a subsequent check will happen and that will resolve the issue. --- .../gms/fd/GMSHealthMonitorJUnitTest.java | 23 ++++++++++++++++++++++ .../membership/gms/fd/GMSHealthMonitor.java | 13 +++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java index 071b750..5dd04ab 100644 --- a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java +++ b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java @@ -50,8 +50,10 @@ import java.net.Socket; import java.net.SocketAddress; import java.net.UnknownHostException; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; import java.util.Properties; +import java.util.Set; import java.util.Timer; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.Lock; @@ -617,6 +619,25 @@ public class GMSHealthMonitorJUnitTest { assertTrue(gmsHealthMonitor.isSuspectMember(memberToCheck)); } + @Test + public void testFailedSelfCheckRemovesMemberAsSuspect() { + useGMSHealthMonitorTestClass = true; + simulateHeartbeatInGMSHealthMonitorTestClass = false; + NetView v = installAView(); + + setFailureDetectionPorts(v); + + InternalDistributedMember memberToCheck = gmsHealthMonitor.getNextNeighbor(); + gmsHealthMonitor.stopServer(); + boolean available = gmsHealthMonitor.checkIfAvailable(memberToCheck, "Not responding", false); + assertTrue(available); + verify(joinLeave, never()).remove(isA(InternalDistributedMember.class), isA(String.class)); + assertTrue(((GMSHealthMonitorTest) gmsHealthMonitor).availabilityCheckedMembers + .contains(memberToCheck)); + assertTrue(((GMSHealthMonitorTest) gmsHealthMonitor).availabilityCheckedMembers + .contains(joinLeave.getMemberID())); + } + /** * a failed availablility check should initiate suspect processing */ @@ -901,10 +922,12 @@ public class GMSHealthMonitorJUnitTest { public class GMSHealthMonitorTest extends GMSHealthMonitor { public boolean useBlockingSocket = false; + public Set<InternalDistributedMember> availabilityCheckedMembers = new HashSet<>(); @Override boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port, boolean retryIfConnectFails) { + availabilityCheckedMembers.add(suspectMember); if (useGMSHealthMonitorTestClass) { if (simulateHeartbeatInGMSHealthMonitorTestClass) { HeartbeatMessage fakeHeartbeat = new HeartbeatMessage(); diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java index 43da869..4976d5d 100644 --- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java +++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java @@ -925,6 +925,10 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler { checkExecutor.shutdown(); } + stopServer(); + } + + void stopServer() { if (serverSocketExecutor != null) { if (serverSocket != null && !serverSocket.isClosed()) { try { @@ -1321,9 +1325,12 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler { // make sure it is still suspected memberSuspected(localAddress, mbr, reason); } else { + failed = true; // if this node can survive an availability check then initiate suspicion about // the node that failed the availability check + logger.info("BRUCE: invoking self-check on {}", localAddress); if (doTCPCheckMember(localAddress, this.socketPort, false)) { + logger.info("BRUCE: self-check passed"); membersInFinalCheck.remove(mbr); // tell peers about this member and then perform another availability check memberSuspected(localAddress, mbr, reason); @@ -1335,9 +1342,13 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler { suspectMembersMessage.setSender(localAddress); logger.debug("Performing local processing on suspect request"); processSuspectMembersRequest(suspectMembersMessage); + } else { + logger.info( + "Self-check for availability failed - will not continue to suspect {} for now", + mbr); + failed = false; } } - failed = true; } else { logger.info( "Availability check failed but detected recent message traffic for suspect member "