This is an automated email from the ASF dual-hosted git repository. bschuchardt pushed a commit to branch feature/merge_geode_3780 in repository https://gitbox.apache.org/repos/asf/geode.git
commit 9975d1e10a905b040edeefa0ecb2210d1a1c1525 Author: Bruce Schuchardt <bschucha...@pivotal.io> AuthorDate: Thu Aug 15 13:55:46 2019 -0700 GEODE-3780 suspected member is never watched again after passing final check (#3917) * GEODE-3780 suspected member is never watched again after passing final check After passing a "final check" a member will be subject to suspect processing again but we weren't processing the suspect message locally. This caused JoinLeave to never be notified of the suspect so that removal could be initiated. I also noticed that a method in HealthMonitor was misnamed. It claimed to return the set of members that had failed availability checks but instead it was returning a set of members currently under suspicion. I renamed the method for clarity. * empty commit * removing getSuspectMembers - it could kick out a suspect member too easily * removing unused method and commented-out code * revising test (cherry picked from commit 8e9b04470264983d0aa1c7900f6e9be2374549d9) --- .../gms/fd/GMSHealthMonitorJUnitTest.java | 25 ++++++++++++ .../gms/membership/GMSJoinLeaveJUnitTest.java | 44 ++++++++++++++++------ .../membership/gms/fd/GMSHealthMonitor.java | 8 ++-- .../membership/gms/interfaces/HealthMonitor.java | 6 --- .../membership/gms/membership/GMSJoinLeave.java | 20 +++++----- 5 files changed, 70 insertions(+), 33 deletions(-) diff --git a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java index 2fb067f..a750fa9 100644 --- a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java +++ b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java @@ -500,6 +500,31 @@ public class GMSHealthMonitorJUnitTest { } } + @Test + public void testMemberIsExaminedAgainAfterPassingAvailabilityCheck() { + // use the test health monitor's availability check for the first round of suspect processing + // but then turn it off so that a subsequent round is performed and fails to get a heartbeat + useGMSHealthMonitorTestClass = true; + + try { + NetView v = installAView(); + + setFailureDetectionPorts(v); + + InternalDistributedMember memberToCheck = mockMembers.get(1); + + boolean retVal = gmsHealthMonitor.checkIfAvailable(memberToCheck, "Not responding", true); + assertTrue("CheckIfAvailable should have return true", retVal); + + // memberToCheck should be suspected again since it's not sending heartbeats and then + // it should fail an availability check and cause removal of the member + useGMSHealthMonitorTestClass = false; + await().untilAsserted(() -> verify(joinLeave, atLeastOnce()).remove(memberToCheck, + "Member isn't responding to heartbeat requests")); + } finally { + useGMSHealthMonitorTestClass = false; + } + } @Test public void testNeighborRemainsSameAfterSuccessfulFinalCheck() { diff --git a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java index cb605ea..2fe1a3e 100644 --- a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java +++ b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java @@ -716,12 +716,13 @@ public class GMSJoinLeaveJUnitTest { D = mockMembers[2], E = mockMembers[3]; prepareAndInstallView(C, createMemberList(A, B, C, D, E)); - when(healthMonitor.getMembersFailingAvailabilityCheck()).thenReturn(Collections.singleton(A)); LeaveRequestMessage msg = new LeaveRequestMessage(B, C, "leaving for test"); msg.setSender(C); gmsJoinLeave.processMessage(msg); + RemoveMemberMessage removeMemberMessage = new RemoveMemberMessage(B, A, "removing for test"); + removeMemberMessage.setSender(B); + gmsJoinLeave.processMessage(removeMemberMessage); assertTrue("Expected becomeCoordinator to be invoked", gmsJoinLeave.isCoordinator()); - await().until(() -> gmsJoinLeave.getView().size() == 1); } /** @@ -738,18 +739,39 @@ public class GMSJoinLeaveJUnitTest { C = mockMembers[1], D = mockMembers[2], E = mockMembers[3]; + prepareAndInstallView(C, createMemberList(A, B, C, D)); - when(healthMonitor.getMembersFailingAvailabilityCheck()).thenReturn(Collections.singleton(A)); - E.setVmViewId(1); + + // have the Messenger acknowledge all membership view messages so no-one is kicked out for + // failure to respond + when(messenger.send(isA(InstallViewMessage.class), isA(NetView.class))) + .thenAnswer((request) -> { + InstallViewMessage installViewMessage = request.getArgument(0); + for (InternalDistributedMember recipient : installViewMessage.getRecipients()) { + ViewAckMessage viewAckMessage = + new ViewAckMessage(gmsJoinLeaveMemberId, installViewMessage.getView().getViewId(), + installViewMessage.isPreparing()); + viewAckMessage.setSender(recipient); + gmsJoinLeave.processMessage(viewAckMessage); + } + return null; + }); + + E.setVmViewId(2); + + gmsJoinLeave.recordViewRequest(new LeaveRequestMessage(B, C, "removing for test")); + gmsJoinLeave.processMessage(new JoinRequestMessage(B, E, null, 1, 1)); - LeaveRequestMessage msg = new LeaveRequestMessage(B, C, "leaving for test"); - msg.setSender(C); + + RemoveMemberMessage msg = new RemoveMemberMessage(B, A, "crashed for test"); + msg.setSender(D); gmsJoinLeave.processMessage(msg); - assertTrue("Expected becomeCoordinator to be invoked", gmsJoinLeave.isCoordinator()); - await().until(() -> { - NetView preparedView = gmsJoinLeave.getPreparedView(); - return preparedView != null && preparedView.contains(E); - }); + + await().until(() -> gmsJoinLeave.isCoordinator() && gmsJoinLeave.getViewRequests().isEmpty()); + + // E should have joined and retained its view ID of 2 + await().until(() -> gmsJoinLeave.getView().contains(E)); + assertEquals(2, E.getVmViewId()); } @Test diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java index 9481e9b..2e22894 100644 --- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java +++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java @@ -1412,11 +1412,6 @@ public class GMSHealthMonitor implements HealthMonitor { return this.socketPort; } - @Override - public Collection<InternalDistributedMember> getMembersFailingAvailabilityCheck() { - return Collections.unmodifiableCollection(this.suspectedMemberIds.keySet()); - } - private void sendSuspectRequest(final List<SuspectRequest> requests) { logger.debug("Sending suspect request for members {}", requests); List<InternalDistributedMember> recipients; @@ -1437,6 +1432,7 @@ public class GMSHealthMonitor implements HealthMonitor { logger.trace("Sending suspect messages to {}", recipients); SuspectMembersMessage smm = new SuspectMembersMessage(recipients, requests); + smm.setSender(localAddress); Set<InternalDistributedMember> failedRecipients; try { failedRecipients = services.getMessenger().send(smm); @@ -1448,6 +1444,8 @@ public class GMSHealthMonitor implements HealthMonitor { if (failedRecipients != null && failedRecipients.size() > 0) { logger.trace("Unable to send suspect message to {}", failedRecipients); } + logger.trace("Processing suspect message locally"); + processMessage(smm); } private static class ConnectTimeoutTask extends TimerTask implements ConnectionWatcher { diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java index 9ea751c..e0e0fbd 100755 --- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java +++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java @@ -14,7 +14,6 @@ */ package org.apache.geode.distributed.internal.membership.gms.interfaces; -import java.util.Collection; import org.apache.geode.distributed.DistributedMember; import org.apache.geode.distributed.internal.membership.InternalDistributedMember; @@ -53,9 +52,4 @@ public interface HealthMonitor extends Service { */ int getFailureDetectionPort(); - /** - * Returns the set of members declared dead by the health monitor - */ - Collection<InternalDistributedMember> getMembersFailingAvailabilityCheck(); - } diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java index 32452c3..8600aa0 100644 --- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java +++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java @@ -45,6 +45,7 @@ import org.apache.logging.log4j.Logger; import org.apache.geode.GemFireConfigException; import org.apache.geode.SystemConnectException; +import org.apache.geode.annotations.VisibleForTesting; import org.apache.geode.distributed.DistributedMember; import org.apache.geode.distributed.DistributedSystemDisconnectedException; import org.apache.geode.distributed.Locator; @@ -612,7 +613,8 @@ public class GMSJoinLeave implements JoinLeave { } if (!isCoordinator && !isStopping && !services.getCancelCriterion().isCancelInProgress()) { - logger.info("Checking to see if I should become coordinator"); + logger.info("Checking to see if I should become coordinator. My address is {}", + localAddress); NetView check = new NetView(v, v.getViewId() + 1); check.remove(mbr); synchronized (removedMembers) { @@ -623,15 +625,10 @@ public class GMSJoinLeave implements JoinLeave { leftMembers.add(mbr); check.removeAll(leftMembers); } - Collection<InternalDistributedMember> suspectMembers = - services.getHealthMonitor().getMembersFailingAvailabilityCheck(); - check.removeAll(suspectMembers); - logger.info("View with removed and left members removed is {}", check); - if (check.getCoordinator().equals(localAddress)) { - for (InternalDistributedMember suspect : suspectMembers) { - recordViewRequest( - new RemoveMemberMessage(localAddress, suspect, "Failed availability check")); - } + DistributedMember coordinator = check.getCoordinator(); + logger.info("View with removed and left members removed is {} and coordinator would be {}", + check, coordinator); + if (coordinator.equals(localAddress)) { synchronized (viewInstallationLock) { becomeCoordinator(mbr); } @@ -727,7 +724,8 @@ public class GMSJoinLeave implements JoinLeave { } } - private void recordViewRequest(DistributionMessage request) { + @VisibleForTesting + void recordViewRequest(DistributionMessage request) { try { synchronized (viewRequests) { if (request instanceof JoinRequestMessage) {