This is an automated email from the ASF dual-hosted git repository.
bschuchardt pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/geode.git
The following commit(s) were added to refs/heads/develop by this push:
new f8c69d2 GEODE-6244 Healthy member kicked out by Sick member when
final-check fails
f8c69d2 is described below
commit f8c69d2b647edf7b3e9f93446a39e381fe3b70d9
Author: Bruce Schuchardt <[email protected]>
AuthorDate: Mon Feb 4 09:40:37 2019 -0800
GEODE-6244 Healthy member kicked out by Sick member when final-check fails
The initial fix caused a problem that prevented election of a new
membership coordinator in a certain case. The case was a view
with nodes [A, B, C, D, E] where C was the coordinator. Node A had
crashed and the crash had been detected by B. Node C then left the
cluster, sending a Leave message to B. B's JoinLeave did not know about
the HealthMonitor's decision that A had crashed, so B did not become the
new coordinator.
This commit makes B's JoinLeave pay attention to the crashed-member set
in the HealthMonitor when deciding whether to become the membership
coordinator for the cluster.
---
.../gms/membership/GMSJoinLeaveJUnitTest.java | 20 ++++++++++++++++++++
.../internal/membership/gms/fd/GMSHealthMonitor.java | 12 +++++++++++-
.../membership/gms/interfaces/HealthMonitor.java | 7 +++++++
.../membership/gms/membership/GMSJoinLeave.java | 6 ++++++
4 files changed, 44 insertions(+), 1 deletion(-)
diff --git
a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
index a503809..f59f677 100644
---
a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
+++
b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
@@ -733,6 +733,26 @@ public class GMSJoinLeaveJUnitTest {
assertTrue("Expected becomeCoordinator to be invoked",
gmsJoinLeave.isCoordinator());
}
+ /**
+ * Given a view with [A, B, C, D, E] where C is coordinator, A failed availability checks and
+ * C shuts down we should see B become the coordinator.
+ */
+ @Test
+ public void testBecomeCoordinatorThroughShutdownWhenOlderMemberCrashed()
throws Exception {
+ initMocks();
+ InternalDistributedMember A = mockMembers[0],
+ B = gmsJoinLeaveMemberId,
+ C = mockMembers[1],
+ D = mockMembers[2],
+ E = mockMembers[3];
+ prepareAndInstallView(C, createMemberList(A, B, C, D, E));
+
when(healthMonitor.getMembersFailingAvailabilityCheck()).thenReturn(Collections.singleton(A));
+ LeaveRequestMessage msg = new LeaveRequestMessage(B, C, "leaving for
test");
+ msg.setSender(C);
+ gmsJoinLeave.processMessage(msg);
+ assertTrue("Expected becomeCoordinator to be invoked",
gmsJoinLeave.isCoordinator());
+ }
+
@Test
public void testBecomeCoordinatorThroughViewChange() throws Exception {
initMocks();
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index cf6e9e5..d880501 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -842,12 +842,17 @@ public class GMSHealthMonitor implements HealthMonitor,
MessageHandler {
}
InternalDistributedMember oldNeighbor = nextNeighbor;
if (oldNeighbor != newNeighbor) {
- logger.info("Failure detection is now watching {}", newNeighbor);
+ logger.info("Failure detection is now watching " + newNeighbor
+ + "; suspects are " + suspectedMemberIds);
nextNeighbor = newNeighbor;
}
}
if (nextNeighbor != null && nextNeighbor.equals(localAddress)) {
+ if (logger.isDebugEnabled()) {
+ logger.debug("Health monitor is unable to find a neighbor to watch. "
+ + "Current suspects are {}", suspectedMemberIds);
+ }
nextNeighbor = null;
}
@@ -1355,6 +1360,11 @@ public class GMSHealthMonitor implements HealthMonitor,
MessageHandler {
return this.socketPort;
}
+ @Override
+ public Collection<InternalDistributedMember>
getMembersFailingAvailabilityCheck() {
+ return
Collections.unmodifiableCollection(this.suspectedMemberIds.keySet());
+ }
+
private void sendSuspectRequest(final List<SuspectRequest> requests) {
logger.debug("Sending suspect request for members {}", requests);
List<InternalDistributedMember> recipients;
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java
index abd7a66..1975186 100755
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java
@@ -14,6 +14,8 @@
*/
package org.apache.geode.distributed.internal.membership.gms.interfaces;
+import java.util.Collection;
+
import org.apache.geode.distributed.DistributedMember;
import
org.apache.geode.distributed.internal.membership.InternalDistributedMember;
@@ -51,4 +53,9 @@ public interface HealthMonitor extends Service {
*/
int getFailureDetectionPort();
+ /**
+ * Returns the set of members declared dead by the health monitor
+ */
+ Collection<InternalDistributedMember> getMembersFailingAvailabilityCheck();
+
}
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index 6046827..75ee997 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -608,6 +608,12 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
leftMembers.add(mbr);
check.removeAll(leftMembers);
}
+ Collection<InternalDistributedMember> suspectMembers =
+ services.getHealthMonitor().getMembersFailingAvailabilityCheck();
+ check.removeAll(suspectMembers);
+ logger.info(
+ "View with removed and left members removed is {}\nremoved members:
{}\nleft members: {}\nsuspect members: {}",
+ check, removedMembers, leftMembers, suspectMembers);
if (check.getCoordinator().equals(localAddress)) {
synchronized (viewInstallationLock) {
becomeCoordinator(mbr);