This is an automated email from the ASF dual-hosted git repository.
bschuchardt pushed a commit to branch support/1.12
in repository https://gitbox.apache.org/repos/asf/geode.git
The following commit(s) were added to refs/heads/support/1.12 by this push:
new 0f23c76 GEODE-8721: member that should become coordinator never
detects loss of current coordinator (#5758)
0f23c76 is described below
commit 0f23c7643d8410cba0badbb126946869d36f4523
Author: Bruce Schuchardt <[email protected]>
AuthorDate: Mon Nov 30 08:08:58 2020 -0800
GEODE-8721: member that should become coordinator never detects loss of
current coordinator (#5758)
* GEODE-8721: member that should become coordinator never detects loss of
current coordinator
If a server is in the process of performing an availability check on
another server we shouldn't update the contact timestamp for
the suspected server based on gossip from another server. Doing so
will make the availability check pass and send out another gossip
message that would likewise update their timestamps for the suspected
server, perpetuating the notion that the suspect is still around.
* added VisibleForTesting
(cherry picked from commit b7afc604b9c2fafe4388dcdcf05fc7ec49c0ce86)
---
.../gms/fd/GMSHealthMonitorJUnitTest.java | 51 ++++++++++++++++++++++
.../membership/gms/fd/GMSHealthMonitor.java | 19 ++++++--
2 files changed, 66 insertions(+), 4 deletions(-)
diff --git
a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
index a146f1a..e458dcf 100644
---
a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
+++
b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
@@ -671,6 +671,29 @@ public class GMSHealthMonitorJUnitTest {
}
@Test
+ public void testFinalCheckInProgressPreemptsLivenessGossip() throws
Exception {
+ // if a member is undergoing a final check we should not accept another
member's
+ // gossip about the suspect being alive and should not update the contact
timestamp
+ // because that interferes with the final check
+ useGMSHealthMonitorTestClass = true;
+ simulateHeartbeatInGMSHealthMonitorTestClass = false;
+
+ GMSMembershipView v = installAView();
+ setFailureDetectionPorts(v);
+
+ MemberIdentifier memberToCheck = gmsHealthMonitor.getNextNeighbor();
+ GMSHealthMonitorTest testMonitor = (GMSHealthMonitorTest) gmsHealthMonitor;
+
+ // set an old contact timestamp for the suspect, tell the monitor that an
availability
+ // check succeeded and then make sure it didn't update the timestamp for
the suspect
+ final long timestamp = testMonitor.establishCurrentTime() - 2000;
+ testMonitor.setContactTimestamp(memberToCheck, timestamp);
+ testMonitor.addMemberInFinalCheck(memberToCheck);
+ testMonitor.processMessage(new FinalCheckPassedMessage(null,
memberToCheck));
+
assertThat(testMonitor.getContactTimestamp(memberToCheck)).isEqualTo(timestamp);
+ }
+
+ @Test
public void testFailedSelfCheckRemovesMemberAsSuspect() throws Exception {
useGMSHealthMonitorTestClass = true;
simulateHeartbeatInGMSHealthMonitorTestClass = false;
@@ -1057,6 +1080,34 @@ public class GMSHealthMonitorJUnitTest {
return serverSocket;
}
}
+
+ /**
+ * when a suspect is undergoing an availability check its identifier will
+ * be in the membersInFinalCheck collection
+ */
+ public void addMemberInFinalCheck(MemberIdentifier memberToCheck) {
+ membersInFinalCheck.add(memberToCheck);
+ }
+
+ public void setContactTimestamp(MemberIdentifier memberToCheck, long
timestamp) {
+ memberTimeStamps.put(memberToCheck, new TimeStamp(timestamp));
+ }
+
+ public long getContactTimestamp(MemberIdentifier memberIdentifier) {
+ return ((TimeStamp) memberTimeStamps.get(memberIdentifier)).getTime();
+ }
+
+ /**
+ * Establish the currentTimeStamp for the health monitor. This is the
timestamp
+ * used in contactedBy() updates and is usually established by the Monitor
thread
+ * in GMSHealthMonitor but is initially zero.
+ *
+ * @return the timestamp
+ */
+ public long establishCurrentTime() {
+ currentTimeStamp = System.currentTimeMillis();
+ return currentTimeStamp;
+ }
}
public class TrickySocket extends ServerSocket {
diff --git
a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index 803f4de..73af605 100644
---
a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++
b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -52,6 +52,7 @@ import java.util.stream.Collectors;
import org.apache.logging.log4j.Logger;
import org.jgroups.util.UUID;
+import org.apache.geode.annotations.VisibleForTesting;
import org.apache.geode.distributed.internal.membership.api.MemberIdentifier;
import
org.apache.geode.distributed.internal.membership.api.MemberStartupException;
import
org.apache.geode.distributed.internal.membership.api.MembershipClosedException;
@@ -129,7 +130,11 @@ public class GMSHealthMonitor<ID extends MemberIdentifier>
implements HealthMoni
public static final long MEMBER_SUSPECT_COLLECTION_INTERVAL =
Long.getLong("geode.suspect-member-collection-interval", 200);
- private volatile long currentTimeStamp;
+ /**
+ * A millisecond clock reading used to mark the last time a peer made
contact.
+ */
+ @VisibleForTesting
+ volatile long currentTimeStamp;
/**
* this member's ID
@@ -151,7 +156,8 @@ public class GMSHealthMonitor<ID extends MemberIdentifier>
implements HealthMoni
/**
* Members undergoing final checks
*/
- private final List<ID> membersInFinalCheck =
+ @VisibleForTesting
+ final List<ID> membersInFinalCheck =
Collections.synchronizedList(new ArrayList<>(30));
/**
@@ -205,7 +211,7 @@ public class GMSHealthMonitor<ID extends MemberIdentifier>
implements HealthMoni
* /**
* this class is to avoid garbage
*/
- private static class TimeStamp {
+ static class TimeStamp {
private volatile long timeStamp;
@@ -256,6 +262,7 @@ public class GMSHealthMonitor<ID extends MemberIdentifier>
implements HealthMoni
return;
}
+ // TODO - why are we taking two clock readings and setting
currentTimeStamp twice?
long currentTime = System.currentTimeMillis();
// this is the start of interval to record member activity
GMSHealthMonitor.this.currentTimeStamp = currentTime;
@@ -1239,7 +1246,11 @@ public class GMSHealthMonitor<ID extends
MemberIdentifier> implements HealthMoni
if (isStopping) {
return;
}
- contactedBy(m.getSuspect());
+ // if we're currently processing a final-check for this member don't
artificially update the
+ // timestamp of the member or the final-check will be invalid
+ if (!membersInFinalCheck.contains(m.getSuspect())) {
+ contactedBy(m.getSuspect());
+ }
}