This is an automated email from the ASF dual-hosted git repository. bschuchardt pushed a commit to branch support/1.13 in repository https://gitbox.apache.org/repos/asf/geode.git
The following commit(s) were added to refs/heads/support/1.13 by this push: new 956c9aa GEODE-8721: member that should become coordinator never detects loss of current coordinator (#5758) 956c9aa is described below commit 956c9aa66328a8bed14892f16d230f8d4f6c8105 Author: Bruce Schuchardt <bschucha...@pivotal.io> AuthorDate: Mon Nov 30 08:08:58 2020 -0800 GEODE-8721: member that should become coordinator never detects loss of current coordinator (#5758) * GEODE-8721: member that should become coordinator never detects loss of current coordinator If a server is in the process of performing an availability check on another server we shouldn't update the contact timestamp for the suspected server based on gossip from another server. Doing so will make the availability check pass and send out another gossip message that would likewise update their timestamps for the suspected server, perpetuating the notion that the suspect is still around. * added VisibleForTesting (cherry picked from commit b7afc604b9c2fafe4388dcdcf05fc7ec49c0ce86) --- .../gms/fd/GMSHealthMonitorJUnitTest.java | 51 ++++++++++++++++++++++ .../membership/gms/fd/GMSHealthMonitor.java | 19 ++++++-- 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java b/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java index f734e81..2aaf2f5 100644 --- a/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java +++ b/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java @@ -634,6 +634,29 @@ public class GMSHealthMonitorJUnitTest { } @Test + public void testFinalCheckInProgressPreemptsLivenessGossip() throws Exception { + // if a member is undergoing a final check we should not accept another member's + // gossip about the suspect being alive and should not update the contact timestamp + // because that interferes with the final check + useGMSHealthMonitorTestClass = true; + simulateHeartbeatInGMSHealthMonitorTestClass = false; + + GMSMembershipView v = installAView(); + setFailureDetectionPorts(v); + + MemberIdentifier memberToCheck = gmsHealthMonitor.getNextNeighbor(); + GMSHealthMonitorTest testMonitor = (GMSHealthMonitorTest) gmsHealthMonitor; + + // set an old contact timestamp for the suspect, tell the monitor that an availability + // check succeeded and then make sure it didn't update the timestamp for the suspect + final long timestamp = testMonitor.establishCurrentTime() - 2000; + testMonitor.setContactTimestamp(memberToCheck, timestamp); + testMonitor.addMemberInFinalCheck(memberToCheck); + testMonitor.processMessage(new FinalCheckPassedMessage(null, memberToCheck)); + assertThat(testMonitor.getContactTimestamp(memberToCheck)).isEqualTo(timestamp); + } + + @Test public void testFailedSelfCheckRemovesMemberAsSuspect() throws Exception { useGMSHealthMonitorTestClass = true; simulateHeartbeatInGMSHealthMonitorTestClass = false; @@ -1019,6 +1042,34 @@ public class GMSHealthMonitorJUnitTest { return serverSocket; } } + + /** + * when a suspect is undergoing an availability check its identifier will + * be in the membersInFinalCheck collection + */ + public void addMemberInFinalCheck(MemberIdentifier memberToCheck) { + membersInFinalCheck.add(memberToCheck); + } + + public void setContactTimestamp(MemberIdentifier memberToCheck, long timestamp) { + memberTimeStamps.put(memberToCheck, new TimeStamp(timestamp)); + } + + public long getContactTimestamp(MemberIdentifier memberIdentifier) { + return ((TimeStamp) memberTimeStamps.get(memberIdentifier)).getTime(); + } + + /** + * Establish the currentTimeStamp for the health monitor. This is the timestamp + * used in contactedBy() updates and is usually established by the Monitor thread + * in GMSHealthMonitor but is initially zero. + * + * @return the timestamp + */ + public long establishCurrentTime() { + currentTimeStamp = System.currentTimeMillis(); + return currentTimeStamp; + } } public class TrickySocket extends ServerSocket { diff --git a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java index bd95e57..2590e23 100644 --- a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java +++ b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java @@ -52,6 +52,7 @@ import java.util.stream.Collectors; import org.apache.logging.log4j.Logger; import org.jgroups.util.UUID; +import org.apache.geode.annotations.VisibleForTesting; import org.apache.geode.distributed.internal.membership.api.MemberIdentifier; import org.apache.geode.distributed.internal.membership.api.MemberStartupException; import org.apache.geode.distributed.internal.membership.api.MembershipClosedException; @@ -130,7 +131,11 @@ public class GMSHealthMonitor<ID extends MemberIdentifier> implements HealthMoni public static final long MEMBER_SUSPECT_COLLECTION_INTERVAL = Long.getLong("geode.suspect-member-collection-interval", 200); - private volatile long currentTimeStamp; + /** + * A millisecond clock reading used to mark the last time a peer made contact. + */ + @VisibleForTesting + volatile long currentTimeStamp; /** * this member's ID @@ -152,7 +157,8 @@ public class GMSHealthMonitor<ID extends MemberIdentifier> implements HealthMoni /** * Members undergoing final checks */ - private final List<ID> membersInFinalCheck = + @VisibleForTesting + final List<ID> membersInFinalCheck = Collections.synchronizedList(new ArrayList<>(30)); /** @@ -206,7 +212,7 @@ public class GMSHealthMonitor<ID extends MemberIdentifier> implements HealthMoni * /** * this class is to avoid garbage */ - private static class TimeStamp { + static class TimeStamp { private volatile long timeStamp; @@ -257,6 +263,7 @@ public class GMSHealthMonitor<ID extends MemberIdentifier> implements HealthMoni return; } + // TODO - why are we taking two clock readings and setting currentTimeStamp twice? long currentTime = System.currentTimeMillis(); // this is the start of interval to record member activity GMSHealthMonitor.this.currentTimeStamp = currentTime; @@ -1240,7 +1247,11 @@ public class GMSHealthMonitor<ID extends MemberIdentifier> implements HealthMoni if (isStopping) { return; } - contactedBy(m.getSuspect()); + // if we're currently processing a final-check for this member don't artificially update the + // timestamp of the member or the final-check will be invalid + if (!membersInFinalCheck.contains(m.getSuspect())) { + contactedBy(m.getSuspect()); + } }