This is an automated email from the ASF dual-hosted git repository.
bschuchardt pushed a commit to branch release/1.9.0
in repository https://gitbox.apache.org/repos/asf/geode.git
The following commit(s) were added to refs/heads/release/1.9.0 by this push:
new 0fea07a GEODE-6423 availability checks sometimes immediately initiate
removal
0fea07a is described below
commit 0fea07ad0eb4cc23220e482967c2734f8835e982
Author: Bruce Schuchardt <[email protected]>
AuthorDate: Thu Apr 4 10:57:12 2019 -0700
GEODE-6423 availability checks sometimes immediately initiate removal
Do not loop in trying to form a tcp/ip connection to a suspect unless
the next step is to remove the suspect from membership. In this case
there will be another invocation of the same method that will take the
removal step next.
(cherry picked from commit 2e0a893f0587bdcec560960a6b283b5465d5897f)
---
.../gms/fd/GMSHealthMonitorJUnitTest.java | 7 +++--
.../membership/gms/fd/GMSHealthMonitor.java | 34 ++++++++++------------
2 files changed, 20 insertions(+), 21 deletions(-)
diff --git
a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
index dbbd332..6e72f8c 100644
---
a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
+++
b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
@@ -816,7 +816,7 @@ public class GMSHealthMonitorJUnitTest {
InternalDistributedMember otherMember =
createInternalDistributedMember(Version.CURRENT_ORDINAL, 0, 1, 1);
long startTime = System.currentTimeMillis();
- gmsHealthMonitor.doTCPCheckMember(otherMember, mySocket.getLocalPort());
+ gmsHealthMonitor.doTCPCheckMember(otherMember, mySocket.getLocalPort(),
true);
mySocket.close();
serverThread.interrupt();
serverThread.join(getTimeout().getValueInMS());
@@ -902,7 +902,8 @@ public class GMSHealthMonitorJUnitTest {
public boolean useBlockingSocket = false;
@Override
- boolean doTCPCheckMember(InternalDistributedMember suspectMember, int
port) {
+ boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port,
+ boolean retryIfConnectFails) {
if (useGMSHealthMonitorTestClass) {
if (simulateHeartbeatInGMSHealthMonitorTestClass) {
HeartbeatMessage fakeHeartbeat = new HeartbeatMessage();
@@ -911,7 +912,7 @@ public class GMSHealthMonitorJUnitTest {
}
return false;
}
- return super.doTCPCheckMember(suspectMember, port);
+ return super.doTCPCheckMember(suspectMember, port, retryIfConnectFails);
}
@Override
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index d1d19e7..43da869 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -511,7 +511,8 @@ public class GMSHealthMonitor implements HealthMonitor,
MessageHandler {
* @param suspectMember member that does not respond to
HeartbeatRequestMessage
* @return true if successfully exchanged PING/PONG with TCP connection,
otherwise false.
*/
- boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port) {
+ boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port,
+ boolean retryIfConnectFails) {
Socket clientSocket = null;
// make sure we try to check on the member for the contracted
memberTimeout period
// in case a timed socket.connect() returns immediately
@@ -555,7 +556,8 @@ public class GMSHealthMonitor implements HealthMonitor,
MessageHandler {
// expected
}
}
- } while (!passed && !this.isShutdown() && System.nanoTime() < giveupTime);
+ } while (retryIfConnectFails && !passed && !this.isShutdown()
+ && System.nanoTime() < giveupTime);
return passed;
}
@@ -690,7 +692,6 @@ public class GMSHealthMonitor implements HealthMonitor,
MessageHandler {
if (!ssocket.isClosed()) {
try {
ssocket.close();
- logger.info("GMSHealthMonitor server socket closed.");
} catch (IOException e) {
logger.debug("Unexpected exception", e);
}
@@ -857,7 +858,7 @@ public class GMSHealthMonitor implements HealthMonitor,
MessageHandler {
}
InternalDistributedMember oldNeighbor = nextNeighbor;
if (oldNeighbor != newNeighbor) {
- logger.info("Failure detection is now watching " + newNeighbor);
+ logger.debug("Failure detection is now watching " + newNeighbor);
nextNeighbor = newNeighbor;
}
}
@@ -928,7 +929,6 @@ public class GMSHealthMonitor implements HealthMonitor,
MessageHandler {
if (serverSocket != null && !serverSocket.isClosed()) {
try {
serverSocket.close();
- logger.info("GMSHealthMonitor server socket is closed in
stopServices().");
} catch (IOException e) {
logger.trace("Unexpected exception", e);
}
@@ -939,8 +939,6 @@ public class GMSHealthMonitor implements HealthMonitor,
MessageHandler {
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
- logger.info("GMSHealthMonitor serverSocketExecutor is "
- + (serverSocketExecutor.isTerminated() ? "terminated" : "not
terminated"));
}
}
@@ -1122,9 +1120,6 @@ public class GMSHealthMonitor implements HealthMonitor,
MessageHandler {
NetView cv = currentView;
- logger.info("Received suspect message {} with current view {}",
incomingRequest,
- cv == null ? "<no view>" : cv.getViewId());
-
if (cv == null) {
return;
}
@@ -1176,7 +1171,7 @@ public class GMSHealthMonitor implements HealthMonitor,
MessageHandler {
synchronized (suspectRequestsInView) {
recordSuspectRequests(suspectRequests, cv);
Set<SuspectRequest> suspectsInView = suspectRequestsInView.get(cv);
- logger.info("Current suspects are {}", suspectsInView);
+ logger.debug("Current suspects are {}", suspectsInView);
for (final SuspectRequest sr : suspectsInView) {
check.remove(sr.getSuspectMember());
membersToCheck.add(sr);
@@ -1189,10 +1184,10 @@ public class GMSHealthMonitor implements HealthMonitor,
MessageHandler {
}
}
if (!membersLeaving.isEmpty()) {
- logger.info("Current leave requests are {}", membersLeaving);
+ logger.debug("Current leave requests are {}", membersLeaving);
check.removeAll(membersLeaving);
}
- logger.info(
+ logger.trace(
"Proposed view with suspects & leaving members removed is {}\nwith
coordinator {}\nmy address is {}",
check,
check.getCoordinator(), localAddress);
@@ -1308,7 +1303,10 @@ public class GMSHealthMonitor implements HealthMonitor,
MessageHandler {
// if we will get heartbeat then it will change the timestamp, which
we are
// checking below in case of tcp check failure..
doCheckMember(mbr, false);
- pinged = doTCPCheckMember(mbr, port);
+ // now, while waiting for a heartbeat, try connecting to the suspect's
failure detection
+ // port
+ final boolean retryIfConnectFails = forceRemovalIfCheckFails;
+ pinged = doTCPCheckMember(mbr, port, retryIfConnectFails);
}
if (!pinged && !isStopping) {
@@ -1325,7 +1323,7 @@ public class GMSHealthMonitor implements HealthMonitor,
MessageHandler {
} else {
// if this node can survive an availability check then initiate
suspicion about
// the node that failed the availability check
- if (doTCPCheckMember(localAddress, this.socketPort)) {
+ if (doTCPCheckMember(localAddress, this.socketPort, false)) {
membersInFinalCheck.remove(mbr);
// tell peers about this member and then perform another
availability check
memberSuspected(localAddress, mbr, reason);
@@ -1335,7 +1333,7 @@ public class GMSHealthMonitor implements HealthMonitor,
MessageHandler {
Collections
.singletonList(new SuspectRequest(mbr, "failed
availability check")));
suspectMembersMessage.setSender(localAddress);
- logger.info("Performing local processing on suspect request");
+ logger.debug("Performing local processing on suspect request");
processSuspectMembersRequest(suspectMembersMessage);
}
}
@@ -1398,7 +1396,7 @@ public class GMSHealthMonitor implements HealthMonitor,
MessageHandler {
recipients = currentView.getMembers();
}
- logger.info("Sending suspect messages to {}", recipients);
+ logger.trace("Sending suspect messages to {}", recipients);
SuspectMembersMessage smm = new SuspectMembersMessage(recipients,
requests);
Set<InternalDistributedMember> failedRecipients;
try {
@@ -1409,7 +1407,7 @@ public class GMSHealthMonitor implements HealthMonitor,
MessageHandler {
}
if (failedRecipients != null && failedRecipients.size() > 0) {
- logger.info("Unable to send suspect message to {}", failedRecipients);
+ logger.trace("Unable to send suspect message to {}", failedRecipients);
}
}