This is an automated email from the ASF dual-hosted git repository.
bschuchardt pushed a commit to branch release/1.9.0
in repository https://gitbox.apache.org/repos/asf/geode.git
The following commit(s) were added to refs/heads/release/1.9.0 by this push:
new e1d30f2 GEODE-6451 CI Failure: Hang cleaning up after
ClusterConfigLocatorRestartDUnitTest.serverRestartsAfterLocatorReconnects
e1d30f2 is described below
commit e1d30f2561d4ed55e5788d40620b5c8ad38c23aa
Author: Bruce Schuchardt <[email protected]>
AuthorDate: Wed Feb 27 12:05:48 2019 -0800
GEODE-6451 CI Failure: Hang cleaning up after
ClusterConfigLocatorRestartDUnitTest.serverRestartsAfterLocatorReconnects
Removed code that scrubbed the current membership view of IDs matching a
new join request. This code is no longer needed now that we generate a
new UUID for a member when it tries to auto-reconnect. The new ID of
such a member would never match an old ID.
In practice this code was causing test failures in situations where the
auto-reconnect time was set to a small value. A member would end up
sending multiple join requests to the same coordinator. The first would
be used to allow the new member into the cluster but the second, due to
this code, would cause that member to be immediately removed from the
cluster.
The unicast receiver thread was becoming blocked if a
forced disconnect occurred during reconnect because
InternalDistributedSystem.disconnect sychronizes, for some
reason, on GemFireCacheImpl.class. This reworks that
logic to have the reconnect thread get a
SystemConnectException forcing cleanup of the reconnecting
InternalDistributedSystem in that thread.
(cherry picked from commit e10164060d6aad877b2503543b7f785a22a24f4b)
Conflicts:
geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
---
.../ClusterConfigLocatorRestartDUnitTest.java | 6 ++++
.../internal/InternalDistributedSystem.java | 8 +++++
.../distributed/internal/InternalLocator.java | 8 ++++-
.../internal/membership/gms/ServiceConfig.java | 5 +++-
.../internal/membership/gms/Services.java | 1 +
.../membership/gms/membership/GMSJoinLeave.java | 34 +++++++++++++---------
.../membership/gms/messenger/JGroupsMessenger.java | 3 +-
.../geode/internal/cache/GemFireCacheImpl.java | 15 ++++++++++
.../apache/geode/test/junit/rules/VMProvider.java | 8 ++---
9 files changed, 68 insertions(+), 20 deletions(-)
diff --git
a/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
b/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
index d750969..0ddb364 100644
---
a/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
+++
b/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
@@ -55,6 +55,11 @@ public class ClusterConfigLocatorRestartDUnitTest {
@Test
public void serverRestartsAfterLocatorReconnects() throws Exception {
IgnoredException.addIgnoredException("org.apache.geode.ForcedDisconnectException:
for testing");
+ IgnoredException.addIgnoredException("cluster configuration service not
available");
+ IgnoredException.addIgnoredException("This thread has been stalled");
+ IgnoredException
+ .addIgnoredException("member unexpectedly shut down shared, unordered
connection");
+ IgnoredException.addIgnoredException("Connection refused");
MemberVM locator0 = rule.startLocatorVM(0);
@@ -82,6 +87,7 @@ public class ClusterConfigLocatorRestartDUnitTest {
IgnoredException.addIgnoredException("This member is no longer in the
membership view");
IgnoredException.addIgnoredException("This node is no longer in the
membership view");
IgnoredException.addIgnoredException("org.apache.geode.ForcedDisconnectException:
for testing");
+ IgnoredException.addIgnoredException("Connection refused");
MemberVM locator0 = rule.startLocatorVM(0);
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
index 2850b64..2cc25d5 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
@@ -2971,6 +2971,14 @@ public class InternalDistributedSystem extends
DistributedSystem
this.attemptingToReconnect = false;
}
+ public void stopReconnectingNoDisconnect() {
+ this.reconnectCancelled = true;
+ synchronized (this.reconnectLock) {
+ this.reconnectLock.notify();
+ }
+ this.attemptingToReconnect = false;
+ }
+
/**
* Provides hook for dunit to generate and store a detailed creation stack
trace that includes the
* keys/values of DistributionConfig including security related attributes
without introducing
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
index 736da0a..9922902 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
@@ -1041,7 +1041,13 @@ public class InternalLocator extends Locator implements
ConnectListener, LogConf
setLocator(this);
}
}
- ds.waitUntilReconnected(waitTime, TimeUnit.MILLISECONDS);
+ try {
+ ds.waitUntilReconnected(waitTime, TimeUnit.MILLISECONDS);
+ } catch (CancelException e) {
+ logger.info("Attempt to reconnect failed and further attempts have
been terminated");
+ this.stoppedForReconnect = false;
+ return false;
+ }
}
InternalDistributedSystem newSystem = (InternalDistributedSystem)
ds.getReconnectedSystem();
if (newSystem != null) {
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java
index faf8ebc..0fe55f0 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java
@@ -35,7 +35,7 @@ public class ServiceConfig {
private final int[] membershipPortRange;
private final long memberTimeout;
- private final boolean isReconnecting;
+ private boolean isReconnecting;
private Integer lossThreshold;
private final Integer memberWeight;
private boolean networkPartitionDetectionEnabled;
@@ -157,4 +157,7 @@ public class ServiceConfig {
networkPartitionDetectionEnabled =
theConfig.getEnableNetworkPartitionDetection();
}
+ public void setIsReconnecting(boolean b) {
+ this.isReconnecting = false;
+ }
}
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java
index e8bc0b9..7d403ab 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java
@@ -225,6 +225,7 @@ public class Services {
}
logger.info("Stopping membership services");
this.stopping = true;
+ config.setIsReconnecting(false);
try {
this.timer.cancel();
} finally {
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index 261bb70..5ed5836 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -344,6 +344,9 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
if (attemptToJoin()) {
return true;
}
+ if (this.isStopping) {
+ break;
+ }
if (!state.possibleCoordinator.equals(localAddress)) {
state.alreadyTried.add(state.possibleCoordinator);
}
@@ -447,12 +450,13 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
return isJoined;
}
- logger.debug("received join response {}", response);
+ logger.info("received join response {}", response);
joinResponse[0] = null;
String failReason = response.getRejectionMessage();
if (failReason != null) {
if (failReason.contains("Rejecting the attempt of a member using an
older version")
- || failReason.contains("15806")) {
+ || failReason.contains("15806")
+ || failReason.contains("ForcedDisconnectException")) {
throw new SystemConnectException(failReason);
} else if (failReason.contains("Failed to find credentials")) {
throw new AuthenticationRequiredException(failReason);
@@ -1072,7 +1076,19 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
private void forceDisconnect(String reason) {
this.isStopping = true;
- services.getManager().forceDisconnect(reason);
+ if (!isJoined) {
+ logger.fatal("BRUCE: forcedDisconnect invoked. isReconnecting={}
isJoined={}",
+ services.getConfig().isReconnecting(), isJoined);
+ joinResponse[0] =
+ new JoinResponseMessage(
+ "Stopping due to ForcedDisconnectException caused by '" + reason
+ "'", -1);
+ isJoined = false;
+ synchronized (joinResponse) {
+ joinResponse.notifyAll();
+ }
+ } else {
+ services.getManager().forceDisconnect(reason);
+ }
}
private void ackView(InstallViewMessage m) {
@@ -2329,8 +2345,8 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
}
} // synchronized
if (requests != null && !requests.isEmpty()) {
- logger.info("View Creator is processing {} requests for the next
membership view",
- requests.size());
+ logger.info("View Creator is processing {} requests for the next
membership view ({})",
+ requests.size(), requests);
try {
createAndSendView(requests);
if (shutdown) {
@@ -2440,14 +2456,6 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
JoinRequestMessage jmsg = (JoinRequestMessage) msg;
mbr = jmsg.getMemberID();
int port = jmsg.getFailureDetectionPort();
- // see if an old member ID is being reused. If
- // so we'll remove it from the new view
- for (InternalDistributedMember m : oldMembers) {
- if (mbr.compareTo(m, false) == 0) {
- oldIDs.add(m);
- break;
- }
- }
if (!joinReqs.contains(mbr)) {
joinReqs.add(mbr);
joinPorts.put(mbr, port);
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
index cf526f4..ccca881 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
@@ -554,7 +554,8 @@ public class JGroupsMessenger implements Messenger {
gmsMember.setMemberWeight((byte) (services.getConfig().getMemberWeight() &
0xff));
gmsMember.setNetworkPartitionDetectionEnabled(
services.getConfig().getDistributionConfig().getEnableNetworkPartitionDetection());
-
+ logger.info("Established local address {} with net-member {}",
localAddress,
+ localAddress.getNetMember());
services.setLocalAddress(localAddress);
}
diff --git
a/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java
b/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java
index f3d1187..28dc6c8 100755
---
a/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java
+++
b/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java
@@ -2134,6 +2134,21 @@ public class GemFireCacheImpl implements InternalCache,
InternalClientCache, Has
if (isClosed()) {
return;
}
+
+ if (!keepDS && systemFailureCause == null // normal cache close
+ && (this.isReconnecting() || this.system.getReconnectedSystem() !=
null)) {
+ logger.debug(
+ "Cache is shutting down distributed system connection. "
+ + "isReconnecting={} reconnectedSystem={} keepAlive={}
keepDS={}",
+ this.isReconnecting(), system.getReconnectedSystem(), keepAlive,
keepDS);
+
+ this.system.stopReconnectingNoDisconnect();
+ if (this.system.getReconnectedSystem() != null) {
+ this.system.getReconnectedSystem().disconnect();
+ }
+ return;
+ }
+
final boolean isDebugEnabled = logger.isDebugEnabled();
synchronized (GemFireCacheImpl.class) {
diff --git
a/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java
b/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java
index de52d9e..9b50736 100644
---
a/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java
+++
b/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java
@@ -53,7 +53,7 @@ public abstract class VMProvider {
}
public void stop(boolean cleanWorkingDir) {
- getVM().invoke(() -> {
+ getVM().invoke("stop cluster elements", () -> {
// this did not clean up the files
ClusterStartupRule.stopElementInsideVM();
MemberStarterRule.disconnectDSIfAny();
@@ -66,19 +66,19 @@ public abstract class VMProvider {
}
public boolean isClient() {
- return getVM().invoke(() -> {
+ return getVM().invoke("isClient", () -> {
return ClusterStartupRule.clientCacheRule != null;
});
}
public boolean isLocator() {
- return getVM().invoke(() -> ClusterStartupRule.getLocator() != null);
+ return getVM().invoke("isLocator", () -> ClusterStartupRule.getLocator()
!= null);
}
// a server can be started without a cache server, so as long as this member
has no locator,
// it's deemed as a server
public boolean isServer() {
- return getVM().invoke(() -> ClusterStartupRule.getLocator() == null);
+ return getVM().invoke("isServer", () -> ClusterStartupRule.getLocator() ==
null);
}
public void invoke(final SerializableRunnableIF runnable) {