This is an automated email from the ASF dual-hosted git repository.
bschuchardt pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/geode.git
The following commit(s) were added to refs/heads/develop by this push:
new e101640 GEODE-6451 CI Failure: Hang cleaning up after
ClusterConfigLocatorRestartDUnitTest.serverRestartsAfterLocatorReconnects
e101640 is described below
commit e10164060d6aad877b2503543b7f785a22a24f4b
Author: Bruce Schuchardt <[email protected]>
AuthorDate: Wed Feb 27 12:05:48 2019 -0800
GEODE-6451 CI Failure: Hang cleaning up after
ClusterConfigLocatorRestartDUnitTest.serverRestartsAfterLocatorReconnects
Removed code that scrubbed the current membership view of IDs matching a
new join request. This code is no longer needed now that we generate a
new UUID for a member when it tries to auto-reconnect. The new ID of
such a member would never match an old ID.
In practice this code was causing test failures in situations where the
auto-reconnect time was set to a small value. A member would end up
sending multiple join requests to the same coordinator. The first would
be used to allow the new member into the cluster but the second, due to
this code, would cause that member to be immediately removed from the
cluster.
The unicast receiver thread was becoming blocked if a
forced disconnect occurred during reconnect because
InternalDistributedSystem.disconnect sychronizes, for some
reason, on GemFireCacheImpl.class. This reworks that
logic to have the reconnect thread get a
SystemConnectException forcing cleanup of the reconnecting
InternalDistributedSystem in that thread.
---
.../ClusterConfigLocatorRestartDUnitTest.java | 4 +++
.../internal/InternalDistributedSystem.java | 8 +++++
.../distributed/internal/InternalLocator.java | 8 ++++-
.../internal/membership/gms/ServiceConfig.java | 5 +++-
.../internal/membership/gms/Services.java | 1 +
.../membership/gms/membership/GMSJoinLeave.java | 34 +++++++++++++---------
.../membership/gms/messenger/JGroupsMessenger.java | 3 +-
.../geode/internal/cache/GemFireCacheImpl.java | 15 ++++++++++
.../apache/geode/test/junit/rules/VMProvider.java | 8 ++---
9 files changed, 66 insertions(+), 20 deletions(-)
diff --git
a/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
b/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
index 589ba0a..0ddb364 100644
---
a/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
+++
b/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
@@ -55,8 +55,11 @@ public class ClusterConfigLocatorRestartDUnitTest {
@Test
public void serverRestartsAfterLocatorReconnects() throws Exception {
IgnoredException.addIgnoredException("org.apache.geode.ForcedDisconnectException:
for testing");
+ IgnoredException.addIgnoredException("cluster configuration service not
available");
+ IgnoredException.addIgnoredException("This thread has been stalled");
IgnoredException
.addIgnoredException("member unexpectedly shut down shared, unordered
connection");
+ IgnoredException.addIgnoredException("Connection refused");
MemberVM locator0 = rule.startLocatorVM(0);
@@ -84,6 +87,7 @@ public class ClusterConfigLocatorRestartDUnitTest {
IgnoredException.addIgnoredException("This member is no longer in the
membership view");
IgnoredException.addIgnoredException("This node is no longer in the
membership view");
IgnoredException.addIgnoredException("org.apache.geode.ForcedDisconnectException:
for testing");
+ IgnoredException.addIgnoredException("Connection refused");
MemberVM locator0 = rule.startLocatorVM(0);
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
index 82e885a..7063162 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
@@ -2983,6 +2983,14 @@ public class InternalDistributedSystem extends
DistributedSystem
this.attemptingToReconnect = false;
}
+ public void stopReconnectingNoDisconnect() {
+ this.reconnectCancelled = true;
+ synchronized (this.reconnectLock) {
+ this.reconnectLock.notify();
+ }
+ this.attemptingToReconnect = false;
+ }
+
/**
* Provides hook for dunit to generate and store a detailed creation stack
trace that includes the
* keys/values of DistributionConfig including security related attributes
without introducing
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
index 28bd0bd..75200bf 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
@@ -1032,7 +1032,13 @@ public class InternalLocator extends Locator implements
ConnectListener, LogConf
setLocator(this);
}
}
- ds.waitUntilReconnected(waitTime, TimeUnit.MILLISECONDS);
+ try {
+ ds.waitUntilReconnected(waitTime, TimeUnit.MILLISECONDS);
+ } catch (CancelException e) {
+ logger.info("Attempt to reconnect failed and further attempts have
been terminated");
+ this.stoppedForReconnect = false;
+ return false;
+ }
}
InternalDistributedSystem newSystem = (InternalDistributedSystem)
ds.getReconnectedSystem();
if (newSystem != null) {
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java
index faf8ebc..0fe55f0 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java
@@ -35,7 +35,7 @@ public class ServiceConfig {
private final int[] membershipPortRange;
private final long memberTimeout;
- private final boolean isReconnecting;
+ private boolean isReconnecting;
private Integer lossThreshold;
private final Integer memberWeight;
private boolean networkPartitionDetectionEnabled;
@@ -157,4 +157,7 @@ public class ServiceConfig {
networkPartitionDetectionEnabled =
theConfig.getEnableNetworkPartitionDetection();
}
+ public void setIsReconnecting(boolean b) {
+ this.isReconnecting = false;
+ }
}
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java
index e8bc0b9..7d403ab 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java
@@ -225,6 +225,7 @@ public class Services {
}
logger.info("Stopping membership services");
this.stopping = true;
+ config.setIsReconnecting(false);
try {
this.timer.cancel();
} finally {
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index c3e0f1d..a816e78 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -344,6 +344,9 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
if (attemptToJoin()) {
return true;
}
+ if (this.isStopping) {
+ break;
+ }
if (!state.possibleCoordinator.equals(localAddress)) {
state.alreadyTried.add(state.possibleCoordinator);
}
@@ -447,12 +450,13 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
return isJoined;
}
- logger.debug("received join response {}", response);
+ logger.info("received join response {}", response);
joinResponse[0] = null;
String failReason = response.getRejectionMessage();
if (failReason != null) {
if (failReason.contains("Rejecting the attempt of a member using an
older version")
- || failReason.contains("15806")) {
+ || failReason.contains("15806")
+ || failReason.contains("ForcedDisconnectException")) {
throw new SystemConnectException(failReason);
} else if (failReason.contains("Failed to find credentials")) {
throw new AuthenticationRequiredException(failReason);
@@ -1073,7 +1077,19 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
private void forceDisconnect(String reason) {
this.isStopping = true;
- services.getManager().forceDisconnect(reason);
+ if (!isJoined) {
+ logger.fatal("BRUCE: forcedDisconnect invoked. isReconnecting={}
isJoined={}",
+ services.getConfig().isReconnecting(), isJoined);
+ joinResponse[0] =
+ new JoinResponseMessage(
+ "Stopping due to ForcedDisconnectException caused by '" + reason
+ "'", -1);
+ isJoined = false;
+ synchronized (joinResponse) {
+ joinResponse.notifyAll();
+ }
+ } else {
+ services.getManager().forceDisconnect(reason);
+ }
}
private void ackView(InstallViewMessage m) {
@@ -2330,8 +2346,8 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
}
} // synchronized
if (requests != null && !requests.isEmpty()) {
- logger.info("View Creator is processing {} requests for the next
membership view",
- requests.size());
+ logger.info("View Creator is processing {} requests for the next
membership view ({})",
+ requests.size(), requests);
try {
createAndSendView(requests);
if (shutdown) {
@@ -2441,14 +2457,6 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
JoinRequestMessage jmsg = (JoinRequestMessage) msg;
mbr = jmsg.getMemberID();
int port = jmsg.getFailureDetectionPort();
- // see if an old member ID is being reused. If
- // so we'll remove it from the new view
- for (InternalDistributedMember m : oldMembers) {
- if (mbr.compareTo(m, false) == 0) {
- oldIDs.add(m);
- break;
- }
- }
if (!joinReqs.contains(mbr)) {
joinReqs.add(mbr);
joinPorts.put(mbr, port);
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
index d365bb7..cf1232b 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
@@ -563,7 +563,8 @@ public class JGroupsMessenger implements Messenger {
gmsMember.setMemberWeight((byte) (services.getConfig().getMemberWeight() &
0xff));
gmsMember.setNetworkPartitionDetectionEnabled(
services.getConfig().getDistributionConfig().getEnableNetworkPartitionDetection());
- logger.info("Established local address {}", localAddress);
+ logger.info("Established local address {} with net-member {}",
localAddress,
+ localAddress.getNetMember());
services.setLocalAddress(localAddress);
}
diff --git
a/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java
b/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java
index 3eab4a9..022f9a2 100755
---
a/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java
+++
b/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java
@@ -2142,6 +2142,21 @@ public class GemFireCacheImpl implements InternalCache,
InternalClientCache, Has
if (isClosed()) {
return;
}
+
+ if (!keepDS && systemFailureCause == null // normal cache close
+ && (this.isReconnecting() || this.system.getReconnectedSystem() !=
null)) {
+ logger.debug(
+ "Cache is shutting down distributed system connection. "
+ + "isReconnecting={} reconnectedSystem={} keepAlive={}
keepDS={}",
+ this.isReconnecting(), system.getReconnectedSystem(), keepAlive,
keepDS);
+
+ this.system.stopReconnectingNoDisconnect();
+ if (this.system.getReconnectedSystem() != null) {
+ this.system.getReconnectedSystem().disconnect();
+ }
+ return;
+ }
+
final boolean isDebugEnabled = logger.isDebugEnabled();
synchronized (GemFireCacheImpl.class) {
diff --git
a/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java
b/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java
index de52d9e..9b50736 100644
---
a/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java
+++
b/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java
@@ -53,7 +53,7 @@ public abstract class VMProvider {
}
public void stop(boolean cleanWorkingDir) {
- getVM().invoke(() -> {
+ getVM().invoke("stop cluster elements", () -> {
// this did not clean up the files
ClusterStartupRule.stopElementInsideVM();
MemberStarterRule.disconnectDSIfAny();
@@ -66,19 +66,19 @@ public abstract class VMProvider {
}
public boolean isClient() {
- return getVM().invoke(() -> {
+ return getVM().invoke("isClient", () -> {
return ClusterStartupRule.clientCacheRule != null;
});
}
public boolean isLocator() {
- return getVM().invoke(() -> ClusterStartupRule.getLocator() != null);
+ return getVM().invoke("isLocator", () -> ClusterStartupRule.getLocator()
!= null);
}
// a server can be started without a cache server, so as long as this member
has no locator,
// it's deemed as a server
public boolean isServer() {
- return getVM().invoke(() -> ClusterStartupRule.getLocator() == null);
+ return getVM().invoke("isServer", () -> ClusterStartupRule.getLocator() ==
null);
}
public void invoke(final SerializableRunnableIF runnable) {