This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/geode.git


The following commit(s) were added to refs/heads/develop by this push:
     new e101640  GEODE-6451 CI Failure: Hang cleaning up after 
ClusterConfigLocatorRestartDUnitTest.serverRestartsAfterLocatorReconnects
e101640 is described below

commit e10164060d6aad877b2503543b7f785a22a24f4b
Author: Bruce Schuchardt <[email protected]>
AuthorDate: Wed Feb 27 12:05:48 2019 -0800

    GEODE-6451 CI Failure: Hang cleaning up after 
ClusterConfigLocatorRestartDUnitTest.serverRestartsAfterLocatorReconnects
    
    Removed code that scrubbed the current membership view of IDs matching a
    new join request.  This code is no longer needed now that we generate a
    new UUID for a member when it tries to auto-reconnect.  The new ID of
    such a member would never match an old ID.
    
    In practice this code was causing test failures in situations where the
    auto-reconnect time was set to a small value.  A member would end up
    sending multiple join requests to the same coordinator.  The first would
    be used to allow the new member into the cluster but the second, due to
    this code, would cause that member to be immediately removed from the
    cluster.
    
    The unicast receiver thread was becoming blocked if a
    forced disconnect occurred during reconnect because
    InternalDistributedSystem.disconnect sychronizes, for some
    reason, on GemFireCacheImpl.class.  This reworks that
    logic to have the reconnect thread get a
    SystemConnectException forcing cleanup of the reconnecting
    InternalDistributedSystem in that thread.
---
 .../ClusterConfigLocatorRestartDUnitTest.java      |  4 +++
 .../internal/InternalDistributedSystem.java        |  8 +++++
 .../distributed/internal/InternalLocator.java      |  8 ++++-
 .../internal/membership/gms/ServiceConfig.java     |  5 +++-
 .../internal/membership/gms/Services.java          |  1 +
 .../membership/gms/membership/GMSJoinLeave.java    | 34 +++++++++++++---------
 .../membership/gms/messenger/JGroupsMessenger.java |  3 +-
 .../geode/internal/cache/GemFireCacheImpl.java     | 15 ++++++++++
 .../apache/geode/test/junit/rules/VMProvider.java  |  8 ++---
 9 files changed, 66 insertions(+), 20 deletions(-)

diff --git 
a/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
 
b/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
index 589ba0a..0ddb364 100644
--- 
a/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
+++ 
b/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
@@ -55,8 +55,11 @@ public class ClusterConfigLocatorRestartDUnitTest {
   @Test
   public void serverRestartsAfterLocatorReconnects() throws Exception {
     
IgnoredException.addIgnoredException("org.apache.geode.ForcedDisconnectException:
 for testing");
+    IgnoredException.addIgnoredException("cluster configuration service not 
available");
+    IgnoredException.addIgnoredException("This thread has been stalled");
     IgnoredException
         .addIgnoredException("member unexpectedly shut down shared, unordered 
connection");
+    IgnoredException.addIgnoredException("Connection refused");
 
     MemberVM locator0 = rule.startLocatorVM(0);
 
@@ -84,6 +87,7 @@ public class ClusterConfigLocatorRestartDUnitTest {
     IgnoredException.addIgnoredException("This member is no longer in the 
membership view");
     IgnoredException.addIgnoredException("This node is no longer in the 
membership view");
     
IgnoredException.addIgnoredException("org.apache.geode.ForcedDisconnectException:
 for testing");
+    IgnoredException.addIgnoredException("Connection refused");
 
 
     MemberVM locator0 = rule.startLocatorVM(0);
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
index 82e885a..7063162 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
@@ -2983,6 +2983,14 @@ public class InternalDistributedSystem extends 
DistributedSystem
     this.attemptingToReconnect = false;
   }
 
+  public void stopReconnectingNoDisconnect() {
+    this.reconnectCancelled = true;
+    synchronized (this.reconnectLock) {
+      this.reconnectLock.notify();
+    }
+    this.attemptingToReconnect = false;
+  }
+
   /**
    * Provides hook for dunit to generate and store a detailed creation stack 
trace that includes the
    * keys/values of DistributionConfig including security related attributes 
without introducing
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
index 28bd0bd..75200bf 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
@@ -1032,7 +1032,13 @@ public class InternalLocator extends Locator implements 
ConnectListener, LogConf
             setLocator(this);
           }
         }
-        ds.waitUntilReconnected(waitTime, TimeUnit.MILLISECONDS);
+        try {
+          ds.waitUntilReconnected(waitTime, TimeUnit.MILLISECONDS);
+        } catch (CancelException e) {
+          logger.info("Attempt to reconnect failed and further attempts have 
been terminated");
+          this.stoppedForReconnect = false;
+          return false;
+        }
       }
       InternalDistributedSystem newSystem = (InternalDistributedSystem) 
ds.getReconnectedSystem();
       if (newSystem != null) {
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java
index faf8ebc..0fe55f0 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java
@@ -35,7 +35,7 @@ public class ServiceConfig {
   private final int[] membershipPortRange;
   private final long memberTimeout;
 
-  private final boolean isReconnecting;
+  private boolean isReconnecting;
   private Integer lossThreshold;
   private final Integer memberWeight;
   private boolean networkPartitionDetectionEnabled;
@@ -157,4 +157,7 @@ public class ServiceConfig {
     networkPartitionDetectionEnabled = 
theConfig.getEnableNetworkPartitionDetection();
   }
 
+  public void setIsReconnecting(boolean b) {
+    this.isReconnecting = false;
+  }
 }
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java
index e8bc0b9..7d403ab 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java
@@ -225,6 +225,7 @@ public class Services {
     }
     logger.info("Stopping membership services");
     this.stopping = true;
+    config.setIsReconnecting(false);
     try {
       this.timer.cancel();
     } finally {
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index c3e0f1d..a816e78 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -344,6 +344,9 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
             if (attemptToJoin()) {
               return true;
             }
+            if (this.isStopping) {
+              break;
+            }
             if (!state.possibleCoordinator.equals(localAddress)) {
               state.alreadyTried.add(state.possibleCoordinator);
             }
@@ -447,12 +450,13 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
       return isJoined;
     }
 
-    logger.debug("received join response {}", response);
+    logger.info("received join response {}", response);
     joinResponse[0] = null;
     String failReason = response.getRejectionMessage();
     if (failReason != null) {
       if (failReason.contains("Rejecting the attempt of a member using an 
older version")
-          || failReason.contains("15806")) {
+          || failReason.contains("15806")
+          || failReason.contains("ForcedDisconnectException")) {
         throw new SystemConnectException(failReason);
       } else if (failReason.contains("Failed to find credentials")) {
         throw new AuthenticationRequiredException(failReason);
@@ -1073,7 +1077,19 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
 
   private void forceDisconnect(String reason) {
     this.isStopping = true;
-    services.getManager().forceDisconnect(reason);
+    if (!isJoined) {
+      logger.fatal("BRUCE: forcedDisconnect invoked.  isReconnecting={} 
isJoined={}",
+          services.getConfig().isReconnecting(), isJoined);
+      joinResponse[0] =
+          new JoinResponseMessage(
+              "Stopping due to ForcedDisconnectException caused by '" + reason 
+ "'", -1);
+      isJoined = false;
+      synchronized (joinResponse) {
+        joinResponse.notifyAll();
+      }
+    } else {
+      services.getManager().forceDisconnect(reason);
+    }
   }
 
   private void ackView(InstallViewMessage m) {
@@ -2330,8 +2346,8 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
             }
           } // synchronized
           if (requests != null && !requests.isEmpty()) {
-            logger.info("View Creator is processing {} requests for the next 
membership view",
-                requests.size());
+            logger.info("View Creator is processing {} requests for the next 
membership view ({})",
+                requests.size(), requests);
             try {
               createAndSendView(requests);
               if (shutdown) {
@@ -2441,14 +2457,6 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
             JoinRequestMessage jmsg = (JoinRequestMessage) msg;
             mbr = jmsg.getMemberID();
             int port = jmsg.getFailureDetectionPort();
-            // see if an old member ID is being reused. If
-            // so we'll remove it from the new view
-            for (InternalDistributedMember m : oldMembers) {
-              if (mbr.compareTo(m, false) == 0) {
-                oldIDs.add(m);
-                break;
-              }
-            }
             if (!joinReqs.contains(mbr)) {
               joinReqs.add(mbr);
               joinPorts.put(mbr, port);
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
index d365bb7..cf1232b 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
@@ -563,7 +563,8 @@ public class JGroupsMessenger implements Messenger {
     gmsMember.setMemberWeight((byte) (services.getConfig().getMemberWeight() & 
0xff));
     gmsMember.setNetworkPartitionDetectionEnabled(
         
services.getConfig().getDistributionConfig().getEnableNetworkPartitionDetection());
-    logger.info("Established local address {}", localAddress);
+    logger.info("Established local address {} with net-member {}", 
localAddress,
+        localAddress.getNetMember());
     services.setLocalAddress(localAddress);
   }
 
diff --git 
a/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java
 
b/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java
index 3eab4a9..022f9a2 100755
--- 
a/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java
+++ 
b/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java
@@ -2142,6 +2142,21 @@ public class GemFireCacheImpl implements InternalCache, 
InternalClientCache, Has
     if (isClosed()) {
       return;
     }
+
+    if (!keepDS && systemFailureCause == null // normal cache close
+        && (this.isReconnecting() || this.system.getReconnectedSystem() != 
null)) {
+      logger.debug(
+          "Cache is shutting down distributed system connection.  "
+              + "isReconnecting={}  reconnectedSystem={} keepAlive={} 
keepDS={}",
+          this.isReconnecting(), system.getReconnectedSystem(), keepAlive, 
keepDS);
+
+      this.system.stopReconnectingNoDisconnect();
+      if (this.system.getReconnectedSystem() != null) {
+        this.system.getReconnectedSystem().disconnect();
+      }
+      return;
+    }
+
     final boolean isDebugEnabled = logger.isDebugEnabled();
 
     synchronized (GemFireCacheImpl.class) {
diff --git 
a/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java 
b/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java
index de52d9e..9b50736 100644
--- 
a/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java
+++ 
b/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java
@@ -53,7 +53,7 @@ public abstract class VMProvider {
   }
 
   public void stop(boolean cleanWorkingDir) {
-    getVM().invoke(() -> {
+    getVM().invoke("stop cluster elements", () -> {
       // this did not clean up the files
       ClusterStartupRule.stopElementInsideVM();
       MemberStarterRule.disconnectDSIfAny();
@@ -66,19 +66,19 @@ public abstract class VMProvider {
   }
 
   public boolean isClient() {
-    return getVM().invoke(() -> {
+    return getVM().invoke("isClient", () -> {
       return ClusterStartupRule.clientCacheRule != null;
     });
   }
 
   public boolean isLocator() {
-    return getVM().invoke(() -> ClusterStartupRule.getLocator() != null);
+    return getVM().invoke("isLocator", () -> ClusterStartupRule.getLocator() 
!= null);
   }
 
   // a server can be started without a cache server, so as long as this member 
has no locator,
   // it's deemed as a server
   public boolean isServer() {
-    return getVM().invoke(() -> ClusterStartupRule.getLocator() == null);
+    return getVM().invoke("isServer", () -> ClusterStartupRule.getLocator() == 
null);
   }
 
   public void invoke(final SerializableRunnableIF runnable) {

Reply via email to