This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch feature/GEODE-7796
in repository https://gitbox.apache.org/repos/asf/geode.git


The following commit(s) were added to refs/heads/feature/GEODE-7796 by this 
push:
     new d8121db  reduce locator-wait-time (seconds, not millis)
d8121db is described below

commit d8121db35d773f326725d92397a32b8400fc953a
Author: Bruce Schuchardt <bschucha...@pivotal.io>
AuthorDate: Fri Feb 14 08:28:49 2020 -0800

    reduce locator-wait-time (seconds, not millis)
    
    also several other changes to fix the underlying failure:
      1) add a synchronization to the services restart thread so only one
      thread is active at a time
      2) shut down membership cleanup executor and avoid creating multiple
      TcpServer executors on auto-reconnect
      3) remove setting the locator as a dependent of the
      InternalDistributedSystem.  This was causing locator.stop() to be
      invoked multiple times (as Dale noticed in his analysis)
      4) when stopping a locator for auto-reconnect wait for it to stop in
      order to avoid creating multiple restart threads when there are
      cascading failures.
---
 .../apache/geode/distributed/LocatorDUnitTest.java |  7 ++--
 .../distributed/internal/DistributionImpl.java     |  2 +-
 .../internal/InternalDistributedSystem.java        | 11 +----
 .../distributed/internal/InternalLocator.java      | 49 ++++++++++------------
 .../internal/membership/gms/GMSMembership.java     |  6 ++-
 .../gms/locator/MembershipLocatorImpl.java         |  7 +++-
 .../distributed/internal/tcpserver/TcpServer.java  |  4 +-
 7 files changed, 43 insertions(+), 43 deletions(-)

diff --git 
a/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
 
b/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
index d6eb991..5dbe309 100644
--- 
a/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
+++ 
b/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
@@ -72,7 +72,6 @@ import java.util.Set;
 import org.apache.logging.log4j.Logger;
 import org.junit.After;
 import org.junit.Before;
-import org.junit.Ignore;
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.experimental.categories.Category;
@@ -256,7 +255,6 @@ public class LocatorDUnitTest implements Serializable {
   }
 
   @Test
-  @Ignore("GEODE=7760 - test sometimes hangs due to product issue")
   public void testCrashLocatorMultipleTimes() throws Exception {
     port1 = AvailablePort.getRandomAvailablePort(AvailablePort.SOCKET);
     DistributedTestUtils.deleteLocatorStateFile(port1);
@@ -270,12 +268,15 @@ public class LocatorDUnitTest implements Serializable {
     properties.put(MAX_WAIT_TIME_RECONNECT, "" + (3 * memberTimeoutMS));
     // since we're restarting location services let's be a little forgiving 
about that service
     // starting up so that stress-tests can pass
-    properties.put(LOCATOR_WAIT_TIME, "" + (3 * memberTimeoutMS));
+    properties.put(LOCATOR_WAIT_TIME, "" + 3);
     addDSProps(properties);
     if (stateFile.exists()) {
       assertThat(stateFile.delete()).isTrue();
     }
 
+    IgnoredException
+        .addIgnoredException("Possible loss of quorum due to the loss of 1 
cache processes");
+
     Locator locator = Locator.startLocatorAndDS(port1, logFile, properties);
     system = (InternalDistributedSystem) locator.getDistributedSystem();
 
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/DistributionImpl.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/DistributionImpl.java
index 8a3febc..20d2342 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/DistributionImpl.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/DistributionImpl.java
@@ -919,7 +919,7 @@ public class DistributionImpl implements Distribution {
       // network-down testing
       InternalLocator loc = (InternalLocator) Locator.getLocator();
       if (loc != null) {
-        loc.stop(true, !distribution.disableAutoReconnect, false);
+        loc.stop(true, !distribution.disableAutoReconnect, true);
       }
     }
   }
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
index 584d101..823844f 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
@@ -962,13 +962,6 @@ public class InternalDistributedSystem extends 
DistributedSystem
   }
 
   /**
-   * record a locator as a dependent of this distributed system
-   */
-  void setDependentLocator(InternalLocator theLocator) {
-    startedLocator = theLocator;
-  }
-
-  /**
    * Used by DistributionManager to fix bug 33362
    */
   void setDM(DistributionManager dm) {
@@ -1624,8 +1617,8 @@ public class InternalDistributedSystem extends 
DistributedSystem
           dm.close();
           // we close the locator after the DM so that when split-brain 
detection
           // is enabled, loss of the locator doesn't cause the DM to croak
-          if (startedLocator != null && !isReconnectingDS) {
-            startedLocator.stop(forcedDisconnect, preparingForReconnect, 
false);
+          if (startedLocator != null) {
+            startedLocator.stop(forcedDisconnect, preparingForReconnect, true);
             startedLocator = null;
           }
         } finally { // timer canceled
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
index f2abe55..b118187 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
@@ -211,6 +211,9 @@ public class InternalLocator extends Locator implements 
ConnectListener, LogConf
   private WanLocatorDiscoverer locatorDiscoverer;
   private InternalConfigurationPersistenceService 
configurationPersistenceService;
   private ClusterManagementService clusterManagementService;
+  // synchronization lock that ensures we only have one thread performing 
location services
+  // restart at a time
+  private final Object servicesRestartLock = new Object();
 
   public static InternalLocator getLocator() {
     synchronized (locatorLock) {
@@ -752,8 +755,6 @@ public class InternalLocator extends Locator implements 
ConnectListener, LogConf
       startCache(internalDistributedSystem);
 
       logger.info("Locator started on {}", thisLocator);
-
-      internalDistributedSystem.setDependentLocator(this);
     }
   }
 
@@ -919,7 +920,7 @@ public class InternalLocator extends Locator implements 
ConnectListener, LogConf
       // If we are already shutting down don't do all of this again.
       // But, give the server a bit of time to shut down so a new
       // locator can be created, if desired, when this method returns
-      if (!stopForReconnect && waitForDisconnect) {
+      if (waitForDisconnect) {
         long endOfWait = System.currentTimeMillis() + 60000;
         if (isDebugEnabled && membershipLocator.isAlive()) {
           logger.debug("sleeping to wait for the locator server to shut 
down...");
@@ -959,10 +960,8 @@ public class InternalLocator extends Locator implements 
ConnectListener, LogConf
     handleShutdown();
     logger.info("{} is stopped", this);
 
-    if (stoppedForReconnect) {
-      if (internalDistributedSystem != null) {
-        launchRestartThread();
-      }
+    if (stopForReconnect) {
+      launchRestartThread();
     }
   }
 
@@ -981,10 +980,6 @@ public class InternalLocator extends Locator implements 
ConnectListener, LogConf
     if (productUseLog != null) {
       productUseLog.close();
     }
-    if (internalDistributedSystem != null) {
-      internalDistributedSystem.setDependentLocator(null);
-    }
-
     if (internalCache != null && !stoppedForReconnect && !forcedDisconnect) {
       logger.info("Closing locator's cache");
       try {
@@ -1055,21 +1050,24 @@ public class InternalLocator extends Locator implements 
ConnectListener, LogConf
   private void launchRestartThread() {
     String threadName = "Location services restart thread";
     restartThread = new LoggingThread(threadName, () -> {
-      boolean restarted = false;
-      try {
-        restarted = attemptReconnect();
-        logger.info("attemptReconnect returned {}", restarted);
-      } catch (InterruptedException e) {
-        logger.info("attempt to restart location services was interrupted", e);
-      } catch (IOException e) {
-        logger.info("attempt to restart location services terminated", e);
-      } finally {
-        shutdownHandled.set(false);
-        if (!restarted) {
-          stoppedForReconnect = false;
+      synchronized (servicesRestartLock) {
+        stoppedForReconnect = true;
+        boolean restarted = false;
+        try {
+          restarted = attemptReconnect();
+          logger.info("attemptReconnect returned {}", restarted);
+        } catch (InterruptedException e) {
+          logger.info("attempt to restart location services was interrupted", 
e);
+        } catch (IOException e) {
+          logger.info("attempt to restart location services terminated", e);
+        } finally {
+          shutdownHandled.set(false);
+          if (!restarted) {
+            stoppedForReconnect = false;
+          }
+          reconnected = restarted;
+          restartThread = null;
         }
-        reconnected = restarted;
-        restartThread = null;
       }
     });
     restartThread.start();
@@ -1196,7 +1194,6 @@ public class InternalLocator extends Locator implements 
ConnectListener, LogConf
     }
     internalDistributedSystem = newSystem;
     internalCache = newCache;
-    internalDistributedSystem.setDependentLocator(this);
     logger.info("Locator restart: initializing TcpServer");
 
     try {
diff --git 
a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
 
b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
index ea7635e..8707467 100644
--- 
a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
+++ 
b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
@@ -1272,6 +1272,10 @@ public class GMSMembership<ID extends MemberIdentifier> 
implements Membership<ID
       services.setShutdownCause(e);
     }
 
+    if (cleanupTimer != null && !cleanupTimer.isShutdown()) {
+      cleanupTimer.shutdownNow();
+    }
+
     lifecycleListener.disconnect(e);
 
     // first shut down communication so we don't do any more harm to other
@@ -1914,7 +1918,7 @@ public class GMSMembership<ID extends MemberIdentifier> 
implements Membership<ID
         }
       }
 
-      if (cleanupTimer != null) {
+      if (cleanupTimer != null && !cleanupTimer.isShutdown()) {
         cleanupTimer.shutdown();
       }
 
diff --git 
a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/locator/MembershipLocatorImpl.java
 
b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/locator/MembershipLocatorImpl.java
index 9009d66..999e6cb 100644
--- 
a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/locator/MembershipLocatorImpl.java
+++ 
b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/locator/MembershipLocatorImpl.java
@@ -23,6 +23,7 @@ import java.net.SocketAddress;
 import java.net.UnknownHostException;
 import java.nio.file.Path;
 import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
 import java.util.function.Supplier;
 
 import org.apache.logging.log4j.Logger;
@@ -181,10 +182,11 @@ public class MembershipLocatorImpl<ID extends 
MemberIdentifier> implements Membe
       }
 
       boolean interrupted = Thread.interrupted();
+      long waitTimeMillis = TcpServer.SHUTDOWN_WAIT_TIME * 2;
       try {
         // TcpServer up to SHUTDOWN_WAIT_TIME for its executor pool to shut 
down.
         // We wait 2 * SHUTDOWN_WAIT_TIME here to account for that shutdown, 
and then our own.
-        waitToShutdown(TcpServer.SHUTDOWN_WAIT_TIME * 2);
+        waitToShutdown(waitTimeMillis);
 
       } catch (InterruptedException ex) {
         interrupted = true;
@@ -198,7 +200,8 @@ public class MembershipLocatorImpl<ID extends 
MemberIdentifier> implements Membe
       }
 
       if (isAlive()) {
-        logger.fatal("Could not stop {} in 60 seconds", this);
+        logger.fatal("Could not stop {} in {} seconds", this,
+            TimeUnit.MILLISECONDS.toSeconds(waitTimeMillis));
       }
     }
   }
diff --git 
a/geode-tcp-server/src/main/java/org/apache/geode/distributed/internal/tcpserver/TcpServer.java
 
b/geode-tcp-server/src/main/java/org/apache/geode/distributed/internal/tcpserver/TcpServer.java
index 26793e0..5c25ac8 100755
--- 
a/geode-tcp-server/src/main/java/org/apache/geode/distributed/internal/tcpserver/TcpServer.java
+++ 
b/geode-tcp-server/src/main/java/org/apache/geode/distributed/internal/tcpserver/TcpServer.java
@@ -164,7 +164,9 @@ public class TcpServer {
   public void restarting() throws IOException {
     this.shuttingDown = false;
     startServerThread();
-    this.executor = executorServiceSupplier.get();
+    if (this.executor == null || this.executor.isShutdown()) {
+      this.executor = executorServiceSupplier.get();
+    }
     logger.info("TcpServer@" + System.identityHashCode(this)
         + " restarting: completed.  Server thread=" + this.serverThread + '@'
         + System.identityHashCode(this.serverThread) + ";alive=" + 
this.serverThread.isAlive());

Reply via email to