This is an automated email from the ASF dual-hosted git repository. bschuchardt pushed a commit to branch feature/GEODE-7796 in repository https://gitbox.apache.org/repos/asf/geode.git
The following commit(s) were added to refs/heads/feature/GEODE-7796 by this push: new d8121db reduce locator-wait-time (seconds, not millis) d8121db is described below commit d8121db35d773f326725d92397a32b8400fc953a Author: Bruce Schuchardt <bschucha...@pivotal.io> AuthorDate: Fri Feb 14 08:28:49 2020 -0800 reduce locator-wait-time (seconds, not millis) also several other changes to fix the underlying failure: 1) add a synchronization to the services restart thread so only one thread is active at a time 2) shut down membership cleanup executor and avoid creating multiple TcpServer executors on auto-reconnect 3) remove setting the locator as a dependent of the InternalDistributedSystem. This was causing locator.stop() to be invoked multiple times (as Dale noticed in his analysis) 4) when stopping a locator for auto-reconnect wait for it to stop in order to avoid creating multiple restart threads when there are cascading failures. --- .../apache/geode/distributed/LocatorDUnitTest.java | 7 ++-- .../distributed/internal/DistributionImpl.java | 2 +- .../internal/InternalDistributedSystem.java | 11 +---- .../distributed/internal/InternalLocator.java | 49 ++++++++++------------ .../internal/membership/gms/GMSMembership.java | 6 ++- .../gms/locator/MembershipLocatorImpl.java | 7 +++- .../distributed/internal/tcpserver/TcpServer.java | 4 +- 7 files changed, 43 insertions(+), 43 deletions(-) diff --git a/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java b/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java index d6eb991..5dbe309 100644 --- a/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java +++ b/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java @@ -72,7 +72,6 @@ import java.util.Set; import org.apache.logging.log4j.Logger; import org.junit.After; import org.junit.Before; -import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.experimental.categories.Category; @@ -256,7 +255,6 @@ public class LocatorDUnitTest implements Serializable { } @Test - @Ignore("GEODE=7760 - test sometimes hangs due to product issue") public void testCrashLocatorMultipleTimes() throws Exception { port1 = AvailablePort.getRandomAvailablePort(AvailablePort.SOCKET); DistributedTestUtils.deleteLocatorStateFile(port1); @@ -270,12 +268,15 @@ public class LocatorDUnitTest implements Serializable { properties.put(MAX_WAIT_TIME_RECONNECT, "" + (3 * memberTimeoutMS)); // since we're restarting location services let's be a little forgiving about that service // starting up so that stress-tests can pass - properties.put(LOCATOR_WAIT_TIME, "" + (3 * memberTimeoutMS)); + properties.put(LOCATOR_WAIT_TIME, "" + 3); addDSProps(properties); if (stateFile.exists()) { assertThat(stateFile.delete()).isTrue(); } + IgnoredException + .addIgnoredException("Possible loss of quorum due to the loss of 1 cache processes"); + Locator locator = Locator.startLocatorAndDS(port1, logFile, properties); system = (InternalDistributedSystem) locator.getDistributedSystem(); diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/DistributionImpl.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/DistributionImpl.java index 8a3febc..20d2342 100644 --- a/geode-core/src/main/java/org/apache/geode/distributed/internal/DistributionImpl.java +++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/DistributionImpl.java @@ -919,7 +919,7 @@ public class DistributionImpl implements Distribution { // network-down testing InternalLocator loc = (InternalLocator) Locator.getLocator(); if (loc != null) { - loc.stop(true, !distribution.disableAutoReconnect, false); + loc.stop(true, !distribution.disableAutoReconnect, true); } } } diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java index 584d101..823844f 100644 --- a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java +++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java @@ -962,13 +962,6 @@ public class InternalDistributedSystem extends DistributedSystem } /** - * record a locator as a dependent of this distributed system - */ - void setDependentLocator(InternalLocator theLocator) { - startedLocator = theLocator; - } - - /** * Used by DistributionManager to fix bug 33362 */ void setDM(DistributionManager dm) { @@ -1624,8 +1617,8 @@ public class InternalDistributedSystem extends DistributedSystem dm.close(); // we close the locator after the DM so that when split-brain detection // is enabled, loss of the locator doesn't cause the DM to croak - if (startedLocator != null && !isReconnectingDS) { - startedLocator.stop(forcedDisconnect, preparingForReconnect, false); + if (startedLocator != null) { + startedLocator.stop(forcedDisconnect, preparingForReconnect, true); startedLocator = null; } } finally { // timer canceled diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java index f2abe55..b118187 100644 --- a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java +++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java @@ -211,6 +211,9 @@ public class InternalLocator extends Locator implements ConnectListener, LogConf private WanLocatorDiscoverer locatorDiscoverer; private InternalConfigurationPersistenceService configurationPersistenceService; private ClusterManagementService clusterManagementService; + // synchronization lock that ensures we only have one thread performing location services + // restart at a time + private final Object servicesRestartLock = new Object(); public static InternalLocator getLocator() { synchronized (locatorLock) { @@ -752,8 +755,6 @@ public class InternalLocator extends Locator implements ConnectListener, LogConf startCache(internalDistributedSystem); logger.info("Locator started on {}", thisLocator); - - internalDistributedSystem.setDependentLocator(this); } } @@ -919,7 +920,7 @@ public class InternalLocator extends Locator implements ConnectListener, LogConf // If we are already shutting down don't do all of this again. // But, give the server a bit of time to shut down so a new // locator can be created, if desired, when this method returns - if (!stopForReconnect && waitForDisconnect) { + if (waitForDisconnect) { long endOfWait = System.currentTimeMillis() + 60000; if (isDebugEnabled && membershipLocator.isAlive()) { logger.debug("sleeping to wait for the locator server to shut down..."); @@ -959,10 +960,8 @@ public class InternalLocator extends Locator implements ConnectListener, LogConf handleShutdown(); logger.info("{} is stopped", this); - if (stoppedForReconnect) { - if (internalDistributedSystem != null) { - launchRestartThread(); - } + if (stopForReconnect) { + launchRestartThread(); } } @@ -981,10 +980,6 @@ public class InternalLocator extends Locator implements ConnectListener, LogConf if (productUseLog != null) { productUseLog.close(); } - if (internalDistributedSystem != null) { - internalDistributedSystem.setDependentLocator(null); - } - if (internalCache != null && !stoppedForReconnect && !forcedDisconnect) { logger.info("Closing locator's cache"); try { @@ -1055,21 +1050,24 @@ public class InternalLocator extends Locator implements ConnectListener, LogConf private void launchRestartThread() { String threadName = "Location services restart thread"; restartThread = new LoggingThread(threadName, () -> { - boolean restarted = false; - try { - restarted = attemptReconnect(); - logger.info("attemptReconnect returned {}", restarted); - } catch (InterruptedException e) { - logger.info("attempt to restart location services was interrupted", e); - } catch (IOException e) { - logger.info("attempt to restart location services terminated", e); - } finally { - shutdownHandled.set(false); - if (!restarted) { - stoppedForReconnect = false; + synchronized (servicesRestartLock) { + stoppedForReconnect = true; + boolean restarted = false; + try { + restarted = attemptReconnect(); + logger.info("attemptReconnect returned {}", restarted); + } catch (InterruptedException e) { + logger.info("attempt to restart location services was interrupted", e); + } catch (IOException e) { + logger.info("attempt to restart location services terminated", e); + } finally { + shutdownHandled.set(false); + if (!restarted) { + stoppedForReconnect = false; + } + reconnected = restarted; + restartThread = null; } - reconnected = restarted; - restartThread = null; } }); restartThread.start(); @@ -1196,7 +1194,6 @@ public class InternalLocator extends Locator implements ConnectListener, LogConf } internalDistributedSystem = newSystem; internalCache = newCache; - internalDistributedSystem.setDependentLocator(this); logger.info("Locator restart: initializing TcpServer"); try { diff --git a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java index ea7635e..8707467 100644 --- a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java +++ b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java @@ -1272,6 +1272,10 @@ public class GMSMembership<ID extends MemberIdentifier> implements Membership<ID services.setShutdownCause(e); } + if (cleanupTimer != null && !cleanupTimer.isShutdown()) { + cleanupTimer.shutdownNow(); + } + lifecycleListener.disconnect(e); // first shut down communication so we don't do any more harm to other @@ -1914,7 +1918,7 @@ public class GMSMembership<ID extends MemberIdentifier> implements Membership<ID } } - if (cleanupTimer != null) { + if (cleanupTimer != null && !cleanupTimer.isShutdown()) { cleanupTimer.shutdown(); } diff --git a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/locator/MembershipLocatorImpl.java b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/locator/MembershipLocatorImpl.java index 9009d66..999e6cb 100644 --- a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/locator/MembershipLocatorImpl.java +++ b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/locator/MembershipLocatorImpl.java @@ -23,6 +23,7 @@ import java.net.SocketAddress; import java.net.UnknownHostException; import java.nio.file.Path; import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; import java.util.function.Supplier; import org.apache.logging.log4j.Logger; @@ -181,10 +182,11 @@ public class MembershipLocatorImpl<ID extends MemberIdentifier> implements Membe } boolean interrupted = Thread.interrupted(); + long waitTimeMillis = TcpServer.SHUTDOWN_WAIT_TIME * 2; try { // TcpServer up to SHUTDOWN_WAIT_TIME for its executor pool to shut down. // We wait 2 * SHUTDOWN_WAIT_TIME here to account for that shutdown, and then our own. - waitToShutdown(TcpServer.SHUTDOWN_WAIT_TIME * 2); + waitToShutdown(waitTimeMillis); } catch (InterruptedException ex) { interrupted = true; @@ -198,7 +200,8 @@ public class MembershipLocatorImpl<ID extends MemberIdentifier> implements Membe } if (isAlive()) { - logger.fatal("Could not stop {} in 60 seconds", this); + logger.fatal("Could not stop {} in {} seconds", this, + TimeUnit.MILLISECONDS.toSeconds(waitTimeMillis)); } } } diff --git a/geode-tcp-server/src/main/java/org/apache/geode/distributed/internal/tcpserver/TcpServer.java b/geode-tcp-server/src/main/java/org/apache/geode/distributed/internal/tcpserver/TcpServer.java index 26793e0..5c25ac8 100755 --- a/geode-tcp-server/src/main/java/org/apache/geode/distributed/internal/tcpserver/TcpServer.java +++ b/geode-tcp-server/src/main/java/org/apache/geode/distributed/internal/tcpserver/TcpServer.java @@ -164,7 +164,9 @@ public class TcpServer { public void restarting() throws IOException { this.shuttingDown = false; startServerThread(); - this.executor = executorServiceSupplier.get(); + if (this.executor == null || this.executor.isShutdown()) { + this.executor = executorServiceSupplier.get(); + } logger.info("TcpServer@" + System.identityHashCode(this) + " restarting: completed. Server thread=" + this.serverThread + '@' + System.identityHashCode(this.serverThread) + ";alive=" + this.serverThread.isAlive());