This is an automated email from the ASF dual-hosted git repository.
bschuchardt pushed a commit to branch feature/GEODE-7796
in repository https://gitbox.apache.org/repos/asf/geode.git
The following commit(s) were added to refs/heads/feature/GEODE-7796 by this
push:
new d8121db reduce locator-wait-time (seconds, not millis)
d8121db is described below
commit d8121db35d773f326725d92397a32b8400fc953a
Author: Bruce Schuchardt <[email protected]>
AuthorDate: Fri Feb 14 08:28:49 2020 -0800
reduce locator-wait-time (seconds, not millis)
also several other changes to fix the underlying failure:
1) add a synchronization to the services restart thread so only one
thread is active at a time
2) shut down membership cleanup executor and avoid creating multiple
TcpServer executors on auto-reconnect
3) remove setting the locator as a dependent of the
InternalDistributedSystem. This was causing locator.stop() to be
invoked multiple times (as Dale noticed in his analysis)
4) when stopping a locator for auto-reconnect wait for it to stop in
order to avoid creating multiple restart threads when there are
cascading failures.
---
.../apache/geode/distributed/LocatorDUnitTest.java | 7 ++--
.../distributed/internal/DistributionImpl.java | 2 +-
.../internal/InternalDistributedSystem.java | 11 +----
.../distributed/internal/InternalLocator.java | 49 ++++++++++------------
.../internal/membership/gms/GMSMembership.java | 6 ++-
.../gms/locator/MembershipLocatorImpl.java | 7 +++-
.../distributed/internal/tcpserver/TcpServer.java | 4 +-
7 files changed, 43 insertions(+), 43 deletions(-)
diff --git
a/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
b/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
index d6eb991..5dbe309 100644
---
a/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
+++
b/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
@@ -72,7 +72,6 @@ import java.util.Set;
import org.apache.logging.log4j.Logger;
import org.junit.After;
import org.junit.Before;
-import org.junit.Ignore;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
@@ -256,7 +255,6 @@ public class LocatorDUnitTest implements Serializable {
}
@Test
- @Ignore("GEODE=7760 - test sometimes hangs due to product issue")
public void testCrashLocatorMultipleTimes() throws Exception {
port1 = AvailablePort.getRandomAvailablePort(AvailablePort.SOCKET);
DistributedTestUtils.deleteLocatorStateFile(port1);
@@ -270,12 +268,15 @@ public class LocatorDUnitTest implements Serializable {
properties.put(MAX_WAIT_TIME_RECONNECT, "" + (3 * memberTimeoutMS));
// since we're restarting location services let's be a little forgiving
about that service
// starting up so that stress-tests can pass
- properties.put(LOCATOR_WAIT_TIME, "" + (3 * memberTimeoutMS));
+ properties.put(LOCATOR_WAIT_TIME, "" + 3);
addDSProps(properties);
if (stateFile.exists()) {
assertThat(stateFile.delete()).isTrue();
}
+ IgnoredException
+ .addIgnoredException("Possible loss of quorum due to the loss of 1
cache processes");
+
Locator locator = Locator.startLocatorAndDS(port1, logFile, properties);
system = (InternalDistributedSystem) locator.getDistributedSystem();
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/DistributionImpl.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/DistributionImpl.java
index 8a3febc..20d2342 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/DistributionImpl.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/DistributionImpl.java
@@ -919,7 +919,7 @@ public class DistributionImpl implements Distribution {
// network-down testing
InternalLocator loc = (InternalLocator) Locator.getLocator();
if (loc != null) {
- loc.stop(true, !distribution.disableAutoReconnect, false);
+ loc.stop(true, !distribution.disableAutoReconnect, true);
}
}
}
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
index 584d101..823844f 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
@@ -962,13 +962,6 @@ public class InternalDistributedSystem extends
DistributedSystem
}
/**
- * record a locator as a dependent of this distributed system
- */
- void setDependentLocator(InternalLocator theLocator) {
- startedLocator = theLocator;
- }
-
- /**
* Used by DistributionManager to fix bug 33362
*/
void setDM(DistributionManager dm) {
@@ -1624,8 +1617,8 @@ public class InternalDistributedSystem extends
DistributedSystem
dm.close();
// we close the locator after the DM so that when split-brain
detection
// is enabled, loss of the locator doesn't cause the DM to croak
- if (startedLocator != null && !isReconnectingDS) {
- startedLocator.stop(forcedDisconnect, preparingForReconnect,
false);
+ if (startedLocator != null) {
+ startedLocator.stop(forcedDisconnect, preparingForReconnect, true);
startedLocator = null;
}
} finally { // timer canceled
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
index f2abe55..b118187 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
@@ -211,6 +211,9 @@ public class InternalLocator extends Locator implements
ConnectListener, LogConf
private WanLocatorDiscoverer locatorDiscoverer;
private InternalConfigurationPersistenceService
configurationPersistenceService;
private ClusterManagementService clusterManagementService;
+ // synchronization lock that ensures we only have one thread performing
location services
+ // restart at a time
+ private final Object servicesRestartLock = new Object();
public static InternalLocator getLocator() {
synchronized (locatorLock) {
@@ -752,8 +755,6 @@ public class InternalLocator extends Locator implements
ConnectListener, LogConf
startCache(internalDistributedSystem);
logger.info("Locator started on {}", thisLocator);
-
- internalDistributedSystem.setDependentLocator(this);
}
}
@@ -919,7 +920,7 @@ public class InternalLocator extends Locator implements
ConnectListener, LogConf
// If we are already shutting down don't do all of this again.
// But, give the server a bit of time to shut down so a new
// locator can be created, if desired, when this method returns
- if (!stopForReconnect && waitForDisconnect) {
+ if (waitForDisconnect) {
long endOfWait = System.currentTimeMillis() + 60000;
if (isDebugEnabled && membershipLocator.isAlive()) {
logger.debug("sleeping to wait for the locator server to shut
down...");
@@ -959,10 +960,8 @@ public class InternalLocator extends Locator implements
ConnectListener, LogConf
handleShutdown();
logger.info("{} is stopped", this);
- if (stoppedForReconnect) {
- if (internalDistributedSystem != null) {
- launchRestartThread();
- }
+ if (stopForReconnect) {
+ launchRestartThread();
}
}
@@ -981,10 +980,6 @@ public class InternalLocator extends Locator implements
ConnectListener, LogConf
if (productUseLog != null) {
productUseLog.close();
}
- if (internalDistributedSystem != null) {
- internalDistributedSystem.setDependentLocator(null);
- }
-
if (internalCache != null && !stoppedForReconnect && !forcedDisconnect) {
logger.info("Closing locator's cache");
try {
@@ -1055,21 +1050,24 @@ public class InternalLocator extends Locator implements
ConnectListener, LogConf
private void launchRestartThread() {
String threadName = "Location services restart thread";
restartThread = new LoggingThread(threadName, () -> {
- boolean restarted = false;
- try {
- restarted = attemptReconnect();
- logger.info("attemptReconnect returned {}", restarted);
- } catch (InterruptedException e) {
- logger.info("attempt to restart location services was interrupted", e);
- } catch (IOException e) {
- logger.info("attempt to restart location services terminated", e);
- } finally {
- shutdownHandled.set(false);
- if (!restarted) {
- stoppedForReconnect = false;
+ synchronized (servicesRestartLock) {
+ stoppedForReconnect = true;
+ boolean restarted = false;
+ try {
+ restarted = attemptReconnect();
+ logger.info("attemptReconnect returned {}", restarted);
+ } catch (InterruptedException e) {
+ logger.info("attempt to restart location services was interrupted",
e);
+ } catch (IOException e) {
+ logger.info("attempt to restart location services terminated", e);
+ } finally {
+ shutdownHandled.set(false);
+ if (!restarted) {
+ stoppedForReconnect = false;
+ }
+ reconnected = restarted;
+ restartThread = null;
}
- reconnected = restarted;
- restartThread = null;
}
});
restartThread.start();
@@ -1196,7 +1194,6 @@ public class InternalLocator extends Locator implements
ConnectListener, LogConf
}
internalDistributedSystem = newSystem;
internalCache = newCache;
- internalDistributedSystem.setDependentLocator(this);
logger.info("Locator restart: initializing TcpServer");
try {
diff --git
a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
index ea7635e..8707467 100644
---
a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
+++
b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
@@ -1272,6 +1272,10 @@ public class GMSMembership<ID extends MemberIdentifier>
implements Membership<ID
services.setShutdownCause(e);
}
+ if (cleanupTimer != null && !cleanupTimer.isShutdown()) {
+ cleanupTimer.shutdownNow();
+ }
+
lifecycleListener.disconnect(e);
// first shut down communication so we don't do any more harm to other
@@ -1914,7 +1918,7 @@ public class GMSMembership<ID extends MemberIdentifier>
implements Membership<ID
}
}
- if (cleanupTimer != null) {
+ if (cleanupTimer != null && !cleanupTimer.isShutdown()) {
cleanupTimer.shutdown();
}
diff --git
a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/locator/MembershipLocatorImpl.java
b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/locator/MembershipLocatorImpl.java
index 9009d66..999e6cb 100644
---
a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/locator/MembershipLocatorImpl.java
+++
b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/locator/MembershipLocatorImpl.java
@@ -23,6 +23,7 @@ import java.net.SocketAddress;
import java.net.UnknownHostException;
import java.nio.file.Path;
import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;
import org.apache.logging.log4j.Logger;
@@ -181,10 +182,11 @@ public class MembershipLocatorImpl<ID extends
MemberIdentifier> implements Membe
}
boolean interrupted = Thread.interrupted();
+ long waitTimeMillis = TcpServer.SHUTDOWN_WAIT_TIME * 2;
try {
// TcpServer up to SHUTDOWN_WAIT_TIME for its executor pool to shut
down.
// We wait 2 * SHUTDOWN_WAIT_TIME here to account for that shutdown,
and then our own.
- waitToShutdown(TcpServer.SHUTDOWN_WAIT_TIME * 2);
+ waitToShutdown(waitTimeMillis);
} catch (InterruptedException ex) {
interrupted = true;
@@ -198,7 +200,8 @@ public class MembershipLocatorImpl<ID extends
MemberIdentifier> implements Membe
}
if (isAlive()) {
- logger.fatal("Could not stop {} in 60 seconds", this);
+ logger.fatal("Could not stop {} in {} seconds", this,
+ TimeUnit.MILLISECONDS.toSeconds(waitTimeMillis));
}
}
}
diff --git
a/geode-tcp-server/src/main/java/org/apache/geode/distributed/internal/tcpserver/TcpServer.java
b/geode-tcp-server/src/main/java/org/apache/geode/distributed/internal/tcpserver/TcpServer.java
index 26793e0..5c25ac8 100755
---
a/geode-tcp-server/src/main/java/org/apache/geode/distributed/internal/tcpserver/TcpServer.java
+++
b/geode-tcp-server/src/main/java/org/apache/geode/distributed/internal/tcpserver/TcpServer.java
@@ -164,7 +164,9 @@ public class TcpServer {
public void restarting() throws IOException {
this.shuttingDown = false;
startServerThread();
- this.executor = executorServiceSupplier.get();
+ if (this.executor == null || this.executor.isShutdown()) {
+ this.executor = executorServiceSupplier.get();
+ }
logger.info("TcpServer@" + System.identityHashCode(this)
+ " restarting: completed. Server thread=" + this.serverThread + '@'
+ System.identityHashCode(this.serverThread) + ";alive=" +
this.serverThread.isAlive());