This is an automated email from the ASF dual-hosted git repository.
nkalmar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/zookeeper.git
The following commit(s) were added to refs/heads/master by this push:
new 05ee941 ZOOKEEPER-3320: Leader election port stop listen when
hostname unresolvable for some time
05ee941 is described below
commit 05ee9413e7a31703395b81fb8d72baf1cb09a46d
Author: Igor Skokov <[email protected]>
AuthorDate: Mon Jul 29 11:49:27 2019 +0200
ZOOKEEPER-3320: Leader election port stop listen when hostname unresolvable
for some time
Author: Igor Skokov <[email protected]>
Author: Igor Skokov <[email protected]>
Reviewers: Enrico Olivelli <[email protected]>, Norbert Kalmar
<[email protected]>
Closes #863 from Lagrang/ZOOKEEPER-3320
---
.../src/main/resources/markdown/zookeeperAdmin.md | 12 ++++++
.../zookeeper/server/quorum/QuorumCnxManager.java | 46 ++++++++++++++++------
.../zookeeper/server/quorum/CnxManagerTest.java | 29 ++++++++++++++
3 files changed, 74 insertions(+), 13 deletions(-)
diff --git a/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
b/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
index 1690ce6..d38afd2 100644
--- a/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
+++ b/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
@@ -1076,6 +1076,18 @@ As an example, this will enable all four letter word
commands:
properly, check your operating system's options regarding TCP
keepalive for more information. Defaults to
**false**.
+
+* *zookeeper.electionPortBindRetry* :
+ (Java system property only: **zookeeper.electionPortBindRetry**)
+ Sets the maximum retry count used when the ZooKeeper server fails to bind
+ the leader election port. Such errors can be temporary and recoverable,
+ such as the DNS issue described in
[ZOOKEEPER-3320](https://issues.apache.org/jira/projects/ZOOKEEPER/issues/ZOOKEEPER-3320),
+ or non-retryable, such as the port already being in use.
+ In case of transient errors, this property can improve the availability
+ of the ZooKeeper server and help it recover on its own.
+ The default value is 3. In container environments, especially Kubernetes,
+ this value should be increased to overcome issues related to DNS name
resolution.
+
* *observer.reconnectDelayMs* :
(Java system property: **zookeeper.observer.reconnectDelayMs**)
diff --git
a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java
b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java
index d97da2a..4be8fa6 100644
---
a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java
+++
b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java
@@ -18,6 +18,8 @@
package org.apache.zookeeper.server.quorum;
+import static org.apache.zookeeper.common.NetUtils.formatInetAddr;
+
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
@@ -36,6 +38,7 @@ import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Map;
+import java.util.NoSuchElementException;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
@@ -43,24 +46,20 @@ import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
-import java.util.NoSuchElementException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
-
+import javax.net.ssl.SSLSocket;
import org.apache.zookeeper.common.X509Exception;
import org.apache.zookeeper.server.ExitCode;
-import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException;
-import org.apache.zookeeper.server.util.ConfigUtils;
import org.apache.zookeeper.server.ZooKeeperThread;
+import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException;
import org.apache.zookeeper.server.quorum.auth.QuorumAuthLearner;
import org.apache.zookeeper.server.quorum.auth.QuorumAuthServer;
import org.apache.zookeeper.server.quorum.flexible.QuorumVerifier;
+import org.apache.zookeeper.server.util.ConfigUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import javax.net.ssl.SSLSocket;
-import static org.apache.zookeeper.common.NetUtils.formatInetAddr;
-
/**
* This class implements a connection manager for leader election using TCP. It
* maintains one connection for every pair of servers. The tricky part is to
@@ -848,12 +847,30 @@ public class QuorumCnxManager {
*/
public class Listener extends ZooKeeperThread {
+ private static final String ELECTION_PORT_BIND_RETRY =
"zookeeper.electionPortBindRetry";
+ private static final int DEFAULT_PORT_BIND_MAX_RETRY = 3;
+
+ private final int portBindMaxRetry;
volatile ServerSocket ss = null;
public Listener() {
// During startup of thread, thread name will be overridden to
// specific election address
super("ListenerThread");
+
+ // maximum retry count while trying to bind to election port
+ // see ZOOKEEPER-3320 for more details
+ final Integer maxRetry =
Integer.getInteger(ELECTION_PORT_BIND_RETRY,
+
DEFAULT_PORT_BIND_MAX_RETRY);
+ if (maxRetry >= 0) {
+ LOG.info("Election port bind maximum retries is {}", maxRetry);
+ portBindMaxRetry = maxRetry;
+ } else {
+ LOG.info("'{}' contains invalid value: {}(must be >= 0). "
+ + "Use default value of {} instead.",
+ ELECTION_PORT_BIND_RETRY, maxRetry,
DEFAULT_PORT_BIND_MAX_RETRY);
+ portBindMaxRetry = DEFAULT_PORT_BIND_MAX_RETRY;
+ }
}
/**
@@ -865,7 +882,7 @@ public class QuorumCnxManager {
InetSocketAddress addr;
Socket client = null;
Exception exitException = null;
- while((!shutdown) && (numRetries < 3)){
+ while((!shutdown) && (numRetries < portBindMaxRetry)){
try {
if (self.shouldUsePortUnification()) {
LOG.info("Creating TLS-enabled quorum server socket");
@@ -935,11 +952,14 @@ public class QuorumCnxManager {
}
LOG.info("Leaving listener");
if (!shutdown) {
- LOG.error("As I'm leaving the listener thread, "
- + "I won't be able to participate in leader "
- + "election any longer: "
- + formatInetAddr(self.getElectionAddress()));
- if (exitException instanceof BindException) {
+ LOG.error("As I'm leaving the listener thread after "
+ + numRetries + " errors. "
+ + "I won't be able to participate in leader "
+ + "election any longer: "
+ + formatInetAddr(self.getElectionAddress())
+ + ". Use " + ELECTION_PORT_BIND_RETRY + " property
to "
+ + "increase retry count.");
+ if (exitException instanceof SocketException) {
// After leaving listener thread, the host cannot join the
// quorum anymore, this is a severe error that we cannot
// recover from, so we need to exit
diff --git
a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java
b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java
index 878e41b..200ed99 100644
---
a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java
+++
b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java
@@ -291,6 +291,35 @@ public class CnxManagerTest extends ZKTestCase {
}
/**
+ * Test for the bug described in <a
href="https://issues.apache.org/jira/browse/ZOOKEEPER-3320">ZOOKEEPER-3320</a>.
+ * Creates a peer whose address contains an unresolvable DNS name; the
+ * leader election listener thread should stop after N errors.
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testCnxManagerListenerThreadConfigurableRetry() throws
Exception {
+ final Map<Long,QuorumServer> unresolvablePeers = new HashMap<>();
+ final long myid = 1L;
+ unresolvablePeers.put(myid, new QuorumServer(myid,
"unresolvable-domain.org:2182:2183;2181"));
+ final QuorumPeer peer = new QuorumPeer(unresolvablePeers,
+ ClientBase.createTmpDir(),
+ ClientBase.createTmpDir(),
+ 2181, 3, myid, 1000, 2, 2, 2);
+ final QuorumCnxManager cnxManager = peer.createCnxnManager();
+ QuorumCnxManager.Listener listener = cnxManager.listener;
+ listener.start();
+ // listener thread should stop and throws error which notify
QuorumPeer about error.
+ // QuorumPeer should start shutdown process
+ listener.join(15000); // set wait time, if listener contains bug and
thread not stops.
+ Assert.assertFalse(listener.isAlive());
+ Assert.assertFalse(peer.isRunning());
+ peer.join(15000);
+ Assert.assertFalse(QuorumPeer.class.getSimpleName() + " not stopped
after "
+ + "listener thread death", listener.isAlive());
+ }
+
+ /**
* Tests a bug in QuorumCnxManager that causes a NPE when a 3.4.6
* observer connects to a 3.5.0 server.
* see https://issues.apache.org/jira/browse/ZOOKEEPER-1789