This is an automated email from the ASF dual-hosted git repository.
andor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/zookeeper.git
The following commit(s) were added to refs/heads/master by this push:
new 6692d7a ZOOKEEPER-3320: Leader election port stop listen when
hostname unresolvable for some time
6692d7a is described below
commit 6692d7a5b4bc3f0dbd36677c06e782ef5240153a
Author: Igor Skokov <[email protected]>
AuthorDate: Tue Aug 6 12:52:07 2019 +0200
ZOOKEEPER-3320: Leader election port stop listen when hostname unresolvable
for some time
Improvements and fixes of #863
Author: Igor Skokov <[email protected]>
Author: Igor Skokov <[email protected]>
Reviewers: [email protected], [email protected]
Closes #1033 from Lagrang/ZOOKEEPER-3320 and squashes the following commits:
50d64659e [Igor Skokov] ZOOKEEPER-3320: doc fix, rename config property
'zookeeper.electionPortBindRetry' to 'electionPortBindRetry'
fb9cdc57c [Igor Skokov] Merge remote-tracking branch
'lagrang/ZOOKEEPER-3320' into ZOOKEEPER-3320
f95ee187b [Igor Skokov] ZOOKEEPER-3320:
CnxManagerTest.testCnxManagerListenerThreadConfigurableRetry fix
1af098d33 [Igor Skokov] ZOOKEEPER-3320: support custom socket bind error
handler in QuorumCnxManager.Listener
7b222efbe [Igor Skokov] ZOOKEEPER-3320: handle 0 value for
zookeeper.electionPortBindRetry as infinite, fix CnxManagerTest.
testCnxManagerListenerThreadConfigurableRetry to prevent JVM exit during testing
5051b4cdf [Igor Skokov] ZOOKEEPER-3320: fix of test compilation
eeb5c4155 [Igor Skokov] ZOOKEEPER-3320: use existing scheme to stop server
when QuorumCnxManager.Listener fails to bind to election port
587fd95a0 [Igor Skokov] ZOOKEEPER-3320: QuorumCnxManager.Listener extends
ZookeeperCriticalThread, add test to CnxManagerTest to check configurable
retries of leader election port bind
0888a2953 [Igor Skokov] ZOOKEEPER-3320: add documentation for
zookeeper.electionPortBindRetry property
a9a934254 [Igor Skokov] ZOOKEEPER-3320: add validation and logging of
zookeeper.electionPortBindRetry value
da33c1d3a [Igor Skokov] ZOOKEEPER-3320: configurable retry count for
election port bind in QuorumCnxManager.Listener
e25b44551 [Igor Skokov] ZOOKEEPER-3320: support custom socket bind error
handler in QuorumCnxManager.Listener
b4abdc7f2 [Igor Skokov] ZOOKEEPER-3320: handle 0 value for
zookeeper.electionPortBindRetry as infinite, fix CnxManagerTest.
testCnxManagerListenerThreadConfigurableRetry to prevent JVM exit during testing
c1afdf933 [Igor Skokov] Merge branch 'master' into ZOOKEEPER-3320
e9db1e445 [Igor Skokov] ZOOKEEPER-3320: fix of test compilation
a541ee902 [Igor Skokov] Merge branch 'master' into ZOOKEEPER-3320
bb0c77f7a [Igor Skokov] ZOOKEEPER-3320: use existing scheme to stop server
when QuorumCnxManager.Listener fails to bind to election port
914295895 [Igor Skokov] ZOOKEEPER-3320: QuorumCnxManager.Listener extends
ZookeeperCriticalThread, add test to CnxManagerTest to check configurable
retries of leader election port bind
883d35eb0 [Igor Skokov] ZOOKEEPER-3320: add documentation for
zookeeper.electionPortBindRetry property
b448f3603 [Igor Skokov] ZOOKEEPER-3320: add validation and logging of
zookeeper.electionPortBindRetry value
706e1f058 [Igor Skokov] ZOOKEEPER-3320: configurable retry count for
election port bind in QuorumCnxManager.Listener
---
.../src/main/resources/markdown/zookeeperAdmin.md | 13 +++++
.../zookeeper/server/quorum/QuorumCnxManager.java | 57 ++++++++++++++++------
.../zookeeper/server/quorum/CnxManagerTest.java | 31 ++++++++++++
3 files changed, 87 insertions(+), 14 deletions(-)
diff --git a/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md b/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
index 6154eb7..dbf9a08 100644
--- a/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
+++ b/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
@@ -1088,6 +1088,19 @@ As an example, this will enable all four letter word commands:
properly, check your operating system's options regarding TCP
keepalive for more information. Defaults to
**false**.
+
+* *electionPortBindRetry* :
+ (Java system property only: **zookeeper.electionPortBindRetry**)
+ Sets the maximum number of retries when the ZooKeeper server fails to bind
+ the leader election port. Such errors can be temporary and recoverable,
+ such as the DNS issue described in [ZOOKEEPER-3320](https://issues.apache.org/jira/projects/ZOOKEEPER/issues/ZOOKEEPER-3320),
+ or non-retryable, such as the port already being in use.
+ In case of transient errors, this property can improve the availability
+ of the ZooKeeper server and help it recover on its own.
+ The default value is 3. In container environments, especially Kubernetes,
+ this value should be increased or set to 0 (infinite retries) to overcome issues
+ related to DNS name resolution.
+
* *observer.reconnectDelayMs* :
(Java system property: **zookeeper.observer.reconnectDelayMs**)
diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java
index 3b6133a..5039d83 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java
@@ -18,6 +18,8 @@
package org.apache.zookeeper.server.quorum;
+import static org.apache.zookeeper.common.NetUtils.formatInetAddr;
+
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
@@ -36,6 +38,7 @@ import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Map;
+import java.util.NoSuchElementException;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
@@ -43,24 +46,20 @@ import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
-import java.util.NoSuchElementException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
-
+import javax.net.ssl.SSLSocket;
import org.apache.zookeeper.common.X509Exception;
import org.apache.zookeeper.server.ExitCode;
-import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException;
-import org.apache.zookeeper.server.util.ConfigUtils;
import org.apache.zookeeper.server.ZooKeeperThread;
+import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException;
import org.apache.zookeeper.server.quorum.auth.QuorumAuthLearner;
import org.apache.zookeeper.server.quorum.auth.QuorumAuthServer;
import org.apache.zookeeper.server.quorum.flexible.QuorumVerifier;
+import org.apache.zookeeper.server.util.ConfigUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import javax.net.ssl.SSLSocket;
-import static org.apache.zookeeper.common.NetUtils.formatInetAddr;
-
/**
* This class implements a connection manager for leader election using TCP. It
* maintains one connection for every pair of servers. The tricky part is to
@@ -848,12 +847,39 @@ public class QuorumCnxManager {
*/
public class Listener extends ZooKeeperThread {
+ private static final String ELECTION_PORT_BIND_RETRY =
"zookeeper.electionPortBindRetry";
+ private static final int DEFAULT_PORT_BIND_MAX_RETRY = 3;
+
+ private final int portBindMaxRetry;
+ private Runnable socketBindErrorHandler = () ->
System.exit(ExitCode.UNABLE_TO_BIND_QUORUM_PORT.getValue());
volatile ServerSocket ss = null;
public Listener() {
// During startup of thread, thread name will be overridden to
// specific election address
super("ListenerThread");
+
+ // maximum retry count while trying to bind to election port
+ // see ZOOKEEPER-3320 for more details
+ final Integer maxRetry =
Integer.getInteger(ELECTION_PORT_BIND_RETRY,
+
DEFAULT_PORT_BIND_MAX_RETRY);
+ if (maxRetry >= 0) {
+ LOG.info("Election port bind maximum retries is {}",
+ maxRetry == 0 ? "infinite" : maxRetry);
+ portBindMaxRetry = maxRetry;
+ } else {
+ LOG.info("'{}' contains invalid value: {}(must be >= 0). "
+ + "Use default value of {} instead.",
+ ELECTION_PORT_BIND_RETRY, maxRetry,
DEFAULT_PORT_BIND_MAX_RETRY);
+ portBindMaxRetry = DEFAULT_PORT_BIND_MAX_RETRY;
+ }
+ }
+
+ /**
+ * Change socket bind error handler. Used for testing.
+ */
+ void setSocketBindErrorHandler(Runnable errorHandler) {
+ this.socketBindErrorHandler = errorHandler;
}
/**
@@ -865,7 +891,7 @@ public class QuorumCnxManager {
InetSocketAddress addr;
Socket client = null;
Exception exitException = null;
- while((!shutdown) && (numRetries < 3)){
+ while ((!shutdown) && (portBindMaxRetry == 0 || numRetries <
portBindMaxRetry)) {
try {
if (self.shouldUsePortUnification()) {
LOG.info("Creating TLS-enabled quorum server socket");
@@ -935,15 +961,18 @@ public class QuorumCnxManager {
}
LOG.info("Leaving listener");
if (!shutdown) {
- LOG.error("As I'm leaving the listener thread, "
- + "I won't be able to participate in leader "
- + "election any longer: "
- + formatInetAddr(self.getElectionAddress()));
- if (exitException instanceof BindException) {
+ LOG.error("As I'm leaving the listener thread after "
+ + numRetries + " errors. "
+ + "I won't be able to participate in leader "
+ + "election any longer: "
+ + formatInetAddr(self.getElectionAddress())
+ + ". Use " + ELECTION_PORT_BIND_RETRY + " property
to "
+ + "increase retry count.");
+ if (exitException instanceof SocketException) {
// After leaving listener thread, the host cannot join the
// quorum anymore, this is a severe error that we cannot
// recover from, so we need to exit
-
System.exit(ExitCode.UNABLE_TO_BIND_QUORUM_PORT.getValue());
+ socketBindErrorHandler.run();
}
} else if (ss != null) {
// Clean up for shutdown.
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java
index 878e41b..276f35f 100644
--- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java
@@ -36,6 +36,7 @@ import java.util.Random;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.net.Socket;
+import java.util.concurrent.atomic.AtomicBoolean;
import javax.net.ssl.SSLSession;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.HandshakeCompletedListener;
@@ -291,6 +292,36 @@ public class CnxManagerTest extends ZKTestCase {
}
/**
+ * Test for the bug described in <a href="https://issues.apache.org/jira/browse/ZOOKEEPER-3320">ZOOKEEPER-3320</a>.
+ * Creates a peer whose address contains an unresolvable DNS name; the
+ * leader election listener thread should stop after N errors.
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testCnxManagerListenerThreadConfigurableRetry() throws
Exception {
+ final Map<Long,QuorumServer> unresolvablePeers = new HashMap<>();
+ final long myid = 1L;
+ unresolvablePeers.put(myid, new QuorumServer(myid,
"unresolvable-domain.org:2182:2183;2181"));
+ final QuorumPeer peer = new QuorumPeer(unresolvablePeers,
+ ClientBase.createTmpDir(),
+ ClientBase.createTmpDir(),
+ 2181, 3, myid, 1000, 2, 2, 2);
+ final QuorumCnxManager cnxManager = peer.createCnxnManager();
+ final QuorumCnxManager.Listener listener = cnxManager.listener;
+ final AtomicBoolean errorHappend = new AtomicBoolean();
+ listener.setSocketBindErrorHandler(() -> errorHappend.set(true));
+ listener.start();
+ // listener thread should stop and throws error which notify
QuorumPeer about error.
+ // QuorumPeer should start shutdown process
+ listener.join(15000); // set wait time, if listener contains bug and
thread not stops.
+ Assert.assertFalse(listener.isAlive());
+ Assert.assertTrue(errorHappend.get());
+ Assert.assertFalse(QuorumPeer.class.getSimpleName() + " not stopped
after "
+ + "listener thread death", listener.isAlive());
+ }
+
+ /**
* Tests a bug in QuorumCnxManager that causes a NPE when a 3.4.6
* observer connects to a 3.5.0 server.
* see https://issues.apache.org/jira/browse/ZOOKEEPER-1789