[GitHub] zookeeper pull request #456: ZOOKEEPER-2930: Leader cannot be elected due to...
Github user JonathanO commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/456#discussion_r172802731 --- Diff: src/java/main/org/apache/zookeeper/server/quorum/QuorumCnxManager.java --- @@ -318,76 +318,167 @@ public Thread newThread(Runnable r) { */ public void testInitiateConnection(long sid) throws Exception { LOG.debug("Opening channel to server " + sid); -Socket sock = new Socket(); -setSockOpts(sock); -sock.connect(self.getVotingView().get(sid).electionAddr, cnxTO); -initiateConnection(sock, sid); +initiateConnection(sid, self.getVotingView().get(sid).electionAddr); +} + +private Socket openChannel(long sid, InetSocketAddress electionAddr) { +LOG.debug("Opening channel to server " + sid); +try { +final Socket sock = new Socket(); +setSockOpts(sock); +sock.connect(electionAddr, cnxTO); +LOG.debug("Connected to server " + sid); +return sock; +} catch (UnresolvedAddressException e) { +// Sun doesn't include the address that causes this +// exception to be thrown, also UAE cannot be wrapped cleanly +// so we log the exception in order to capture this critical +// detail. +LOG.warn("Cannot open channel to " + sid ++ " at election address " + electionAddr, e); +throw e; +} catch (IOException e) { +LOG.warn("Cannot open channel to " + sid ++ " at election address " + electionAddr, +e); +return null; +} } /** * If this server has initiated the connection, then it gives up on the * connection if it loses challenge. Otherwise, it keeps the connection. */ -public void initiateConnection(final Socket sock, final Long sid) { +public boolean initiateConnection(final Long sid, InetSocketAddress electionAddr) { try { -startConnection(sock, sid); -} catch (IOException e) { -LOG.error("Exception while connecting, id: {}, addr: {}, closing learner connection", -new Object[] { sid, sock.getRemoteSocketAddress() }, e); -closeSocket(sock); -return; +Socket sock = openChannel(sid, electionAddr); +if (sock != null) { +try { +startConnection(sock, sid); +} catch (IOException e) { +LOG.error("Exception while connecting, id: {}, addr: {}, closing learner connection", +new Object[]{sid, sock.getRemoteSocketAddress()}, e); +closeSocket(sock); +} +return true; +} else { +return false; +} +} finally { +inprogressConnections.remove(sid); } } -/** - * Server will initiate the connection request to its peer server - * asynchronously via separate connection thread. - */ -public void initiateConnectionAsync(final Socket sock, final Long sid) { +synchronized private void connectOneAsync(final Long sid, final ZooKeeperThread connectorThread) { +if (senderWorkerMap.get(sid) != null) { +LOG.debug("There is a connection already for server " + sid); +return; +} if(!inprogressConnections.add(sid)){ // simply return as there is a connection request to // server 'sid' already in progress. LOG.debug("Connection request to server id: {} is already in progress, so skipping this request", sid); -closeSocket(sock); return; } try { -connectionExecutor.execute( -new QuorumConnectionReqThread(sock, sid)); +connectionExecutor.execute(connectorThread); connectionThreadCnt.incrementAndGet(); } catch (Throwable e) { // Imp: Safer side catching all type of exceptions and remove 'sid' // from inprogress connections. This is to avoid blocking further // connection requests from this 'sid' in case of errors. inprogressConnections.remove(sid); LOG.error("Exception while submitting quorum connection request", e); -closeSocket(sock); } } +/** + * Try to establish a connection to
[GitHub] zookeeper pull request #456: ZOOKEEPER-2930: Leader cannot be elected due to...
Github user JonathanO commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/456#discussion_r169910520 --- Diff: src/java/main/org/apache/zookeeper/server/quorum/QuorumCnxManager.java --- @@ -318,76 +318,167 @@ public Thread newThread(Runnable r) { */ public void testInitiateConnection(long sid) throws Exception { LOG.debug("Opening channel to server " + sid); -Socket sock = new Socket(); -setSockOpts(sock); -sock.connect(self.getVotingView().get(sid).electionAddr, cnxTO); -initiateConnection(sock, sid); +initiateConnection(sid, self.getVotingView().get(sid).electionAddr); +} + +private Socket openChannel(long sid, InetSocketAddress electionAddr) { +LOG.debug("Opening channel to server " + sid); +try { +final Socket sock = new Socket(); +setSockOpts(sock); +sock.connect(electionAddr, cnxTO); +LOG.debug("Connected to server " + sid); +return sock; +} catch (UnresolvedAddressException e) { +// Sun doesn't include the address that causes this +// exception to be thrown, also UAE cannot be wrapped cleanly +// so we log the exception in order to capture this critical +// detail. +LOG.warn("Cannot open channel to " + sid ++ " at election address " + electionAddr, e); +throw e; +} catch (IOException e) { +LOG.warn("Cannot open channel to " + sid ++ " at election address " + electionAddr, +e); +return null; +} } /** * If this server has initiated the connection, then it gives up on the * connection if it loses challenge. Otherwise, it keeps the connection. */ -public void initiateConnection(final Socket sock, final Long sid) { +public boolean initiateConnection(final Long sid, InetSocketAddress electionAddr) { try { -startConnection(sock, sid); -} catch (IOException e) { -LOG.error("Exception while connecting, id: {}, addr: {}, closing learner connection", -new Object[] { sid, sock.getRemoteSocketAddress() }, e); -closeSocket(sock); -return; +Socket sock = openChannel(sid, electionAddr); +if (sock != null) { +try { +startConnection(sock, sid); +} catch (IOException e) { +LOG.error("Exception while connecting, id: {}, addr: {}, closing learner connection", +new Object[]{sid, sock.getRemoteSocketAddress()}, e); +closeSocket(sock); +} +return true; +} else { +return false; +} +} finally { +inprogressConnections.remove(sid); } } -/** - * Server will initiate the connection request to its peer server - * asynchronously via separate connection thread. - */ -public void initiateConnectionAsync(final Socket sock, final Long sid) { +synchronized private void connectOneAsync(final Long sid, final ZooKeeperThread connectorThread) { +if (senderWorkerMap.get(sid) != null) { +LOG.debug("There is a connection already for server " + sid); +return; +} if(!inprogressConnections.add(sid)){ // simply return as there is a connection request to // server 'sid' already in progress. LOG.debug("Connection request to server id: {} is already in progress, so skipping this request", sid); -closeSocket(sock); return; } try { -connectionExecutor.execute( -new QuorumConnectionReqThread(sock, sid)); +connectionExecutor.execute(connectorThread); connectionThreadCnt.incrementAndGet(); } catch (Throwable e) { // Imp: Safer side catching all type of exceptions and remove 'sid' // from inprogress connections. This is to avoid blocking further // connection requests from this 'sid' in case of errors. inprogressConnections.remove(sid); LOG.error("Exception while submitting quorum connection request", e); -closeSocket(sock); } } +/** + * Try to establish a connection to
[GitHub] zookeeper pull request #456: ZOOKEEPER-2930: Leader cannot be elected due to...
Github user fpj commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/456#discussion_r168620297 --- Diff: src/java/main/org/apache/zookeeper/server/quorum/QuorumCnxManager.java --- @@ -318,76 +318,167 @@ public Thread newThread(Runnable r) { */ public void testInitiateConnection(long sid) throws Exception { LOG.debug("Opening channel to server " + sid); -Socket sock = new Socket(); -setSockOpts(sock); -sock.connect(self.getVotingView().get(sid).electionAddr, cnxTO); -initiateConnection(sock, sid); +initiateConnection(sid, self.getVotingView().get(sid).electionAddr); +} + +private Socket openChannel(long sid, InetSocketAddress electionAddr) { +LOG.debug("Opening channel to server " + sid); +try { +final Socket sock = new Socket(); +setSockOpts(sock); +sock.connect(electionAddr, cnxTO); +LOG.debug("Connected to server " + sid); +return sock; +} catch (UnresolvedAddressException e) { +// Sun doesn't include the address that causes this +// exception to be thrown, also UAE cannot be wrapped cleanly +// so we log the exception in order to capture this critical +// detail. +LOG.warn("Cannot open channel to " + sid ++ " at election address " + electionAddr, e); +throw e; +} catch (IOException e) { +LOG.warn("Cannot open channel to " + sid ++ " at election address " + electionAddr, +e); +return null; +} } /** * If this server has initiated the connection, then it gives up on the * connection if it loses challenge. Otherwise, it keeps the connection. */ -public void initiateConnection(final Socket sock, final Long sid) { +public boolean initiateConnection(final Long sid, InetSocketAddress electionAddr) { try { -startConnection(sock, sid); -} catch (IOException e) { -LOG.error("Exception while connecting, id: {}, addr: {}, closing learner connection", -new Object[] { sid, sock.getRemoteSocketAddress() }, e); -closeSocket(sock); -return; +Socket sock = openChannel(sid, electionAddr); +if (sock != null) { +try { +startConnection(sock, sid); +} catch (IOException e) { +LOG.error("Exception while connecting, id: {}, addr: {}, closing learner connection", +new Object[]{sid, sock.getRemoteSocketAddress()}, e); +closeSocket(sock); +} +return true; +} else { +return false; +} +} finally { +inprogressConnections.remove(sid); } } -/** - * Server will initiate the connection request to its peer server - * asynchronously via separate connection thread. - */ -public void initiateConnectionAsync(final Socket sock, final Long sid) { +synchronized private void connectOneAsync(final Long sid, final ZooKeeperThread connectorThread) { +if (senderWorkerMap.get(sid) != null) { +LOG.debug("There is a connection already for server " + sid); +return; +} if(!inprogressConnections.add(sid)){ // simply return as there is a connection request to // server 'sid' already in progress. LOG.debug("Connection request to server id: {} is already in progress, so skipping this request", sid); -closeSocket(sock); return; } try { -connectionExecutor.execute( -new QuorumConnectionReqThread(sock, sid)); +connectionExecutor.execute(connectorThread); connectionThreadCnt.incrementAndGet(); } catch (Throwable e) { // Imp: Safer side catching all type of exceptions and remove 'sid' // from inprogress connections. This is to avoid blocking further // connection requests from this 'sid' in case of errors. inprogressConnections.remove(sid); LOG.error("Exception while submitting quorum connection request", e); -closeSocket(sock); } } +/** + * Try to establish a connection to server
[GitHub] zookeeper pull request #456: ZOOKEEPER-2930: Leader cannot be elected due to...
Github user JonathanO commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/456#discussion_r166036745 --- Diff: src/java/main/org/apache/zookeeper/server/quorum/QuorumCnxManager.java --- @@ -689,15 +669,15 @@ synchronized void connectOne(long sid){ MaplastProposedView = lastSeenQV.getAllMembers(); if (lastCommittedView.containsKey(sid)) { knownId = true; -if (connectOne(sid, lastCommittedView.get(sid).electionAddr)) -return; --- End diff -- This part of the change isn't quite right since it relied on connectOne returning false on an IOException calling sock.connect(). We will no longer attempt to use lastProposedView.get(sid).electionAddr in the case that a connection using the lastCommittedView failed and the electionAddr has changed. I don't know what effect this will have. Maybe I need to move this condition into the async connection mechanism too? ---
[GitHub] zookeeper pull request #456: ZOOKEEPER-2930: Leader cannot be elected due to...
GitHub user JonathanO opened a pull request: https://github.com/apache/zookeeper/pull/456 ZOOKEEPER-2930: Leader cannot be elected due to network timeout of some members. Move sock.connect() into the async connection worker thread. Use initiateConnectionAsync for all connections. This prevents connection delays blocking notifications to other nodes. You can merge this pull request into a Git repository by running: $ git pull https://github.com/transferwise/zookeeper ZOOKEEPER-2930 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/zookeeper/pull/456.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #456 commit 68e327e112b0910e56b9fa2b3ba948e4179adb6b Author: Jonathan OddyDate: 2018-02-02T15:33:50Z ZOOKEEPER-2930: Leader cannot be elected due to network timeout of some members. Move sock.connect() into the async connection worker thread. Use initiateConnectionAsync for all connections. This prevents connection delays blocking notifications to other nodes. ---