Repository: incubator-impala
Updated Branches:
  refs/heads/master ba8c135ef -> b3636c97d


IMPALA-4684: Handle Zookeeper ConnectionLoss exceptions

This is the second patch to address IMPALA-4684. The first patch exposed
a transient Zookeeper connection error on RHEL7. This patch introduces a
retry (up to 3 times), and somewhat better logging.

Tested by running tests against an RHEL7 instance and confirming that
all HBase nodes start up.

Change-Id: I44b4eec342addcfe489f94c332bbe14225c9968c
Reviewed-on: http://gerrit.cloudera.org:8080/5554
Reviewed-by: Alex Behm <[email protected]>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/226a2e63
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/226a2e63
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/226a2e63

Branch: refs/heads/master
Commit: 226a2e63321e9bcf4ba906512fc40e35b98db252
Parents: ba8c135
Author: David Knupp <[email protected]>
Authored: Tue Dec 20 15:06:15 2016 -0800
Committer: Internal Jenkins <[email protected]>
Committed: Thu Dec 22 01:18:56 2016 +0000

----------------------------------------------------------------------
 testdata/bin/check-hbase-nodes.py | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/226a2e63/testdata/bin/check-hbase-nodes.py
----------------------------------------------------------------------
diff --git a/testdata/bin/check-hbase-nodes.py 
b/testdata/bin/check-hbase-nodes.py
index 28ac0c1..ffe7a7c 100755
--- a/testdata/bin/check-hbase-nodes.py
+++ b/testdata/bin/check-hbase-nodes.py
@@ -30,7 +30,7 @@ import time
 
 from contextlib import closing
 from kazoo.client import KazooClient
-from kazoo.exceptions import NoNodeError
+from kazoo.exceptions import NoNodeError, ConnectionLoss
 from kazoo.handlers.threading import KazooTimeoutError
 
 LOGGER = logging.getLogger('hbase_check')
@@ -43,6 +43,7 @@ HDFS_HOST = '127.0.0.1:5070'
 ZK_HOSTS = '127.0.0.1:2181'
 HBASE_NODES = ['/hbase/master', '/hbase/rs']
 ADMIN_USER = 'admin'
+MAX_ZOOKEEPER_CONNECTION_RETRIES = 3
 
 
 def parse_args():
@@ -128,14 +129,31 @@ def check_znodes_list_for_errors(nodes, zookeeper_hosts, 
timeout):
         timeout_seconds: Number of seconds to attempt to get node
 
     Returns:
-        0 success, or else the number of unresponsive nodes
+        0 success, or else the number of errors
     """
-    with closing(connect_to_zookeeper(zookeeper_hosts, timeout)) as zk_client:
-        try:
-            errors = sum([check_znode(node, zk_client, timeout) for node in 
nodes])
-        finally:
-            zk_client.stop()
-    return errors
+    connection_retries = 0
+
+    while True:
+        with closing(connect_to_zookeeper(zookeeper_hosts, timeout)) as 
zk_client:
+            try:
+                return sum([check_znode(node, zk_client, timeout) for node in 
nodes])
+            except ConnectionLoss as e:
+                connection_retries += 1
+                if connection_retries > MAX_ZOOKEEPER_CONNECTION_RETRIES:
+                    LOGGER.error("Max connection retries exceeded: 
{0}".format(str(e)))
+                    raise
+                else:
+                    err_msg = ("Zookeeper connection loss: retrying connection 
"
+                               "({0} of {1} attempts)")
+                    LOGGER.warn(err_msg.format(connection_retries,
+                                               
MAX_ZOOKEEPER_CONNECTION_RETRIES))
+                    time.sleep(1)
+            except Exception as e:
+                LOGGER.error("Unexpected error checking HBase node: 
{0}".format(str(e)))
+                raise
+            finally:
+                LOGGER.info("Stopping Zookeeper client")
+                zk_client.stop()
 
 
 def is_hdfs_running(host, admin_user):

Reply via email to