Repository: incubator-impala Updated Branches: refs/heads/master ba8c135ef -> b3636c97d
IMPALA-4684: Handle Zookeeper ConnectionLoss exceptions This is the second patch to address IMPALA-4684. The first patch exposed a transient Zookeeper connection error on RHEL7. This patch introduces a retry (up to 3 times), and somewhat better logging. Tested by running tests against an RHEL7 instance and confirming that all HBase nodes start up. Change-Id: I44b4eec342addcfe489f94c332bbe14225c9968c Reviewed-on: http://gerrit.cloudera.org:8080/5554 Reviewed-by: Alex Behm <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/226a2e63 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/226a2e63 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/226a2e63 Branch: refs/heads/master Commit: 226a2e63321e9bcf4ba906512fc40e35b98db252 Parents: ba8c135 Author: David Knupp <[email protected]> Authored: Tue Dec 20 15:06:15 2016 -0800 Committer: Internal Jenkins <[email protected]> Committed: Thu Dec 22 01:18:56 2016 +0000 ---------------------------------------------------------------------- testdata/bin/check-hbase-nodes.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/226a2e63/testdata/bin/check-hbase-nodes.py ---------------------------------------------------------------------- diff --git a/testdata/bin/check-hbase-nodes.py b/testdata/bin/check-hbase-nodes.py index 28ac0c1..ffe7a7c 100755 --- a/testdata/bin/check-hbase-nodes.py +++ b/testdata/bin/check-hbase-nodes.py @@ -30,7 +30,7 @@ import time from contextlib import closing from kazoo.client import KazooClient -from kazoo.exceptions import NoNodeError +from kazoo.exceptions import NoNodeError, ConnectionLoss from kazoo.handlers.threading import KazooTimeoutError LOGGER = 
logging.getLogger('hbase_check') @@ -43,6 +43,7 @@ HDFS_HOST = '127.0.0.1:5070' ZK_HOSTS = '127.0.0.1:2181' HBASE_NODES = ['/hbase/master', '/hbase/rs'] ADMIN_USER = 'admin' +MAX_ZOOKEEPER_CONNECTION_RETRIES = 3 def parse_args(): @@ -128,14 +129,31 @@ def check_znodes_list_for_errors(nodes, zookeeper_hosts, timeout): timeout_seconds: Number of seconds to attempt to get node Returns: - 0 success, or else the number of unresponsive nodes + 0 success, or else the number of errors """ - with closing(connect_to_zookeeper(zookeeper_hosts, timeout)) as zk_client: - try: - errors = sum([check_znode(node, zk_client, timeout) for node in nodes]) - finally: - zk_client.stop() - return errors + connection_retries = 0 + + while True: + with closing(connect_to_zookeeper(zookeeper_hosts, timeout)) as zk_client: + try: + return sum([check_znode(node, zk_client, timeout) for node in nodes]) + except ConnectionLoss as e: + connection_retries += 1 + if connection_retries > MAX_ZOOKEEPER_CONNECTION_RETRIES: + LOGGER.error("Max connection retries exceeded: {0}".format(str(e))) + raise + else: + err_msg = ("Zookeeper connection loss: retrying connection " + "({0} of {1} attempts)") + LOGGER.warn(err_msg.format(connection_retries, + MAX_ZOOKEEPER_CONNECTION_RETRIES)) + time.sleep(1) + except Exception as e: + LOGGER.error("Unexpected error checking HBase node: {0}".format(str(e))) + raise + finally: + LOGGER.info("Stopping Zookeeper client") + zk_client.stop() def is_hdfs_running(host, admin_user):
