Repository: incubator-impala Updated Branches: refs/heads/master c7fa03286 -> 9b3f43b9f
IMPALA-2013: Reintroduce steps for checking HBase health in run-hbase.sh We used to include a step in run-hbase.sh for calling a python script that queried Zookeeper to see if the HBase master was up. The original script was problematic, so we stopped using it during our mini-cluster HBase start up procedure. HBase start up issues continue to plague us, however. This patch reintroduces a Zookeeper check, with the following updates: - replace the original script with check-hbase-nodes.py - query the correct node /hbase/master, not just /hbase/rs - use the python Zookeeper library kazoo, rather than calling out to the shell and parsing the return string - since we are moving toward testing on a remote cluster, also add the capability to pass in the address for the host that provides the Zookeeper and HBase services - add an additional check that the HDFS service is running, because of an edge case where the HBase master can briefly start without a cluster running. In addition to the expected tests, this script was also tested under the conditions of IMPALA-4088, whereby the HBase RegionServer is running, but the master fails because another listening process has already taken its TCP port (60010) during startup. Change-Id: I9b81f3cfb6ea0ba7b18ce5fcd5d268f515c8b0c3 Reviewed-on: http://gerrit.cloudera.org:8080/4348 Reviewed-by: Alex Behm <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/a42d18dc Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/a42d18dc Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/a42d18dc Branch: refs/heads/master Commit: a42d18dcc316ac2dbc32edcc2774272f0bdf469c Parents: c7fa032 Author: David Knupp <[email protected]> Authored: Thu Sep 8 16:42:44 2016 -0700 Committer: Internal Jenkins <[email protected]> Committed: Thu Sep 15 00:02:22 2016 +0000 ---------------------------------------------------------------------- infra/python/deps/requirements.txt | 1 + testdata/bin/check-hbase-nodes.py | 174 +++++++++++++++++++++++++++++ testdata/bin/run-hbase.sh | 1 + testdata/bin/wait-for-hbase-master.py | 59 ---------- 4 files changed, 176 insertions(+), 59 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a42d18dc/infra/python/deps/requirements.txt ---------------------------------------------------------------------- diff --git a/infra/python/deps/requirements.txt b/infra/python/deps/requirements.txt index 1c87f8e..b433713 100644 --- a/infra/python/deps/requirements.txt +++ b/infra/python/deps/requirements.txt @@ -54,6 +54,7 @@ impyla == 0.11.2 # before thirdparty is built thrift will be installed anyways. thrift == 0.9.0 thrift_sasl == 0.1.0 +kazoo == 2.2.1 monkeypatch == 0.1rc3 ordereddict == 1.1 pexpect == 3.3 http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a42d18dc/testdata/bin/check-hbase-nodes.py ---------------------------------------------------------------------- diff --git a/testdata/bin/check-hbase-nodes.py b/testdata/bin/check-hbase-nodes.py new file mode 100755 index 0000000..999a657 --- /dev/null +++ b/testdata/bin/check-hbase-nodes.py @@ -0,0 +1,174 @@ +#!/usr/bin/env impala-python +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Given a series of hosts and Zookeeper nodes, make sure that each node is accessible. +""" + +import argparse +import hdfs +import logging +import pprint +import requests +import sys +import time + +from contextlib import closing +from kazoo.client import KazooClient +from kazoo.exceptions import NoNodeError +from kazoo.handlers.threading import KazooTimeoutError + +LOGGER = logging.getLogger('hbase_check') +LOGGER.addHandler(logging.StreamHandler()) +LOGGER.setLevel(logging.INFO) + +TIMEOUT_SECONDS = 30 + +HDFS_HOST = '127.0.0.1:5070' +ZK_HOSTS = '127.0.0.1:2181' +HBASE_NODES = ['/hbase/master', '/hbase/rs'] +ADMIN_USER = 'admin' + + +def parse_args(): + """Parse and return command line args.""" + parser = argparse.ArgumentParser() + + parser.add_argument('--timeout', '-t', type=int, default=TIMEOUT_SECONDS, + help=('Number of seconds to try to get znode before giving up. ' + 'Default is {0} seconds.'.format(TIMEOUT_SECONDS))) + + parser.add_argument('--hdfs_host', '-s', default=HDFS_HOST, + help=('Host:port where the HDFS web host is running, ' + 'e.g, 0.0.0.0:5070. Default is {0}.'.format(HDFS_HOST))) + + parser.add_argument('--admin_user', '-u', default=ADMIN_USER, + help='Cluster admin username. Default is {0}.'.format(ADMIN_USER)) + + parser.add_argument('--zookeeper_hosts', '-z', default=ZK_HOSTS, + help=('Comma-delineated string of hosts in host:PORT format, ' + 'e.g, 0.0.0.0:2181. Default is {0}.'.format(ZK_HOSTS))) + + parser.add_argument('-node', '-n', action='append', dest='nodes', + default=HBASE_NODES, + help=('HBase znode to check. Can be specified multiple times. ' + 'Defaults are -n {0}.'.format(' -n '.join(HBASE_NODES)))) + return parser.parse_args() + + +def connect_to_zookeeper(host_list, timeout_seconds): + """Connect to Zookeeper service. + + Args: + host_list: Comma-separated string of hosts in host:port format + timeout_seconds: Number of seconds to attempt to connect to host + + Returns: + KazooClient instance + """ + zk_client = KazooClient(hosts=host_list) + + try: + LOGGER.info("Connecting to Zookeeper host(s).") + zk_client.start(timeout=timeout_seconds) + LOGGER.info("Success: " + str(zk_client)) + return zk_client + except KazooTimeoutError as e: + LOGGER.error("Could not connect to Zookeeper: " + str(e)) + sys.exit(1) + + +def check_znode(node, zk_client, timeout_seconds): + """Given a Zookeeper client and a node, check that the node is up. 
+ + Args: + node: name of a znode as a string, e.g., /hbase/rs + zk_client: Zookeeper client object + timeout_seconds: Number of seconds to attempt to get node + + Returns: + 0 success, 1 on failure + """ + start_time = time.time() + while (time.time() - start_time) < timeout_seconds: + LOGGER.info("Waiting for HBase node: " + node) + try: + node_info = zk_client.get(node) + LOGGER.info("Success: " + node) + LOGGER.debug(pprint.pformat(node_info)) + return 0 + except NoNodeError: + time.sleep(1) + + LOGGER.error("Failed while checking for HBase node: " + node) + return 1 + + +def check_znodes_list_for_errors(nodes, zookeeper_hosts, timeout): + """Confirm that the given list of znodes are responsive. + + Args: + zk_client: Zookeeper client object + node: name of a znode as a string, e.g., /hbase/rs + timeout_seconds: Number of seconds to attempt to get node + + Returns: + 0 success, or else the number of unresponsive nodes + """ + with closing(connect_to_zookeeper(zookeeper_hosts, timeout)) as zk_client: + errors = sum([check_znode(node, zk_client, timeout) for node in nodes]) + zk_client.stop() + return errors + + +def is_hdfs_running(host, admin_user): + """Confirm that HDFS is available. + + There is a pathological case where the HBase master can start up briefly if HDFS is not + available, and then quit immediately, but that can be long enough to give a false positive + that the HBase master is running. + + Args: + host: HDFS host:port + admin_user: Admin username + + Returns: + Boolean + """ + try: + hdfs_client = hdfs.InsecureClient('http://' + host, user=admin_user) + LOGGER.info("Contents of HDFS root: {0}".format(hdfs_client.list('/'))) + return True + except (requests.exceptions.ConnectionError, hdfs.util.HdfsError) as e: + msg = 'Could not confirm HDFS is running at http://{0} - {1}'.format(host, e) + LOGGER.error(msg) + return False + + +if __name__ == "__main__": + args = parse_args() + + if is_hdfs_running(args.hdfs_host, args.admin_user): + errors = check_znodes_list_for_errors(args.nodes, args.zookeeper_hosts, args.timeout) + + if errors > 0: + msg = "Could not get one or more nodes. Exiting with errors: {0}".format(errors) + LOGGER.error(msg) + sys.exit(errors) + else: + sys.exit(1) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a42d18dc/testdata/bin/run-hbase.sh ---------------------------------------------------------------------- diff --git a/testdata/bin/run-hbase.sh b/testdata/bin/run-hbase.sh index 29ff77d..2a51105 100755 --- a/testdata/bin/run-hbase.sh +++ b/testdata/bin/run-hbase.sh @@ -126,4 +126,5 @@ for ((i=1; i <= HBASE_START_RETRY_ATTEMPTS; ++i)); do fi done +${CLUSTER_BIN}/check-hbase-nodes.py echo "HBase startup scripts succeeded" http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a42d18dc/testdata/bin/wait-for-hbase-master.py ---------------------------------------------------------------------- diff --git a/testdata/bin/wait-for-hbase-master.py b/testdata/bin/wait-for-hbase-master.py deleted file mode 100755 index 428949e..0000000 --- a/testdata/bin/wait-for-hbase-master.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env impala-python -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from subprocess import Popen, PIPE -import os -import re -import sys -import time - -now = time.time() -TIMEOUT_SECONDS = 30.0 -ZK_CLASS="org.apache.zookeeper.ZooKeeperMain" - -print "Waiting for HBase Master" - -while time.time() - now < TIMEOUT_SECONDS: - sys.stdout.write(".") - sys.stdout.flush() - - p = Popen([os.environ["JAVA"], - ZK_CLASS, - "-server", - "localhost:2181", - "get", - "/hbase/rs"], stderr=PIPE, stdout=PIPE) - out, err = p.communicate() - if re.match(".*" + ZK_CLASS + "\w*$", err): - print "Failure" - print err - print "CLASSPATH does not contain " + ZK_CLASS - print "Please check your CLASSPATH" - exit(1) - - if "numChildren" in err: - print "Success" - print "HBase master is up, found in %2.1fs" % (time.time() - now,) - exit(0) - - time.sleep(0.5) - -print "Failure" -print "Hbase master did NOT write /hbase/rs in %2.1fs" % (time.time() - now,) -exit(1)
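For anyone who wants to probe the znodes by hand, the heart of the new check is only a few lines of kazoo. The sketch below is not part of the commit; it assumes the same mini-cluster defaults that check-hbase-nodes.py uses (Zookeeper on 127.0.0.1:2181, znode /hbase/master):

    from kazoo.client import KazooClient
    from kazoo.exceptions import NoNodeError

    zk = KazooClient(hosts='127.0.0.1:2181')
    zk.start(timeout=30)  # raises KazooTimeoutError if Zookeeper is unreachable
    try:
        # get() raises NoNodeError until the HBase master has registered its znode
        data, stat = zk.get('/hbase/master')
        print('HBase master znode is up, %d bytes of data' % len(data))
    except NoNodeError:
        print('HBase master znode is not registered yet')
    finally:
        zk.stop()
        zk.close()

Against a remote cluster the same check goes through the script's new flags, e.g. ./testdata/bin/check-hbase-nodes.py -z zkhost:2181 -s namenode:5070, where the host names are placeholders for the machine providing the Zookeeper and HDFS services.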

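The HDFS guard covers the edge case where the HBase master comes up briefly even though the cluster is down, and it can be exercised on its own in the same way. A minimal sketch of the WebHDFS check that is_hdfs_running() performs, assuming the script's defaults (127.0.0.1:5070, user 'admin') rather than quoting the committed code:

    import hdfs
    import requests

    def hdfs_is_up(host='127.0.0.1:5070', user='admin'):
        """Return True if the namenode answers a listing of '/', else False."""
        try:
            client = hdfs.InsecureClient('http://' + host, user=user)
            client.list('/')  # any successful response means HDFS is serving requests
            return True
        except (requests.exceptions.ConnectionError, hdfs.util.HdfsError):
            return False

    if __name__ == '__main__':
        print('HDFS running: %s' % hdfs_is_up())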