Repository: incubator-hawq Updated Branches: refs/heads/master 1469782ed -> e74109bf6
HAWQ-668. hawq check should be able to check yarn settings Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/e74109bf Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/e74109bf Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/e74109bf Branch: refs/heads/master Commit: e74109bf6645a24bbbd2ce37a32d01e981c670e5 Parents: 1469782 Author: rlei <[email protected]> Authored: Wed Apr 13 17:13:02 2016 +0800 Committer: rlei <[email protected]> Committed: Tue Apr 19 17:23:27 2016 +0800 ---------------------------------------------------------------------- src/backend/utils/misc/etc/gpcheck.cnf | 48 +++- tools/bin/gpcheck | 365 ++++++++++++++++++++++++++-- tools/bin/gppylib/gpcheckutil.py | 17 +- tools/bin/hawqpylib/hawqlib.py | 18 ++ tools/sbin/gpcheck_hostdump | 50 +++- 5 files changed, 472 insertions(+), 26 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e74109bf/src/backend/utils/misc/etc/gpcheck.cnf ---------------------------------------------------------------------- diff --git a/src/backend/utils/misc/etc/gpcheck.cnf b/src/backend/utils/misc/etc/gpcheck.cnf index 9ccac0d..9d36de6 100644 --- a/src/backend/utils/misc/etc/gpcheck.cnf +++ b/src/backend/utils/misc/etc/gpcheck.cnf @@ -40,12 +40,11 @@ hard.nproc = 131072 diskusage.monitor.mounts = / diskusage.monitor.usagemax = 90% -[hdfs] +[hdfs.base] dfs.mem.namenode.heap = 40960 dfs.mem.datanode.heap = 6144 # in hdfs-site.xml dfs.support.append = true -dfs.client.enable.read.from.local = true dfs.block.local-path-access.user = gpadmin dfs.datanode.max.transfer.threads = 40960 dfs.client.socket-timeout = 300000000 @@ -54,5 +53,48 @@ dfs.namenode.handler.count = 60 ipc.server.handler.queue.size = 3300 dfs.datanode.handler.count = 60 ipc.client.connection.maxidletime = 3600000 -dfs.namenode.accesstime.precision = -1 +dfs.namenode.accesstime.precision = 0 +dfs.client.read.shortcircuit = true +[hdfs.non] +dfs.block.access.token.enable = FALSE + +[hdfs.ha] +dfs.block.access.token.enable = FALSE + +[hdfs.kerberos] +dfs.block.access.token.enable = TRUE +dfs.datanode.data.dir.perm = 750 + +[hdfs.ha.kerberos] +dfs.block.access.token.enable = TRUE + +[yarn.base] +yarn.resourcemanager.scheduler.class = org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler + +[yarn.non] + +[yarn.ha] + +[yarn.kerberos] +hadoop.security.authentication = kerberos +hadoop.proxyuser.yarn.groups = * +hadoop.proxyuser.yarn.hosts = * +hadoop.proxyuser.postgres.hosts = * +hadoop.proxyuser.postgres.groups = * + +[yarn.ha.kerberos] +hadoop.security.authentication = kerberos +hadoop.proxyuser.yarn.groups = * +hadoop.proxyuser.yarn.hosts = * +hadoop.proxyuser.postgres.hosts = * +hadoop.proxyuser.postgres.groups = * + +[hawq.base] +dfs.client.read.shortcircuit = true + +[hawq.kerberos] +hadoop.security.authentication = kerberos + +[hawq.yarn] +hawq_global_rm_type = yarn http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e74109bf/tools/bin/gpcheck ---------------------------------------------------------------------- diff --git a/tools/bin/gpcheck b/tools/bin/gpcheck index aefe499..1d0019c 100755 --- a/tools/bin/gpcheck +++ b/tools/bin/gpcheck @@ -26,8 +26,10 @@ try: from gppylib.commands.unix import getLocalHostname, getUserName, SYSTEM from gppylib.commands.base import WorkerPool, Command, REMOTE from gppylib.gpcheckutil import HostType, hosttype_str + from hawqpylib.hawqlib import remote_ssh_output from pgdb import DatabaseError import pg + import stat except ImportError, e: sys.exit('Cannot import modules. Please check that you have sourced greenplum_path.sh. Detail: ' + str(e)) @@ -89,15 +91,29 @@ class GpCheckConfig: self.hdfs_expected = { # default value for HDFS configuration "dfs.mem.namenode.heap": 8192, "dfs.mem.datanode.heap": 8192 } + self.hdfs_non_expected = {} + self.hdfs_ha_expected = {} + self.hdfs_kerberos_expected = {} + self.hdfs_ha_kerberos_expected = {} + + self.yarn_expected = {} + self.yarn_non_expected = {} + self.yarn_ha_expected = {} + self.yarn_kerberos_expected = {} + self.yarn_ha_kerberos_expected = {} + + self.hawq_expected = {} + self.hawq_kerberos_expected = {} + self.hawq_yarn_expected = {} + def readConfigFile(self, config_file): parsed_list = self.parser.read(config_file) if len(parsed_list) != 1: raise GpCheckError("cannot open file!") - for required_section in ("linux.sysctl", "hdfs"): - if not self.parser.has_section(required_section): - raise GpCheckError("require section '%s'" % required_section) + if not self.parser.has_section("linux.sysctl"): + raise GpCheckError("require section 'linux.sysctl'") section = "global" if self.parser.has_option(section, "configfile_version"): @@ -136,15 +152,75 @@ class GpCheckConfig: raise GpCheckError("Bad config entry value '%s' for 'diskusage.monitor.usagemax': %s" % (self.diskusage_usagemax, e)) - section = 'hdfs' - for opt in self.parser.options(section): - self.hdfs_expected[opt] = self.parser.get(section, opt) - try: - self.hdfs_expected["dfs.mem.namenode.heap"] = int(self.hdfs_expected["dfs.mem.namenode.heap"]) - self.hdfs_expected["dfs.mem.datanode.heap"] = int(self.hdfs_expected["dfs.mem.datanode.heap"]) - except ValueError, e: - raise GpCheckError("'dfs.mem.namenode.heap' or 'dfs.mem.namenode.heap' should be a number: %s" % e) + if not self.parser.has_section('hdfs.base'): + if not self.parser.has_section("hdfs"): + raise GpCheckError("require section 'hdfs'") + + section = 'hdfs' + for opt in self.parser.options(section): + self.hdfs_expected[opt] = self.parser.get(section, opt) + try: + self.hdfs_expected["dfs.mem.namenode.heap"] = int(self.hdfs_expected["dfs.mem.namenode.heap"]) + self.hdfs_expected["dfs.mem.datanode.heap"] = int(self.hdfs_expected["dfs.mem.datanode.heap"]) + except ValueError, e: + raise GpCheckError("'dfs.mem.namenode.heap' or 'dfs.mem.namenode.heap' should be a number: %s" % e) + else: + section = 'hdfs.base' + for opt in self.parser.options(section): + self.hdfs_expected[opt] = self.parser.get(section, opt) + try: + self.hdfs_expected["dfs.mem.namenode.heap"] = int(self.hdfs_expected["dfs.mem.namenode.heap"]) + self.hdfs_expected["dfs.mem.datanode.heap"] = int(self.hdfs_expected["dfs.mem.datanode.heap"]) + except ValueError, e: + raise GpCheckError("'dfs.mem.namenode.heap' or 'dfs.mem.namenode.heap' should be a number: %s" % e) + + section = 'hdfs.non' + for opt in self.parser.options(section): + self.hdfs_non_expected[opt] = self.parser.get(section, opt) + + section = 'hdfs.ha' + for opt in self.parser.options(section): + self.hdfs_ha_expected[opt] = self.parser.get(section, opt) + + section = 'hdfs.kerberos' + for opt in self.parser.options(section): + self.hdfs_kerberos_expected[opt] = self.parser.get(section, opt) + + section = 'hdfs.ha.kerberos' + for opt in self.parser.options(section): + self.hdfs_ha_kerberos_expected[opt] = self.parser.get(section, opt) + section = 'yarn.base' + for opt in self.parser.options(section): + self.yarn_expected[opt] = self.parser.get(section, opt) + + section = 'yarn.non' + for opt in self.parser.options(section): + self.yarn_non_expected[opt] = self.parser.get(section, opt) + + section = 'yarn.ha' + for opt in self.parser.options(section): + self.yarn_ha_expected[opt] = self.parser.get(section, opt) + + section = 'yarn.kerberos' + for opt in self.parser.options(section): + self.yarn_kerberos_expected[opt] = self.parser.get(section, opt) + + section = 'yarn.ha.kerberos' + for opt in self.parser.options(section): + self.yarn_ha_kerberos_expected[opt] = self.parser.get(section, opt) + + section = 'hawq.base' + for opt in self.parser.options(section): + self.hawq_expected[opt] = self.parser.get(section, opt) + + section = 'hawq.kerberos' + for opt in self.parser.options(section): + self.hawq_kerberos_expected[opt] = self.parser.get(section, opt) + + section = 'hawq.yarn' + for opt in self.parser.options(section): + self.hawq_yarn_expected[opt] = self.parser.get(section, opt) ###### Global Variables ############# logger = get_default_logger() @@ -176,6 +252,16 @@ def checkPlatform(): raise GpCheckError("No tests exists for this platform in gpcheck") +def parse_host_list_file(host_file): + host_list = list() + with open(host_file) as f: + hosts = f.readlines() + for host in hosts: + host = host.split("#",1)[0].strip() + if host: + host_list.append(host) + return host_list + def parseargs(): global options, GPHOME, HADOOP_HOME, GPCHECK_CONFIG_FILE @@ -188,7 +274,12 @@ def parseargs(): parser.add_option('--zipin', type='string') parser.add_option('--gphome', type='string') # for HDFS xml and memory check - parser.add_option('--hadoop', type='string') + parser.add_option('--hadoop', '--hadoop-home', type='string') + parser.add_option('--hdfs', action='store_true') + parser.add_option('--hdfs-ha', dest="hdfs_ha", action='store_true') + parser.add_option('--yarn', action='store_true') + parser.add_option('--yarn-ha', dest="yarn_ha", action='store_true') + parser.add_option('--kerberos', action='store_true') parser.add_option('-c', '--config', type='string') # optional: gpcheck config file path parser.add_option('-f', '--file', type='string') # host file, for testing a list of hosts @@ -212,6 +303,10 @@ def parseargs(): if not HADOOP_HOME: checkFailed(None, "utility will SKIP HDFS configuration check because HADOOP_HOME is not specified in environment variable or --hadoop") + if options.yarn and not HADOOP_HOME: + options.yarn = False + checkFailed(None, "utility will SKIP YARN configuration check because HADOOP_HOME is not specified in environment variable or --hadoop") + # params check if not options.file and not options.host and not options.zipin: raise GpCheckError(" --file or --host or --zipin must be specified") @@ -242,6 +337,7 @@ def checkFailed(host, msg): def getHDFSNamenodeHost(): core_site_file = os.path.join(HADOOP_HOME, "etc/hadoop/core-site.xml") + hdfs_site_file = os.path.join(HADOOP_HOME, "etc/hadoop/hdfs-site.xml") logger.info("try to detect namenode from %s" % core_site_file) # for processing property xml @@ -255,12 +351,42 @@ def getHDFSNamenodeHost(): for node in xmldoc.getElementsByTagName('property'): if getPropName(node) == 'fs.default.name' or getPropName(node) == 'fs.defaultFS': fsurl = getPropValue(node).strip() - namenode_addr = re.search(r"//([^:/]*)", fsurl).group(1) + namenode_list_alias = re.search(r"//([^:/]*)", fsurl).group(1) + if_ha_disabled = re.search(".*:[0-9]+$", fsurl) + if if_ha_disabled: + namenode_addr = namenode_list_alias + else: + namenode_addr = '' break # run hostname command on remote to get actual hostname if namenode_addr == '': - logger.error("cannot detect namenode from %s" % core_site_file) + ha_namenode_list = '' + default_namenode_alias = '' + with open(hdfs_site_file) as f: + xmldoc = minidom.parse(f) + for node in xmldoc.getElementsByTagName('property'): + if re.search('dfs.ha.namenodes.*', getPropName(node).strip()): + ha_namenode_list = getPropValue(node).strip() + default_namenode_alias = ha_namenode_list.split(',')[0].strip() + break + + if ha_namenode_list == '': + logger.error("cannot detect namenode from %s" % core_site_file) + raise GpCheckError("cannot detect namenode from %s" % core_site_file) + #sys.exit(1) + else: + with open(hdfs_site_file) as f: + xmldoc = minidom.parse(f) + for node in xmldoc.getElementsByTagName('property'): + namenode_rpc_address = "dfs.namenode.rpc-address.%s.%s" % (namenode_list_alias, + default_namenode_alias) + if getPropName(node) == namenode_rpc_address: + default_namenode_rpc_address = getPropValue(node).strip() + namenode_addr = default_namenode_rpc_address.split(':')[0].strip() + + if namenode_addr == '': + raise GpCheckError("cannot detect namenode from %s" % core_site_file) else: cmd = Command(namenode_addr, "hostname", REMOTE, namenode_addr) pool.addCommand(cmd) @@ -345,10 +471,12 @@ def runCollectionOnServers(): else: raise GpCheckError("unsupported host type") - cmd = "%s/sbin/gpcheck_hostdump %s" % (GPHOME, host_type_cl) + cmd = "%s/sbin/gpcheck_hostdump --hawq %s" % (GPHOME, host_type_cl) cmd += " --sysctl %s" % ",".join(gpcheck_config.sysctl_expected.keys()) if HADOOP_HOME: cmd += " --hadoop %s" % HADOOP_HOME + if options.yarn or options.yarn_ha: + cmd += " --yarn" return cmd try: @@ -537,7 +665,7 @@ def testSolarisEtcUserAttr(host): checkFailed(host.hostname, "/etc/user_attr is missing expected line '%s'" % line) -def testHAWQ(host): +def testHAWQGUC(host): if not gpcheck_info.hawq_collected_ok: return @@ -567,7 +695,7 @@ def testHAWQ(host): return # check HAWQ master's memory size - expected_vmemory_size = 1024 + expected_vmemory_size = 8192 if guc_vmemsize_master != expected_vmemory_size: checkFailed(host.hostname, "HAWQ master's %s GUC value is %s, expected %s" % ( HAWQ_GUC_MEMORY, guc_vmemsize_master, expected_vmemory_size)) @@ -582,7 +710,7 @@ def testHAWQ(host): logger.warning("please change the expected data node memory 'dfs.mem.datanode.heap' in gpcheck.cnf file") logger.warning("SKIP '%s' check" %(HAWQ_GUC_MEMORY)) return - expect_vmemsize_per_segment = 1024 + expect_vmemsize_per_segment = 8192 if guc_vmemsize_master != expect_vmemsize_per_segment: checkFailed(host.hostname, "HAWQ segment's %s GUC value on this host is %s, expected %s" % ( HAWQ_GUC_MEMORY, guc_vmemsize_master, expect_vmemsize_per_segment)) @@ -602,6 +730,120 @@ def testDiskCapacity(host): return +def testHAWQconfig(host): + hawq = host.data.hawq + hdfs = host.data.hdfs + if hawq is None: + return # skip HAWQ test when hawq is None + + if options.verbose: + logger.info("-- test HAWQ config") + + if hawq.errormsg: + checkFailed(host.hostname, "collect HAWQ configuration error: %s" % hawq.errormsg) + return + + datanode_list = parse_host_list_file("%s/etc/hadoop/slaves" % HADOOP_HOME) + is_datanode = False + if host.hostname in datanode_list: + is_datanode = True + + expect_config = gpcheck_config.hawq_expected + + if options.kerberos: + expect_config.update(gpcheck_config.hawq_kerberos_expected) + + if options.yarn or options.yarn_ha: + expect_config.update(gpcheck_config.hawq_yarn_expected) + + actual_config = hawq.site_config + hdfs_actual_config = hdfs.site_config + + for exp_key, exp_val in expect_config.items(): + if exp_key not in actual_config: + checkFailed(host.hostname, "HAWQ configuration missing: '%s' needs to be set to '%s'" % (exp_key, exp_val)) + + else: + actual_val = actual_config[exp_key] + et = (exp_key, exp_val, actual_val) + + if exp_key == "dfs.block.local-path-access.user": + if exp_val not in actual_val.split(','): + checkFailed(host.hostname, "HDFS configuration: '%s' should include user '%s', actual value is '%s'" % et) + elif exp_key == "dfs.namenode.handler.count": + if int(exp_val) > int(actual_val): + checkFailed(host.hostname, "HDFS configuration: '%s' should be at least '%s', actual value is '%s'" % et) + else: + if exp_val != actual_val: + checkFailed(host.hostname, "HAWQ configuration: expected '%s' for '%s', actual value is '%s'" % et) + + if not options.kerberos: + if 'hadoop.security.authentication' in actual_config: + if actual_config['hadoop.security.authentication'] != 'simple': + checkFailed(host.hostname, "HAWQ configuration: expected '%s' for '%s', actual value is '%s'" % ('simple', 'hadoop.security.authentication', actual_config[hadoop.security.authentication])) + + if 'hadoop.security.authentication' in hdfs_actual_config: + if hdfs_actual_config['hadoop.security.authentication'] != 'simple': + checkFailed(host.hostname, "HAWQ configuration: expected '%s' for '%s', actual value is '%s'" % ('simple', 'hadoop.security.authentication', hdfs_actual_config[hadoop.security.authentication])) + + if options.yarn or options.yarn_ha: + hawq_yarn_property_exist_list = ['hawq_rm_yarn_address', 'hawq_rm_yarn_scheduler_address', 'hawq_rm_yarn_app_name'] + for item in hawq_yarn_property_exist_list: + if item in actual_config: + if not actual_config[item]: + checkFailed(host.hostname, "HAWQ configuration: yarn.resourcemanager.address is empty") + else: + checkFailed(host.hostname, "HAWQ configuration: yarn.resourcemanager.address not defined") + + if 'dfs.client.read.shortcircuit' not in actual_config: + checkFailed(host.hostname, "HAWQ configuration dfs.client.read.shortcircuit not defined") + + if 'dfs.client.read.shortcircuit' not in hdfs_actual_config: + checkFailed(host.hostname, "HAWQ configuration dfs.client.read.shortcircuit not defined") + + if 'dfs.domain.socket.path' not in actual_config: + checkFailed(host.hostname, "HAWQ configuration dfs.domain.socket.path not defined") + + if 'dfs.domain.socket.path' not in hdfs_actual_config: + checkFailed(host.hostname, "HDFS configuration dfs.domain.socket.path not defined") + + if is_datanode and 'dfs.domain.socket.path' in actual_config and 'dfs.domain.socket.path' in hdfs_actual_config: + if actual_config['dfs.domain.socket.path'] != hdfs_actual_config['dfs.domain.socket.path']: + checkFailed(host.hostname, "HAWQ configuration: dfs.domain.socket.path expect to have the same value with HDFS configuration") + else: + cmd = "ls -l %s" % actual_config['dfs.domain.socket.path'] + (result, output, errmsg) = remote_ssh_output(cmd, host.hostname, '') + if result == 0: + if output.split(' ')[0][7:9] != 'rw': + checkFailed(host.hostname, "HAWQ configuration dfs.domain.socket.path: %s should have R/W access for both hawq and HDFS on %s" % (actual_config['dfs.domain.socket.path'], host.hostname)) + else: + checkFailed(host.hostname, "HAWQ configuration dfs.domain.socket.path: %s, does not exist on %s" % (actual_config['dfs.domain.socket.path'], host.hostname)) + + if 'output.replace-datanode-on-failure' in actual_config: + if len(datanode_list) < 4: + if actual_config['output.replace-datanode-on-failure'] == 'true': + checkFailed(host.hostname, "HAWQ configuration: output.replace-datanode-on-failure expect false, current is true") + else: + if actual_config['output.replace-datanode-on-failure'] == 'false': + checkFailed(host.hostname, "HAWQ configuration: output.replace-datanode-on-failure expect true, current is false") + else: + checkFailed(host.hostname, "HAWQ configuration: output.replace-datanode-on-failure not defined") + + +def testDiskCapacity(host): + if options.verbose: + logger.info("-- test Disk Capacity") + + for line in host.data.diskusage.lines: + if len(gpcheck_config.diskusage_mounts) == 0 or line.mount in gpcheck_config.diskusage_mounts: + actual_usage = int(line.used_percent[:-1]) + if actual_usage > gpcheck_config.diskusage_usagemax: + checkFailed(host.hostname, + "potential disk full risk: %s mounted on %s has used %s space" % ( + line.fs, line.mount, line.used_percent)) + return + + def testHDFSConfig(host): hdfs = host.data.hdfs if hdfs is None: @@ -615,6 +857,30 @@ def testHDFSConfig(host): return expect_config = gpcheck_config.hdfs_expected + + if not options.hdfs_ha and not options.kerberos: + expect_config.update(gpcheck_config.hdfs_non_expected) + + if options.hdfs_ha and not options.kerberos: + expect_config.update(gpcheck_config.hdfs_ha_expected) + + if options.kerberos and not options.hdfs_ha: + expect_config.update(gpcheck_config.hdfs_kerberos_expected) + + if options.kerberos and options.hdfs_ha: + expect_config.update(gpcheck_config.hdfs_ha_kerberos_expected) + + + if options.yarn or options.yarn_ha: + expect_config.update(gpcheck_config.yarn_expected) + if not options.yarn_ha and not options.kerberos: + expect_config.update(gpcheck_config.yarn_non_expected) + + if options.yarn_ha: + expect_config.update(gpcheck_config.yarn_ha_expected) + if options.kerberos: + expect_config.update(gpcheck_config.yarn_kerberos_expected) + actual_config = hdfs.site_config actual_heap_size = hdfs.namenode_heap_size if host.is_namenode else hdfs.datanode_heap_size @@ -658,6 +924,64 @@ def testHDFSConfig(host): (actual_heap_size, expect_datanode_heap)) + # Check if nodemanager direcotries exists + directory_check_list = [] + datanode_list = parse_host_list_file("%s/etc/hadoop/slaves" % HADOOP_HOME) + is_datanode = False + if host.hostname in datanode_list: + is_datanode = True + + if options.yarn or options.yarn_ha: + yarn_enabled = True + else: + yarn_enabled = False + + if yarn_enabled and is_datanode: + if 'yarn.nodemanager.local-dirs' in actual_config: + directory_check_list += actual_config['yarn.nodemanager.local-dirs'].split(',') + else: + checkFailed(host.hostname, "YARN configuration: yarn.nodemanager.local-dirs not defined") + + if 'yarn.nodemanager.log-dirs' in actual_config: + directory_check_list += actual_config['yarn.nodemanager.log-dirs'].split(',') + else: + checkFailed(host.hostname, "YARN configuration: yarn.nodemanager.log-dirs not defined") + + for directory in directory_check_list: + cmd = "test -e %s" % directory + (result, output, errmsg) = remote_ssh_output(cmd, host.hostname, '') + if result != 0: + checkFailed(host.hostname, "YARN nodemanager directory %s does not exist" % directory) + + # Check if resource manager property exists + if options.yarn: + yarn_property_exist_list = ['yarn.resourcemanager.address', 'yarn.resourcemanager.scheduler.address'] + + if options.yarn_ha: + yarn_property_exist_list = ['yarn.resourcemanager.address.rm1', 'yarn.resourcemanager.address.rm2', 'yarn.resourcemanager.scheduler.address.rm1', \ + 'yarn.resourcemanager.scheduler.address.rm2'] + + if yarn_enabled: + for item in yarn_property_exist_list: + if item in actual_config: + if not actual_config[item]: + checkFailed(host.hostname, "YARN configuration: %s is empty" % item) + else: + checkFailed(host.hostname, "YARN configuration: %s not defined" % item) + + # Check yarn kerberos properties + #yarn_kerberos_check_list = ['hadoop.proxyuser.yarn.groups', 'hadoop.proxyuser.yarn.hosts', 'hadoop.proxyuser.postgres.hosts', 'hadoop.proxyuser.postgres.groups'] + if yarn_enabled and options.kerberos: + yarn_kerberos_check_list = ['yarn.nodemanager.keytab', 'yarn.nodemanager.principal','hadoop.proxyuser.postgres.groups', \ + 'yarn.resourcemanager.keytab', 'yarn.resourcemanager.principal'] + for item in yarn_kerberos_check_list: + if item in actual_config: + if not actual_config[item]: + checkFailed(host.hostname, "YARN configuration: %s is empty, expected non-empty" % item) + else: + checkFailed(host.hostname, "YARN configuration missing: %s" % item) + + def testIOSchedulers(host): if options.verbose: logger.info("-- test IO scheduler") @@ -774,6 +1098,8 @@ def testNtp(host): def testGenericLinuxHost(host): logger.info("test on host: %s" % host.hostname) if host.is_namenode: + testHAWQGUC(host) + testHAWQconfig(host) testHDFSConfig(host) testDiskCapacity(host) testSysctl(host) @@ -782,7 +1108,8 @@ def testGenericLinuxHost(host): testNtp(host) else: - testHAWQ(host) + testHAWQGUC(host) + testHAWQconfig(host) testDiskCapacity(host) testHDFSConfig(host) testIOSchedulers(host) http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e74109bf/tools/bin/gppylib/gpcheckutil.py ---------------------------------------------------------------------- diff --git a/tools/bin/gppylib/gpcheckutil.py b/tools/bin/gppylib/gpcheckutil.py index 9956990..3419bb2 100755 --- a/tools/bin/gppylib/gpcheckutil.py +++ b/tools/bin/gppylib/gpcheckutil.py @@ -151,6 +151,20 @@ class hdfs: return "============= HDFS ==========================\n" + output +class hawq: + def __init__(self): + self.site_config = dict() + self.errormsg = None + + def __str__(self): + if self.errormsg: + return "============= HAWQ ERROR ====================\n" + self.errormsg + else: + output = "HAWQ checks \n" + output += "\n".join(["%s = %s" % (k, self.site_config[k]) for k in sorted(self.site_config.iterkeys())]) + return "============= HAWQ ==========================\n" + output + + class diskusage_entry: def __init__(self, fs, size, used, avail, used_percent, mount): self.fs = fs @@ -336,6 +350,7 @@ class GenericLinuxOutputData: self.uname = None self.machine = None self.hdfs = None + self.hawq = None self.diskusage = None self.sysctl = None self.limitsconf = None @@ -346,7 +361,7 @@ class GenericLinuxOutputData: def __str__(self): applied_checks = filter(lambda x: x is not None, - [ self.uname, self.machine, self.hdfs, self.diskusage, self.sysctl, + [ self.uname, self.machine, self.hdfs, self.hawq, self.diskusage, self.sysctl, self.limitsconf, self.mounts, self.ioschedulers, self.blockdev, self.ntp ]) return "\n".join(map(str, applied_checks)) http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e74109bf/tools/bin/hawqpylib/hawqlib.py ---------------------------------------------------------------------- diff --git a/tools/bin/hawqpylib/hawqlib.py b/tools/bin/hawqpylib/hawqlib.py index ae0d852..c149ffc 100755 --- a/tools/bin/hawqpylib/hawqlib.py +++ b/tools/bin/hawqpylib/hawqlib.py @@ -131,6 +131,24 @@ def check_property_exist_xml(xml_file, property_name): return property_exist, property_name, property_value +def get_xml_values(xmlfile): + xml_dict = {} + with open(xmlfile) as f: + xmldoc = minidom.parse(f) + + for node in xmldoc.getElementsByTagName('property'): + name = node.getElementsByTagName('name')[0].childNodes[0].data.encode('ascii') + + try: + value = node.getElementsByTagName('value')[0].childNodes[0].data.encode('ascii') + except: + value = None + + xml_dict[name] = value + + return xml_dict + + class HawqXMLParser: def __init__(self, GPHOME): self.GPHOME = GPHOME http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e74109bf/tools/sbin/gpcheck_hostdump ---------------------------------------------------------------------- diff --git a/tools/sbin/gpcheck_hostdump b/tools/sbin/gpcheck_hostdump index 7714cc3..28f074e 100755 --- a/tools/sbin/gpcheck_hostdump +++ b/tools/sbin/gpcheck_hostdump @@ -31,7 +31,7 @@ try: from gppylib.gpparseopts import OptParser, OptChecker from gppylib.gpcheckutil import ApplianceOutputData, GenericLinuxOutputData, GenericSolarisOutputData from gppylib.gpcheckutil import chkconfig, omreport, grubconf, mounts, GpMount, GpMount, inittab, ntp - from gppylib.gpcheckutil import securetty, ioschedulers, blockdev, bcu, rclocal, sysctl, limitsconf, limitsconf_entry, uname, connectemc, diskusage, diskusage_entry, hdfs, machine + from gppylib.gpcheckutil import securetty, ioschedulers, blockdev, bcu, rclocal, sysctl, limitsconf, limitsconf_entry, uname, connectemc, diskusage, diskusage_entry, hdfs, hawq, machine from gppylib.gpcheckutil import solaris_etc_system, solaris_etc_project, solaris_etc_user_attr except ImportError, e: sys.exit('Cannot import modules. Please check that you have sourced greenplum_path.sh. Detail: ' + str(e)) @@ -413,15 +413,53 @@ def collectCPUandMemoryInfo(): return data +def collectHAWQ(): + if not options.hawq: + return None + data = hawq() + hawq_config_dir = os.environ.get('GPHOME') + if hawq_config_dir is None: + print "Please export GPHOME first, exit" + sys.exit(1) + hdfs_client_file = os.path.join(hawq_config_dir, "etc/hdfs-client.xml") + yarn_client_file = os.path.join(hawq_config_dir, "etc/yarn-client.xml") + hawq_site_file = os.path.join(hawq_config_dir, "etc/hawq-site.xml") + + # collect HDFS site config + getPropName = lambda node: node.getElementsByTagName('name')[0].childNodes[0].data + getPropValue = lambda node: node.getElementsByTagName('value')[0].childNodes[0].data + hawq_config_file_list = [hdfs_client_file, hawq_site_file] + if options.yarn: + hawq_config_file_list.append(yarn_client_file) + for filename in hawq_config_file_list: + try: + with open(filename) as f: + xmldoc = minidom.parse(f) + for node in xmldoc.getElementsByTagName('property'): + try: + data.site_config[getPropName(node)] = getPropValue(node).strip() + except IndexError: + pass # the <value> tag may be empty, which causes IndexError in getPropValue + + except Exception, e: + data.errormsg = "Failed to read HAWQ config file '%s': %s" % (filename, e) + + return data + + def collectHDFS(): if not options.hadoop: return None data = hdfs() + hawq_config_dir = os.environ.get('GPHOME') + if hawq_config_dir is None: + print "Please export GPHOME first, exit" + sys.exit(1) hadoop_config_file = os.path.join(options.hadoop, "libexec/hadoop-config.sh") hadoop_env_file = os.path.join(options.hadoop, "etc/hadoop/hadoop-env.sh") hdfs_site_file = os.path.join(options.hadoop, "etc/hadoop/hdfs-site.xml") + yarn_site_file = os.path.join(options.hadoop, "etc/hadoop/yarn-site.xml") core_site_file = os.path.join(options.hadoop, "etc/hadoop/core-site.xml") - libhdfs3_site_file = os.environ.get("LIBHDFS3_CONF") # collect java heap size config p = subprocess.Popen(". %s; echo $JAVA_HEAP_MAX" % hadoop_config_file, shell = True, @@ -457,7 +495,10 @@ def collectHDFS(): # collect HDFS site config getPropName = lambda node: node.getElementsByTagName('name')[0].childNodes[0].data getPropValue = lambda node: node.getElementsByTagName('value')[0].childNodes[0].data - for filename in (hdfs_site_file, core_site_file, libhdfs3_site_file): + hdfs_config_file_list = [hdfs_site_file, core_site_file] + if options.yarn: + hdfs_config_file_list.append(yarn_site_file) + for filename in hdfs_config_file_list: try: with open(filename) as f: xmldoc = minidom.parse(f) @@ -804,6 +845,7 @@ def processGenericLinuxServer(): output = GenericLinuxOutputData() output.hdfs = collectHDFS() + output.hawq = collectHAWQ() output.uname = collectUname() output.machine = collectCPUandMemoryInfo() output.diskusage = collectDiskUsage() @@ -844,6 +886,8 @@ def parseargs(): parser.remove_option('-h') parser.add_option('-h', '-?', '--help', action='store_true') parser.add_option('--hadoop', type='string') + parser.add_option('--hawq', action='store_true') + parser.add_option('--yarn', action='store_true') parser.add_option('--sysctl', type='string') parser.add_option('--appliance', action='store_true') parser.add_option('--linux', action='store_true')
