#!/usr/bin/env python
# Origin: Apache Ambari commit 83efcfea (new file, blob b7f512b)
# http://git-wip-us.apache.org/repos/asf/ambari/blob/83efcfea/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/scripts/nagios_service.py
"""
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements.  See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.  The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License.  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Ambari Agent

"""

import json
import os
import signal

from resource_management import *
from os.path import isfile


def _read_pid(pid_file):
    """Return the PID stored in *pid_file*, or 0 if it is missing or unreadable.

    Reading the PID is best effort: it is only needed to clean up a process
    that survives ``service ... stop``, so a bad PID file must not abort the
    stop itself.
    """
    if not isfile(pid_file):
        return 0

    try:
        with open(pid_file, "r") as pid_handle:
            pid = int(pid_handle.read())
    except (IOError, OSError, ValueError):
        # IOError/OSError: the file vanished or is unreadable.
        # ValueError: the content is not an integer.
        Logger.info("Unable to read PID file {0}".format(pid_file))
        return 0

    Logger.info("Nagios is running with a PID of {0}".format(pid))
    return pid


def _kill_if_alive(pid):
    """Send SIGKILL to *pid* if the process still exists."""
    try:
        # Signal 0 delivers nothing; it only checks that the process can be
        # signalled, so an OSError here means there is nothing left to kill.
        os.kill(pid, 0)
    except OSError:
        Logger.info("The Nagios process has successfully terminated")
        return

    Logger.info("The Nagios process with ID {0} failed to terminate; explicitly killing.".format(pid))
    try:
        os.kill(pid, signal.SIGKILL)
    except OSError as err:
        # Most likely the process exited between the probe and the kill.
        # Either way a failed kill should not fail the whole stop action.
        Logger.info("Unable to kill the Nagios process with ID {0}: {1}".format(pid, err))


def _group_alerts(alerts):
    """Group alert records as ``{name: {host: [{'state': ..., 'text': ...}]}}``.

    Each record must provide the keys 'name', 'host', 'state' and 'text';
    a missing key raises KeyError.
    """
    grouped = {}
    for alert in alerts:
        host_items = grouped.setdefault(alert['name'], {}).setdefault(alert['host'], [])
        host_items.append({'state': alert['state'], 'text': alert['text']})
    return grouped


def nagios_service(action='start'):
    """Start or stop the Nagios system service, then call MonitorWebserver("restart").

    action -- 'start' or 'stop'. Any other value skips the service command
              and only performs the MonitorWebserver("restart") call.

    On 'stop' the PID is read before the service is stopped, so that a
    process which survives the stop command can be killed explicitly and a
    stale PID file removed.

    NOTE(review): the source was whitespace-collapsed; MonitorWebserver is
    assumed to run unconditionally at function level -- confirm against the
    upstream file.
    """
    # NOTE(review): `format` here is presumably the resource_management helper
    # brought in by the star import (not the builtin); it appears to resolve
    # {placeholders} from `params` and the caller's local variables -- confirm.
    import params

    nagios_pid_file = format("{nagios_pid_file}")

    if action == 'start':
        Execute(format("service {nagios_service_name} start"))
    elif action == 'stop':
        # attempt to grab the pid in case we need it later
        nagios_pid = _read_pid(nagios_pid_file)

        Execute(format("service {nagios_service_name} stop"))

        # on SUSE, there is a bug where Nagios doesn't kill the process
        # but this could also affect any OS, so don't restrict this to SUSE
        if nagios_pid > 0:
            _kill_if_alive(nagios_pid)

        # in the event that the Nagios scripts don't remove the pid file
        if isfile(nagios_pid_file):
            Execute(format("rm -f {nagios_pid_file}"))

    MonitorWebserver("restart")


def update_active_alerts():
    """Write the alerts from status_params.config to <nagios_var_dir>/ambari.json.

    Does nothing when the configuration has no 'alerts' entry or the entry is
    None. Otherwise the alerts are grouped by alert name and host and dumped
    as JSON, overwriting any existing file.
    """
    import status_params

    config = status_params.config
    if 'alerts' not in config or config['alerts'] is None:
        return

    output = _group_alerts(config['alerts'])

    with open(os.path.join(status_params.nagios_var_dir, 'ambari.json'), 'w') as f:
        json.dump(output, f)
#!/usr/bin/env python
# Origin: Apache Ambari commit 83efcfea (new file, blob ec6c885)
# http://git-wip-us.apache.org/repos/asf/ambari/blob/83efcfea/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/scripts/params.py
"""
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements.  See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.  The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License.  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Ambari Agent

"""

from functions import is_jdk_greater_6
from resource_management import *
import status_params

# NOTE(review): this module was reconstructed from a whitespace-collapsed
# source. Every module-level name is kept exactly as it was, because the
# Nagios templates presumably read this namespace -- verify before renaming.

# server configurations
config = Script.get_config()

# Resolved once and reused by every OS-specific branch below.
os_family = System.get_instance().os_family

# ---------------------------------------------------------------------------
# Service name and filesystem layout
# ---------------------------------------------------------------------------
nagios_service_name = "nagios3" if os_family == "ubuntu" else "nagios"

conf_dir = format("/etc/{nagios_service_name}")
nagios_obj_dir = format("{conf_dir}/objects")
nagios_var_dir = status_params.nagios_var_dir
nagios_rw_dir = status_params.nagios_rw_dir

# HACK: Stylesheets for Nagios UI on Ubuntu are in wrong place so we have to do a symlink.
# In future we can fix this directly in the package.
ubuntu_stylesheets_real_location = "/etc/nagios3/stylesheets"
ubuntu_stylesheets_desired_location = "/usr/share/nagios3/htdocs/stylesheets"

if os_family == "ubuntu":
    host_template = "generic-host"
    plugins_dir = "/usr/lib/nagios/plugins"
    nagios_web_dir = "/usr/share/nagios3/htdocs"

    cfg_files = [
        format("{conf_dir}/commands.cfg"),
        format("{conf_dir}/conf.d/contacts_nagios2.cfg"),
        format("{conf_dir}/conf.d/generic-host_nagios2.cfg"),
        format("{conf_dir}/conf.d/generic-service_nagios2.cfg"),
        format("{conf_dir}/conf.d/timeperiods_nagios2.cfg"),
    ]
    cgi_dir = "/usr/lib/cgi-bin/nagios3"
    cgi_weblink = "/cgi-bin/nagios3"
else:
    host_template = "linux-server"
    plugins_dir = "/usr/lib64/nagios/plugins"
    nagios_web_dir = "/usr/share/nagios"

    cfg_files = [
        format("{nagios_obj_dir}/commands.cfg"),
        format("{nagios_obj_dir}/contacts.cfg"),
        format("{nagios_obj_dir}/timeperiods.cfg"),
        format("{nagios_obj_dir}/templates.cfg"),
    ]

    cgi_dir = "/usr/lib/nagios/cgi"
    cgi_weblink = "/nagios/cgi-bin"

check_result_path = "/var/nagios/spool/checkresults"
nagios_log_dir = "/var/log/nagios"
nagios_log_archives_dir = format("{nagios_log_dir}/archives")
nagios_host_cfg = format("{nagios_obj_dir}/hadoop-hosts.cfg")
nagios_lookup_daemon_str = "/usr/sbin/nagios"
nagios_pid_dir = status_params.nagios_pid_dir
nagios_pid_file = status_params.nagios_pid_file
nagios_resource_cfg = format("{conf_dir}/resource.cfg")
nagios_hostgroup_cfg = format("{nagios_obj_dir}/hadoop-hostgroups.cfg")
nagios_servicegroup_cfg = format("{nagios_obj_dir}/hadoop-servicegroups.cfg")
nagios_service_cfg = format("{nagios_obj_dir}/hadoop-services.cfg")
nagios_command_cfg = format("{nagios_obj_dir}/hadoop-commands.cfg")
eventhandlers_dir = "/usr/lib/nagios/eventhandlers"
nagios_principal_name = default("/configurations/nagios-env/nagios_principal_name", "nagios")
hadoop_ssl_enabled = False

# ---------------------------------------------------------------------------
# Service ports
# ---------------------------------------------------------------------------
oozie_server_port = get_port_from_url(config['configurations']['oozie-site']['oozie.base.url'])
namenode_host = default("/clusterHostInfo/namenode_host", None)

# - test for HDFS or HCFS (glusterfs)
if 'namenode_host' in config['clusterHostInfo']:
    ishdfs_value = "HDFS"
else:
    ishdfs_value = None

has_namenode = namenode_host is not None

# different to HDP1
if has_namenode:
    if 'dfs.namenode.http-address' in config['configurations']['hdfs-site']:
        namenode_port = get_port_from_url(config['configurations']['hdfs-site']['dfs.namenode.http-address'])
    else:
        namenode_port = "50070"

    if 'dfs.namenode.secondary.http-address' in config['configurations']['hdfs-site']:
        snamenode_port = get_port_from_url(config['configurations']['hdfs-site']['dfs.namenode.secondary.http-address'])
    else:
        snamenode_port = "50071"

    # NOTE(review): journalnode_port and datanode_port have no fallback, so
    # they stay undefined when the journalnode address is not configured.
    # The nesting of datanode_port under this check is reconstructed from the
    # collapsed source -- confirm against the upstream file.
    if 'dfs.journalnode.http-address' in config['configurations']['hdfs-site']:
        journalnode_port = get_port_from_url(config['configurations']['hdfs-site']['dfs.journalnode.http-address'])
        datanode_port = get_port_from_url(config['configurations']['hdfs-site']['dfs.datanode.http.address'])

hbase_master_rpc_port = default('/configurations/hbase-site/hbase.master.port', "60000")
rm_port = get_port_from_url(config['configurations']['yarn-site']['yarn.resourcemanager.webapp.address'])
nm_port = "8042"
hs_port = get_port_from_url(config['configurations']['mapred-site']['mapreduce.jobhistory.webapp.address'])
flume_port = "4159"
hive_metastore_port = get_port_from_url(config['configurations']['hive-site']['hive.metastore.uris']) #"9083"
hive_server_port = default('/configurations/hive-site/hive.server2.thrift.port', "10000")
templeton_port = config['configurations']['webhcat-site']['templeton.port'] #"50111"
hbase_master_port = config['configurations']['hbase-site']['hbase.master.info.port'] #"60010"
hbase_rs_port = config['configurations']['hbase-site']['hbase.regionserver.info.port'] #"60030"
storm_ui_port = config['configurations']['storm-site']['ui.port']
drpc_port = config['configurations']['storm-site']['drpc.port']
nimbus_port = config['configurations']['storm-site']['nimbus.thrift.port']
supervisor_port = "56431"
storm_rest_api_port = "8745"
falcon_port = config['configurations']['falcon-env']['falcon_port']
ahs_port = get_port_from_url(config['configurations']['yarn-site']['yarn.timeline-service.webapp.address'])

# use sensible defaults for checkpoint as they are required by Nagios and
# may not be part of hdfs-site.xml on an upgrade
if has_namenode:
    if 'dfs.namenode.checkpoint.period' in config['configurations']['hdfs-site']:
        dfs_namenode_checkpoint_period = config['configurations']['hdfs-site']['dfs.namenode.checkpoint.period']
    else:
        dfs_namenode_checkpoint_period = '21600'

    if 'dfs.namenode.checkpoint.txns' in config['configurations']['hdfs-site']:
        dfs_namenode_checkpoint_txns = config['configurations']['hdfs-site']['dfs.namenode.checkpoint.txns']
    else:
        dfs_namenode_checkpoint_txns = '1000000'

# this is different for HDP1
nn_metrics_property = "FSNamesystem"
clientPort = config['configurations']['zookeeper-env']['clientPort'] #ZK

# ---------------------------------------------------------------------------
# Java and security
# ---------------------------------------------------------------------------
java64_home = config['hostLevelParams']['java_home']
check_cpu_on = is_jdk_greater_6(java64_home)
security_enabled = config['configurations']['cluster-env']['security_enabled']
nagios_keytab_path = default("/configurations/nagios-env/nagios_keytab_path", "/etc/security/keytabs/nagios.service.keytab")
# NOTE(review): `functions` is not imported by name in this module;
# presumably the star import from resource_management provides it -- confirm.
kinit_path_local = functions.get_kinit_path(["/usr/bin", "/usr/kerberos/bin", "/usr/sbin"])

# ---------------------------------------------------------------------------
# NameNode HA
# ---------------------------------------------------------------------------
# HA is considered enabled only when more than one NameNode id is configured.
dfs_ha_enabled = False
dfs_ha_nameservices = default("/configurations/hdfs-site/dfs.nameservices", None)
dfs_ha_namenode_ids = default(format("/configurations/hdfs-site/dfs.ha.namenodes.{dfs_ha_nameservices}"), None)
if dfs_ha_namenode_ids:
    # The misspelled name ("namemodes") is kept as-is: it is a module-level
    # name that other code may reference.
    dfs_ha_namemodes_ids_list = dfs_ha_namenode_ids.split(",")
    dfs_ha_namenode_ids_array_len = len(dfs_ha_namemodes_ids_list)
    if dfs_ha_namenode_ids_array_len > 1:
        dfs_ha_enabled = True

# Maps each NameNode host to its RPC/metadata port.
nn_ha_host_port_map = {}
if dfs_ha_enabled:
    for nn_id in dfs_ha_namemodes_ids_list:
        nn_host = config['configurations']['hdfs-site'][format('dfs.namenode.rpc-address.{dfs_ha_nameservices}.{nn_id}')]
        nn_ha_host_port_map[nn_host.split(":")[0]] = nn_host.split(":")[1]
else:
    if 'namenode_host' in config['clusterHostInfo']:
        namenode_metadata_port = get_port_from_url(config['configurations']['core-site']['fs.defaultFS'])
        nn_ha_host_port_map[config['clusterHostInfo']['namenode_host'][0]] = namenode_metadata_port
    else:
        namenode_metadata_port = '8020'

# ---------------------------------------------------------------------------
# Ganglia and ping ports
# ---------------------------------------------------------------------------
ganglia_port = "8651"
ganglia_collector_slaves_port = "8660"
ganglia_collector_namenode_port = "8661"
ganglia_collector_jobtracker_port = "8662"
ganglia_collector_hbase_port = "8663"
ganglia_collector_rm_port = "8664"
ganglia_collector_nm_port = "8660"
ganglia_collector_hs_port = "8666"

all_ping_ports = config['clusterHostInfo']['all_ping_ports']

# ---------------------------------------------------------------------------
# Web server integration
# ---------------------------------------------------------------------------
# NOTE(review): there is no fallback branch, so on any other OS family these
# three names stay undefined and the format() calls below will fail.
if os_family == "suse":
    nagios_p1_pl = "/usr/lib/nagios/p1.pl"
    htpasswd_cmd = "htpasswd2"
    web_conf_dir = "/etc/apache2/conf.d"
elif os_family == "ubuntu":
    nagios_p1_pl = "/usr/lib/nagios3/p1.pl"
    htpasswd_cmd = "htpasswd"
    web_conf_dir = "/etc/apache2/conf.d"
elif os_family == "redhat":
    nagios_p1_pl = "/usr/bin/p1.pl"
    htpasswd_cmd = "htpasswd"
    web_conf_dir = "/etc/httpd/conf.d"

nagios_httpd_config_file = format("{web_conf_dir}/{nagios_service_name}.conf")
hdp_mon_nagios_addons_path = format("{web_conf_dir}/hdp_mon_nagios_addons.conf")

ambarinagios_php_dir = "/usr/share/hdp/nagios/"
ambarinagios_php_filename = "nagios_alerts.php"

# ---------------------------------------------------------------------------
# Users and contacts
# ---------------------------------------------------------------------------
nagios_user = config['configurations']['nagios-env']['nagios_user']
nagios_group = config['configurations']['nagios-env']['nagios_group']
nagios_web_login = config['configurations']['nagios-env']['nagios_web_login']
nagios_web_password = config['configurations']['nagios-env']['nagios_web_password']
user_group = config['configurations']['cluster-env']['user_group']
nagios_contact = config['configurations']['nagios-env']['nagios_contact']

# ---------------------------------------------------------------------------
# Cluster topology
# ---------------------------------------------------------------------------
_snamenode_host = default("/clusterHostInfo/snamenode_host", None)
_jtnode_host = default("/clusterHostInfo/jtnode_host", None)
_slave_hosts = default("/clusterHostInfo/slave_hosts", None)
_journalnode_hosts = default("/clusterHostInfo/journalnode_hosts", None)
_zkfc_hosts = default("/clusterHostInfo/zkfc_hosts", None)
_rm_host = default("/clusterHostInfo/rm_host", None)
if isinstance(_rm_host, list):
    rm_hosts_in_str = ','.join(_rm_host)
_nm_hosts = default("/clusterHostInfo/nm_hosts", None)
_hs_host = default("/clusterHostInfo/hs_host", None)
_zookeeper_hosts = default("/clusterHostInfo/zookeeper_hosts", None)
_flume_hosts = default("/clusterHostInfo/flume_hosts", None)
_nagios_server_host = default("/clusterHostInfo/nagios_server_host", None)
_ganglia_server_host = default("/clusterHostInfo/ganglia_server_host", None)
_app_timeline_server_hosts = default("/clusterHostInfo/app_timeline_server_hosts", None)
_nimbus_host = default("/clusterHostInfo/nimbus_hosts", None)
_drpc_host = default("/clusterHostInfo/drpc_server_hosts", None)
_supervisor_hosts = default("/clusterHostInfo/supervisor_hosts", None)
_storm_ui_host = default("/clusterHostInfo/storm_ui_server_hosts", None)
_storm_rest_api_hosts = default("/clusterHostInfo/storm_rest_api_hosts", None)
hbase_master_hosts = default("/clusterHostInfo/hbase_master_hosts", None)
if isinstance(hbase_master_hosts, list):
    hbase_master_hosts_in_str = ','.join(hbase_master_hosts)
_hive_server_host = default("/clusterHostInfo/hive_server_host", None)
_oozie_server = default("/clusterHostInfo/oozie_server", None)
_webhcat_server_host = default("/clusterHostInfo/webhcat_server_host", None)
_falcon_host = default("/clusterHostInfo/falcon_server_hosts", None)
# can differ on HDP1
#_mapred_tt_hosts = _slave_hosts
#if hbase_rs_hosts not given it is assumed that region servers on same nodes as slaves
_hbase_rs_hosts = default("/clusterHostInfo/hbase_rs_hosts", _slave_hosts)
_hue_server_host = default("/clusterHostInfo/hue_server_host", None)
all_hosts = config['clusterHostInfo']['all_hosts']

# Without a NameNode, the Ambari server host list is used instead.
if 'namenode_host' in config['clusterHostInfo']:
    nn_hosts_string = " ".join(namenode_host)
else:
    nn_hosts_string = " ".join(config['clusterHostInfo']['ambari_server_host'])


# Host group name -> host list. Entries whose value is None are the
# components that are absent from the cluster host info.
hostgroup_defs = {
    'namenode' : namenode_host,
    'snamenode' : _snamenode_host,
    'slaves' : _slave_hosts,
    'agent-servers' : all_hosts,
    'nagios-server' : _nagios_server_host,
    'jobtracker' : _jtnode_host,
    'ganglia-server' : _ganglia_server_host,
    'flume-servers' : _flume_hosts,
    'zookeeper-servers' : _zookeeper_hosts,
    'hbasemasters' : hbase_master_hosts,
    'hiveserver' : _hive_server_host,
    'region-servers' : _hbase_rs_hosts,
    'oozie-server' : _oozie_server,
    'webhcat-server' : _webhcat_server_host,
    'hue-server' : _hue_server_host,
    'resourcemanager' : _rm_host,
    'nodemanagers' : _nm_hosts,
    'historyserver2' : _hs_host,
    'journalnodes' : _journalnode_hosts,
    'nimbus' : _nimbus_host,
    'drpc-server' : _drpc_host,
    'storm_ui' : _storm_ui_host,
    'supervisors' : _supervisor_hosts,
    'storm_rest_api' : _storm_rest_api_hosts,
    'falcon-server' : _falcon_host,
    'ats-servers' : _app_timeline_server_hosts
}

# ----------------------------------------------------------------------
# Next file in this patch (its diff header continues on the following line):
# http://git-wip-us.apache.org/repos/asf/ambari/blob/83efcfea/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/scripts/status_params.py
# diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/scripts/status_params.py b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/scripts/status_params.py
# new file mode 100644
# index 0000000..11d4aa9
# --- /dev/null
# +++
b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/scripts/status_params.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +""" + +from resource_management import * + +config = Script.get_config() + +nagios_pid_dir = "/var/run/nagios" +nagios_pid_file = format("{nagios_pid_dir}/nagios.pid") + +nagios_var_dir = "/var/nagios" +nagios_rw_dir = "/var/nagios/rw" http://git-wip-us.apache.org/repos/asf/ambari/blob/83efcfea/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/contacts.cfg.j2 ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/contacts.cfg.j2 b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/contacts.cfg.j2 new file mode 100644 index 0000000..610b2bd --- /dev/null +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/contacts.cfg.j2 @@ -0,0 +1,109 @@ +{# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#} + +############################################################################### +# CONTACTS.CFG - SAMPLE CONTACT/CONTACTGROUP DEFINITIONS +# +# Last Modified: 05-31-2007 +# +# NOTES: This config file provides you with some example contact and contact +# group definitions that you can reference in host and service +# definitions. +# +# You don't need to keep these definitions in a separate file from your +# other object definitions. This has been done just to make things +# easier to understand. +# +############################################################################### + +# +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# +# +# + + +############################################################################### +############################################################################### +# +# CONTACTS +# +############################################################################### +############################################################################### + +# Just one contact defined by default - the Nagios admin (that's you) +# This contact definition inherits a lot of default values from the 'generic-contact' +# template which is defined elsewhere. + +define contact{ + contact_name {{nagios_web_login}} ; Short name of user + use generic-contact ; Inherit default values from generic-contact template (defined above) + alias Nagios Admin ; Full name of user + + email {{nagios_contact}} ; <<***** CHANGE THIS TO YOUR EMAIL ADDRESS ****** + } + +# Contact which writes all Nagios alerts to the system logger. 
+define contact{ + contact_name sys_logger ; Short name of user + use generic-contact ; Inherit default values from generic-contact template (defined above) + alias System Logger ; Full name of user + host_notifications_enabled 1 + service_notifications_enabled 1 + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r,s + host_notification_options d,u,r,s + can_submit_commands 1 + retain_status_information 1 + service_notification_commands service_sys_logger + host_notification_commands host_sys_logger + } + +############################################################################### +############################################################################### +# +# CONTACT GROUPS +# +############################################################################### +############################################################################### + +# We only have one contact in this simple configuration file, so there is +# no need to create more than one contact group. + +define contactgroup { + contactgroup_name admins + alias Nagios Administrators + members {{nagios_web_login}},sys_logger +} http://git-wip-us.apache.org/repos/asf/ambari/blob/83efcfea/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-commands.cfg.j2 ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-commands.cfg.j2 b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-commands.cfg.j2 new file mode 100644 index 0000000..a8a616c --- /dev/null +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-commands.cfg.j2 @@ -0,0 +1,166 @@ +{# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#} + +# +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# +# + +{% if check_cpu_on %} +# 'check_cpu' check remote cpu load +define command { + command_name check_cpu + command_line $USER1$/check_wrapper.sh php $USER1$/check_cpu.php -h $HOSTADDRESS$ -p $ARG1$ -w $ARG2$ -c $ARG3$ -e $ARG4$ -k $ARG5$ -r $ARG6$ -t $ARG7$ -u $ARG8$ + } +define command { + command_name check_cpu_ha + command_line $USER1$/check_wrapper.sh php $USER1$/check_cpu_ha.php -h $ARG1$ -p $ARG2$ -w $ARG3$ -c $ARG4$ -e $ARG5$ -k $ARG6$ -r $ARG7$ -t $ARG8$ -u $ARG9$ + } +{% endif %} + +# Check data node storage full +define command { + command_name check_datanode_storage + command_line $USER1$/check_wrapper.sh php $USER1$/check_datanode_storage.php -h $HOSTADDRESS$ -p $ARG1$ -w $ARG2$ -c $ARG3$ -e $ARG4$ -k $ARG5$ -r $ARG6$ -t $ARG7$ -s $ARG8$ + } + +define command{ + command_name check_hdfs_blocks + command_line $USER1$/check_wrapper.sh php $USER1$/check_hdfs_blocks.php -h $ARG1$ -p $ARG2$ -s $ARG3$ -e $ARG4$ -k $ARG5$ -r $ARG6$ -t $ARG7$ -u $ARG8$ + } + +define command{ + command_name check_hdfs_capacity + command_line $USER1$/check_wrapper.sh php $USER1$/check_hdfs_capacity.php -h $ARG1$ -p $ARG2$ -w $ARG3$ -c $ARG4$ -e $ARG5$ -k $ARG6$ -r $ARG7$ -t $ARG8$ -s $ARG9$ + } + +define command{ + command_name check_aggregate + command_line $USER1$/check_wrapper.sh php $USER1$/check_aggregate.php -f /var/nagios/status.dat -s 1 -t service -n $ARG1$ -w $ARG2$ -c $ARG3$ + } + +define command{ + command_name check_rpcq_latency + command_line $USER1$/check_wrapper.sh php $USER1$/check_rpcq_latency.php -h $HOSTADDRESS$ -p $ARG2$ -n $ARG1$ -w $ARG3$ -c $ARG4$ -e $ARG5$ -k $ARG6$ -r $ARG7$ -t $ARG8$ -s $ARG9$ + } + +define command{ + command_name check_rpcq_latency_ha + command_line $USER1$/check_wrapper.sh php $USER1$/check_rpcq_latency_ha.php -h $ARG1$ -p $ARG3$ -n $ARG2$ -w $ARG4$ -c $ARG5$ -e $ARG6$ -k $ARG7$ -r $ARG8$ -t $ARG9$ -s $ARG10$ + } + +define command{ + command_name check_nagios + command_line $USER1$/check_wrapper.sh $USER1$/check_nagios -e 
$ARG1$ -F $ARG2$ -C $ARG3$ + } + +define command{ + command_name check_webui + command_line $USER1$/check_wrapper.sh $USER1$/check_webui.sh $ARG1$ $HOSTADDRESS$ $ARG2$ + } + +define command{ + command_name check_webui_ha + command_line $USER1$/check_wrapper.sh $USER1$/check_webui_ha.sh $ARG1$ $ARG2$ $ARG3$ + } + +define command{ + command_name check_name_dir_status + command_line $USER1$/check_wrapper.sh php $USER1$/check_name_dir_status.php -h $HOSTADDRESS$ -p $ARG1$ -e $ARG2$ -k $ARG3$ -r $ARG4$ -t $ARG5$ -s $ARG6$ + } + +define command{ + command_name check_oozie_status + command_line $USER1$/check_wrapper.sh $USER1$/check_oozie_status.sh $HOSTADDRESS$ $ARG1$ $ARG2$ $ARG3$ $ARG4$ $ARG5$ $ARG6$ $ARG7$ + } + +define command{ + command_name check_templeton_status + command_line $USER1$/check_wrapper.sh $USER1$/check_templeton_status.sh $HOSTADDRESS$ $ARG1$ $ARG2$ $ARG3$ $ARG4$ $ARG5$ $ARG6$ $ARG7$ + } + +define command{ + command_name check_hive_metastore_status + command_line $USER1$/check_wrapper.sh $USER1$/check_hive_metastore_status.sh $HOSTADDRESS$ $ARG1$ $ARG2$ $ARG3$ $ARG4$ $ARG5$ $ARG6$ $ARG7$ + } +define command{ + command_name check_hue_status + command_line $USER1$/check_wrapper.sh $USER1$/check_hue_status.sh + } + +define command{ + command_name check_mapred_local_dir_used_space + command_line $USER1$/check_wrapper.sh $USER1$/check_mapred_local_dir_used.sh $ARG1$ $ARG2$ + } + +define command{ + command_name check_namenodes_ha + command_line $USER1$/check_wrapper.sh $USER1$/check_namenodes_ha.sh $ARG1$ $ARG2$ + } + +define command{ + command_name check_nodemanager_health + command_line $USER1$/check_wrapper.sh $USER1$/check_nodemanager_health.sh $HOSTADDRESS$ $ARG1$ + } + +define command{ + command_name host_sys_logger + command_line $USER1$/sys_logger.py $HOSTSTATETYPE$ $HOSTATTEMPT$ $HOSTSTATE$ "Host::Ping" "Event Host=$HOSTADDRESS$($HOSTSTATE$), $HOSTOUTPUT$ $LONGHOSTOUTPUT$" + } + +define command{ + command_name service_sys_logger + command_line 
$USER1$/sys_logger.py $SERVICESTATETYPE$ $SERVICEATTEMPT$ $SERVICESTATE$ "$SERVICEDESC$" "Event Host=$HOSTADDRESS$ Service Description=$SERVICEDESC$($SERVICESTATE$), $SERVICEOUTPUT$ $LONGSERVICEOUTPUT$" + } + +define command{ + command_name check_tcp_wrapper + command_line $USER1$/check_wrapper.sh $USER1$/check_tcp -H $HOSTADDRESS$ -p $ARG1$ $ARG2$ + } + +define command{ + command_name check_checkpoint_time + command_line $USER1$/check_wrapper.sh /var/lib/ambari-agent/ambari-python-wrap $USER1$/check_checkpoint_time.py -H "$ARG1$" -p $ARG2$ -w $ARG3$ -c $ARG4$ -t $ARG5$ -x $ARG6$ + } + +define command{ + command_name check_tcp_wrapper_sasl + command_line $USER1$/check_wrapper.sh $USER1$/check_tcp -H $HOSTADDRESS$ -p $ARG1$ $ARG2$ -s \"$ARG3$\" + } + +define command{ + command_name check_ambari + command_line $USER1$/check_wrapper.sh /var/lib/ambari-agent/ambari-python-wrap $USER1$/check_ambari_alerts.py -H $HOSTADDRESS$ -f $ARG1$ -n $ARG2$ + } http://git-wip-us.apache.org/repos/asf/ambari/blob/83efcfea/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-hostgroups.cfg.j2 ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-hostgroups.cfg.j2 b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-hostgroups.cfg.j2 new file mode 100644 index 0000000..05c1252 --- /dev/null +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-hostgroups.cfg.j2 @@ -0,0 +1,33 @@ +{# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#} + +{% for name, hosts in hostgroup_defs.iteritems() %} +{% if hosts %} +define hostgroup { + hostgroup_name {{name}} + alias {{name}} + members {{','.join(hosts)}} +} +{% endif %} +{% endfor %} + +define hostgroup { + hostgroup_name all-servers + alias All Servers + members {{','.join(all_hosts)}} +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/83efcfea/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-hosts.cfg.j2 ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-hosts.cfg.j2 b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-hosts.cfg.j2 new file mode 100644 index 0000000..8bcc980 --- /dev/null +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-hosts.cfg.j2 @@ -0,0 +1,53 @@ +{# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#} + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# +{# Emits one Nagios host object per entry of all_hosts. all_ping_ports is indexed by the loop position (loop.index-1), so it is read as a list parallel to all_hosts. The trailing notes below use ';' because Nagios only honors '#' as a comment marker at the start of a line. #} +{% for host in all_hosts %} +define host { + alias {{host}} + host_name {{host}} + use {{host_template}} + address {{host}} + check_command check_tcp_wrapper!{{all_ping_ports[loop.index-1]}}!-w 1 -c 1 + check_interval 0.25 + retry_interval 0.25 + max_check_attempts 4 + notifications_enabled 1 + first_notification_delay 0 ; Send notification soon after change in the hard state + notification_interval 0 ; Send the notification once + notification_options d,u,r +} + +{% endfor %} http://git-wip-us.apache.org/repos/asf/ambari/blob/83efcfea/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-servicegroups.cfg.j2 ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-servicegroups.cfg.j2 b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-servicegroups.cfg.j2 new file mode 100644 index 0000000..00f0740 --- /dev/null +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-servicegroups.cfg.j2 @@ -0,0 +1,119 @@ +{# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#} + + + +{% if hostgroup_defs['namenode'] or + hostgroup_defs['snamenode'] or + hostgroup_defs['slaves'] %} + {% if hostgroup_defs['namenode'] != None %} + define servicegroup { + servicegroup_name HDFS + alias HDFS Checks + } + {% endif %} +{% endif %} +{%if hostgroup_defs['jobtracker'] or + hostgroup_defs['historyserver2']-%} +define servicegroup { + servicegroup_name MAPREDUCE + alias MAPREDUCE Checks +} +{% endif %} +{%if hostgroup_defs['resourcemanager'] or + hostgroup_defs['nodemanagers'] %} +define servicegroup { + servicegroup_name YARN + alias YARN Checks +} +{% endif %} +{%if hostgroup_defs['hbasemasters'] %} +define servicegroup { + servicegroup_name HBASE + alias HBASE Checks +} +{% endif %} +{% if hostgroup_defs['oozie-server'] %} +define servicegroup { + servicegroup_name OOZIE + alias OOZIE Checks +} +{% endif %} +{% if hostgroup_defs['webhcat-server'] %} +define servicegroup { + servicegroup_name WEBHCAT + alias WEBHCAT Checks +} +{% endif %} +{% if hostgroup_defs['nagios-server'] %} +define servicegroup { + servicegroup_name NAGIOS + alias NAGIOS Checks +} +{% endif %} +{% if hostgroup_defs['ganglia-server'] %} +define servicegroup { + servicegroup_name GANGLIA + alias GANGLIA Checks +} +{% endif %} +{% if hostgroup_defs['hiveserver'] %} +define servicegroup { + servicegroup_name HIVE + alias HIVE Checks +} +{% endif %} +{% if hostgroup_defs['zookeeper-servers'] %} +define servicegroup { + servicegroup_name ZOOKEEPER + alias ZOOKEEPER Checks +} +{% endif %} +define servicegroup { + servicegroup_name AMBARI + alias AMBARI Checks +} +{% if hostgroup_defs['hue-server'] %} +define servicegroup { + servicegroup_name HUE + alias HUE Checks +} +{% endif %} +{% if hostgroup_defs['nimbus'] or + hostgroup_defs['drpc-server'] or + hostgroup_defs['storm_ui'] or + hostgroup_defs['supervisors'] or + hostgroup_defs['storm_rest_api']%} +define servicegroup { + servicegroup_name STORM + alias STORM Checks +} +{% endif %} +{% if hostgroup_defs['falcon-server'] %} 
+define servicegroup { + servicegroup_name FALCON + alias FALCON Checks +} +{% endif %} + +{%if hostgroup_defs['flume-servers'] %} +define servicegroup { + servicegroup_name FLUME + alias FLUME Checks +} +{% endif %} http://git-wip-us.apache.org/repos/asf/ambari/blob/83efcfea/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-services.cfg.j2 ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-services.cfg.j2 b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-services.cfg.j2 new file mode 100644 index 0000000..045e9ad --- /dev/null +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/NAGIOS/package/templates/hadoop-services.cfg.j2 @@ -0,0 +1,804 @@ +{# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#} + +# +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# + +{# TODO: Look for { or } in created file #} +# NAGIOS SERVER Check (status log update) +{% if hostgroup_defs['nagios-server'] %} +{# hadoop-service is a template object (register 0): the service definitions in this file inherit its notification and contact settings through "use hadoop-service". The trailing note uses ';' because Nagios only honors '#' as a comment marker at the start of a line. #} +define service { + name hadoop-service + use generic-service + notification_options w,u,c,r,f,s + first_notification_delay 0 + notification_interval 0 ; Send the notification once + contact_groups admins + notifications_enabled 1 + event_handler_enabled 1 + register 0 +} + +define service { + hostgroup_name nagios-server + use hadoop-service + service_description NAGIOS::Nagios status log freshness + servicegroups NAGIOS + check_command check_nagios!10!/var/nagios/status.dat!{{nagios_lookup_daemon_str}} + normal_check_interval 5 + retry_check_interval 0.5 + max_check_attempts 2 +} + +# NAGIOS SERVER HDFS Checks +{% if hostgroup_defs['namenode'] != None %} +define service { + hostgroup_name nagios-server + use hadoop-service + service_description HDFS::Percent DataNodes with space available + servicegroups HDFS + check_command check_aggregate!"DATANODE::DataNode space"!10%!30% + normal_check_interval 0.5 + retry_check_interval 1 + max_check_attempts 1 +} + +define service { + hostgroup_name nagios-server + use hadoop-service + service_description HDFS::Percent DataNodes live + servicegroups HDFS + check_command check_aggregate!"DATANODE::DataNode process"!10%!30% + normal_check_interval 0.5 + retry_check_interval 0.25 + max_check_attempts 3 +} +{% endif %} +{# 
used only for HDP2 #} +{% if hostgroup_defs['namenode'] and hostgroup_defs['namenode'] != None and dfs_ha_enabled %} +define service { + hostgroup_name nagios-server + use hadoop-service + service_description HDFS::NameNode HA Healthy + servicegroups HDFS + check_command check_namenodes_ha!$HOSTGROUPMEMBERS:namenode$!{{ namenode_port }} + normal_check_interval 0.5 + retry_check_interval 0.25 + max_check_attempts 5 +} +{% endif %} + +# AMBARI AGENT Checks +{% for hostname in all_hosts %} +define service { + host_name {{ hostname }} + use hadoop-service + service_description AMBARI::Ambari Agent process + servicegroups AMBARI + check_command check_tcp_wrapper!{{all_ping_ports[loop.index-1]}}!-w 1 -c 1 + normal_check_interval 1 + retry_check_interval 0.25 + max_check_attempts 4 +} + +{% endfor %} + +# NAGIOS SERVER ZOOKEEPER Checks +{% if hostgroup_defs['zookeeper-servers'] %} +define service { + hostgroup_name nagios-server + use hadoop-service + service_description ZOOKEEPER::Percent ZooKeeper Servers live + servicegroups ZOOKEEPER + check_command check_aggregate!"ZOOKEEPER::ZooKeeper Server process"!35%!70% + normal_check_interval 0.5 + retry_check_interval 0.25 + max_check_attempts 3 +} +{% endif %} + +# NAGIOS SERVER HBASE Checks +{% if hostgroup_defs['hbasemasters'] %} +define service { + hostgroup_name nagios-server + use hadoop-service + service_description HBASE::Percent RegionServers live + servicegroups HBASE + check_command check_aggregate!"REGIONSERVER::RegionServer process"!10%!30% + normal_check_interval 0.5 + retry_check_interval 0.25 + max_check_attempts 3 +} +{% endif %} +{% endif %} + + + +# GANGLIA SERVER Checks +{% if hostgroup_defs['ganglia-server'] %} +define service { + hostgroup_name ganglia-server + use hadoop-service + service_description GANGLIA::Ganglia Server process + servicegroups GANGLIA + check_command check_tcp_wrapper!{{ ganglia_port }}!-w 1 -c 1 + normal_check_interval 0.25 + retry_check_interval 0.25 + max_check_attempts 4 +} + 
+{% if hostgroup_defs['namenode'] %} +define service { + hostgroup_name ganglia-server + use hadoop-service + service_description GANGLIA::Ganglia Monitor process for NameNode + servicegroups GANGLIA + check_command check_tcp_wrapper!{{ ganglia_collector_namenode_port }}!-w 1 -c 1 + normal_check_interval 0.25 + retry_check_interval 0.25 + max_check_attempts 4 +} +{% endif %} + +{% if hostgroup_defs['hbasemasters'] %} +define service { + hostgroup_name ganglia-server + use hadoop-service + service_description GANGLIA::Ganglia Monitor process for HBase Master + servicegroups GANGLIA + check_command check_tcp_wrapper!{{ ganglia_collector_hbase_port }}!-w 1 -c 1 + normal_check_interval 0.25 + retry_check_interval 0.25 + max_check_attempts 4 +} +{% endif %} + +{% if hostgroup_defs['resourcemanager'] %} +define service { + hostgroup_name ganglia-server + use hadoop-service + service_description GANGLIA::Ganglia Monitor process for ResourceManager + servicegroups GANGLIA + check_command check_tcp_wrapper!{{ ganglia_collector_rm_port }}!-w 1 -c 1 + normal_check_interval 0.25 + retry_check_interval 0.25 + max_check_attempts 4 +} +{% endif %} + +{% if hostgroup_defs['historyserver2'] %} +define service { + hostgroup_name ganglia-server + use hadoop-service + service_description GANGLIA::Ganglia Monitor process for HistoryServer + servicegroups GANGLIA + check_command check_tcp_wrapper!{{ ganglia_collector_hs_port }}!-w 1 -c 1 + normal_check_interval 0.25 + retry_check_interval 0.25 + max_check_attempts 4 +} +{% endif %} +{% endif %} + + +{% if hostgroup_defs['snamenode'] and hostgroup_defs['namenode'] != None %} +# Secondary namenode checks +define service { + hostgroup_name snamenode + use hadoop-service + service_description NAMENODE::Secondary NameNode process + servicegroups HDFS + check_command check_tcp_wrapper!{{ snamenode_port }}!-w 1 -c 1 + normal_check_interval 0.5 + retry_check_interval 0.25 + max_check_attempts 3 +} +{% endif %} + +{% if 
hostgroup_defs['storm_ui'] %} +# STORM UI Checks +define service { + hostgroup_name storm_ui + use hadoop-service + service_description STORM_UI_SERVER::Storm UI on {{ hostgroup_defs['storm_ui'][0] }} + servicegroups STORM + check_command check_webui!storm_ui!{{ storm_ui_port }} + normal_check_interval 1 + retry_check_interval 1 + max_check_attempts 3 +} +{% endif %} + +{% if hostgroup_defs['storm_ui'] %} +# STORM UI Checks +define service { + hostgroup_name storm_ui + use hadoop-service + service_description STORM_UI_SERVER::Storm UI Server process + servicegroups STORM + check_command check_tcp_wrapper!{{ storm_ui_port }}!-w 1 -c 1 + normal_check_interval 0.5 + retry_check_interval 0.25 + max_check_attempts 3 +} +{% endif %} + +{% if hostgroup_defs['nimbus'] %} +# Nimbus Checks +define service { + hostgroup_name nimbus + use hadoop-service + service_description NIMBUS::Nimbus process + servicegroups STORM + check_command check_tcp_wrapper!{{ nimbus_port }}!-w 1 -c 1 + normal_check_interval 0.5 + retry_check_interval 0.25 + max_check_attempts 3 +} +{% endif %} + +{% if hostgroup_defs['drpc-server'] %} +# drpc Checks +define service { + hostgroup_name drpc-server + use hadoop-service + service_description DRPC_SERVER::DRPC Server process + servicegroups STORM + check_command check_tcp_wrapper!{{ drpc_port }}!-w 1 -c 1 + normal_check_interval 0.5 + retry_check_interval 0.25 + max_check_attempts 3 +} +{% endif %} + +{% if hostgroup_defs['storm_rest_api'] %} +# Storm REST API Checks +define service { + hostgroup_name storm_rest_api + use hadoop-service + service_description STORM_REST_API::Storm REST API Server process + servicegroups STORM + check_command check_tcp_wrapper!{{ storm_rest_api_port }}!-w 1 -c 1 + normal_check_interval 0.5 + retry_check_interval 0.25 + max_check_attempts 3 +} +{% endif %} + +# NAGIOS SERVER Supervisor Checks +{% if hostgroup_defs['supervisors'] %} +define service { + hostgroup_name nagios-server + use hadoop-service + service_description 
SUPERVISOR::Percent Supervisors live + servicegroups STORM + check_command check_aggregate!"SUPERVISOR::Supervisors process"!10%!30% + normal_check_interval 0.5 + retry_check_interval 0.25 + max_check_attempts 3 +} + +define service { + hostgroup_name supervisors + use hadoop-service + service_description SUPERVISOR::Supervisors process + servicegroups STORM + check_command check_tcp_wrapper!{{ supervisor_port }}!-w 1 -c 1 + normal_check_interval 1 + retry_check_interval 0.5 + max_check_attempts 3 +} +{% endif %} + +{% if hostgroup_defs['namenode'] and hostgroup_defs['namenode'] != None %} +# HDFS Checks +{% for namenode_hostname in namenode_host %} +{# TODO: check if we can get rid of str, lower #} +define service { + host_name {{ namenode_hostname }} + use hadoop-service + service_description NAMENODE::NameNode edit logs directory status on {{ namenode_hostname }} + servicegroups HDFS + check_command check_name_dir_status!{{ namenode_port }}!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + normal_check_interval 0.5 + retry_check_interval 0.5 + max_check_attempts 3 +} + +{% if check_cpu_on %} +define service { + host_name {{ namenode_hostname }} + use hadoop-service + service_description NAMENODE::NameNode host CPU utilization on {{ namenode_hostname }} + servicegroups HDFS +# check_command check_cpu!200%!250% + check_command check_cpu!{{ namenode_port }}!200%!250%!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + normal_check_interval 5 + retry_check_interval 2 + max_check_attempts 5 +} +{% endif %} + +define service { + host_name {{ namenode_hostname }} + use hadoop-service + service_description NAMENODE::NameNode Web UI on {{ namenode_hostname }} + servicegroups HDFS + check_command check_webui!namenode!{{ namenode_port }} + normal_check_interval 1 + 
retry_check_interval 1 + max_check_attempts 3 +} + +define service { + host_name {{ namenode_hostname }} + use hadoop-service + service_description NAMENODE::NameNode process on {{ namenode_hostname }} + servicegroups HDFS + check_command check_tcp_wrapper!{{nn_ha_host_port_map[namenode_hostname]}}!-w 1 -c 1 + normal_check_interval 0.5 + retry_check_interval 0.25 + max_check_attempts 3 +} + +define service { + host_name {{ namenode_hostname }} + use hadoop-service + service_description HDFS::NameNode RPC latency on {{ namenode_hostname }} + servicegroups HDFS + check_command check_rpcq_latency!NameNode!{{ namenode_port }}!3000!5000!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + normal_check_interval 5 + retry_check_interval 1 + max_check_attempts 5 +} + +{% endfor %} + +define service { + host_name {{namenode_host[0]}} + use hadoop-service + service_description NAMENODE::Last checkpoint time + servicegroups HDFS + check_command check_checkpoint_time!{{ nn_hosts_string }}!{{ namenode_port }}!200!200!{{ dfs_namenode_checkpoint_period }}!{{dfs_namenode_checkpoint_txns}} + normal_check_interval 0.5 + retry_check_interval 0.25 + max_check_attempts 3 +} + +define service { + hostgroup_name nagios-server + use hadoop-service + service_description HDFS::Blocks health + servicegroups HDFS + check_command check_hdfs_blocks!$HOSTGROUPMEMBERS:namenode$!{{ namenode_port }}!{{ nn_metrics_property }}!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + normal_check_interval 2 + retry_check_interval 1 + max_check_attempts 1 +} + +define service { + hostgroup_name nagios-server + use hadoop-service + service_description HDFS::HDFS capacity utilization + servicegroups HDFS + check_command check_hdfs_capacity!$HOSTGROUPMEMBERS:namenode$!{{ namenode_port }}!80%!90%!{{ 
str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + normal_check_interval 2 + retry_check_interval 1 + max_check_attempts 1 +} + +{% endif %} + +{% if hostgroup_defs['resourcemanager'] %} +# YARN::RESOURCEMANAGER Checks +define service { + hostgroup_name nagios-server + use hadoop-service + service_description RESOURCEMANAGER::ResourceManager Web UI + servicegroups YARN + check_command check_webui_ha!resourcemanager!{{ rm_hosts_in_str }}!{{ rm_port }} + normal_check_interval 1 + retry_check_interval 1 + max_check_attempts 3 +} + +{% if check_cpu_on %} +define service { + hostgroup_name nagios-server + use hadoop-service + service_description RESOURCEMANAGER::ResourceManager CPU utilization + servicegroups YARN +# check_command check_cpu!200%!250% + check_command check_cpu_ha!{{ rm_hosts_in_str }}!{{ rm_port }}!200%!250%!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + normal_check_interval 5 + retry_check_interval 2 + max_check_attempts 5 +} +{% endif %} + +define service { + hostgroup_name nagios-server + use hadoop-service + service_description RESOURCEMANAGER::ResourceManager RPC latency + servicegroups YARN + check_command check_rpcq_latency_ha!{{ rm_hosts_in_str }}!ResourceManager!{{ rm_port }}!3000!5000!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + normal_check_interval 5 + retry_check_interval 1 + max_check_attempts 5 +} + +{% for rm_host in _rm_host %} +define service { + host_name {{ rm_host }} + use hadoop-service + service_description RESOURCEMANAGER::ResourceManager process on {{ rm_host }} + servicegroups YARN + check_command check_tcp_wrapper!{{ rm_port }}!-w 1 -c 1 + normal_check_interval 1 + retry_check_interval 0.5 + max_check_attempts 3 +} +{% 
endfor %} +{% endif %} + +{% if hostgroup_defs['nodemanagers'] %} +# YARN::NODEMANAGER Checks +define service { + hostgroup_name nodemanagers + use hadoop-service + service_description NODEMANAGER::NodeManager process + servicegroups YARN + check_command check_tcp_wrapper!{{ nm_port }}!-w 1 -c 1 + normal_check_interval 1 + retry_check_interval 0.5 + max_check_attempts 3 +} + +define service { + hostgroup_name nodemanagers + use hadoop-service + service_description NODEMANAGER::NodeManager health + servicegroups YARN + check_command check_nodemanager_health!{{ nm_port }}!{{ str(security_enabled).lower() }}!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }} + normal_check_interval 1 + retry_check_interval 1 + max_check_attempts 3 +} +define service { + hostgroup_name nagios-server + use hadoop-service + service_description NODEMANAGER::Percent NodeManagers live + servicegroups YARN + check_command check_aggregate!"NODEMANAGER::NodeManager process"!10%!30% + normal_check_interval 0.5 + retry_check_interval 0.25 + max_check_attempts 3 +} +{% endif %} + +{% if hostgroup_defs['historyserver2'] %} +# MAPREDUCE::JOBHISTORY Checks +define service { + hostgroup_name historyserver2 + use hadoop-service + service_description JOBHISTORY::HistoryServer Web UI + servicegroups MAPREDUCE + check_command check_webui!historyserver2!{{ hs_port }} + normal_check_interval 1 + retry_check_interval 1 + max_check_attempts 3 +} + +{% if check_cpu_on %} +define service { + hostgroup_name historyserver2 + use hadoop-service + service_description JOBHISTORY::HistoryServer CPU utilization + servicegroups MAPREDUCE +# check_command check_cpu!200%!250% + check_command check_cpu!{{ hs_port }}!200%!250%!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + normal_check_interval 5 + retry_check_interval 2 + max_check_attempts 5 +} +{% endif 
%} + +define service { + hostgroup_name historyserver2 + use hadoop-service + service_description JOBHISTORY::HistoryServer RPC latency + servicegroups MAPREDUCE + check_command check_rpcq_latency!JobHistoryServer!{{ hs_port }}!3000!5000!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + normal_check_interval 5 + retry_check_interval 1 + max_check_attempts 5 +} + +define service { + hostgroup_name historyserver2 + use hadoop-service + service_description JOBHISTORY::HistoryServer process + servicegroups MAPREDUCE + check_command check_tcp_wrapper!{{ hs_port }}!-w 1 -c 1 + normal_check_interval 1 + retry_check_interval 0.5 + max_check_attempts 3 +} + +{% endif %} + +{% if hostgroup_defs['journalnodes'] %} +# Journalnode checks +define service { + hostgroup_name journalnodes + use hadoop-service + service_description JOURNALNODE::JournalNode process + servicegroups HDFS + check_command check_tcp_wrapper!{{ journalnode_port }}!-w 1 -c 1 + normal_check_interval 1 + retry_check_interval 0.5 + max_check_attempts 3 +} + +{% if dfs_ha_enabled %} +define service { + hostgroup_name nagios-server + use hadoop-service + service_description HDFS::Percent JournalNodes live + servicegroups HDFS + check_command check_aggregate!"JOURNALNODE::JournalNode process"!33%!50% + normal_check_interval 0.5 + retry_check_interval 0.25 + max_check_attempts 3 +} +{% endif %} +{% endif %} + +{% if hostgroup_defs['slaves'] and hostgroup_defs['namenode'] != None %} +# HDFS::DATANODE Checks +define service { + hostgroup_name slaves + use hadoop-service + service_description DATANODE::DataNode process + servicegroups HDFS + check_command check_tcp_wrapper!{{datanode_port}}!-w 1 -c 1 + normal_check_interval 1 + retry_check_interval 0.5 + max_check_attempts 3 +} + +define service { + hostgroup_name slaves + use hadoop-service + service_description DATANODE::DataNode space + servicegroups HDFS + 
check_command check_datanode_storage!{{ datanode_port }}!90%!90%!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + normal_check_interval 2 + retry_check_interval 1 + max_check_attempts 2 +} + +{% endif %} + +{% if hostgroup_defs['zookeeper-servers'] %} +# ZOOKEEPER Checks +define service { + hostgroup_name zookeeper-servers + use hadoop-service + service_description ZOOKEEPER::ZooKeeper Server process + servicegroups ZOOKEEPER + check_command check_tcp_wrapper!{{ clientPort }}!-w 1 -c 1 + normal_check_interval 1 + retry_check_interval 0.5 + max_check_attempts 3 +} +{% endif %} + +{% if hostgroup_defs['hbasemasters'] %} +# HBASE::REGIONSERVER Checks +define service { + hostgroup_name region-servers + use hadoop-service + service_description REGIONSERVER::RegionServer process + servicegroups HBASE + check_command check_tcp_wrapper!{{ hbase_rs_port }}!-w 1 -c 1 + normal_check_interval 1 + retry_check_interval 0.5 + max_check_attempts 3 +} + +{# HBASE:: MASTER Checks +# define service { +# hostgroup_name hbasemasters +# use hadoop-service +# service_description HBASEMASTER::HBase Master Web UI +# servicegroups HBASE +# check_command check_webui!hbase!{{ hbase_master_port }} +# normal_check_interval 1 +# retry_check_interval 1 +# max_check_attempts 3 +# #} +{% if hostgroup_defs['hbasemasters'] %} +{% if check_cpu_on %} +define service { + hostgroup_name nagios-server + use hadoop-service + service_description HBASEMASTER::HBase Master CPU utilization + servicegroups HBASE +# check_command check_cpu!200%!250% + check_command check_cpu_ha!{{ hbase_master_hosts_in_str }}!{{ hbase_master_port }}!200%!250%!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + normal_check_interval 5 + retry_check_interval 2 + max_check_attempts 5 +} +{% endif %} +{% endif %} + +{% for 
hbasemaster in hbase_master_hosts %} +define service { + host_name {{ hbasemaster }} + use hadoop-service + service_description HBASEMASTER::HBase Master process on {{ hbasemaster }} + servicegroups HBASE + check_command check_tcp_wrapper!{{ hbase_master_rpc_port }}!-w 1 -c 1 + normal_check_interval 0.5 + retry_check_interval 0.25 + max_check_attempts 4 +} +{% endfor %} +{% endif %} + +{% if hostgroup_defs['hiveserver'] %} +# HIVE Metastore check +define service { + hostgroup_name hiveserver + use hadoop-service + service_description HIVE-METASTORE::Hive Metastore process + servicegroups HIVE + check_command check_tcp_wrapper!{{ hive_metastore_port }}!-w 1 -c 1 + normal_check_interval 0.5 + retry_check_interval 0.5 + max_check_attempts 3 +} + +# HIVE Server check +define service { + hostgroup_name hiveserver + use hadoop-service + service_description HIVE-SERVER::HiveServer2 process + servicegroups HIVE + check_command check_tcp_wrapper_sasl!{{ hive_server_port }}!-w 1 -c 1!A001 AUTHENTICATE ANONYMOUS + normal_check_interval 0.5 + retry_check_interval 0.5 + max_check_attempts 3 +} +{% endif %} +{% if hostgroup_defs['oozie-server'] %} +# Oozie check +define service { + hostgroup_name oozie-server + use hadoop-service + service_description OOZIE::Oozie Server status + servicegroups OOZIE + {% if security_enabled %} + check_command check_oozie_status!{{ oozie_server_port }}!{{ java64_home }}!true!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }} + {% else %} + check_command check_oozie_status!{{ oozie_server_port }}!{{ java64_home }}!false + {% endif %} + normal_check_interval 1 + retry_check_interval 1 + max_check_attempts 3 +} +{% endif %} +{% if hostgroup_defs['webhcat-server'] %} +# WEBHCAT check +define service { + hostgroup_name webhcat-server + use hadoop-service + service_description WEBHCAT::WebHCat Server status + servicegroups WEBHCAT + {% if security_enabled %} + check_command check_templeton_status!{{ templeton_port }}!v1!{{ 
str(security_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }} + {% else %} + check_command check_templeton_status!{{ templeton_port }}!v1!false + {% endif %} + normal_check_interval 1 + retry_check_interval 0.5 + max_check_attempts 3 +} +{% endif %} + +{% if hostgroup_defs['hue-server'] %} +define service { + hostgroup_name hue-server + use hadoop-service + service_description HUE::Hue Server status + servicegroups HUE + check_command check_hue_status + normal_check_interval 100 + retry_check_interval 0.5 + max_check_attempts 3 +} +{% endif %} + +#FALCON checks +{# Both Falcon services inherit from the hadoop-service template, like the other services in this file, so they pick up its contact group, notification options and event handler setting. #} +{% if hostgroup_defs['falcon-server'] %} +define service { + hostgroup_name falcon-server + use hadoop-service + service_description FALCON::Falcon Server process + servicegroups FALCON + check_command check_tcp_wrapper!{{ falcon_port }}!-w 1 -c 1 + normal_check_interval 1 + retry_check_interval 0.5 + max_check_attempts 3 +} +define service { + hostgroup_name falcon-server + use hadoop-service + service_description FALCON::Falcon Server Web UI + servicegroups FALCON + check_command check_webui!falconserver!{{ falcon_port }} + normal_check_interval 1 + retry_check_interval 0.5 + max_check_attempts 3 +} +{% endif %} +{% if hostgroup_defs['ats-servers'] %} +define service { + hostgroup_name ats-servers + use hadoop-service + service_description APP_TIMELINE_SERVER::App Timeline Server process + servicegroups YARN + check_command check_tcp_wrapper!{{ ahs_port }}!-w 1 -c 1 + normal_check_interval 1 + retry_check_interval 0.5 + max_check_attempts 3 +} +{% endif %} + +{% if hostgroup_defs['flume-servers'] %} +# FLUME Checks +define service { + hostgroup_name flume-servers + use hadoop-service + service_description FLUME::Flume Agent process + servicegroups FLUME + check_command check_ambari!/var/nagios/ambari.json!flume_agent + normal_check_interval 1 + retry_check_interval 0.5 + max_check_attempts 3 +} +{% endif %} +
