AMBARI-7951. Alerts are present on host after enabling maintenance mode (dsen via dlysnichenko)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/9406f962 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/9406f962 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/9406f962 Branch: refs/heads/branch-1.7.0 Commit: 9406f962d7b686a6bbc03f2212f617caa0004d6c Parents: 8086fdb Author: Lisnichenko Dmitro <[email protected]> Authored: Sat Oct 25 19:46:20 2014 +0300 Committer: Lisnichenko Dmitro <[email protected]> Committed: Sat Oct 25 19:47:57 2014 +0300 ---------------------------------------------------------------------- .../services/NAGIOS/package/files/mm_wrapper.py | 13 ++--- .../package/templates/hadoop-services.cfg.j2 | 31 ++++++++++ .../services/NAGIOS/package/files/mm_wrapper.py | 15 +++-- .../templates/hadoop-servicegroups.cfg.j2 | 2 +- .../package/templates/hadoop-services.cfg.j2 | 59 ++++++++++++++++++-- .../stacks/1.3.2/NAGIOS/test_mm_wrapper.py | 4 +- .../stacks/2.0.6/NAGIOS/test_mm_wrapper.py | 4 +- 7 files changed, 104 insertions(+), 24 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/9406f962/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/NAGIOS/package/files/mm_wrapper.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/NAGIOS/package/files/mm_wrapper.py b/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/NAGIOS/package/files/mm_wrapper.py index 7a622b6..b1b6a84 100644 --- a/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/NAGIOS/package/files/mm_wrapper.py +++ b/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/NAGIOS/package/files/mm_wrapper.py @@ -22,7 +22,7 @@ import subprocess import os N_SGN = 'NAGIOS_SERVICEGROUPNAME' -N_SD = 'NAGIOS_SERVICEDESC' +N_SD = 'NAGIOS__SERVICEHOST_COMPONENT' N_HOST = 'NAGIOS_HOSTNAME' LIST_SEPARATOR = "--" @@ -53,14 +53,15 @@ def ignored_host_list(service, component): if lines: for l in lines: tokens = l.split(' ') - if len(tokens) == 3 and tokens[1] == service and tokens[2].strip() == component: - result.append(tokens[0]) + if len(tokens) == 3 and tokens[1].strip().upper() == service.strip().upper() and \ + tokens[2].strip().upper() == component.strip().upper(): + result.append(tokens[0]) return result def get_real_service(): try: - service = os.environ[N_SGN] # e.g. 'HBASE' + service = os.environ[N_SGN].strip().upper() # e.g. 'HBASE' except KeyError: service = '' return service @@ -68,9 +69,7 @@ def get_real_service(): def get_real_component(): try: - arr_desc = os.environ[N_SD] # e.g. 'HBASE::Percent RegionServers live' - SEPARATOR = "::" - comp_name = arr_desc.replace(SEPARATOR, ' ').split(' ')[0] + comp_name = os.environ[N_SD].strip() except KeyError: comp_name = '' mapping = { http://git-wip-us.apache.org/repos/asf/ambari/blob/9406f962/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/NAGIOS/package/templates/hadoop-services.cfg.j2 ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/NAGIOS/package/templates/hadoop-services.cfg.j2 b/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/NAGIOS/package/templates/hadoop-services.cfg.j2 index 65dcb7e..48bba18 100644 --- a/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/NAGIOS/package/templates/hadoop-services.cfg.j2 +++ b/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/NAGIOS/package/templates/hadoop-services.cfg.j2 @@ -139,6 +139,7 @@ define service { use hadoop-service service_description GANGLIA::Ganglia Server process servicegroups GANGLIA + _host_component GANGLIA_SERVER check_command check_tcp_wrapper!{{ ganglia_port }}!-w 1 -c 1 normal_check_interval 0.25 retry_check_interval 0.25 @@ -152,6 +153,7 @@ define service { use hadoop-service service_description GANGLIA::Ganglia Monitor process for NameNode servicegroups GANGLIA + _host_component GANGLIA_MONITOR check_command check_tcp_wrapper!{{ ganglia_collector_namenode_port }}!-w 1 -c 1 normal_check_interval 0.25 retry_check_interval 0.25 @@ -167,6 +169,7 @@ define service { use hadoop-service service_description GANGLIA::Ganglia Monitor process for JobTracker servicegroups GANGLIA + _host_component GANGLIA_MONITOR check_command check_tcp_wrapper!{{ ganglia_collector_jobtracker_port }}!-w 1 -c 1 normal_check_interval 0.25 retry_check_interval 0.25 @@ -182,6 +185,7 @@ define service { use hadoop-service service_description GANGLIA::Ganglia Monitor process for HBase Master servicegroups GANGLIA + _host_component GANGLIA_MONITOR check_command check_tcp_wrapper!{{ ganglia_collector_hbase_port }}!-w 1 -c 1 normal_check_interval 0.25 retry_check_interval 0.25 @@ -198,6 +202,7 @@ define service { use hadoop-service service_description GANGLIA::Ganglia Monitor process for HistoryServer servicegroups GANGLIA + _host_component GANGLIA_MONITOR check_command check_tcp_wrapper!{{ ganglia_collector_hs_port }}!-w 1 -c 1 normal_check_interval 0.25 retry_check_interval 0.25 @@ -216,6 +221,7 @@ define service { service_description NAMENODE::Secondary NameNode process servicegroups HDFS check_command check_tcp_wrapper!{{ snamenode_port }}!-w 1 -c 1 + _host_component SECONDARY_NAMENODE normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 3 @@ -233,6 +239,7 @@ define service { service_description NAMENODE::NameNode edit logs directory status on {{ namenode_hostname }} servicegroups HDFS check_command check_name_dir_status!{{ namenode_port }}!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component NAMENODE normal_check_interval 0.5 retry_check_interval 0.5 max_check_attempts 3 @@ -246,6 +253,7 @@ define service { servicegroups HDFS # check_command check_cpu!200%!250% check_command check_cpu!{{ namenode_port }}!200%!250%!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component NAMENODE normal_check_interval 5 retry_check_interval 2 max_check_attempts 5 @@ -258,6 +266,7 @@ define service { service_description NAMENODE::NameNode Web UI on {{ namenode_hostname }} servicegroups HDFS check_command check_webui!namenode!{{ namenode_port }} + _host_component NAMENODE normal_check_interval 1 retry_check_interval 1 max_check_attempts 3 @@ -269,6 +278,7 @@ define service { service_description NAMENODE::NameNode process on {{ namenode_hostname }} servicegroups HDFS check_command check_tcp_wrapper!{{ namenode_metadata_port }}!-w 1 -c 1 + _host_component NAMENODE normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 3 @@ -280,6 +290,7 @@ define service { service_description HDFS::NameNode RPC latency on {{ namenode_hostname }} servicegroups HDFS check_command check_rpcq_latency!NameNode!{{ namenode_port }}!3000!5000!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component NAMENODE normal_check_interval 5 retry_check_interval 1 max_check_attempts 5 @@ -293,6 +304,7 @@ define service { service_description HDFS::Blocks health servicegroups HDFS check_command check_hdfs_blocks!$HOSTGROUPMEMBERS:namenode$!{{ namenode_port }}!{{ nn_metrics_property }}!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component DATANODE normal_check_interval 2 retry_check_interval 1 max_check_attempts 1 @@ -304,6 +316,7 @@ define service { service_description HDFS::HDFS capacity utilization servicegroups HDFS check_command check_hdfs_capacity!$HOSTGROUPMEMBERS:namenode$!{{ namenode_port }}!80%!90%!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component DATANODE normal_check_interval 2 retry_check_interval 1 max_check_attempts 1 @@ -319,6 +332,7 @@ define service { service_description JOBTRACKER::JobTracker Web UI servicegroups MAPREDUCE check_command check_webui!jobtracker!{{ jtnode_port }} + _host_component JOBTRACKER normal_check_interval 1 retry_check_interval 1 max_check_attempts 3 @@ -330,6 +344,7 @@ define service { service_description JOBTRACKER::HistoryServer Web UI servicegroups MAPREDUCE check_command check_webui!jobhistory!{{ jobhistory_port }} + _host_component HISTORYSERVER normal_check_interval 1 retry_check_interval 1 max_check_attempts 3 @@ -341,6 +356,7 @@ define service { service_description JOBTRACKER::JobTracker CPU utilization servicegroups MAPREDUCE check_command check_cpu!{{ jtnode_port }}!200%!250%!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component JOBTRACKER normal_check_interval 5 retry_check_interval 2 max_check_attempts 5 @@ -354,6 +370,7 @@ define service { service_description JOBTRACKER::JobTracker process servicegroups MAPREDUCE check_command check_tcp_wrapper!{{ jtnode_port }}!-w 1 -c 1 + _host_component JOBTRACKER normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 4 @@ -365,6 +382,7 @@ define service { service_description MAPREDUCE::JobTracker RPC latency servicegroups MAPREDUCE check_command check_rpcq_latency!JobTracker!{{ jtnode_port }}!3000!5000!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component JOBTRACKER normal_check_interval 5 retry_check_interval 1 max_check_attempts 5 @@ -389,6 +407,7 @@ define service { service_description TASKTRACKER::TaskTracker process servicegroups MAPREDUCE check_command check_tcp_wrapper!{{ tasktracker_port }}!-w 1 -c 1 + _host_component TASKTRACKER normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -418,6 +437,7 @@ define service { service_description DATANODE::DataNode process servicegroups HDFS check_command check_tcp_wrapper!{{datanode_port}}!-w 1 -c 1 + _host_component DATANODE normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -429,6 +449,7 @@ define service { service_description DATANODE::DataNode space servicegroups HDFS check_command check_datanode_storage!{{ datanode_port }}!90%!90%!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component DATANODE normal_check_interval 2 retry_check_interval 1 max_check_attempts 2 @@ -444,6 +465,7 @@ define service { service_description FLUME::Flume Agent process servicegroups FLUME check_command check_tcp_wrapper!{{ flume_port }}!-w 1 -c 1 + _host_component FLUME normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -459,6 +481,7 @@ define service { service_description ZOOKEEPER::ZooKeeper Server process servicegroups ZOOKEEPER check_command check_tcp_wrapper!{{ clientPort }}!-w 1 -c 1 + _host_component ZOOKEEPER_SERVER normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -473,6 +496,7 @@ define service { service_description REGIONSERVER::RegionServer process servicegroups HBASE check_command check_tcp_wrapper!{{ hbase_rs_port }}!-w 1 -c 1 + _host_component HBASE_REGIONSERVER normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -486,6 +510,7 @@ define service { service_description HBASEMASTER::HBase Master CPU utilization servicegroups HBASE check_command check_cpu_ha!{{ hbase_master_hosts_in_str }}!{{ hbase_master_port }}!200%!250%!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component HBASE_MASTER normal_check_interval 5 retry_check_interval 2 max_check_attempts 5 @@ -500,6 +525,7 @@ define service { service_description HBASEMASTER::HBase Master process on {{ hbasemaster }} servicegroups HBASE check_command check_tcp_wrapper!{{ hbase_master_rpc_port }}!-w 1 -c 1 + _host_component HBASE_MASTER normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 4 @@ -515,6 +541,7 @@ define service { service_description HIVE-METASTORE::Hive Metastore process servicegroups HIVE check_command check_tcp_wrapper!{{ hive_metastore_port }}!-w 1 -c 1 + _host_component HIVE_METASTORE normal_check_interval 0.5 retry_check_interval 0.5 max_check_attempts 3 @@ -527,6 +554,7 @@ define service { service_description HIVE-SERVER::HiveServer2 process servicegroups HIVE check_command check_tcp_wrapper_sasl!{{ hive_server_port }}!{{ '--security-enabled' if security_enabled else '' }}!-w 1 -c 1 + _host_component HIVE_SERVER normal_check_interval 0.5 retry_check_interval 0.5 max_check_attempts 3 @@ -544,6 +572,7 @@ define service { {% else %} check_command check_oozie_status!{{ oozie_server_port }}!{{ java64_home }}!false {% endif %} + _host_component OOZIE_SERVER normal_check_interval 1 retry_check_interval 1 max_check_attempts 3 @@ -561,6 +590,7 @@ define service { {% else %} check_command check_templeton_status!{{ templeton_port }}!v1!false {% endif %} + _host_component WEBHCAT_SERVER normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -574,6 +604,7 @@ define service { service_description HUE::Hue Server status servicegroups HUE check_command check_hue_status + _host_component HUE normal_check_interval 100 retry_check_interval 0.5 max_check_attempts 3 http://git-wip-us.apache.org/repos/asf/ambari/blob/9406f962/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/mm_wrapper.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/mm_wrapper.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/mm_wrapper.py index 7a622b6..e9ea422 100644 --- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/mm_wrapper.py +++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/mm_wrapper.py @@ -22,7 +22,7 @@ import subprocess import os N_SGN = 'NAGIOS_SERVICEGROUPNAME' -N_SD = 'NAGIOS_SERVICEDESC' +N_SD = 'NAGIOS__SERVICEHOST_COMPONENT' N_HOST = 'NAGIOS_HOSTNAME' LIST_SEPARATOR = "--" @@ -53,14 +53,15 @@ def ignored_host_list(service, component): if lines: for l in lines: tokens = l.split(' ') - if len(tokens) == 3 and tokens[1] == service and tokens[2].strip() == component: - result.append(tokens[0]) + if len(tokens) == 3 and tokens[1].strip().upper() == service.strip().upper() and \ + tokens[2].strip().upper() == component.strip().upper(): + result.append(tokens[0]) return result def get_real_service(): try: - service = os.environ[N_SGN] # e.g. 'HBASE' + service = os.environ[N_SGN].strip().upper() # e.g. 'HBASE' except KeyError: service = '' return service @@ -68,15 +69,13 @@ def get_real_service(): def get_real_component(): try: - arr_desc = os.environ[N_SD] # e.g. 'HBASE::Percent RegionServers live' - SEPARATOR = "::" - comp_name = arr_desc.replace(SEPARATOR, ' ').split(' ')[0] + comp_name = os.environ[N_SD].strip() except KeyError: comp_name = '' mapping = { 'HBASEMASTER': 'HBASE_MASTER', 'REGIONSERVER': 'HBASE_REGIONSERVER', - 'JOBHISTORY': 'MAPREDUCE2', + 'JOBHISTORY': 'HISTORYSERVER', 'HIVE-METASTORE': 'HIVE_METASTORE', 'HIVE-SERVER': 'HIVE_SERVER', 'FLUME': 'FLUME_HANDLER', http://git-wip-us.apache.org/repos/asf/ambari/blob/9406f962/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-servicegroups.cfg.j2 ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-servicegroups.cfg.j2 b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-servicegroups.cfg.j2 index c5ca5bb..a10fa80 100644 --- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-servicegroups.cfg.j2 +++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-servicegroups.cfg.j2 @@ -31,7 +31,7 @@ {%if hostgroup_defs['jobtracker'] or hostgroup_defs['historyserver2']-%} define servicegroup { - servicegroup_name MAPREDUCE + servicegroup_name MAPREDUCE2 alias MAPREDUCE Checks } {% endif %} http://git-wip-us.apache.org/repos/asf/ambari/blob/9406f962/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-services.cfg.j2 ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-services.cfg.j2 b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-services.cfg.j2 index 155d750..04423fb 100644 --- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-services.cfg.j2 +++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-services.cfg.j2 @@ -95,6 +95,7 @@ define service { service_description HDFS::NameNode HA Healthy servicegroups HDFS check_command check_namenodes_ha!$HOSTGROUPMEMBERS:namenode$!{{ namenode_port }} + _host_component NAMENODE normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 5 @@ -155,6 +156,7 @@ define service { service_description GANGLIA::Ganglia Server process servicegroups GANGLIA check_command check_tcp_wrapper!{{ ganglia_port }}!-w 1 -c 1 + _host_component GANGLIA_SERVER normal_check_interval 0.25 retry_check_interval 0.25 max_check_attempts 4 @@ -167,6 +169,7 @@ define service { service_description GANGLIA::Ganglia Monitor process for NameNode servicegroups GANGLIA check_command check_tcp_wrapper!{{ ganglia_collector_namenode_port }}!-w 1 -c 1 + _host_component GANGLIA_MONITOR normal_check_interval 0.25 retry_check_interval 0.25 max_check_attempts 4 @@ -180,6 +183,7 @@ define service { service_description GANGLIA::Ganglia Monitor process for HBase Master servicegroups GANGLIA check_command check_tcp_wrapper!{{ ganglia_collector_hbase_port }}!-w 1 -c 1 + _host_component GANGLIA_MONITOR normal_check_interval 0.25 retry_check_interval 0.25 max_check_attempts 4 @@ -193,6 +197,7 @@ define service { service_description GANGLIA::Ganglia Monitor process for ResourceManager servicegroups GANGLIA check_command check_tcp_wrapper!{{ ganglia_collector_rm_port }}!-w 1 -c 1 + _host_component GANGLIA_MONITOR normal_check_interval 0.25 retry_check_interval 0.25 max_check_attempts 4 @@ -206,6 +211,7 @@ define service { service_description GANGLIA::Ganglia Monitor process for HistoryServer servicegroups GANGLIA check_command check_tcp_wrapper!{{ ganglia_collector_hs_port }}!-w 1 -c 1 + _host_component GANGLIA_MONITOR normal_check_interval 0.25 retry_check_interval 0.25 max_check_attempts 4 @@ -222,6 +228,7 @@ define service { service_description NAMENODE::Secondary NameNode process servicegroups HDFS check_command check_tcp_wrapper!{{ snamenode_port }}!-w 1 -c 1 + _host_component SECONDARY_NAMENODE normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 3 @@ -236,6 +243,7 @@ define service { service_description STORM_UI_SERVER::Storm UI on {{ hostgroup_defs['storm_ui'][0] }} servicegroups STORM check_command check_webui!storm_ui!{{ storm_ui_port }} + _host_component STORM_UI_SERVER normal_check_interval 1 retry_check_interval 1 max_check_attempts 3 @@ -250,6 +258,7 @@ define service { service_description STORM_UI_SERVER::Storm UI Server process servicegroups STORM check_command check_tcp_wrapper!{{ storm_ui_port }}!-w 1 -c 1 + _host_component STORM_UI_SERVER normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 3 @@ -264,6 +273,7 @@ define service { service_description NIMBUS::Nimbus process servicegroups STORM check_command check_tcp_wrapper!{{ nimbus_port }}!-w 1 -c 1 + _host_component NIMBUS normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 3 @@ -278,6 +288,7 @@ define service { service_description DRPC_SERVER::DRPC Server process servicegroups STORM check_command check_tcp_wrapper!{{ drpc_port }}!-w 1 -c 1 + _host_component DRPC_SERVER normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 3 @@ -292,6 +303,7 @@ define service { service_description STORM_REST_API::Storm REST API Server process servicegroups STORM check_command check_tcp_wrapper!{{ storm_rest_api_port }}!-w 1 -c 1 + _host_component STORM_REST_API normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 3 @@ -306,6 +318,7 @@ define service { service_description SUPERVISOR::Percent Supervisors live servicegroups STORM check_command check_aggregate!"SUPERVISOR::Supervisors process"!10%!30% + _host_component SUPERVISOR normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 3 @@ -317,6 +330,7 @@ define service { service_description SUPERVISOR::Supervisors process servicegroups STORM check_command check_tcp_wrapper!{{ supervisor_port }}!-w 1 -c 1 + _host_component SUPERVISOR normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -333,6 +347,7 @@ define service { service_description NAMENODE::NameNode edit logs directory status on {{ namenode_hostname }} servicegroups HDFS check_command check_name_dir_status!{{ namenode_port }}!{{ str(hdfs_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component NAMENODE normal_check_interval 0.5 retry_check_interval 0.5 max_check_attempts 3 @@ -345,6 +360,7 @@ define service { service_description NAMENODE::NameNode host CPU utilization on {{ namenode_hostname }} servicegroups HDFS check_command check_cpu!{{ namenode_port }}!200%!250%!{{ str(hdfs_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component NAMENODE normal_check_interval 5 retry_check_interval 2 max_check_attempts 5 @@ -357,6 +373,7 @@ define service { service_description NAMENODE::NameNode Web UI on {{ namenode_hostname }} servicegroups HDFS check_command check_webui!namenode!{{ namenode_port }} + _host_component NAMENODE normal_check_interval 1 retry_check_interval 1 max_check_attempts 3 @@ -368,6 +385,7 @@ define service { service_description NAMENODE::NameNode process on {{ namenode_hostname }} servicegroups HDFS check_command check_tcp_wrapper!{{nn_ha_host_port_map[namenode_hostname]}}!-w 1 -c 1 + _host_component NAMENODE normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 3 @@ -379,6 +397,7 @@ define service { service_description HDFS::NameNode RPC latency on {{ namenode_hostname }} servicegroups HDFS check_command check_rpcq_latency!NameNode!{{ namenode_port }}!3000!5000!{{ str(hdfs_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component NAMENODE normal_check_interval 5 retry_check_interval 1 max_check_attempts 5 @@ -392,6 +411,7 @@ define service { service_description NAMENODE::Last checkpoint time servicegroups HDFS check_command check_checkpoint_time!{{ nn_hosts_string }}!{{ namenode_port }}!200!200!{{ dfs_namenode_checkpoint_period }}!{{dfs_namenode_checkpoint_txns}}!{{str(hdfs_ssl_enabled).lower()}} + _host_component NAMENODE normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 3 @@ -403,6 +423,7 @@ define service { service_description HDFS::Blocks health servicegroups HDFS check_command check_hdfs_blocks!$HOSTGROUPMEMBERS:namenode$!{{ namenode_port }}!{{ nn_metrics_property }}!{{ str(hdfs_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component DATANODE normal_check_interval 2 retry_check_interval 1 max_check_attempts 1 @@ -414,6 +435,7 @@ define service { service_description HDFS::HDFS capacity utilization servicegroups HDFS check_command check_hdfs_capacity!$HOSTGROUPMEMBERS:namenode$!{{ namenode_port }}!80%!90%!{{ str(hdfs_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component DATANODE normal_check_interval 2 retry_check_interval 1 max_check_attempts 1 @@ -429,6 +451,7 @@ define service { service_description RESOURCEMANAGER::ResourceManager Web UI servicegroups YARN check_command check_webui_ha!resourcemanager!{{ rm_hosts_in_str }}!{{ rm_port }} + _host_component RESOURCEMANAGER normal_check_interval 1 retry_check_interval 1 max_check_attempts 3 @@ -441,6 +464,7 @@ define service { service_description RESOURCEMANAGER::ResourceManager CPU utilization servicegroups YARN check_command check_cpu_ha!{{ rm_hosts_in_str }}!{{ rm_port }}!200%!250%!{{ str(yarn_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component RESOURCEMANAGER normal_check_interval 5 retry_check_interval 2 max_check_attempts 5 @@ -453,6 +477,7 @@ define service { service_description RESOURCEMANAGER::ResourceManager RPC latency servicegroups YARN check_command check_rpcq_latency_ha!{{ rm_hosts_in_str }}!ResourceManager!{{ rm_port }}!3000!5000!{{ str(yarn_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component RESOURCEMANAGER normal_check_interval 5 retry_check_interval 1 max_check_attempts 5 @@ -465,6 +490,7 @@ define service { service_description RESOURCEMANAGER::ResourceManager process on {{ rm_host }} servicegroups YARN check_command check_tcp_wrapper!{{ rm_port }}!-w 1 -c 1 + _host_component RESOURCEMANAGER normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -480,6 +506,7 @@ define service { service_description NODEMANAGER::NodeManager process servicegroups YARN check_command check_tcp_wrapper!{{ nm_port }}!-w 1 -c 1 + _host_component NODEMANAGER normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -491,6 +518,7 @@ define service { service_description NODEMANAGER::NodeManager health servicegroups YARN check_command check_nodemanager_health!{{ nm_port }}!{{ str(security_enabled).lower() }}!{{ str(yarn_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }} + _host_component NODEMANAGER normal_check_interval 1 retry_check_interval 1 max_check_attempts 3 @@ -501,6 +529,7 @@ define service { service_description NODEMANAGER::Percent NodeManagers live servicegroups YARN check_command check_aggregate!"NODEMANAGER::NodeManager process"!10%!30% + _host_component NODEMANAGER normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 3 @@ -513,8 +542,9 @@ define service { hostgroup_name historyserver2 use hadoop-service service_description JOBHISTORY::HistoryServer Web UI - servicegroups MAPREDUCE + servicegroups MAPREDUCE2 check_command check_webui!historyserver2!{{ hs_port }} + _host_component HISTORYSERVER normal_check_interval 1 retry_check_interval 1 max_check_attempts 3 @@ -525,8 +555,9 @@ define service { hostgroup_name historyserver2 use hadoop-service service_description JOBHISTORY::HistoryServer CPU utilization - servicegroups MAPREDUCE + servicegroups MAPREDUCE2 check_command check_cpu!{{ hs_port }}!200%!250%!{{ str(mapreduce_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component HISTORYSERVER normal_check_interval 5 retry_check_interval 2 max_check_attempts 5 @@ -537,8 +568,9 @@ define service { hostgroup_name historyserver2 use hadoop-service service_description JOBHISTORY::HistoryServer RPC latency - servicegroups MAPREDUCE + servicegroups MAPREDUCE2 check_command check_rpcq_latency!JobHistoryServer!{{ hs_port }}!3000!5000!{{ str(mapreduce_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component HISTORYSERVER normal_check_interval 5 retry_check_interval 1 max_check_attempts 5 @@ -548,8 +580,9 @@ define service { hostgroup_name historyserver2 use hadoop-service service_description JOBHISTORY::HistoryServer process - servicegroups MAPREDUCE + servicegroups MAPREDUCE2 check_command check_tcp_wrapper!{{ hs_port }}!-w 1 -c 1 + _host_component HISTORYSERVER normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -565,6 +598,7 @@ define service { service_description JOURNALNODE::JournalNode process servicegroups HDFS check_command check_tcp_wrapper!{{ journalnode_port }}!-w 1 -c 1 + _host_component JOURNALNODE normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -592,6 +626,7 @@ define service { service_description DATANODE::DataNode process servicegroups HDFS check_command check_tcp_wrapper!{{datanode_port}}!-w 1 -c 1 + _host_component DATANODE normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -603,6 +638,7 @@ define service { service_description DATANODE::DataNode space servicegroups HDFS check_command check_datanode_storage!{{ datanode_port }}!90%!90%!{{ str(hdfs_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component DATANODE normal_check_interval 2 retry_check_interval 1 max_check_attempts 2 @@ -618,6 +654,7 @@ define service { service_description ZOOKEEPER::ZooKeeper Server process servicegroups ZOOKEEPER check_command check_tcp_wrapper!{{ clientPort }}!-w 1 -c 1 + _host_component ZOOKEEPER_SERVER normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -632,6 +669,7 @@ define service { service_description REGIONSERVER::RegionServer process servicegroups HBASE check_command check_tcp_wrapper!{{ hbase_rs_port }}!-w 1 -c 1 + _host_component HBASE_REGIONSERVER normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -645,6 +683,7 @@ define service { service_description HBASEMASTER::HBase Master CPU utilization servicegroups HBASE check_command check_cpu_ha!{{ hbase_master_hosts_in_str }}!{{ hbase_master_port }}!200%!250%!false!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }} + _host_component HBASE_MASTER normal_check_interval 5 retry_check_interval 2 max_check_attempts 5 @@ -659,6 +698,7 @@ define service { service_description HBASEMASTER::HBase Master process on {{ hbasemaster }} servicegroups HBASE check_command check_tcp_wrapper!{{ hbase_master_rpc_port }}!-w 1 -c 1 + _host_component HBASE_MASTER normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 4 @@ -674,6 +714,7 @@ define service { service_description HIVE-METASTORE::Hive Metastore process servicegroups HIVE check_command check_tcp_wrapper!{{ hive_metastore_port }}!-w 1 -c 1 + _host_component HIVE_METASTORE normal_check_interval 0.5 retry_check_interval 0.5 max_check_attempts 3 @@ -686,6 +727,7 @@ define service { service_description HIVE-SERVER::HiveServer2 process servicegroups HIVE check_command check_tcp_wrapper_sasl!{{ hive_server_port }}!{{ '--security-enabled' if security_enabled else '' }}!-w 1 -c 1 + _host_component HIVE_SERVER normal_check_interval 0.5 retry_check_interval 0.5 max_check_attempts 3 @@ -703,6 +745,7 @@ define service { {% else %} check_command check_oozie_status!{{ oozie_server_port }}!{{ java64_home }}!false {% endif %} + _host_component OOZIE_SERVER normal_check_interval 1 retry_check_interval 1 max_check_attempts 3 @@ -720,6 +763,7 @@ define service { {% else %} check_command check_templeton_status!{{ templeton_port }}!v1!false {% endif %} + _host_component WEBHCAT_SERVER normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -733,6 +777,7 @@ define service { service_description HUE::Hue Server status servicegroups HUE check_command check_hue_status + _host_component HUE normal_check_interval 100 retry_check_interval 0.5 max_check_attempts 3 @@ -746,6 +791,7 @@ define service { service_description FALCON::Falcon Server process servicegroups FALCON check_command check_tcp_wrapper!{{ falcon_port }}!-w 1 -c 1 + _host_component FALCON_SERVER normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -755,6 +801,7 @@ define service { service_description FALCON::Falcon Server Web UI servicegroups FALCON check_command check_webui!falconserver!{{ falcon_port }} + _host_component FALCON_SERVER normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -767,6 +814,7 @@ define service { service_description APP_TIMELINE_SERVER::App Timeline Server process servicegroups YARN check_command check_tcp_wrapper!{{ ahs_port }}!-w 1 -c 1 + _host_component APP_TIMELINE_SERVER normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -781,6 +829,7 @@ define service { service_description FLUME::Flume Agent process servicegroups FLUME check_command check_ambari!/var/nagios/ambari.json!flume_agent + _host_component FLUME_HANDLER normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -795,6 +844,7 @@ define service { service_description KNOX::Knox Gateway process servicegroups KNOX check_command check_tcp_wrapper!{{ knox_gateway_port }}!-w 1 -c 1 + _host_component KNOX_GATEWAY normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 @@ -809,6 +859,7 @@ define service { service_description KAFKA::Kafka Broker process servicegroups KAFKA check_command check_tcp_wrapper!{{ kafka_broker_port }}!-w 1 -c 1 + _host_component KAFKA_BROKER normal_check_interval 1 retry_check_interval 0.5 max_check_attempts 3 http://git-wip-us.apache.org/repos/asf/ambari/blob/9406f962/ambari-server/src/test/python/stacks/1.3.2/NAGIOS/test_mm_wrapper.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/python/stacks/1.3.2/NAGIOS/test_mm_wrapper.py b/ambari-server/src/test/python/stacks/1.3.2/NAGIOS/test_mm_wrapper.py index cd3873e..04869a4 100644 --- a/ambari-server/src/test/python/stacks/1.3.2/NAGIOS/test_mm_wrapper.py +++ b/ambari-server/src/test/python/stacks/1.3.2/NAGIOS/test_mm_wrapper.py @@ -94,10 +94,10 @@ vm-3.vm ZOOKEEPER ZOOKEEPER_SERVER def test_get_real_component(self): - with patch.dict(os.environ, {'NAGIOS_SERVICEDESC': 'SUPERVISOR::Supervisors process'}, clear=True): + with patch.dict(os.environ, {'NAGIOS__SERVICEHOST_COMPONENT': 'SUPERVISOR'}, clear=True): component = mm_wrapper.get_real_component() self.assertEqual(component, 'SUPERVISOR') - with patch.dict(os.environ, {'NAGIOS_SERVICEDESC': 'JOBHISTORY::HistoryServer process'}, clear=True): + with patch.dict(os.environ, {'NAGIOS__SERVICEHOST_COMPONENT': 'JOBHISTORY'}, clear=True): component = mm_wrapper.get_real_component() self.assertEqual(component, 'MAPREDUCE2') http://git-wip-us.apache.org/repos/asf/ambari/blob/9406f962/ambari-server/src/test/python/stacks/2.0.6/NAGIOS/test_mm_wrapper.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/python/stacks/2.0.6/NAGIOS/test_mm_wrapper.py b/ambari-server/src/test/python/stacks/2.0.6/NAGIOS/test_mm_wrapper.py index cd3873e..169fa70 100644 --- a/ambari-server/src/test/python/stacks/2.0.6/NAGIOS/test_mm_wrapper.py +++ b/ambari-server/src/test/python/stacks/2.0.6/NAGIOS/test_mm_wrapper.py @@ -94,10 +94,10 @@ vm-3.vm ZOOKEEPER ZOOKEEPER_SERVER def test_get_real_component(self): - with patch.dict(os.environ, {'NAGIOS_SERVICEDESC': 'SUPERVISOR::Supervisors process'}, clear=True): + with patch.dict(os.environ, {'NAGIOS__SERVICEHOST_COMPONENT': 'SUPERVISOR'}, clear=True): component = mm_wrapper.get_real_component() self.assertEqual(component, 'SUPERVISOR') - with patch.dict(os.environ, {'NAGIOS_SERVICEDESC': 'JOBHISTORY::HistoryServer process'}, clear=True): + with patch.dict(os.environ, {'NAGIOS__SERVICEHOST_COMPONENT': 'MAPREDUCE2'}, clear=True): component = mm_wrapper.get_real_component() self.assertEqual(component, 'MAPREDUCE2')
