Elukey has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/330154 )
Change subject: Add JVM Heap usage alarms for basic Hadoop daemons
......................................................................
Add JVM Heap usage alarms for basic Hadoop daemons
Bug: T88640
Change-Id: I6d47880bd69106268eb0ff03bd9ee2293d59049c
---
M hieradata/eqiad/cdh/hadoop.yaml
M modules/role/manifests/analytics_cluster/hadoop/master.pp
M modules/role/manifests/analytics_cluster/hadoop/standby.pp
M modules/role/manifests/analytics_cluster/hadoop/worker.pp
4 files changed, 87 insertions(+), 1 deletion(-)
Approvals:
Elukey: Looks good to me, approved
jenkins-bot: Verified
diff --git a/hieradata/eqiad/cdh/hadoop.yaml b/hieradata/eqiad/cdh/hadoop.yaml
index 39c9879..04e1baf 100644
--- a/hieradata/eqiad/cdh/hadoop.yaml
+++ b/hieradata/eqiad/cdh/hadoop.yaml
@@ -32,6 +32,8 @@
cdh::hadoop::net_topology_script_template:
'role/analytics_cluster/hadoop/net-topology.py.erb'
# Increase NameNode heapsize independent from other daemons
+# The opts value is passed to the JVM, while the raw number is used to compute alarm thresholds.
+cdh::hadoop::hadoop_namenode_heapsize: 4096
cdh::hadoop::hadoop_namenode_opts: "-Xmx4096m"
cdh::hadoop::mapreduce_reduce_shuffle_parallelcopies: 10
diff --git a/modules/role/manifests/analytics_cluster/hadoop/master.pp
b/modules/role/manifests/analytics_cluster/hadoop/master.pp
index d30450c..4e976a4 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/master.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/master.pp
@@ -90,6 +90,32 @@
Sudo::User['nagios-check_hdfs_active_namenode'],
],
}
+
+ # Java heap space used alerts
+ # The goal is to get alarms for long running memory leaks like T153951
+ # Thresholds are derived from the heap size configured in hiera:
+ # warning at 70% and critical at 90% of cdh::hadoop::hadoop_namenode_heapsize.
+ # (The heap size and the MemHeapUsedM metric are presumably both in
+ # megabytes - confirm.)
+ $namenode_jvm_warning_threshold =
hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.7
+ $namenode_jvm_critical_threshold =
hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.9
+ # NOTE(review): 'from' and 'percentage' presumably mean "alert when 60% of
+ # the datapoints in the last 60 minutes exceed the threshold" - confirm
+ # against monitoring::graphite_threshold.
+ monitoring::graphite_threshold { 'analytics_hadoop_hdfs_namenode':
+ description => 'HDFS active Namenode JVM Heap usage',
+ metric =>
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper",
+ from => '60min',
+ warning => $namenode_jvm_warning_threshold,
+ critical => $namenode_jvm_critical_threshold,
+ percentage => '60',
+ contact_group => 'admins,analytics',
+ }
+
+ # Same alert for the ResourceManager: warning at 70% and critical at 90%
+ # of cdh::hadoop::yarn_heapsize.
+ $rm_jvm_warning_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.7
+ $rm_jvm_critical_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.9
+ # NOTE(review): the metric path below uses port suffix 9980, the same as
+ # the NameNode metric in this file, while standby.pp uses 9984 for the
+ # ResourceManager metric - confirm which JMX port is correct.
+ monitoring::graphite_threshold {
'analytics_hadoop_yarn_resource_manager':
+ description => 'Yarn active ResourceManager JVM Heap usage',
+ metric =>
"Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9980.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper",
+ from => '60min',
+ warning => $rm_jvm_warning_threshold,
+ critical => $rm_jvm_critical_threshold,
+ percentage => '60',
+ contact_group => 'admins,analytics',
+ }
}
# Firewall
diff --git a/modules/role/manifests/analytics_cluster/hadoop/standby.pp
b/modules/role/manifests/analytics_cluster/hadoop/standby.pp
index 21c0455..3d30dfe 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/standby.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/standby.pp
@@ -31,6 +31,20 @@
contact_group => 'admins,analytics',
require => Class['cdh::hadoop::namenode::standby'],
}
+
+ # Java heap space used alerts
+ # The goal is to get alarms for long running memory leaks like T153951
+ # Thresholds are derived from the heap size configured in hiera:
+ # warning at 70% and critical at 90% of cdh::hadoop::hadoop_namenode_heapsize.
+ $namenode_jvm_warning_threshold =
hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.7
+ $namenode_jvm_critical_threshold =
hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.9
+ # NOTE(review): 'from' and 'percentage' presumably mean "alert when 60% of
+ # the datapoints in the last 60 minutes exceed the threshold" - confirm
+ # against monitoring::graphite_threshold.
+ monitoring::graphite_threshold { 'analytics_hadoop_namenode_hdfs':
+ description => 'HDFS standby Namenode JVM Heap usage',
+ metric =>
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper",
+ from => '60min',
+ warning => $namenode_jvm_warning_threshold,
+ critical => $namenode_jvm_critical_threshold,
+ percentage => '60',
+ contact_group => 'admins,analytics',
+ }
}
# Firewall
@@ -43,6 +57,25 @@
include ::cdh::hadoop::resourcemanager
# Firewall
include ::role::analytics_cluster::hadoop::ferm::resourcemanager
+
+ # Use jmxtrans for sending metrics
+ # The statsd endpoint is read from the 'statsd' hiera key.
+ class { 'cdh::hadoop::jmxtrans::resourcemanager':
+ statsd => hiera('statsd'),
+ }
+
+ # Java heap space used alerts
+ # The goal is to get alarms for long running memory leaks like T153951
+ # Thresholds: warning at 70% and critical at 90% of cdh::hadoop::yarn_heapsize.
+ $rm_jvm_warning_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.7
+ $rm_jvm_critical_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.9
+ # NOTE(review): the metric path below uses port suffix 9984, which is also
+ # the suffix used for the NodeManager metric in worker.pp, while master.pp
+ # uses 9980 for the ResourceManager metric - confirm which JMX port is correct.
+ monitoring::graphite_threshold {
'analytics_hadoop_yarn_resource_manager':
+ description => 'YARN Resource Manager JVM Heap usage',
+ metric =>
"Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9984.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper",
+ from => '60min',
+ warning => $rm_jvm_warning_threshold,
+ critical => $rm_jvm_critical_threshold,
+ percentage => '60',
+ contact_group => 'admins,analytics',
+ }
}
-}
\ No newline at end of file
+}
diff --git a/modules/role/manifests/analytics_cluster/hadoop/worker.pp
b/modules/role/manifests/analytics_cluster/hadoop/worker.pp
index 9030b91..bda0a67 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/worker.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/worker.pp
@@ -54,6 +54,31 @@
contact_group => 'admins,analytics',
retry_interval => 3,
}
+
+ # Java heap space used alerts
+ # The goal is to get alarms for long running memory leaks like T153951
+ # Each alert warns at 70% and goes critical at 90% of the heap size
+ # configured in hiera for the daemon being checked:
+ #   dn_* (HDFS DataNode)    <- cdh::hadoop::hadoop_heapsize
+ #   nm_* (YARN NodeManager) <- cdh::hadoop::yarn_heapsize
+ $dn_jvm_warning_threshold = hiera(cdh::hadoop::hadoop_heapsize) * 0.7
+ $dn_jvm_critical_threshold = hiera(cdh::hadoop::hadoop_heapsize) * 0.9
+ $nm_jvm_warning_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.7
+ $nm_jvm_critical_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.9
+ monitoring::graphite_threshold { 'analytics_hadoop_yarn_nodemanager':
+ description => 'YARN NodeManager JVM Heap usage',
+ metric =>
"Hadoop.NodeManager.${::hostname}_eqiad_wmnet_9984.Hadoop.NodeManager.JvmMetrics.MemHeapUsedM.upper",
+ from => '60min',
+ # The NodeManager is checked against the YARN heap size thresholds.
+ warning => $nm_jvm_warning_threshold,
+ critical => $nm_jvm_critical_threshold,
+ percentage => '60',
+ contact_group => 'admins,analytics',
+ }
+ monitoring::graphite_threshold { 'analytics_hadoop_hdfs_datanode':
+ description => 'HDFS DataNode JVM Heap usage',
+ metric =>
"Hadoop.DataNode.${::hostname}_eqiad_wmnet_9981.Hadoop.DataNode.JvmMetrics.MemHeapUsedM.upper",
+ from => '60min',
+ # The DataNode is checked against the Hadoop heap size thresholds.
+ warning => $dn_jvm_warning_threshold,
+ critical => $dn_jvm_critical_threshold,
+ percentage => '60',
+ contact_group => 'admins,analytics',
+ }
}
# hive::client is nice to have for jobs launched
--
To view, visit https://gerrit.wikimedia.org/r/330154
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I6d47880bd69106268eb0ff03bd9ee2293d59049c
Gerrit-PatchSet: 15
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Elukey <[email protected]>
Gerrit-Reviewer: Elukey <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits