Elukey has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/330154 )

Change subject: Add JVM Heap usage alarms for basic Hadoop daemons
......................................................................


Add JVM Heap usage alarms for basic Hadoop daemons

Bug: T88640
Change-Id: I6d47880bd69106268eb0ff03bd9ee2293d59049c
---
M hieradata/eqiad/cdh/hadoop.yaml
M modules/role/manifests/analytics_cluster/hadoop/master.pp
M modules/role/manifests/analytics_cluster/hadoop/standby.pp
M modules/role/manifests/analytics_cluster/hadoop/worker.pp
4 files changed, 87 insertions(+), 1 deletion(-)

Approvals:
  Elukey: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/hieradata/eqiad/cdh/hadoop.yaml b/hieradata/eqiad/cdh/hadoop.yaml
index 39c9879..04e1baf 100644
--- a/hieradata/eqiad/cdh/hadoop.yaml
+++ b/hieradata/eqiad/cdh/hadoop.yaml
@@ -32,6 +32,8 @@
 cdh::hadoop::net_topology_script_template: 
'role/analytics_cluster/hadoop/net-topology.py.erb'
 
 # Increase NameNode heapsize independent from other daemons
+# The opts value is passed to the JVM, while the raw number (in MB) is used to compute alarm thresholds.
+cdh::hadoop::hadoop_namenode_heapsize: 4096
 cdh::hadoop::hadoop_namenode_opts: "-Xmx4096m"
 
 cdh::hadoop::mapreduce_reduce_shuffle_parallelcopies: 10
diff --git a/modules/role/manifests/analytics_cluster/hadoop/master.pp 
b/modules/role/manifests/analytics_cluster/hadoop/master.pp
index d30450c..4e976a4 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/master.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/master.pp
@@ -90,6 +90,32 @@
                 Sudo::User['nagios-check_hdfs_active_namenode'],
             ],
         }
+
+        # Java heap space used alerts
+        # The goal is to get alarms for long running memory leaks like T153951
+        $namenode_jvm_warning_threshold  = 
hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.7
+        $namenode_jvm_critical_threshold = 
hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.9
+        monitoring::graphite_threshold { 'analytics_hadoop_hdfs_namenode':
+            description   => 'HDFS active Namenode JVM Heap usage',
+            metric        => 
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper",
+            from          => '60min',
+            warning       => $namenode_jvm_warning_threshold,
+            critical      => $namenode_jvm_critical_threshold,
+            percentage    => '60',
+            contact_group => 'admins,analytics',
+        }
+
+        $rm_jvm_warning_threshold  = hiera(cdh::hadoop::yarn_heapsize) * 0.7
+        $rm_jvm_critical_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.9
+        monitoring::graphite_threshold { 
'analytics_hadoop_yarn_resource_manager':
+            description   => 'Yarn active ResourceManager JVM Heap usage',
+            metric        => 
"Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9980.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper",
+            from          => '60min',
+            warning       => $rm_jvm_warning_threshold,
+            critical      => $rm_jvm_critical_threshold,
+            percentage    => '60',
+            contact_group => 'admins,analytics',
+        }
     }
 
     # Firewall
diff --git a/modules/role/manifests/analytics_cluster/hadoop/standby.pp 
b/modules/role/manifests/analytics_cluster/hadoop/standby.pp
index 21c0455..3d30dfe 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/standby.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/standby.pp
@@ -31,6 +31,20 @@
             contact_group => 'admins,analytics',
             require       => Class['cdh::hadoop::namenode::standby'],
         }
+
+        # Java heap space used alerts
+        # The goal is to get alarms for long running memory leaks like T153951
+        $namenode_jvm_warning_threshold  = 
hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.7
+        $namenode_jvm_critical_threshold = 
hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.9
+        monitoring::graphite_threshold { 'analytics_hadoop_namenode_hdfs':
+            description   => 'HDFS standby Namenode JVM Heap usage',
+            metric        => 
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper",
+            from          => '60min',
+            warning       => $namenode_jvm_warning_threshold,
+            critical      => $namenode_jvm_critical_threshold,
+            percentage    => '60',
+            contact_group => 'admins,analytics',
+        }
     }
 
     # Firewall
@@ -43,6 +57,25 @@
         include ::cdh::hadoop::resourcemanager
         # Firewall
         include ::role::analytics_cluster::hadoop::ferm::resourcemanager
+
+        # Use jmxtrans for sending metrics
+        class { 'cdh::hadoop::jmxtrans::resourcemanager':
+            statsd  => hiera('statsd'),
+        }
+
+        # Java heap space used alerts
+        # The goal is to get alarms for long running memory leaks like T153951
+        $rm_jvm_warning_threshold  = hiera(cdh::hadoop::yarn_heapsize) * 0.7
+        $rm_jvm_critical_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.9
+        monitoring::graphite_threshold { 
'analytics_hadoop_yarn_resource_manager':
+            description   => 'YARN Resource Manager JVM Heap usage',
+            metric        => 
"Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9984.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper",
+            from          => '60min',
+            warning       => $rm_jvm_warning_threshold,
+            critical      => $rm_jvm_critical_threshold,
+            percentage    => '60',
+            contact_group => 'admins,analytics',
+        }
     }
 
-}
\ No newline at end of file
+}
diff --git a/modules/role/manifests/analytics_cluster/hadoop/worker.pp 
b/modules/role/manifests/analytics_cluster/hadoop/worker.pp
index 9030b91..bda0a67 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/worker.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/worker.pp
@@ -54,6 +54,31 @@
             contact_group  => 'admins,analytics',
             retry_interval => 3,
         }
+
+        # Java heap space used alerts (warning at 70%, critical at 90% of each daemon's configured heap)
+        # The goal is to get alarms for long running memory leaks like T153951
+        $dn_jvm_warning_threshold  = hiera(cdh::hadoop::hadoop_heapsize) * 0.7
+        $dn_jvm_critical_threshold = hiera(cdh::hadoop::hadoop_heapsize) * 0.9
+        $nm_jvm_warning_threshold  = hiera(cdh::hadoop::yarn_heapsize) * 0.7
+        $nm_jvm_critical_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.9
+        monitoring::graphite_threshold { 'analytics_hadoop_yarn_nodemanager':
+            description   => 'YARN NodeManager JVM Heap usage',
+            metric        => 
"Hadoop.NodeManager.${::hostname}_eqiad_wmnet_9984.Hadoop.NodeManager.JvmMetrics.MemHeapUsedM.upper",
+            from          => '60min',
+            warning       => $nm_jvm_warning_threshold,
+            critical      => $nm_jvm_critical_threshold,
+            percentage    => '60',
+            contact_group => 'admins,analytics',
+        }
+        monitoring::graphite_threshold { 'analytics_hadoop_hdfs_datanode':
+            description   => 'HDFS DataNode JVM Heap usage',
+            metric        => 
"Hadoop.DataNode.${::hostname}_eqiad_wmnet_9981.Hadoop.DataNode.JvmMetrics.MemHeapUsedM.upper",
+            from          => '60min',
+            warning       => $dn_jvm_warning_threshold,
+            critical      => $dn_jvm_critical_threshold,
+            percentage    => '60',
+            contact_group => 'admins,analytics',
+        }
     }
 
     # hive::client is nice to have for jobs launched

-- 
To view, visit https://gerrit.wikimedia.org/r/330154
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I6d47880bd69106268eb0ff03bd9ee2293d59049c
Gerrit-PatchSet: 15
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Elukey <[email protected]>
Gerrit-Reviewer: Elukey <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to