Elukey has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/337574 )
Change subject: Fix and tune the new Analytics Hadoop alarms
......................................................................
Fix and tune the new Analytics Hadoop alarms
Bug: T88640
Change-Id: I1e47c128ca04dc48690ecbd5d70fa7ee154b7423
---
M modules/role/manifests/analytics_cluster/hadoop/master.pp
M modules/role/manifests/analytics_cluster/hadoop/standby.pp
M modules/role/manifests/analytics_cluster/hadoop/worker.pp
3 files changed, 8 insertions(+), 8 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/74/337574/1
diff --git a/modules/role/manifests/analytics_cluster/hadoop/master.pp
b/modules/role/manifests/analytics_cluster/hadoop/master.pp
index 3303f5e..8963ac1 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/master.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/master.pp
@@ -94,7 +94,7 @@
# Java heap space used alerts
# The goal is to get alarms for long running memory leaks like T153951
- $namenode_jvm_warning_threshold =
hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.7
+ $namenode_jvm_warning_threshold =
hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.8
$namenode_jvm_critical_threshold =
hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.9
monitoring::graphite_threshold { 'analytics_hadoop_hdfs_namenode':
description => 'HDFS active Namenode JVM Heap usage',
@@ -106,11 +106,11 @@
contact_group => 'admins,analytics',
}
- $rm_jvm_warning_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.7
+ $rm_jvm_warning_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.8
$rm_jvm_critical_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.9
monitoring::graphite_threshold {
'analytics_hadoop_yarn_resource_manager':
description => 'Yarn active ResourceManager JVM Heap usage',
- metric =>
"Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9980.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper",
+ metric =>
"Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9983.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper",
from => '60min',
warning => $rm_jvm_warning_threshold,
critical => $rm_jvm_critical_threshold,
diff --git a/modules/role/manifests/analytics_cluster/hadoop/standby.pp
b/modules/role/manifests/analytics_cluster/hadoop/standby.pp
index 7367790..c44969e 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/standby.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/standby.pp
@@ -35,11 +35,11 @@
# Java heap space used alerts
# The goal is to get alarms for long running memory leaks like T153951
- $namenode_jvm_warning_threshold =
hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.7
+ $namenode_jvm_warning_threshold =
hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.8
$namenode_jvm_critical_threshold =
hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.9
monitoring::graphite_threshold { 'analytics_hadoop_namenode_hdfs':
description => 'HDFS standby Namenode JVM Heap usage',
- metric =>
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper",
+ metric =>
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9983.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper",
from => '60min',
warning => $namenode_jvm_warning_threshold,
critical => $namenode_jvm_critical_threshold,
@@ -66,7 +66,7 @@
# Java heap space used alerts
# The goal is to get alarms for long running memory leaks like T153951
- $rm_jvm_warning_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.7
+ $rm_jvm_warning_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.8
$rm_jvm_critical_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.9
monitoring::graphite_threshold {
'analytics_hadoop_yarn_resource_manager':
description => 'YARN Resource Manager JVM Heap usage',
diff --git a/modules/role/manifests/analytics_cluster/hadoop/worker.pp
b/modules/role/manifests/analytics_cluster/hadoop/worker.pp
index 7aff0ce..f169109 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/worker.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/worker.pp
@@ -59,9 +59,9 @@
# Java heap space used alerts
# The goal is to get alarms for long running memory leaks like T153951
- $dn_jvm_warning_threshold = hiera(cdh::hadoop::hadoop_heapsize) * 0.7
+ $dn_jvm_warning_threshold = hiera(cdh::hadoop::hadoop_heapsize) * 0.8
$dn_jvm_critical_threshold = hiera(cdh::hadoop::hadoop_heapsize) * 0.9
- $nm_jvm_warning_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.7
+ $nm_jvm_warning_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.8
$nm_jvm_critical_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.9
monitoring::graphite_threshold { 'analytics_hadoop_yarn_nodemanager':
description => 'YARN NodeManager JVM Heap usage',
--
To view, visit https://gerrit.wikimedia.org/r/337574
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I1e47c128ca04dc48690ecbd5d70fa7ee154b7423
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Elukey <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits