Elukey has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/392658 )
Change subject: role::analytics_cluster::hadoop: move worker and masters to
role/profiles
......................................................................
role::analytics_cluster::hadoop: move worker and masters to role/profiles
Bug: T167790
Change-Id: I6f70df109254085657f62dcdae15204752cba2b1
---
M hieradata/role/common/analytics_cluster/hadoop/master.yaml
M hieradata/role/common/analytics_cluster/hadoop/standby.yaml
M hieradata/role/common/analytics_cluster/hadoop/worker.yaml
M manifests/site.pp
A modules/profile/manifests/analytics/refinery.pp
A modules/profile/manifests/hadoop/backup/namenode.pp
R modules/profile/manifests/hadoop/firewall/master.pp
A modules/profile/manifests/hadoop/master.pp
A modules/profile/manifests/hadoop/master/standby.pp
R modules/profile/manifests/hadoop/mysql_password.pp
A modules/profile/manifests/hadoop/users.pp
A modules/profile/manifests/hadoop/worker.pp
D modules/role/manifests/analytics_cluster/backup.pp
M modules/role/manifests/analytics_cluster/coordinator.pp
M modules/role/manifests/analytics_cluster/database/meta/backup_dest.pp
M modules/role/manifests/analytics_cluster/hadoop/backup/namenode.pp
D modules/role/manifests/analytics_cluster/hadoop/ferm/namenode.pp
M modules/role/manifests/analytics_cluster/hadoop/master.pp
M modules/role/manifests/analytics_cluster/hadoop/standby.pp
M modules/role/manifests/analytics_cluster/hadoop/worker.pp
20 files changed, 677 insertions(+), 496 deletions(-)
Approvals:
Elukey: Looks good to me, approved
jenkins-bot: Verified
diff --git a/hieradata/role/common/analytics_cluster/hadoop/master.yaml
b/hieradata/role/common/analytics_cluster/hadoop/master.yaml
index 8ef1259..c8fa084 100644
--- a/hieradata/role/common/analytics_cluster/hadoop/master.yaml
+++ b/hieradata/role/common/analytics_cluster/hadoop/master.yaml
@@ -16,4 +16,6 @@
profile::hadoop::client::zookeeper_cluster_name: 'main-eqiad'
profile::hadoop::client::resourcemanager_hosts:
- analytics1001.eqiad.wmnet
- - analytics1002.eqiad.wmnet
\ No newline at end of file
+ - analytics1002.eqiad.wmnet
+
+profile::hadoop::master::monitoring_enabled: true
\ No newline at end of file
diff --git a/hieradata/role/common/analytics_cluster/hadoop/standby.yaml
b/hieradata/role/common/analytics_cluster/hadoop/standby.yaml
index 8ef1259..bf6cc92 100644
--- a/hieradata/role/common/analytics_cluster/hadoop/standby.yaml
+++ b/hieradata/role/common/analytics_cluster/hadoop/standby.yaml
@@ -16,4 +16,6 @@
profile::hadoop::client::zookeeper_cluster_name: 'main-eqiad'
profile::hadoop::client::resourcemanager_hosts:
- analytics1001.eqiad.wmnet
- - analytics1002.eqiad.wmnet
\ No newline at end of file
+ - analytics1002.eqiad.wmnet
+
+profile::hadoop::standby_master::monitoring_enabled: true
\ No newline at end of file
diff --git a/hieradata/role/common/analytics_cluster/hadoop/worker.yaml
b/hieradata/role/common/analytics_cluster/hadoop/worker.yaml
index 085bcb1..4f4535f 100644
--- a/hieradata/role/common/analytics_cluster/hadoop/worker.yaml
+++ b/hieradata/role/common/analytics_cluster/hadoop/worker.yaml
@@ -1,12 +1,11 @@
nagios_group: analytics_eqiad
cluster: analytics
-# FIXME:
-# indirect hiera lookup due to includes in the role:
-# role::analytics::hadoop::client
-hadoop_zookeeper_cluster_name: main-eqiad
+
admin::groups:
- analytics-admins
+profile::hadoop::worker::monitoring_enabled: true
+
# Analytics worker disks are large. We will install a custom
# NRPE check for them, so the base module's should ignore them.
profile::base::check_disk_options: '-w 6% -c 3% -W 6% -K 3% -l -e -A -i
"/var/lib/hadoop/data"'
diff --git a/manifests/site.pp b/manifests/site.pp
index ceb7f96..f1bdf85 100644
--- a/manifests/site.pp
+++ b/manifests/site.pp
@@ -40,18 +40,13 @@
# - primary active NameNode
# - YARN ResourceManager
node 'analytics1001.eqiad.wmnet' {
- role(analytics_cluster::hadoop::master,
- analytics_cluster::users)
-
- include ::standard
- include ::base::firewall
+ role(analytics_cluster::hadoop::master)
}
# analytics1002 is the Hadoop standby NameNode and ResourceManager.
node 'analytics1002.eqiad.wmnet' {
role(analytics_cluster::hadoop::standby,
- analytics_cluster::users,
# analytics1002 is usually inactive, and it has a
# decent amount of disk space. We use it to
# store some backups, including fsimage snapshots
@@ -59,18 +54,11 @@
# analytics_cluster::database::meta (MySQL analytics-meta) instance.
# If you move these, make sure /srv/backup has
# enough space to store backups.
- analytics_cluster::hadoop::backup::namenode,
analytics_cluster::database::meta::backup_dest)
-
- include ::standard
- include ::base::firewall
}
node 'analytics1003.eqiad.wmnet' {
role(analytics_cluster::coordinator)
-
- include ::standard
- include ::base::firewall
}
# analytics1028-analytics1068 are Hadoop worker nodes.
@@ -81,9 +69,6 @@
# This is used for Hadoop network topology awareness.
node /analytics10(2[89]|3[0-9]|4[0-9]|5[0-9]|6[0-9]).eqiad.wmnet/ {
role(analytics_cluster::hadoop::worker)
-
- include ::base::firewall
- include ::standard
}
# Analytics Query Service
diff --git a/modules/profile/manifests/analytics/refinery.pp
b/modules/profile/manifests/analytics/refinery.pp
new file mode 100644
index 0000000..713c6f8
--- /dev/null
+++ b/modules/profile/manifests/analytics/refinery.pp
@@ -0,0 +1,73 @@
+# == Class profile::analytics::refinery
+#
+# Includes configuration and resources needed for deploying
+# and using the analytics/refinery repository.
+#
+class profile::analytics::refinery {
+ # Make this class depend on hadoop::client. Refinery
+ # is intended to work with Hadoop, and many of the
+ # role classes here use the hdfs user, which is created
+ # by the CDH packages.
+ require ::profile::hadoop::client
+
+ # Clone mediawiki/event-schemas so refinery can use them.
+ class { '::eventschemas': }
+
+ # Include geoip for geolocating
+ class { '::geoip': }
+
+ # Some refinery python scripts use docopt for CLI parsing.
+ if !defined(Package['python-docopt']) {
+ package { 'python-docopt':
+ ensure => 'installed',
+ }
+ }
+ # refinery python module uses dateutil
+ if !defined(Package['python-dateutil']) {
+ package { 'python-dateutil':
+ ensure => 'installed',
+ }
+ }
+
+ # The analytics/refinery repo will be deployed to this node via Scap3.
+ # The analytics user/groups are deployed/managed by Scap.
+ # The analytics_deploy SSH keypair files are stored in the private repo,
+ # and since manage_user is true the analytics_deploy public ssh key
+ # will be added to the 'analytics' user's ssh config. The rationale is to
+ # have a single 'analytics' multi-purpose user that owns refinery files
+ # deployed via scap and could possibly do other things (not yet defined).
+ scap::target { 'analytics/refinery':
+ deploy_user => 'analytics',
+ key_name => 'analytics_deploy',
+ manage_user => true,
+ }
+
+ # analytics/refinery repository is deployed via git-deploy at this path.
+ # You must deploy this yourself; puppet will not do it for you.
+ $path = '/srv/deployment/analytics/refinery'
+
+ # Put refinery python module in user PYTHONPATH
+ file { '/etc/profile.d/refinery.sh':
+ content => "export PYTHONPATH=\${PYTHONPATH}:${path}/python",
+ }
+
+ # Create directory in /var/log for general purpose Refinery job logging.
+ $log_dir = '/var/log/refinery'
+ $log_dir_group = $::realm ? {
+ 'production' => 'analytics-admins',
+ 'labs' => "project-${::labsproject}",
+ }
+ file { $log_dir:
+ ensure => 'directory',
+ owner => 'hdfs',
+ group => $log_dir_group,
+ # setgid bit here to make refinery log files writeable
+ # by users in the $log_dir_group group.
+ mode => '2775',
+ }
+
+ logrotate::conf { 'refinery':
+ source =>
'puppet:///modules/role/analytics_cluster/refinery-logrotate.conf',
+ require => File[$log_dir],
+ }
+}
diff --git a/modules/profile/manifests/hadoop/backup/namenode.pp
b/modules/profile/manifests/hadoop/backup/namenode.pp
new file mode 100644
index 0000000..95f2349
--- /dev/null
+++ b/modules/profile/manifests/hadoop/backup/namenode.pp
@@ -0,0 +1,50 @@
+# == Class profile::hadoop::backup::namenode
+#
+# Periodically runs hdfs dfsadmin -fetchImage
+# and ensures that bacula backs up Hadoop NameNode fsimages,
+# in case we need to recover after losing both Hadoop NameNodes.
+#
+class profile::hadoop::backup::namenode {
+ require ::profile::hadoop::client
+
+ file { '/srv/backup':
+ ensure => 'directory',
+ owner => 'root',
+ group => 'analytics-admins',
+ mode => '0755',
+ }
+
+ $destination = '/srv/backup/hadoop/namenode'
+ file { [
+ '/srv/backup/hadoop',
+ $destination
+ ]:
+ ensure => 'directory',
+ owner => 'hdfs',
+ group => 'analytics-admins',
+ mode => '0750',
+ require => File['/srv/backup']
+ }
+
+ cron { 'hadoop-namenode-backup-fetchimage':
+ command => "/usr/bin/hdfs dfsadmin -fetchImage ${destination} >
/dev/null 2>&1 ",
+ user => 'hdfs',
+ hour => 0,
+ minute => 0,
+ }
+
+ $retention_days = 30
+ # Delete files older than $retention_days.
+ cron { 'hadoop-namenode-backup-prune':
+ command => "/usr/bin/find ${destination} -mtime +${retention_days}
-delete > /dev/null 2>&1",
+ user => 'hdfs',
+ hour => 1,
+ minute => 0,
+ }
+
+ # Bacula will also back up this directory.
+ # See: bacula::director::fileset { 'hadoop-namenode-backup'
+ # in profile::backup::director
+ include ::profile::backup::host
+ backup::set { 'hadoop-namenode-backup' : }
+}
diff --git
a/modules/role/manifests/analytics_cluster/hadoop/ferm/resourcemanager.pp
b/modules/profile/manifests/hadoop/firewall/master.pp
similarity index 64%
rename from
modules/role/manifests/analytics_cluster/hadoop/ferm/resourcemanager.pp
rename to modules/profile/manifests/hadoop/firewall/master.pp
index ef8a206..90501e2 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/ferm/resourcemanager.pp
+++ b/modules/profile/manifests/hadoop/firewall/master.pp
@@ -1,6 +1,33 @@
-# == Class role::analytics_cluster::hadoop::ferm::resourcemanager
+# == Class profile::hadoop::firewall::master
#
-class role::analytics_cluster::hadoop::ferm::resourcemanager {
+# Set of common firewall rules for Hadoop Master nodes (active and standby)
+#
+class profile::hadoop::firewall::master {
+
+ ferm::service{ 'hadoop-hdfs-namenode':
+ proto => 'tcp',
+ port => '8020',
+ srange => '(($ANALYTICS_NETWORKS $DRUID_PUBLIC_HOSTS))',
+ }
+
+ ferm::service{ 'hadoop-hdfs-zkfc':
+ proto => 'tcp',
+ port => '8019',
+ srange => '$ANALYTICS_NETWORKS',
+ }
+
+ ferm::service{ 'hadoop-hdfs-namenode-http-ui':
+ proto => 'tcp',
+ port => '50070',
+ srange => '$ANALYTICS_NETWORKS',
+ }
+
+ ferm::service{ 'hadoop-hdfs-namenode-jmx':
+ proto => 'tcp',
+ port => '9980',
+ srange => '$ANALYTICS_NETWORKS',
+ }
+
ferm::service{ 'hadoop-yarn-resourcemanager-scheduler':
proto => 'tcp',
port => '8030',
diff --git a/modules/profile/manifests/hadoop/master.pp
b/modules/profile/manifests/hadoop/master.pp
new file mode 100644
index 0000000..87b0fd5
--- /dev/null
+++ b/modules/profile/manifests/hadoop/master.pp
@@ -0,0 +1,193 @@
+# == Class profile::hadoop::master
+#
+# Sets up a Hadoop Master node.
+#
+# == Parameters
+#
+# [*monitoring_enabled*]
+# If production monitoring needs to be enabled or not.
+#
+class profile::hadoop::master(
+ $monitoring_enabled =
hiera('profile::hadoop::master::monitoring_enabled'),
+ $hadoop_namenode_heapsize = hiera('cdh::hadoop::namenode_heapsize'),
+ $statsd = hiera('statsd'),
+){
+ include ::profile::hadoop::client
+
+ class { '::cdh::hadoop::master': }
+
+ # Use jmxtrans for sending metrics
+ class { '::cdh::hadoop::jmxtrans::master':
+ statsd => $statsd,
+ }
+
+ # This will create HDFS user home directories
+ # for all users in the provided groups.
+ # This only needs to be run on the NameNode
+ # where all users that want to use Hadoop
+ # must have shell accounts anyway.
+ class { '::cdh::hadoop::users':
+ require => Class['cdh::hadoop::master'],
+ }
+
+ # We need to include this class somewhere, and the master
+ # role is as good a place as any, since we only need it to
+ # be included on one node.
+ include ::profile::hadoop::mysql_password
+
+ # FairScheduler is creating event logs in hadoop.log.dir/fairscheduler/
+ # It rotates them but does not delete old ones. Set up cronjob to
+ # delete old files in this directory.
+ cron { 'hadoop-clean-fairscheduler-event-logs':
+ command => 'test -d /var/log/hadoop-yarn/fairscheduler &&
/usr/bin/find /var/log/hadoop-yarn/fairscheduler -type f -mtime +14 -exec rm {}
>/dev/null \;',
+ minute => 5,
+ hour => 0,
+ require => Class['cdh::hadoop::master'],
+ }
+
+ file { '/usr/local/lib/nagios/plugins/check_hdfs_topology':
+ ensure => present,
+ source =>
'puppet:///modules/role/analytics_cluster/hadoop/check_hdfs_topology',
+ mode => '0555',
+ owner => 'root',
+ group => 'root',
+ }
+
+ # Ensure that druid deep storage directories exist for all Druid clusters.
+ ::druid::cdh::hadoop::deep_storage { 'analytics-eqiad':
+ # analytics-eqiad predates the time when there were multiple Druid
clusters.
+ # Its deep storage directory will be /user/druid/deep-storage.
+ path => '/user/druid/deep-storage',
+ }
+ # The Druid public-eqiad cluster's deep storage
+ # directory will be /user/druid/deep-storage-public-eqiad
+ ::druid::cdh::hadoop::deep_storage { 'public-eqiad': }
+
+ # Include icinga alerts if monitoring is enabled.
+ if $monitoring_enabled {
+ # Icinga process alerts for NameNode, ResourceManager and HistoryServer
+ nrpe::monitor_service { 'hadoop-hdfs-namenode':
+ description => 'Hadoop Namenode - Primary',
+ nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C
java -a "org.apache.hadoop.hdfs.server.namenode.NameNode"',
+ contact_group => 'admins,analytics',
+ require => Class['cdh::hadoop::master'],
+ critical => true,
+ }
+ nrpe::monitor_service { 'hadoop-hdfs-zkfc':
+ description => 'Hadoop HDFS Zookeeper failover controller',
+ nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C
java -a "org.apache.hadoop.hdfs.tools.DFSZKFailoverController"',
+ contact_group => 'admins,analytics',
+ require => Class['cdh::hadoop::master'],
+ }
+ nrpe::monitor_service { 'hadoop-yarn-resourcemanager':
+ description => 'Hadoop ResourceManager',
+ nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C
java -a "org.apache.hadoop.yarn.server.resourcemanager.ResourceManager"',
+ contact_group => 'admins,analytics',
+ require => Class['cdh::hadoop::master'],
+ critical => true,
+ }
+ nrpe::monitor_service { 'hadoop-mapreduce-historyserver':
+ description => 'Hadoop HistoryServer',
+ nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C
java -a "org.apache.hadoop.mapreduce.v2.hs.JobHistoryServer"',
+ contact_group => 'admins,analytics',
+ require => Class['cdh::hadoop::master'],
+ }
+
+ # Allow nagios to run some scripts as hdfs user.
+ sudo::user { 'nagios-check_hdfs_active_namenode':
+ user => 'nagios',
+ privileges => [
+ 'ALL = NOPASSWD: /usr/local/bin/check_hdfs_active_namenode',
+ 'ALL = NOPASSWD:
/usr/local/lib/nagios/plugins/check_hdfs_topology',
+ ],
+ }
+ # Alert if the HDFS topology shows any inconsistency.
+ nrpe::monitor_service { 'check_hdfs_topology':
+ description => 'HDFS topology check',
+ nrpe_command => '/usr/bin/sudo
/usr/local/lib/nagios/plugins/check_hdfs_topology',
+ check_interval => 30,
+ retries => 2,
+ require =>
File['/usr/local/lib/nagios/plugins/check_hdfs_topology'],
+ }
+ # Alert if there is no active NameNode
+ nrpe::monitor_service { 'hadoop-hdfs-active-namenode':
+ description => 'At least one Hadoop HDFS NameNode is active',
+ nrpe_command => '/usr/bin/sudo
/usr/local/bin/check_hdfs_active_namenode',
+ contact_group => 'admins,analytics',
+ require => [
+ Class['cdh::hadoop::master'],
+ Sudo::User['nagios-check_hdfs_active_namenode'],
+ ],
+ }
+
+ # Alert if the HDFS space consumption raises above a safe threshold.
+ monitoring::graphite_threshold { 'hadoop-hdfs-percent-used':
+ description => 'HDFS capacity used percentage',
+ dashboard_link =>
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=47&fullscreen',
+ metric =>
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.NameNodeInfo.PercentUsed.mean",
+ from => '30min',
+ warning => 85,
+ critical => 90,
+ percentage => '60',
+ contact_group => 'analytics',
+ }
+
+ # Alert in case of HDFS corrupted or missing blocks. In the ideal state
+ # these values should always be 0.
+ monitoring::graphite_threshold { 'hadoop-hdfs-corrupt-blocks':
+ description => 'HDFS corrupt blocks',
+ dashboard_link =>
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=39&fullscreen',
+ metric =>
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.FSNamesystem.CorruptBlocks.mean",
+ from => '30min',
+ warning => 2,
+ critical => 5,
+ percentage => '60',
+ contact_group => 'analytics',
+ }
+
+ monitoring::graphite_threshold { 'hadoop-hdfs-missing-blocks':
+ description => 'HDFS missing blocks',
+ dashboard_link =>
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=40&fullscreen',
+ metric =>
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.FSNamesystem.MissingBlocks.mean",
+ from => '180min',
+ warning => 2,
+ critical => 5,
+ percentage => '60',
+ contact_group => 'analytics',
+ }
+
+ # Java heap space used alerts.
+ # The goal is to get alarms for long running memory leaks like T153951.
+ # Only include heap size alerts if heap size is configured.
+ if $hadoop_namenode_heapsize {
+ $nn_jvm_warning_threshold = $hadoop_namenode_heapsize * 0.9
+ $nn_jvm_critical_threshold = $hadoop_namenode_heapsize * 0.95
+ monitoring::graphite_threshold {
'hadoop-hdfs-namenode-heap-usaage':
+ description => 'HDFS active Namenode JVM Heap usage',
+ dashboard_link =>
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?panelId=4&fullscreen&orgId=1',
+ metric =>
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper",
+ from => '60min',
+ warning => $nn_jvm_warning_threshold,
+ critical => $nn_jvm_critical_threshold,
+ percentage => '60',
+ contact_group => 'analytics',
+ }
+ }
+
+ $hadoop_resourcemanager_heapsize = $::cdh::hadoop::yarn_heapsize
+ if $hadoop_resourcemanager_heapsize {
+ $rm_jvm_warning_threshold = $hadoop_resourcemanager_heapsize * 0.9
+ $rm_jvm_critical_threshold = $hadoop_resourcemanager_heapsize *
0.95
+ monitoring::graphite_threshold {
'hadoop-yarn-resourcemananager-heap-usage':
+ description => 'YARN active ResourceManager JVM Heap usage',
+ dashboard_link =>
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?panelId=12&fullscreen&orgId=1',
+ metric =>
"Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9983.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper",
+ from => '60min',
+ warning => $rm_jvm_warning_threshold,
+ critical => $rm_jvm_critical_threshold,
+ percentage => '60',
+ contact_group => 'analytics',
+ }
+ }
+ }
+}
diff --git a/modules/profile/manifests/hadoop/master/standby.pp
b/modules/profile/manifests/hadoop/master/standby.pp
new file mode 100644
index 0000000..5053081
--- /dev/null
+++ b/modules/profile/manifests/hadoop/master/standby.pp
@@ -0,0 +1,89 @@
+# == Class profile::hadoop::master::standby
+#
+# Sets up a standby/backup Hadoop Master node.
+#
+# [*monitoring_enabled*]
+# If production monitoring needs to be enabled or not.
+#
+class profile::hadoop::master::standby(
+ $monitoring_enabled =
hiera('profile::hadoop::standby_master::monitoring_enabled'),
+ $hadoop_namenode_heapsize = hiera('cdh::hadoop::namenode_heapsize'),
+ $statsd = hiera('statsd'),
+) {
+ require ::profile::hadoop::client
+
+ # Ensure that druid user exists on standby namenode nodes.
+ class { '::druid::cdh::hadoop::user': }
+
+ class { '::cdh::hadoop::namenode::standby': }
+
+ # Use jmxtrans for sending metrics
+ class { '::cdh::hadoop::jmxtrans::namenode':
+ statsd => $statsd,
+ }
+
+ # Include icinga alerts if monitoring is enabled.
+ if $monitoring_enabled {
+ # Icinga process alert for Stand By NameNode
+ nrpe::monitor_service { 'hadoop-hdfs-namenode':
+ description => 'Hadoop Namenode - Stand By',
+ nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C
java -a "org.apache.hadoop.hdfs.server.namenode.NameNode"',
+ contact_group => 'admins,analytics',
+ require => Class['cdh::hadoop::namenode::standby'],
+ critical => true,
+ }
+ nrpe::monitor_service { 'hadoop-hdfs-zkfc':
+ description => 'Hadoop HDFS Zookeeper failover controller',
+ nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C
java -a "org.apache.hadoop.hdfs.tools.DFSZKFailoverController"',
+ contact_group => 'admins,analytics',
+ require => Class['cdh::hadoop::namenode::standby'],
+ }
+
+ # Java heap space used alerts.
+ # The goal is to get alarms for long running memory leaks like T153951.
+ # Only include heap size alerts if heap size is configured.
+ if $hadoop_namenode_heapsize {
+ $nn_jvm_warning_threshold = $hadoop_namenode_heapsize * 0.9
+ $nn_jvm_critical_threshold = $hadoop_namenode_heapsize * 0.95
+ monitoring::graphite_threshold {
'hadoop-hdfs-namenode-heap-usaage':
+ description => 'HDFS standby Namenode JVM Heap usage',
+ dashboard_link =>
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=4&fullscreen',
+ metric =>
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper",
+ from => '60min',
+ warning => $nn_jvm_warning_threshold,
+ critical => $nn_jvm_critical_threshold,
+ percentage => '60',
+ contact_group => 'analytics',
+ }
+ }
+ }
+
+ class { '::cdh::hadoop::resourcemanager': }
+
+ # Use jmxtrans for sending metrics
+ class { 'cdh::hadoop::jmxtrans::resourcemanager':
+ statsd => $statsd,
+ }
+
+ # Include icinga alerts if monitoring is enabled.
+ if $monitoring_enabled {
+ # Java heap space used alerts.
+ # The goal is to get alarms for long running memory leaks like T153951.
+ # Only include heap size alerts if heap size is configured.
+ $hadoop_resourcemanager_heapsize = $::cdh::hadoop::yarn_heapsize
+ if $hadoop_resourcemanager_heapsize {
+ $rm_jvm_warning_threshold = $hadoop_resourcemanager_heapsize * 0.9
+ $rm_jvm_critical_threshold = $hadoop_resourcemanager_heapsize *
0.95
+ monitoring::graphite_threshold {
'hadoop-yarn-resourcemananager-heap-usage':
+ description => 'YARN standby Resource Manager JVM Heap
usage',
+ dashboard_link =>
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=12&fullscreen',
+ metric =>
"Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9983.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper",
+ from => '60min',
+ warning => $rm_jvm_warning_threshold,
+ critical => $rm_jvm_critical_threshold,
+ percentage => '60',
+ contact_group => 'analytics',
+ }
+ }
+ }
+}
diff --git a/modules/role/manifests/analytics_cluster/mysql_password.pp
b/modules/profile/manifests/hadoop/mysql_password.pp
similarity index 92%
rename from modules/role/manifests/analytics_cluster/mysql_password.pp
rename to modules/profile/manifests/hadoop/mysql_password.pp
index 5413015..b865f35 100644
--- a/modules/role/manifests/analytics_cluster/mysql_password.pp
+++ b/modules/profile/manifests/hadoop/mysql_password.pp
@@ -1,10 +1,11 @@
-# == Class role::analytics_cluster::mysql_password
+# == Class profile::hadoop::mysql_password
+#
# Creates protected files in HDFS that contains
# a passwords used to access MySQL slaves.
# This is so we can automate sqooping of data
# out of MySQL into Hadoop.
#
-class role::analytics_cluster::mysql_password {
+class profile::hadoop::mysql_password {
require ::profile::hadoop::client
include ::passwords::mysql::research
diff --git a/modules/profile/manifests/hadoop/users.pp
b/modules/profile/manifests/hadoop/users.pp
new file mode 100644
index 0000000..8cbe5d1
--- /dev/null
+++ b/modules/profile/manifests/hadoop/users.pp
@@ -0,0 +1,27 @@
+# == Class profile::hadoop::users
+#
+# Installs any special system users needed on analytics namenodes or clients.
+# This is used for ensuring that users exist for use in HDFS.
+#
+# NOTE: Puppet does not manage creation of system user HDFS home directories.
+# you will need to do this manually. To do so, run from any Hadoop node:
+# sudo -u hdfs hdfs dfs -mkdir /user/$user && \
+# sudo -u hdfs hdfs dfs -chown $user:$group /user/$user
+# And optionally:
+# sudo -u hdfs hdfs dfs -chmod 775 /user/$user
+#
+class profile::hadoop::users {
+ # analytics-search user will be used to deploy
+ # wikimedia/discovery/analytics into HDFS.
+ # The analytics-search-users group will be allowed to
+ # sudo -u analytics-search.
+ group { 'analytics-search':
+ ensure => present,
+ }
+
+ user { 'analytics-search':
+ ensure => present,
+ gid => 'analytics-search',
+ system => true,
+ }
+}
diff --git a/modules/profile/manifests/hadoop/worker.pp
b/modules/profile/manifests/hadoop/worker.pp
new file mode 100644
index 0000000..933a615
--- /dev/null
+++ b/modules/profile/manifests/hadoop/worker.pp
@@ -0,0 +1,166 @@
+# == Class profile::hadoop::worker
+#
+# Configure an Analytics Hadoop worker node.
+#
+# == Parameters
+#
+# [*monitoring_enabled*]
+# If production monitoring needs to be enabled or not.
+#
+class profile::hadoop::worker(
+ $monitoring_enabled = hiera('profile::hadoop::worker::monitoring_enabled'),
+ $statsd = hiera('statsd'),
+) {
+ include ::profile::hadoop::client
+
+ # hive::client is nice to have for jobs launched
+ # from random worker nodes as app masters so they
+ # have access to hive-site.xml and other hive jars.
+ # This installs hive-hcatalog package on worker nodes to get
+ # hcatalog jars, including Hive JsonSerde for using
+ # JSON backed Hive tables.
+ include ::profile::hive::client
+
+ class { '::cdh::hadoop::worker': }
+
+ # Use jmxtrans for sending metrics
+ class { '::cdh::hadoop::jmxtrans::worker':
+ statsd => $statsd,
+ }
+
+ # Spark Python stopped working in Spark 1.5.0 with Oozie,
+ # for complicated reasons. We need to be able to set
+ # SPARK_HOME in an oozie launcher, and that SPARK_HOME
+ # needs to point at a locally installed spark directory
+ # in order load Spark Python dependencies.
+ class { '::cdh::spark': }
+
+ # Spark 2 is manually packaged by us, it is not part of CDH.
+ require_package('spark2')
+
+ # sqoop needs to be on worker nodes if Oozie is to
+ # launch sqoop jobs.
+ class { '::cdh::sqoop': }
+
+ # Install MaxMind databases for geocoding UDFs
+ class { '::geoip': }
+
+ # Install packages that are useful for distributed
+ # computation in Hadoop, and thus should be available on
+ # any Hadoop nodes.
+ require_package(
+ 'python-pandas',
+ 'python-scipy',
+ 'python-requests',
+ 'python-matplotlib',
+ 'python-dateutil',
+ 'python-sympy',
+ 'python-docopt',
+ 'python3',
+ 'python3-tabulate',
+ 'python3-scipy',
+ 'python3-enchant',
+ 'python3-tz',
+ 'python3-nltk',
+ 'python3-nose',
+ 'python3-setuptools',
+ 'python3-requests',
+ 'python3-mmh3',
+ 'python3-docopt',
+ 'libgomp1'
+ )
+
+ # Need a specific version of python-numpy for sklearn.
+ # There are some weird dependency / require_package
+ # issues that force us to use the package resource
+ # directly.
+ package { ['python-numpy', 'python3-numpy']:
+ ensure => '1:1.12.1-2~bpo8+1',
+ }
+ package { ['python3-sklearn','python3-sklearn-lib']:
+ ensure => 'installed',
+ require => Package['python3-numpy'],
+ }
+
+ # This allows Hadoop daemons to talk to each other.
+ ferm::service{ 'hadoop-access':
+ proto => 'tcp',
+ port => '1024:65535',
+ srange => '(($ANALYTICS_NETWORKS $DRUID_PUBLIC_HOSTS))',
+ }
+
+ if $monitoring_enabled {
+ # Icinga process alerts for DataNode and NodeManager
+ nrpe::monitor_service { 'hadoop-hdfs-datanode':
+ description => 'Hadoop DataNode',
+ nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C
java -a "org.apache.hadoop.hdfs.server.datanode.DataNode"',
+ contact_group => 'admins,analytics',
+ require => Class['cdh::hadoop::worker'],
+ }
+ nrpe::monitor_service { 'hadoop-yarn-nodemanager':
+ description => 'Hadoop NodeManager',
+ nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C
java -a "org.apache.hadoop.yarn.server.nodemanager.NodeManager"',
+ contact_group => 'admins,analytics',
+ require => Class['cdh::hadoop::worker'],
+ }
+
+ # Alert on datanode mount disk space. These mounts are ignored by the
+ # base module's check_disk via the profile::base::check_disk_options
+ # override in worker.yaml hieradata.
+ nrpe::monitor_service { 'disk_space_hadoop_worker':
+ description => 'Disk space on Hadoop worker',
+ nrpe_command => '/usr/lib/nagios/plugins/check_disk --units GB -w
32 -c 16 -e -l -r "/var/lib/hadoop/data"',
+ contact_group => 'admins,analytics',
+ }
+
+ # Make sure that this worker node has NodeManager running in a RUNNING
state.
+ # Install a custom check command for NodeManager Node-State:
+ file { '/usr/local/lib/nagios/plugins/check_hadoop_yarn_node_state':
+ source =>
'puppet:///modules/role/analytics_cluster/hadoop/check_hadoop_yarn_node_state',
+ owner => 'root',
+ group => 'root',
+ mode => '0755',
+ }
+ nrpe::monitor_service { 'hadoop_yarn_node_state':
+ description => 'YARN NodeManager Node-State',
+ nrpe_command =>
'/usr/local/lib/nagios/plugins/check_hadoop_yarn_node_state',
+ contact_group => 'admins,analytics',
+ retry_interval => 3,
+ }
+
+ # Java heap space used alerts, to catch long running memory leaks like T153951.
+ # Only included if a heap size is configured for the daemon.
+ # Warning fires at 90% of the configured heap size, critical at 95%.
+ $hadoop_datanode_heapsize = $::cdh::hadoop::hadoop_heapsize
+ if $hadoop_datanode_heapsize {
+ $dn_jvm_warning_threshold = $hadoop_datanode_heapsize * 0.9
+ $dn_jvm_critical_threshold = $hadoop_datanode_heapsize * 0.95
+ monitoring::graphite_threshold { 'analytics_hadoop_hdfs_datanode':
+ description => 'HDFS DataNode JVM Heap usage',
+ dashboard_link =>
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?panelId=1&fullscreen&orgId=1',
+ metric =>
"Hadoop.DataNode.${::hostname}_eqiad_wmnet_9981.Hadoop.DataNode.JvmMetrics.MemHeapUsedM.upper",
+ from => '60min',
+ warning => $dn_jvm_warning_threshold,
+ critical => $dn_jvm_critical_threshold,
+ percentage => '60',
+ contact_group => 'analytics',
+ }
+ }
+
+ $hadoop_nodemanager_heapsize = $::cdh::hadoop::yarn_heapsize
+ if $hadoop_nodemanager_heapsize {
+ $nm_jvm_warning_threshold = $hadoop_nodemanager_heapsize * 0.9
+ $nm_jvm_critical_threshold = $hadoop_nodemanager_heapsize * 0.95
+ monitoring::graphite_threshold {
'analytics_hadoop_yarn_nodemanager':
+ description => 'YARN NodeManager JVM Heap usage',
+ dashboard_link =>
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=17&fullscreen',
+ metric =>
"Hadoop.NodeManager.${::hostname}_eqiad_wmnet_9984.Hadoop.NodeManager.JvmMetrics.MemHeapUsedM.upper",
+ from => '60min',
+ warning => $nm_jvm_warning_threshold,
+ critical => $nm_jvm_critical_threshold,
+ percentage => '60',
+ contact_group => 'analytics',
+ }
+ }
+ }
+}
diff --git a/modules/role/manifests/analytics_cluster/backup.pp
b/modules/role/manifests/analytics_cluster/backup.pp
deleted file mode 100644
index e44105b..0000000
--- a/modules/role/manifests/analytics_cluster/backup.pp
+++ /dev/null
@@ -1,10 +0,0 @@
-# == Class role::analytics_cluster::backup
-# Simple wrapper class to create and manage /srv/backup
-class role::analytics_cluster::backup {
- file { '/srv/backup':
- ensure => 'directory',
- owner => 'root',
- group => 'analytics-admins',
- mode => '0755',
- }
-}
diff --git a/modules/role/manifests/analytics_cluster/coordinator.pp
b/modules/role/manifests/analytics_cluster/coordinator.pp
index e252af5..78270e7 100644
--- a/modules/role/manifests/analytics_cluster/coordinator.pp
+++ b/modules/role/manifests/analytics_cluster/coordinator.pp
@@ -53,4 +53,7 @@
include ::role::analytics_cluster::refinery::job::project_namespace_map
include ::role::analytics_cluster::refinery::job::sqoop_mediawiki
include ::role::analytics_cluster::refinery::job::json_refine
+
+ class { '::standard': }
+ include ::profile::base::firewall
}
\ No newline at end of file
diff --git
a/modules/role/manifests/analytics_cluster/database/meta/backup_dest.pp
b/modules/role/manifests/analytics_cluster/database/meta/backup_dest.pp
index fd691d6..f4bfa05 100644
--- a/modules/role/manifests/analytics_cluster/database/meta/backup_dest.pp
+++ b/modules/role/manifests/analytics_cluster/database/meta/backup_dest.pp
@@ -1,8 +1,13 @@
# == Class role::analytics_cluster::database::meta::backup_dest
#
class role::analytics_cluster::database::meta::backup_dest {
- # Ensure /srv/backup exists
- include ::role::analytics_cluster::backup
+
+ file { '/srv/backup':
+ ensure => 'directory',
+ owner => 'root',
+ group => 'analytics-admins',
+ mode => '0755',
+ }
file { [
'/srv/backup/mysql',
diff --git a/modules/role/manifests/analytics_cluster/hadoop/backup/namenode.pp b/modules/role/manifests/analytics_cluster/hadoop/backup/namenode.pp
index c86e50e5..89fd6c4 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/backup/namenode.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/backup/namenode.pp
@@ -6,17 +6,23 @@
class role::analytics_cluster::hadoop::backup::namenode {
require ::profile::hadoop::client
- include ::role::analytics_cluster::backup
+ file { '/srv/backup':
+ ensure => 'directory',
+ owner => 'root',
+ group => 'analytics-admins',
+ mode => '0755',
+ }
$destination = '/srv/backup/hadoop/namenode'
file { [
'/srv/backup/hadoop',
$destination
]:
- ensure => 'directory',
- owner => 'hdfs',
- group => 'analytics-admins',
- mode => '0750',
+ ensure => 'directory',
+ owner => 'hdfs',
+ group => 'analytics-admins',
+ mode => '0750',
+ require => File['/srv/backup']
}
cron { 'hadoop-namenode-backup-fetchimage':
diff --git a/modules/role/manifests/analytics_cluster/hadoop/ferm/namenode.pp b/modules/role/manifests/analytics_cluster/hadoop/ferm/namenode.pp
deleted file mode 100644
index ce8b976..0000000
--- a/modules/role/manifests/analytics_cluster/hadoop/ferm/namenode.pp
+++ /dev/null
@@ -1,27 +0,0 @@
-# == Class role::analytics_cluster::hadoop::ferm::namenode
-#
-class role::analytics_cluster::hadoop::ferm::namenode {
- ferm::service{ 'hadoop-hdfs-namenode':
- proto => 'tcp',
- port => '8020',
- srange => '(($ANALYTICS_NETWORKS $DRUID_PUBLIC_HOSTS))',
- }
-
- ferm::service{ 'hadoop-hdfs-zkfc':
- proto => 'tcp',
- port => '8019',
- srange => '$ANALYTICS_NETWORKS',
- }
-
- ferm::service{ 'hadoop-hdfs-namenode-http-ui':
- proto => 'tcp',
- port => '50070',
- srange => '$ANALYTICS_NETWORKS',
- }
-
- ferm::service{ 'hadoop-hdfs-namenode-jmx':
- proto => 'tcp',
- port => '9980',
- srange => '$ANALYTICS_NETWORKS',
- }
-}
diff --git a/modules/role/manifests/analytics_cluster/hadoop/master.pp b/modules/role/manifests/analytics_cluster/hadoop/master.pp
index 2705780..3eae3ed 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/master.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/master.pp
@@ -7,187 +7,10 @@
description => 'Hadoop Master (NameNode & ResourceManager)',
}
- require ::profile::hadoop::client
-
- # Ensure that druid deep storage directories exist for all Druid clusters.
- ::druid::cdh::hadoop::deep_storage { 'analytics-eqiad':
- # analytics-eqiad predates the time when there were multiple Druid clusters.
- # It's deep storage directory will be /user/druid/deep-storage.
- path => '/user/druid/deep-storage',
- }
- # The Druid public-eqiad cluster's deep storage
- # directory will be /user/druid/deep-storage-public-eqiad
- ::druid::cdh::hadoop::deep_storage { 'public-eqiad': }
-
- class { '::cdh::hadoop::master': }
-
- # Use jmxtrans for sending metrics
- class { '::cdh::hadoop::jmxtrans::master':
- statsd => hiera('statsd'),
- }
-
- # This will create HDFS user home directories
- # for all users in the provided groups.
- # This only needs to be run on the NameNode
- # where all users that want to use Hadoop
- # must have shell accounts anyway.
- class { '::cdh::hadoop::users':
- require => Class['cdh::hadoop::master'],
- }
-
- # We need to include this class somewhere, and the master
- # role is as good as place as any, since we only need it to
- # be included on one node.
- include ::role::analytics_cluster::mysql_password
-
- # FairScheduler is creating event logs in hadoop.log.dir/fairscheduler/
- # It rotates them but does not delete old ones. Set up cronjob to
- # delete old files in this directory.
- cron { 'hadoop-clean-fairscheduler-event-logs':
- command => 'test -d /var/log/hadoop-yarn/fairscheduler && /usr/bin/find /var/log/hadoop-yarn/fairscheduler -type f -mtime +14 -exec rm {} >/dev/null \;',
- minute => 5,
- hour => 0,
- require => Class['cdh::hadoop::master'],
- }
-
- file { '/usr/local/lib/nagios/plugins/check_hdfs_topology':
- ensure => present,
- source =>
'puppet:///modules/role/analytics_cluster/hadoop/check_hdfs_topology',
- mode => '0555',
- owner => 'root',
- group => 'root',
- }
-
- # Include icinga alerts if production realm.
- if $::realm == 'production' {
- # Icinga process alerts for NameNode, ResourceManager and HistoryServer
- nrpe::monitor_service { 'hadoop-hdfs-namenode':
- description => 'Hadoop Namenode - Primary',
- nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C
java -a "org.apache.hadoop.hdfs.server.namenode.NameNode"',
- contact_group => 'admins,analytics',
- require => Class['cdh::hadoop::master'],
- critical => true,
- }
- nrpe::monitor_service { 'hadoop-hdfs-zkfc':
- description => 'Hadoop HDFS Zookeeper failover controller',
- nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C
java -a "org.apache.hadoop.hdfs.tools.DFSZKFailoverController"',
- contact_group => 'admins,analytics',
- require => Class['cdh::hadoop::master'],
- }
- nrpe::monitor_service { 'hadoop-yarn-resourcemanager':
- description => 'Hadoop ResourceManager',
- nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C
java -a "org.apache.hadoop.yarn.server.resourcemanager.ResourceManager"',
- contact_group => 'admins,analytics',
- require => Class['cdh::hadoop::master'],
- critical => true,
- }
- nrpe::monitor_service { 'hadoop-mapreduce-historyserver':
- description => 'Hadoop HistoryServer',
- nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C
java -a "org.apache.hadoop.mapreduce.v2.hs.JobHistoryServer"',
- contact_group => 'admins,analytics',
- require => Class['cdh::hadoop::master'],
- }
-
- # Allow nagios to run some scripts as hdfs user.
- sudo::user { 'nagios-check_hdfs_active_namenode':
- user => 'nagios',
- privileges => [
- 'ALL = NOPASSWD: /usr/local/bin/check_hdfs_active_namenode',
- 'ALL = NOPASSWD:
/usr/local/lib/nagios/plugins/check_hdfs_topology',
- ],
- }
- # Alert if the HDFS topology shows any inconsistency.
- nrpe::monitor_service { 'check_hdfs_topology':
- description => 'HDFS topology check',
- nrpe_command => '/usr/bin/sudo
/usr/local/lib/nagios/plugins/check_hdfs_topology',
- check_interval => 30,
- retries => 2,
- require =>
File['/usr/local/lib/nagios/plugins/check_hdfs_topology'],
- }
- # Alert if there is no active NameNode
- nrpe::monitor_service { 'hadoop-hdfs-active-namenode':
- description => 'At least one Hadoop HDFS NameNode is active',
- nrpe_command => '/usr/bin/sudo
/usr/local/bin/check_hdfs_active_namenode',
- contact_group => 'admins,analytics',
- require => [
- Class['cdh::hadoop::master'],
- Sudo::User['nagios-check_hdfs_active_namenode'],
- ],
- }
-
- # Alert if the HDFS space consumption raises above a safe threshold.
- monitoring::graphite_threshold { 'hadoop-hdfs-percent-used':
- description => 'HDFS capacity used percentage',
- dashboard_link =>
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=47&fullscreen',
- metric =>
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.NameNodeInfo.PercentUsed.mean",
- from => '30min',
- warning => 85,
- critical => 90,
- percentage => '60',
- contact_group => 'analytics',
- }
-
- # Alert in case of HDFS currupted or missing blocks. In the ideal state
- # these values should always be 0.
- monitoring::graphite_threshold { 'hadoop-hdfs-corrupt-blocks':
- description => 'HDFS corrupt blocks',
- dashboard_link =>
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=39&fullscreen',
- metric =>
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.FSNamesystem.CorruptBlocks.mean",
- from => '30min',
- warning => 2,
- critical => 5,
- percentage => '60',
- contact_group => 'analytics',
- }
-
- monitoring::graphite_threshold { 'hadoop-hdfs-missing-blocks':
- description => 'HDFS missing blocks',
- dashboard_link =>
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=40&fullscreen',
- metric =>
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.FSNamesystem.MissingBlocks.mean",
- from => '180min',
- warning => 2,
- critical => 5,
- percentage => '60',
- contact_group => 'analytics',
- }
-
- # Java heap space used alerts.
- # The goal is to get alarms for long running memory leaks like T153951.
- # Only include heap size alerts if heap size is configured.
- $hadoop_namenode_heapsize = hiera('cdh::hadoop::namenode_heapsize',
undef)
- if $hadoop_namenode_heapsize {
- $nn_jvm_warning_threshold = $hadoop_namenode_heapsize * 0.9
- $nn_jvm_critical_threshold = $hadoop_namenode_heapsize * 0.95
- monitoring::graphite_threshold {
'hadoop-hdfs-namenode-heap-usaage':
- description => 'HDFS active Namenode JVM Heap usage',
- dashboard_link =>
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?panelId=4&fullscreen&orgId=1',
- metric =>
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper",
- from => '60min',
- warning => $nn_jvm_warning_threshold,
- critical => $nn_jvm_critical_threshold,
- percentage => '60',
- contact_group => 'analytics',
- }
- }
-
- $hadoop_resourcemanager_heapsize = $::cdh::hadoop::yarn_heapsize
- if $hadoop_resourcemanager_heapsize {
- $rm_jvm_warning_threshold = $hadoop_resourcemanager_heapsize * 0.9
- $rm_jvm_critical_threshold = $hadoop_resourcemanager_heapsize *
0.95
- monitoring::graphite_threshold {
'hadoop-yarn-resourcemananager-heap-usage':
- description => 'YARN active ResourceManager JVM Heap usage',
- dashboard_link =>
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?panelId=12&fullscreen&orgId=1',
- metric =>
"Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9983.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper",
- from => '60min',
- warning => $rm_jvm_warning_threshold,
- critical => $rm_jvm_critical_threshold,
- percentage => '60',
- contact_group => 'analytics',
- }
- }
- }
-
- # Firewall
- include ::role::analytics_cluster::hadoop::ferm::namenode
- include ::role::analytics_cluster::hadoop::ferm::resourcemanager
+ include ::profile::hadoop::mysql_password
+ include ::profile::hadoop::master
+ include ::profile::hadoop::users
+ include ::profile::hadoop::firewall::master
+ include ::profile::base::firewall
+ class { 'standard': }
}
diff --git a/modules/role/manifests/analytics_cluster/hadoop/standby.pp b/modules/role/manifests/analytics_cluster/hadoop/standby.pp
index 6d51ba6..9beac5b 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/standby.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/standby.pp
@@ -6,92 +6,12 @@
system::role { 'analytics_cluster::hadoop::standby':
description => 'Hadoop Standby NameNode',
}
- require ::profile::hadoop::client
- # Ensure that druid user exists on standby namenodes nodes.
- class { '::druid::cdh::hadoop::user': }
-
- class { '::cdh::hadoop::namenode::standby': }
-
- # Use jmxtrans for sending metrics
- class { '::cdh::hadoop::jmxtrans::namenode':
- statsd => hiera('statsd'),
- }
-
- # Include icinga alerts if production realm.
- if $::realm == 'production' {
- # Icinga process alert for Stand By NameNode
- nrpe::monitor_service { 'hadoop-hdfs-namenode':
- description => 'Hadoop Namenode - Stand By',
- nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C
java -a "org.apache.hadoop.hdfs.server.namenode.NameNode"',
- contact_group => 'admins,analytics',
- require => Class['cdh::hadoop::namenode::standby'],
- critical => true,
- }
- nrpe::monitor_service { 'hadoop-hdfs-zkfc':
- description => 'Hadoop HDFS Zookeeper failover controller',
- nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C
java -a "org.apache.hadoop.hdfs.tools.DFSZKFailoverController"',
- contact_group => 'admins,analytics',
- require => Class['cdh::hadoop::namenode::standby'],
- }
-
- $hadoop_namenode_heapsize = hiera('cdh::hadoop::namenode_heapsize',
undef)
- # Java heap space used alerts.
- # The goal is to get alarms for long running memory leaks like T153951.
- # Only include heap size alerts if heap size is configured.
- if $hadoop_namenode_heapsize {
- $nn_jvm_warning_threshold = $hadoop_namenode_heapsize * 0.9
- $nn_jvm_critical_threshold = $hadoop_namenode_heapsize * 0.95
- monitoring::graphite_threshold {
'hadoop-hdfs-namenode-heap-usaage':
- description => 'HDFS standby Namenode JVM Heap usage',
- dashboard_link =>
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=4&fullscreen',
- metric =>
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper",
- from => '60min',
- warning => $nn_jvm_warning_threshold,
- critical => $nn_jvm_critical_threshold,
- percentage => '60',
- contact_group => 'analytics',
- }
- }
- }
-
- # Firewall
- include ::role::analytics_cluster::hadoop::ferm::namenode
-
- # If this is a resourcemanager host, then go ahead
- # and include a resourcemanager on all standby nodes as well
- # as the master node.
- if $::fqdn in $::cdh::hadoop::resourcemanager_hosts {
- include ::cdh::hadoop::resourcemanager
- # Firewall
- include ::role::analytics_cluster::hadoop::ferm::resourcemanager
-
- # Use jmxtrans for sending metrics
- class { 'cdh::hadoop::jmxtrans::resourcemanager':
- statsd => hiera('statsd'),
- }
-
- # Include icinga alerts if production realm.
- if $::realm == 'production' {
- # Java heap space used alerts.
- # The goal is to get alarms for long running memory leaks like
T153951.
- # Only include heap size alerts if heap size is configured.
- $hadoop_resourcemanager_heapsize = $::cdh::hadoop::yarn_heapsize
- if $hadoop_resourcemanager_heapsize {
- $rm_jvm_warning_threshold = $hadoop_resourcemanager_heapsize *
0.9
- $rm_jvm_critical_threshold = $hadoop_resourcemanager_heapsize
* 0.95
- monitoring::graphite_threshold {
'hadoop-yarn-resourcemananager-heap-usage':
- description => 'YARN standby Resource Manager JVM Heap
usage',
- dashboard_link =>
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=12&fullscreen',
- metric =>
"Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9983.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper",
- from => '60min',
- warning => $rm_jvm_warning_threshold,
- critical => $rm_jvm_critical_threshold,
- percentage => '60',
- contact_group => 'analytics',
- }
- }
- }
- }
+ include ::profile::hadoop::client
+ include ::profile::hadoop::master::standby
+ include ::profile::hadoop::firewall::master
+ include ::profile::hadoop::users
+ include ::profile::base::firewall
+ class { 'standard': }
}
diff --git a/modules/role/manifests/analytics_cluster/hadoop/worker.pp b/modules/role/manifests/analytics_cluster/hadoop/worker.pp
index 14d3ba0..8d60537 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/worker.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/worker.pp
@@ -1,164 +1,11 @@
-# == Class role::role::analytics_cluster::hadoop::worker
-# Includes cdh::hadoop::worker classes
+# == Class role::analytics_cluster::hadoop::worker
#
# filtertags: labs-project-analytics labs-project-math
class role::analytics_cluster::hadoop::worker {
system::role { 'analytics_cluster::hadoop::worker':
description => 'Hadoop Worker (DataNode & NodeManager)',
}
-
- require ::profile::hadoop::client
-
- class { '::cdh::hadoop::worker': }
-
- # Use jmxtrans for sending metrics
- class { '::cdh::hadoop::jmxtrans::worker':
- statsd => hiera('statsd'),
- }
-
- # Include icinga alerts if production realm.
- if $::realm == 'production' {
- # Icinga process alerts for DataNode and NodeManager
- nrpe::monitor_service { 'hadoop-hdfs-datanode':
- description => 'Hadoop DataNode',
- nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C
java -a "org.apache.hadoop.hdfs.server.datanode.DataNode"',
- contact_group => 'admins,analytics',
- require => Class['cdh::hadoop::worker'],
- }
- nrpe::monitor_service { 'hadoop-yarn-nodemanager':
- description => 'Hadoop NodeManager',
- nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C
java -a "org.apache.hadoop.yarn.server.nodemanager.NodeManager"',
- contact_group => 'admins,analytics',
- require => Class['cdh::hadoop::worker'],
- }
-
- # Alert on datanode mount disk space. These mounts are ignored by the
- # base module's check_disk via the
base::monitoring::host::nrpe_check_disk_options
- # override in worker.yaml hieradata.
- nrpe::monitor_service { 'disk_space_hadoop_worker':
- description => 'Disk space on Hadoop worker',
- nrpe_command => '/usr/lib/nagios/plugins/check_disk --units GB -w
32 -c 16 -e -l -r "/var/lib/hadoop/data"',
- contact_group => 'admins,analytics',
- }
-
- # Make sure that this worker node has NodeManager running in a RUNNING
state.
- # Install a custom check command for NodeManager Node-State:
- file { '/usr/local/lib/nagios/plugins/check_hadoop_yarn_node_state':
- source =>
'puppet:///modules/role/analytics_cluster/hadoop/check_hadoop_yarn_node_state',
- owner => 'root',
- group => 'root',
- mode => '0755',
- }
- nrpe::monitor_service { 'hadoop_yarn_node_state':
- description => 'YARN NodeManager Node-State',
- nrpe_command =>
'/usr/local/lib/nagios/plugins/check_hadoop_yarn_node_state',
- contact_group => 'admins,analytics',
- retry_interval => 3,
- }
-
- # Java heap space used alerts.
- # The goal is to get alarms for long running memory leaks like T153951.
- # Only include heap size alerts if heap size is configured.
- $hadoop_datanode_heapsize = $::cdh::hadoop::hadoop_heapsize
- if $hadoop_datanode_heapsize {
- $dn_jvm_warning_threshold = $hadoop_datanode_heapsize * 0.9
- $dn_jvm_critical_threshold = $hadoop_datanode_heapsize * 0.95
- monitoring::graphite_threshold { 'analytics_hadoop_hdfs_datanode':
- description => 'HDFS DataNode JVM Heap usage',
- dashboard_link =>
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?panelId=1&fullscreen&orgId=1',
- metric =>
"Hadoop.DataNode.${::hostname}_eqiad_wmnet_9981.Hadoop.DataNode.JvmMetrics.MemHeapUsedM.upper",
- from => '60min',
- warning => $dn_jvm_critical_threshold,
- critical => $dn_jvm_critical_threshold,
- percentage => '60',
- contact_group => 'analytics',
- }
- }
-
- $hadoop_nodemanager_heapsize = $::cdh::hadoop::yarn_heapsize
- if $hadoop_nodemanager_heapsize {
- $nm_jvm_warning_threshold = $hadoop_nodemanager_heapsize * 0.9
- $nm_jvm_critical_threshold = $hadoop_nodemanager_heapsize * 0.95
- monitoring::graphite_threshold {
'analytics_hadoop_yarn_nodemanager':
- description => 'YARN NodeManager JVM Heap usage',
- dashboard_link =>
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=17&fullscreen',
- metric =>
"Hadoop.NodeManager.${::hostname}_eqiad_wmnet_9984.Hadoop.NodeManager.JvmMetrics.MemHeapUsedM.upper",
- from => '60min',
- warning => $nm_jvm_critical_threshold,
- critical => $nm_jvm_critical_threshold,
- percentage => '60',
- contact_group => 'analytics',
- }
- }
-
- }
-
- # hive::client is nice to have for jobs launched
- # from random worker nodes as app masters so they
- # have access to hive-site.xml and other hive jars.
- # This installs hive-hcatalog package on worker nodes to get
- # hcatalog jars, including Hive JsonSerde for using
- # JSON backed Hive tables.
- include ::profile::hive::client
-
- # Spark Python stopped working in Spark 1.5.0 with Oozie,
- # for complicated reasons. We need to be able to set
- # SPARK_HOME in an oozie launcher, and that SPARK_HOME
- # needs to point at a locally installed spark directory
- # in order load Spark Python dependencies.
- include ::cdh::spark
-
- # Spark 2 is manually packaged by us, it is not part of CDH.
- require_package('spark2')
-
- # sqoop needs to be on worker nodes if Oozie is to
- # launch sqoop jobs.
- include ::cdh::sqoop
-
- # Install MaxMind databases for geocoding UDFs
- include ::geoip
-
- # Install packages that are useful for distributed
- # computation in Hadoop, and thus should be available on
- # any Hadoop nodes.
- require_package(
- 'python-pandas',
- 'python-scipy',
- 'python-requests',
- 'python-matplotlib',
- 'python-dateutil',
- 'python-sympy',
- 'python-docopt',
- 'python3',
- 'python3-tabulate',
- 'python3-scipy',
- 'python3-enchant',
- 'python3-tz',
- 'python3-nltk',
- 'python3-nose',
- 'python3-setuptools',
- 'python3-requests',
- 'python3-mmh3',
- 'python3-docopt',
- 'libgomp1'
- )
-
- # Need a specifc version of python-numpy for sklearn.
- # There are some weird dependency / require_package
- # issues that force us to use the package resource
- # directly.
- package { ['python-numpy', 'python3-numpy']:
- ensure => '1:1.12.1-2~bpo8+1',
- }
- package { ['python3-sklearn','python3-sklearn-lib']:
- ensure => 'installed',
- require => Package['python3-numpy'],
- }
-
- # This allows Hadoop daemons to talk to each other.
- ferm::service{ 'hadoop-access':
- proto => 'tcp',
- port => '1024:65535',
- srange => '(($ANALYTICS_NETWORKS $DRUID_PUBLIC_HOSTS))',
- }
+ include ::profile::hadoop::worker
+ include ::profile::base::firewall
+ class { '::standard': }
}
--
To view, visit https://gerrit.wikimedia.org/r/392658
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I6f70df109254085657f62dcdae15204752cba2b1
Gerrit-PatchSet: 12
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Elukey <[email protected]>
Gerrit-Reviewer: Elukey <[email protected]>
Gerrit-Reviewer: Giuseppe Lavagetto <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits