Ottomata has submitted this change and it was merged.
Change subject: Remove unused analytics role classes
......................................................................
Remove unused analytics role classes
Bug: T109859
Change-Id: I48f7d76255ca0bf04322cd4e6e306e1a60ebf374
---
M hieradata/eqiad.yaml
D manifests/role/analytics.pp
D manifests/role/analytics/hadoop.pp
D manifests/role/analytics/hive.pp
D manifests/role/analytics/hue.pp
D manifests/role/analytics/impala.pp
D manifests/role/analytics/mahout.pp
D manifests/role/analytics/oozie.pp
D manifests/role/analytics/pig.pp
D manifests/role/analytics/refinery.pp
D manifests/role/analytics/spark.pp
D manifests/role/analytics/sqoop.pp
M modules/camus/manifests/job.pp
13 files changed, 2 insertions(+), 1,574 deletions(-)
Approvals:
Ottomata: Looks good to me, approved
jenkins-bot: Verified
diff --git a/hieradata/eqiad.yaml b/hieradata/eqiad.yaml
index 8d10be5..756caef 100644
--- a/hieradata/eqiad.yaml
+++ b/hieradata/eqiad.yaml
@@ -141,11 +141,4 @@
labs_baremetal_servers:
- '10.64.20.12'
-# Used in role::analytics::hive::config
-hive_server_host: analytics1027.eqiad.wmnet
-hive_metastore_host: analytics1027.eqiad.wmnet
-
-# Used in role::analytics::oozie::config
-oozie_host: analytics1027.eqiad.wmnet
-
ldap_labs_hostname: ldap-labs.eqiad.wikimedia.org
diff --git a/manifests/role/analytics.pp b/manifests/role/analytics.pp
deleted file mode 100644
index 81a9b5c..0000000
--- a/manifests/role/analytics.pp
+++ /dev/null
@@ -1,27 +0,0 @@
-# analytics servers (RT-1985)
-# == Class role::analytics
-# Base class for all analytics nodes.
-# All analytics nodes should include this.
-class role::analytics {
- system::role { 'role::analytics': description => 'analytics server' }
-
- require_package('openjdk-7-jdk')
-
- # This package conflicts with the hadoop-fuse-dfs
- # and with impalad in that two libjvm.so files get added
- # to LD_LIBRARY_PATH. We don't need this
- # package anyway, so ensure it is absent.
- package { 'icedtea-7-jre-jamvm':
- ensure => 'absent'
- }
-}
-
-# == Class role::analytics::monitor_disks
-# Installs monitoring plugins for disks
-#
-class role::analytics::monitor_disks {
- if $::standard::has_ganglia {
- ganglia::plugin::python { 'diskstat': }
- }
-
-}
diff --git a/manifests/role/analytics/hadoop.pp b/manifests/role/analytics/hadoop.pp
deleted file mode 100644
index 071ac98..0000000
--- a/manifests/role/analytics/hadoop.pp
+++ /dev/null
@@ -1,826 +0,0 @@
-# role/analytics/hadoop.pp
-#
-# Role classes for Analytics Hadoop nodes.
-# These role classes will configure Hadoop properly in either
-# the Labs or Production environments.
-#
-#
-# Production configs are hardcoded here. Labs has a few parameters
-# that need to be specified as global variables via the Manage Instances GUI:
-#
-# $cluster_name - Logical name of this cluster. Required.
-#
-# $hadoop_namenodes - Comma separated list of FQDNs that should be NameNodes
-# for this cluster. The first entry in the list
-# is assumed to be the preferred primary NameNode. Required.
-# This list will also be used as $resourcemanager_hosts.
-# If hiera('zookeeper_hosts') is set, and this list has more
-# than one entry, and $journalnode_hosts is also set, then
-# HA YARN ResourceManager will be configured.
-# TODO: Change the name of this variable to hadoop_masters
-# when we make this work better with hiera.
-#
-# $journalnode_hosts - Comma separated list of FQDNs that should be JournalNodes
-# for this cluster. Optional. If not specified, HA will not be configured.
-#
-# $heapsize - Optional. Set this to a value in MB to limit the JVM
-# heapsize for all Hadoop daemons.
-#
-#
-# Usage:
-#
-# To install only hadoop client packages and configs:
-# include role::analytics::hadoop::client
-#
-# To install a Hadoop Master (NameNode + ResourceManager, etc.):
-# include role::analytics::hadoop::master
-#
-# To install a Hadoop Worker (DataNode + NodeManager + etc.):
-# include role::analytics::hadoop::worker
-#
-
-# == Class role::analytics::hadoop::config
-# This is just a config class. You can include this
-# anywhere if you need to infer Hadoop configs. It
-# only sets variables, it will not install or configure
-# any packages. hadoop::client inherits from this class.
-#
-class role::analytics::hadoop::config {
-
- # Configs common to both Production and Labs.
- $hadoop_var_directory = '/var/lib/hadoop'
- $hadoop_name_directory = "${hadoop_var_directory}/name"
- $hadoop_data_directory = "${hadoop_var_directory}/data"
- $hadoop_journal_directory = "${hadoop_var_directory}/journal"
- $dfs_block_size = 268435456 # 256 MB
- $io_file_buffer_size = 131072
- # Turn on Snappy compression by default for maps and final outputs
- $mapreduce_intermediate_compression_codec = 'org.apache.hadoop.io.compress.SnappyCodec'
- $mapreduce_output_compression = true
- $mapreduce_output_compression_codec = 'org.apache.hadoop.io.compress.SnappyCodec'
- $mapreduce_output_compression_type = 'BLOCK'
- $mapreduce_job_reuse_jvm_num_tasks = 1
- $fair_scheduler_template = 'hadoop/fair-scheduler.xml.erb'
- # setting this to false or undef interferes with defining it within a node
- $gelf_logging_enabled = false
-
- # This needs to be set in order to use Impala
- $dfs_datanode_hdfs_blocks_metadata_enabled = true
-
- # Yarn App Master possible port ranges
- $yarn_app_mapreduce_am_job_client_port_range = '55000-55199'
-
- # Look up zookeeper_hosts from hiera.
- $zookeeper_hosts = keys(hiera('zookeeper_hosts', undef))
-
- # Configs specific to Production.
- if $::realm == 'production' {
- # This is the logical name of the Analytics Hadoop cluster.
- $cluster_name = 'analytics-hadoop'
-
- $namenode_hosts = [
- 'analytics1001.eqiad.wmnet',
- 'analytics1002.eqiad.wmnet',
- ]
- $resourcemanager_hosts = $namenode_hosts
-
- # JournalNodes are colocated on worker DataNodes.
- $journalnode_hosts = [
- 'analytics1052.eqiad.wmnet', # Row A3
- 'analytics1028.eqiad.wmnet', # Row C2
- 'analytics1035.eqiad.wmnet', # Row D2
- ]
-
- # analytics1011-analytics1020 have 12 mounts on disks sda - sdl.
- if $::hostname =~ /analytics10(1[1-9]|20)/ {
- $datanode_mounts = [
- "${hadoop_data_directory}/a",
- "${hadoop_data_directory}/b",
- "${hadoop_data_directory}/c",
- "${hadoop_data_directory}/d",
- "${hadoop_data_directory}/e",
- "${hadoop_data_directory}/f",
- "${hadoop_data_directory}/g",
- "${hadoop_data_directory}/h",
- "${hadoop_data_directory}/i",
- "${hadoop_data_directory}/j",
- "${hadoop_data_directory}/k",
- "${hadoop_data_directory}/l",
- ]
- }
- # analytics1028-analytics1041 have mounts on disks sdb - sdm.
- # (sda is hardware raid on the 2 2.5 drives in the flex bays.)
- else {
- $datanode_mounts = [
- "${hadoop_data_directory}/b",
- "${hadoop_data_directory}/c",
- "${hadoop_data_directory}/d",
- "${hadoop_data_directory}/e",
- "${hadoop_data_directory}/f",
- "${hadoop_data_directory}/g",
- "${hadoop_data_directory}/h",
- "${hadoop_data_directory}/i",
- "${hadoop_data_directory}/j",
- "${hadoop_data_directory}/k",
- "${hadoop_data_directory}/l",
- "${hadoop_data_directory}/m",
- ]
- }
-
- $mapreduce_reduce_shuffle_parallelcopies = 10
- $mapreduce_task_io_sort_mb = 200
- $mapreduce_task_io_sort_factor = 10
-
-
- # Configure memory based on these recommendations and then adjusted:
- # http://docs.hortonworks.com/HDPDocuments/HDP2/HDP-2.0.6.0/bk_installing_manually_book/content/rpm-chap1-11.html
-
- ### These Map/Reduce and YARN ApplicationMaster settings are
- # settable per job, and the defaults when clients submit them are often
- # picked up from the local versions of the /etc/hadoop/conf/{mapred,yarn}-site.xml files.
- # That means they should not be set relative to the local node facter variables, and as such
- # use a hardcoded value of memory_per_container to work from. Otherwise a job
- # submitted from a relatively small client node will use bad job defaults.
- #
- # We currently run two different types of worker nodes in production.
- # The older Dells have 48G of RAM, and the newer ones have 64G.
- #
- # Using + 0 here ensures that these variables are
- # integers (Fixnums) and won't throw errors
- # when used with min()/max() functions.
-
- # Worker nodes are heterogeneous, so I don't want to use a variable
- # memory per container size across the cluster. Larger nodes will just
- # allocate a few more containers. Setting this to 2G.
- $memory_per_container_mb = 2048 + 0
-
- # Map container size and JVM max heap size (-Xmx)
- $mapreduce_map_memory_mb = floor($memory_per_container_mb)
- $mapreduce_reduce_memory_mb = floor(2 * $memory_per_container_mb)
- $map_jvm_heap_size = floor(0.8 * $memory_per_container_mb)
- # Reduce container size and JVM max heap size (-Xmx)
- $mapreduce_map_java_opts = "-Xmx${map_jvm_heap_size}m"
- $reduce_jvm_heap_size = floor(0.8 * 2 * $memory_per_container_mb)
- $mapreduce_reduce_java_opts = "-Xmx${reduce_jvm_heap_size}m"
-
- # Yarn ApplicationMaster container size and max heap size (-Xmx)
- $yarn_app_mapreduce_am_resource_mb = floor(2 * $memory_per_container_mb)
- $mapreduce_am_heap_size = floor(0.8 * 2 * $memory_per_container_mb)
- $yarn_app_mapreduce_am_command_opts = "-Xmx${mapreduce_am_heap_size}m"
-
- ### The amount of RAM for NodeManagers will only be used by NodeManager
- # processes running on the worker nodes themselves. Client nodes that submit
- # jobs will ignore these settings. These are safe to set relative to the
- # node currently evaluating puppet's facter variables.
-
- # Select a 'reserve' memory size for the
- # OS and other Hadoop processes.
- if $::memorysize_mb <= 1024 {
- $reserve_memory_mb = 256
- }
- elsif $::memorysize_mb <= 2048 {
- $reserve_memory_mb = 512
- }
- elsif $::memorysize_mb <= 4096 {
- $reserve_memory_mb = 1024
- }
- elsif $::memorysize_mb <= 16384 {
- $reserve_memory_mb = 2048
- }
- elsif $::memorysize_mb <= 24576 {
- $reserve_memory_mb = 4096
- }
- elsif $::memorysize_mb <= 49152 {
- $reserve_memory_mb = 6144
- }
- elsif $::memorysize_mb <= 73728 {
- $reserve_memory_mb = 8192
- }
- elsif $::memorysize_mb <= 98304 {
- $reserve_memory_mb = 12288
- }
- elsif $::memorysize_mb <= 131072 {
- $reserve_memory_mb = 24576
- }
- elsif $::memorysize_mb <= 262144 {
- $reserve_memory_mb = 32768
- }
- else {
- $reserve_memory_mb = 65536
- }
-
- # Memory available for use by Hadoop jobs.
- $available_memory_mb = $::memorysize_mb - $reserve_memory_mb
-
- # Since I have chosen a static $memory_per_container of 2048 across all
- # node sizes, we should just choose to give NodeManagers
- # $available_memory_mb to work with.
- # This will give nodes with 48G of memory about 21 containers, and
- # nodes with 64G memory about 28 containers.
- #
- # This is the total amount of memory that NodeManagers
- # will use for allocation to containers.
- $yarn_nodemanager_resource_memory_mb = floor($available_memory_mb)
-
- # Setting _minimum_allocation_mb to 0 to allow Impala to submit small reservation requests.
- $yarn_scheduler_minimum_allocation_mb = 0
- $yarn_scheduler_maximum_allocation_mb = $yarn_nodemanager_resource_memory_mb
- # Setting minimum_allocation_vcores to 0 to allow Impala to submit small reservation requests.
- $yarn_scheduler_minimum_allocation_vcores = 0
-
- # use net-topology.py.erb to map hostname to /datacenter/rack/row id.
- $net_topology_script_template = 'hadoop/net-topology.py.erb'
- $hadoop_heapsize = undef
- # Increase NameNode heapsize independent from other daemons
- $hadoop_namenode_opts = '-Xmx4096m'
-
- $yarn_heapsize = undef
-
- # TODO: use variables from new ganglia module once it is finished.
- $ganglia_host = '208.80.154.10'
- $ganglia_port = 9681
- $gelf_logging_host = 'logstash1002.eqiad.wmnet'
- $gelf_logging_port = 12201
- # In production, make sure that HDFS user directories are
- # created for everyone in these groups.
- $hadoop_users_posix_groups = 'analytics-users analytics-privatedata-users analytics-admins analytics-search-users'
- }
-
- # Configs specific to Labs.
- elsif $::realm == 'labs' {
- # These variables are configurable via the
- # Labs Manage Instances GUI.
- $namenode_hosts = $::hadoop_namenodes ? {
- undef => [$::fqdn],
- default => split($::hadoop_namenodes, ','),
- }
- $resourcemanager_hosts = $namenode_hosts
-
- $journalnode_hosts = $::hadoop_journalnodes ? {
- undef => undef,
- default => split($::hadoop_journalnodes, ','),
- }
-
- $cluster_name = $::hadoop_cluster_name ? {
- undef => undef,
- default => $::hadoop_cluster_name,
- }
-
- # Allow labs users to configure their Hadoop daemon
- # Heapsize. NOTE: This will be applied to
- # All Hadoop related services on this node.
- $heapsize = $::hadoop_heapsize ? {
- undef => undef,
- default => $::hadoop_heapsize,
- }
-
- $datanode_mounts = [
- "${hadoop_data_directory}/a",
- "${hadoop_data_directory}/b",
- ]
-
- # Labs sets these at undef, which lets the Hadoop defaults stick.
- $hadoop_namenode_opts = undef
- $mapreduce_reduce_shuffle_parallelcopies = undef
- $mapreduce_task_io_sort_mb = undef
- $mapreduce_task_io_sort_factor = undef
- $mapreduce_map_memory_mb = undef
- $mapreduce_reduce_memory_mb = undef
- $mapreduce_map_java_opts = undef
- $mapreduce_reduce_java_opts = undef
- $yarn_app_mapreduce_am_resource_mb = undef
- $yarn_app_mapreduce_am_command_opts = undef
- $yarn_nodemanager_resource_memory_mb = undef
- $yarn_scheduler_minimum_allocation_mb = 0
- $yarn_scheduler_maximum_allocation_mb = undef
- $yarn_scheduler_minimum_allocation_vcores = 0
-
- $net_topology_script_template = undef
-
- $ganglia_host = 'aggregator.eqiad.wmflabs'
- $ganglia_port = 50090
- $gelf_logging_host = '127.0.0.1'
- $gelf_logging_port = 12201
- # In labs, make sure that HDFS user directories are
- # created for everyone in the current labs project.
- $hadoop_users_posix_groups = $::labsproject
-
- # Hadoop directories in labs should be automatically created.
- # This conditional could be added to each of the main classes
- # below, but since it doesn't hurt to have these directories
- # in labs, and since I don't want to add $::realm conditionals
- # below, I just create them here.
- file { [
- $hadoop_var_directory,
- $hadoop_data_directory,
- ]:
- ensure => 'directory',
- }
- }
-}
-
-# == Class role::analytics::hadoop::ferm::namenode
-#
-class role::analytics::hadoop::ferm::namenode {
- ferm::service{ 'hadoop-hdfs-namenode':
- proto => 'tcp',
- port => '8020',
- srange => '$ANALYTICS_NETWORKS',
- }
-
- ferm::service{ 'hadoop-hdfs-namenode-http-ui':
- proto => 'tcp',
- port => '50070',
- srange => '$ANALYTICS_NETWORKS',
- }
-
- ferm::service{ 'hadoop-hdfs-httpfs':
- proto => 'tcp',
- port => '14000',
- srange => '$ANALYTICS_NETWORKS',
- }
-
- ferm::service{ 'hadoop-hdfs-namenode-jmx':
- proto => 'tcp',
- port => '9980',
- srange => '$ANALYTICS_NETWORKS',
- }
-}
-
-# == Class role::analytics::hadoop::ferm::resourcemanager
-#
-
-class role::analytics::hadoop::ferm::resourcemanager {
-
- ferm::service{ 'hadoop-yarn-resourcemanager-scheduler':
- proto => 'tcp',
- port => '8030',
- srange => '$ANALYTICS_NETWORKS',
- }
-
- ferm::service{ 'hadoop-yarn-resourcemanager-tracker':
- proto => 'tcp',
- port => '8031',
- srange => '$ANALYTICS_NETWORKS',
- }
-
- ferm::service{ 'hadoop-yarn-resourcemanager':
- proto => 'tcp',
- port => '8032',
- srange => '$ANALYTICS_NETWORKS',
- }
-
- ferm::service{ 'hadoop-yarn-resourcemanager-admin':
- proto => 'tcp',
- port => '8033',
- srange => '$ANALYTICS_NETWORKS',
- }
-
- ferm::service{ 'hadoop-yarn-resourcemanager-http-ui':
- proto => 'tcp',
- port => '8088',
- srange => '$INTERNAL',
- }
-
- ferm::service{ 'hadoop-mapreduce-historyserver':
- proto => 'tcp',
- port => '10020',
- srange => '$ANALYTICS_NETWORKS',
- }
-
- ferm::service{ 'hadoop-mapreduce-historyserver-admin':
- proto => 'tcp',
- port => '10033',
- srange => '$ANALYTICS_NETWORKS',
- }
-
- ferm::service{ 'hadoop-mapreduce-historyserver-http-ui':
- proto => 'tcp',
- port => '19888',
- srange => '$ANALYTICS_NETWORKS',
- }
-
- ferm::service{ 'hadoop-yarn-resourcemanager-jmx':
- proto => 'tcp',
- port => '9983',
- srange => '$ANALYTICS_NETWORKS',
- }
-
-
-}
-
-
-# == Class role::analytics::hadoop::client
-# Installs Hadoop client packages and configuration.
-#
-class role::analytics::hadoop::client inherits role::analytics::hadoop::config {
- # need java before hadoop is installed
- require_package('openjdk-7-jdk')
-
- class { 'cdh::hadoop':
- cluster_name => $cluster_name,
- namenode_hosts => $namenode_hosts,
- journalnode_hosts => $journalnode_hosts,
- resourcemanager_hosts => $resourcemanager_hosts,
- zookeeper_hosts => $zookeeper_hosts,
- datanode_mounts => $datanode_mounts,
- dfs_name_dir => [$hadoop_name_directory],
- dfs_journalnode_edits_dir => $hadoop_journal_directory,
- dfs_block_size => $dfs_block_size,
- io_file_buffer_size => $io_file_buffer_size,
- mapreduce_intermediate_compression_codec => $mapreduce_intermediate_compression_codec,
- mapreduce_output_compression => $mapreduce_output_compression,
- mapreduce_output_compression_codec => $mapreduce_output_compression_codec,
- mapreduce_output_compression_type => $mapreduce_output_compression_type,
-
- mapreduce_job_reuse_jvm_num_tasks => $mapreduce_job_reuse_jvm_num_tasks,
- mapreduce_reduce_shuffle_parallelcopies => $mapreduce_reduce_shuffle_parallelcopies,
- mapreduce_task_io_sort_mb => $mapreduce_task_io_sort_mb,
- mapreduce_task_io_sort_factor => $mapreduce_task_io_sort_factor,
-
- mapreduce_map_memory_mb => $mapreduce_map_memory_mb,
- mapreduce_reduce_memory_mb => $mapreduce_reduce_memory_mb,
- mapreduce_map_java_opts => $mapreduce_map_java_opts,
- mapreduce_reduce_java_opts => $mapreduce_reduce_java_opts,
- yarn_app_mapreduce_am_resource_mb => $yarn_app_mapreduce_am_resource_mb,
- yarn_app_mapreduce_am_command_opts => $yarn_app_mapreduce_am_command_opts,
- yarn_app_mapreduce_am_job_client_port_range => $yarn_app_mapreduce_am_job_client_port_range,
-
- yarn_nodemanager_resource_memory_mb => $yarn_nodemanager_resource_memory_mb,
- yarn_scheduler_minimum_allocation_mb => $yarn_scheduler_minimum_allocation_mb,
- yarn_scheduler_maximum_allocation_mb => $yarn_scheduler_maximum_allocation_mb,
- yarn_scheduler_minimum_allocation_vcores => $yarn_scheduler_minimum_allocation_vcores,
-
- dfs_datanode_hdfs_blocks_metadata_enabled => $dfs_datanode_hdfs_blocks_metadata_enabled,
-
-
- # Use net-topology.py.erb to map hostname to /datacenter/rack/row id.
- net_topology_script_template => $net_topology_script_template,
- # Use fair-scheduler.xml.erb to define FairScheduler queues.
- fair_scheduler_template => $fair_scheduler_template,
-
- yarn_site_extra_properties => {
- # Enable FairScheduler preemption. This will allow the essential queue
- # to preempt non-essential jobs.
- 'yarn.scheduler.fair.preemption' => true,
- # Let YARN wait for at least 1/3 of nodes to present scheduling
- # opportunities before scheduling a job for certain data
- # on a node on which that data is not present.
- 'yarn.scheduler.fair.locality.threshold.node' => '0.33',
- # After upgrading to CDH 5.4.0, we are encountering this bug:
- # https://issues.apache.org/jira/browse/MAPREDUCE-5799
- # This should work around the problem.
- 'yarn.app.mapreduce.am.env' => 'LD_LIBRARY_PATH=/usr/lib/hadoop/lib/native',
- # The default of 90.0 for this was marking older dells as unhealthy when they still
- # had 2TB of space left. 99% will mark them as unhealthy when they still have
- # > 200G free.
- 'yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage' => '99.0',
- },
-
- gelf_logging_enabled => $gelf_logging_enabled,
- gelf_logging_host => $gelf_logging_host,
- gelf_logging_port => $gelf_logging_port,
-
- hadoop_namenode_opts => $hadoop_namenode_opts,
- }
-
- # If in production AND the current node is a journalnode, then
- # go ahead and include an icinga alert for the JournalNode process.
- if $::realm == 'production' and member($journalnode_hosts, $::fqdn) {
- nrpe::monitor_service { 'hadoop-hdfs-journalnode':
- description => 'Hadoop JournalNode',
- nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java -a "org.apache.hadoop.hdfs.qjournal.server.JournalNode"',
- require => Class['cdh::hadoop'],
- critical => true,
- }
- }
- file { '/usr/local/bin/hadoop-yarn-logging-helper.sh':
- content => template('hadoop/hadoop-yarn-logging-helper.erb'),
- mode => '0744',
- }
- if $gelf_logging_enabled {
- ensure_packages([
- # library dependency
- 'libjson-simple-java',
- # the library itself: logstash-gelf.jar
- 'liblogstash-gelf-java',
- ])
- # symlink into hadoop classpath
- file { '/usr/lib/hadoop/lib/json_simple.jar':
- ensure => 'link',
- target => '/usr/share/java/json_simple.jar',
- require => Package['libjson-simple-java'],
- }
-
- # symlink into hadoop classpath
- file { '/usr/lib/hadoop/lib/logstash-gelf.jar':
- ensure => 'link',
- target => '/usr/share/java/logstash-gelf.jar',
- require => Package['liblogstash-gelf-java'],
- }
- # Patch container-log4j.properties inside nodemanager jar
- # See script source for details
- exec { 'hadoop-yarn-logging-helper-set':
- command => '/usr/local/bin/hadoop-yarn-logging-helper.sh set',
- subscribe => File['/usr/local/bin/hadoop-yarn-logging-helper.sh'],
- }
- } else {
- # Revert to original unmodified jar
- exec { 'hadoop-yarn-logging-helper-reset':
- command => '/usr/local/bin/hadoop-yarn-logging-helper.sh reset',
- subscribe => File['/usr/local/bin/hadoop-yarn-logging-helper.sh'],
- }
- }
-
- # Temporarily hardcode DNS CNAMEs into /etc/hosts.
- # Jobs are failing around the cluster because these
- # are cached in DNS. I need to fix now. Will remove
- # this after new DNS has propagated.
- file_line { 'hadoop_master_cname_dns_override':
- ensure => 'absent',
- path => '/etc/hosts',
- line => '10.64.36.118 namenode.analytics.eqiad.wmnet resoucemanager.analytics.eqiad.wmnet',
- }
-
- # Install packages that are useful for distributed
- # computation in Hadoop, and thus should be available on
- # any Hadoop nodes.
- ensure_packages([
- # Need python3 on Hadoop nodes in order to run
- # Hadoop Streaming python jobs.
- 'python3',
- 'python-numpy',
- 'python-pandas',
- 'python-scipy',
- 'python-requests',
- 'python-matplotlib',
- 'python-dateutil',
- 'python-sympy',
- ])
-}
-
-
-
-# == Class role::analytics::hadoop::master
-# Includes cdh::hadoop::master classes
-#
-class role::analytics::hadoop::master inherits role::analytics::hadoop::client {
- system::role { 'role::analytics::hadoop::master':
- description => 'Hadoop Master (NameNode & ResourceManager)',
- }
-
- class { 'cdh::hadoop::master': }
-
- # Master should run httpfs daemon.
- class { 'cdh::hadoop::httpfs':
- require => Class['cdh::hadoop::master'],
- }
-
- # Hadoop nodes are spread across multiple rows
- # and need to be able to send multicast packets
- # multiple network hops. Hadoop GangliaContext
- # does not support this. See:
- # https://issues.apache.org/jira/browse/HADOOP-10181
- # We use jmxtrans instead.
- # Use jmxtrans for sending metrics to ganglia and statsd
-
- # TODO: use variables for stats server from somewhere?
- $statsd = 'statsd.eqiad.wmnet:8125'
-
- class { 'cdh::hadoop::jmxtrans::master':
- ganglia => "${ganglia_host}:${ganglia_port}",
- statsd => $statsd,
- }
-
- # monitor disk statistics
- include role::analytics::monitor_disks
-
- # FairScheduler is creating event logs in hadoop.log.dir/fairscheduler/
- # It rotates them but does not delete old ones. Set up cronjob to
- # delete old files in this directory.
- cron { 'hadoop-clean-fairscheduler-event-logs':
- command => 'test -d /var/log/hadoop-yarn/fairscheduler && /usr/bin/find /var/log/hadoop-yarn/fairscheduler -type f -mtime +14 -exec rm {} >/dev/null \;',
- minute => 5,
- hour => 0,
- require => Class['cdh::hadoop::master'],
- }
-
- # Include icinga alerts if production realm.
- if $::realm == 'production' {
- # Icinga process alerts for NameNode, ResourceManager and HistoryServer
- nrpe::monitor_service { 'hadoop-hdfs-namenode':
- description => 'Hadoop Namenode - Primary',
- nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java -a "org.apache.hadoop.hdfs.server.namenode.NameNode"',
- require => Class['cdh::hadoop::master'],
- critical => true,
- }
- nrpe::monitor_service { 'hadoop-yarn-resourcemanager':
- description => 'Hadoop ResourceManager',
- nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java -a "org.apache.hadoop.yarn.server.resourcemanager.ResourceManager"',
- require => Class['cdh::hadoop::master'],
- critical => true,
- }
- nrpe::monitor_service { 'hadoop-mapreduce-historyserver':
- description => 'Hadoop HistoryServer',
- nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java -a "org.apache.hadoop.mapreduce.v2.hs.JobHistoryServer"',
- require => Class['cdh::hadoop::master'],
- }
-
- # Allow nagios to run the check_hdfs_active_namenode as hdfs user.
- sudo::user { 'nagios-check_hdfs_active_namenode':
- user => 'nagios',
- privileges => ['ALL = NOPASSWD: /usr/local/bin/check_hdfs_active_namenode'],
- }
- # Alert if there is no active NameNode
- nrpe::monitor_service { 'hadoop-hdfs-active-namenode':
- description => 'At least one Hadoop HDFS NameNode is active',
- nrpe_command => '/usr/bin/sudo /usr/local/bin/check_hdfs_active_namenode',
- require => [
- Class['cdh::hadoop::master'],
- Sudo::User['nagios-check_hdfs_active_namenode'],
- ],
- }
- }
-
- # This will create HDFS user home directories
- # for all users in the provided groups.
- # This only needs to be run on the NameNode
- # where all users that want to use Hadoop
- # must have shell accounts anyway.
- class { 'cdh::hadoop::users':
- groups => $hadoop_users_posix_groups,
- require => Class['cdh::hadoop::master'],
- }
-
-
- # Firewall
- include role::analytics::hadoop::ferm::namenode
- include role::analytics::hadoop::ferm::resourcemanager
-}
-
-# == Class role::analytics::hadoop::worker
-# Includes cdh::hadoop::worker classes
-class role::analytics::hadoop::worker inherits role::analytics::hadoop::client {
- system::role { 'role::analytics::hadoop::worker':
- description => 'Hadoop Worker (DataNode & NodeManager)',
- }
-
- class { 'cdh::hadoop::worker': }
-
- # Hadoop nodes are spread across multiple rows
- # and need to be able to send multicast packets
- # multiple network hops. Hadoop GangliaContext
- # does not support this. See:
- # https://issues.apache.org/jira/browse/HADOOP-10181
- # We use jmxtrans instead.
-
- # Use jmxtrans for sending metrics to ganglia
- class { 'cdh::hadoop::jmxtrans::worker':
- ganglia => "${ganglia_host}:${ganglia_port}",
- statsd => $statsd,
- }
-
- # monitor disk statistics
- include role::analytics::monitor_disks
-
- # Include icinga alerts if production realm.
- if $::realm == 'production' {
- # Icinga process alerts for DataNode and NodeManager
- nrpe::monitor_service { 'hadoop-hdfs-datanode':
- description => 'Hadoop DataNode',
- nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java -a "org.apache.hadoop.hdfs.server.datanode.DataNode"',
- require => Class['cdh::hadoop::worker'],
- }
- nrpe::monitor_service { 'hadoop-yarn-nodemanager':
- description => 'Hadoop NodeManager',
- nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java -a "org.apache.hadoop.yarn.server.nodemanager.NodeManager"',
- require => Class['cdh::hadoop::worker'],
- }
-
- # Alert on datanode mount disk space. These mounts are ignored by the
- # base module's check_disk via the base::monitoring::host::nrpe_check_disk_options
- # override in worker.yaml hieradata.
- nrpe::monitor_service { 'disk_space_hadoop_worker':
- description => 'Disk space on Hadoop worker',
- nrpe_command => '/usr/lib/nagios/plugins/check_disk --units GB -w 32 -c 16 -e -l -r "/var/lib/hadoop/data"',
- }
-
- # Make sure that this worker node has NodeManager running in a RUNNING state.
- # Install a custom check command for NodeManager Node-State:
- file { '/usr/local/lib/nagios/plugins/check_hadoop_yarn_node_state':
- source => 'puppet:///files/hadoop/check_hadoop_yarn_node_state',
- owner => 'root',
- group => 'root',
- mode => '0755',
- }
- nrpe::monitor_service { 'hadoop_yarn_node_state':
- description => 'YARN NodeManager Node-State',
- nrpe_command => '/usr/local/lib/nagios/plugins/check_hadoop_yarn_node_state',
- }
- }
-
-
- # Install hive client on worker nodes to get
- # hive-hcatalog package. hive-hcatalog depends
- # on hive package, so we might as well
- # configure hive too.
- include role::analytics::hive::client
-
-
- # We use this to send passive checks off to icinga
- # for generating alerts. We need the nsca-client package
- # to do this remotely. Some oozie jobs use this,
- # and it must be present on all datanodes.
- include role::analytics::hadoop::monitor::nsca::client
-
- # Install MaxMind databases for geocoding UDFs
- include geoip
-
-
- # Firewall
- ferm::service{ 'hadoop-access':
- proto => 'tcp',
- port => '1024:65535',
- srange => '$ANALYTICS_NETWORKS',
- }
-}
-
-# == Class role::analytics::hadoop::monitor::nsca::client
-# This class exists in order to override the group ownership
-# and permissions of the /etc/send_nsca.cfg file. Hadoop
-# processes need to be able to read this file in order to
-# run send_nsca as part of Oozie submitted monitoring jobs.
-class role::analytics::hadoop::monitor::nsca::client inherits icinga::nsca::client {
- File ['/etc/send_nsca.cfg'] {
- group => 'hadoop',
- mode => '0440',
- }
-}
-
-# == Class role::analytics::hadoop::standby
-# Include standby NameNode classes
-#
-class role::analytics::hadoop::standby inherits role::analytics::hadoop::client {
- system::role { 'role::analytics::hadoop::standby':
- description => 'Hadoop Standby NameNode',
- }
-
- class { 'cdh::hadoop::namenode::standby': }
-
-
- # Use jmxtrans for sending metrics to ganglia
- class { 'cdh::hadoop::jmxtrans::namenode':
- ganglia => "${ganglia_host}:${ganglia_port}",
- }
-
- # monitor disk statistics
- include role::analytics::monitor_disks
-
-
- # Include icinga alerts if production realm.
- if $::realm == 'production' {
- # Icinga process alert for Stand By NameNode
- nrpe::monitor_service { 'hadoop-hdfs-namenode':
- description => 'Hadoop Namenode - Stand By',
- nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java -a "org.apache.hadoop.hdfs.server.namenode.NameNode"',
- require => Class['cdh::hadoop::namenode::standby'],
- critical => true,
- }
- }
-
- # If this is a resourcemanager host, then go ahead
- # and include a resourcemanager on all standby nodes as well
- # as the master node.
- if $::fqdn in $resourcemanager_hosts {
- include cdh::hadoop::resourcemanager
- # Firewall
- include role::analytics::hadoop::ferm::resourcemanager
- }
-
-
- # Firewall
- include role::analytics::hadoop::ferm::namenode
-}
-
-
-# == Class role::analytics::hadoop::balancer
-# Runs hdfs balancer periodically to keep data balanced across all DataNodes
-class role::analytics::hadoop::balancer {
- Class['role::analytics::hadoop::client'] -> Class['role::analytics::hadoop::balancer']
-
- file { '/usr/local/bin/hdfs-balancer':
- source => 'puppet:///files/hadoop/hdfs-balancer',
- mode => '0754',
- owner => 'hdfs',
- group => 'hdfs',
- }
-
- cron { 'hdfs-balancer':
- command => '/usr/local/bin/hdfs-balancer >> /var/log/hadoop-hdfs/balancer.log 2>&1',
- user => 'hdfs',
- # Every day at 6am UTC.
- minute => 0,
- hour => 6,
- require => File['/usr/local/bin/hdfs-balancer'],
- }
-}
diff --git a/manifests/role/analytics/hive.pp b/manifests/role/analytics/hive.pp
deleted file mode 100644
index 1cb4c61..0000000
--- a/manifests/role/analytics/hive.pp
+++ /dev/null
@@ -1,137 +0,0 @@
-# role/analytics/hive.pp
-#
-# Role classes for Analytics Hive client and server nodes.
-# These role classes will configure Hive properly in either
-# Labs or Production environments.
-#
-# If you are using these in Labs, you must include role::analytics::hive::server
-# on your primary Hadoop NameNode.
-#
-# role::analytics::hive::client requires role::analytics::hadoop::client,
-# and will install Hadoop client packages and configs. In Labs,
-# you must set appropriate Hadoop client global parameters. See
-# role/analytics/hadoop.pp documentation for more info.
-
-
-# == Class role::analytics::hive::config
-#
-class role::analytics::hive::config {
- # require zookeeper config to get zookeeper hosts array.
- include role::analytics::hadoop::config
-
- # Set this pretty high, to avoid limiting the number
- # of substitution variables a Hive script can use.
- $variable_substitute_depth = 10000
-
- # The WMF webrequest table uses HCatalog's JSON Serde.
- # Automatically include this in Hive client classpaths.
- $hcatalog_jar = 'file:///usr/lib/hive-hcatalog/share/hcatalog/hive-hcatalog-core.jar'
-
- # If refinery is included on this node, then add
- # refinery-hive.jar to the auxpath as well
- if defined(Class['role::analytics::refinery']) {
- $auxpath = "${hcatalog_jar},file://${::role::analytics::refinery::path}/artifacts/refinery-hive.jar"
- }
- else {
- $auxpath = $hcatalog_jar
- }
-
- # Hive uses Zookeeper for table locking.
- $zookeeper_hosts = keys(hiera('zookeeper_hosts'))
-
- # We set support concurrency to false by default.
- # if someone needs to use it in their hive job, they
- # may manually set it to true via
- # set hive.support.concurrency = true;
- $support_concurrency = false
-
- if $::realm == 'production' {
- include passwords::analytics
-
- $jdbc_password = $passwords::analytics::hive_jdbc_password
- # Must set hive_server_host and hive_metastore_host in hiera
- # in production.
- $default_hive_host = undef
- }
- elsif $::realm == 'labs' {
- $jdbc_password = 'hive'
- # Default to hosting hive-server and hive-metastore on
- # primary namenode in labs.
- $default_hive_host = $role::analytics::hadoop::config::namenode_hosts[0]
- }
-
- $server_host = hiera('hive_server_host', $default_hive_host)
- $metastore_host = hiera('hive_metastore_host', $default_hive_host)
-}
-
-
-# == Class role::analytics::hive::client
-# Installs base configs for hive client nodes
-#
-class role::analytics::hive::client inherits role::analytics::hive::config {
- require role::analytics::hadoop::client
-
- class { '::cdh::hive':
- metastore_host => $metastore_host,
- jdbc_password => $jdbc_password,
- zookeeper_hosts => $zookeeper_hosts,
- support_concurrency => $support_concurrency,
- variable_substitute_depth => $variable_substitute_depth,
- auxpath => $auxpath,
- # default to using Snappy for parquet formatted tables
- parquet_compression => 'SNAPPY',
- }
-}
-
-
-# == Class role::analytics::hive::server
-# Sets up Hive Server2 and MySQL backed Hive Metastore.
-#
-class role::analytics::hive::server inherits role::analytics::hive::client {
- if (!defined(Package['mysql-server'])) {
- package { 'mysql-server':
- ensure => 'installed',
- }
- }
-
- # Make sure mysql-server is installed before
- # MySQL Hive Metastore database class is applied.
- # Package['mysql-server'] -> Class['cdh::hive::metastore::mysql']
-
- # TODO: Set these better once hive is on its own server.
- # See: https://phabricator.wikimedia.org/T110090
- # http://www.cloudera.com/content/www/en-us/documentation/enterprise/latest/topics/cdh_ig_hive_install.html#concept_alp_4kl_3q_unique_1
- # TODO: Use hiera.
- $server_heapsize = $::realm ? {
- 'production' => 1024,
- default => undef,
- }
- $metastore_heapsize = $::realm ? {
- 'production' => 256,
- default => undef,
- }
- # # Setup Hive server and Metastore
- # class { 'cdh::hive::master':
- # server_heapsize => $server_heapsize,
- # metastore_heapsize => $metastore_heapsize,
- # }
-
- class { 'cdh::hive::server':
- heapsize => $server_heapsize,
- }
- class { 'cdh::hive::metastore':
- heapsize => $metastore_heapsize,
- }
-
- ferm::service{ 'hive_server':
- proto => 'tcp',
- port => '10000',
- srange => '$INTERNAL',
- }
-
- ferm::service{ 'hive_metastore':
- proto => 'tcp',
- port => '9083',
- srange => '$INTERNAL',
- }
-}
diff --git a/manifests/role/analytics/hue.pp b/manifests/role/analytics/hue.pp
deleted file mode 100644
index c352d63..0000000
--- a/manifests/role/analytics/hue.pp
+++ /dev/null
@@ -1,66 +0,0 @@
-# == Class role::analytics::hue
-# Installs Hue server.
-#
-class role::analytics::hue {
- # Require that all Hue applications
- # have their corresponding clients
- # and configs installed.
- require role::analytics::hadoop::client
- require role::analytics::hive::client
- require role::analytics::oozie::client
- require role::analytics::pig
- require role::analytics::sqoop
-
- # LDAP Labs config is the same as LDAP in production.
- include ldap::role::config::labs
-
- # Disable hue's SSL. SSL termination is handled by an upstream proxy.
- $ssl_private_key = false
- $ssl_certificate = false
- $secure_proxy_ssl_header = true
-
- if ($::realm == 'production') {
- include passwords::analytics
-
- $secret_key = $passwords::analytics::hue_secret_key
- $hive_server_host = 'analytics1027.eqiad.wmnet'
- # Disable automatic Hue user creation in production.
- $ldap_create_users_on_login = false
- }
- elsif ($::realm == 'labs') {
- $secret_key = 'oVEAAG5dp02MAuIScIetX3NZlmBkhOpagK92wY0GhBbq6ooc0B3rosmcxDg2fJBM'
- # Assume that in Labs, Hue should run on the main master Hadoop NameNode.
- $hive_server_host = $role::analytics::hadoop::config::namenode_hosts[0]
- $ldap_create_users_on_login = true
- }
-
- class { 'cdh::hue':
- hive_server_host => $hive_server_host,
- secret_key => $secret_key,
- smtp_host => $::mail_smarthost[0],
- smtp_from_email => "hue@${::fqdn}",
- ldap_url => inline_template('<%= scope.lookupvar("ldap::role::config::labs::servernames").collect { |host| "ldaps://#{host}" }.join(" ") %>'),
- ldap_bind_dn => $ldap::role::config::labs::ldapconfig['proxyagent'],
- ldap_bind_password => $ldap::role::config::labs::ldapconfig['proxypass'],
- ldap_base_dn => $ldap::role::config::labs::basedn,
- ldap_username_pattern => 'uid=<username>,ou=people,dc=wikimedia,dc=org',
- ldap_user_filter => 'objectclass=person',
- ldap_user_name_attr => 'uid',
- ldap_group_filter => 'objectclass=posixgroup',
- ldap_group_member_attr => 'member',
- ldap_create_users_on_login => $ldap_create_users_on_login,
- # Disable ssl in labs. Labs proxy handles SSL termination.
- ssl_private_key => $ssl_private_key,
- ssl_certificate => $ssl_certificate,
- secure_proxy_ssl_header => $secure_proxy_ssl_header,
- }
-
- ferm::service{ 'hue_server':
- proto => 'tcp',
- port => '8888',
- srange => '$INTERNAL',
- }
-}
-
-# TODO: Hue database backup.
-# TODO: Make Hue use MySQL database. Maybe?
diff --git a/manifests/role/analytics/impala.pp b/manifests/role/analytics/impala.pp
deleted file mode 100644
index 2781b68..0000000
--- a/manifests/role/analytics/impala.pp
+++ /dev/null
@@ -1,66 +0,0 @@
-# Impala role classes.
-#
-# NOTE: Be sure that $analytics::impala::master_host is set in hiera!
-# In production this is set in hieradata/eqiad/analytics/impala.yaml.
-
-# == Class role::analytics::impala
-# Installs base impala packages and the impala-shell client.
-#
-class role::analytics::impala {
- class { 'cdh::impala':
- master_host => hiera('analytics::impala::master_host')
- }
-}
-
-# == Class role::analytics::impala::worker
-# Installs and configures the impalad server.
-#
-class role::analytics::impala::worker {
- include role::analytics::impala
- include cdh::impala::worker
-
- ferm::service { 'impalad':
- proto => 'tcp',
- port => '(21000 21050 22000 23000 25000 28000)',
- srange => '$ANALYTICS_NETWORKS',
- }
-}
-
-# == Class role::analytics::impala::master
-# Installs and configures llama, impala-state-store and impala-catalog
-#
-class role::analytics::impala::master {
- include role::analytics::impala
- include base::firewall
-
- # The llama-master package stupidly creates the llama user
- # with a non system uid. This causes our admin module to
- # attempt to remove the user. Manage the user manually
- # here in puppet before installing that package.
- user { 'llama':
- ensure => 'present',
- comment => 'Llama',
- home => '/var/lib/llama',
- shell => '/bin/bash',
- system => true,
- before => Class['cdh::impala::master'],
- }
-
- include cdh::impala::master
-
- ferm::service { 'impala-state-store':
- proto => 'tcp',
- port => '(24000 25010)',
- srange => '$ANALYTICS_NETWORKS',
- }
- ferm::service { 'impala-catalog':
- proto => 'tcp',
- port => '(23020 25020 26000)',
- srange => '$ANALYTICS_NETWORKS',
- }
- ferm::service { 'impala-llama':
- proto => 'tcp',
- port => '(15000 15001 15002)',
- srange => '$ANALYTICS_NETWORKS',
- }
-}
diff --git a/manifests/role/analytics/mahout.pp b/manifests/role/analytics/mahout.pp
deleted file mode 100644
index d9108ea..0000000
--- a/manifests/role/analytics/mahout.pp
+++ /dev/null
@@ -1,3 +0,0 @@
-class role::analytics::mahout {
- include cdh::mahout
-}
diff --git a/manifests/role/analytics/oozie.pp b/manifests/role/analytics/oozie.pp
deleted file mode 100644
index 8723a80..0000000
--- a/manifests/role/analytics/oozie.pp
+++ /dev/null
@@ -1,89 +0,0 @@
-# == Class role::analytics::oozie::client
-# Installs oozie client, which sets up the OOZIE_URL
-# environment variable. If you are using this class in
-# Labs, you must include oozie::server on your primary
-# Hadoop NameNode for this to work and set appropriate
-# Labs Hadoop global parameters.
-# See role/analytics/hadoop.pp documentation for more info.
-
-
-# == Class role::analytics::oozie::config
-#
-class role::analytics::oozie::config {
- include role::analytics::hadoop::config
-
- if $::realm == 'production' {
- include passwords::analytics
-
- $jdbc_password = $passwords::analytics::oozie_jdbc_password
- # Must set oozie_host in hiera in production.
- $default_oozie_host = undef
-
- }
- elsif $::realm == 'labs' {
- $jdbc_password = 'oozie'
- # Default to running oozie server on primary namenode in labs.
- $default_oozie_host = $role::analytics::hadoop::config::namenode_hosts[0]
- }
-
- $oozie_host = hiera('oozie_host', $default_oozie_host)
-}
-
-
-# == Class role::analytics::oozie::client
-# Installs Oozie client.
-#
-class role::analytics::oozie::client inherits role::analytics::oozie::config {
- require role::analytics::hadoop::client
-
- class { 'cdh::oozie':
- oozie_host => $oozie_host,
- }
-}
-
-# == Class role::analytics::oozie::server
-# Installs Oozie server backed by a MySQL database.
-#
-class role::analytics::oozie::server inherits role::analytics::oozie::client {
- if (!defined(Package['mysql-server'])) {
- package { 'mysql-server':
- ensure => 'installed',
- }
- }
- # Make sure mysql-server is installed before
- # MySQL Oozie database class is applied.
- # Package['mysql-server'] -> Class['cdh::oozie::database::mysql']
-
- class { 'cdh::oozie::server':
- jdbc_password => $jdbc_password,
- smtp_host => $::mail_smarthost[0],
- smtp_from_email => "oozie@${::fqdn}",
- # This is not currently working. Disabling
- # this allows any user to manage any Oozie
- # job. Since access to our cluster is limited,
- # this isn't a big deal. But, we should still
- # figure out why this isn't working and
- # turn it back on.
- # I was not able to kill any oozie jobs
- # with this on, even though the
- # oozie.service.ProxyUserService.proxyuser.*
- # settings look like they are properly configured.
- authorization_service_authorization_enabled => false,
- }
-
- # Oozie is creating event logs in /var/log/oozie.
- # It rotates them but does not delete old ones. Set up cronjob to
- # delete old files in this directory.
- cron { 'oozie-clean-logs':
- command => 'test -d /var/log/oozie && /usr/bin/find /var/log/oozie -type f -mtime +62 -exec rm {} >/dev/null \;',
- minute => 5,
- hour => 0,
- require => Class['cdh::oozie::server'],
- }
-
- ferm::service{ 'oozie_server':
- proto => 'tcp',
- port => '11000',
- srange => '$INTERNAL',
- }
-}
diff --git a/manifests/role/analytics/pig.pp b/manifests/role/analytics/pig.pp
deleted file mode 100644
index 150dc1e..0000000
--- a/manifests/role/analytics/pig.pp
+++ /dev/null
@@ -1,3 +0,0 @@
-class role::analytics::pig {
- include cdh::pig
-}
\ No newline at end of file
diff --git a/manifests/role/analytics/refinery.pp b/manifests/role/analytics/refinery.pp
deleted file mode 100644
index 74ff5cd..0000000
--- a/manifests/role/analytics/refinery.pp
+++ /dev/null
@@ -1,286 +0,0 @@
-# == Class role::analytics::refinery
-# Includes configuration and resources needed for deploying
-# and using the analytics/refinery repository.
-#
-class role::analytics::refinery {
- # Make this class depend on hadoop::client. Refinery
- # is intended to work with Hadoop, and many of the
- # role classes here use the hdfs user, which is created
- # by the CDH packages.
- Class['role::analytics::hadoop::client'] -> Class['role::analytics::refinery']
-
- # Some refinery python scripts use docopt for CLI parsing.
- if !defined(Package['python-docopt']) {
- package { 'python-docopt':
- ensure => 'installed',
- }
- }
- # refinery python module uses dateutil
- if !defined(Package['python-dateutil']) {
- package { 'python-dateutil':
- ensure => 'installed',
- }
- }
-
- # analytics/refinery will be deployed to this node.
- package { 'analytics/refinery':
- provider => 'trebuchet',
- }
-
- # analytics/refinery repository is deployed via git-deploy at this path.
- # You must deploy this yourself; puppet will not do it for you.
- $path = '/srv/deployment/analytics/refinery'
-
- # Put refinery python module in user PYTHONPATH
- file { '/etc/profile.d/refinery.sh':
- content => "export PYTHONPATH=\${PYTHONPATH}:${path}/python"
- }
-
- # Create directory in /var/log for general purpose Refinery job logging.
- $log_dir = '/var/log/refinery'
- file { $log_dir:
- ensure => 'directory',
- owner => 'hdfs',
- group => 'analytics-admins',
- # setgid bit here to make refinery log files writeable
- # by users in the analytics-admins group.
- mode => '2775',
- }
-}
-
-
-# == Class role::analytics::refinery::camus
-# Uses camus::job to set up cron jobs to
-# import data from Kafka into Hadoop.
-#
-class role::analytics::refinery::camus {
- require role::analytics::refinery
- include role::kafka::analytics::config
-
- # Make all uses of camus::job set default kafka_brokers and camus_jar.
- # If you build a new camus or refinery, and you want to use it, you'll
- # need to change these. You can also override these defaults
- # for a particular camus::job instance by setting the parameter on
- # the camus::job declaration.
- Camus::Job {
- kafka_brokers => suffix($role::kafka::analytics::config::brokers_array, ':9092'),
- camus_jar => "${role::analytics::refinery::path}/artifacts/org/wikimedia/analytics/camus-wmf/camus-wmf-0.1.0-wmf6.jar",
- check_jar => "${role::analytics::refinery::path}/artifacts/org/wikimedia/analytics/refinery/refinery-job-0.0.26.jar",
- }
-
- # Import webrequest_* topics into /wmf/data/raw/webrequest
- # every 10 minutes, check runs and flag fully imported hours.
- camus::job { 'webrequest':
- check => true,
- minute => '*/10',
- }
-
- # Import eventlogging_* topics into /wmf/data/raw/eventlogging
- # once every hour.
- camus::job { 'eventlogging':
- minute => '5',
- }
-
- # Import mediawiki_* topics into /wmf/data/raw/mediawiki
- # once every hour. This data is expected to be Avro binary.
- camus::job { 'mediawiki':
- check => true,
- minute => '15',
- # refinery-camus contains some custom decoder classes which
- # are needed to import Avro binary data.
- libjars => "${role::analytics::refinery::path}/artifacts/org/wikimedia/analytics/refinery/refinery-camus-0.0.23.jar",
- }
-}
-
-# == Class role::analytics::refinery::data::drop
-# Installs cron job to drop old hive partitions
-# and delete old data from HDFS.
-#
-class role::analytics::refinery::data::drop {
- require role::analytics::refinery
-
- $webrequest_log_file = "${role::analytics::refinery::log_dir}/drop-webrequest-partitions.log"
- $eventlogging_log_file = "${role::analytics::refinery::log_dir}/drop-eventlogging-partitions.log"
-
- # keep this many days of raw webrequest data
- $raw_retention_days = 31
- cron { 'refinery-drop-webrequest-raw-partitions':
- command => "export
PYTHONPATH=\${PYTHONPATH}:${role::analytics::refinery::path}/python &&
${role::analytics::refinery::path}/bin/refinery-drop-webrequest-partitions -d
${raw_retention_days} -D wmf_raw -l /wmf/data/raw/webrequest -w raw >>
${webrequest_log_file} 2>&1",
- user => 'hdfs',
- minute => '15',
- hour => '*/4',
- }
-
- # keep this many days of refined webrequest data
- $refined_retention_days = 62
- cron { 'refinery-drop-webrequest-refined-partitions':
- command => "export
PYTHONPATH=\${PYTHONPATH}:${role::analytics::refinery::path}/python &&
${role::analytics::refinery::path}/bin/refinery-drop-webrequest-partitions -d
${refined_retention_days} -D wmf -l /wmf/data/wmf/webrequest -w refined >>
${webrequest_log_file} 2>&1",
- user => 'hdfs',
- minute => '45',
- hour => '*/4',
- }
-
- # keep this many days of eventlogging data
- $eventlogging_retention_days = 90
- cron {'refinery-drop-eventlogging-partitions':
- command => "export
PYTHONPATH=\${PYTHONPATH}:${role::analytics::refinery::path}/python &&
${role::analytics::refinery::path}/bin/refinery-drop-eventlogging-partitions -d
${eventlogging_retention_days} -l /wmf/data/raw/eventlogging >>
${eventlogging_log_file} 2>&1",
- user => 'hdfs',
- minute => '15',
- hour => '*/4',
- }
-}
-
-# == Class role::analytics::refinery::data::check::icinga
-# Configures passive/freshness icinga checks for data imports
-# in HDFS.
-#
-# For webrequest imports, the Oozie job that adds Hive partitions
-# and checks data integrity is responsible for triggering these
-# passive checks.
-#
-# NOTE: These are disabled due to nsca not working
-# properly between versions provided in Precise and Trusty.
-# we may reenable these if the icinga server gets upgraded
-# to Trusty.
-# See: https://phabricator.wikimedia.org/T76414
-# https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=670373
-#
-class role::analytics::refinery::data::check::icinga {
- # We are monitoring hourly datasets.
- # Give Oozie a little time to finish running
- # the monitor_done_flag workflow for each hour.
- # 5400 seconds == 1.5 hours.
- $freshness_threshold = 5400
-
- # 1 == warning, 2 == critical.
- # Use warning for now while we make sure this works.
- $alert_return_code = 1
-
- # Monitor that each webrequest source is successfully imported.
- # This is a passive check that is triggered by the Oozie
- # webrequest add partition jobs.
- monitoring::service { 'hive_partition_webrequest-bits':
- ensure => 'absent',
- description => 'hive_partition_webrequest-bits',
- check_command => "analytics_cluster_data_import-FAIL!wmf_raw.webrequest bits!${alert_return_code}",
- passive => true,
- freshness => $freshness_threshold,
- retries => 1,
- }
- monitoring::service { 'hive_partition_webrequest-mobile':
- ensure => 'absent',
- description => 'hive_partition_webrequest-mobile',
- check_command => "analytics_cluster_data_import-FAIL!wmf_raw.webrequest mobile!${alert_return_code}",
- passive => true,
- freshness => $freshness_threshold,
- retries => 1,
- }
- monitoring::service { 'hive_partition_webrequest-text':
- ensure => 'absent',
- description => 'hive_partition_webrequest-text',
- check_command => "analytics_cluster_data_import-FAIL!wmf_raw.webrequest text!${alert_return_code}",
- passive => true,
- freshness => $freshness_threshold,
- retries => 1,
- }
- monitoring::service { 'hive_partition_webrequest-upload':
- ensure => 'absent',
- description => 'hive_partition_webrequest-upload',
- check_command => "analytics_cluster_data_import-FAIL!wmf_raw.webrequest upload!${alert_return_code}",
- passive => true,
- freshness => $freshness_threshold,
- retries => 1,
- }
-}
-
-# == Class role::analytics::refinery::data::check::email
-# Configures cron jobs that send email about the faultiness of webrequest data
-#
-# These checks walk HDFS through the plain file system.
-#
-class role::analytics::refinery::data::check::email {
- require role::analytics::refinery
-
- # This should not be hardcoded. Instead, one should be able to use
- # $::cdh::hadoop::mount::mount_point to reference the user supplied
- # parameter when the cdh::hadoop::mount class is evaluated.
- # I am not sure why this is not working.
- $hdfs_mount_point = '/mnt/hdfs'
-
- $mail_to = '[email protected]'
-
- # Since the 'stats' user is not in ldap, it is unnecessarily hard
- # to grant it access to the private data in hdfs. As discussed in
- # https://gerrit.wikimedia.org/r/#/c/186254
- # the cron runs as hdfs instead.
- cron { 'refinery data check hdfs_mount':
- command => "${::role::analytics::refinery::path}/bin/refinery-dump-status-webrequest-partitions --hdfs-mount ${hdfs_mount_point} --datasets webrequest,raw_webrequest --quiet --percent-lost",
- environment => "MAILTO=${$mail_to}",
- user => 'hdfs',
- hour => 10,
- minute => 0,
- }
-
- cron { 'refinery data check pagecounts':
- command => "${::role::analytics::refinery::path}/bin/refinery-dump-status-webrequest-partitions --hdfs-mount ${hdfs_mount_point} --datasets pagecounts_all_sites,pagecounts_raw --quiet",
- environment => "MAILTO=${$mail_to}",
- user => 'hdfs', # See comment in above cron
- hour => 10,
- minute => 5,
- }
-
- cron { 'refinery data check pageviews':
- command => "${::role::analytics::refinery::path}/bin/refinery-dump-status-webrequest-partitions --hdfs-mount ${hdfs_mount_point} --datasets pageview,projectview --quiet",
- environment => "MAILTO=${$mail_to}",
- user => 'hdfs', # See comment in first cron above
- hour => 10,
- minute => 10,
- }
-}
-
-# == Class role::analytics::refinery::source
-# Clones analytics/refinery/source repo and keeps it up-to-date
-#
-class role::analytics::refinery::source {
- require statistics
-
- $path = "${::statistics::working_path}/refinery-source"
-
- $user = $::statistics::user::username
- $group = $user
-
- file { $path:
- ensure => 'directory',
- owner => $user,
- group => $group,
- mode => '0755',
- }
-
- git::clone { 'refinery_source':
- ensure => 'latest',
- directory => $path,
- origin => 'https://gerrit.wikimedia.org/r/p/analytics/refinery/source.git',
- owner => $user,
- group => $group,
- mode => '0755',
- require => File[$path],
- }
-}
-
-# == Class role::analytics::refinery::guard
-# Configures a cron job that runs analytics/refinery/source guards daily and
-# sends out an email upon issues
-#
-class role::analytics::refinery::guard {
- require role::analytics::refinery::source
-
- include ::maven
-
- cron { 'refinery source guard':
- command => "${role::analytics::refinery::source::path}/guard/run_all_guards.sh --rebuild-jar --quiet",
- environment => '[email protected]',
- user => $role::analytics::refinery::source::user,
- hour => 15,
- minute => 35,
- }
-}
diff --git a/manifests/role/analytics/spark.pp b/manifests/role/analytics/spark.pp
deleted file mode 100644
index 1ffa076..0000000
--- a/manifests/role/analytics/spark.pp
+++ /dev/null
@@ -1,59 +0,0 @@
-# == Class role::analytics::spark
-#
-class role::analytics::spark {
- include cdh::spark
-}
-
-# == Class role::analytics::spark::standalone
-# Configures a spark standalone cluster.
-# This runs spark daemons outside of YARN.
-# do not include role::analytics::spark
-# and role::analytics::spark::standalone on the same node.
-class role::analytics::spark::standalone {
- class { 'cdh::spark':
- master_host => hiera('spark_master_host', $::fqdn),
- worker_instances => hiera('spark_worker_instances', undef),
- worker_cores => hiera('spark_worker_cores', floor($::processorcount / hiera('spark_worker_instances', 1))),
- worker_memory => hiera('spark_worker_memory', undef)
- }
-}
-
-class role::analytics::spark::standalone::master {
- require role::analytics::spark::standalone
- include cdh::spark::master
-
- ferm::service{ 'spark-master-web-ui':
- proto => 'tcp',
- port => '18080',
- srange => '$ANALYTICS_NETWORKS',
- }
-
- ferm::service{ 'spark-master-rpc':
- proto => 'tcp',
- port => '7077',
- srange => '$ANALYTICS_NETWORKS',
- }
-
- ferm::service{ 'spark-rest-server':
- proto => 'tcp',
- port => '6066',
- srange => '$ANALYTICS_NETWORKS',
- }
-}
-
-class role::analytics::spark::standalone::worker {
- require role::analytics::spark::standalone
- include cdh::spark::worker
-
- ferm::service{ 'spark-worker-web-ui':
- proto => 'tcp',
- port => '18081',
- srange => '$ANALYTICS_NETWORKS',
- }
-
- ferm::service{ 'spark-worker-rpc':
- proto => 'tcp',
- port => '7078',
- srange => '$ANALYTICS_NETWORKS',
- }
-}
diff --git a/manifests/role/analytics/sqoop.pp b/manifests/role/analytics/sqoop.pp
deleted file mode 100644
index 7d35a00..0000000
--- a/manifests/role/analytics/sqoop.pp
+++ /dev/null
@@ -1,3 +0,0 @@
-class role::analytics::sqoop {
- include cdh::sqoop
-}
\ No newline at end of file
diff --git a/modules/camus/manifests/job.pp b/modules/camus/manifests/job.pp
index 0a0a6f4..223e66d 100644
--- a/modules/camus/manifests/job.pp
+++ b/modules/camus/manifests/job.pp
@@ -11,8 +11,8 @@
#
# [*script*]
# Path to camus wrapper script. This is currently deployed with the refinery
-# source. You must include role::analytics::refinery if you don't override
-# this to a custom path.
+# source. You must include role::analytics_cluster::refinery if you don't
+# override this to a custom path.
# See: https://github.com/wikimedia/analytics-refinery/blob/master/bin/camus
#
# [*user*]
--
To view, visit https://gerrit.wikimedia.org/r/270851
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I48f7d76255ca0bf04322cd4e6e306e1a60ebf374
Gerrit-PatchSet: 2
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits