Ottomata has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/186396

Change subject: First commit in a multi-commit effort to move 
misc/statistics.pp into modules/
......................................................................

First commit in a multi-commit effort to move misc/statistics.pp into modules/

T87450

Change-Id: I49140d85ddea99f5d4d9a3c71e60cf7fa57d49b6
---
M manifests/role/statistics.pp
M manifests/site.pp
A modules/statistics/README.md
A modules/statistics/files/datasets.wikimedia.org
A modules/statistics/manifests/aggregator.pp
A modules/statistics/manifests/compute.pp
A modules/statistics/manifests/dataset_mount.pp
A modules/statistics/manifests/init.pp
A modules/statistics/manifests/password.pp
A modules/statistics/manifests/rsync/eventlogging.pp
A modules/statistics/manifests/rsync/webrequest.pp
A modules/statistics/manifests/rsync_job.pp
A modules/statistics/manifests/rsyncd.pp
A modules/statistics/manifests/sites/datasets.pp
A modules/statistics/manifests/sites/metrics.pp
A modules/statistics/manifests/sites/reportcard.pp
A modules/statistics/manifests/sites/stats.pp
A modules/statistics/manifests/user.pp
A modules/statistics/manifests/web.pp
A modules/statistics/manifests/wikistats.pp
A modules/statistics/templates/metrics.wikimedia.org.erb
21 files changed, 933 insertions(+), 13 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/96/186396/1

diff --git a/manifests/role/statistics.pp b/manifests/role/statistics.pp
index cdc5948..5e9dcd9 100644
--- a/manifests/role/statistics.pp
+++ b/manifests/role/statistics.pp
@@ -1,15 +1,15 @@
 # statistics servers (per ezachte - RT 2162)
 
 class role::statistics {
-    include misc::statistics::user
-    include misc::statistics::base
-
-    package { 'emacs23':
-        ensure => 'installed',
-    }
-
-    include role::backup::host
-    backup::set { 'home' : }
+    # include misc::statistics::user
+    # include misc::statistics::base
+    #
+    # package { 'emacs23':
+    #     ensure => 'installed',
+    # }
+    #
+    # include role::backup::host
+    # backup::set { 'home' : }
 }
 
 class role::statistics::cruncher inherits role::statistics {
@@ -92,3 +92,122 @@
     # backup eventlogging logs
     backup::set { 'a-eventlogging' : }
 }
+
+
+
+
+# ------------------------------------------------------------ #
+# The following role classes have commented out includes.
+# These will be uncommented piecemeal while the above roles
+# have their includes removed and are deprecated.
+
+
+# == Class role::statistics::module
+# Temp role to use the new statistics module.
+# The following roles will replace the above ones.
+# When this happens the '::module' part of the class
+# names will be removed.
+class role::statistics::module {
+    # Manually set a list of statistics servers.
+    $statistics_servers = [
+        'stat1001.eqiad.wmnet',
+        'stat1002.eqiad.wmnet',
+        'stat1003.eqiad.wmnet',
+        'analytics1027.eqiad.wmnet',
+    ]
+
+    # we are attempting to stop using /a and to start using
+    # /srv instead.  stat1002 still uses
+    # /a by default.  stat1001 and stat1003 use /srv.
+    $working_path = $::hostname ? {
+        'stat1001' => '/srv',
+        'stat1003' => '/srv',
+        default    => '/a',
+    }
+
+    class { 'statistics':
+        servers      => $statistics_servers,
+        working_path => $working_path,
+    }
+}
+
+class role::statistics::module::cruncher inherits role::statistics::module {
+    system::role { 'role::statistics::cruncher':
+        description => 'Statistics general compute node (non private data)'
+    }
+
+    include role::backup::host
+    backup::set { 'home' : }
+    #
+    # # include stuff common to statistics compute nodes
+    # include statistics::server::compute
+    #
+    # # Aaron Halfaker (halfak) wants MongoDB for his project.
+    # class { 'mongodb':
+    #     dbpath  => "${::statistics::working_path}/mongodb",
+    # }
+    #
+    # # rsync logs from logging hosts
+    # include statistics::rsync::eventlogging
+    #
+    #
+    # # TODO:  Move geowiki into its own module:
+    # # geowiki: bringing data from production slave db to research db
+    # include misc::statistics::geowiki::jobs::data
+    # # geowiki: generate limn files from research db and push them
+    # include misc::statistics::geowiki::jobs::limn
+    # # geowiki: monitors the geowiki files of http://gp.wmflabs.org/
+    # include misc::statistics::geowiki::jobs::monitoring
+}
+
+class role::statistics::module::private inherits role::statistics::module {
+    system::role { 'role::statistics::private':
+        description => 'Statistics private data host and general compute node'
+    }
+
+    include role::backup::host
+    backup::set { 'home' : }
+
+    # # include stuff common to statistics compute nodes
+    # include statistics::server::compute
+    #
+    # # wikistats code is run here to
+    # # generate stats.wikimedia.org data
+    # include statistics::wikistats
+    #
+    # # rsync logs from logging hosts
+    # include statistics::rsync::webrequest
+    #
+    # # eventlogging logs are not private, but they
+    # # are here for convenience
+    # include statistics::rsync::eventlogging
+    #
+    # # backup eventlogging logs
+    # backup::set { 'a-eventlogging' : }
+    #
+    # # kafkatee is useful here for adhoc processing of kafkadata
+    # require_package('kafkatee')
+    #
+    # # aggregating hourly webstatscollector project count files into
+    # # daily per site csvs.
+    # # Although it is in the “private” role, the dataset actually isn't
+    # # private. We just keep it here to spare adding a separate role.
+    # include misc::statistics::aggregator
+}
+
+
+class role::statistics::module::web inherits role::statistics::module {
+    system::role { 'role::statistics::web':
+        description => 'Statistics web server node'
+    }
+
+    # # include stuff common to statistics webserver nodes.
+    # include statistics::web
+    #
+    # # include statistics web sites
+    # include statistics::sites::datasets
+    # include statistics::sites::metrics
+    # include statistics::sites::reportcard
+    # include statistics::sites::stats
+}
+
diff --git a/manifests/site.pp b/manifests/site.pp
index 40ec439..6bfd010 100644
--- a/manifests/site.pp
+++ b/manifests/site.pp
@@ -2241,6 +2241,9 @@
 node 'stat1001.eqiad.wmnet' {
     include standard
     include role::statistics::www
+    # role::statistics::www will be replaced with the following role
+    include role::statistics::module::web
+
     include role::abacist
     class { 'admin': groups => ['statistics-web-users'] }
 }
@@ -2265,6 +2268,8 @@
     # include classes needed for storing and crunching
     # private data on stat1002.
     include role::statistics::private
+    # role::statistics::private will be replaced with the following role
+    include role::statistics::module::private
 
     # Make sure refinery happens before analytics::clients,
     # so that the hive role can properly configure Hive's
@@ -2296,17 +2301,18 @@
 node 'stat1003.eqiad.wmnet' {
     include standard
 
-    # stat1003 has a public IP and should be pretty
-    # well firewalled off.  If it needs a specific
-    # service opened up, this will be done in
-    # statistics classes.
     # NOTE: This will be moved to another class
     # someday, probably standard.
     class { 'base::firewall': }
 
     include role::statistics::cruncher
+    # role::statistics::cruncher will be replaced with the following role
+    include role::statistics::module::cruncher
 
+
+    # TODO: Find out if we still need cron_blog_pageviews
     include misc::statistics::cron_blog_pageviews
+
     include misc::statistics::limn::data::jobs
     include misc::statistics::researchdb_password
 
diff --git a/modules/statistics/README.md b/modules/statistics/README.md
new file mode 100644
index 0000000..57cc6dc
--- /dev/null
+++ b/modules/statistics/README.md
@@ -0,0 +1,10 @@
+This module is mostly a copy/paste of classes from the old
+manifests/misc/statistics.pp file.  It has not been refactored.
+The misc contents have been moved here in order to satisfy
+the requirement of moving all non site.pp manifests
+out of the root manifests and into modules.
+
+You will probably want to include either statistics::web or
+statistics::compute, and then include any other
+specific classes on your nodes selectively.
+
diff --git a/modules/statistics/files/datasets.wikimedia.org 
b/modules/statistics/files/datasets.wikimedia.org
new file mode 100644
index 0000000..7c98969
--- /dev/null
+++ b/modules/statistics/files/datasets.wikimedia.org
@@ -0,0 +1,24 @@
+NameVirtualHost *:80
+<VirtualHost *:80>
+  ServerName datasets.wikimedia.org
+
+  DocumentRoot /srv/datasets.wikimedia.org
+
+  <Directory /srv/datasets.wikimedia.org >
+    Options Indexes FollowSymLinks MultiViews
+    AllowOverride None
+    Order allow,deny
+    allow from all
+  </Directory>
+
+  LogLevel warn
+  ErrorLog /var/log/apache2/datasets_error.log
+  CustomLog /var/log/apache2/datasets_access.log combined
+  ServerSignature Off
+</VirtualHost>
+
+<VirtualHost *:80>
+    ServerName  stat1001.wikimedia.org
+    RewriteEngine On
+    RewriteRule ^(.*)$ http://datasets.wikimedia.org/$1 [R=301]
+</VirtualHost>
diff --git a/modules/statistics/manifests/aggregator.pp 
b/modules/statistics/manifests/aggregator.pp
new file mode 100644
index 0000000..717a1b6
--- /dev/null
+++ b/modules/statistics/manifests/aggregator.pp
@@ -0,0 +1,84 @@
+# == Class statistics::aggregator
+# Handles aggregation of pagecounts-all-sites projectcounts files
+# TODO: Should this be in its own module?
+#
+class statistics::aggregator {
+    Class['::statistics'] -> Class['::statistics::aggregator']
+
+    # This class uses the cdh::hadoop::mount in order to get
+    # data files out of HDFS.
+    Class['cdh::hadoop::mount'] -> Class['::statistics::aggregator']
+
+    $working_path     = "${::statistics::working_path}/aggregator"
+
+    $script_path      = "${working_path}/scripts"
+    $data_repo_path   = "${working_path}/data"
+    $data_path        = "${data_repo_path}/projectcounts"
+    $log_path         = "${working_path}/log"
+    # This should not be hardcoded.  Instead, one should be able to use
+    # $::cdh::hadoop::mount::mount_point to reference the user supplied
+    # parameter when the cdh::hadoop::mount class is evaluated.
+    # I am not sure why this is not working.
+    $hdfs_mount_point = '/mnt/hdfs'
+    $hdfs_source_path = 
"${hdfs_mount_point}/wmf/data/archive/pagecounts-all-sites"
+    $user             = $::statistics::user::username
+    $group            = $::statistics::user::username
+
+    file { $working_path:
+        ensure => 'directory',
+        owner  => $user,
+        group  => $group,
+        mode   => '0755'
+    }
+
+    git::clone { 'aggregator_code':
+        ensure    => 'latest',
+        directory => $script_path,
+        origin    => 
'https://gerrit.wikimedia.org/r/p/analytics/aggregator.git',
+        owner     => $user,
+        group     => $group,
+        mode      => '0755',
+        require  => File[$working_path],
+    }
+
+    git::clone { 'aggregator_data':
+        ensure    => 'latest',
+        directory => $data_repo_path,
+        origin    => 
'https://gerrit.wikimedia.org/r/p/analytics/aggregator/data.git',
+        owner     => $user,
+        group     => $group,
+        mode      => '0755',
+        require  => File[$working_path],
+    }
+
+    file { $log_path:
+        ensure  => 'directory',
+        owner   => $user,
+        group   => $group,
+        mode    => '0755',
+        require  => File[$working_path],
+
+    }
+
+    # Cron for doing the basic aggregation step itself
+    cron { 'aggregator projectcounts aggregate':
+        command => "${script_path}/bin/aggregate_projectcounts --source 
${hdfs_source_path} --target ${data_path} --first-date=`date --date='-8 day' 
+\\%Y-\\%m-\\%d` --last-date=`date --date='-1 day' +\\%Y-\\%m-\\%d` 
--push-target --log ${log_path}/`date +\\%Y-\\%m-\\%d--\\%H-\\%M-\\%S`.log",
+        user    => $user,
+        hour    => '13',
+        minute  => '0',
+        require => [
+            Git::Clone['aggregator_code'],
+            Git::Clone['aggregator_data'],
+            File[$log_path],
+        ],
+    }
+
+    # Cron for basic monitoring of the aggregated data
+    cron { 'aggregator projectcounts monitor':
+        command => "${script_path}/bin/check_validity_aggregated_projectcounts 
--data ${data_path}",
+        user    => $user,
+        hour    => '13',
+        minute  => '45',
+        require => Cron['aggregator projectcounts aggregate'],
+    }
+}
\ No newline at end of file
diff --git a/modules/statistics/manifests/compute.pp 
b/modules/statistics/manifests/compute.pp
new file mode 100644
index 0000000..d04778d
--- /dev/null
+++ b/modules/statistics/manifests/compute.pp
@@ -0,0 +1,121 @@
+# == Class statistics::compute
+# Class containing common stuff for a statistics compute node.
+#
+class statistics::compute {
+    Class['::statistics'] -> Class['::statistics::compute']
+
+    # include mysql module base class to install mysql client
+    include mysql
+    include geoip
+    include statistics::dataset_mount
+
+    include misc::udp2log::udp_filter
+
+    require_package('nodejs')
+    require_package('openjdk-7-jdk')
+
+    package { [
+        'emacs23',
+        'mc',
+        'zip',
+        'p7zip',
+        'p7zip-full',
+        'subversion',
+        'mercurial',
+        'tofrodos',
+        'git-review',
+        'imagemagick',
+        # halfak wants make to manage dependencies
+        'make',
+        # for checking up on eventlogging
+        'zpubsub',
+        # libwww-perl for wikistats stuff
+        'libwww-perl',
+        'php5-cli',
+        'php5-mysql',
+        'sqlite3', # For storing and interacting with intermediate results
+        'libgdal1-dev', # Requested by lzia for rgdal
+        'libproj-dev', # Requested by lzia for rgdal
+        'libbz2-dev', # for compiling some python libs.  RT 8278
+        'libboost-regex-dev',  # Ironholds wants these
+        'libboost-system-dev',
+        'libyaml-cpp0.3',
+        'libyaml-cpp0.3-dev',
+        'libgoogle-glog-dev',
+        'libboost-iostreams-dev',
+        'libmaxminddb-dev',
+        'build-essential', # Requested by halfak to install SciPy
+    ]:
+        ensure => 'latest',
+    }
+
+    # Python packages
+    package { [
+        'python-geoip',
+        'libapache2-mod-python',
+        'python-django',
+        'python-mysqldb',
+        'python-yaml',
+        'python-dateutil',
+        'python-numpy',
+        'python-scipy',
+        'python-boto',      # Amazon S3 access (needed to get zero sms logs)
+        'python-pandas',    # Pivot tables processing
+        'python-requests',  # Simple lib to make API calls
+        'python-unidecode', # Unicode simplification - converts everything to 
latin set
+        'python-pygeoip',   # For geo-encoding IP addresses
+        'python-ua-parser', # For parsing User Agents
+        'python-matplotlib',  # For generating plots of data
+        'python-netaddr',
+        'python-virtualenv', # T84378
+        # Aaron Halfaker (halfak) wants python{,3}-dev environments for module 
oursql
+        'python-dev',  # RT 6561
+        'python3-dev', # RT 6561
+    ]:
+        ensure => 'installed',
+    }
+
+    # Plotting packages
+    package { [
+        'ploticus',
+        'libploticus0',
+        'r-base',
+        'r-cran-rmysql',
+        'libcairo2',
+        'libcairo2-dev',
+        'libxt-dev'
+    ]:
+        ensure => installed,
+    }
+
+    # clones mediawiki core at $working_path/mediawiki/core
+    # and ensures that it is at the latest revision.
+    # RT 2162
+    $statistics_mediawiki_directory = 
"${::statistics::working_path}/mediawiki/core"
+
+    git::clone { 'statistics_mediawiki':
+        ensure    => 'latest',
+        directory => $statistics_mediawiki_directory,
+        origin    => 'https://gerrit.wikimedia.org/r/p/mediawiki/core.git',
+        owner     => 'mwdeploy',
+        group     => 'wikidev',
+    }
+
+    include passwords::mysql::research
+    # This file will render at
+    # /etc/mysql/conf.d/research-client.cnf.
+    mysql::config::client { 'research':
+        user  => $::passwords::mysql::research::user,
+        pass  => $::passwords::mysql::research::pass,
+        group => 'researchers',
+        mode  => '0440',
+    }
+    # This file will render at
+    # /etc/mysql/conf.d/stats-research-client.cnf.
+    mysql::config::client { 'stats-research':
+        user  => $::passwords::mysql::research::user,
+        pass  => $::passwords::mysql::research::pass,
+        group => $::statistics::user::username,
+        mode  => '0440',
+    }
+}
diff --git a/modules/statistics/manifests/dataset_mount.pp 
b/modules/statistics/manifests/dataset_mount.pp
new file mode 100644
index 0000000..c222c3b
--- /dev/null
+++ b/modules/statistics/manifests/dataset_mount.pp
@@ -0,0 +1,25 @@
+# == Class statistics::dataset_mount
+# Mounts /data from dataset1001 server.
+# xmldumps and other misc files needed
+# for generating statistics are here.
+#
+# NOTE: This class has nothing to do with the
+# datasets site hosted at 'datasets.wikimedia.org'.
+#
+class statistics::dataset_mount {
+    # need this for NFS mounts.
+    include nfs::common
+
+    file { '/mnt/data':
+        ensure => 'directory',
+    }
+
+    mount { '/mnt/data':
+        ensure  => 'mounted',
+        device  => '208.80.154.11:/data',
+        fstype  => 'nfs',
+        options => 
'ro,bg,tcp,rsize=8192,wsize=8192,timeo=14,intr,addr=208.80.154.11',
+        atboot  => true,
+        require => [File['/mnt/data'], Class['nfs::common']],
+    }
+}
\ No newline at end of file
diff --git a/modules/statistics/manifests/init.pp 
b/modules/statistics/manifests/init.pp
new file mode 100644
index 0000000..6877196
--- /dev/null
+++ b/modules/statistics/manifests/init.pp
@@ -0,0 +1,38 @@
+# == Class statistics
+# Base wrapper class for stat servers.
+# All stat servers should include this class.
+#
+# == Parameters
+#   $servers        - list of statistics servers.
+#                     These will be granted rsync read and
+#                     write access between each other.
+#   $working_path   - Base path for statistics data.
+#                     Default: /srv
+class statistics(
+    $servers,
+    $working_path = '/srv'
+) {
+    include statistics::user
+
+    file { $working_path:
+        ensure  => 'directory',
+        owner   => 'root',
+        group   => 'wikidev',
+        mode    => '0775',
+    }
+
+    if $working_path == '/srv' {
+        # symlink /a to /srv for backwards compatibility
+        file { '/a':
+            ensure => 'link',
+            target => '/srv',
+        }
+    }
+
+    # set up rsync modules for copying files
+    # on statistic servers in $working_path
+    class { 'statistics::rsyncd':
+        path        => $working_path,
+        hosts_allow => $servers,
+    }
+}
diff --git a/modules/statistics/manifests/password.pp 
b/modules/statistics/manifests/password.pp
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/modules/statistics/manifests/password.pp
diff --git a/modules/statistics/manifests/rsync/eventlogging.pp 
b/modules/statistics/manifests/rsync/eventlogging.pp
new file mode 100644
index 0000000..0444510
--- /dev/null
+++ b/modules/statistics/manifests/rsync/eventlogging.pp
@@ -0,0 +1,28 @@
+# == Class statistics::rsync::eventlogging
+#
+# Sets up daily cron jobs to rsync log files from remote
+# logging hosts to a local destination for further processing.
+#
+class statistics::rsync::eventlogging {
+    Class['::statistics'] -> Class['::statistics::rsync::eventlogging']
+    $working_path = $::statistics::working_path
+
+    # Any logs older than this will be pruned by
+    # the rsync_job define.
+    $retention_days = 90
+
+    file { "${working_path}/eventlogging":
+        ensure  => 'directory',
+        owner   => 'stats',
+        group   => 'wikidev',
+        mode    => '0775',
+    }
+
+    # eventlogging logs from vanadium
+    statistics::rsync_job { 'eventlogging':
+        source         => 'vanadium.eqiad.wmnet::eventlogging/archive/*.gz',
+        destination    => "${working_path}/eventlogging/archive",
+        retention_days => $retention_days,
+
+    }
+}
\ No newline at end of file
diff --git a/modules/statistics/manifests/rsync/webrequest.pp 
b/modules/statistics/manifests/rsync/webrequest.pp
new file mode 100644
index 0000000..ba82906
--- /dev/null
+++ b/modules/statistics/manifests/rsync/webrequest.pp
@@ -0,0 +1,86 @@
+# == Class statistics::rsync::webrequest
+#
+# Sets up daily cron jobs to rsync log files from remote
+# logging hosts to a local destination for further processing.
+#
+class statistics::rsync::webrequest {
+    Class['::statistics'] -> Class['::statistics::rsync::webrequest']
+    $working_path = $::statistics::working_path
+
+    # Make sure destination directories exist.
+    # Too bad I can't do this with recurse => true.
+    # See: https://projects.puppetlabs.com/issues/86
+    # for a much too long discussion on why I can't.
+    file { [
+        "${working_path}/aft",
+        "${working_path}/aft/archive",
+        "${working_path}/public-datasets",
+    ]:
+        ensure  => 'directory',
+        owner   => 'stats',
+        group   => 'wikidev',
+        mode    => '0775',
+    }
+
+    # Make sure destination directories exist.
+    # Too bad I can't do this with recurse => true.
+    # See: https://projects.puppetlabs.com/issues/86
+    # for a much too long discussion on why I can't.
+    file { [
+        "${working_path}/squid",
+        "${working_path}/squid/archive",
+        # Moving away from "squid" nomenclature for
+        # webrequest logs.  New generated log
+        # files will be rsynced into /a/log.
+        "${working_path}/log",
+        "${working_path}/log/webrequest",
+    ]:
+        ensure  => directory,
+        owner   => 'stats',
+        group   => 'wikidev',
+        mode    => '0755',
+    }
+
+    # wikipedia zero logs from oxygen
+    statistics::rsync_job { 'wikipedia_zero':
+        source      => 
'oxygen.wikimedia.org::udp2log/webrequest/archive/zero*.gz',
+        destination => "${working_path}/squid/archive/zero",
+    }
+
+    # API logs from erbium
+    statistics::rsync_job { 'api':
+        source      => 
'erbium.eqiad.wmnet::udp2log/webrequest/archive/api-usage*.gz',
+        destination => "${working_path}/squid/archive/api",
+    }
+
+    # sampled-1000 logs from erbium
+    statistics::rsync_job { 'sampled_1000':
+        source      => 
'erbium.eqiad.wmnet::udp2log/webrequest/archive/sampled-1000*.gz',
+        destination => "${working_path}/squid/archive/sampled",
+    }
+
+    # glam_nara logs from erbium
+    statistics::rsync_job { 'glam_nara':
+        source      => 
'erbium.eqiad.wmnet::udp2log/webrequest/archive/glam_nara*.gz',
+        destination => "${working_path}/squid/archive/glam_nara",
+    }
+
+    # edit logs from oxygen
+    statistics::rsync_job { 'edits':
+        source      => 
'oxygen.wikimedia.org::udp2log/webrequest/archive/edits*.gz',
+        destination => "${working_path}/squid/archive/edits",
+    }
+
+    # mobile logs from oxygen
+    statistics::rsync_job { 'mobile':
+        source      => 
'oxygen.wikimedia.org::udp2log/webrequest/archive/mobile*.gz',
+        destination => "${working_path}/squid/archive/mobile",
+    }
+
+    # all webrequest archive logs from hdfs
+    statistics::rsync_job { 'hdfs_webrequest_archive':
+        source         => 'stat1002.eqiad.wmnet::hdfs-archive/webrequest/*',
+        destination    => "${working_path}/log/webrequest/archive",
+        retention_days => 90, # Pruning after 90 days as those logs contain 
private data.
+    }
+}
diff --git a/modules/statistics/manifests/rsync_job.pp 
b/modules/statistics/manifests/rsync_job.pp
new file mode 100644
index 0000000..8d92c6a
--- /dev/null
+++ b/modules/statistics/manifests/rsync_job.pp
@@ -0,0 +1,47 @@
+# == Define: statistics::rsync_job
+#
+# Sets up a daily cron job to rsync from $source to $destination
+# as the $misc::statistics::user::username user.  This requires
+# that the $misc::statistics::user::username user is installed
+# on both $source and $destination hosts.
+#
+# == Parameters:
+#    source         - rsync source argument (including hostname)
+#    destination    - rsync destination argument
+#    retention_days - If set, a cron will be installed to remove files older 
than this many days from $destination.
+#
+define statistics::rsync_job($source, $destination, $retention_days = undef) {
+    Class['::statistics'] -> Statistics::Rsync_job[$name]
+    require statistics::user
+
+    # ensure that the destination directory exists
+    file { $destination:
+        ensure  => 'directory',
+        owner   => $::statistics::user::username,
+        group   => 'wikidev',
+        mode    => '0755',
+    }
+
+    # Create a daily cron job to rsync $source to $destination.
+    # This requires that the $misc::statistics::user::username
+    # user is installed on the source host.
+    cron { "rsync_${name}_logs":
+        command => "/usr/bin/rsync -rt --perms --chmod=g-w ${source} 
${destination}/",
+        user    => $::statistics::user::username,
+        hour    => 8,
+        minute  => 0,
+    }
+
+    $prune_old_logs_ensure = $retention_days ? {
+        undef   => 'absent',
+        default => 'present',
+    }
+
+    cron { "prune_old_${name}_logs":
+        ensure  => $prune_old_logs_ensure,
+        command => "/usr/bin/find ${destination} -ctime +${retention_days} 
-exec rm {} \\;",
+        user    => $::statistics::user::username,
+        minute  => 0,
+        hour    => 9,
+    }
+}
diff --git a/modules/statistics/manifests/rsyncd.pp 
b/modules/statistics/manifests/rsyncd.pp
new file mode 100644
index 0000000..34a4ed1
--- /dev/null
+++ b/modules/statistics/manifests/rsyncd.pp
@@ -0,0 +1,57 @@
+# == Class statistics::rsyncd
+# Sets up rsyncd and common modules
+# for statistic servers.  Currently
+# this is read/write between statistic
+# servers in /srv or /a.
+#
+# == Parameters
+#   path        - string.  Base path to allow rsync access.  Should probably 
be /srv or /a.
+#   hosts_allow - array.   Hosts to grant rsync access.
+class statistics::rsyncd($path, $hosts_allow)
+{
+    Class['::statistics'] -> Class['statistics::rsyncd']
+
+    # this uses modules/rsync to
+    # set up an rsync daemon service
+    include rsync::server
+
+    # Set up an rsync module
+    # (in /etc/rsyncd.conf) for /srv.
+    rsync::server::module { 'srv':
+        path        => $path,
+        read_only   => 'no',
+        list        => 'yes',
+        hosts_allow => $hosts_allow,
+    }
+
+    # Set up an rsync module for /a if
+    # we are using /srv as the working path on this node.
+    # This is for backwards compatibility.
+    if ($::statistics::working_path == '/srv') {
+        rsync::server::module { 'a':
+            path        => $path,
+            read_only   => 'no',
+            list        => 'yes',
+            hosts_allow => $hosts_allow,
+        }
+    }
+
+    # Set up an rsync module
+    # (in /etc/rsync.conf) for /var/www.
+    # This will allow $hosts_allow to host public data files
+    # from the default Apache VirtualHost.
+    rsync::server::module { 'www':
+        path        => '/var/www',
+        read_only   => 'no',
+        list        => 'yes',
+        hosts_allow => $hosts_allow,
+    }
+
+    # Allow rsyncd traffic from internal networks.
+    # and stat* public IPs.
+    ferm::service { 'rsync':
+        proto  => 'tcp',
+        port   => '873',
+        srange => '($INTERNAL)',
+    }
+}
\ No newline at end of file
diff --git a/modules/statistics/manifests/sites/datasets.pp 
b/modules/statistics/manifests/sites/datasets.pp
new file mode 100644
index 0000000..a77b6d6
--- /dev/null
+++ b/modules/statistics/manifests/sites/datasets.pp
@@ -0,0 +1,60 @@
+# == Class statistics::sites::datasets
+# datasets.wikimedia.org
+#
+# TODO: Parameterize rsync source hostnames
+#
+# NOTE: This class has nothing to do with the
+# dataset1001 datasets_mount.
+#
+class statistics::sites::datasets {
+    Class['::statistics::web'] -> Class['::statistics::sites::datasets']
+
+    $working_path = $::statistics::working_path
+    file { [
+        "${working_path}/public-datasets",
+        "${working_path}/aggregate-datasets"
+    ]:
+        ensure => 'directory',
+        owner  => 'root',
+        group  => 'www-data',
+        mode   => '0640',
+    }
+
+    # symlink /var/www/public-datasets to $working_path/public-datasets
+    file { '/var/www/public-datasets':
+        ensure => 'link',
+        target => "${working_path}/public-datasets",
+        owner  => 'root',
+        group  => 'www-data',
+        mode   => '0640',
+    }
+
+    # symlink /var/www/aggregate-datasets to $working_path/aggregate-datasets
+    file { '/var/www/aggregate-datasets':
+        ensure => 'link',
+        target => "${working_path}/aggregate-datasets",
+        owner  => 'root',
+        group  => 'www-data',
+        mode   => '0640',
+    }
+
+    # rsync from stat1003:/srv/public-datasets to $working_path/public-datasets
+    cron { 'rsync public datasets':
+        command => "/usr/bin/rsync -rt --delete 
stat1003.eqiad.wmnet::srv/public-datasets/* ${working_path}/public-datasets/",
+        require => File["${working_path}/public-datasets"],
+        user    => 'root',
+        minute  => '*/30',
+    }
+
+    # rsync from stat1002:/srv/aggregate-datasets to 
$working_path/aggregate-datasets
+    cron { 'rsync aggregate datasets from stat1002':
+        command => "/usr/bin/rsync -rt --delete 
stat1002.eqiad.wmnet::srv/aggregate-datasets/* 
${working_path}/aggregate-datasets/",
+        require => File["${working_path}/aggregate-datasets"],
+        user    => 'root',
+        minute  => '*/30',
+    }
+
+    apache::site { 'datasets':
+        source => 'puppet:///modules/statistics/datasets.wikimedia.org',
+    }
+}
diff --git a/modules/statistics/manifests/sites/metrics.pp 
b/modules/statistics/manifests/sites/metrics.pp
new file mode 100644
index 0000000..7320fd9
--- /dev/null
+++ b/modules/statistics/manifests/sites/metrics.pp
@@ -0,0 +1,21 @@
+# == Class statistics::sites::metrics
+# metrics.wikimedia.org and metrics-api.wikimedia.org
+# They should just redirect to Wikimetrics
+#
+class statistics::sites::metrics {
+    Class['::statistics::web'] -> Class['::statistics::sites::metrics']
+    include ::apache::mod::alias
+
+    $site_name       = 'metrics.wikimedia.org'
+    $redirect_target = 'https://metrics.wmflabs.org/'
+
+    # Set up the VirtualHost
+    apache::site { $site_name:
+        content => template("modules/statistics/metrics.wikimedia.org.erb"),
+    }
+
+    # make access and error log for metrics-api readable by wikidev group
+    file { ['/var/log/apache2/access.metrics.log', 
'/var/log/apache2/error.metrics.log']:
+        group   => 'wikidev',
+    }
+}
diff --git a/modules/statistics/manifests/sites/reportcard.pp 
b/modules/statistics/manifests/sites/reportcard.pp
new file mode 100644
index 0000000..cc8133a
--- /dev/null
+++ b/modules/statistics/manifests/sites/reportcard.pp
@@ -0,0 +1,7 @@
+# == Class statistics::sites::reportcard
+class statistics::sites::reportcard {
+    Class['::statistics::web'] -> Class['::statistics::sites::reportcard']
+
+    misc::limn::instance { 'reportcard': }
+}
+
diff --git a/modules/statistics/manifests/sites/stats.pp 
b/modules/statistics/manifests/sites/stats.pp
new file mode 100644
index 0000000..b3bcb05
--- /dev/null
+++ b/modules/statistics/manifests/sites/stats.pp
@@ -0,0 +1,51 @@
+# == Class statistics::sites::stats
+# Sets up the stats.wikimedia.org virtual host, including htpasswd
+# files for the password-protected and private geowiki areas.
+class statistics::sites::stats {
+    # The common statistics webserver setup must be applied before
+    # this site.
+    Class['::statistics::web'] -> Class['::statistics::sites::stats']
+
+    # TODO!  geowiki module???
+    # require misc::statistics::geowiki::data::private
+
+    $site_name                     = 'stats.wikimedia.org'
+    $docroot                       = "/srv/${site_name}/htdocs"
+    $geowiki_private_directory     = "${docroot}/geowiki-private"
+    $geowiki_private_htpasswd_file = '/etc/apache2/htpasswd.stats-geowiki'
+
+    # add htpasswd file for stats.wikimedia.org
+    file { '/etc/apache2/htpasswd.stats':
+        owner   => 'root',
+        group   => 'root',
+        mode    => '0644',
+        source  => 'puppet:///private/apache/htpasswd.stats',
+    }
+
+    # add htpasswd file for private geowiki data
+    # (group www-data / mode 0640 so apache can read it but others cannot)
+    file { $geowiki_private_htpasswd_file:
+        owner   => 'root',
+        group   => 'www-data',
+        mode    => '0640',
+        source  => 'puppet:///private/apache/htpasswd.stats-geowiki',
+    }
+
+    # TODO:
+    # # link geowiki checkout from docroot
+    # file { $geowiki_private_directory:
+    #     ensure  => 'link',
+    #     target  => "${misc::statistics::geowiki::data::private::geowiki_private_data_path}/datafiles",
+    #     owner   => 'root',
+    #     group   => 'www-data',
+    #     mode    => '0750',
+    # }
+
+    apache::site { $site_name:
+        content => template("apache/sites/${site_name}.erb"),
+    }
+
+    # SSL-enabled ports.conf so the vhost can listen on 443.
+    file { '/etc/apache2/ports.conf':
+        ensure  => 'present',
+        mode    => '0644',
+        owner   => 'root',
+        group   => 'root',
+        source  => 'puppet:///files/apache/ports.conf.ssl',
+    }
+}
diff --git a/modules/statistics/manifests/user.pp 
b/modules/statistics/manifests/user.pp
new file mode 100644
index 0000000..f63f696
--- /dev/null
+++ b/modules/statistics/manifests/user.pp
@@ -0,0 +1,49 @@
+# == Class statistics::user
+# Creates the 'stats' system user and group used to run statistics
+# jobs, and configures its git/gerrit credentials so automated jobs
+# can push over authenticated http.
+class statistics::user {
+    include passwords::statistics::user
+
+    $username = 'stats'
+    $homedir  = "/var/lib/${username}"
+
+    group { $username:
+        ensure => present,
+        name   => $username,
+        system => true,
+    }
+
+    user { $username:
+        home       => $homedir,
+        groups     => ['wikidev'],
+        shell      => '/bin/bash',
+        managehome => true,
+        system     => true,
+    }
+
+    git::userconfig { 'stats':
+        homedir  => $homedir,
+        settings => {
+            'user' => {
+                'name'  => 'Statistics User',
+                # TODO: use a better email than this :(
+                'email' => '[email protected]',
+            },
+            # Enable automated git/gerrit authentication via http
+            # by using .git-credential file store.
+            'credential' => {
+                'helper' => 'store',
+            },
+        },
+        require  => User[$username],
+    }
+
+    # Render the .git-credentials file with the stats user's http password.
+    # This password is set from https://gerrit.wikimedia.org/r/#/settings/http-password.
+    # To log into gerrit as the stats user, check the /srv/password/stats-user file
+    # for LDAP login creds.
+    file { "${homedir}/.git-credentials":
+        mode    => '0600',
+        owner   => $username,
+        group   => $username,
+        content => "https://${username}:${passwords::statistics::user::gerrit_http_password}@gerrit.wikimedia.org",
+        require => User[$username],
+    }
+}
diff --git a/modules/statistics/manifests/web.pp 
b/modules/statistics/manifests/web.pp
new file mode 100644
index 0000000..1219217
--- /dev/null
+++ b/modules/statistics/manifests/web.pp
@@ -0,0 +1,28 @@
+# == Class statistics::web
+# Common setup for a statistics webserver node.  Include this class
+# before including any class from statistics::sites.
+class statistics::web {
+    Class['::statistics'] -> Class['::statistics::web']
+
+    include webserver::apache
+    include ::apache::mod::rewrite
+    include ::apache::mod::proxy
+    include ::apache::mod::proxy_http
+    include ::apache::mod::headers
+
+    $ssl_settings = ssl_ciphersuite('apache-2.2', 'compat', '365')
+
+    # Let members of the wikidev group enter /var/log/apache2 for
+    # debugging.  This only opens up the directory itself; individual
+    # log files stay unreadable unless a site class explicitly
+    # creates them with a readable mode.
+    file { '/var/log/apache2':
+        ensure  => 'directory',
+        owner   => 'root',
+        group   => 'wikidev',
+        mode    => '0750',
+        require => Class['webserver::apache'],
+    }
+}
\ No newline at end of file
diff --git a/modules/statistics/manifests/wikistats.pp 
b/modules/statistics/manifests/wikistats.pp
new file mode 100644
index 0000000..5146e7d
--- /dev/null
+++ b/modules/statistics/manifests/wikistats.pp
@@ -0,0 +1,32 @@
+# == Class statistics::wikistats
+# wikistats configuration for generating
+# stats.wikimedia.org data.
+#
+# TODO: puppetize clone of wikistats?
+class statistics::wikistats {
+    Class['::statistics'] -> Class['::statistics::wikistats']
+
+    # Perl packages needed for wikistats
+    package { [
+        'libjson-xs-perl',
+        'libtemplate-perl',
+        'libnet-patricia-perl',
+        'libregexp-assemble-perl',
+    ]:
+        ensure => 'installed',
+    }
+    # this cron uses pigz to unzip squid archive files in parallel
+    package { 'pigz':
+        ensure => 'installed',
+    }
+
+    # generates the new mobile pageviews report
+    # and syncs the file PageViewsPerMonthAll.csv to stat1002
+    cron { 'new mobile pageviews report':
+        command  => "/bin/bash 
${::statistics::working_path}/wikistats_git/pageviews_reports/bin/stat1-cron-script.sh",
+        user     => 'stats',
+        monthday => 1,
+        hour     => 7,
+        minute   => 20,
+    }
+}
diff --git a/modules/statistics/templates/metrics.wikimedia.org.erb 
b/modules/statistics/templates/metrics.wikimedia.org.erb
new file mode 100644
index 0000000..9bac6be
--- /dev/null
+++ b/modules/statistics/templates/metrics.wikimedia.org.erb
@@ -0,0 +1,27 @@
+# Note: This file is managed by Puppet.
+<%
+# ERb template variables (set by class statistics::sites::metrics):
+#
+#   site_name       - primary ServerName of this vhost
+#   redirect_target - URL clients are permanently redirected to
+#
+-%>
+
+<VirtualHost *:80>
+  # <%= @site_name %>, and metrics-api.wikimedia.org are
+  # deprecated. The services it provided are now available through
+  # Wikimetrics.
+  #
+  # We keep the domain around only because old documentation and
+  # search engines still refer to it.
+
+  ServerName <%= @site_name %>
+  ServerAlias metrics-api.wikimedia.org
+  ServerAdmin [email protected]
+
+  Redirect permanent / <%= @redirect_target %>
+
+  ErrorLog /var/log/apache2/error.metrics.log
+  LogLevel warn
+  CustomLog /var/log/apache2/access.metrics.log combined
+</VirtualHost>

-- 
To view, visit https://gerrit.wikimedia.org/r/186396
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I49140d85ddea99f5d4d9a3c71e60cf7fa57d49b6
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to