Ottomata has uploaded a new change for review.
https://gerrit.wikimedia.org/r/186396
Change subject: First commit in a multi-commit effort to move
misc/statistics.pp into modules/
......................................................................
First commit in a multi-commit effort to move misc/statistics.pp into modules/
T87450
Change-Id: I49140d85ddea99f5d4d9a3c71e60cf7fa57d49b6
---
M manifests/role/statistics.pp
M manifests/site.pp
A modules/statistics/README.md
A modules/statistics/files/datasets.wikimedia.org
A modules/statistics/manifests/aggregator.pp
A modules/statistics/manifests/compute.pp
A modules/statistics/manifests/dataset_mount.pp
A modules/statistics/manifests/init.pp
A modules/statistics/manifests/password.pp
A modules/statistics/manifests/rsync/eventlogging.pp
A modules/statistics/manifests/rsync/webrequest.pp
A modules/statistics/manifests/rsync_job.pp
A modules/statistics/manifests/rsyncd.pp
A modules/statistics/manifests/sites/datasets.pp
A modules/statistics/manifests/sites/metrics.pp
A modules/statistics/manifests/sites/reportcard.pp
A modules/statistics/manifests/sites/stats.pp
A modules/statistics/manifests/user.pp
A modules/statistics/manifests/web.pp
A modules/statistics/manifests/wikistats.pp
A modules/statistics/templates/metrics.wikimedia.org.erb
21 files changed, 933 insertions(+), 13 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet
refs/changes/96/186396/1
diff --git a/manifests/role/statistics.pp b/manifests/role/statistics.pp
index cdc5948..5e9dcd9 100644
--- a/manifests/role/statistics.pp
+++ b/manifests/role/statistics.pp
@@ -1,15 +1,15 @@
# statistics servers (per ezachte - RT 2162)
class role::statistics {
- include misc::statistics::user
- include misc::statistics::base
-
- package { 'emacs23':
- ensure => 'installed',
- }
-
- include role::backup::host
- backup::set { 'home' : }
+ # include misc::statistics::user
+ # include misc::statistics::base
+ #
+ # package { 'emacs23':
+ # ensure => 'installed',
+ # }
+ #
+ # include role::backup::host
+ # backup::set { 'home' : }
}
class role::statistics::cruncher inherits role::statistics {
@@ -92,3 +92,122 @@
# backup eventlogging logs
backup::set { 'a-eventlogging' : }
}
+
+
+
+
+# ------------------------------------------------------------ #
+# The following role classes have commented out includes.
+# These will be uncommented piecemeal while the above role
+# have includes removed and are deprecated.
+
+
+# == Class role::statistics::module
+# Temp role to use the new statistics module.
+# The following roles will replace the above ones.
+# When this happens the '::module' part of the class
+# names will be removed.
+class role::statistics::module {
+ # Manually set a list of statistics servers.
+ $statistics_servers = [
+ 'stat1001.eqiad.wmnet',
+ 'stat1002.eqiad.wmnet',
+ 'stat1003.eqiad.wmnet',
+ 'analytics1027.eqiad.wmnet',
+ ]
+
+    # We are attempting to stop using /a and to start using
+    # /srv instead. stat1002 still uses /a by default;
+    # stat1001 and stat1003 use /srv.
+ $working_path = $::hostname ? {
+ 'stat1001' => '/srv',
+ 'stat1003' => '/srv',
+ default => '/a',
+ }
+
+ class { 'statistics':
+ servers => $statistics_servers,
+ working_path => $working_path,
+ }
+}
+
+class role::statistics::module::cruncher inherits role::statistics::module {
+ system::role { 'role::statistics::cruncher':
+ description => 'Statistics general compute node (non private data)'
+ }
+
+ include role::backup::host
+ backup::set { 'home' : }
+ #
+ # # include stuff common to statistics compute nodes
+ # include statistics::server::compute
+ #
+ # # Aaron Halfaker (halfak) wants MongoDB for his project.
+ # class { 'mongodb':
+ # dbpath => "${::statistics::working_path}/mongodb",
+ # }
+ #
+ # # rsync logs from logging hosts
+ # include statistics::rsync::eventlogging
+ #
+ #
+ # # TODO: Move geowiki into its own module:
+ # # geowiki: bringing data from production slave db to research db
+ # include misc::statistics::geowiki::jobs::data
+ # # geowiki: generate limn files from research db and push them
+ # include misc::statistics::geowiki::jobs::limn
+ # # geowiki: monitors the geowiki files of http://gp.wmflabs.org/
+ # include misc::statistics::geowiki::jobs::monitoring
+}
+
+class role::statistics::module::private inherits role::statistics::module {
+ system::role { 'role::statistics::private':
+ description => 'Statistics private data host and general compute node'
+ }
+
+ include role::backup::host
+ backup::set { 'home' : }
+
+ # # include stuff common to statistics compute nodes
+ # include statistics::server::compute
+ #
+ # # wikistats code is run here to
+ # # generate stats.wikimedia.org data
+ # include statistics::wikistats
+ #
+ # # rsync logs from logging hosts
+ # include statistics::rsync::webrequest
+ #
+ # # eventlogging logs are not private, but they
+ # # are here for convenience
+ # include statistics::rsync::eventlogging
+ #
+ # # backup eventlogging logs
+ # backup::set { 'a-eventlogging' : }
+ #
+ # # kafkatee is useful here for adhoc processing of kafkadata
+ # require_package('kafkatee')
+ #
+ # # aggregating hourly webstatscollector project count files into
+ # # daily per site csvs.
+ # # Although it is in the “private” role, the dataset actually isn't
+ # # private. We just keep it here to spare adding a separate role.
+ # include misc::statistics::aggregator
+}
+
+
+class role::statistics::module::web inherits role::statistics::module {
+ system::role { 'role::statistics::web':
+        description => 'Statistics web server node'
+ }
+
+ # # include stuff common to statistics webserver nodes.
+ # include statistics::web
+ #
+ # # include statistics web sites
+ # include statistics::sites::datasets
+ # include statistics::sites::metrics
+ # include statistics::sites::reportcard
+ # include statistics::sites::stats
+}
+
diff --git a/manifests/site.pp b/manifests/site.pp
index 40ec439..6bfd010 100644
--- a/manifests/site.pp
+++ b/manifests/site.pp
@@ -2241,6 +2241,9 @@
node 'stat1001.eqiad.wmnet' {
include standard
include role::statistics::www
+ # role::statistics::www will be replaced with the following role
+ include role::statistics::module::web
+
include role::abacist
class { 'admin': groups => ['statistics-web-users'] }
}
@@ -2265,6 +2268,8 @@
# include classes needed for storing and crunching
# private data on stat1002.
include role::statistics::private
+ # role::statistics::private will be replaced with the following role
+ include role::statistics::module::private
# Make sure refinery happens before analytics::clients,
# so that the hive role can properly configure Hive's
@@ -2296,17 +2301,18 @@
node 'stat1003.eqiad.wmnet' {
include standard
- # stat1003 has a public IP and should be pretty
- # well firewalled off. If it needs a specific
- # service opened up, this will be done in
- # statistics classes.
# NOTE: This will be moved to another class
# someday, probably standard.
class { 'base::firewall': }
include role::statistics::cruncher
+    # role::statistics::cruncher will be replaced with the following role
+    include role::statistics::module::cruncher
+
+ # TODO: Find out if we still need cron_blog_pageviews
include misc::statistics::cron_blog_pageviews
+
include misc::statistics::limn::data::jobs
include misc::statistics::researchdb_password
diff --git a/modules/statistics/README.md b/modules/statistics/README.md
new file mode 100644
index 0000000..57cc6dc
--- /dev/null
+++ b/modules/statistics/README.md
@@ -0,0 +1,10 @@
+This module is mostly a copy/paste of classes from the old
+manifests/misc/statistics.pp file. It has not been refactored.
+The misc contents have been moved here in order to satisfy
+the requirement of moving all non site.pp manifests
+out of the root manifests and into modules.
+
+You will probably want to include either statistics::web or
+statistics::compute, and then selectively include any other
+specific classes on your nodes.
+
diff --git a/modules/statistics/files/datasets.wikimedia.org
b/modules/statistics/files/datasets.wikimedia.org
new file mode 100644
index 0000000..7c98969
--- /dev/null
+++ b/modules/statistics/files/datasets.wikimedia.org
@@ -0,0 +1,24 @@
+NameVirtualHost *:80
+<VirtualHost *:80>
+ ServerName datasets.wikimedia.org
+
+ DocumentRoot /srv/datasets.wikimedia.org
+
+ <Directory /srv/datasets.wikimedia.org >
+ Options Indexes FollowSymLinks MultiViews
+ AllowOverride None
+ Order allow,deny
+ allow from all
+ </Directory>
+
+ LogLevel warn
+ ErrorLog /var/log/apache2/datasets_error.log
+ CustomLog /var/log/apache2/datasets_access.log combined
+ ServerSignature Off
+</VirtualHost>
+
+<VirtualHost *:80>
+ ServerName stat1001.wikimedia.org
+ RewriteEngine On
+ RewriteRule ^(.*)$ http://datasets.wikimedia.org/$1 [R=301]
+</VirtualHost>
diff --git a/modules/statistics/manifests/aggregator.pp
b/modules/statistics/manifests/aggregator.pp
new file mode 100644
index 0000000..717a1b6
--- /dev/null
+++ b/modules/statistics/manifests/aggregator.pp
@@ -0,0 +1,84 @@
+# == Class statistics::aggregator
+# Handles aggregation of pagecounts-all-sites projectcounts files
+# TODO: Should this be in its own module?
+#
+class statistics::aggregator {
+    Class['::statistics'] -> Class['::statistics::aggregator']
+
+ # This class uses the cdh::hadoop::mount in order to get
+ # data files out of HDFS.
+ Class['cdh::hadoop::mount'] -> Class['::statistics::aggregator']
+
+ $working_path = "${::statistics::working_path}/aggregator"
+
+ $script_path = "${working_path}/scripts"
+ $data_repo_path = "${working_path}/data"
+ $data_path = "${data_repo_path}/projectcounts"
+ $log_path = "${working_path}/log"
+ # This should not be hardcoded. Instead, one should be able to use
+ # $::cdh::hadoop::mount::mount_point to reference the user supplied
+ # parameter when the cdh::hadoop::mount class is evaluated.
+ # I am not sure why this is not working.
+ $hdfs_mount_point = '/mnt/hdfs'
+ $hdfs_source_path =
"${hdfs_mount_point}/wmf/data/archive/pagecounts-all-sites"
+ $user = $::statistics::user::username
+ $group = $::statistics::user::username
+
+ file { $working_path:
+ ensure => 'directory',
+ owner => $user,
+ group => $group,
+ mode => '0755'
+ }
+
+ git::clone { 'aggregator_code':
+ ensure => 'latest',
+ directory => $script_path,
+ origin =>
'https://gerrit.wikimedia.org/r/p/analytics/aggregator.git',
+ owner => $user,
+ group => $group,
+ mode => '0755',
+ require => File[$working_path],
+ }
+
+ git::clone { 'aggregator_data':
+ ensure => 'latest',
+ directory => $data_repo_path,
+ origin =>
'https://gerrit.wikimedia.org/r/p/analytics/aggregator/data.git',
+ owner => $user,
+ group => $group,
+ mode => '0755',
+ require => File[$working_path],
+ }
+
+ file { $log_path:
+ ensure => 'directory',
+ owner => $user,
+ group => $group,
+ mode => '0755',
+ require => File[$working_path],
+
+ }
+
+ # Cron for doing the basic aggregation step itself
+ cron { 'aggregator projectcounts aggregate':
+ command => "${script_path}/bin/aggregate_projectcounts --source
${hdfs_source_path} --target ${data_path} --first-date=`date --date='-8 day'
+\\%Y-\\%m-\\%d` --last-date=`date --date='-1 day' +\\%Y-\\%m-\\%d`
--push-target --log ${log_path}/`date +\\%Y-\\%m-\\%d--\\%H-\\%M-\\%S`.log",
+ user => $user,
+ hour => '13',
+ minute => '0',
+ require => [
+ Git::Clone['aggregator_code'],
+ Git::Clone['aggregator_data'],
+ File[$log_path],
+ ],
+ }
+
+    # Cron for basic monitoring of the aggregated data
+ cron { 'aggregator projectcounts monitor':
+ command => "${script_path}/bin/check_validity_aggregated_projectcounts
--data ${data_path}",
+ user => $user,
+ hour => '13',
+ minute => '45',
+ require => Cron['aggregator projectcounts aggregate'],
+ }
+}
\ No newline at end of file
diff --git a/modules/statistics/manifests/compute.pp
b/modules/statistics/manifests/compute.pp
new file mode 100644
index 0000000..d04778d
--- /dev/null
+++ b/modules/statistics/manifests/compute.pp
@@ -0,0 +1,121 @@
+# == Class statistics::compute
+# Class containing common stuff for a statistics compute node.
+#
+class statistics::compute {
+ Class['::statistics'] -> Class['::statistics::compute']
+
+ # include mysql module base class to install mysql client
+ include mysql
+ include geoip
+ include statistics::dataset_mount
+
+ include misc::udp2log::udp_filter
+
+ require_package('nodejs')
+ require_package('openjdk-7-jdk')
+
+ package { [
+ 'emacs23',
+ 'mc',
+ 'zip',
+ 'p7zip',
+ 'p7zip-full',
+ 'subversion',
+ 'mercurial',
+ 'tofrodos',
+ 'git-review',
+ 'imagemagick',
+ # halfak wants make to manage dependencies
+ 'make',
+ # for checking up on eventlogging
+ 'zpubsub',
+ # libwww-perl for wikistats stuff
+ 'libwww-perl',
+ 'php5-cli',
+ 'php5-mysql',
+ 'sqlite3', # For storing and interacting with intermediate results
+ 'libgdal1-dev', # Requested by lzia for rgdal
+ 'libproj-dev', # Requested by lzia for rgdal
+ 'libbz2-dev', # for compiling some python libs. RT 8278
+ 'libboost-regex-dev', # Ironholds wants these
+ 'libboost-system-dev',
+ 'libyaml-cpp0.3',
+ 'libyaml-cpp0.3-dev',
+ 'libgoogle-glog-dev',
+ 'libboost-iostreams-dev',
+ 'libmaxminddb-dev',
+ 'build-essential', # Requested by halfak to install SciPy
+ ]:
+ ensure => 'latest',
+ }
+
+ # Python packages
+ package { [
+ 'python-geoip',
+ 'libapache2-mod-python',
+ 'python-django',
+ 'python-mysqldb',
+ 'python-yaml',
+ 'python-dateutil',
+ 'python-numpy',
+ 'python-scipy',
+ 'python-boto', # Amazon S3 access (needed to get zero sms logs)
+ 'python-pandas', # Pivot tables processing
+ 'python-requests', # Simple lib to make API calls
+ 'python-unidecode', # Unicode simplification - converts everything to
latin set
+ 'python-pygeoip', # For geo-encoding IP addresses
+ 'python-ua-parser', # For parsing User Agents
+ 'python-matplotlib', # For generating plots of data
+ 'python-netaddr',
+ 'python-virtualenv', # T84378
+ # Aaron Halfaker (halfak) wants python{,3}-dev environments for module
oursql
+ 'python-dev', # RT 6561
+ 'python3-dev', # RT 6561
+ ]:
+ ensure => 'installed',
+ }
+
+    # Plotting packages
+ package { [
+ 'ploticus',
+ 'libploticus0',
+ 'r-base',
+ 'r-cran-rmysql',
+ 'libcairo2',
+ 'libcairo2-dev',
+ 'libxt-dev'
+ ]:
+ ensure => installed,
+ }
+
+ # clones mediawiki core at $working_path/mediawiki/core
+ # and ensures that it is at the latest revision.
+ # RT 2162
+ $statistics_mediawiki_directory =
"${::statistics::working_path}/mediawiki/core"
+
+ git::clone { 'statistics_mediawiki':
+ ensure => 'latest',
+ directory => $statistics_mediawiki_directory,
+ origin => 'https://gerrit.wikimedia.org/r/p/mediawiki/core.git',
+ owner => 'mwdeploy',
+ group => 'wikidev',
+ }
+
+ include passwords::mysql::research
+ # This file will render at
+ # /etc/mysql/conf.d/research-client.cnf.
+ mysql::config::client { 'research':
+ user => $::passwords::mysql::research::user,
+ pass => $::passwords::mysql::research::pass,
+ group => 'researchers',
+ mode => '0440',
+ }
+ # This file will render at
+ # /etc/mysql/conf.d/stats-research-client.cnf.
+ mysql::config::client { 'stats-research':
+ user => $::passwords::mysql::research::user,
+ pass => $::passwords::mysql::research::pass,
+ group => $::statistics::user::username,
+ mode => '0440',
+ }
+}
diff --git a/modules/statistics/manifests/dataset_mount.pp
b/modules/statistics/manifests/dataset_mount.pp
new file mode 100644
index 0000000..c222c3b
--- /dev/null
+++ b/modules/statistics/manifests/dataset_mount.pp
@@ -0,0 +1,25 @@
+# == Class statistics::dataset_mount
+# Mounts /data from dataset1001 server.
+# xmldumps and other misc files needed
+# for generating statistics are here.
+#
+# NOTE: This class has nothing to do with the
+# datasets site hosted at 'datasets.wikimedia.org'.
+#
+class statistics::dataset_mount {
+ # need this for NFS mounts.
+ include nfs::common
+
+ file { '/mnt/data':
+ ensure => 'directory',
+ }
+
+ mount { '/mnt/data':
+ ensure => 'mounted',
+ device => '208.80.154.11:/data',
+ fstype => 'nfs',
+ options =>
'ro,bg,tcp,rsize=8192,wsize=8192,timeo=14,intr,addr=208.80.154.11',
+ atboot => true,
+ require => [File['/mnt/data'], Class['nfs::common']],
+ }
+}
\ No newline at end of file
diff --git a/modules/statistics/manifests/init.pp
b/modules/statistics/manifests/init.pp
new file mode 100644
index 0000000..6877196
--- /dev/null
+++ b/modules/statistics/manifests/init.pp
@@ -0,0 +1,38 @@
+# == Class statistics
+# Base wrapper class for stat servers.
+# All stat servers should include this class.
+#
+# == Parameters
+# $servers - list of statistics servers.
+# These will be granted rsync read and
+# write access between each other.
+# $working_path - Base path for statistics data.
+# Default: /srv
+class statistics(
+ $servers,
+ $working_path = '/srv'
+) {
+ include statistics::user
+
+ file { $working_path:
+ ensure => 'directory',
+ owner => 'root',
+ group => 'wikidev',
+ mode => '0775',
+ }
+
+ if $working_path == '/srv' {
+ # symlink /a to /srv for backwards compatibility
+ file { '/a':
+ ensure => 'link',
+ target => '/srv',
+ }
+ }
+
+ # set up rsync modules for copying files
+ # on statistic servers in $working_path
+ class { 'statistics::rsyncd':
+ path => $working_path,
+ hosts_allow => $servers,
+ }
+}
diff --git a/modules/statistics/manifests/password.pp
b/modules/statistics/manifests/password.pp
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/modules/statistics/manifests/password.pp
diff --git a/modules/statistics/manifests/rsync/eventlogging.pp
b/modules/statistics/manifests/rsync/eventlogging.pp
new file mode 100644
index 0000000..0444510
--- /dev/null
+++ b/modules/statistics/manifests/rsync/eventlogging.pp
@@ -0,0 +1,28 @@
+# == Class statistics::rsync::eventlogging
+#
+# Sets up daily cron jobs to rsync log files from remote
+# logging hosts to a local destination for further processing.
+#
+class statistics::rsync::eventlogging {
+    Class['::statistics'] -> Class['::statistics::rsync::eventlogging']
+ $working_path = $::statistics::working_path
+
+ # Any logs older than this will be pruned by
+ # the rsync_job define.
+ $retention_days = 90
+
+ file { "${working_path}/eventlogging":
+ ensure => 'directory',
+ owner => 'stats',
+ group => 'wikidev',
+ mode => '0775',
+ }
+
+ # eventlogging logs from vanadium
+ statistics::rsync_job { 'eventlogging':
+ source => 'vanadium.eqiad.wmnet::eventlogging/archive/*.gz',
+ destination => "${working_path}/eventlogging/archive",
+ retention_days => $retention_days,
+
+ }
+}
\ No newline at end of file
diff --git a/modules/statistics/manifests/rsync/webrequest.pp
b/modules/statistics/manifests/rsync/webrequest.pp
new file mode 100644
index 0000000..ba82906
--- /dev/null
+++ b/modules/statistics/manifests/rsync/webrequest.pp
@@ -0,0 +1,86 @@
+# == Class statistics::rsync::webrequest
+#
+# Sets up daily cron jobs to rsync log files from remote
+# logging hosts to a local destination for further processing.
+#
+class statistics::rsync::webrequest {
+ Class['::statistics'] -> Class['::statistics::rsync::webrequest']
+ $working_path = $::statistics::working_path
+
+ # Make sure destination directories exist.
+ # Too bad I can't do this with recurse => true.
+ # See: https://projects.puppetlabs.com/issues/86
+ # for a much too long discussion on why I can't.
+ file { [
+ "${working_path}/aft",
+ "${working_path}/aft/archive",
+ "${working_path}/public-datasets",
+ ]:
+ ensure => 'directory',
+ owner => 'stats',
+ group => 'wikidev',
+ mode => '0775',
+ }
+
+ # Make sure destination directories exist.
+ # Too bad I can't do this with recurse => true.
+ # See: https://projects.puppetlabs.com/issues/86
+ # for a much too long discussion on why I can't.
+ file { [
+ "${working_path}/squid",
+ "${working_path}/squid/archive",
+        # Moving away from "squid" nomenclature for
+ # webrequest logs. New generated log
+ # files will be rsynced into /a/log.
+ "${working_path}/log",
+ "${working_path}/log/webrequest",
+ ]:
+ ensure => directory,
+ owner => 'stats',
+ group => 'wikidev',
+ mode => '0755',
+ }
+
+ # wikipedia zero logs from oxygen
+ statistics::rsync_job { 'wikipedia_zero':
+ source =>
'oxygen.wikimedia.org::udp2log/webrequest/archive/zero*.gz',
+ destination => "${working_path}/squid/archive/zero",
+ }
+
+ # API logs from erbium
+ statistics::rsync_job { 'api':
+ source =>
'erbium.eqiad.wmnet::udp2log/webrequest/archive/api-usage*.gz',
+ destination => "${working_path}/squid/archive/api",
+ }
+
+ # sampled-1000 logs from erbium
+ statistics::rsync_job { 'sampled_1000':
+ source =>
'erbium.eqiad.wmnet::udp2log/webrequest/archive/sampled-1000*.gz',
+ destination => "${working_path}/squid/archive/sampled",
+ }
+
+ # glam_nara logs from erbium
+ statistics::rsync_job { 'glam_nara':
+ source =>
'erbium.eqiad.wmnet::udp2log/webrequest/archive/glam_nara*.gz',
+ destination => "${working_path}/squid/archive/glam_nara",
+ }
+
+ # edit logs from oxygen
+ statistics::rsync_job { 'edits':
+ source =>
'oxygen.wikimedia.org::udp2log/webrequest/archive/edits*.gz',
+ destination => "${working_path}/squid/archive/edits",
+ }
+
+ # mobile logs from oxygen
+ statistics::rsync_job { 'mobile':
+ source =>
'oxygen.wikimedia.org::udp2log/webrequest/archive/mobile*.gz',
+ destination => "${working_path}/squid/archive/mobile",
+ }
+
+ # all webrequest archive logs from hdfs
+ statistics::rsync_job { 'hdfs_webrequest_archive':
+ source => 'stat1002.eqiad.wmnet::hdfs-archive/webrequest/*',
+ destination => "${working_path}/log/webrequest/archive",
+ retention_days => 90, # Pruning after 90 days as those logs contain
private data.
+ }
+}
diff --git a/modules/statistics/manifests/rsync_job.pp
b/modules/statistics/manifests/rsync_job.pp
new file mode 100644
index 0000000..8d92c6a
--- /dev/null
+++ b/modules/statistics/manifests/rsync_job.pp
@@ -0,0 +1,47 @@
+# == Define: statistics::rsync_job
+#
+# Sets up a daily cron job to rsync from $source to $destination
+# as the $misc::statistics::user::username user. This requires
+# that the $misc::statistics::user::username user is installed
+# on both $source and $destination hosts.
+#
+# == Parameters:
+# source - rsync source argument (including hostname)
+# destination - rsync destination argument
+# retention_days - If set, a cron will be installed to remove files older
than this many days from $destination.
+#
+define statistics::rsync_job($source, $destination, $retention_days = undef) {
+ Class['::statistics'] -> Statistics::Rsync_job[$name]
+ require statistics::user
+
+ # ensure that the destination directory exists
+ file { $destination:
+ ensure => 'directory',
+ owner => $::statistics::user::username,
+ group => 'wikidev',
+ mode => '0755',
+ }
+
+ # Create a daily cron job to rsync $source to $destination.
+ # This requires that the $misc::statistics::user::username
+ # user is installed on the source host.
+ cron { "rsync_${name}_logs":
+ command => "/usr/bin/rsync -rt --perms --chmod=g-w ${source}
${destination}/",
+ user => $::statistics::user::username,
+ hour => 8,
+ minute => 0,
+ }
+
+ $prune_old_logs_ensure = $retention_days ? {
+ undef => 'absent',
+ default => 'present',
+ }
+
+ cron { "prune_old_${name}_logs":
+ ensure => $prune_old_logs_ensure,
+ command => "/usr/bin/find ${destination} -ctime +${retention_days}
-exec rm {} \\;",
+ user => $::statistics::user::username,
+ minute => 0,
+ hour => 9,
+ }
+}
diff --git a/modules/statistics/manifests/rsyncd.pp
b/modules/statistics/manifests/rsyncd.pp
new file mode 100644
index 0000000..34a4ed1
--- /dev/null
+++ b/modules/statistics/manifests/rsyncd.pp
@@ -0,0 +1,57 @@
+# == Class statistics::rsyncd
+# Sets up rsyncd and common modules
+# for statistic servers. Currently
+# this is read/write between statistic
+# servers in /srv or /a.
+#
+# == Parameters
+# path - string. Base path to allow rsync access. Should probably
be /srv or /a.
+# hosts_allow - array. Hosts to grant rsync access.
+class statistics::rsyncd($path, $hosts_allow)
+{
+ Class['::statistics'] -> Class['statistics::rsyncd']
+
+ # this uses modules/rsync to
+ # set up an rsync daemon service
+ include rsync::server
+
+ # Set up an rsync module
+ # (in /etc/rsyncd.conf) for /srv.
+ rsync::server::module { 'srv':
+        path        => $path,
+ read_only => 'no',
+ list => 'yes',
+ hosts_allow => $hosts_allow,
+ }
+
+ # Set up an rsync module for /a if
+    # we are using /srv as the working path on this node.
+    # This is for backwards compatibility.
+ if ($::statistics::working_path == '/srv') {
+ rsync::server::module { 'a':
+ path => $path,
+ read_only => 'no',
+ list => 'yes',
+ hosts_allow => $hosts_allow,
+ }
+ }
+
+ # Set up an rsync module
+ # (in /etc/rsync.conf) for /var/www.
+ # This will allow $hosts_allow to host public data files
+ # from the default Apache VirtualHost.
+ rsync::server::module { 'www':
+ path => '/var/www',
+ read_only => 'no',
+ list => 'yes',
+ hosts_allow => $hosts_allow,
+ }
+
+ # Allow rsyncd traffic from internal networks.
+ # and stat* public IPs.
+ ferm::service { 'rsync':
+ proto => 'tcp',
+ port => '873',
+ srange => '($INTERNAL)',
+ }
+}
\ No newline at end of file
diff --git a/modules/statistics/manifests/sites/datasets.pp
b/modules/statistics/manifests/sites/datasets.pp
new file mode 100644
index 0000000..a77b6d6
--- /dev/null
+++ b/modules/statistics/manifests/sites/datasets.pp
@@ -0,0 +1,60 @@
+# == Class statistics::sites::datasets
+# datasets.wikimedia.org
+#
+# TODO: Parameterize rsync source hostnames
+#
+# NOTE: This class has nothing to do with the
+# dataset1001 datasets_mount.
+#
+class statistics::sites::datasets {
+ Class['::statistics::web'] -> Class['::statistics::sites::datasets']
+
+ $working_path = $::statistics::working_path
+ file { [
+ "${working_path}/public-datasets",
+ "${working_path}/aggregate-datasets"
+ ]:
+ ensure => 'directory',
+ owner => 'root',
+ group => 'www-data',
+ mode => '0640',
+ }
+
+ # symlink /var/www/public-datasets to $working_path/public-datasets
+ file { '/var/www/public-datasets':
+ ensure => 'link',
+ target => "${working_path}/public-datasets",
+ owner => 'root',
+ group => 'www-data',
+ mode => '0640',
+ }
+
+ # symlink /var/www/aggregate-datasets to $working_path/aggregate-datasets
+ file { '/var/www/aggregate-datasets':
+ ensure => 'link',
+ target => "${working_path}/aggregate-datasets",
+ owner => 'root',
+ group => 'www-data',
+ mode => '0640',
+ }
+
+ # rsync from stat1003:/srv/public-datasets to $working_path/public-datasets
+ cron { 'rsync public datasets':
+ command => "/usr/bin/rsync -rt --delete
stat1003.eqiad.wmnet::srv/public-datasets/* ${working_path}/public-datasets/",
+ require => File["${working_path}/public-datasets"],
+ user => 'root',
+ minute => '*/30',
+ }
+
+ # rsync from stat1002:/srv/aggregate-datasets to
$working_path/aggregate-datasets
+ cron { 'rsync aggregate datasets from stat1002':
+ command => "/usr/bin/rsync -rt --delete
stat1002.eqiad.wmnet::srv/aggregate-datasets/*
${working_path}/aggregate-datasets/",
+ require => File["${working_path}/aggregate-datasets"],
+ user => 'root',
+ minute => '*/30',
+ }
+
+ apache::site { 'datasets':
+ source => 'puppet:///modules/statistics/datasets.wikimedia.org',
+ }
+}
diff --git a/modules/statistics/manifests/sites/metrics.pp
b/modules/statistics/manifests/sites/metrics.pp
new file mode 100644
index 0000000..7320fd9
--- /dev/null
+++ b/modules/statistics/manifests/sites/metrics.pp
@@ -0,0 +1,21 @@
+# == Class statistics::sites::metrics
+# metrics.wikimedia.org and metrics-api.wikimedia.org
+# They should just redirect to Wikimetrics
+#
+class statistics::sites::metrics {
+    Class['::statistics::web'] -> Class['::statistics::sites::metrics']
+ include ::apache::mod::alias
+
+ $site_name = 'metrics.wikimedia.org'
+ $redirect_target = 'https://metrics.wmflabs.org/'
+
+ # Set up the VirtualHost
+ apache::site { $site_name:
+        content => template('statistics/metrics.wikimedia.org.erb'),
+ }
+
+ # make access and error log for metrics-api readable by wikidev group
+ file { ['/var/log/apache2/access.metrics.log',
'/var/log/apache2/error.metrics.log']:
+ group => 'wikidev',
+ }
+}
diff --git a/modules/statistics/manifests/sites/reportcard.pp
b/modules/statistics/manifests/sites/reportcard.pp
new file mode 100644
index 0000000..cc8133a
--- /dev/null
+++ b/modules/statistics/manifests/sites/reportcard.pp
@@ -0,0 +1,7 @@
+# == Class statistics::sites::reportcard
+class statistics::sites::reportcard {
+    Class['::statistics::web'] -> Class['::statistics::sites::reportcard']
+
+ misc::limn::instance { 'reportcard': }
+}
+
diff --git a/modules/statistics/manifests/sites/stats.pp
b/modules/statistics/manifests/sites/stats.pp
new file mode 100644
index 0000000..b3bcb05
--- /dev/null
+++ b/modules/statistics/manifests/sites/stats.pp
@@ -0,0 +1,51 @@
+# ==Class statistics::sites::stats
+# stats.wikimedia.org
+class statistics::sites::stats {
+    Class['::statistics::web'] -> Class['::statistics::sites::stats']
+
+ # TODO! geowiki module???
+ # require misc::statistics::geowiki::data::private
+
+ $site_name = 'stats.wikimedia.org'
+ $docroot = "/srv/${site_name}/htdocs"
+ $geowiki_private_directory = "${docroot}/geowiki-private"
+ $geowiki_private_htpasswd_file = '/etc/apache2/htpasswd.stats-geowiki'
+
+ # add htpasswd file for stats.wikimedia.org
+ file { '/etc/apache2/htpasswd.stats':
+ owner => 'root',
+ group => 'root',
+ mode => '0644',
+ source => 'puppet:///private/apache/htpasswd.stats',
+ }
+
+ # add htpasswd file for private geowiki data
+ file { $geowiki_private_htpasswd_file:
+ owner => 'root',
+ group => 'www-data',
+ mode => '0640',
+ source => 'puppet:///private/apache/htpasswd.stats-geowiki',
+ }
+
+ # TODO:
+ # # link geowiki checkout from docroot
+ # file { $geowiki_private_directory:
+ # ensure => 'link',
+ # target =>
"${misc::statistics::geowiki::data::private::geowiki_private_data_path}/datafiles",
+ # owner => 'root',
+ # group => 'www-data',
+ # mode => '0750',
+ # }
+
+ apache::site { $site_name:
+ content => template("apache/sites/${site_name}.erb"),
+ }
+
+ file { '/etc/apache2/ports.conf':
+ ensure => 'present',
+ mode => '0644',
+ owner => 'root',
+ group => 'root',
+ source => 'puppet:///files/apache/ports.conf.ssl',
+ }
+}
\ No newline at end of file
diff --git a/modules/statistics/manifests/user.pp
b/modules/statistics/manifests/user.pp
new file mode 100644
index 0000000..f63f696
--- /dev/null
+++ b/modules/statistics/manifests/user.pp
@@ -0,0 +1,49 @@
+class statistics::user {
+ include passwords::statistics::user
+
+ $username = 'stats'
+ $homedir = "/var/lib/${username}"
+
+ group { $username:
+ ensure => present,
+ name => $username,
+ system => true,
+ }
+
+ user { $username:
+ home => $homedir,
+ groups => ['wikidev'],
+ shell => '/bin/bash',
+ managehome => true,
+ system => true
+ }
+
+ git::userconfig { 'stats':
+ homedir => $homedir,
+ settings => {
+ 'user' => {
+ 'name' => 'Statistics User',
+ # TODO: use a better email than this :(
+ 'email' => '[email protected]',
+ },
+ # Enable automated git/gerrit authentication via http
+ # by using .git-credential file store.
+ 'credential' => {
+ 'helper' => 'store',
+ },
+ },
+ require => User[$username],
+ }
+
+ # Render the .git-credentials file with the stats user's http password.
+ # This password is set from
https://gerrit.wikimedia.org/r/#/settings/http-password.
+ # To log into gerrit as the stats user, check the /srv/password/stats-user
file
+ # for LDAP login creds.
+ file { "${homedir}/.git-credentials":
+ mode => '0600',
+ owner => $username,
+ group => $username,
+ content =>
"https://${username}:${passwords::statistics::user::gerrit_http_password}@gerrit.wikimedia.org",
+ require => User[$username],
+ }
+}
\ No newline at end of file
diff --git a/modules/statistics/manifests/web.pp
b/modules/statistics/manifests/web.pp
new file mode 100644
index 0000000..1219217
--- /dev/null
+++ b/modules/statistics/manifests/web.pp
@@ -0,0 +1,28 @@
+# == Class statistics::web
+# Common things needed for a statistics webserver node.
+# This should be included if you want to include any
+# sites in statistics::sites
+class statistics::web {
+ Class['::statistics'] -> Class['::statistics::web']
+
+ $ssl_settings = ssl_ciphersuite('apache-2.2', 'compat', '365')
+
+ include webserver::apache
+
+ # make sure /var/log/apache2 is readable by wikidevs for debugging.
+ # This won't make the actual log files readable, only the directory.
+ # Individual log files can be created and made readable by
+ # classes that manage individual sites.
+ file { '/var/log/apache2':
+ ensure => 'directory',
+ owner => 'root',
+ group => 'wikidev',
+ mode => '0750',
+ require => Class['webserver::apache'],
+ }
+
+ include ::apache::mod::rewrite
+ include ::apache::mod::proxy
+ include ::apache::mod::proxy_http
+ include ::apache::mod::headers
+}
\ No newline at end of file
diff --git a/modules/statistics/manifests/wikistats.pp
b/modules/statistics/manifests/wikistats.pp
new file mode 100644
index 0000000..5146e7d
--- /dev/null
+++ b/modules/statistics/manifests/wikistats.pp
@@ -0,0 +1,32 @@
+# == Class statistics::wikistats
+# wikistats configuration for generating
+# stats.wikimedia.org data.
+#
+# TODO: puppetize clone of wikistats?
+class statistics::wikistats {
+ Class['::statistics'] -> Class['::statistics::wikistats']
+
+ # Perl packages needed for wikistats
+ package { [
+ 'libjson-xs-perl',
+ 'libtemplate-perl',
+ 'libnet-patricia-perl',
+ 'libregexp-assemble-perl',
+ ]:
+ ensure => 'installed',
+ }
+ # this cron uses pigz to unzip squid archive files in parallel
+ package { 'pigz':
+ ensure => 'installed',
+ }
+
+ # generates the new mobile pageviews report
+ # and syncs the file PageViewsPerMonthAll.csv to stat1002
+ cron { 'new mobile pageviews report':
+ command => "/bin/bash
${::statistics::working_path}/wikistats_git/pageviews_reports/bin/stat1-cron-script.sh",
+ user => 'stats',
+ monthday => 1,
+ hour => 7,
+ minute => 20,
+ }
+}
diff --git a/modules/statistics/templates/metrics.wikimedia.org.erb
b/modules/statistics/templates/metrics.wikimedia.org.erb
new file mode 100644
index 0000000..9bac6be
--- /dev/null
+++ b/modules/statistics/templates/metrics.wikimedia.org.erb
@@ -0,0 +1,27 @@
+# Note: This file is managed by Puppet.
+<%
+# ERb template variables:
+#
+# site_name
+# redirect_target
+#
+-%>
+
+<VirtualHost *:80>
+ # <%= @site_name %>, and metrics-api.wikimedia.org are
+ # deprecated. The services it provided are now available through
+ # Wikimetrics.
+ #
+ # We keep the domain around only because old documentation and
+ # search engines still refer to it.
+
+ ServerName <%= @site_name %>
+ ServerAlias metrics-api.wikimedia.org
+ ServerAdmin [email protected]
+
+ Redirect permanent / <%= @redirect_target %>
+
+ ErrorLog /var/log/apache2/error.metrics.log
+ LogLevel warn
+ CustomLog /var/log/apache2/access.metrics.log combined
+</VirtualHost>
--
To view, visit https://gerrit.wikimedia.org/r/186396
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I49140d85ddea99f5d4d9a3c71e60cf7fa57d49b6
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits