Ottomata has submitted this change and it was merged.
Change subject: Replace limn::data::generate by reportupdater
......................................................................
Replace limn::data::generate by reportupdater
Moves the generation of reports out of the statistics module.
Reportupdater now has a dedicated repository.
Bug: T127327
Change-Id: I79cafe41b1e6002ee3e6c9087809ed9515424ad6
---
M manifests/role/statistics.pp
A modules/reportupdater/manifests/init.pp
A modules/reportupdater/manifests/job.pp
D modules/statistics/manifests/limn/data.pp
D modules/statistics/manifests/limn/data/generate.pp
5 files changed, 181 insertions(+), 136 deletions(-)
Approvals:
Ottomata: Verified; Looks good to me, approved
diff --git a/manifests/role/statistics.pp b/manifests/role/statistics.pp
index c080e21..9368912 100644
--- a/manifests/role/statistics.pp
+++ b/manifests/role/statistics.pp
@@ -74,17 +74,44 @@
include geowiki::job::monitoring
- # Use the statistics::limn::data::generate define
- # to set up cron jobs to generate and generate limn files
- # from research db and push them
- statistics::limn::data::generate { 'mobile': }
- statistics::limn::data::generate { 'flow': }
- statistics::limn::data::generate { 'edit': }
- statistics::limn::data::generate { 'language': }
- statistics::limn::data::generate { 'extdist': }
- statistics::limn::data::generate { 'ee': }
- statistics::limn::data::generate { 'multimedia': }
+ # Set up reportupdater to be executed on this machine
+ # and rsync the output base path to stat1001.
+ class { 'reportupdater':
+ base_path => "${::statistics::working_path}/reportupdater",
+ user => $::statistics::user::username,
+ rsync_to => 'stat1001.eqiad.wmnet::www/limn-public-data/',
+ }
+ # Set up various jobs to be executed by reportupdater
+ # creating several reports on mysql research db.
+ reportupdater::job { 'mobile':
+ repository => 'limn-mobile-data',
+ output_dir => 'mobile/datafiles',
+ }
+ reportupdater::job { 'flow':
+ repository => 'limn-flow-data',
+ output_dir => 'flow/datafiles',
+ }
+ reportupdater::job { 'edit':
+ repository => 'limn-edit-data',
+ output_dir => 'metrics',
+ }
+ reportupdater::job { 'language':
+ repository => 'limn-language-data',
+ output_dir => 'metrics/beta-feature-enables',
+ }
+ reportupdater::job { 'extdist':
+ repository => 'limn-extdist-data',
+ output_dir => 'extdist/datafiles',
+ }
+ reportupdater::job { 'ee':
+ repository => 'limn-ee-data',
+ output_dir => 'metrics/echo',
+ }
+ reportupdater::job { 'multimedia':
+ repository => 'limn-multimedia-data',
+ output_dir => 'metrics/beta-feature-enables',
+ }
}
@@ -136,6 +163,16 @@
group => 'statistics-privatedata-users',
mode => '0440',
}
+
+ # Set up reportupdater to be executed on this machine.
+ class { 'reportupdater':
+ base_path => "${::statistics::working_path}/reportupdater",
+ user => $::statistics::user::username,
+ }
+ # Set up a job to create browser reports on hive db.
+ reportupdater::job { 'browser':
+ repository => 'reportupdater-queries',
+ }
}
diff --git a/modules/reportupdater/manifests/init.pp
b/modules/reportupdater/manifests/init.pp
new file mode 100644
index 0000000..bb81f67
--- /dev/null
+++ b/modules/reportupdater/manifests/init.pp
@@ -0,0 +1,63 @@
+# == Class reportupdater
+#
+# Sets up repositories and rsync for using reportupdater.
+# See: https://wikitech.wikimedia.org/wiki/Analytics/Reportupdater
+#
+# == Parameters
+# $user - string. User for cloning repositories and
+# folder permits.
+#
+# $base_path - string. Base path where to put reportupdater's
+# repository, job query repositories, and data output.
+# Default: /srv/reportupdater
+#
+# $rsync_to - string. [optional] If defined, everything in
+# $base_path/output will be rsynced to $rsync_to.
+#
+class reportupdater(
+ $user,
+ $base_path = '/srv/reportupdater',
+ $rsync_to = undef,
+) {
+ # Path at which reportupdater source will be cloned.
+ $path = "${base_path}/reportupdater"
+
+ # Path in which all reportupdater output will be stored.
+ $output_path = "${base_path}/output"
+
+ # Path in which all reportupdater jobs will log.
+ $log_path = "${base_path}/log"
+
+ # Path in which individual reportupdater job repositories
+ # will be cloned.
+ $job_repositories_path = "${::reportupdater::base_path}/jobs"
+
+ # Ensure these directories exist and are writeable by $user.
+ file { [$base_path, $output_path, $log_path, $job_repositories_path]:
+ ensure => 'directory',
+ owner => $user,
+ group => 'wikidev',
+ mode => '0775',
+ }
+
+ # Ensure reportupdater is cloned and latest version.
+ git::clone { 'analytics/reportupdater':
+ ensure => 'latest',
+ directory => $path,
+ origin =>
'https://gerrit.wikimedia.org/r/p/analytics/reportupdater.git',
+ owner => $user,
+ require => File[$base_path],
+ }
+
+ # If specified, rsync anything generated in $output_path to $rsync_to.
+ $rsync_cron_ensure = $rsync_to ? {
+ undef => 'absent',
+ default => 'present',
+ }
+ cron { 'reportupdater_rsync_to':
+ ensure => $rsync_cron_ensure,
+ command => "/usr/bin/rsync -rt ${output_path}/* ${rsync_to}",
+ user => $user,
+ minute => 15,
+ }
+}
diff --git a/modules/reportupdater/manifests/job.pp
b/modules/reportupdater/manifests/job.pp
new file mode 100644
index 0000000..c334abd
--- /dev/null
+++ b/modules/reportupdater/manifests/job.pp
@@ -0,0 +1,71 @@
+# == Define reportupdater::job
+#
+# Sets up hourly cron jobs to run reportupdater, which generates
+# and updates tsv reports for a set of given queries.
+#
+# This requires that a repository with config and queries for the script
+# exists at https://gerrit.wikimedia.org/r/p/analytics/${repository}.git.
+#
+# == Parameters
+# title - string. Name of query dir inside of $repository.
+# a $title directory with reportupdater query config
+# must exist inside of $repository.
+#
+# repository - string. Name of the query repository in gerrit in the
+# analytics/ namespace. All reportupdater job
+# repositories must be in analytics/
+# E.g. analytics/reportupdater-queries
+#
+# output_dir - string. [optional] Relative path where to write the reports.
+# This will be relative to $::reportupdater::base_path/output
+# Default: $title
+#
+# == Usage
+# reportupdater::job { 'browser': }
+#
+# reportupdater::job { 'mobile':
+# repository => 'limn-mobile-data',
+# output_dir => "mobile/datafiles",
+# }
+#
+define reportupdater::job(
+ $repository,
+ $output_dir = $title,
+)
+{
+ Class['::reportupdater'] -> Reportupdater::Job[$title]
+
+ # Name of the repository in gerrit.
+ # All reportupdater job repositories are in the analytics/ namespace.
+ $repository_name = "analytics/${repository}"
+
+ # Path at which this reportupdater job repository will be cloned.
+ $path =
"${::reportupdater::job_repositories_path}/${repository}"
+
+ # Path of the query configuration directory inside of $repository_name.
+ $query_path = "${path}/${title}"
+
+ # Path at which the job will store logs.
+ $log_file =
"${$::reportupdater::log_path}/${repository}-${title}.log"
+
+ # Path at which the job will store its report output.
+ $output_path = "${::reportupdater::output_path}/${output_dir}"
+
+ # Ensure the query repository is cloned and latest version.
+ # It is possible that multiple jobs will use the same repository,
+ # so wrap this in an if !defined.
+ if !defined(Git::Clone[$repository_name]) {
+ git::clone { $repository_name:
+ ensure => 'latest',
+ directory => $path,
+ origin =>
"https://gerrit.wikimedia.org/r/p/${repository_name}.git",
+ owner => $::reportupdater::user
+ }
+ }
+
+ cron { "reportupdater_${repository}-${title}":
+ command => "python ${::reportupdater::path}/update_reports.py
${query_path} ${output_path} >> ${log_file} 2>&1",
+ user => $user,
+ minute => 0,
+ }
+}
diff --git a/modules/statistics/manifests/limn/data.pp
b/modules/statistics/manifests/limn/data.pp
deleted file mode 100644
index 1613083..0000000
--- a/modules/statistics/manifests/limn/data.pp
+++ /dev/null
@@ -1,76 +0,0 @@
-
-# == Class statistics::limn::data
-# Sets up base directories and repositories
-# for using the statistics::limn::data::generate() define.
-#
-class statistics::limn::data {
- Class['::statistics::compute'] -> Class['::statistics::limn::data']
- Class['::statistics::user'] -> Class['::statistics::limn::data']
-
- $working_path = '/srv'
-
- # Directory where the repository of the generate.py will be cloned.
- $source_dir = "${working_path}/limn-mobile-data"
-
- # generate.py command to run in a cron.
- $command = "${source_dir}/generate.py"
-
- # my.cnf credentials file. This is the file rendered by
- # mysql::config::client { 'stats-research': } defined in
statistics::compute
- $mysql_credentials = '/etc/mysql/conf.d/stats-research-client.cnf'
-
- # cron job logs will be kept here
- $log_dir = '/var/log/limn-data'
-
- # generate.py's repository
- $git_remote =
'https://gerrit.wikimedia.org/r/p/analytics/limn-mobile-data.git'
-
- # public data directory. Data will be synced from here to a public web
host.
- $public_dir = "${working_path}/limn-public-data"
-
- # Rsync generated data to stat1001 at
http://datasets.wikimedia.org/limn-public-data/
- $rsync_to = 'stat1001.eqiad.wmnet::www/limn-public-data/'
-
- # user to own files and run cron job as (stats).
- $user = $::statistics::user::username
-
- # This path is used in the limn-mobile-data config.
- # Symlink this until they change it.
- #
https://github.com/wikimedia/analytics-limn-mobile-data/blob/2321a6a0976b1805e79fecd495cf12ed7c6565a0/mobile/config.yaml#L5
- file { "${working_path}/.my.cnf.research":
- ensure => 'link',
- target => $mysql_credentials,
- require => Mysql::Config::Client['stats-research'],
- }
-
- # TODO: This repository contains the generate.py script.
- # Other limn data repositories only have config and data
- # directories. generate.py should be abstracted out into
- # a general purupose limn data generator.
- # For now, all limn data classes rely on this repository
- # and generate.py script to be present.
- if !defined(Git::Clone['analytics/limn-mobile-data']) {
- git::clone { 'analytics/limn-mobile-data':
- ensure => 'latest',
- directory => $source_dir,
- origin => $git_remote,
- owner => $user,
- require => [User[$user]],
- }
- }
-
- # Make sure these are writeable by $user.
- file { [$log_dir, $public_dir]:
- ensure => 'directory',
- owner => $user,
- group => wikidev,
- mode => '0775',
- }
-
- # Rsync anything generated in $public_dir to $rsync_to
- cron { 'rsync_limn_public_data':
- command => "/usr/bin/rsync -rt ${public_dir}/* ${rsync_to}",
- user => $user,
- minute => 15,
- }
-}
diff --git a/modules/statistics/manifests/limn/data/generate.pp
b/modules/statistics/manifests/limn/data/generate.pp
deleted file mode 100644
index a3132fc..0000000
--- a/modules/statistics/manifests/limn/data/generate.pp
+++ /dev/null
@@ -1,50 +0,0 @@
-
-# == Define statistics::limn::data::generate
-#
-# Sets up daily cron jobs to run a script which
-# generates csv datafiles and rsyncs those files
-# to stat1001 so they can be served publicly.
-#
-# This requires that a repository with config to pass to generate.py
-# exists at https://gerrit.wikimedia.org/r/p/analytics/limn-${title}-data.git.
-#
-# == Usage
-# statistics::limn::data::generate { 'mobile': }
-# statistics::limn::data::generate { 'flow': }
-# ...
-#
-define statistics::limn::data::generate() {
- require ::statistics::limn::data
-
- $user = $::statistics::limn::data::user
- $command = $::statistics::limn::data::command
-
- # A repo at analytics/limn-${title}-data.git had better exist!
- $git_remote =
"https://gerrit.wikimedia.org/r/p/analytics/limn-${title}-data.git"
-
- # Directory at which to clone $git_remote
- $source_dir =
"${::statistics::limn::data::working_path}/limn-${title}-data"
-
- # config directory for this limn data generate job
- $config_dir = "${$source_dir}/${title}/"
-
- # log file for the generate cron job
- $log =
"${::statistics::limn::data::log_dir}/limn-${title}-data.log"
-
- if !defined(Git::Clone["analytics/limn-${title}-data"]) {
- git::clone { "analytics/limn-${title}-data":
- ensure => 'latest',
- directory => $source_dir,
- origin => $git_remote,
- owner => $user,
- require => [User[$user]],
- }
- }
-
- # This will generate data into $public_dir/${title} (if configured
correctly)
- cron { "generate_${title}_limn_public_data":
- command => "python ${command} ${config_dir} >> ${log} 2>&1",
- user => $user,
- minute => 0,
- }
-}
--
To view, visit https://gerrit.wikimedia.org/r/273487
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I79cafe41b1e6002ee3e6c9087809ed9515424ad6
Gerrit-PatchSet: 14
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Mforns <[email protected]>
Gerrit-Reviewer: Elukey <[email protected]>
Gerrit-Reviewer: Mforns <[email protected]>
Gerrit-Reviewer: Milimetric <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits