Ottomata has submitted this change and it was merged. Change subject: Removing code that generates pageviews using legacy definition ......................................................................
Removing code that generates pageviews using legacy definition We no longer use the legacy pageview definition, so it makes no sense for these files to be generated. The new pageview definition has been in effect since May 2015: https://meta.wikimedia.org/wiki/Research:Page_view and legacy counts are available until the merge of this code. The consumer of this code is Vital Signs. Bug: T124244 Change-Id: I7c6869da0fdc18d8edc6e67cbd688abac39676f5 --- M manifests/role/statistics.pp D modules/statistics/manifests/aggregator/projectcounts.pp M modules/statistics/manifests/aggregator/projectview.pp 3 files changed, 2 insertions(+), 83 deletions(-) Approvals: Ottomata: Verified; Looks good to me, approved diff --git a/manifests/role/statistics.pp b/manifests/role/statistics.pp index 7a33714..72fe14a 100644 --- a/manifests/role/statistics.pp +++ b/manifests/role/statistics.pp @@ -111,11 +111,8 @@ # kafkatee is useful here for adhoc processing of kafkadata require_package('kafkatee') - # aggregating hourly pagecount-all-sites project count files into - # daily per site csvs. # Although it is in the "private" role, the dataset actually isn't # private. We just keep it here to spare adding a separate role. 
- include statistics::aggregator::projectcounts include statistics::aggregator::projectview include passwords::mysql::research diff --git a/modules/statistics/manifests/aggregator/projectcounts.pp b/modules/statistics/manifests/aggregator/projectcounts.pp deleted file mode 100644 index 8812adc..0000000 --- a/modules/statistics/manifests/aggregator/projectcounts.pp +++ /dev/null @@ -1,78 +0,0 @@ -# == Class statistics::aggregator::projectcounts -# Handles aggregation of pagecounts-all-sites projectcounts files -# -# WARNING - Files aggregated by this instance are legacy ones -# A new pageview definition has been provided and aggregation -# for it can be found in the same folder: projectview.pp -# -class statistics::aggregator::projectcounts { - require statistics::aggregator - - # This class uses the cdh::hadoop::mount in order to get - # data files out of HDFS. - Class['cdh::hadoop::mount'] -> Class['::statistics::aggregator::projectcounts'] - - $script_path = $::statistics::aggregator::script_path - $working_path = "${::statistics::aggregator::working_path}/projectcounts" - $data_repo_path = "${working_path}/data" - $data_path = "${data_repo_path}/projectcounts" - $log_path = "${working_path}/log" - # This should not be hardcoded. Instead, one should be able to use - # $::cdh::hadoop::mount::mount_point to reference the user supplied - # parameter when the cdh::hadoop::mount class is evaluated. - # I am not sure why this is not working. 
- $hdfs_mount_point = '/mnt/hdfs' - $hdfs_source_path = "${hdfs_mount_point}/wmf/data/archive/pagecounts-all-sites" - $user = $::statistics::user::username - $group = $::statistics::user::username - - file { $working_path: - ensure => 'directory', - owner => $user, - group => $group, - mode => '0755' - } - - git::clone { 'aggregator_projectcounts_data': - ensure => 'latest', - directory => $data_repo_path, - # This repo should be /analytics/aggregator/projectcounts/data to - # be differenciated easily with /analytics/aggregator/projectview/data. - # But for legacy reasons we keep it as is. - origin => 'https://gerrit.wikimedia.org/r/p/analytics/aggregator/data.git', - owner => $user, - group => $group, - mode => '0755', - require => File[$working_path], - } - - file { $log_path: - ensure => 'directory', - owner => $user, - group => $group, - mode => '0755', - require => File[$working_path], - - } - - # Cron for doing the basic aggregation step itself - cron { 'aggregator projectcounts aggregate': - command => "log_file=\"${log_path}/`date +\\%Y-\\%m-\\%d--\\%H-\\%M-\\%S`.log\" && ${script_path}/bin/aggregate_projectcounts --source ${hdfs_source_path} --target ${data_path} --first-date=`date --date='-8 day' +\\%Y-\\%m-\\%d` --last-date=`date --date='-1 day' +\\%Y-\\%m-\\%d` --push-target --log \${log_file} 2>> \${log_file}", - user => $user, - hour => '13', - minute => '0', - require => [ - Git::Clone['aggregator_projectcounts_data'], - File[$log_path], - ], - } - - # Cron for basing monitoring of the aggregated data - cron { 'aggregator projectcounts monitor': - command => "${script_path}/bin/check_validity_aggregated_projectcounts --data ${data_path}", - user => $user, - hour => '13', - minute => '45', - require => Cron['aggregator projectcounts aggregate'], - } -} diff --git a/modules/statistics/manifests/aggregator/projectview.pp b/modules/statistics/manifests/aggregator/projectview.pp index b490019..f05b3de 100644 --- 
a/modules/statistics/manifests/aggregator/projectview.pp +++ b/modules/statistics/manifests/aggregator/projectview.pp @@ -2,8 +2,8 @@ # Handles aggregation of projectview_hourly files # # WARNING - Files aggregated by this instance are using the -# new pageview definition. The legacy ones are managed by -# projectcounts.pp in the same folder. +# new pageview definition. The legacy ones are no longer +# being calculated # class statistics::aggregator::projectview { require statistics::aggregator -- To view, visit https://gerrit.wikimedia.org/r/265656 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I7c6869da0fdc18d8edc6e67cbd688abac39676f5 Gerrit-PatchSet: 5 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Nuria <[email protected]> Gerrit-Reviewer: Ottomata <[email protected]> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
