Ottomata has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/402064 )
Change subject: Refine mediawiki job queue events into Hive event database
......................................................................
Refine mediawiki job queue events into Hive event database
Change-Id: I279aa9046d4a632183894d9d21893307962d4621
---
M modules/role/manifests/analytics_cluster/refinery/job/json_refine.pp
1 file changed, 42 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/64/402064/1
diff --git a/modules/role/manifests/analytics_cluster/refinery/job/json_refine.pp b/modules/role/manifests/analytics_cluster/refinery/job/json_refine.pp
index d1c7a3f..f50189b 100644
--- a/modules/role/manifests/analytics_cluster/refinery/job/json_refine.pp
+++ b/modules/role/manifests/analytics_cluster/refinery/job/json_refine.pp
@@ -30,4 +30,46 @@
table_blacklist => '^mediawiki_page_properties_change|mediawiki_recentchange$',
minute => 20,
}
+
+ # Refine Mediawiki job queue events (from EventBus).
+ # This could be combined into the same EventBus refine job above, but it is nice to
+ # have them separated, as the job queue schemas are legacy and can be problematic.
+
+ # $problematic_jobs will not be refined.
+ # These have inconsistent schemas that cause refinement to fail.
+ $problematic_jobs = [
+ 'EchoNotificationJob',
+ 'EchoNotificationDeleteJob',
+ 'TranslationsUpdateJob',
+ 'MessageGroupStatesUpdaterJob',
+ 'InjectRCRecords',
+ 'cirrusSearchDeleteArchive',
+ 'enqueue',
+ 'htmlCacheUpdate',
+ 'LocalRenameUserJob',
+ 'RecordLintJob',
+ 'wikibase_addUsagesForPage',
+ 'refreshLinks',
+ 'cirrusSearchCheckerJob',
+ 'MassMessageSubmitJob',
+ 'refreshLinksPrioritized',
+ 'TranslatablePageMoveJob',
+ 'ORESFetchScoreJob',
+ 'PublishStashedFile',
+ 'CentralAuthCreateLocalAccountJob',
+ 'gwtoolsetUploadMediafileJob',
+ ]
+ $table_blacklist = sprintf('.*(%s)$', join($problematic_jobs, '|'))
+
+ role::analytics_cluster::refinery::job::json_refine_job { 'eventlogging_eventbus_job_queue':
+ # This is imported by camus_job { 'mediawiki_job': }
+ input_base_path => '/wmf/data/raw/mediawiki_job',
+ # 'datacenter' is extracted from the input path into a Hive table partition
+ input_regex => '.*(eqiad|codfw)_(.+)/hourly/(\\d+)/(\\d+)/(\\d+)/(\\d+)',
+ input_capture => 'datacenter,table,year,month,day,hour',
+ output_base_path => '/wmf/data/event',
+ output_database => 'event',
+ table_blacklist => $table_blacklist,
+ minute => 25,
+ }
}
--
To view, visit https://gerrit.wikimedia.org/r/402064
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I279aa9046d4a632183894d9d21893307962d4621
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits