Joal has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/342197 )
Change subject: Add oozie job for standard metrics computation
......................................................................
Add oozie job for standard metrics computation
Add oozie coordinator and workflow to regularly compute metrics
over a mediawiki history snapshot
Bug: T160151
Change-Id: Ic5fddf5cc99b0c428f6685d0ef495ee2fe0c449b
---
M oozie/mediawiki/history/datasets.xml
M oozie/mediawiki/history/denormalize/coordinator.properties
A oozie/mediawiki/history/metrics/coordinator.properties
A oozie/mediawiki/history/metrics/coordinator.xml
M oozie/mediawiki/history/metrics/daily_edits.hql
M oozie/mediawiki/history/metrics/daily_edits_by_anonymous_users.hql
M oozie/mediawiki/history/metrics/daily_edits_by_bot_users.hql
M oozie/mediawiki/history/metrics/daily_edits_by_registered_users.hql
M oozie/mediawiki/history/metrics/daily_unique_anonymous_editors.hql
M oozie/mediawiki/history/metrics/daily_unique_bot_editors.hql
M oozie/mediawiki/history/metrics/daily_unique_editors.hql
M oozie/mediawiki/history/metrics/daily_unique_page_creators.hql
M oozie/mediawiki/history/metrics/daily_unique_registered_editors.hql
M oozie/mediawiki/history/metrics/monthly_new_editors.hql
M oozie/mediawiki/history/metrics/monthly_new_registered_users.hql
M oozie/mediawiki/history/metrics/monthly_surviving_new_editors.hql
A oozie/mediawiki/history/metrics/workflow.xml
17 files changed, 696 insertions(+), 2 deletions(-)
Approvals:
Joal: Verified; Looks good to me, approved
diff --git a/oozie/mediawiki/history/datasets.xml
b/oozie/mediawiki/history/datasets.xml
index 8845ffd..c2232d8 100644
--- a/oozie/mediawiki/history/datasets.xml
+++ b/oozie/mediawiki/history/datasets.xml
@@ -69,7 +69,6 @@
</dataset>
<!-- History metrics datasets -->
-
<dataset name="mw_metrics"
frequency="${coord:months(1)}"
initial-instance="${start_time}"
diff --git a/oozie/mediawiki/history/denormalize/coordinator.properties
b/oozie/mediawiki/history/denormalize/coordinator.properties
index 62ec627..65a3136 100644
--- a/oozie/mediawiki/history/denormalize/coordinator.properties
+++ b/oozie/mediawiki/history/denormalize/coordinator.properties
@@ -76,7 +76,7 @@
spark_partitions_number = 1024
tmp_path =
${name_node}/tmp/mediawiki/history/checkpoints
-# Workflow to add a partition
+# Workflow to repair partitions
repair_partitions_workflow_file =
${oozie_directory}/util/hive/partition/repair/workflow.xml
# Workflow to mark a directory as done
diff --git a/oozie/mediawiki/history/metrics/coordinator.properties
b/oozie/mediawiki/history/metrics/coordinator.properties
new file mode 100644
index 0000000..2550d6f
--- /dev/null
+++ b/oozie/mediawiki/history/metrics/coordinator.properties
@@ -0,0 +1,63 @@
+# Configures a coordinator to automatically compute standard metrics from a
mediawiki history snapshot
+# Usage:
+# oozie job -Duser=$USER -Dstart_time=2015-08-01T00:00Z -submit -config
oozie/mediawiki/history/metrics/coordinator.properties
+#
+# NOTE: The $oozie_directory must be synced to HDFS so that all relevant
+# .xml files exist there when this job is submitted.
+
+name_node = hdfs://analytics-hadoop
+job_tracker = resourcemanager.analytics.eqiad.wmnet:8032
+queue_name = default
+
+#Default user
+user = hdfs
+
+# Base path in HDFS to refinery.
+# When submitting this job for production, you should override this to point
directly at a deployed
+# directory name, and not the 'symbolic' 'current' directory. E.g.
/wmf/refinery/2015-01-05T17.59.18Z--7bb7f07
+refinery_directory = ${name_node}/wmf/refinery/current
+
+# Base path in HDFS to oozie files.
+# Other files will be used relative to this path.
+oozie_directory = ${refinery_directory}/oozie
+
+# HDFS path to coordinator to run.
+coordinator_file =
${oozie_directory}/mediawiki/history/metrics/coordinator.xml
+
+# HDFS path to workflow to run.
+workflow_file =
${oozie_directory}/mediawiki/history/metrics/workflow.xml
+
+# HDFS path to hive-site.xml file. This is needed to run hive actions.
+hive_site_xml = ${name_node}/user/hive/hive-site.xml
+
+# HDFS path to mediawiki history datasets definitions
+datasets_file =
${oozie_directory}/mediawiki/history/datasets.xml
+mw_directory = ${name_node}/wmf/data/wmf/mediawiki
+
+# Hive tables: denormalized history (read as source) and metrics (written as destination)
+mw_denormalized_history_table = wmf.mediawiki_history
+mw_metrics_table = wmf.mediawiki_metrics
+
+# Run metrics for all wikis, starting 2001
+wiki_db = all
+start_timestamp = 20010101000000
+
+# Time of the first coordinator run (also the initial instance of the mw_metrics dataset).
+start_time = 2017-03-01T00:00Z
+
+# Time to stop running this coordinator. Year 3000 == never!
+stop_time = 3000-01-01T00:00Z
+
+# Workflow to mark a directory as done
+mark_directory_done_workflow_file =
${oozie_directory}/util/mark_directory_done/workflow.xml
+
+# Workflow to send an error email
+send_error_email_workflow_file =
${oozie_directory}/util/send_error_email/workflow.xml
+
+# email address to contact in case of SLA errors
+sla_alert_contact = [email protected]
+
+# Coordinator to start.
+oozie.coord.application.path = ${coordinator_file}
+oozie.use.system.libpath = true
+oozie.action.external.stats.write = true
\ No newline at end of file
diff --git a/oozie/mediawiki/history/metrics/coordinator.xml
b/oozie/mediawiki/history/metrics/coordinator.xml
new file mode 100644
index 0000000..42050e7
--- /dev/null
+++ b/oozie/mediawiki/history/metrics/coordinator.xml
@@ -0,0 +1,99 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<coordinator-app xmlns="uri:oozie:coordinator:0.4"
+ xmlns:sla="uri:oozie:sla:0.2"
+ name="mediawiki-history-metrics-coord"
+ frequency="${coord:months(1)}"
+ start="${start_time}"
+ end="${stop_time}"
+ timezone="Universal">
+
+ <parameters>
+ <!-- Required properties -->
+ <property><name>queue_name</name></property>
+ <property><name>name_node</name></property>
+ <property><name>job_tracker</name></property>
+ <property><name>workflow_file</name></property>
+ <property><name>start_time</name></property>
+ <property><name>stop_time</name></property>
+
+ <property><name>datasets_file</name></property>
+ <property><name>mw_directory</name></property>
+
+
+ <property><name>mw_denormalized_history_table</name></property>
+ <property><name>mw_metrics_table</name></property>
+
+ <property><name>start_timestamp</name></property>
+ <property><name>wiki_db</name></property>
+
+ <property><name>hive_site_xml</name></property>
+ <property><name>mark_directory_done_workflow_file</name></property>
+ <property><name>send_error_email_workflow_file</name></property>
+
+ <property><name>sla_alert_contact</name></property>
+ </parameters>
+
+ <controls>
+ <!-- Timeout is measured in minutes; -1 disables it (wait for input data indefinitely) -->
+ <timeout>-1</timeout>
+
+ <!-- Setting low concurrency for resource sharing.
+ The job runs pretty fast (~1 minute) and increasing concurrency
should not cause any problems-->
+ <concurrency>1</concurrency>
+
+ <throttle>2</throttle>
+
+ </controls>
+
+ <datasets>
+ <!--
+ Include the mediawiki history datasets definitions.
+ $datasets_file declares the datasets used by the input and output events
+ -->
+ <include>${datasets_file}</include>
+ </datasets>
+
+ <input-events>
+ <data-in name="mw_denormalized_history_partitioned"
dataset="mw_denormalized_history_partitioned">
+ <instance>${coord:current(0)}</instance>
+ </data-in>
+ </input-events>
+
+ <output-events>
+ <data-out name="mw_metrics" dataset="mw_metrics">
+ <instance>${coord:current(0)}</instance>
+ </data-out>
+ </output-events>
+
+ <action>
+ <workflow>
+ <app-path>${workflow_file}</app-path>
+ <configuration>
+ <property>
+ <name>snapshot</name>
+ <value>${coord:formatTime(coord:nominalTime(),
"yyyy")}-${coord:formatTime(coord:nominalTime(), "MM")}</value>
+ </property>
+ <property>
+ <name>end_timestamp</name>
+ <value>${coord:formatTime(coord:nominalTime(),
"yyyy")}${coord:formatTime(coord:nominalTime(), "MM")}01000000</value>
+ </property>
+ <property>
+ <name>metrics_location</name>
+ <value>${coord:dataOut('mw_metrics')}</value>
+ </property>
+ </configuration>
+ </workflow>
+
+ <sla:info>
+ <!--
+ Use action actual time as SLA base, since it's the time used
+ to compute timeout
+ -->
+ <sla:nominal-time>${coord:actualTime()}</sla:nominal-time>
+ <sla:should-end>${4 * DAYS}</sla:should-end>
+ <sla:alert-events>end_miss</sla:alert-events>
+ <sla:alert-contact>${sla_alert_contact}</sla:alert-contact>
+ </sla:info>
+
+ </action>
+</coordinator-app>
\ No newline at end of file
diff --git a/oozie/mediawiki/history/metrics/daily_edits.hql
b/oozie/mediawiki/history/metrics/daily_edits.hql
index 5195b26..e3d5fd5 100644
--- a/oozie/mediawiki/history/metrics/daily_edits.hql
+++ b/oozie/mediawiki/history/metrics/daily_edits.hql
@@ -19,6 +19,7 @@
set hive.mapred.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
-- there are no more than 900 wikis, no matter how we import, usually less
set hive.exec.max.dynamic.partitions=2000;
-- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/daily_edits_by_anonymous_users.hql
b/oozie/mediawiki/history/metrics/daily_edits_by_anonymous_users.hql
index 811094e..bb68b58 100644
--- a/oozie/mediawiki/history/metrics/daily_edits_by_anonymous_users.hql
+++ b/oozie/mediawiki/history/metrics/daily_edits_by_anonymous_users.hql
@@ -19,6 +19,7 @@
set hive.mapred.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
-- there are no more than 900 wikis, no matter how we import, usually less
set hive.exec.max.dynamic.partitions=2000;
-- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/daily_edits_by_bot_users.hql
b/oozie/mediawiki/history/metrics/daily_edits_by_bot_users.hql
index 20719b8..6ebc1f0 100644
--- a/oozie/mediawiki/history/metrics/daily_edits_by_bot_users.hql
+++ b/oozie/mediawiki/history/metrics/daily_edits_by_bot_users.hql
@@ -19,6 +19,7 @@
set hive.mapred.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
-- there are no more than 900 wikis, no matter how we import, usually less
set hive.exec.max.dynamic.partitions=2000;
-- and we only use one node
diff --git
a/oozie/mediawiki/history/metrics/daily_edits_by_registered_users.hql
b/oozie/mediawiki/history/metrics/daily_edits_by_registered_users.hql
index 55ca4a3..649fadd 100644
--- a/oozie/mediawiki/history/metrics/daily_edits_by_registered_users.hql
+++ b/oozie/mediawiki/history/metrics/daily_edits_by_registered_users.hql
@@ -19,6 +19,7 @@
set hive.mapred.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
-- there are no more than 900 wikis, no matter how we import, usually less
set hive.exec.max.dynamic.partitions=2000;
-- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/daily_unique_anonymous_editors.hql
b/oozie/mediawiki/history/metrics/daily_unique_anonymous_editors.hql
index b2904f3..cb236da 100644
--- a/oozie/mediawiki/history/metrics/daily_unique_anonymous_editors.hql
+++ b/oozie/mediawiki/history/metrics/daily_unique_anonymous_editors.hql
@@ -19,6 +19,7 @@
set hive.mapred.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
-- there are no more than 900 wikis, no matter how we import, usually less
set hive.exec.max.dynamic.partitions=2000;
-- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/daily_unique_bot_editors.hql
b/oozie/mediawiki/history/metrics/daily_unique_bot_editors.hql
index 22eeb31..3c5b13f 100644
--- a/oozie/mediawiki/history/metrics/daily_unique_bot_editors.hql
+++ b/oozie/mediawiki/history/metrics/daily_unique_bot_editors.hql
@@ -19,6 +19,7 @@
set hive.mapred.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
-- there are no more than 900 wikis, no matter how we import, usually less
set hive.exec.max.dynamic.partitions=2000;
-- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/daily_unique_editors.hql
b/oozie/mediawiki/history/metrics/daily_unique_editors.hql
index 1826bb7..4a90023 100644
--- a/oozie/mediawiki/history/metrics/daily_unique_editors.hql
+++ b/oozie/mediawiki/history/metrics/daily_unique_editors.hql
@@ -19,6 +19,7 @@
set hive.mapred.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
-- there are no more than 900 wikis, no matter how we import, usually less
set hive.exec.max.dynamic.partitions=2000;
-- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/daily_unique_page_creators.hql
b/oozie/mediawiki/history/metrics/daily_unique_page_creators.hql
index ede4b49..9242bc2 100644
--- a/oozie/mediawiki/history/metrics/daily_unique_page_creators.hql
+++ b/oozie/mediawiki/history/metrics/daily_unique_page_creators.hql
@@ -19,6 +19,7 @@
set hive.mapred.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
-- there are no more than 900 wikis, no matter how we import, usually less
set hive.exec.max.dynamic.partitions=2000;
-- and we only use one node
diff --git
a/oozie/mediawiki/history/metrics/daily_unique_registered_editors.hql
b/oozie/mediawiki/history/metrics/daily_unique_registered_editors.hql
index e7d40b0..3907067 100644
--- a/oozie/mediawiki/history/metrics/daily_unique_registered_editors.hql
+++ b/oozie/mediawiki/history/metrics/daily_unique_registered_editors.hql
@@ -19,6 +19,7 @@
set hive.mapred.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
-- there are no more than 900 wikis, no matter how we import, usually less
set hive.exec.max.dynamic.partitions=2000;
-- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/monthly_new_editors.hql
b/oozie/mediawiki/history/metrics/monthly_new_editors.hql
index 0687a56..40491a0 100644
--- a/oozie/mediawiki/history/metrics/monthly_new_editors.hql
+++ b/oozie/mediawiki/history/metrics/monthly_new_editors.hql
@@ -19,6 +19,7 @@
set hive.mapred.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
-- there are no more than 900 wikis, no matter how we import, usually less
set hive.exec.max.dynamic.partitions=2000;
-- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/monthly_new_registered_users.hql
b/oozie/mediawiki/history/metrics/monthly_new_registered_users.hql
index effc373..4e930e3 100644
--- a/oozie/mediawiki/history/metrics/monthly_new_registered_users.hql
+++ b/oozie/mediawiki/history/metrics/monthly_new_registered_users.hql
@@ -19,6 +19,7 @@
set hive.mapred.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
-- there are no more than 900 wikis, no matter how we import, usually less
set hive.exec.max.dynamic.partitions=2000;
-- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/monthly_surviving_new_editors.hql
b/oozie/mediawiki/history/metrics/monthly_surviving_new_editors.hql
index d708087..0195027 100644
--- a/oozie/mediawiki/history/metrics/monthly_surviving_new_editors.hql
+++ b/oozie/mediawiki/history/metrics/monthly_surviving_new_editors.hql
@@ -19,6 +19,7 @@
set hive.mapred.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
-- there are no more than 900 wikis, no matter how we import, usually less
set hive.exec.max.dynamic.partitions=2000;
-- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/workflow.xml
b/oozie/mediawiki/history/metrics/workflow.xml
new file mode 100644
index 0000000..181587d
--- /dev/null
+++ b/oozie/mediawiki/history/metrics/workflow.xml
@@ -0,0 +1,521 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<workflow-app xmlns="uri:oozie:workflow:0.4"
+ name="mediawiki-history-metrics-wf-${snapshot}">
+
+ <parameters>
+
+ <!-- Default values for inner oozie settings -->
+ <property>
+ <name>oozie_launcher_queue_name</name>
+ <value>${queue_name}</value>
+ </property>
+ <property>
+ <name>oozie_launcher_memory</name>
+ <value>2048</value>
+ </property>
+
+ <!-- Required properties -->
+ <property><name>queue_name</name></property>
+ <property><name>name_node</name></property>
+ <property><name>job_tracker</name></property>
+
+ <property>
+ <name>metrics_location</name>
+ <description>location to mark the metrics dataset
done</description>
+ </property>
+
+ <!-- Hive tables-->
+ <property>
+ <name>mw_denormalized_history_table</name>
+ <description>Recomputed denormalized history table</description>
+ </property>
+ <property>
+ <name>mw_metrics_table</name>
+ <description>Mediawiki metrics table</description>
+ </property>
+
+ <property>
+ <name>start_timestamp</name>
+ <description>Start timestamp (YYYYMMDDHHMMSS format) to restrict
metrics computation</description>
+ </property>
+ <property>
+ <name>end_timestamp</name>
+ <description>End timestamp (YYYYMMDDHHMMSS format) to restrict
metrics computation</description>
+ </property>
+ <property>
+ <name>wiki_db</name>
+ <description>Wikis to compute metrics for (can be
'all')</description>
+ </property>
+
+ <property>
+ <name>snapshot</name>
+ <description>The snapshot partition used (usually
YYYY-MM)</description>
+ </property>
+
+ <!-- Subworkflows -->
+ <property>
+ <name>mark_directory_done_workflow_file</name>
+ <description>Workflow for marking a directory done</description>
+ </property>
+ <property>
+ <name>send_error_email_workflow_file</name>
+ <description>Workflow for sending an email</description>
+ </property>
+ </parameters>
+
+ <start to="compute_daily_edits"/>
+
+ <action name="compute_daily_edits">
+ <hive xmlns="uri:oozie:hive-action:0.3">
+ <job-tracker>${job_tracker}</job-tracker>
+ <name-node>${name_node}</name-node>
+ <job-xml>${hive_site_xml}</job-xml>
+ <configuration>
+ <!--make sure oozie:launcher runs in a low priority queue -->
+ <property>
+ <name>oozie.launcher.mapred.job.queue.name</name>
+ <value>${oozie_launcher_queue_name}</value>
+ </property>
+ <property>
+ <name>oozie.launcher.mapreduce.map.memory.mb</name>
+ <value>${oozie_launcher_memory}</value>
+ </property>
+ <property>
+ <name>mapreduce.job.queuename</name>
+ <value>${queue_name}</value>
+ </property>
+ </configuration>
+
+ <script>daily_edits.hql</script>
+
+ <param>source_table=${mw_denormalized_history_table}</param>
+ <param>destination_table=${mw_metrics_table}</param>
+ <param>start_timestamp=${start_timestamp}</param>
+ <param>end_timestamp=${end_timestamp}</param>
+ <param>wiki_db=${wiki_db}</param>
+ <param>snapshot=${snapshot}</param>
+ </hive>
+ <ok to="compute_daily_edits_by_anonymous_users"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="compute_daily_edits_by_anonymous_users">
+ <hive xmlns="uri:oozie:hive-action:0.3">
+ <job-tracker>${job_tracker}</job-tracker>
+ <name-node>${name_node}</name-node>
+ <job-xml>${hive_site_xml}</job-xml>
+ <configuration>
+ <!--make sure oozie:launcher runs in a low priority queue -->
+ <property>
+ <name>oozie.launcher.mapred.job.queue.name</name>
+ <value>${oozie_launcher_queue_name}</value>
+ </property>
+ <property>
+ <name>oozie.launcher.mapreduce.map.memory.mb</name>
+ <value>${oozie_launcher_memory}</value>
+ </property>
+ <property>
+ <name>mapreduce.job.queuename</name>
+ <value>${queue_name}</value>
+ </property>
+ </configuration>
+
+ <script>daily_edits_by_anonymous_users.hql</script>
+
+ <param>source_table=${mw_denormalized_history_table}</param>
+ <param>destination_table=${mw_metrics_table}</param>
+ <param>start_timestamp=${start_timestamp}</param>
+ <param>end_timestamp=${end_timestamp}</param>
+ <param>wiki_db=${wiki_db}</param>
+ <param>snapshot=${snapshot}</param>
+ </hive>
+ <ok to="compute_daily_edits_by_bot_users"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="compute_daily_edits_by_bot_users">
+ <hive xmlns="uri:oozie:hive-action:0.3">
+ <job-tracker>${job_tracker}</job-tracker>
+ <name-node>${name_node}</name-node>
+ <job-xml>${hive_site_xml}</job-xml>
+ <configuration>
+ <!--make sure oozie:launcher runs in a low priority queue -->
+ <property>
+ <name>oozie.launcher.mapred.job.queue.name</name>
+ <value>${oozie_launcher_queue_name}</value>
+ </property>
+ <property>
+ <name>oozie.launcher.mapreduce.map.memory.mb</name>
+ <value>${oozie_launcher_memory}</value>
+ </property>
+ <property>
+ <name>mapreduce.job.queuename</name>
+ <value>${queue_name}</value>
+ </property>
+ </configuration>
+
+ <script>daily_edits_by_bot_users.hql</script>
+
+ <param>source_table=${mw_denormalized_history_table}</param>
+ <param>destination_table=${mw_metrics_table}</param>
+ <param>start_timestamp=${start_timestamp}</param>
+ <param>end_timestamp=${end_timestamp}</param>
+ <param>wiki_db=${wiki_db}</param>
+ <param>snapshot=${snapshot}</param>
+ </hive>
+ <ok to="compute_daily_edits_by_registered_users"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="compute_daily_edits_by_registered_users">
+ <hive xmlns="uri:oozie:hive-action:0.3">
+ <job-tracker>${job_tracker}</job-tracker>
+ <name-node>${name_node}</name-node>
+ <job-xml>${hive_site_xml}</job-xml>
+ <configuration>
+ <!--make sure oozie:launcher runs in a low priority queue -->
+ <property>
+ <name>oozie.launcher.mapred.job.queue.name</name>
+ <value>${oozie_launcher_queue_name}</value>
+ </property>
+ <property>
+ <name>oozie.launcher.mapreduce.map.memory.mb</name>
+ <value>${oozie_launcher_memory}</value>
+ </property>
+ <property>
+ <name>mapreduce.job.queuename</name>
+ <value>${queue_name}</value>
+ </property>
+ </configuration>
+
+ <script>daily_edits_by_registered_users.hql</script>
+
+ <param>source_table=${mw_denormalized_history_table}</param>
+ <param>destination_table=${mw_metrics_table}</param>
+ <param>start_timestamp=${start_timestamp}</param>
+ <param>end_timestamp=${end_timestamp}</param>
+ <param>wiki_db=${wiki_db}</param>
+ <param>snapshot=${snapshot}</param>
+ </hive>
+ <ok to="compute_daily_unique_anonymous_editors"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="compute_daily_unique_anonymous_editors">
+ <hive xmlns="uri:oozie:hive-action:0.3">
+ <job-tracker>${job_tracker}</job-tracker>
+ <name-node>${name_node}</name-node>
+ <job-xml>${hive_site_xml}</job-xml>
+ <configuration>
+ <!--make sure oozie:launcher runs in a low priority queue -->
+ <property>
+ <name>oozie.launcher.mapred.job.queue.name</name>
+ <value>${oozie_launcher_queue_name}</value>
+ </property>
+ <property>
+ <name>oozie.launcher.mapreduce.map.memory.mb</name>
+ <value>${oozie_launcher_memory}</value>
+ </property>
+ <property>
+ <name>mapreduce.job.queuename</name>
+ <value>${queue_name}</value>
+ </property>
+ </configuration>
+
+ <script>daily_unique_anonymous_editors.hql</script>
+
+ <param>source_table=${mw_denormalized_history_table}</param>
+ <param>destination_table=${mw_metrics_table}</param>
+ <param>start_timestamp=${start_timestamp}</param>
+ <param>end_timestamp=${end_timestamp}</param>
+ <param>wiki_db=${wiki_db}</param>
+ <param>snapshot=${snapshot}</param>
+ </hive>
+ <ok to="compute_daily_unique_bot_editors"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="compute_daily_unique_bot_editors">
+ <hive xmlns="uri:oozie:hive-action:0.3">
+ <job-tracker>${job_tracker}</job-tracker>
+ <name-node>${name_node}</name-node>
+ <job-xml>${hive_site_xml}</job-xml>
+ <configuration>
+ <!--make sure oozie:launcher runs in a low priority queue -->
+ <property>
+ <name>oozie.launcher.mapred.job.queue.name</name>
+ <value>${oozie_launcher_queue_name}</value>
+ </property>
+ <property>
+ <name>oozie.launcher.mapreduce.map.memory.mb</name>
+ <value>${oozie_launcher_memory}</value>
+ </property>
+ <property>
+ <name>mapreduce.job.queuename</name>
+ <value>${queue_name}</value>
+ </property>
+ </configuration>
+
+ <script>daily_unique_bot_editors.hql</script>
+
+ <param>source_table=${mw_denormalized_history_table}</param>
+ <param>destination_table=${mw_metrics_table}</param>
+ <param>start_timestamp=${start_timestamp}</param>
+ <param>end_timestamp=${end_timestamp}</param>
+ <param>wiki_db=${wiki_db}</param>
+ <param>snapshot=${snapshot}</param>
+ </hive>
+ <ok to="compute_daily_unique_editors"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="compute_daily_unique_editors">
+ <hive xmlns="uri:oozie:hive-action:0.3">
+ <job-tracker>${job_tracker}</job-tracker>
+ <name-node>${name_node}</name-node>
+ <job-xml>${hive_site_xml}</job-xml>
+ <configuration>
+ <!--make sure oozie:launcher runs in a low priority queue -->
+ <property>
+ <name>oozie.launcher.mapred.job.queue.name</name>
+ <value>${oozie_launcher_queue_name}</value>
+ </property>
+ <property>
+ <name>oozie.launcher.mapreduce.map.memory.mb</name>
+ <value>${oozie_launcher_memory}</value>
+ </property>
+ <property>
+ <name>mapreduce.job.queuename</name>
+ <value>${queue_name}</value>
+ </property>
+ </configuration>
+
+ <script>daily_unique_editors.hql</script>
+
+ <param>source_table=${mw_denormalized_history_table}</param>
+ <param>destination_table=${mw_metrics_table}</param>
+ <param>start_timestamp=${start_timestamp}</param>
+ <param>end_timestamp=${end_timestamp}</param>
+ <param>wiki_db=${wiki_db}</param>
+ <param>snapshot=${snapshot}</param>
+ </hive>
+ <ok to="compute_daily_unique_page_creators"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="compute_daily_unique_page_creators">
+ <hive xmlns="uri:oozie:hive-action:0.3">
+ <job-tracker>${job_tracker}</job-tracker>
+ <name-node>${name_node}</name-node>
+ <job-xml>${hive_site_xml}</job-xml>
+ <configuration>
+ <!--make sure oozie:launcher runs in a low priority queue -->
+ <property>
+ <name>oozie.launcher.mapred.job.queue.name</name>
+ <value>${oozie_launcher_queue_name}</value>
+ </property>
+ <property>
+ <name>oozie.launcher.mapreduce.map.memory.mb</name>
+ <value>${oozie_launcher_memory}</value>
+ </property>
+ <property>
+ <name>mapreduce.job.queuename</name>
+ <value>${queue_name}</value>
+ </property>
+ </configuration>
+
+ <script>daily_unique_page_creators.hql</script>
+
+ <param>source_table=${mw_denormalized_history_table}</param>
+ <param>destination_table=${mw_metrics_table}</param>
+ <param>start_timestamp=${start_timestamp}</param>
+ <param>end_timestamp=${end_timestamp}</param>
+ <param>wiki_db=${wiki_db}</param>
+ <param>snapshot=${snapshot}</param>
+ </hive>
+ <ok to="compute_daily_unique_registered_editors"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="compute_daily_unique_registered_editors">
+ <hive xmlns="uri:oozie:hive-action:0.3">
+ <job-tracker>${job_tracker}</job-tracker>
+ <name-node>${name_node}</name-node>
+ <job-xml>${hive_site_xml}</job-xml>
+ <configuration>
+ <!--make sure oozie:launcher runs in a low priority queue -->
+ <property>
+ <name>oozie.launcher.mapred.job.queue.name</name>
+ <value>${oozie_launcher_queue_name}</value>
+ </property>
+ <property>
+ <name>oozie.launcher.mapreduce.map.memory.mb</name>
+ <value>${oozie_launcher_memory}</value>
+ </property>
+ <property>
+ <name>mapreduce.job.queuename</name>
+ <value>${queue_name}</value>
+ </property>
+ </configuration>
+
+ <script>daily_unique_registered_editors.hql</script>
+
+ <param>source_table=${mw_denormalized_history_table}</param>
+ <param>destination_table=${mw_metrics_table}</param>
+ <param>start_timestamp=${start_timestamp}</param>
+ <param>end_timestamp=${end_timestamp}</param>
+ <param>wiki_db=${wiki_db}</param>
+ <param>snapshot=${snapshot}</param>
+ </hive>
+ <ok to="compute_monthly_new_editors"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="compute_monthly_new_editors">
+ <hive xmlns="uri:oozie:hive-action:0.3">
+ <job-tracker>${job_tracker}</job-tracker>
+ <name-node>${name_node}</name-node>
+ <job-xml>${hive_site_xml}</job-xml>
+ <configuration>
+ <!--make sure oozie:launcher runs in a low priority queue -->
+ <property>
+ <name>oozie.launcher.mapred.job.queue.name</name>
+ <value>${oozie_launcher_queue_name}</value>
+ </property>
+ <property>
+ <name>oozie.launcher.mapreduce.map.memory.mb</name>
+ <value>${oozie_launcher_memory}</value>
+ </property>
+ <property>
+ <name>mapreduce.job.queuename</name>
+ <value>${queue_name}</value>
+ </property>
+ </configuration>
+
+ <script>monthly_new_editors.hql</script>
+
+ <param>source_table=${mw_denormalized_history_table}</param>
+ <param>destination_table=${mw_metrics_table}</param>
+ <param>start_timestamp=${start_timestamp}</param>
+ <param>end_timestamp=${end_timestamp}</param>
+ <param>wiki_db=${wiki_db}</param>
+ <param>snapshot=${snapshot}</param>
+ </hive>
+ <ok to="compute_monthly_new_registered_users"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="compute_monthly_new_registered_users">
+ <hive xmlns="uri:oozie:hive-action:0.3">
+ <job-tracker>${job_tracker}</job-tracker>
+ <name-node>${name_node}</name-node>
+ <job-xml>${hive_site_xml}</job-xml>
+ <configuration>
+ <!--make sure oozie:launcher runs in a low priority queue -->
+ <property>
+ <name>oozie.launcher.mapred.job.queue.name</name>
+ <value>${oozie_launcher_queue_name}</value>
+ </property>
+ <property>
+ <name>oozie.launcher.mapreduce.map.memory.mb</name>
+ <value>${oozie_launcher_memory}</value>
+ </property>
+ <property>
+ <name>mapreduce.job.queuename</name>
+ <value>${queue_name}</value>
+ </property>
+ </configuration>
+
+ <script>monthly_new_registered_users.hql</script>
+
+ <param>source_table=${mw_denormalized_history_table}</param>
+ <param>destination_table=${mw_metrics_table}</param>
+ <param>start_timestamp=${start_timestamp}</param>
+ <param>end_timestamp=${end_timestamp}</param>
+ <param>wiki_db=${wiki_db}</param>
+ <param>snapshot=${snapshot}</param>
+ </hive>
+ <ok to="compute_monthly_surviving_new_editors"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="compute_monthly_surviving_new_editors">
+ <hive xmlns="uri:oozie:hive-action:0.3">
+ <job-tracker>${job_tracker}</job-tracker>
+ <name-node>${name_node}</name-node>
+ <job-xml>${hive_site_xml}</job-xml>
+ <configuration>
+ <!--make sure oozie:launcher runs in a low priority queue -->
+ <property>
+ <name>oozie.launcher.mapred.job.queue.name</name>
+ <value>${oozie_launcher_queue_name}</value>
+ </property>
+ <property>
+ <name>oozie.launcher.mapreduce.map.memory.mb</name>
+ <value>${oozie_launcher_memory}</value>
+ </property>
+ <property>
+ <name>mapreduce.job.queuename</name>
+ <value>${queue_name}</value>
+ </property>
+ </configuration>
+
+ <script>monthly_surviving_new_editors.hql</script>
+
+ <param>source_table=${mw_denormalized_history_table}</param>
+ <param>destination_table=${mw_metrics_table}</param>
+ <param>start_timestamp=${start_timestamp}</param>
+ <param>end_timestamp=${end_timestamp}</param>
+ <param>wiki_db=${wiki_db}</param>
+ <param>snapshot=${snapshot}</param>
+ </hive>
+ <ok to="mark_metrics_dataset_done"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="mark_metrics_dataset_done">
+ <sub-workflow>
+ <app-path>${mark_directory_done_workflow_file}</app-path>
+ <configuration>
+ <property>
+ <name>directory</name>
+ <value>${metrics_location}</value>
+ </property>
+ </configuration>
+ </sub-workflow>
+ <ok to="end"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="send_error_email">
+ <sub-workflow>
+ <app-path>${send_error_email_workflow_file}</app-path>
+ <propagate-configuration/>
+ <configuration>
+ <property>
+ <name>parent_name</name>
+ <value>${wf:name()}</value>
+ </property>
+ <property>
+ <name>parent_failed_action</name>
+ <value>${wf:lastErrorNode()}</value>
+ </property>
+ <property>
+ <name>parent_error_code</name>
+ <value>${wf:errorCode(wf:lastErrorNode())}</value>
+ </property>
+ <property>
+ <name>parent_error_message</name>
+ <value>${wf:errorMessage(wf:lastErrorNode())}</value>
+ </property>
+ </configuration>
+ </sub-workflow>
+ <ok to="kill"/>
+ <error to="kill"/>
+ </action>
+
+ <kill name="kill">
+ <message>Action failed, error
message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+ </kill>
+ <end name="end"/>
+</workflow-app>
--
To view, visit https://gerrit.wikimedia.org/r/342197
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ic5fddf5cc99b0c428f6685d0ef495ee2fe0c449b
Gerrit-PatchSet: 11
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Joal <[email protected]>
Gerrit-Reviewer: Joal <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits