Joal has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/342197 )

Change subject: Add oozie job for standard metrics computation
......................................................................


Add oozie job for standard metrics computation

Add oozie coordinator and workflow to regularly compute metrics
over a mediawiki history snapshot

Bug: T160151
Change-Id: Ic5fddf5cc99b0c428f6685d0ef495ee2fe0c449b
---
M oozie/mediawiki/history/datasets.xml
M oozie/mediawiki/history/denormalize/coordinator.properties
A oozie/mediawiki/history/metrics/coordinator.properties
A oozie/mediawiki/history/metrics/coordinator.xml
M oozie/mediawiki/history/metrics/daily_edits.hql
M oozie/mediawiki/history/metrics/daily_edits_by_anonymous_users.hql
M oozie/mediawiki/history/metrics/daily_edits_by_bot_users.hql
M oozie/mediawiki/history/metrics/daily_edits_by_registered_users.hql
M oozie/mediawiki/history/metrics/daily_unique_anonymous_editors.hql
M oozie/mediawiki/history/metrics/daily_unique_bot_editors.hql
M oozie/mediawiki/history/metrics/daily_unique_editors.hql
M oozie/mediawiki/history/metrics/daily_unique_page_creators.hql
M oozie/mediawiki/history/metrics/daily_unique_registered_editors.hql
M oozie/mediawiki/history/metrics/monthly_new_editors.hql
M oozie/mediawiki/history/metrics/monthly_new_registered_users.hql
M oozie/mediawiki/history/metrics/monthly_surviving_new_editors.hql
A oozie/mediawiki/history/metrics/workflow.xml
17 files changed, 696 insertions(+), 2 deletions(-)

Approvals:
  Joal: Verified; Looks good to me, approved



diff --git a/oozie/mediawiki/history/datasets.xml 
b/oozie/mediawiki/history/datasets.xml
index 8845ffd..c2232d8 100644
--- a/oozie/mediawiki/history/datasets.xml
+++ b/oozie/mediawiki/history/datasets.xml
@@ -69,7 +69,6 @@
     </dataset>
 
     <!-- History metrics datasets -->
-
     <dataset name="mw_metrics"
              frequency="${coord:months(1)}"
              initial-instance="${start_time}"
diff --git a/oozie/mediawiki/history/denormalize/coordinator.properties 
b/oozie/mediawiki/history/denormalize/coordinator.properties
index 62ec627..65a3136 100644
--- a/oozie/mediawiki/history/denormalize/coordinator.properties
+++ b/oozie/mediawiki/history/denormalize/coordinator.properties
@@ -76,7 +76,7 @@
 spark_partitions_number           = 1024
 tmp_path                          = 
${name_node}/tmp/mediawiki/history/checkpoints
 
-# Workflow to add a partition
+# Workflow to repair partitions
 repair_partitions_workflow_file   = 
${oozie_directory}/util/hive/partition/repair/workflow.xml
 
 # Workflow to mark a directory as done
diff --git a/oozie/mediawiki/history/metrics/coordinator.properties 
b/oozie/mediawiki/history/metrics/coordinator.properties
new file mode 100644
index 0000000..2550d6f
--- /dev/null
+++ b/oozie/mediawiki/history/metrics/coordinator.properties
@@ -0,0 +1,63 @@
+# Configures a coordinator to automatically compute standard metrics from a 
mediawiki history snapshot
+# Usage:
+#   oozie job -Duser=$USER -Dstart_time=2015-08-01T00:00Z -submit -config 
oozie/mediawiki/history/metrics/coordinator.properties
+#
+# NOTE:  The $oozie_directory must be synced to HDFS so that all relevant
+#        .xml files exist there when this job is submitted.
+
+name_node                         = hdfs://analytics-hadoop
+job_tracker                       = resourcemanager.analytics.eqiad.wmnet:8032
+queue_name                        = default
+
+#Default user
+user                              = hdfs
+
+# Base path in HDFS to refinery.
+# When submitting this job for production, you should override this to point 
directly at a deployed
+# directory name, and not the 'symbolic' 'current' directory. E.g. 
/wmf/refinery/2015-01-05T17.59.18Z--7bb7f07
+refinery_directory                = ${name_node}/wmf/refinery/current
+
+# Base path in HDFS to oozie files.
+# Other files will be used relative to this path.
+oozie_directory                   = ${refinery_directory}/oozie
+
+# HDFS path to coordinator to run.
+coordinator_file                  = 
${oozie_directory}/mediawiki/history/metrics/coordinator.xml
+
+# HDFS path to workflow to run.
+workflow_file                     = 
${oozie_directory}/mediawiki/history/metrics/workflow.xml
+
+# HDFS path to hive-site.xml file.  This is needed to run hive actions.
+hive_site_xml                     = ${name_node}/user/hive/hive-site.xml
+
+# HDFS path to mediawiki history datasets definitions
+datasets_file                     = 
${oozie_directory}/mediawiki/history/datasets.xml
+mw_directory                      = ${name_node}/wmf/data/wmf/mediawiki
+
+# Hive tables: denormalized history (source) and metrics (destination)
+mw_denormalized_history_table     = wmf.mediawiki_history
+mw_metrics_table                  = wmf.mediawiki_metrics
+
+# Run metrics for all wikis, starting 2001
+wiki_db                           = all
+start_timestamp                   = 20010101000000
+
+# Start time of this coordinator (also the initial instance of the mediawiki history datasets).
+start_time                        = 2017-03-01T00:00Z
+
+# Time to stop running this coordinator.  Year 3000 == never!
+stop_time                         = 3000-01-01T00:00Z
+
+# Workflow to mark a directory as done
+mark_directory_done_workflow_file = 
${oozie_directory}/util/mark_directory_done/workflow.xml
+
+# Workflow to send an error email
+send_error_email_workflow_file    = 
${oozie_directory}/util/send_error_email/workflow.xml
+
+# email address to contact in case of SLA errors
+sla_alert_contact                 = [email protected]
+
+# Coordinator to start.
+oozie.coord.application.path     = ${coordinator_file}
+oozie.use.system.libpath          = true
+oozie.action.external.stats.write = true
\ No newline at end of file
diff --git a/oozie/mediawiki/history/metrics/coordinator.xml 
b/oozie/mediawiki/history/metrics/coordinator.xml
new file mode 100644
index 0000000..42050e7
--- /dev/null
+++ b/oozie/mediawiki/history/metrics/coordinator.xml
@@ -0,0 +1,99 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<coordinator-app xmlns="uri:oozie:coordinator:0.4"
+    xmlns:sla="uri:oozie:sla:0.2"
+    name="mediawiki-history-metrics-coord"
+    frequency="${coord:months(1)}"
+    start="${start_time}"
+    end="${stop_time}"
+    timezone="Universal">
+
+    <parameters>
+        <!-- Required properties -->
+        <property><name>queue_name</name></property>
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+        <property><name>workflow_file</name></property>
+        <property><name>start_time</name></property>
+        <property><name>stop_time</name></property>
+
+        <property><name>datasets_file</name></property>
+        <property><name>mw_directory</name></property>
+
+
+        <property><name>mw_denormalized_history_table</name></property>
+        <property><name>mw_metrics_table</name></property>
+
+        <property><name>start_timestamp</name></property>
+        <property><name>wiki_db</name></property>
+
+        <property><name>hive_site_xml</name></property>
+        <property><name>mark_directory_done_workflow_file</name></property>
+        <property><name>send_error_email_workflow_file</name></property>
+
+        <property><name>sla_alert_contact</name></property>
+    </parameters>
+
+    <controls>
+        <!-- timeout is measured in minutes; -1 means no timeout -->
+        <timeout>-1</timeout>
+
+        <!-- Setting low concurrency for resource sharing.
+             The job runs pretty fast (~1 minute) and increasing concurrency 
should not cause any problems-->
+        <concurrency>1</concurrency>
+
+        <throttle>2</throttle>
+
+    </controls>
+
+    <datasets>
+        <!--
+        Include the mediawiki history datasets definitions.
+        $datasets_file provides the datasets used by the input and output events below
+        -->
+        <include>${datasets_file}</include>
+    </datasets>
+
+    <input-events>
+        <data-in name="mw_denormalized_history_partitioned" 
dataset="mw_denormalized_history_partitioned">
+            <instance>${coord:current(0)}</instance>
+        </data-in>
+    </input-events>
+
+    <output-events>
+        <data-out name="mw_metrics" dataset="mw_metrics">
+            <instance>${coord:current(0)}</instance>
+        </data-out>
+    </output-events>
+
+    <action>
+        <workflow>
+            <app-path>${workflow_file}</app-path>
+            <configuration>
+                <property>
+                    <name>snapshot</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"yyyy")}-${coord:formatTime(coord:nominalTime(), "MM")}</value>
+                </property>
+                <property>
+                    <name>end_timestamp</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"yyyy")}${coord:formatTime(coord:nominalTime(), "MM")}01000000</value>
+                </property>
+                <property>
+                    <name>metrics_location</name>
+                    <value>${coord:dataOut('mw_metrics')}</value>
+                </property>
+            </configuration>
+        </workflow>
+
+        <sla:info>
+            <!--
+                Use action actual time as SLA base, since it's the time used
+                to compute timeout
+            -->
+            <sla:nominal-time>${coord:actualTime()}</sla:nominal-time>
+            <sla:should-end>${4 * DAYS}</sla:should-end>
+            <sla:alert-events>end_miss</sla:alert-events>
+            <sla:alert-contact>${sla_alert_contact}</sla:alert-contact>
+        </sla:info>
+
+    </action>
+</coordinator-app>
\ No newline at end of file
diff --git a/oozie/mediawiki/history/metrics/daily_edits.hql 
b/oozie/mediawiki/history/metrics/daily_edits.hql
index 5195b26..e3d5fd5 100644
--- a/oozie/mediawiki/history/metrics/daily_edits.hql
+++ b/oozie/mediawiki/history/metrics/daily_edits.hql
@@ -19,6 +19,7 @@
 set hive.mapred.mode=nonstrict;
 set hive.exec.dynamic.partition=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
 -- there are no more than 900 wikis, no matter how we import, usually less
 set hive.exec.max.dynamic.partitions=2000;
 -- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/daily_edits_by_anonymous_users.hql 
b/oozie/mediawiki/history/metrics/daily_edits_by_anonymous_users.hql
index 811094e..bb68b58 100644
--- a/oozie/mediawiki/history/metrics/daily_edits_by_anonymous_users.hql
+++ b/oozie/mediawiki/history/metrics/daily_edits_by_anonymous_users.hql
@@ -19,6 +19,7 @@
 set hive.mapred.mode=nonstrict;
 set hive.exec.dynamic.partition=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
 -- there are no more than 900 wikis, no matter how we import, usually less
 set hive.exec.max.dynamic.partitions=2000;
 -- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/daily_edits_by_bot_users.hql 
b/oozie/mediawiki/history/metrics/daily_edits_by_bot_users.hql
index 20719b8..6ebc1f0 100644
--- a/oozie/mediawiki/history/metrics/daily_edits_by_bot_users.hql
+++ b/oozie/mediawiki/history/metrics/daily_edits_by_bot_users.hql
@@ -19,6 +19,7 @@
 set hive.mapred.mode=nonstrict;
 set hive.exec.dynamic.partition=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
 -- there are no more than 900 wikis, no matter how we import, usually less
 set hive.exec.max.dynamic.partitions=2000;
 -- and we only use one node
diff --git 
a/oozie/mediawiki/history/metrics/daily_edits_by_registered_users.hql 
b/oozie/mediawiki/history/metrics/daily_edits_by_registered_users.hql
index 55ca4a3..649fadd 100644
--- a/oozie/mediawiki/history/metrics/daily_edits_by_registered_users.hql
+++ b/oozie/mediawiki/history/metrics/daily_edits_by_registered_users.hql
@@ -19,6 +19,7 @@
 set hive.mapred.mode=nonstrict;
 set hive.exec.dynamic.partition=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
 -- there are no more than 900 wikis, no matter how we import, usually less
 set hive.exec.max.dynamic.partitions=2000;
 -- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/daily_unique_anonymous_editors.hql 
b/oozie/mediawiki/history/metrics/daily_unique_anonymous_editors.hql
index b2904f3..cb236da 100644
--- a/oozie/mediawiki/history/metrics/daily_unique_anonymous_editors.hql
+++ b/oozie/mediawiki/history/metrics/daily_unique_anonymous_editors.hql
@@ -19,6 +19,7 @@
 set hive.mapred.mode=nonstrict;
 set hive.exec.dynamic.partition=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
 -- there are no more than 900 wikis, no matter how we import, usually less
 set hive.exec.max.dynamic.partitions=2000;
 -- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/daily_unique_bot_editors.hql 
b/oozie/mediawiki/history/metrics/daily_unique_bot_editors.hql
index 22eeb31..3c5b13f 100644
--- a/oozie/mediawiki/history/metrics/daily_unique_bot_editors.hql
+++ b/oozie/mediawiki/history/metrics/daily_unique_bot_editors.hql
@@ -19,6 +19,7 @@
 set hive.mapred.mode=nonstrict;
 set hive.exec.dynamic.partition=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
 -- there are no more than 900 wikis, no matter how we import, usually less
 set hive.exec.max.dynamic.partitions=2000;
 -- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/daily_unique_editors.hql 
b/oozie/mediawiki/history/metrics/daily_unique_editors.hql
index 1826bb7..4a90023 100644
--- a/oozie/mediawiki/history/metrics/daily_unique_editors.hql
+++ b/oozie/mediawiki/history/metrics/daily_unique_editors.hql
@@ -19,6 +19,7 @@
 set hive.mapred.mode=nonstrict;
 set hive.exec.dynamic.partition=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
 -- there are no more than 900 wikis, no matter how we import, usually less
 set hive.exec.max.dynamic.partitions=2000;
 -- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/daily_unique_page_creators.hql 
b/oozie/mediawiki/history/metrics/daily_unique_page_creators.hql
index ede4b49..9242bc2 100644
--- a/oozie/mediawiki/history/metrics/daily_unique_page_creators.hql
+++ b/oozie/mediawiki/history/metrics/daily_unique_page_creators.hql
@@ -19,6 +19,7 @@
 set hive.mapred.mode=nonstrict;
 set hive.exec.dynamic.partition=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
 -- there are no more than 900 wikis, no matter how we import, usually less
 set hive.exec.max.dynamic.partitions=2000;
 -- and we only use one node
diff --git 
a/oozie/mediawiki/history/metrics/daily_unique_registered_editors.hql 
b/oozie/mediawiki/history/metrics/daily_unique_registered_editors.hql
index e7d40b0..3907067 100644
--- a/oozie/mediawiki/history/metrics/daily_unique_registered_editors.hql
+++ b/oozie/mediawiki/history/metrics/daily_unique_registered_editors.hql
@@ -19,6 +19,7 @@
 set hive.mapred.mode=nonstrict;
 set hive.exec.dynamic.partition=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
 -- there are no more than 900 wikis, no matter how we import, usually less
 set hive.exec.max.dynamic.partitions=2000;
 -- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/monthly_new_editors.hql 
b/oozie/mediawiki/history/metrics/monthly_new_editors.hql
index 0687a56..40491a0 100644
--- a/oozie/mediawiki/history/metrics/monthly_new_editors.hql
+++ b/oozie/mediawiki/history/metrics/monthly_new_editors.hql
@@ -19,6 +19,7 @@
 set hive.mapred.mode=nonstrict;
 set hive.exec.dynamic.partition=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
 -- there are no more than 900 wikis, no matter how we import, usually less
 set hive.exec.max.dynamic.partitions=2000;
 -- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/monthly_new_registered_users.hql 
b/oozie/mediawiki/history/metrics/monthly_new_registered_users.hql
index effc373..4e930e3 100644
--- a/oozie/mediawiki/history/metrics/monthly_new_registered_users.hql
+++ b/oozie/mediawiki/history/metrics/monthly_new_registered_users.hql
@@ -19,6 +19,7 @@
 set hive.mapred.mode=nonstrict;
 set hive.exec.dynamic.partition=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
 -- there are no more than 900 wikis, no matter how we import, usually less
 set hive.exec.max.dynamic.partitions=2000;
 -- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/monthly_surviving_new_editors.hql 
b/oozie/mediawiki/history/metrics/monthly_surviving_new_editors.hql
index d708087..0195027 100644
--- a/oozie/mediawiki/history/metrics/monthly_surviving_new_editors.hql
+++ b/oozie/mediawiki/history/metrics/monthly_surviving_new_editors.hql
@@ -19,6 +19,7 @@
 set hive.mapred.mode=nonstrict;
 set hive.exec.dynamic.partition=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.error.on.empty.partition=false;
 -- there are no more than 900 wikis, no matter how we import, usually less
 set hive.exec.max.dynamic.partitions=2000;
 -- and we only use one node
diff --git a/oozie/mediawiki/history/metrics/workflow.xml 
b/oozie/mediawiki/history/metrics/workflow.xml
new file mode 100644
index 0000000..181587d
--- /dev/null
+++ b/oozie/mediawiki/history/metrics/workflow.xml
@@ -0,0 +1,521 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<workflow-app xmlns="uri:oozie:workflow:0.4"
+    name="mediawiki-history-metrics-wf-${snapshot}">
+
+    <parameters>
+
+        <!-- Default values for inner oozie settings -->
+        <property>
+            <name>oozie_launcher_queue_name</name>
+            <value>${queue_name}</value>
+        </property>
+        <property>
+            <name>oozie_launcher_memory</name>
+            <value>2048</value>
+        </property>
+
+        <!-- Required properties (NOTE: hive_site_xml is used by every hive action below but is not declared here) -->
+        <property><name>queue_name</name></property>
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+
+        <property>
+            <name>metrics_location</name>
+            <description>location to mark the metrics dataset 
done</description>
+        </property>
+
+        <!-- Hive tables-->
+        <property>
+            <name>mw_denormalized_history_table</name>
+            <description>Recomputed denormalized history table</description>
+        </property>
+        <property>
+            <name>mw_metrics_table</name>
+            <description>Mediawiki metrics table</description>
+        </property>
+
+        <property>
+            <name>start_timestamp</name>
+            <description>Start timestamp (YYYYMMDDHHMMSS format) to restrict 
metrics computation</description>
+        </property>
+        <property>
+            <name>end_timestamp</name>
+            <description>End timestamp (YYYYMMDDHHMMSS format) to restrict 
metrics computation</description>
+        </property>
+        <property>
+            <name>wiki_db</name>
+            <description>Wikis to compute metrics for (can be 
'all')</description>
+        </property>
+
+        <property>
+            <name>snapshot</name>
+            <description>The snapshot partition used (usually 
YYYY-MM)</description>
+        </property>
+
+        <!-- Subworkflows -->
+        <property>
+            <name>mark_directory_done_workflow_file</name>
+            <description>Workflow for marking a directory done</description>
+        </property>
+        <property>
+            <name>send_error_email_workflow_file</name>
+            <description>Workflow for sending an email</description>
+        </property>
+    </parameters>
+
+    <start to="compute_daily_edits"/>
+
+    <action name="compute_daily_edits">
+        <hive xmlns="uri:oozie:hive-action:0.3">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <!--make sure oozie:launcher runs in a low priority queue -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+            </configuration>
+
+            <script>daily_edits.hql</script>
+
+            <param>source_table=${mw_denormalized_history_table}</param>
+            <param>destination_table=${mw_metrics_table}</param>
+            <param>start_timestamp=${start_timestamp}</param>
+            <param>end_timestamp=${end_timestamp}</param>
+            <param>wiki_db=${wiki_db}</param>
+            <param>snapshot=${snapshot}</param>
+        </hive>
+        <ok to="compute_daily_edits_by_anonymous_users"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="compute_daily_edits_by_anonymous_users">
+        <hive xmlns="uri:oozie:hive-action:0.3">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <!--make sure oozie:launcher runs in a low priority queue -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+            </configuration>
+
+            <script>daily_edits_by_anonymous_users.hql</script>
+
+            <param>source_table=${mw_denormalized_history_table}</param>
+            <param>destination_table=${mw_metrics_table}</param>
+            <param>start_timestamp=${start_timestamp}</param>
+            <param>end_timestamp=${end_timestamp}</param>
+            <param>wiki_db=${wiki_db}</param>
+            <param>snapshot=${snapshot}</param>
+        </hive>
+        <ok to="compute_daily_edits_by_bot_users"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="compute_daily_edits_by_bot_users">
+        <hive xmlns="uri:oozie:hive-action:0.3">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <!--make sure oozie:launcher runs in a low priority queue -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+            </configuration>
+
+            <script>daily_edits_by_bot_users.hql</script>
+
+            <param>source_table=${mw_denormalized_history_table}</param>
+            <param>destination_table=${mw_metrics_table}</param>
+            <param>start_timestamp=${start_timestamp}</param>
+            <param>end_timestamp=${end_timestamp}</param>
+            <param>wiki_db=${wiki_db}</param>
+            <param>snapshot=${snapshot}</param>
+        </hive>
+        <ok to="compute_daily_edits_by_registered_users"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="compute_daily_edits_by_registered_users">
+        <hive xmlns="uri:oozie:hive-action:0.3">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <!--make sure oozie:launcher runs in a low priority queue -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+            </configuration>
+
+            <script>daily_edits_by_registered_users.hql</script>
+
+            <param>source_table=${mw_denormalized_history_table}</param>
+            <param>destination_table=${mw_metrics_table}</param>
+            <param>start_timestamp=${start_timestamp}</param>
+            <param>end_timestamp=${end_timestamp}</param>
+            <param>wiki_db=${wiki_db}</param>
+            <param>snapshot=${snapshot}</param>
+        </hive>
+        <ok to="compute_daily_unique_anonymous_editors"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="compute_daily_unique_anonymous_editors">
+        <hive xmlns="uri:oozie:hive-action:0.3">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <!--make sure oozie:launcher runs in a low priority queue -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+            </configuration>
+
+            <script>daily_unique_anonymous_editors.hql</script>
+
+            <param>source_table=${mw_denormalized_history_table}</param>
+            <param>destination_table=${mw_metrics_table}</param>
+            <param>start_timestamp=${start_timestamp}</param>
+            <param>end_timestamp=${end_timestamp}</param>
+            <param>wiki_db=${wiki_db}</param>
+            <param>snapshot=${snapshot}</param>
+        </hive>
+        <ok to="compute_daily_unique_bot_editors"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="compute_daily_unique_bot_editors">
+        <hive xmlns="uri:oozie:hive-action:0.3">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <!--make sure oozie:launcher runs in a low priority queue -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+            </configuration>
+
+            <script>daily_unique_bot_editors.hql</script>
+
+            <param>source_table=${mw_denormalized_history_table}</param>
+            <param>destination_table=${mw_metrics_table}</param>
+            <param>start_timestamp=${start_timestamp}</param>
+            <param>end_timestamp=${end_timestamp}</param>
+            <param>wiki_db=${wiki_db}</param>
+            <param>snapshot=${snapshot}</param>
+        </hive>
+        <ok to="compute_daily_unique_editors"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="compute_daily_unique_editors">
+        <hive xmlns="uri:oozie:hive-action:0.3">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <!--make sure oozie:launcher runs in a low priority queue -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+            </configuration>
+
+            <script>daily_unique_editors.hql</script>
+
+            <param>source_table=${mw_denormalized_history_table}</param>
+            <param>destination_table=${mw_metrics_table}</param>
+            <param>start_timestamp=${start_timestamp}</param>
+            <param>end_timestamp=${end_timestamp}</param>
+            <param>wiki_db=${wiki_db}</param>
+            <param>snapshot=${snapshot}</param>
+        </hive>
+        <ok to="compute_daily_unique_page_creators"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="compute_daily_unique_page_creators">
+        <hive xmlns="uri:oozie:hive-action:0.3">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <!--make sure oozie:launcher runs in a low priority queue -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+            </configuration>
+
+            <script>daily_unique_page_creators.hql</script>
+
+            <param>source_table=${mw_denormalized_history_table}</param>
+            <param>destination_table=${mw_metrics_table}</param>
+            <param>start_timestamp=${start_timestamp}</param>
+            <param>end_timestamp=${end_timestamp}</param>
+            <param>wiki_db=${wiki_db}</param>
+            <param>snapshot=${snapshot}</param>
+        </hive>
+        <ok to="compute_daily_unique_registered_editors"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="compute_daily_unique_registered_editors">
+        <hive xmlns="uri:oozie:hive-action:0.3">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <!-- Keep oozie:launcher in a low priority queue with an explicit map memory size; the Hive job's queue is set separately below. -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+            </configuration>
+
+            <script>daily_unique_registered_editors.hql</script>
+
+            <param>source_table=${mw_denormalized_history_table}</param>
+            <param>destination_table=${mw_metrics_table}</param>
+            <param>start_timestamp=${start_timestamp}</param>
+            <param>end_timestamp=${end_timestamp}</param>
+            <param>wiki_db=${wiki_db}</param>
+            <param>snapshot=${snapshot}</param>
+        </hive>
+        <ok to="compute_monthly_new_editors"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="compute_monthly_new_editors">
+        <hive xmlns="uri:oozie:hive-action:0.3">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <!-- Keep oozie:launcher in a low priority queue with an explicit map memory size; the Hive job's queue is set separately below. -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+            </configuration>
+
+            <script>monthly_new_editors.hql</script>
+
+            <param>source_table=${mw_denormalized_history_table}</param>
+            <param>destination_table=${mw_metrics_table}</param>
+            <param>start_timestamp=${start_timestamp}</param>
+            <param>end_timestamp=${end_timestamp}</param>
+            <param>wiki_db=${wiki_db}</param>
+            <param>snapshot=${snapshot}</param>
+        </hive>
+        <ok to="compute_monthly_new_registered_users"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="compute_monthly_new_registered_users">
+        <hive xmlns="uri:oozie:hive-action:0.3">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <!-- Keep oozie:launcher in a low priority queue with an explicit map memory size; the Hive job's queue is set separately below. -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+            </configuration>
+
+            <script>monthly_new_registered_users.hql</script>
+
+            <param>source_table=${mw_denormalized_history_table}</param>
+            <param>destination_table=${mw_metrics_table}</param>
+            <param>start_timestamp=${start_timestamp}</param>
+            <param>end_timestamp=${end_timestamp}</param>
+            <param>wiki_db=${wiki_db}</param>
+            <param>snapshot=${snapshot}</param>
+        </hive>
+        <ok to="compute_monthly_surviving_new_editors"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="compute_monthly_surviving_new_editors">
+        <hive xmlns="uri:oozie:hive-action:0.3">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <!-- Keep oozie:launcher in a low priority queue with an explicit map memory size; the Hive job's queue is set separately below. -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+            </configuration>
+
+            <script>monthly_surviving_new_editors.hql</script>
+
+            <param>source_table=${mw_denormalized_history_table}</param>
+            <param>destination_table=${mw_metrics_table}</param>
+            <param>start_timestamp=${start_timestamp}</param>
+            <param>end_timestamp=${end_timestamp}</param>
+            <param>wiki_db=${wiki_db}</param>
+            <param>snapshot=${snapshot}</param>
+        </hive>
+        <ok to="mark_metrics_dataset_done"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="mark_metrics_dataset_done"><!-- Last step before end: calls the mark-directory-done sub-workflow with directory set to metrics_location. -->
+        <sub-workflow>
+            <app-path>${mark_directory_done_workflow_file}</app-path>
+            <configuration>
+                <property>
+                    <name>directory</name>
+                    <value>${metrics_location}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="end"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="send_error_email"><!-- Passes this workflow's name and the failed node, error code and error message to the error-email sub-workflow; always transitions to kill. -->
+        <sub-workflow>
+            <app-path>${send_error_email_workflow_file}</app-path>
+            <propagate-configuration/>
+            <configuration>
+                <property>
+                    <name>parent_name</name>
+                    <value>${wf:name()}</value>
+                </property>
+                <property>
+                    <name>parent_failed_action</name>
+                    <value>${wf:lastErrorNode()}</value>
+                </property>
+                <property>
+                    <name>parent_error_code</name>
+                    <value>${wf:errorCode(wf:lastErrorNode())}</value>
+                </property>
+                <property>
+                    <name>parent_error_message</name>
+                    <value>${wf:errorMessage(wf:lastErrorNode())}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="kill"/>
+        <error to="kill"/>
+    </action>
+
+    <kill name="kill"><!-- Terminal failure node: ends the workflow with the message of the last failed node. -->
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    <end name="end"/>
+</workflow-app>

-- 
To view, visit https://gerrit.wikimedia.org/r/342197
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ic5fddf5cc99b0c428f6685d0ef495ee2fe0c449b
Gerrit-PatchSet: 11
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Joal <[email protected]>
Gerrit-Reviewer: Joal <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to