Joal has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/342197 )
Change subject: [WIP] Add oozie job for standard metrics computation
......................................................................
[WIP] Add oozie job for standard metrics computation
Bug: T160151
Change-Id: Ic5fddf5cc99b0c428f6685d0ef495ee2fe0c449b
---
R hive/mediawiki/history/create_mediawiki_metric_result.hql
M oozie/mediawiki/history/datasets.xml
M oozie/mediawiki/history/denormalize/bundle.properties
M oozie/mediawiki/history/denormalize/coordinator.xml
M oozie/mediawiki/history/denormalize/workflow.xml
A oozie/mediawiki/history/metrics/bundle.properties
A oozie/mediawiki/history/metrics/bundle.xml
A oozie/mediawiki/history/metrics/coordinator.xml
R oozie/mediawiki/history/metrics/daily_edits.hql
R oozie/mediawiki/history/metrics/daily_edits_by_anonymous_users.hql
R oozie/mediawiki/history/metrics/daily_edits_by_bot_users.hql
R oozie/mediawiki/history/metrics/daily_edits_by_registered_users.hql
R oozie/mediawiki/history/metrics/daily_unique_anonymous_editors.hql
R oozie/mediawiki/history/metrics/daily_unique_bot_editors.hql
R oozie/mediawiki/history/metrics/daily_unique_editors.hql
R oozie/mediawiki/history/metrics/daily_unique_page_creators.hql
R oozie/mediawiki/history/metrics/daily_unique_registered_editors.hql
R oozie/mediawiki/history/metrics/monthly_new_editors.hql
R oozie/mediawiki/history/metrics/monthly_new_registered_users.hql
R oozie/mediawiki/history/metrics/monthly_surviving_new_editors.hql
R oozie/mediawiki/history/metrics/productive_new_editors.hql
A oozie/mediawiki/history/metrics/workflow.xml
22 files changed, 466 insertions(+), 12 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery
refs/changes/97/342197/1
diff --git a/hive/mediawiki/metrics/create_mediawiki_metric_result.hql
b/hive/mediawiki/history/create_mediawiki_metric_result.hql
similarity index 90%
rename from hive/mediawiki/metrics/create_mediawiki_metric_result.hql
rename to hive/mediawiki/history/create_mediawiki_metric_result.hql
index 6ee4865..8816541 100644
--- a/hive/mediawiki/metrics/create_mediawiki_metric_result.hql
+++ b/hive/mediawiki/history/create_mediawiki_metric_result.hql
@@ -9,7 +9,8 @@
PARTITIONED BY (
`infra` string COMMENT 'Infrastructure from which data is retrieved (usually
labs or prod)',
`version` string COMMENT 'Versioning information to keep multiple datasets
(usually YYYY-MM)',
- `metric` string COMMENT 'The metric being computed to measure')
+ `metric` string COMMENT 'The metric being computed to measure',
+ `wiki_db` string COMMENT 'The wiki this measurement pertains to')
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
diff --git a/oozie/mediawiki/history/datasets.xml
b/oozie/mediawiki/history/datasets.xml
index 1463381..7bba001 100644
--- a/oozie/mediawiki/history/datasets.xml
+++ b/oozie/mediawiki/history/datasets.xml
@@ -88,7 +88,7 @@
<done-flag>_SUCCESS</done-flag>
</dataset>
- <dataset name="mw_history_denormalized"
+ <dataset name="mw_denormalized_history"
frequency="${coord:months(1)}"
initial-instance="${start_time}"
timezone="Universal">
@@ -97,7 +97,6 @@
</dataset>
<!-- History metrics datasets -->
-
<dataset name="mw_metrics"
frequency="${coord:months(1)}"
initial-instance="${start_time}"
diff --git a/oozie/mediawiki/history/denormalize/bundle.properties
b/oozie/mediawiki/history/denormalize/bundle.properties
index f20ccf9..edaf983 100644
--- a/oozie/mediawiki/history/denormalize/bundle.properties
+++ b/oozie/mediawiki/history/denormalize/bundle.properties
@@ -76,7 +76,7 @@
partitions_number = 1024
tmp_path =
${name_node}/tmp/mediawiki/history/checkpoints
-# Workflow to add a partition
+# Workflow to repair partitions
repair_partitions_workflow_file =
${oozie_directory}/util/hive/partition/repair/workflow.xml
# Workflow to mark a directory as done
diff --git a/oozie/mediawiki/history/denormalize/coordinator.xml
b/oozie/mediawiki/history/denormalize/coordinator.xml
index 6a8f132..bbff8dd 100644
--- a/oozie/mediawiki/history/denormalize/coordinator.xml
+++ b/oozie/mediawiki/history/denormalize/coordinator.xml
@@ -99,7 +99,7 @@
<data-out name="mw_page_history" dataset="mw_page_history">
<instance>${coord:current(0)}</instance>
</data-out>
- <data-out name="mw_history_denormalized"
dataset="mw_history_denormalized">
+ <data-out name="mw_denormalized_history"
dataset="mw_denormalized_history">
<instance>${coord:current(0)}</instance>
</data-out>
</output-events>
@@ -109,7 +109,7 @@
<app-path>${workflow_file}</app-path>
<configuration>
<property>
- <name>version</name>
+ <name>mw_version_partition</name>
<value>${coord:formatTime(coord:nominalTime(),
"yyyy")}-${coord:formatTime(coord:nominalTime(), "MM")}</value>
</property>
</configuration>
diff --git a/oozie/mediawiki/history/denormalize/workflow.xml
b/oozie/mediawiki/history/denormalize/workflow.xml
index 3348d2a..1e86335 100644
--- a/oozie/mediawiki/history/denormalize/workflow.xml
+++ b/oozie/mediawiki/history/denormalize/workflow.xml
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.4"
- name="mediawiki-history-denormalize-wf-${mw_infra_partition}-${version}">
+
name="mediawiki-history-denormalize-wf-${mw_infra_partition}-${mw_version_partition}">
<parameters>
@@ -54,7 +54,7 @@
<description>The infra data to process (prod or labs)</description>
</property>
<property>
- <name>version</name>
+ <name>mw_version_partition</name>
<description>The version for the job</description>
</property>
@@ -217,7 +217,7 @@
</configuration>
<master>${spark_master}</master>
<mode>${spark_deploy}</mode>
- <name>${spark_job_name}-${mw_infra_partition}-${version}</name>
+
<name>${spark_job_name}-${mw_infra_partition}-${mw_version_partition}</name>
<class>${spark_job_class}</class>
<jar>${spark_job_jar}</jar>
<spark-opts>--conf spark.yarn.jar=${spark_assembly_jar}
--executor-memory ${spark_executor_memory} --driver-memory
${spark_driver_memory} --queue ${queue_name}</spark-opts>
@@ -228,7 +228,7 @@
<arg>--infra</arg>
<arg>${mw_infra_partition}</arg>
<arg>--version</arg>
- <arg>${version}</arg>
+ <arg>${mw_version_partition}</arg>
<arg>--temporary-path</arg>
<arg>${tmp_path}</arg>
<arg>--num-partitions</arg>
diff --git a/oozie/mediawiki/history/metrics/bundle.properties
b/oozie/mediawiki/history/metrics/bundle.properties
new file mode 100644
index 0000000..21d0a87
--- /dev/null
+++ b/oozie/mediawiki/history/metrics/bundle.properties
@@ -0,0 +1,68 @@
+# Configures a bundle to automatically compute prod and labs standard metrics
+# from mediawiki history
+# Usage:
+# oozie job -Duser=$USER -Dstart_time=2015-08-01T00:00Z -submit -config
oozie/mediawiki/history/metrics/bundle.properties
+#
+# NOTE: The $oozie_directory must be synced to HDFS so that all relevant
+# .xml files exist there when this job is submitted.
+
+name_node = hdfs://analytics-hadoop
+job_tracker = resourcemanager.analytics.eqiad.wmnet:8032
+queue_name = default
+
+#Default user
+user = hdfs
+
+# Base path in HDFS to refinery.
+# When submitting this job for production, you should override this to point
directly at a deployed
+# directory name, and not the 'symbolic' 'current' directory. E.g.
/wmf/refinery/2015-01-05T17.59.18Z--7bb7f07
+refinery_directory = ${name_node}/wmf/refinery/current
+
+# HDFS path to artifacts that will be used by this job.
+# E.g. refinery-job.jar should exist here.
+artifacts_directory = ${refinery_directory}/artifacts
+
+# Base path in HDFS to oozie files.
+# Other files will be used relative to this path.
+oozie_directory = ${refinery_directory}/oozie
+
+# HDFS path to bundle to run.
+bundle_file =
${oozie_directory}/mediawiki/history/metrics/bundle.xml
+
+# HDFS path to coordinator to run for each infra.
+coordinator_file =
${oozie_directory}/mediawiki/history/metrics/coordinator.xml
+
+# HDFS path to workflow to run.
+workflow_file =
${oozie_directory}/mediawiki/history/metrics/workflow.xml
+
+# HDFS path to hive-site.xml file. This is needed to run hive actions.
+hive_site_xml = ${name_node}/user/hive/hive-site.xml
+
+# HDFS path to mediawiki history datasets definitions
+datasets_file =
${oozie_directory}/mediawiki/history/datasets.xml
+mw_directory = ${name_node}/wmf/data/wmf/mediawiki
+
+# Hive tables for the denormalized history and the metric results
+mw_denormalized_history_table = wmf.mediawiki_history
+mw_metrics_table = wmf.mediawiki_metric_result
+
+
+# First instance of the mediawiki history datasets, and start time of the coordinators.
+start_time = 2017-03-01T00:00Z
+
+# Time to stop running this coordinator. Year 3000 == never!
+stop_time = 3000-01-01T00:00Z
+
+# Workflow to mark a directory as done
+mark_directory_done_workflow_file =
${oozie_directory}/util/mark_directory_done/workflow.xml
+
+# Workflow to send an error email
+send_error_email_workflow_file =
${oozie_directory}/util/send_error_email/workflow.xml
+
+# email address to contact in case of SLA errors
+sla_alert_contact = [email protected]
+
+# Bundle to start.
+oozie.bundle.application.path = ${bundle_file}
+oozie.use.system.libpath = true
+oozie.action.external.stats.write = true
\ No newline at end of file
diff --git a/oozie/mediawiki/history/metrics/bundle.xml
b/oozie/mediawiki/history/metrics/bundle.xml
new file mode 100644
index 0000000..71e51e1
--- /dev/null
+++ b/oozie/mediawiki/history/metrics/bundle.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<bundle-app xmlns="uri:oozie:bundle:0.2"
+ name="mediawiki-history-metrics-bundle">
+
+ <parameters>
+ <!-- Required properties -->
+ <property><name>queue_name</name></property>
+ <property><name>coordinator_file</name></property>
+ <property><name>name_node</name></property>
+ <property><name>job_tracker</name></property>
+ <property><name>workflow_file</name></property>
+ <property><name>start_time</name></property>
+ <property><name>stop_time</name></property>
+
+ <property><name>datasets_file</name></property>
+ <property><name>mw_directory</name></property>
+
+ <property><name>mw_denormalized_history_table</name></property>
+ <property><name>mw_metrics_table</name></property>
+
+ <property><name>hive_site_xml</name></property>
+ <property><name>mark_directory_done_workflow_file</name></property>
+ <property><name>send_error_email_workflow_file</name></property>
+
+ <property><name>sla_alert_contact</name></property>
+ </parameters>
+
+ <coordinator name="mediawiki-history-metrics-coord-prod"> <!-- runs ${coordinator_file} for the prod infra partition -->
+ <app-path>${coordinator_file}</app-path>
+ <configuration>
+ <property>
+ <name>mw_infra_partition</name> <!-- the only setting that differs between the two coordinators of this bundle -->
+ <value>prod</value>
+ </property>
+ </configuration>
+ </coordinator>
+
+ <coordinator name="mediawiki-history-metrics-coord-labs"> <!-- runs ${coordinator_file} for the labs infra partition -->
+ <app-path>${coordinator_file}</app-path>
+ <configuration>
+ <property>
+ <name>mw_infra_partition</name> <!-- the only setting that differs between the two coordinators of this bundle -->
+ <value>labs</value>
+ </property>
+ </configuration>
+ </coordinator>
+
+</bundle-app>
diff --git a/oozie/mediawiki/history/metrics/coordinator.xml
b/oozie/mediawiki/history/metrics/coordinator.xml
new file mode 100644
index 0000000..b0900dd
--- /dev/null
+++ b/oozie/mediawiki/history/metrics/coordinator.xml
@@ -0,0 +1,89 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<coordinator-app xmlns="uri:oozie:coordinator:0.4"
+ xmlns:sla="uri:oozie:sla:0.2"
+ name="mediawiki-history-metrics-coord-${mw_infra_partition}"
+ frequency="${coord:months(1)}"
+ start="${start_time}"
+ end="${stop_time}"
+ timezone="Universal">
+
+ <parameters>
+ <!-- Required properties -->
+ <property><name>queue_name</name></property>
+ <property><name>name_node</name></property>
+ <property><name>job_tracker</name></property>
+ <property><name>workflow_file</name></property>
+ <property><name>start_time</name></property>
+ <property><name>stop_time</name></property>
+
+ <property><name>datasets_file</name></property>
+ <property><name>mw_directory</name></property>
+
+ <property><name>mw_denormalized_history_table</name></property>
+ <property><name>mw_metrics_table</name></property>
+
+ <property><name>mw_infra_partition</name></property>
+
+ <property><name>hive_site_xml</name></property>
+ <property><name>mark_directory_done_workflow_file</name></property>
+ <property><name>send_error_email_workflow_file</name></property>
+
+ <property><name>sla_alert_contact</name></property>
+ </parameters>
+
+ <controls>
+ <!-- Timeout is measured in minutes; -1 means a materialized action waits for its input without ever timing out -->
+ <timeout>-1</timeout>
+
+ <!-- Setting low concurrency for resource sharing: one action runs at a time.
+ The job is said to run fast (~1 minute), so raising concurrency should be harmless. NOTE(review): confirm that runtime for the metrics job -->
+ <concurrency>1</concurrency>
+
+ <throttle>2</throttle> <!-- at most two actions may be waiting on their input at the same time -->
+
+ </controls>
+
+ <datasets>
+ <!--
+ Include the mediawiki history dataset definitions.
+ $datasets_file declares the datasets referenced by the input and output events.
+ -->
+ <include>${datasets_file}</include>
+ </datasets>
+
+ <input-events> <!-- wait for the month's denormalized history; name must match datasets.xml (mw_denormalized_history) -->
+ <data-in name="mw_denormalized_history" dataset="mw_denormalized_history">
+ <instance>${coord:current(0)}</instance>
+ </data-in>
+ </input-events>
+
+ <output-events> <!-- output-events takes data-out children only; this job produces the mw_metrics dataset -->
+ <data-out name="mw_metrics" dataset="mw_metrics">
+ <instance>${coord:current(0)}</instance>
+ </data-out>
+ </output-events>
+
+ <action>
+ <workflow>
+ <app-path>${workflow_file}</app-path>
+ <configuration>
+ <property>
+ <name>mw_version_partition</name> <!-- year and month (yyyy-MM) of the action's nominal time -->
+ <value>${coord:formatTime(coord:nominalTime(),
"yyyy")}-${coord:formatTime(coord:nominalTime(), "MM")}</value>
+ </property>
+ </configuration>
+ </workflow>
+
+ <sla:info>
+ <!--
+ Use action actual time as SLA base, since it's the time used
+ to compute timeout
+ -->
+ <sla:nominal-time>${coord:actualTime()}</sla:nominal-time>
+ <sla:should-end>${4 * DAYS}</sla:should-end> <!-- expected to finish within 4 days of the SLA base time -->
+ <sla:alert-events>end_miss</sla:alert-events> <!-- alert only when that end deadline is missed -->
+ <sla:alert-contact>${sla_alert_contact}</sla:alert-contact>
+ </sla:info>
+
+ </action>
+</coordinator-app>
\ No newline at end of file
diff --git a/oozie/mediawiki/history/metric/daily_edits.hql
b/oozie/mediawiki/history/metrics/daily_edits.hql
similarity index 83%
rename from oozie/mediawiki/history/metric/daily_edits.hql
rename to oozie/mediawiki/history/metrics/daily_edits.hql
index df630f4..45aca3f 100644
--- a/oozie/mediawiki/history/metric/daily_edits.hql
+++ b/oozie/mediawiki/history/metrics/daily_edits.hql
@@ -3,7 +3,9 @@
-- start_timestamp -- YYYYMMDDHHmmss formatted start (inclusive)
-- end_timestamp -- YYYYMMDDHHmmss formatted end (exclusive)
-- wiki_db -- Optionally filter by source wiki database.
--- Pass 'all' to compute for all wikis
+-- Pass 'all' to compute for all wikis
+-- infra -- infra partition to use
+-- version -- version partition to use
--
-- Usage:
-- hive -f daily_edits.hql \
@@ -21,7 +23,8 @@
set hive.exec.max.dynamic.partitions.pernode=2000;
-- dynamic partitions must be specified here
- insert overwrite table ${destination_table} partition (metric, wiki_db)
+ insert overwrite table ${destination_table}
+ partition (infra='${infra}', version='${version}', metric, wiki_db)
-- dynamic partitions must be selected in order and at the end
select concat_ws('-',
substring(event_timestamp, 0, 4),
@@ -38,6 +41,8 @@
and ('${wiki_db}' = 'all' or wiki_db = '${wiki_db}')
and event_timestamp >= '${start_timestamp}'
and event_timestamp < '${end_timestamp}'
+ and infra = '${infra}'
+ and version = '${version}'
group by wiki_db,
substring(event_timestamp, 0, 4),
diff --git a/oozie/mediawiki/history/metric/daily_edits_by_anonymous_users.hql
b/oozie/mediawiki/history/metrics/daily_edits_by_anonymous_users.hql
similarity index 100%
rename from oozie/mediawiki/history/metric/daily_edits_by_anonymous_users.hql
rename to oozie/mediawiki/history/metrics/daily_edits_by_anonymous_users.hql
diff --git a/oozie/mediawiki/history/metric/daily_edits_by_bot_users.hql
b/oozie/mediawiki/history/metrics/daily_edits_by_bot_users.hql
similarity index 100%
rename from oozie/mediawiki/history/metric/daily_edits_by_bot_users.hql
rename to oozie/mediawiki/history/metrics/daily_edits_by_bot_users.hql
diff --git a/oozie/mediawiki/history/metric/daily_edits_by_registered_users.hql
b/oozie/mediawiki/history/metrics/daily_edits_by_registered_users.hql
similarity index 100%
rename from oozie/mediawiki/history/metric/daily_edits_by_registered_users.hql
rename to oozie/mediawiki/history/metrics/daily_edits_by_registered_users.hql
diff --git a/oozie/mediawiki/history/metric/daily_unique_anonymous_editors.hql
b/oozie/mediawiki/history/metrics/daily_unique_anonymous_editors.hql
similarity index 100%
rename from oozie/mediawiki/history/metric/daily_unique_anonymous_editors.hql
rename to oozie/mediawiki/history/metrics/daily_unique_anonymous_editors.hql
diff --git a/oozie/mediawiki/history/metric/daily_unique_bot_editors.hql
b/oozie/mediawiki/history/metrics/daily_unique_bot_editors.hql
similarity index 100%
rename from oozie/mediawiki/history/metric/daily_unique_bot_editors.hql
rename to oozie/mediawiki/history/metrics/daily_unique_bot_editors.hql
diff --git a/oozie/mediawiki/history/metric/daily_unique_editors.hql
b/oozie/mediawiki/history/metrics/daily_unique_editors.hql
similarity index 100%
rename from oozie/mediawiki/history/metric/daily_unique_editors.hql
rename to oozie/mediawiki/history/metrics/daily_unique_editors.hql
diff --git a/oozie/mediawiki/history/metric/unique_page_creators.hql
b/oozie/mediawiki/history/metrics/daily_unique_page_creators.hql
similarity index 100%
rename from oozie/mediawiki/history/metric/unique_page_creators.hql
rename to oozie/mediawiki/history/metrics/daily_unique_page_creators.hql
diff --git a/oozie/mediawiki/history/metric/daily_unique_registered_editors.hql
b/oozie/mediawiki/history/metrics/daily_unique_registered_editors.hql
similarity index 100%
rename from oozie/mediawiki/history/metric/daily_unique_registered_editors.hql
rename to oozie/mediawiki/history/metrics/daily_unique_registered_editors.hql
diff --git a/oozie/mediawiki/history/metric/monthly_new_editors.hql
b/oozie/mediawiki/history/metrics/monthly_new_editors.hql
similarity index 100%
rename from oozie/mediawiki/history/metric/monthly_new_editors.hql
rename to oozie/mediawiki/history/metrics/monthly_new_editors.hql
diff --git a/oozie/mediawiki/history/metric/monthly_new_registered_users.hql
b/oozie/mediawiki/history/metrics/monthly_new_registered_users.hql
similarity index 100%
rename from oozie/mediawiki/history/metric/monthly_new_registered_users.hql
rename to oozie/mediawiki/history/metrics/monthly_new_registered_users.hql
diff --git a/oozie/mediawiki/history/metric/monthly_surviving_new_editors.hql
b/oozie/mediawiki/history/metrics/monthly_surviving_new_editors.hql
similarity index 100%
rename from oozie/mediawiki/history/metric/monthly_surviving_new_editors.hql
rename to oozie/mediawiki/history/metrics/monthly_surviving_new_editors.hql
diff --git a/oozie/mediawiki/history/metric/productive_new_editors.hql
b/oozie/mediawiki/history/metrics/productive_new_editors.hql
similarity index 100%
rename from oozie/mediawiki/history/metric/productive_new_editors.hql
rename to oozie/mediawiki/history/metrics/productive_new_editors.hql
diff --git a/oozie/mediawiki/history/metrics/workflow.xml
b/oozie/mediawiki/history/metrics/workflow.xml
new file mode 100644
index 0000000..acd0730
--- /dev/null
+++ b/oozie/mediawiki/history/metrics/workflow.xml
@@ -0,0 +1,244 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<workflow-app xmlns="uri:oozie:workflow:0.4"
+ name="mediawiki-history-metrics-wf-${mw_infra_partition}-${mw_version_partition}">
+
+ <parameters>
+
+ <!-- Default values for inner oozie settings -->
+ <property>
+ <name>oozie_launcher_queue_name</name>
+ <value>${queue_name}</value>
+ </property>
+ <property>
+ <name>oozie_launcher_memory</name>
+ <value>2048</value>
+ </property>
+
+ <!-- Required properties -->
+ <property><name>queue_name</name></property>
+ <property><name>name_node</name></property>
+ <property><name>job_tracker</name></property>
+
+ <property>
+ <name>mw_directory</name>
+ <description>Path to mediawiki processed data on
Hadoop</description>
+ </property>
+
+ <property>
+ <name>mw_infra_partition</name>
+ <description>The infra data to process (prod or labs)</description>
+ </property>
+ <property>
+ <name>mw_version_partition</name>
+ <description>The version for the job</description>
+ </property>
+
+ <!-- Hive tables-->
+ <property>
+ <name>mw_denormalized_history_table</name>
+ <description>Recomputed denormalized history table</description>
+ </property>
+ <property>
+ <name>mw_metrics_table</name>
+ <description>Mediawiki metrics table</description>
+ </property>
+
+ <!-- Subworkflows -->
+ <property>
+ <name>mark_directory_done_workflow_file</name>
+ <description>Workflow for marking a directory done</description>
+ </property>
+ <property>
+ <name>send_error_email_workflow_file</name>
+ <description>Workflow for sending an email</description>
+ </property>
+ </parameters>
+
+ <start to="compute_daily_edits"/> <!-- NOTE(review): no action named compute_daily_edits is defined in this file yet; add it or retarget this transition -->
+
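+ <!-- Add MW tables partitions. NOTE(review): the actions from here to repair_denormalized_history_table_partitions appear to be carried over from the denormalize workflow; no transition from start reaches them, and they use properties (e.g. repair_partitions_workflow_file, mw_archive_table, spark_master) that the parameters section does not declare. -->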
+
+ <action name="repair_archive_table_partitions">
+ <sub-workflow>
+ <app-path>${repair_partitions_workflow_file}</app-path>
+ <propagate-configuration/>
+ <configuration>
+
<property><name>table</name><value>${mw_archive_table}</value></property>
+ </configuration>
+ </sub-workflow>
+ <ok to="repair_logging_table_partitions"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="repair_logging_table_partitions">
+ <sub-workflow>
+ <app-path>${repair_partitions_workflow_file}</app-path>
+ <propagate-configuration/>
+ <configuration>
+
<property><name>table</name><value>${mw_logging_table}</value></property>
+ </configuration>
+ </sub-workflow>
+ <ok to="repair_page_table_partitions"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="repair_page_table_partitions">
+ <sub-workflow>
+ <app-path>${repair_partitions_workflow_file}</app-path>
+ <propagate-configuration/>
+ <configuration>
+
<property><name>table</name><value>${mw_page_table}</value></property>
+ </configuration>
+ </sub-workflow>
+ <ok to="repair_revision_table_partitions"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="repair_revision_table_partitions">
+ <sub-workflow>
+ <app-path>${repair_partitions_workflow_file}</app-path>
+ <propagate-configuration/>
+ <configuration>
+
<property><name>table</name><value>${mw_revision_table}</value></property>
+ </configuration>
+ </sub-workflow>
+ <ok to="repair_user_table_partitions"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="repair_user_table_partitions">
+ <sub-workflow>
+ <app-path>${repair_partitions_workflow_file}</app-path>
+ <propagate-configuration/>
+ <configuration>
+
<property><name>table</name><value>${mw_user_table}</value></property>
+ </configuration>
+ </sub-workflow>
+ <ok to="repair_user_groups_table_partitions"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="repair_user_groups_table_partitions">
+ <sub-workflow>
+ <app-path>${repair_partitions_workflow_file}</app-path>
+ <propagate-configuration/>
+ <configuration>
+
<property><name>table</name><value>${mw_user_groups_table}</value></property>
+ </configuration>
+ </sub-workflow>
+ <ok to="denormalize_history"/>
+ <error to="send_error_email"/>
+ </action>
+
+
+ <!-- Compute user, page and denormalized history -->
+
+ <action name="denormalize_history">
+ <spark xmlns="uri:oozie:spark-action:0.1">
+
+ <job-tracker>${job_tracker}</job-tracker>
+ <name-node>${name_node}</name-node>
+ <configuration>
+ <!--make sure oozie:launcher runs in a low priority queue -->
+ <property>
+ <name>oozie.launcher.mapred.job.queue.name</name>
+ <value>${oozie_launcher_queue_name}</value>
+ </property>
+ <property>
+ <name>oozie.launcher.mapreduce.map.memory.mb</name>
+ <value>${oozie_launcher_memory}</value>
+ </property>
+ </configuration>
+ <master>${spark_master}</master>
+ <mode>${spark_deploy}</mode>
+
<name>${spark_job_name}-${mw_infra_partition}-${mw_version_partition}</name>
+ <class>${spark_job_class}</class>
+ <jar>${spark_job_jar}</jar>
+ <spark-opts>--conf spark.yarn.jar=${spark_assembly_jar}
--executor-memory ${spark_executor_memory} --driver-memory
${spark_driver_memory} --queue ${queue_name}</spark-opts>
+ <arg>--mediawiki-base-path</arg>
+ <arg>${mw_raw_directory}</arg>
+ <arg>--output-base-path</arg>
+ <arg>${mw_directory}</arg>
+ <arg>--infra</arg>
+ <arg>${mw_infra_partition}</arg>
+ <arg>--version</arg>
+ <arg>${mw_version_partition}</arg>
+ <arg>--temporary-path</arg>
+ <arg>${tmp_path}</arg>
+ <arg>--num-partitions</arg>
+ <arg>${partitions_number}</arg>
+ </spark>
+ <ok to="repair_user_history_table_partitions" />
+ <error to="send_error_email" />
+ </action>
+
+ <!-- Add MW History tables partitions -->
+
+ <action name="repair_user_history_table_partitions">
+ <sub-workflow>
+ <app-path>${repair_partitions_workflow_file}</app-path>
+ <propagate-configuration/>
+ <configuration>
+
<property><name>table</name><value>${mw_user_history_table}</value></property>
+ </configuration>
+ </sub-workflow>
+ <ok to="repair_page_history_table_partitions"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="repair_page_history_table_partitions">
+ <sub-workflow>
+ <app-path>${repair_partitions_workflow_file}</app-path>
+ <propagate-configuration/>
+ <configuration>
+
<property><name>table</name><value>${mw_page_history_table}</value></property>
+ </configuration>
+ </sub-workflow>
+ <ok to="repair_denormalized_history_table_partitions"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="repair_denormalized_history_table_partitions">
+ <sub-workflow>
+ <app-path>${repair_partitions_workflow_file}</app-path>
+ <propagate-configuration/>
+ <configuration>
+
<property><name>table</name><value>${mw_denormalized_history_table}</value></property>
+ </configuration>
+ </sub-workflow>
+ <ok to="end"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="send_error_email"> <!-- error target of the other actions: hands the failure details to the email sub-workflow -->
+ <sub-workflow>
+ <app-path>${send_error_email_workflow_file}</app-path>
+ <propagate-configuration/>
+ <configuration>
+ <property>
+ <name>parent_name</name> <!-- name of this workflow -->
+ <value>${wf:name()}</value>
+ </property>
+ <property>
+ <name>parent_failed_action</name> <!-- node that last ended in error -->
+ <value>${wf:lastErrorNode()}</value>
+ </property>
+ <property>
+ <name>parent_error_code</name>
+ <value>${wf:errorCode(wf:lastErrorNode())}</value>
+ </property>
+ <property>
+ <name>parent_error_message</name>
+ <value>${wf:errorMessage(wf:lastErrorNode())}</value>
+ </property>
+ </configuration>
+ </sub-workflow>
+ <ok to="kill"/> <!-- the workflow is killed whether or not the email sub-workflow succeeds -->
+ <error to="kill"/>
+ </action>
+
+ <kill name="kill">
+ <message>Action failed, error
message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+ </kill>
+ <end name="end"/>
+</workflow-app>
--
To view, visit https://gerrit.wikimedia.org/r/342197
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ic5fddf5cc99b0c428f6685d0ef495ee2fe0c449b
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Joal <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits