Ottomata has uploaded a new change for review.
https://gerrit.wikimedia.org/r/184804
Change subject: Add _SUCCESS done-flag in refined webrequest data directories
after successful refinement
......................................................................
Add _SUCCESS done-flag in refined webrequest data directories after successful
refinement
Change-Id: I88058d136ebe10a7330558543ad49f8fcf13a108
---
M oozie/webrequest/datasets.xml
M oozie/webrequest/refine/bundle.properties
M oozie/webrequest/refine/bundle.xml
M oozie/webrequest/refine/coordinator.xml
M oozie/webrequest/refine/workflow.xml
5 files changed, 54 insertions(+), 8 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery refs/changes/04/184804/1
diff --git a/oozie/webrequest/datasets.xml b/oozie/webrequest/datasets.xml
index d9c8ad7..09bd80a 100644
--- a/oozie/webrequest/datasets.xml
+++ b/oozie/webrequest/datasets.xml
@@ -34,15 +34,15 @@
initial-instance="${start_time}"
timezone="Universal">
<uri-template>${webrequest_data_directory}/webrequest_source=mobile/year=${YEAR}/month=${"$"}{MONTH + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template>
- <done-flag></done-flag>
+ <done-flag>_SUCCESS</done-flag>
</dataset>
+
<dataset name="webrequest_text"
frequency="${coord:hours(1)}"
initial-instance="${start_time}"
timezone="Universal">
<uri-template>${webrequest_data_directory}/webrequest_source=text/year=${YEAR}/month=${"$"}{MONTH + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template>
- <done-flag></done-flag>
-
+ <done-flag>_SUCCESS</done-flag>
</dataset>
diff --git a/oozie/webrequest/refine/bundle.properties b/oozie/webrequest/refine/bundle.properties
index 9a965da..ba9dd09 100644
--- a/oozie/webrequest/refine/bundle.properties
+++ b/oozie/webrequest/refine/bundle.properties
@@ -36,6 +36,7 @@
# HDFS path to webrequest dataset definitions
datasets_raw_file = ${oozie_directory}/webrequest/datasets_raw.xml
+datasets_file = ${oozie_directory}/webrequest/datasets.xml
# Initial import time of the webrequest dataset.
start_time = 2015-01-01T00:00Z
@@ -52,6 +53,7 @@
# HDFS path to directory where webrequest data is time bucketed.
webrequest_raw_data_directory = ${name_node}/wmf/data/raw/webrequest
+webrequest_data_directory = ${name_node}/wmf/data/wmf/webrequest
# Coordintator to start.
oozie.bundle.application.path = ${oozie_directory}/webrequest/refine/bundle.xml
diff --git a/oozie/webrequest/refine/bundle.xml b/oozie/webrequest/refine/bundle.xml
index 72048a5..09888d5 100644
--- a/oozie/webrequest/refine/bundle.xml
+++ b/oozie/webrequest/refine/bundle.xml
@@ -13,10 +13,15 @@
<property><name>name_node</name></property>
<property><name>job_tracker</name></property>
<property><name>workflow_file</name></property>
+ <property><name>mark_directory_done_workflow_file</name></property>
+
<property><name>start_time</name></property>
<property><name>stop_time</name></property>
+
<property><name>webrequest_raw_data_directory</name></property>
<property><name>datasets_raw_file</name></property>
+ <property><name>webrequest_data_directory</name></property>
+ <property><name>datasets_file</name></property>
<property><name>hive_site_xml</name></property>
<property><name>artifacts_directory</name></property>
diff --git a/oozie/webrequest/refine/coordinator.xml b/oozie/webrequest/refine/coordinator.xml
index e747846..b052c5e 100644
--- a/oozie/webrequest/refine/coordinator.xml
+++ b/oozie/webrequest/refine/coordinator.xml
@@ -16,10 +16,15 @@
<property><name>name_node</name></property>
<property><name>job_tracker</name></property>
<property><name>workflow_file</name></property>
+ <property><name>mark_directory_done_workflow_file</name></property>
+
<property><name>start_time</name></property>
<property><name>stop_time</name></property>
+
<property><name>webrequest_raw_data_directory</name></property>
<property><name>datasets_raw_file</name></property>
+ <property><name>webrequest_data_directory</name></property>
+ <property><name>datasets_file</name></property>
<property><name>hive_site_xml</name></property>
<property><name>artifacts_directory</name></property>
@@ -58,17 +63,26 @@
<datasets>
<!--
- Include the given datasets_file file. This should
- define the "webrequest" dataset for this coordinator.
+ Include both raw and refined datasets files.
+ $datasets_raw_file will be used as the input events,
+ and $datasets_file will be used to determine output
+ location in which to add a done-flag.
-->
<include>${datasets_raw_file}</include>
+ <include>${datasets_file}</include>
</datasets>
<input-events>
- <data-in name="input" dataset="webrequest_${webrequest_source}_raw">
+ <data-in name="raw_input" dataset="webrequest_${webrequest_source}_raw">
<instance>${coord:current(0)}</instance>
</data-in>
</input-events>
+
+ <output-events>
+ <data-out name="refined_output" dataset="webrequest_${webrequest_source}">
+ <instance>${coord:current(0)}</instance>
+ </data-out>
+ </output-events>
<action>
<workflow>
@@ -116,7 +130,10 @@
<name>hour</name>
<value>${coord:formatTime(coord:nominalTime(), "H")}</value>
</property>
-
+ <property>
+ <name>destination_dataset_directory</name>
+ <value>${coord:dataOut('refined_output')}</value>
+ </property>
</configuration>
</workflow>
</action>
diff --git a/oozie/webrequest/refine/workflow.xml b/oozie/webrequest/refine/workflow.xml
index 599ca73..2390a1c 100644
--- a/oozie/webrequest/refine/workflow.xml
+++ b/oozie/webrequest/refine/workflow.xml
@@ -55,6 +55,14 @@
<name>hour</name>
<description>The partition's hour</description>
</property>
+ <property>
+ <name>mark_directory_done_workflow_file</name>
+ <description>Workflow for marking a directory done</description>
+ </property>
+ <property>
+ <name>destination_dataset_directory</name>
+ <description>Directory to generate the done flag in</description>
+ </property>
</parameters>
<start to="refine"/>
@@ -86,12 +94,26 @@
<param>hour=${hour}</param>
</hive>
+ <ok to="mark_dataset_done"/>
+ <error to="kill"/>
+ </action>
+
+ <action name="mark_dataset_done">
+ <sub-workflow>
+ <app-path>${mark_directory_done_workflow_file}</app-path>
+ <configuration>
+ <property>
+ <name>directory</name>
+ <value>${destination_dataset_directory}</value>
+ </property>
+ </configuration>
+ </sub-workflow>
<ok to="end"/>
<error to="kill"/>
</action>
<kill name="kill">
- <message>Webrequest refine action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+ <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<end name="end"/>
</workflow-app>
--
To view, visit https://gerrit.wikimedia.org/r/184804
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I88058d136ebe10a7330558543ad49f8fcf13a108
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits