Ottomata has submitted this change and it was merged.
Change subject: Add _SUCCESS done-flag in refined webrequest data directories after successful refinement
......................................................................
Add _SUCCESS done-flag in refined webrequest data directories after successful
refinement
Change-Id: I88058d136ebe10a7330558543ad49f8fcf13a108
---
M oozie/webrequest/datasets.xml
M oozie/webrequest/refine/bundle.properties
M oozie/webrequest/refine/bundle.xml
M oozie/webrequest/refine/coordinator.xml
M oozie/webrequest/refine/workflow.xml
5 files changed, 61 insertions(+), 8 deletions(-)
Approvals:
Ottomata: Verified; Looks good to me, approved
diff --git a/oozie/webrequest/datasets.xml b/oozie/webrequest/datasets.xml
index d72e95f..60ea069 100644
--- a/oozie/webrequest/datasets.xml
+++ b/oozie/webrequest/datasets.xml
@@ -35,15 +35,15 @@
initial-instance="${start_time}"
timezone="Universal">
<uri-template>${webrequest_data_directory}/webrequest_source=mobile/year=${YEAR}/month=${"$"}{MONTH + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template>
- <done-flag></done-flag>
+ <done-flag>_SUCCESS</done-flag>
</dataset>
+
<dataset name="webrequest_text"
frequency="${coord:hours(1)}"
initial-instance="${start_time}"
timezone="Universal">
<uri-template>${webrequest_data_directory}/webrequest_source=text/year=${YEAR}/month=${"$"}{MONTH + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template>
- <done-flag></done-flag>
-
+ <done-flag>_SUCCESS</done-flag>
</dataset>
diff --git a/oozie/webrequest/refine/bundle.properties b/oozie/webrequest/refine/bundle.properties
index 9a965da..9379ba4 100644
--- a/oozie/webrequest/refine/bundle.properties
+++ b/oozie/webrequest/refine/bundle.properties
@@ -36,12 +36,16 @@
# HDFS path to webrequest dataset definitions
datasets_raw_file = ${oozie_directory}/webrequest/datasets_raw.xml
+datasets_file = ${oozie_directory}/webrequest/datasets.xml
# Initial import time of the webrequest dataset.
start_time = 2015-01-01T00:00Z
# Time to stop running this coordinator. Year 3000 == never!
stop_time = 3000-01-01T00:00Z
+
+# HDFS path to workflow to mark a directory as done
+mark_directory_done_workflow_file = ${oozie_directory}/util/mark_directory_done/workflow.xml
# HDFS path to hive-site.xml file. This is needed to run hive actions.
hive_site_xml = ${oozie_directory}/util/hive/hive-site.xml
@@ -52,6 +56,7 @@
# HDFS path to directory where webrequest data is time bucketed.
webrequest_raw_data_directory = ${name_node}/wmf/data/raw/webrequest
+webrequest_data_directory = ${name_node}/wmf/data/wmf/webrequest
# Coordintator to start.
oozie.bundle.application.path = ${oozie_directory}/webrequest/refine/bundle.xml
diff --git a/oozie/webrequest/refine/bundle.xml b/oozie/webrequest/refine/bundle.xml
index ec91dee..5c636d7 100644
--- a/oozie/webrequest/refine/bundle.xml
+++ b/oozie/webrequest/refine/bundle.xml
@@ -13,10 +13,15 @@
<property><name>name_node</name></property>
<property><name>job_tracker</name></property>
<property><name>workflow_file</name></property>
+ <property><name>mark_directory_done_workflow_file</name></property>
+
<property><name>start_time</name></property>
<property><name>stop_time</name></property>
+
<property><name>webrequest_raw_data_directory</name></property>
<property><name>datasets_raw_file</name></property>
+ <property><name>webrequest_data_directory</name></property>
+ <property><name>datasets_file</name></property>
<property><name>hive_site_xml</name></property>
<property><name>artifacts_directory</name></property>
diff --git a/oozie/webrequest/refine/coordinator.xml b/oozie/webrequest/refine/coordinator.xml
index e747846..1161b21 100644
--- a/oozie/webrequest/refine/coordinator.xml
+++ b/oozie/webrequest/refine/coordinator.xml
@@ -16,10 +16,15 @@
<property><name>name_node</name></property>
<property><name>job_tracker</name></property>
<property><name>workflow_file</name></property>
+ <property><name>mark_directory_done_workflow_file</name></property>
+
<property><name>start_time</name></property>
<property><name>stop_time</name></property>
+
<property><name>webrequest_raw_data_directory</name></property>
<property><name>datasets_raw_file</name></property>
+ <property><name>webrequest_data_directory</name></property>
+ <property><name>datasets_file</name></property>
<property><name>hive_site_xml</name></property>
<property><name>artifacts_directory</name></property>
@@ -58,17 +63,26 @@
<datasets>
<!--
- Include the given datasets_file file. This should
- define the "webrequest" dataset for this coordinator.
+ Include both raw and refined datasets files.
+ $datasets_raw_file will be used as the input events,
+ and $datasets_file will be used to determine output
+ location in which to add a done-flag.
-->
<include>${datasets_raw_file}</include>
+ <include>${datasets_file}</include>
</datasets>
<input-events>
- <data-in name="input" dataset="webrequest_${webrequest_source}_raw">
+ <data-in name="raw_input" dataset="webrequest_${webrequest_source}_raw">
<instance>${coord:current(0)}</instance>
</data-in>
</input-events>
+
+ <output-events>
+ <data-out name="refined_output" dataset="webrequest_${webrequest_source}">
+ <instance>${coord:current(0)}</instance>
+ </data-out>
+ </output-events>
<action>
<workflow>
@@ -116,7 +130,14 @@
<name>hour</name>
<value>${coord:formatTime(coord:nominalTime(), "H")}</value>
</property>
-
+ <property>
+ <name>mark_directory_done_workflow_file</name>
+ <value>${mark_directory_done_workflow_file}</value>
+ </property>
+ <property>
+ <name>destination_dataset_directory</name>
+ <value>${coord:dataOut('refined_output')}</value>
+ </property>
</configuration>
</workflow>
</action>
diff --git a/oozie/webrequest/refine/workflow.xml b/oozie/webrequest/refine/workflow.xml
index 599ca73..2390a1c 100644
--- a/oozie/webrequest/refine/workflow.xml
+++ b/oozie/webrequest/refine/workflow.xml
@@ -55,6 +55,14 @@
<name>hour</name>
<description>The partition's hour</description>
</property>
+ <property>
+ <name>mark_directory_done_workflow_file</name>
+ <description>Workflow for marking a directory done</description>
+ </property>
+ <property>
+ <name>destination_dataset_directory</name>
+ <description>Directory to generate the done flag in</description>
+ </property>
</parameters>
<start to="refine"/>
@@ -86,12 +94,26 @@
<param>hour=${hour}</param>
</hive>
+ <ok to="mark_dataset_done"/>
+ <error to="kill"/>
+ </action>
+
+ <action name="mark_dataset_done">
+ <sub-workflow>
+ <app-path>${mark_directory_done_workflow_file}</app-path>
+ <configuration>
+ <property>
+ <name>directory</name>
+ <value>${destination_dataset_directory}</value>
+ </property>
+ </configuration>
+ </sub-workflow>
<ok to="end"/>
<error to="kill"/>
</action>
<kill name="kill">
- <message>Webrequest refine action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+ <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<end name="end"/>
</workflow-app>
--
To view, visit https://gerrit.wikimedia.org/r/184804
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I88058d136ebe10a7330558543ad49f8fcf13a108
Gerrit-PatchSet: 3
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Ottomata <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
Gerrit-Reviewer: QChris <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits