Ottomata has submitted this change and it was merged.

Change subject: Add _SUCCESS done-flag in refined webrequest data directories 
after successful refinement
......................................................................


Add _SUCCESS done-flag in refined webrequest data directories after successful 
refinement

Change-Id: I88058d136ebe10a7330558543ad49f8fcf13a108
---
M oozie/webrequest/datasets.xml
M oozie/webrequest/refine/bundle.properties
M oozie/webrequest/refine/bundle.xml
M oozie/webrequest/refine/coordinator.xml
M oozie/webrequest/refine/workflow.xml
5 files changed, 61 insertions(+), 8 deletions(-)

Approvals:
  Ottomata: Verified; Looks good to me, approved



diff --git a/oozie/webrequest/datasets.xml b/oozie/webrequest/datasets.xml
index d72e95f..60ea069 100644
--- a/oozie/webrequest/datasets.xml
+++ b/oozie/webrequest/datasets.xml
@@ -35,15 +35,15 @@
              initial-instance="${start_time}"
              timezone="Universal">
         
<uri-template>${webrequest_data_directory}/webrequest_source=mobile/year=${YEAR}/month=${"$"}{MONTH
 + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template>
-        <done-flag></done-flag>
+        <done-flag>_SUCCESS</done-flag>
     </dataset>
+
     <dataset name="webrequest_text"
              frequency="${coord:hours(1)}"
              initial-instance="${start_time}"
              timezone="Universal">
         
<uri-template>${webrequest_data_directory}/webrequest_source=text/year=${YEAR}/month=${"$"}{MONTH
 + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template>
-        <done-flag></done-flag>
-
+        <done-flag>_SUCCESS</done-flag>
     </dataset>
 
 
diff --git a/oozie/webrequest/refine/bundle.properties 
b/oozie/webrequest/refine/bundle.properties
index 9a965da..9379ba4 100644
--- a/oozie/webrequest/refine/bundle.properties
+++ b/oozie/webrequest/refine/bundle.properties
@@ -36,12 +36,16 @@
 
 # HDFS path to webrequest dataset definitions
 datasets_raw_file                 = 
${oozie_directory}/webrequest/datasets_raw.xml
+datasets_file                     = ${oozie_directory}/webrequest/datasets.xml
 
 # Initial import time of the webrequest dataset.
 start_time                        = 2015-01-01T00:00Z
 
 # Time to stop running this coordinator.  Year 3000 == never!
 stop_time                         = 3000-01-01T00:00Z
+
+# HDFS path to workflow to mark a directory as done
+mark_directory_done_workflow_file = 
${oozie_directory}/util/mark_directory_done/workflow.xml
 
 # HDFS path to hive-site.xml file.  This is needed to run hive actions.
 hive_site_xml                     = ${oozie_directory}/util/hive/hive-site.xml
@@ -52,6 +56,7 @@
 
 # HDFS path to directory where webrequest data is time bucketed.
 webrequest_raw_data_directory     = ${name_node}/wmf/data/raw/webrequest
+webrequest_data_directory         = ${name_node}/wmf/data/wmf/webrequest
 
 # Coordinator to start.
 oozie.bundle.application.path     = 
${oozie_directory}/webrequest/refine/bundle.xml
diff --git a/oozie/webrequest/refine/bundle.xml 
b/oozie/webrequest/refine/bundle.xml
index ec91dee..5c636d7 100644
--- a/oozie/webrequest/refine/bundle.xml
+++ b/oozie/webrequest/refine/bundle.xml
@@ -13,10 +13,15 @@
         <property><name>name_node</name></property>
         <property><name>job_tracker</name></property>
         <property><name>workflow_file</name></property>
+        <property><name>mark_directory_done_workflow_file</name></property>
+
         <property><name>start_time</name></property>
         <property><name>stop_time</name></property>
+
         <property><name>webrequest_raw_data_directory</name></property>
         <property><name>datasets_raw_file</name></property>
+        <property><name>webrequest_data_directory</name></property>
+        <property><name>datasets_file</name></property>
 
         <property><name>hive_site_xml</name></property>
         <property><name>artifacts_directory</name></property>
diff --git a/oozie/webrequest/refine/coordinator.xml 
b/oozie/webrequest/refine/coordinator.xml
index e747846..1161b21 100644
--- a/oozie/webrequest/refine/coordinator.xml
+++ b/oozie/webrequest/refine/coordinator.xml
@@ -16,10 +16,15 @@
         <property><name>name_node</name></property>
         <property><name>job_tracker</name></property>
         <property><name>workflow_file</name></property>
+        <property><name>mark_directory_done_workflow_file</name></property>
+
         <property><name>start_time</name></property>
         <property><name>stop_time</name></property>
+
         <property><name>webrequest_raw_data_directory</name></property>
         <property><name>datasets_raw_file</name></property>
+        <property><name>webrequest_data_directory</name></property>
+        <property><name>datasets_file</name></property>
 
         <property><name>hive_site_xml</name></property>
         <property><name>artifacts_directory</name></property>
@@ -58,17 +63,26 @@
 
     <datasets>
         <!--
-        Include the given datasets_file file.  This should
-        define the "webrequest" dataset for this coordinator.
+        Include both raw and refined datasets files.
+        $datasets_raw_file will be used as the input events,
+        and $datasets_file will be used to determine output
+        location in which to add a done-flag.
         -->
         <include>${datasets_raw_file}</include>
+        <include>${datasets_file}</include>
     </datasets>
 
     <input-events>
-        <data-in name="input" dataset="webrequest_${webrequest_source}_raw">
+        <data-in name="raw_input" 
dataset="webrequest_${webrequest_source}_raw">
             <instance>${coord:current(0)}</instance>
         </data-in>
     </input-events>
+
+    <output-events>
+        <data-out name="refined_output" 
dataset="webrequest_${webrequest_source}">
+            <instance>${coord:current(0)}</instance>
+        </data-out>
+    </output-events>
 
     <action>
         <workflow>
@@ -116,7 +130,14 @@
                     <name>hour</name>
                     <value>${coord:formatTime(coord:nominalTime(), 
"H")}</value>
                 </property>
-
+                <property>
+                    <name>mark_directory_done_workflow_file</name>
+                    <value>${mark_directory_done_workflow_file}</value>
+                </property>
+                <property>
+                    <name>destination_dataset_directory</name>
+                    <value>${coord:dataOut('refined_output')}</value>
+                </property>
             </configuration>
         </workflow>
     </action>
diff --git a/oozie/webrequest/refine/workflow.xml 
b/oozie/webrequest/refine/workflow.xml
index 599ca73..2390a1c 100644
--- a/oozie/webrequest/refine/workflow.xml
+++ b/oozie/webrequest/refine/workflow.xml
@@ -55,6 +55,14 @@
             <name>hour</name>
             <description>The partition's hour</description>
         </property>
+        <property>
+            <name>mark_directory_done_workflow_file</name>
+            <description>Workflow for marking a directory done</description>
+        </property>
+        <property>
+            <name>destination_dataset_directory</name>
+            <description>Directory to generate the done flag in</description>
+        </property>
     </parameters>
 
     <start to="refine"/>
@@ -86,12 +94,26 @@
             <param>hour=${hour}</param>
         </hive>
 
+        <ok to="mark_dataset_done"/>
+        <error to="kill"/>
+    </action>
+
+    <action name="mark_dataset_done">
+        <sub-workflow>
+            <app-path>${mark_directory_done_workflow_file}</app-path>
+            <configuration>
+                <property>
+                    <name>directory</name>
+                    <value>${destination_dataset_directory}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
         <ok to="end"/>
         <error to="kill"/>
     </action>
 
     <kill name="kill">
-        <message>Webrequest refine action failed, error 
message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+        <message>Action failed, error 
message[${wf:errorMessage(wf:lastErrorNode())}]</message>
     </kill>
     <end name="end"/>
 </workflow-app>

-- 
To view, visit https://gerrit.wikimedia.org/r/184804
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I88058d136ebe10a7330558543ad49f8fcf13a108
Gerrit-PatchSet: 3
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Ottomata <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
Gerrit-Reviewer: QChris <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to