Ottomata has submitted this change and it was merged.

Change subject: Stop tying Oozie's “data_directory” to webrequests table
......................................................................


Stop tying Oozie's “data_directory” to webrequests table

Oozie's “data_directory” setting was pointing to the directory of a
webrequest_source partition. Both this naming and the coupling to a
single webrequest_source partition would have caused problems for the
upcoming addition of webstatscollector.

The naming was in the way, because when including the webrequest's
dataset definitions in webstatscollector, they would have required the
global “data_directory” variable to point to the /webrequests/ data
directory.

Including the webrequest_source partition in the “data_directory”
setting was nice when only dealing with a single source per
coordinator. But for webstatscollector, we need to pull in data from
both text and mobile sources in one go. This would not easily be
possible with the webrequest_source baked into “data_directory”.

Hence, we construct the datasets' locations from a more general
data_directory, materialize the dataset definitions for each of the
webrequest sources, and use parametrized names to refer to them.

Change-Id: I4f3b00089728627a8c36de4a9b184ef0bc0691c6
---
M oozie/webrequest/datasets.xml
M oozie/webrequest/partition/add/bundle.properties
M oozie/webrequest/partition/add/bundle.xml
M oozie/webrequest/partition/add/coordinator.xml
M oozie/webrequest/partition/monitor_done_flag/bundle.properties
M oozie/webrequest/partition/monitor_done_flag/bundle.xml
M oozie/webrequest/partition/monitor_done_flag/coordinator.xml
7 files changed, 61 insertions(+), 20 deletions(-)

Approvals:
  Ottomata: Verified; Looks good to me, approved



diff --git a/oozie/webrequest/datasets.xml b/oozie/webrequest/datasets.xml
index efa26d9..d6e6507 100644
--- a/oozie/webrequest/datasets.xml
+++ b/oozie/webrequest/datasets.xml
@@ -5,38 +5,79 @@
 
     ${start_time}     - the initial instance of your data.
                         Example: 2014-04-01T00:00Z
-    ${data_directory} - Path to directory where data is time bucketed.
-                        Example: /wmf/data/raw/webrequest/webrequest_mobile/hourly
+    ${webrequest_data_directory} - Path to directory where data is time bucketed.
+                        Example: /wmf/data/raw/webrequest
 -->
 
 <datasets>
     <!--
-    The webrequest_unchecked dataset should be used for cases
-    where you do not care if the sequence stats have been
-    checked.  This will simply include any imported hourly
-    data directories that exist.
+    The webrequest_*_unchecked datasets should be used for cases where you do
+    not care if the sequence stats have been checked.  This will simply include
+    any imported hourly data directories that exist.
     -->
-    <dataset name="webrequest_unchecked"
+    <dataset name="webrequest_bits_unchecked"
              frequency="${coord:hours(1)}"
              initial-instance="${start_time}"
              timezone="Universal">
-        <uri-template>${data_directory}/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
+        <uri-template>${webrequest_data_directory}/webrequest_bits/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
+        <done-flag></done-flag>
+    </dataset>
+    <dataset name="webrequest_mobile_unchecked"
+             frequency="${coord:hours(1)}"
+             initial-instance="${start_time}"
+             timezone="Universal">
+        <uri-template>${webrequest_data_directory}/webrequest_mobile/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
+        <done-flag></done-flag>
+    </dataset>
+    <dataset name="webrequest_text_unchecked"
+             frequency="${coord:hours(1)}"
+             initial-instance="${start_time}"
+             timezone="Universal">
+        <uri-template>${webrequest_data_directory}/webrequest_text/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
+        <done-flag></done-flag>
+    </dataset>
+    <dataset name="webrequest_upload_unchecked"
+             frequency="${coord:hours(1)}"
+             initial-instance="${start_time}"
+             timezone="Universal">
+        <uri-template>${webrequest_data_directory}/webrequest_upload/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
         <done-flag></done-flag>
     </dataset>
 
     <!--
-    The webrequest dataset should be used if you want to be
+    The webrequest_* datasets should be used if you want to be
     sure that you are only working with hourly imports for which
     sequence stats have been checked.  These directories have an
     empty _SUCCESS flag created in them once they have been checked
     and it has been determined that the expected number of requests
     equals the actual number of entires for this hour.
     -->
-    <dataset name="webrequest"
+    <dataset name="webrequest_bits"
              frequency="${coord:hours(1)}"
              initial-instance="${start_time}"
              timezone="Universal">
-        <uri-template>${data_directory}/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
+        <uri-template>${webrequest_data_directory}/webrequest_bits/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
+        <done-flag>_SUCCESS</done-flag>
+    </dataset>
+    <dataset name="webrequest_mobile"
+             frequency="${coord:hours(1)}"
+             initial-instance="${start_time}"
+             timezone="Universal">
+        <uri-template>${webrequest_data_directory}/webrequest_mobile/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
+        <done-flag>_SUCCESS</done-flag>
+    </dataset>
+    <dataset name="webrequest_text"
+             frequency="${coord:hours(1)}"
+             initial-instance="${start_time}"
+             timezone="Universal">
+        <uri-template>${webrequest_data_directory}/webrequest_text/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
+        <done-flag>_SUCCESS</done-flag>
+    </dataset>
+    <dataset name="webrequest_upload"
+             frequency="${coord:hours(1)}"
+             initial-instance="${start_time}"
+             timezone="Universal">
+        <uri-template>${webrequest_data_directory}/webrequest_upload/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
         <done-flag>_SUCCESS</done-flag>
     </dataset>
 </datasets>
diff --git a/oozie/webrequest/partition/add/bundle.properties b/oozie/webrequest/partition/add/bundle.properties
index f0bda7d..7a83eca 100644
--- a/oozie/webrequest/partition/add/bundle.properties
+++ b/oozie/webrequest/partition/add/bundle.properties
@@ -46,7 +46,7 @@
 faulty_hosts_directory            = ${name_node}/wmf/data/raw/webrequests_faulty_hosts
 
 # HDFS path to directory where webrequest data is time bucketed.
-data_directory                    = ${name_node}/wmf/data/raw/webrequest/webrequest_${webrequest_source}/hourly
+webrequest_data_directory         = ${name_node}/wmf/data/raw/webrequest
 
 # Coordintator to start.
 oozie.bundle.application.path     = ${oozie_directory}/webrequest/partition/add/bundle.xml
diff --git a/oozie/webrequest/partition/add/bundle.xml b/oozie/webrequest/partition/add/bundle.xml
index eadd388..9789319 100644
--- a/oozie/webrequest/partition/add/bundle.xml
+++ b/oozie/webrequest/partition/add/bundle.xml
@@ -15,7 +15,7 @@
         <property><name>workflow_file</name></property>
         <property><name>start_time</name></property>
         <property><name>stop_time</name></property>
-        <property><name>data_directory</name></property>
+        <property><name>webrequest_data_directory</name></property>
 
         <property><name>hive_site_xml</name></property>
         <property><name>add_partition_workflow_file</name></property>
diff --git a/oozie/webrequest/partition/add/coordinator.xml b/oozie/webrequest/partition/add/coordinator.xml
index da41071..14c9055 100644
--- a/oozie/webrequest/partition/add/coordinator.xml
+++ b/oozie/webrequest/partition/add/coordinator.xml
@@ -18,7 +18,7 @@
         <property><name>workflow_file</name></property>
         <property><name>start_time</name></property>
         <property><name>stop_time</name></property>
-        <property><name>data_directory</name></property>
+        <property><name>webrequest_data_directory</name></property>
 
         <property><name>hive_site_xml</name></property>
         <property><name>add_partition_workflow_file</name></property>
@@ -78,7 +78,7 @@
     </datasets>
 
     <input-events>
-        <data-in name="input" dataset="webrequest_unchecked">
+        <data-in name="input" dataset="webrequest_${webrequest_source}_unchecked">
             <instance>${coord:current(0)}</instance>
         </data-in>
         <!--
@@ -88,7 +88,7 @@
         might get created /before/ writing for the current dataset
         finishes).
          -->
-        <data-in name="ready_indicator" dataset="webrequest_unchecked">
+        <data-in name="ready_indicator" dataset="webrequest_${webrequest_source}_unchecked">
             <instance>${coord:current(2)}</instance>
         </data-in>
     </input-events>
diff --git a/oozie/webrequest/partition/monitor_done_flag/bundle.properties b/oozie/webrequest/partition/monitor_done_flag/bundle.properties
index abd46dd..2b512ff 100644
--- a/oozie/webrequest/partition/monitor_done_flag/bundle.properties
+++ b/oozie/webrequest/partition/monitor_done_flag/bundle.properties
@@ -32,7 +32,7 @@
 stop_time                         = 3000-01-01T00:00Z
 
 # HDFS path to directory where webrequest data is time bucketed.
-data_directory                    = ${name_node}/wmf/data/raw/webrequest/webrequest_${webrequest_source}/hourly
+webrequest_data_directory         = ${name_node}/wmf/data/raw/webrequest
 
 # Coordintator to start.
 oozie.bundle.application.path     = ${oozie_directory}/webrequest/partition/monitor_done_flag/bundle.xml
diff --git a/oozie/webrequest/partition/monitor_done_flag/bundle.xml b/oozie/webrequest/partition/monitor_done_flag/bundle.xml
index 4e83ee2..578fbd6 100644
--- a/oozie/webrequest/partition/monitor_done_flag/bundle.xml
+++ b/oozie/webrequest/partition/monitor_done_flag/bundle.xml
@@ -15,7 +15,7 @@
         <property><name>workflow_file</name></property>
         <property><name>start_time</name></property>
         <property><name>stop_time</name></property>
-        <property><name>data_directory</name></property>
+        <property><name>webrequest_data_directory</name></property>
     </parameters>
 
     <coordinator name='monitor_done_flag-webrequest-bits'>
diff --git a/oozie/webrequest/partition/monitor_done_flag/coordinator.xml b/oozie/webrequest/partition/monitor_done_flag/coordinator.xml
index 60fadf9..1848bb2 100644
--- a/oozie/webrequest/partition/monitor_done_flag/coordinator.xml
+++ b/oozie/webrequest/partition/monitor_done_flag/coordinator.xml
@@ -18,7 +18,7 @@
         <property><name>workflow_file</name></property>
         <property><name>start_time</name></property>
         <property><name>stop_time</name></property>
-        <property><name>data_directory</name></property>
+        <property><name>webrequest_data_directory</name></property>
 
         <property><name>webrequest_source</name></property>
     </parameters>
@@ -48,7 +48,7 @@
     </datasets>
 
     <input-events>
-        <data-in name="input" dataset="webrequest">
+        <data-in name="input" dataset="webrequest_${webrequest_source}">
             <instance>${coord:current(0)}</instance>
         </data-in>
     </input-events>

-- 
To view, visit https://gerrit.wikimedia.org/r/161904
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I4f3b00089728627a8c36de4a9b184ef0bc0691c6
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <christ...@quelltextlich.at>
Gerrit-Reviewer: Ottomata <o...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to