Hello Ottomata,

I'd like you to do a code review.  Please visit

    https://gerrit.wikimedia.org/r/182807

to review the following change.

Change subject: Rearrange directory layout underneath /oozie
......................................................................

Rearrange directory layout underneath /oozie

On 2014-10-03 in #wikimedia-analytics it was decided to move the
directories underneath /oozie according to:

* webrequest/partition/add               -> webrequest/load
* webstats/insert_hourly_pagecounts      -> pagecounts-all-sites/load
* webstats/generate_hourly_files         -> pagecounts-all-sites/archive

We do not move files in hdfs yet, or rename Hive tables.

Change-Id: I83bbfb2dd74181595a54ea5126412fe718c21282
---
M diagrams/oozie-overview.dia
R oozie/pagecounts-all-sites/archive/archive_pagecounts.hql
R oozie/pagecounts-all-sites/archive/archive_projectcounts.hql
A oozie/pagecounts-all-sites/archive/bundle.properties
R oozie/pagecounts-all-sites/archive/bundle.xml
R oozie/pagecounts-all-sites/archive/coordinator.xml
R oozie/pagecounts-all-sites/archive/workflow.xml
R oozie/pagecounts-all-sites/datasets.xml
A oozie/pagecounts-all-sites/load/coordinator.properties
R oozie/pagecounts-all-sites/load/coordinator.xml
R oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
R oozie/pagecounts-all-sites/load/workflow.xml
R oozie/webrequest/load/README.md
R oozie/webrequest/load/bundle.properties
R oozie/webrequest/load/bundle.xml
R oozie/webrequest/load/check_sequence_statistics_workflow.xml
R oozie/webrequest/load/coordinator.xml
R oozie/webrequest/load/extract_faulty_hosts.hql
R oozie/webrequest/load/generate_sequence_statistics.hql
R oozie/webrequest/load/workflow.xml
D oozie/webstats/generate_hourly_files/bundle.properties
D oozie/webstats/insert_hourly_pagecounts/coordinator.properties
22 files changed, 176 insertions(+), 174 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery 
refs/changes/07/182807/1

diff --git a/diagrams/oozie-overview.dia b/diagrams/oozie-overview.dia
index 2b8771b..952ebbb 100644
--- a/diagrams/oozie-overview.dia
+++ b/diagrams/oozie-overview.dia
Binary files differ
diff --git 
a/oozie/webstats/generate_hourly_files/generate_hourly_pagecounts_file.hql 
b/oozie/pagecounts-all-sites/archive/archive_pagecounts.hql
similarity index 85%
rename from 
oozie/webstats/generate_hourly_files/generate_hourly_pagecounts_file.hql
rename to oozie/pagecounts-all-sites/archive/archive_pagecounts.hql
index 6103d14..4448fad 100644
--- a/oozie/webstats/generate_hourly_files/generate_hourly_pagecounts_file.hql
+++ b/oozie/pagecounts-all-sites/archive/archive_pagecounts.hql
@@ -2,12 +2,13 @@
 SET 
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
 --^ To work around HIVE-3296, we have SETs before any comments
 
--- Generates an hourly webstats pagecounts file into HDFS
+-- Generates an hourly pagecounts-all-sites pagecounts file into HDFS
 --
 -- Parameters:
 --     destination_directory -- Directory in HDFS where to store the generated
 --                          data in.
---     source_table      -- table containing hourly aggregated webstats data
+--     source_table      -- table containing hourly aggregated
+--                          pagecounts-all-sites data
 --     year              -- year of the to-be-generated hour
 --     month             -- month of the to-be-generated hour
 --     day               -- day of the to-be-generated hour
@@ -15,7 +16,7 @@
 --
 --
 -- Usage:
---     hive -f generate_hourly_pagecounts_file.hql  \
+--     hive -f archive_pagecounts.hql  \
 --         -d destination_directory=/tmp/foo      \
 --         -d source_table=wmf.webstats           \
 --         -d year=2014                           \
diff --git 
a/oozie/webstats/generate_hourly_files/generate_hourly_projectcounts_file.hql 
b/oozie/pagecounts-all-sites/archive/archive_projectcounts.hql
similarity index 84%
rename from 
oozie/webstats/generate_hourly_files/generate_hourly_projectcounts_file.hql
rename to oozie/pagecounts-all-sites/archive/archive_projectcounts.hql
index ab94500..82cb00b 100644
--- 
a/oozie/webstats/generate_hourly_files/generate_hourly_projectcounts_file.hql
+++ b/oozie/pagecounts-all-sites/archive/archive_projectcounts.hql
@@ -1,12 +1,13 @@
 SET hive.exec.compress.output=false;
 --^ To work around HIVE-3296, we have SETs before any comments
 
--- Generates an hourly webstats projectcounts file into HDFS
+-- Generates an hourly pagecounts-all-sites projectcounts file into HDFS
 --
 -- Parameters:
 --     destination_directory -- Directory in HDFS where to store the generated
 --                          data in
---     source_table      -- table containing hourly aggregated webstats data
+--     source_table      -- table containing hourly aggregated
+--                          pagecounts-all-sites data
 --     year              -- year of the to-be-generated hour
 --     month             -- month of the to-be-generated hour
 --     day               -- day of the to-be-generated hour
@@ -14,7 +15,7 @@
 --
 --
 -- Usage:
---     hive -f generate_hourly_projectcounts_file.hql  \
+--     hive -f archive_projectcounts.hql  \
 --         -d destination_directory=/tmp/foo         \
 --         -d source_table=wmf.webstats              \
 --         -d year=2014                              \
diff --git a/oozie/pagecounts-all-sites/archive/bundle.properties 
b/oozie/pagecounts-all-sites/archive/bundle.properties
new file mode 100644
index 0000000..28d02ef
--- /dev/null
+++ b/oozie/pagecounts-all-sites/archive/bundle.properties
@@ -0,0 +1,61 @@
+# Configures a bundle to generate hourly pagecounts-all-sites files from
+# the webstats table.
+#
+# Usage:
+#     oozie job -run \
+#         -config oozie/pagecounts-all-sites/archive/bundle.properties
+#
+# NOTE:  The $oozie_directory must be synced to HDFS so that all relevant
+#        .xml files exist there when this job is submitted.
+
+
+name_node                              = hdfs://analytics-hadoop
+job_tracker                            = 
resourcemanager.analytics.eqiad.wmnet:8032
+queue_name                             = default
+
+# Base path in HDFS to oozie files.
+# Other files will be used relative to this path.
+oozie_directory                        = 
${name_node}/wmf/refinery/current/oozie
+
+# HDFS path to coordinator to run for each aspect (pagecounts, projectcounts).
+coordinator_file                       = 
${oozie_directory}/pagecounts-all-sites/archive/coordinator.xml
+
+# HDFS path to workflow to run.
+workflow_file                          = 
${oozie_directory}/pagecounts-all-sites/archive/workflow.xml
+
+# HDFS path to pagecounts-all-sites dataset definition
+pagecounts_all_sites_datasets_file     = 
${oozie_directory}/pagecounts-all-sites/datasets.xml
+
+# Initial import time of the pagecounts-all-sites dataset.
+start_time                             = 2014-04-01T00:00Z
+
+# Time to stop running this coordinator.  Year 3000 == never!
+stop_time                              = 3000-01-01T00:00Z
+
+# HDFS path to workflow to mark a directory as done
+mark_directory_done_workflow_file      = 
${oozie_directory}/util/mark_directory_done/workflow.xml
+
+archive_job_output_workflow_file       = 
${oozie_directory}/util/archive_job_output/workflow.xml
+
+# HDFS path to hive-site.xml file.  This is needed to run hive actions.
+hive_site_xml                          = 
${oozie_directory}/util/hive/hive-site.xml
+
+# Table to read hourly pagecounts from (fully qualified)
+pagecounts_all_sites_table             = wmf.webstats
+
+# HDFS path to directory where pagecounts-all-sites data is time bucketed.
+pagecounts_all_sites_data_directory    = ${name_node}/wmf/data/wmf/webstats
+
+# Temporary directory
+temporary_directory                    = ${name_node}/tmp
+
+# Archive base directory
+archive_directory                      = ${name_node}/wmf/data/archive
+
+# Archive directory for pagecounts-all-sites
+pagecounts_all_sites_archive_directory = ${archive_directory}/webstats
+
+# Bundle to start.
+oozie.bundle.application.path          = 
${oozie_directory}/pagecounts-all-sites/archive/bundle.xml
+oozie.use.system.libpath               = true
+oozie.action.external.stats.write      = true
diff --git a/oozie/webstats/generate_hourly_files/bundle.xml 
b/oozie/pagecounts-all-sites/archive/bundle.xml
similarity index 76%
rename from oozie/webstats/generate_hourly_files/bundle.xml
rename to oozie/pagecounts-all-sites/archive/bundle.xml
index eb6382e..901e820 100644
--- a/oozie/webstats/generate_hourly_files/bundle.xml
+++ b/oozie/pagecounts-all-sites/archive/bundle.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <bundle-app xmlns="uri:oozie:bundle:0.2"
-    name="webstats_generate_hourly_files-bundle">
+    name="pagecounts_all_sites_archive-bundle">
 
     <parameters>
         <property>
@@ -13,18 +13,18 @@
         <property><name>job_tracker</name></property>
         <property><name>start_time</name></property>
         <property><name>stop_time</name></property>
-        <property><name>webstats_datasets_file</name></property>
-        <property><name>webstats_data_directory</name></property>
+        <property><name>pagecounts_all_sites_datasets_file</name></property>
+        <property><name>pagecounts_all_sites_data_directory</name></property>
         <property><name>hive_site_xml</name></property>
         <property><name>workflow_file</name></property>
-        <property><name>webstats_table</name></property>
+        <property><name>pagecounts_all_sites_table</name></property>
         <property><name>mark_directory_done_workflow_file</name></property>
         <property><name>temporary_directory</name></property>
-        <property><name>webstats_archive_directory</name></property>
+        
<property><name>pagecounts_all_sites_archive_directory</name></property>
         <property><name>archive_job_output_workflow_file</name></property>
     </parameters>
 
-    <coordinator name="webstats_generate_hourly_files-pagecounts">
+    <coordinator name="pagecounts_all_sites_archive-pagecounts">
         <app-path>${coordinator_file}</app-path>
         <configuration>
             <property>
@@ -38,7 +38,7 @@
         </configuration>
     </coordinator>
 
-    <coordinator name="webstats_generate_hourly_files-projectcounts">
+    <coordinator name="pagecounts_all_sites_archive-projectcounts">
         <app-path>${coordinator_file}</app-path>
         <configuration>
             <property>
diff --git a/oozie/webstats/generate_hourly_files/coordinator.xml 
b/oozie/pagecounts-all-sites/archive/coordinator.xml
similarity index 86%
rename from oozie/webstats/generate_hourly_files/coordinator.xml
rename to oozie/pagecounts-all-sites/archive/coordinator.xml
index d2809db..88c0493 100644
--- a/oozie/webstats/generate_hourly_files/coordinator.xml
+++ b/oozie/pagecounts-all-sites/archive/coordinator.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <coordinator-app xmlns="uri:oozie:coordinator:0.4"
-    name="webstats_generate_hourly_files-${aspect_name}-coord"
+    name="pagecounts_all_sites_archive-${aspect_name}-coord"
     frequency="${coord:hours(1)}"
     start="${start_time}"
     end="${stop_time}"
@@ -17,14 +17,14 @@
         <property><name>job_tracker</name></property>
         <property><name>start_time</name></property>
         <property><name>stop_time</name></property>
-        <property><name>webstats_datasets_file</name></property>
-        <property><name>webstats_data_directory</name></property>
+        <property><name>pagecounts_all_sites_datasets_file</name></property>
+        <property><name>pagecounts_all_sites_data_directory</name></property>
         <property><name>hive_site_xml</name></property>
         <property><name>workflow_file</name></property>
-        <property><name>webstats_table</name></property>
+        <property><name>pagecounts_all_sites_table</name></property>
         <property><name>mark_directory_done_workflow_file</name></property>
         <property><name>temporary_directory</name></property>
-        <property><name>webstats_archive_directory</name></property>
+        
<property><name>pagecounts_all_sites_archive_directory</name></property>
         <property><name>archive_job_output_workflow_file</name></property>
         <property><name>aspect_name</name></property>
         <property><name>aspect_compression_ending</name></property>
@@ -55,11 +55,11 @@
     </controls>
 
     <datasets>
-        <include>${webstats_datasets_file}</include>
+        <include>${pagecounts_all_sites_datasets_file}</include>
     </datasets>
 
     <input-events>
-        <data-in name="webstats" dataset="webstats_pagecounts">
+        <data-in name="pagecounts_all_sites" 
dataset="pagecounts_all_sites_hourly">
             <instance>${coord:current(0)}</instance>
         </data-in>
     </input-events>
@@ -79,8 +79,8 @@
                     <value>${hive_site_xml}</value>
                 </property>
                 <property>
-                    <name>webstats_table</name>
-                    <value>${webstats_table}</value>
+                    <name>pagecounts_all_sites_table</name>
+                    <value>${pagecounts_all_sites_table}</value>
                 </property>
                 <property>
                     <name>year</name>
@@ -123,8 +123,8 @@
                     <value>${temporary_directory}</value>
                 </property>
                 <property>
-                    <name>webstats_archive_directory</name>
-                    <value>${webstats_archive_directory}</value>
+                    <name>pagecounts_all_sites_archive_directory</name>
+                    <value>${pagecounts_all_sites_archive_directory}</value>
                 </property>
                 <property>
                     <name>archive_job_output_workflow_file</name>
diff --git a/oozie/webstats/generate_hourly_files/workflow.xml 
b/oozie/pagecounts-all-sites/archive/workflow.xml
similarity index 86%
rename from oozie/webstats/generate_hourly_files/workflow.xml
rename to oozie/pagecounts-all-sites/archive/workflow.xml
index 5bff50f..c70f8f1 100644
--- a/oozie/webstats/generate_hourly_files/workflow.xml
+++ b/oozie/pagecounts-all-sites/archive/workflow.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <workflow-app xmlns="uri:oozie:workflow:0.4"
-    
name="webstats_generate_hourly_files-${aspect_name}-${year}-${month}-${day}-${hour}-wf">
+    
name="pagecounts_all_sites_archive-${aspect_name}-${year}-${month}-${day}-${hour}-wf">
 
     <parameters>
         <property>
@@ -17,9 +17,9 @@
             <description>hive-site.xml file path in HDFS</description>
         </property>
         <property>
-            <name>webstats_table</name>
+            <name>pagecounts_all_sites_table</name>
             <description>
-                Hive table to read webstats data from.
+                Hive table to read pagecounts-all-sites data from.
             </description>
         </property>
         <property>
@@ -63,8 +63,8 @@
             <description>A directory in HDFS for temporary files</description>
         </property>
         <property>
-            <name>webstats_archive_directory</name>
-            <description>Directory for archive of webstats files</description>
+            <name>pagecounts_all_sites_archive_directory</name>
+            <description>Directory for archive of pagecounts-all-sites 
files</description>
         </property>
         <property>
             <name>archive_job_output_workflow_file</name>
@@ -94,9 +94,9 @@
                 </property>
             </configuration>
 
-            <script>generate_hourly_${aspect_name}_file.hql</script>
+            <script>archive_${aspect_name}.hql</script>
 
-            <param>source_table=${webstats_table}</param>
+            <param>source_table=${pagecounts_all_sites_table}</param>
             
<param>destination_directory=${temporary_directory}/${wf:id()}-${aspect_name}</param>
             <param>year=${year}</param>
             <param>month=${month}</param>
@@ -140,7 +140,7 @@
                     timestamp in the filename. To not break scripts of people,
                     we also name files that way.
                     -->
-                    
<value>${webstats_archive_directory}/${year_plus_1_hour}/${year_plus_1_hour}-${month_plus_1_hour}/${aspect_name}-${year_plus_1_hour}${month_plus_1_hour}${day_plus_1_hour}-${hour_plus_1_hour}0000${aspect_compression_ending
 eq 'EMPTY' ? '' : aspect_compression_ending}</value>
+                    
<value>${pagecounts_all_sites_archive_directory}/${year_plus_1_hour}/${year_plus_1_hour}-${month_plus_1_hour}/${aspect_name}-${year_plus_1_hour}${month_plus_1_hour}${day_plus_1_hour}-${hour_plus_1_hour}0000${aspect_compression_ending
 eq 'EMPTY' ? '' : aspect_compression_ending}</value>
                 </property>
             </configuration>
         </sub-workflow>
diff --git a/oozie/webstats/datasets.xml 
b/oozie/pagecounts-all-sites/datasets.xml
similarity index 79%
rename from oozie/webstats/datasets.xml
rename to oozie/pagecounts-all-sites/datasets.xml
index 8ef1609..3dc9768 100644
--- a/oozie/webstats/datasets.xml
+++ b/oozie/pagecounts-all-sites/datasets.xml
@@ -1,17 +1,17 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!--
-Defines reusable datasets for webstats.
+Defines reusable datasets for pagecounts-all-sites.
 Use this dataset in your coordinator.xml files by setting:
 
     ${start_time}     - the initial instance of your data.
                         Example: 2014-04-01T00:00Z
-    ${webstats_data_directory}
+    ${pagecounts_all_sites_data_directory}
                       - Path to directory where data is time bucketed.
                         Example: /wmf/data/wmf/webstats
 -->
 
 <datasets>
-    <dataset name="webstats_pagecounts"
+    <dataset name="pagecounts_all_sites_hourly"
              frequency="${coord:hours(1)}"
              initial-instance="${start_time}"
              timezone="Universal">
@@ -27,6 +27,6 @@
         at “${...}” as input for the second EL level. There, the variables hold
         their expected values, and we can start unpadding them.
         -->
-        
<uri-template>${webstats_data_directory}/year=${YEAR}/month=${"$"}{MONTH + 
0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template>
+        
<uri-template>${pagecounts_all_sites_data_directory}/year=${YEAR}/month=${"$"}{MONTH
 + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template>
     </dataset>
 </datasets>
diff --git a/oozie/pagecounts-all-sites/load/coordinator.properties 
b/oozie/pagecounts-all-sites/load/coordinator.properties
new file mode 100644
index 0000000..e89c81e
--- /dev/null
+++ b/oozie/pagecounts-all-sites/load/coordinator.properties
@@ -0,0 +1,56 @@
+# Configures a coordinator to insert hourly pagecounts-all-sites data
+# from webrequests table into the webstats table.
+#
+# Usage:
+#     oozie job -run \
+#         -config oozie/pagecounts-all-sites/load/coordinator.properties
+#
+# NOTE:  The $oozie_directory must be synced to HDFS so that all relevant
+#        .xml files exist there when this job is submitted.
+
+
+name_node                           = hdfs://analytics-hadoop
+job_tracker                         = 
resourcemanager.analytics.eqiad.wmnet:8032
+queue_name                          = default
+
+# Base path in HDFS to oozie files.
+# Other files will be used relative to this path.
+oozie_directory                     = ${name_node}/wmf/refinery/current/oozie
+
+# HDFS path to workflow to run.
+workflow_file                       = 
${oozie_directory}/pagecounts-all-sites/load/workflow.xml
+
+# HDFS path to webrequest dataset definition
+webrequest_datasets_file            = 
${oozie_directory}/webrequest/datasets.xml
+
+# HDFS path to pagecounts-all-sites dataset definition
+pagecounts_all_sites_datasets_file  = 
${oozie_directory}/pagecounts-all-sites/datasets.xml
+
+# Initial import time of the webrequest dataset.
+start_time                          = 2014-04-01T00:00Z
+
+# Time to stop running this coordinator.  Year 3000 == never!
+stop_time                           = 3000-01-01T00:00Z
+
+# HDFS path to workflow to mark a directory as done
+mark_directory_done_workflow_file   = 
${oozie_directory}/util/mark_directory_done/workflow.xml
+
+# HDFS path to hive-site.xml file.  This is needed to run hive actions.
+hive_site_xml                       = 
${oozie_directory}/util/hive/hive-site.xml
+
+# Table to read webrequests from (fully qualified)
+webrequest_table                    = wmf_raw.webrequest
+
+# Table to write hourly pagecounts to (fully qualified)
+pagecounts_all_sites_table          = wmf.webstats
+
+# HDFS paths to directories where webrequest data is time bucketed.
+webrequest_data_directory           = ${name_node}/wmf/data/raw/webrequest
+
+# HDFS path to directory where pagecounts-all-sites data is time bucketed.
+pagecounts_all_sites_data_directory = ${name_node}/wmf/data/wmf/webstats
+
+# Coordinator to start.
+oozie.coord.application.path        = 
${oozie_directory}/pagecounts-all-sites/load/coordinator.xml
+oozie.use.system.libpath            = true
+oozie.action.external.stats.write   = true
diff --git a/oozie/webstats/insert_hourly_pagecounts/coordinator.xml 
b/oozie/pagecounts-all-sites/load/coordinator.xml
similarity index 87%
rename from oozie/webstats/insert_hourly_pagecounts/coordinator.xml
rename to oozie/pagecounts-all-sites/load/coordinator.xml
index 913e46d..f659326 100644
--- a/oozie/webstats/insert_hourly_pagecounts/coordinator.xml
+++ b/oozie/pagecounts-all-sites/load/coordinator.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <coordinator-app xmlns="uri:oozie:coordinator:0.4"
-    name="webstats_insert_hourly_pagecounts-${webstats_table}-coord"
+    name="pagecounts_all_sites_load-${pagecounts_all_sites_table}-coord"
     frequency="${coord:hours(1)}"
     start="${start_time}"
     end="${stop_time}"
@@ -19,13 +19,13 @@
         <property><name>stop_time</name></property>
         <property><name>webrequest_datasets_file</name></property>
         <property><name>webrequest_data_directory</name></property>
-        <property><name>webstats_datasets_file</name></property>
-        <property><name>webstats_data_directory</name></property>
+        <property><name>pagecounts_all_sites_datasets_file</name></property>
+        <property><name>pagecounts_all_sites_data_directory</name></property>
 
         <property><name>hive_site_xml</name></property>
         <property><name>workflow_file</name></property>
         <property><name>webrequest_table</name></property>
-        <property><name>webstats_table</name></property>
+        <property><name>pagecounts_all_sites_table</name></property>
         <property><name>mark_directory_done_workflow_file</name></property>
     </parameters>
 
@@ -55,7 +55,7 @@
 
     <datasets>
         <include>${webrequest_datasets_file}</include>
-        <include>${webstats_datasets_file}</include>
+        <include>${pagecounts_all_sites_datasets_file}</include>
     </datasets>
 
     <input-events>
@@ -68,7 +68,7 @@
     </input-events>
 
     <output-events>
-        <data-out name="webstats" dataset="webstats_pagecounts">
+        <data-out name="pagecounts_all_sites" 
dataset="pagecounts_all_sites_hourly">
             <instance>${coord:current(0)}</instance>
         </data-out>
     </output-events>
@@ -93,7 +93,7 @@
                 </property>
                 <property>
                     <name>destination_table</name>
-                    <value>${webstats_table}</value>
+                    <value>${pagecounts_all_sites_table}</value>
                 </property>
                 <property>
                     <name>year</name>
@@ -113,7 +113,7 @@
                 </property>
                 <property>
                     <name>destination_dataset_directory</name>
-                    <value>${coord:dataOut('webstats')}</value>
+                    <value>${coord:dataOut('pagecounts_all_sites')}</value>
                 </property>
             </configuration>
         </workflow>
diff --git 
a/oozie/webstats/insert_hourly_pagecounts/insert_hourly_pagecounts.hql 
b/oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
similarity index 100%
rename from oozie/webstats/insert_hourly_pagecounts/insert_hourly_pagecounts.hql
rename to oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
diff --git a/oozie/webstats/insert_hourly_pagecounts/workflow.xml 
b/oozie/pagecounts-all-sites/load/workflow.xml
similarity index 96%
rename from oozie/webstats/insert_hourly_pagecounts/workflow.xml
rename to oozie/pagecounts-all-sites/load/workflow.xml
index f93ef66..8bdfc5f 100644
--- a/oozie/webstats/insert_hourly_pagecounts/workflow.xml
+++ b/oozie/pagecounts-all-sites/load/workflow.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <workflow-app xmlns="uri:oozie:workflow:0.4"
-    
name="webstats_insert_hourly_pagecounts-${destination_table}-${year}-${month}-${day}-${hour}-wf">
+    
name="pagecounts_all_sites_load-${destination_table}-${year}-${month}-${day}-${hour}-wf">
 
     <parameters>
         <property>
diff --git a/oozie/webrequest/partition/add/README.md 
b/oozie/webrequest/load/README.md
similarity index 100%
rename from oozie/webrequest/partition/add/README.md
rename to oozie/webrequest/load/README.md
diff --git a/oozie/webrequest/partition/add/bundle.properties 
b/oozie/webrequest/load/bundle.properties
similarity index 92%
rename from oozie/webrequest/partition/add/bundle.properties
rename to oozie/webrequest/load/bundle.properties
index df6bff2..bf4feca 100644
--- a/oozie/webrequest/partition/add/bundle.properties
+++ b/oozie/webrequest/load/bundle.properties
@@ -1,7 +1,7 @@
 # Configures a bundle to manage automatically adding Hive partitions to
 # a webrequest table.  Any of the following properties are overidable with -D.
 # Usage:
-# oozie job -submit -config oozie/webrequest/add/bundle.properties.
+# oozie job -submit -config oozie/webrequest/load/bundle.properties.
 #
 # NOTE:  The $oozie_directory must be synced to HDFS so that all relevant
 #        .xml files exist there when this job is submitted.
@@ -16,10 +16,10 @@
 oozie_directory                   = ${name_node}/wmf/refinery/current/oozie
 
 # HDFS path to coordinator to run for each webrequest_source.
-coordinator_file                  = 
${oozie_directory}/webrequest/partition/add/coordinator.xml
+coordinator_file                  = 
${oozie_directory}/webrequest/load/coordinator.xml
 
 # HDFS path to workflow to run.
-workflow_file                     = 
${oozie_directory}/webrequest/partition/add/workflow.xml
+workflow_file                     = 
${oozie_directory}/webrequest/load/workflow.xml
 
 # HDFS path to webrequest dataset definition
 datasets_file                     = ${oozie_directory}/webrequest/datasets.xml
@@ -31,7 +31,7 @@
 stop_time                         = 3000-01-01T00:00Z
 
 # Workflow to add a partition
-add_partition_workflow_file       = 
${oozie_directory}/util/hive/partition/add/workflow.xml
+add_partition_workflow_file       = 
${oozie_directory}/util/hive/partition/add/workflow.xml
 
 # Workflow to mark a directory as done
 mark_directory_done_workflow_file = 
${oozie_directory}/util/mark_directory_done/workflow.xml
@@ -52,6 +52,6 @@
 webrequest_data_directory         = ${name_node}/wmf/data/raw/webrequest
 
 # Coordintator to start.
-oozie.bundle.application.path     = 
${oozie_directory}/webrequest/partition/add/bundle.xml
+oozie.bundle.application.path     = 
${oozie_directory}/webrequest/load/bundle.xml
 oozie.use.system.libpath          = true
 oozie.action.external.stats.write = true
diff --git a/oozie/webrequest/partition/add/bundle.xml 
b/oozie/webrequest/load/bundle.xml
similarity index 86%
rename from oozie/webrequest/partition/add/bundle.xml
rename to oozie/webrequest/load/bundle.xml
index c1144da..df602b7 100644
--- a/oozie/webrequest/partition/add/bundle.xml
+++ b/oozie/webrequest/load/bundle.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <bundle-app xmlns="uri:oozie:bundle:0.2"
-    name="hive_add_partition-${table}-bundle">
+    name="hive_webrequest_load-${table}-bundle">
 
     <parameters>
         <property>
@@ -25,7 +25,7 @@
         <property><name>mark_directory_done_workflow_file</name></property>
     </parameters>
 
-    <coordinator name='hive_add_partition-webrequest-bits'>
+    <coordinator name='hive_webrequest_load-webrequest-bits'>
         <app-path>${coordinator_file}</app-path>
         <configuration>
             <property>
@@ -35,7 +35,7 @@
         </configuration>
     </coordinator>
 
-    <coordinator name='hive_add_partition-webrequest-mobile'>
+    <coordinator name='hive_webrequest_load-webrequest-mobile'>
         <app-path>${coordinator_file}</app-path>
         <configuration>
             <property>
@@ -45,7 +45,7 @@
         </configuration>
     </coordinator>
 
-    <coordinator name='hive_add_partition-webrequest-text'>
+    <coordinator name='hive_webrequest_load-webrequest-text'>
         <app-path>${coordinator_file}</app-path>
         <configuration>
             <property>
@@ -55,7 +55,7 @@
         </configuration>
     </coordinator>
 
-    <coordinator name='hive_add_partition-webrequest-upload'>
+    <coordinator name='hive_webrequest_load-webrequest-upload'>
         <app-path>${coordinator_file}</app-path>
         <configuration>
             <property>
diff --git 
a/oozie/webrequest/partition/add/check_sequence_statistics_workflow.xml 
b/oozie/webrequest/load/check_sequence_statistics_workflow.xml
similarity index 100%
rename from 
oozie/webrequest/partition/add/check_sequence_statistics_workflow.xml
rename to oozie/webrequest/load/check_sequence_statistics_workflow.xml
diff --git a/oozie/webrequest/partition/add/coordinator.xml 
b/oozie/webrequest/load/coordinator.xml
similarity index 98%
rename from oozie/webrequest/partition/add/coordinator.xml
rename to oozie/webrequest/load/coordinator.xml
index 08dad7a..77045d9 100644
--- a/oozie/webrequest/partition/add/coordinator.xml
+++ b/oozie/webrequest/load/coordinator.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <coordinator-app xmlns="uri:oozie:coordinator:0.4"
-    name="hive_add_partition-${table}-${webrequest_source}-coord"
+    name="hive_webrequest_load-${table}-${webrequest_source}-coord"
     frequency="${coord:hours(1)}"
     start="${start_time}"
     end="${stop_time}"
diff --git a/oozie/webrequest/partition/add/extract_faulty_hosts.hql 
b/oozie/webrequest/load/extract_faulty_hosts.hql
similarity index 100%
rename from oozie/webrequest/partition/add/extract_faulty_hosts.hql
rename to oozie/webrequest/load/extract_faulty_hosts.hql
diff --git a/oozie/webrequest/partition/add/generate_sequence_statistics.hql 
b/oozie/webrequest/load/generate_sequence_statistics.hql
similarity index 100%
rename from oozie/webrequest/partition/add/generate_sequence_statistics.hql
rename to oozie/webrequest/load/generate_sequence_statistics.hql
diff --git a/oozie/webrequest/partition/add/workflow.xml 
b/oozie/webrequest/load/workflow.xml
similarity index 97%
rename from oozie/webrequest/partition/add/workflow.xml
rename to oozie/webrequest/load/workflow.xml
index e2dad9e..bc2d271 100644
--- a/oozie/webrequest/partition/add/workflow.xml
+++ b/oozie/webrequest/load/workflow.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <workflow-app xmlns="uri:oozie:workflow:0.4"
-    
name="hive_add_partition-${table}-${webrequest_source},${year},${month},${day},${hour}-wf">
+    
name="hive_webrequest_load-${table}-${webrequest_source},${year},${month},${day},${hour}-wf">
 
     <parameters>
         <property>
diff --git a/oozie/webstats/generate_hourly_files/bundle.properties 
b/oozie/webstats/generate_hourly_files/bundle.properties
deleted file mode 100644
index 8a619e2..0000000
--- a/oozie/webstats/generate_hourly_files/bundle.properties
+++ /dev/null
@@ -1,61 +0,0 @@
-# Configures a coordinator to generate an hourly pagecounts file from
-# the webstats table.
-#
-# Usage:
-#     oozie job -run \
-#         -config oozie/webstats/generate_hourly_files/bundle.properties
-#
-# NOTE:  The $oozie_directory must be synced to HDFS so that all relevant
-#        .xml files exist there when this job is submitted.
-
-
-name_node                         = hdfs://analytics-hadoop
-job_tracker                       = resourcemanager.analytics.eqiad.wmnet:8032
-queue_name                        = default
-
-# Base path in HDFS to oozie files.
-# Other files will be used relative to this path.
-oozie_directory                   = ${name_node}/wmf/refinery/current/oozie
-
-# HDFS path to coordinator to run for each webrequest_source.
-coordinator_file                  = ${oozie_directory}/webstats/generate_hourly_files/coordinator.xml
-
-# HDFS path to workflow to run.
-workflow_file                     = ${oozie_directory}/webstats/generate_hourly_files/workflow.xml
-
-# HDFS path to webstats dataset definition
-webstats_datasets_file   = ${oozie_directory}/webstats/datasets.xml
-
-# Initial import time of the webstats dataset.
-start_time                        = 2014-04-01T00:00Z
-
-# Time to stop running this coordinator.  Year 3000 == never!
-stop_time                         = 3000-01-01T00:00Z
-
-# HDFS path to workflow to mark a directory as done
-mark_directory_done_workflow_file = ${oozie_directory}/util/mark_directory_done/workflow.xml
-
-archive_job_output_workflow_file  = ${oozie_directory}/util/archive_job_output/workflow.xml
-
-# HDFS path to hive-site.xml file.  This is needed to run hive actions.
-hive_site_xml                     = ${oozie_directory}/util/hive/hive-site.xml
-
-# Table to write hourly pagecounts to (fully qualified)
-webstats_table                    = wmf.webstats
-
-# HDFS path to directory where webstats data is time bucketed.
-webstats_data_directory           = ${name_node}/wmf/data/wmf/webstats
-
-# Temporary directory
-temporary_directory               = ${name_node}/tmp
-
-# Archive base directory
-archive_directory                 = ${name_node}/wmf/data/archive
-
-# Archive directory for webstats
-webstats_archive_directory        = ${archive_directory}/webstats
-
-# Coordintator to start.
-oozie.bundle.application.path     = ${oozie_directory}/webstats/generate_hourly_files/bundle.xml
-oozie.use.system.libpath          = true
-oozie.action.external.stats.write = true
diff --git a/oozie/webstats/insert_hourly_pagecounts/coordinator.properties b/oozie/webstats/insert_hourly_pagecounts/coordinator.properties
deleted file mode 100644
index 6abcf52..0000000
--- a/oozie/webstats/insert_hourly_pagecounts/coordinator.properties
+++ /dev/null
@@ -1,56 +0,0 @@
-# Configures a coordinator to insert hourly webstats data
-# from webrequests table into the webstats table.
-#
-# Usage:
-#     oozie job -run \
-#         -config oozie/webstats/insert_hourly_pagecounts/coordinator.properties
-#
-# NOTE:  The $oozie_directory must be synced to HDFS so that all relevant
-#        .xml files exist there when this job is submitted.
-
-
-name_node                         = hdfs://analytics-hadoop
-job_tracker                       = resourcemanager.analytics.eqiad.wmnet:8032
-queue_name                        = default
-
-# Base path in HDFS to oozie files.
-# Other files will be used relative to this path.
-oozie_directory                   = ${name_node}/wmf/refinery/current/oozie
-
-# HDFS path to workflow to run.
-workflow_file                     = ${oozie_directory}/webstats/insert_hourly_pagecounts/workflow.xml
-
-# HDFS path to webrequest dataset definition
-webrequest_datasets_file          = ${oozie_directory}/webrequest/datasets.xml
-
-# HDFS path to webstats dataset definition
-webstats_datasets_file   = ${oozie_directory}/webstats/datasets.xml
-
-# Initial import time of the webrequest dataset.
-start_time                        = 2014-04-01T00:00Z
-
-# Time to stop running this coordinator.  Year 3000 == never!
-stop_time                         = 3000-01-01T00:00Z
-
-# HDFS path to workflow to mark a directory as done
-mark_directory_done_workflow_file = ${oozie_directory}/util/mark_directory_done/workflow.xml
-
-# HDFS path to hive-site.xml file.  This is needed to run hive actions.
-hive_site_xml                     = ${oozie_directory}/util/hive/hive-site.xml
-
-# Table to read webrequests from (fully qualified)
-webrequest_table                  = wmf_raw.webrequest
-
-# Table to write hourly pagecounts to (fully qualified)
-webstats_table                    = wmf.webstats
-
-# HDFS paths to directories where webrequest data is time bucketed.
-webrequest_data_directory         = ${name_node}/wmf/data/raw/webrequest
-
-# HDFS path to directory where webstats data is time bucketed.
-webstats_data_directory           = ${name_node}/wmf/data/wmf/webstats
-
-# Coordintator to start.
-oozie.coord.application.path      = ${oozie_directory}/webstats/insert_hourly_pagecounts/coordinator.xml
-oozie.use.system.libpath          = true
-oozie.action.external.stats.write = true

-- 
To view, visit https://gerrit.wikimedia.org/r/182807
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I83bbfb2dd74181595a54ea5126412fe718c21282
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to