Hello Ottomata,
I'd like you to do a code review. Please visit
https://gerrit.wikimedia.org/r/182807
to review the following change.
Change subject: Rearrange directory layout underneath /oozie
......................................................................
Rearrange directory layout underneath /oozie
On 2014-10-03 in #wikimedia-analytics it was decided to move the
directories underneath /oozie according to:
* webrequest/partition/add -> webrequest/load
* webstats/insert_hourly_pagecounts -> pagecounts-all-sites/load
* webstats/generate_hourly_files -> pagecounts-all-sites/archive
We do not move files in hdfs yet, or rename Hive tables.
Change-Id: I83bbfb2dd74181595a54ea5126412fe718c21282
---
M diagrams/oozie-overview.dia
R oozie/pagecounts-all-sites/archive/archive_pagecounts.hql
R oozie/pagecounts-all-sites/archive/archive_projectcounts.hql
A oozie/pagecounts-all-sites/archive/bundle.properties
R oozie/pagecounts-all-sites/archive/bundle.xml
R oozie/pagecounts-all-sites/archive/coordinator.xml
R oozie/pagecounts-all-sites/archive/workflow.xml
R oozie/pagecounts-all-sites/datasets.xml
A oozie/pagecounts-all-sites/load/coordinator.properties
R oozie/pagecounts-all-sites/load/coordinator.xml
R oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
R oozie/pagecounts-all-sites/load/workflow.xml
R oozie/webrequest/load/README.md
R oozie/webrequest/load/bundle.properties
R oozie/webrequest/load/bundle.xml
R oozie/webrequest/load/check_sequence_statistics_workflow.xml
R oozie/webrequest/load/coordinator.xml
R oozie/webrequest/load/extract_faulty_hosts.hql
R oozie/webrequest/load/generate_sequence_statistics.hql
R oozie/webrequest/load/workflow.xml
D oozie/webstats/generate_hourly_files/bundle.properties
D oozie/webstats/insert_hourly_pagecounts/coordinator.properties
22 files changed, 176 insertions(+), 174 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery
refs/changes/07/182807/1
diff --git a/diagrams/oozie-overview.dia b/diagrams/oozie-overview.dia
index 2b8771b..952ebbb 100644
--- a/diagrams/oozie-overview.dia
+++ b/diagrams/oozie-overview.dia
Binary files differ
diff --git
a/oozie/webstats/generate_hourly_files/generate_hourly_pagecounts_file.hql
b/oozie/pagecounts-all-sites/archive/archive_pagecounts.hql
similarity index 85%
rename from
oozie/webstats/generate_hourly_files/generate_hourly_pagecounts_file.hql
rename to oozie/pagecounts-all-sites/archive/archive_pagecounts.hql
index 6103d14..4448fad 100644
--- a/oozie/webstats/generate_hourly_files/generate_hourly_pagecounts_file.hql
+++ b/oozie/pagecounts-all-sites/archive/archive_pagecounts.hql
@@ -2,12 +2,13 @@
SET
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
--^ To work around HIVE-3296, we have SETs before any comments
--- Generates an hourly webstats pagecounts file into HDFS
+-- Generates an hourly pagecounts-all-sites pagecounts file into HDFS
--
-- Parameters:
-- destination_directory -- Directory in HDFS where to store the generated
-- data in.
--- source_table -- table containing hourly aggregated webstats data
+-- source_table -- table containing hourly aggregated
+-- pagecounts-all-sites data
-- year -- year of the to-be-generated hour
-- month -- month of the to-be-generated hour
-- day -- day of the to-be-generated hour
@@ -15,7 +16,7 @@
--
--
-- Usage:
--- hive -f generate_hourly_pagecounts_file.hql \
+-- hive -f archive_pagecounts.hql \
-- -d destination_directory=/tmp/foo \
-- -d source_table=wmf.webstats \
-- -d year=2014 \
diff --git
a/oozie/webstats/generate_hourly_files/generate_hourly_projectcounts_file.hql
b/oozie/pagecounts-all-sites/archive/archive_projectcounts.hql
similarity index 84%
rename from
oozie/webstats/generate_hourly_files/generate_hourly_projectcounts_file.hql
rename to oozie/pagecounts-all-sites/archive/archive_projectcounts.hql
index ab94500..82cb00b 100644
---
a/oozie/webstats/generate_hourly_files/generate_hourly_projectcounts_file.hql
+++ b/oozie/pagecounts-all-sites/archive/archive_projectcounts.hql
@@ -1,12 +1,13 @@
SET hive.exec.compress.output=false;
--^ To work around HIVE-3296, we have SETs before any comments
--- Generates an hourly webstats projectcounts file into HDFS
+-- Generates an hourly pagecounts-all-sites projectcounts file into HDFS
--
-- Parameters:
-- destination_directory -- Directory in HDFS where to store the generated
-- data in
--- source_table -- table containing hourly aggregated webstats data
+-- source_table -- table containing hourly aggregated
+-- pagecounts-all-sites data
-- year -- year of the to-be-generated hour
-- month -- month of the to-be-generated hour
-- day -- day of the to-be-generated hour
@@ -14,7 +15,7 @@
--
--
-- Usage:
--- hive -f generate_hourly_projectcounts_file.hql \
+-- hive -f archive_projectcounts.hql \
-- -d destination_directory=/tmp/foo \
-- -d source_table=wmf.webstats \
-- -d year=2014 \
diff --git a/oozie/pagecounts-all-sites/archive/bundle.properties
b/oozie/pagecounts-all-sites/archive/bundle.properties
new file mode 100644
index 0000000..28d02ef
--- /dev/null
+++ b/oozie/pagecounts-all-sites/archive/bundle.properties
@@ -0,0 +1,61 @@
+# Configures a bundle to generate hourly pagecounts-all-sites files from
+# the webstats table.
+#
+# Usage:
+# oozie job -run \
+# -config oozie/pagecounts-all-sites/archive/bundle.properties
+#
+# NOTE: The $oozie_directory must be synced to HDFS so that all relevant
+# .xml files exist there when this job is submitted.
+
+
+name_node = hdfs://analytics-hadoop
+job_tracker =
resourcemanager.analytics.eqiad.wmnet:8032
+queue_name = default
+
+# Base path in HDFS to oozie files.
+# Other files will be used relative to this path.
+oozie_directory =
${name_node}/wmf/refinery/current/oozie
+
+# HDFS path to coordinator to run for each aspect (pagecounts, projectcounts).
+coordinator_file =
${oozie_directory}/pagecounts-all-sites/archive/coordinator.xml
+
+# HDFS path to workflow to run.
+workflow_file =
${oozie_directory}/pagecounts-all-sites/archive/workflow.xml
+
+# HDFS path to pagecounts-all-sites dataset definition
+pagecounts_all_sites_datasets_file =
${oozie_directory}/pagecounts-all-sites/datasets.xml
+
+# Initial import time of the pagecounts-all-sites dataset.
+start_time = 2014-04-01T00:00Z
+
+# Time to stop running this coordinator. Year 3000 == never!
+stop_time = 3000-01-01T00:00Z
+
+# HDFS path to workflow to mark a directory as done
+mark_directory_done_workflow_file =
${oozie_directory}/util/mark_directory_done/workflow.xml
+
+archive_job_output_workflow_file =
${oozie_directory}/util/archive_job_output/workflow.xml
+
+# HDFS path to hive-site.xml file. This is needed to run hive actions.
+hive_site_xml =
${oozie_directory}/util/hive/hive-site.xml
+
+# Table to read hourly pagecounts from (fully qualified)
+pagecounts_all_sites_table = wmf.webstats
+
+# HDFS path to directory where pagecounts-all-sites data is time bucketed.
+pagecounts_all_sites_data_directory = ${name_node}/wmf/data/wmf/webstats
+
+# Temporary directory
+temporary_directory = ${name_node}/tmp
+
+# Archive base directory
+archive_directory = ${name_node}/wmf/data/archive
+
+# Archive directory for pagecounts-all-sites
+pagecounts_all_sites_archive_directory = ${archive_directory}/webstats
+
+# Bundle to start.
+oozie.bundle.application.path =
${oozie_directory}/pagecounts-all-sites/archive/bundle.xml
+oozie.use.system.libpath = true
+oozie.action.external.stats.write = true
diff --git a/oozie/webstats/generate_hourly_files/bundle.xml
b/oozie/pagecounts-all-sites/archive/bundle.xml
similarity index 76%
rename from oozie/webstats/generate_hourly_files/bundle.xml
rename to oozie/pagecounts-all-sites/archive/bundle.xml
index eb6382e..901e820 100644
--- a/oozie/webstats/generate_hourly_files/bundle.xml
+++ b/oozie/pagecounts-all-sites/archive/bundle.xml
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<bundle-app xmlns="uri:oozie:bundle:0.2"
- name="webstats_generate_hourly_files-bundle">
+ name="pagecounts_all_sites_archive-bundle">
<parameters>
<property>
@@ -13,18 +13,18 @@
<property><name>job_tracker</name></property>
<property><name>start_time</name></property>
<property><name>stop_time</name></property>
- <property><name>webstats_datasets_file</name></property>
- <property><name>webstats_data_directory</name></property>
+ <property><name>pagecounts_all_sites_datasets_file</name></property>
+ <property><name>pagecounts_all_sites_data_directory</name></property>
<property><name>hive_site_xml</name></property>
<property><name>workflow_file</name></property>
- <property><name>webstats_table</name></property>
+ <property><name>pagecounts_all_sites_table</name></property>
<property><name>mark_directory_done_workflow_file</name></property>
<property><name>temporary_directory</name></property>
- <property><name>webstats_archive_directory</name></property>
+
<property><name>pagecounts_all_sites_archive_directory</name></property>
<property><name>archive_job_output_workflow_file</name></property>
</parameters>
- <coordinator name="webstats_generate_hourly_files-pagecounts">
+ <coordinator name="pagecounts_all_sites_archive-pagecounts">
<app-path>${coordinator_file}</app-path>
<configuration>
<property>
@@ -38,7 +38,7 @@
</configuration>
</coordinator>
- <coordinator name="webstats_generate_hourly_files-projectcounts">
+ <coordinator name="pagecounts_all_sites_archive-projectcounts">
<app-path>${coordinator_file}</app-path>
<configuration>
<property>
diff --git a/oozie/webstats/generate_hourly_files/coordinator.xml
b/oozie/pagecounts-all-sites/archive/coordinator.xml
similarity index 86%
rename from oozie/webstats/generate_hourly_files/coordinator.xml
rename to oozie/pagecounts-all-sites/archive/coordinator.xml
index d2809db..88c0493 100644
--- a/oozie/webstats/generate_hourly_files/coordinator.xml
+++ b/oozie/pagecounts-all-sites/archive/coordinator.xml
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<coordinator-app xmlns="uri:oozie:coordinator:0.4"
- name="webstats_generate_hourly_files-${aspect_name}-coord"
+ name="pagecounts_all_sites_archive-${aspect_name}-coord"
frequency="${coord:hours(1)}"
start="${start_time}"
end="${stop_time}"
@@ -17,14 +17,14 @@
<property><name>job_tracker</name></property>
<property><name>start_time</name></property>
<property><name>stop_time</name></property>
- <property><name>webstats_datasets_file</name></property>
- <property><name>webstats_data_directory</name></property>
+ <property><name>pagecounts_all_sites_datasets_file</name></property>
+ <property><name>pagecounts_all_sites_data_directory</name></property>
<property><name>hive_site_xml</name></property>
<property><name>workflow_file</name></property>
- <property><name>webstats_table</name></property>
+ <property><name>pagecounts_all_sites_table</name></property>
<property><name>mark_directory_done_workflow_file</name></property>
<property><name>temporary_directory</name></property>
- <property><name>webstats_archive_directory</name></property>
+
<property><name>pagecounts_all_sites_archive_directory</name></property>
<property><name>archive_job_output_workflow_file</name></property>
<property><name>aspect_name</name></property>
<property><name>aspect_compression_ending</name></property>
@@ -55,11 +55,11 @@
</controls>
<datasets>
- <include>${webstats_datasets_file}</include>
+ <include>${pagecounts_all_sites_datasets_file}</include>
</datasets>
<input-events>
- <data-in name="webstats" dataset="webstats_pagecounts">
+ <data-in name="pagecounts_all_sites"
dataset="pagecounts_all_sites_hourly">
<instance>${coord:current(0)}</instance>
</data-in>
</input-events>
@@ -79,8 +79,8 @@
<value>${hive_site_xml}</value>
</property>
<property>
- <name>webstats_table</name>
- <value>${webstats_table}</value>
+ <name>pagecounts_all_sites_table</name>
+ <value>${pagecounts_all_sites_table}</value>
</property>
<property>
<name>year</name>
@@ -123,8 +123,8 @@
<value>${temporary_directory}</value>
</property>
<property>
- <name>webstats_archive_directory</name>
- <value>${webstats_archive_directory}</value>
+ <name>pagecounts_all_sites_archive_directory</name>
+ <value>${pagecounts_all_sites_archive_directory}</value>
</property>
<property>
<name>archive_job_output_workflow_file</name>
diff --git a/oozie/webstats/generate_hourly_files/workflow.xml
b/oozie/pagecounts-all-sites/archive/workflow.xml
similarity index 86%
rename from oozie/webstats/generate_hourly_files/workflow.xml
rename to oozie/pagecounts-all-sites/archive/workflow.xml
index 5bff50f..c70f8f1 100644
--- a/oozie/webstats/generate_hourly_files/workflow.xml
+++ b/oozie/pagecounts-all-sites/archive/workflow.xml
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.4"
-
name="webstats_generate_hourly_files-${aspect_name}-${year}-${month}-${day}-${hour}-wf">
+
name="pagecounts_all_sites_archive-${aspect_name}-${year}-${month}-${day}-${hour}-wf">
<parameters>
<property>
@@ -17,9 +17,9 @@
<description>hive-site.xml file path in HDFS</description>
</property>
<property>
- <name>webstats_table</name>
+ <name>pagecounts_all_sites_table</name>
<description>
- Hive table to read webstats data from.
+ Hive table to read pagecounts-all-sites data from.
</description>
</property>
<property>
@@ -63,8 +63,8 @@
<description>A directory in HDFS for temporary files</description>
</property>
<property>
- <name>webstats_archive_directory</name>
- <description>Directory for archive of webstats files</description>
+ <name>pagecounts_all_sites_archive_directory</name>
+ <description>Directory for archive of pagecounts-all-sites
files</description>
</property>
<property>
<name>archive_job_output_workflow_file</name>
@@ -94,9 +94,9 @@
</property>
</configuration>
- <script>generate_hourly_${aspect_name}_file.hql</script>
+ <script>archive_${aspect_name}.hql</script>
- <param>source_table=${webstats_table}</param>
+ <param>source_table=${pagecounts_all_sites_table}</param>
<param>destination_directory=${temporary_directory}/${wf:id()}-${aspect_name}</param>
<param>year=${year}</param>
<param>month=${month}</param>
@@ -140,7 +140,7 @@
timestamp in the filename. To not break scripts of people,
we also name files that way.
-->
-
<value>${webstats_archive_directory}/${year_plus_1_hour}/${year_plus_1_hour}-${month_plus_1_hour}/${aspect_name}-${year_plus_1_hour}${month_plus_1_hour}${day_plus_1_hour}-${hour_plus_1_hour}0000${aspect_compression_ending
eq 'EMPTY' ? '' : aspect_compression_ending}</value>
+
<value>${pagecounts_all_sites_archive_directory}/${year_plus_1_hour}/${year_plus_1_hour}-${month_plus_1_hour}/${aspect_name}-${year_plus_1_hour}${month_plus_1_hour}${day_plus_1_hour}-${hour_plus_1_hour}0000${aspect_compression_ending
eq 'EMPTY' ? '' : aspect_compression_ending}</value>
</property>
</configuration>
</sub-workflow>
diff --git a/oozie/webstats/datasets.xml
b/oozie/pagecounts-all-sites/datasets.xml
similarity index 79%
rename from oozie/webstats/datasets.xml
rename to oozie/pagecounts-all-sites/datasets.xml
index 8ef1609..3dc9768 100644
--- a/oozie/webstats/datasets.xml
+++ b/oozie/pagecounts-all-sites/datasets.xml
@@ -1,17 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
-Defines reusable datasets for webstats.
+Defines reusable datasets for pagecounts-all-sites.
Use this dataset in your coordinator.xml files by setting:
${start_time} - the initial instance of your data.
Example: 2014-04-01T00:00Z
- ${webstats_data_directory}
+ ${pagecounts_all_sites_data_directory}
- Path to directory where data is time bucketed.
Example: /wmf/data/wmf/webstats
-->
<datasets>
- <dataset name="webstats_pagecounts"
+ <dataset name="pagecounts_all_sites_hourly"
frequency="${coord:hours(1)}"
initial-instance="${start_time}"
timezone="Universal">
@@ -27,6 +27,6 @@
at “${...}” as input for the second EL level. There, the variables hold
their expected values, and we can start unpadding them.
-->
-
<uri-template>${webstats_data_directory}/year=${YEAR}/month=${"$"}{MONTH +
0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template>
+
<uri-template>${pagecounts_all_sites_data_directory}/year=${YEAR}/month=${"$"}{MONTH
+ 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template>
</dataset>
</datasets>
diff --git a/oozie/pagecounts-all-sites/load/coordinator.properties
b/oozie/pagecounts-all-sites/load/coordinator.properties
new file mode 100644
index 0000000..e89c81e
--- /dev/null
+++ b/oozie/pagecounts-all-sites/load/coordinator.properties
@@ -0,0 +1,56 @@
+# Configures a coordinator to insert hourly pagecounts-all-sites data
+# from webrequests table into the webstats table.
+#
+# Usage:
+# oozie job -run \
+# -config oozie/pagecounts-all-sites/load/coordinator.properties
+#
+# NOTE: The $oozie_directory must be synced to HDFS so that all relevant
+# .xml files exist there when this job is submitted.
+
+
+name_node = hdfs://analytics-hadoop
+job_tracker =
resourcemanager.analytics.eqiad.wmnet:8032
+queue_name = default
+
+# Base path in HDFS to oozie files.
+# Other files will be used relative to this path.
+oozie_directory = ${name_node}/wmf/refinery/current/oozie
+
+# HDFS path to workflow to run.
+workflow_file =
${oozie_directory}/pagecounts-all-sites/load/workflow.xml
+
+# HDFS path to webrequest dataset definition
+webrequest_datasets_file =
${oozie_directory}/webrequest/datasets.xml
+
+# HDFS path to pagecounts-all-sites dataset definition
+pagecounts_all_sites_datasets_file =
${oozie_directory}/pagecounts-all-sites/datasets.xml
+
+# Initial import time of the webrequest dataset.
+start_time = 2014-04-01T00:00Z
+
+# Time to stop running this coordinator. Year 3000 == never!
+stop_time = 3000-01-01T00:00Z
+
+# HDFS path to workflow to mark a directory as done
+mark_directory_done_workflow_file =
${oozie_directory}/util/mark_directory_done/workflow.xml
+
+# HDFS path to hive-site.xml file. This is needed to run hive actions.
+hive_site_xml =
${oozie_directory}/util/hive/hive-site.xml
+
+# Table to read webrequests from (fully qualified)
+webrequest_table = wmf_raw.webrequest
+
+# Table to write hourly pagecounts to (fully qualified)
+pagecounts_all_sites_table = wmf.webstats
+
+# HDFS paths to directories where webrequest data is time bucketed.
+webrequest_data_directory = ${name_node}/wmf/data/raw/webrequest
+
+# HDFS path to directory where pagecounts-all-sites data is time bucketed.
+pagecounts_all_sites_data_directory = ${name_node}/wmf/data/wmf/webstats
+
+# Coordinator to start.
+oozie.coord.application.path =
${oozie_directory}/pagecounts-all-sites/load/coordinator.xml
+oozie.use.system.libpath = true
+oozie.action.external.stats.write = true
diff --git a/oozie/webstats/insert_hourly_pagecounts/coordinator.xml
b/oozie/pagecounts-all-sites/load/coordinator.xml
similarity index 87%
rename from oozie/webstats/insert_hourly_pagecounts/coordinator.xml
rename to oozie/pagecounts-all-sites/load/coordinator.xml
index 913e46d..f659326 100644
--- a/oozie/webstats/insert_hourly_pagecounts/coordinator.xml
+++ b/oozie/pagecounts-all-sites/load/coordinator.xml
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<coordinator-app xmlns="uri:oozie:coordinator:0.4"
- name="webstats_insert_hourly_pagecounts-${webstats_table}-coord"
+ name="pagecounts_all_sites_load-${pagecounts_all_sites_table}-coord"
frequency="${coord:hours(1)}"
start="${start_time}"
end="${stop_time}"
@@ -19,13 +19,13 @@
<property><name>stop_time</name></property>
<property><name>webrequest_datasets_file</name></property>
<property><name>webrequest_data_directory</name></property>
- <property><name>webstats_datasets_file</name></property>
- <property><name>webstats_data_directory</name></property>
+ <property><name>pagecounts_all_sites_datasets_file</name></property>
+ <property><name>pagecounts_all_sites_data_directory</name></property>
<property><name>hive_site_xml</name></property>
<property><name>workflow_file</name></property>
<property><name>webrequest_table</name></property>
- <property><name>webstats_table</name></property>
+ <property><name>pagecounts_all_sites_table</name></property>
<property><name>mark_directory_done_workflow_file</name></property>
</parameters>
@@ -55,7 +55,7 @@
<datasets>
<include>${webrequest_datasets_file}</include>
- <include>${webstats_datasets_file}</include>
+ <include>${pagecounts_all_sites_datasets_file}</include>
</datasets>
<input-events>
@@ -68,7 +68,7 @@
</input-events>
<output-events>
- <data-out name="webstats" dataset="webstats_pagecounts">
+ <data-out name="pagecounts_all_sites"
dataset="pagecounts_all_sites_hourly">
<instance>${coord:current(0)}</instance>
</data-out>
</output-events>
@@ -93,7 +93,7 @@
</property>
<property>
<name>destination_table</name>
- <value>${webstats_table}</value>
+ <value>${pagecounts_all_sites_table}</value>
</property>
<property>
<name>year</name>
@@ -113,7 +113,7 @@
</property>
<property>
<name>destination_dataset_directory</name>
- <value>${coord:dataOut('webstats')}</value>
+ <value>${coord:dataOut('pagecounts_all_sites')}</value>
</property>
</configuration>
</workflow>
diff --git
a/oozie/webstats/insert_hourly_pagecounts/insert_hourly_pagecounts.hql
b/oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
similarity index 100%
rename from oozie/webstats/insert_hourly_pagecounts/insert_hourly_pagecounts.hql
rename to oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
diff --git a/oozie/webstats/insert_hourly_pagecounts/workflow.xml
b/oozie/pagecounts-all-sites/load/workflow.xml
similarity index 96%
rename from oozie/webstats/insert_hourly_pagecounts/workflow.xml
rename to oozie/pagecounts-all-sites/load/workflow.xml
index f93ef66..8bdfc5f 100644
--- a/oozie/webstats/insert_hourly_pagecounts/workflow.xml
+++ b/oozie/pagecounts-all-sites/load/workflow.xml
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.4"
-
name="webstats_insert_hourly_pagecounts-${destination_table}-${year}-${month}-${day}-${hour}-wf">
+
name="pagecounts_all_sites_load-${destination_table}-${year}-${month}-${day}-${hour}-wf">
<parameters>
<property>
diff --git a/oozie/webrequest/partition/add/README.md
b/oozie/webrequest/load/README.md
similarity index 100%
rename from oozie/webrequest/partition/add/README.md
rename to oozie/webrequest/load/README.md
diff --git a/oozie/webrequest/partition/add/bundle.properties
b/oozie/webrequest/load/bundle.properties
similarity index 92%
rename from oozie/webrequest/partition/add/bundle.properties
rename to oozie/webrequest/load/bundle.properties
index df6bff2..bf4feca 100644
--- a/oozie/webrequest/partition/add/bundle.properties
+++ b/oozie/webrequest/load/bundle.properties
@@ -1,7 +1,7 @@
# Configures a bundle to manage automatically adding Hive partitions to
# a webrequest table. Any of the following properties are overidable with -D.
# Usage:
-# oozie job -submit -config oozie/webrequest/add/bundle.properties.
+# oozie job -submit -config oozie/webrequest/load/bundle.properties.
#
# NOTE: The $oozie_directory must be synced to HDFS so that all relevant
# .xml files exist there when this job is submitted.
@@ -16,10 +16,10 @@
oozie_directory = ${name_node}/wmf/refinery/current/oozie
# HDFS path to coordinator to run for each webrequest_source.
-coordinator_file =
${oozie_directory}/webrequest/partition/add/coordinator.xml
+coordinator_file =
${oozie_directory}/webrequest/load/coordinator.xml
# HDFS path to workflow to run.
-workflow_file =
${oozie_directory}/webrequest/partition/add/workflow.xml
+workflow_file =
${oozie_directory}/webrequest/load/workflow.xml
# HDFS path to webrequest dataset definition
datasets_file = ${oozie_directory}/webrequest/datasets.xml
@@ -31,7 +31,7 @@
stop_time = 3000-01-01T00:00Z
# Workflow to add a partition
add_partition_workflow_file =
${oozie_directory}/util/hive/partition/add/workflow.xml
# Workflow to mark a directory as done
mark_directory_done_workflow_file =
${oozie_directory}/util/mark_directory_done/workflow.xml
@@ -52,6 +52,6 @@
webrequest_data_directory = ${name_node}/wmf/data/raw/webrequest
# Coordintator to start.
-oozie.bundle.application.path =
${oozie_directory}/webrequest/partition/add/bundle.xml
+oozie.bundle.application.path =
${oozie_directory}/webrequest/load/bundle.xml
oozie.use.system.libpath = true
oozie.action.external.stats.write = true
diff --git a/oozie/webrequest/partition/add/bundle.xml
b/oozie/webrequest/load/bundle.xml
similarity index 86%
rename from oozie/webrequest/partition/add/bundle.xml
rename to oozie/webrequest/load/bundle.xml
index c1144da..df602b7 100644
--- a/oozie/webrequest/partition/add/bundle.xml
+++ b/oozie/webrequest/load/bundle.xml
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<bundle-app xmlns="uri:oozie:bundle:0.2"
- name="hive_add_partition-${table}-bundle">
+ name="hive_webrequest_load-${table}-bundle">
<parameters>
<property>
@@ -25,7 +25,7 @@
<property><name>mark_directory_done_workflow_file</name></property>
</parameters>
- <coordinator name='hive_add_partition-webrequest-bits'>
+ <coordinator name='hive_webrequest_load-webrequest-bits'>
<app-path>${coordinator_file}</app-path>
<configuration>
<property>
@@ -35,7 +35,7 @@
</configuration>
</coordinator>
- <coordinator name='hive_add_partition-webrequest-mobile'>
+ <coordinator name='hive_webrequest_load-webrequest-mobile'>
<app-path>${coordinator_file}</app-path>
<configuration>
<property>
@@ -45,7 +45,7 @@
</configuration>
</coordinator>
- <coordinator name='hive_add_partition-webrequest-text'>
+ <coordinator name='hive_webrequest_load-webrequest-text'>
<app-path>${coordinator_file}</app-path>
<configuration>
<property>
@@ -55,7 +55,7 @@
</configuration>
</coordinator>
- <coordinator name='hive_add_partition-webrequest-upload'>
+ <coordinator name='hive_webrequest_load-webrequest-upload'>
<app-path>${coordinator_file}</app-path>
<configuration>
<property>
diff --git
a/oozie/webrequest/partition/add/check_sequence_statistics_workflow.xml
b/oozie/webrequest/load/check_sequence_statistics_workflow.xml
similarity index 100%
rename from
oozie/webrequest/partition/add/check_sequence_statistics_workflow.xml
rename to oozie/webrequest/load/check_sequence_statistics_workflow.xml
diff --git a/oozie/webrequest/partition/add/coordinator.xml
b/oozie/webrequest/load/coordinator.xml
similarity index 98%
rename from oozie/webrequest/partition/add/coordinator.xml
rename to oozie/webrequest/load/coordinator.xml
index 08dad7a..77045d9 100644
--- a/oozie/webrequest/partition/add/coordinator.xml
+++ b/oozie/webrequest/load/coordinator.xml
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<coordinator-app xmlns="uri:oozie:coordinator:0.4"
- name="hive_add_partition-${table}-${webrequest_source}-coord"
+ name="hive_webrequest_load-${table}-${webrequest_source}-coord"
frequency="${coord:hours(1)}"
start="${start_time}"
end="${stop_time}"
diff --git a/oozie/webrequest/partition/add/extract_faulty_hosts.hql
b/oozie/webrequest/load/extract_faulty_hosts.hql
similarity index 100%
rename from oozie/webrequest/partition/add/extract_faulty_hosts.hql
rename to oozie/webrequest/load/extract_faulty_hosts.hql
diff --git a/oozie/webrequest/partition/add/generate_sequence_statistics.hql
b/oozie/webrequest/load/generate_sequence_statistics.hql
similarity index 100%
rename from oozie/webrequest/partition/add/generate_sequence_statistics.hql
rename to oozie/webrequest/load/generate_sequence_statistics.hql
diff --git a/oozie/webrequest/partition/add/workflow.xml
b/oozie/webrequest/load/workflow.xml
similarity index 97%
rename from oozie/webrequest/partition/add/workflow.xml
rename to oozie/webrequest/load/workflow.xml
index e2dad9e..bc2d271 100644
--- a/oozie/webrequest/partition/add/workflow.xml
+++ b/oozie/webrequest/load/workflow.xml
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.4"
-
name="hive_add_partition-${table}-${webrequest_source},${year},${month},${day},${hour}-wf">
+
name="hive_webrequest_load-${table}-${webrequest_source},${year},${month},${day},${hour}-wf">
<parameters>
<property>
diff --git a/oozie/webstats/generate_hourly_files/bundle.properties
b/oozie/webstats/generate_hourly_files/bundle.properties
deleted file mode 100644
index 8a619e2..0000000
--- a/oozie/webstats/generate_hourly_files/bundle.properties
+++ /dev/null
@@ -1,61 +0,0 @@
-# Configures a coordinator to generate an hourly pagecounts file from
-# the webstats table.
-#
-# Usage:
-# oozie job -run \
-# -config oozie/webstats/generate_hourly_files/bundle.properties
-#
-# NOTE: The $oozie_directory must be synced to HDFS so that all relevant
-# .xml files exist there when this job is submitted.
-
-
-name_node = hdfs://analytics-hadoop
-job_tracker = resourcemanager.analytics.eqiad.wmnet:8032
-queue_name = default
-
-# Base path in HDFS to oozie files.
-# Other files will be used relative to this path.
-oozie_directory = ${name_node}/wmf/refinery/current/oozie
-
-# HDFS path to coordinator to run for each webrequest_source.
-coordinator_file =
${oozie_directory}/webstats/generate_hourly_files/coordinator.xml
-
-# HDFS path to workflow to run.
-workflow_file =
${oozie_directory}/webstats/generate_hourly_files/workflow.xml
-
-# HDFS path to webstats dataset definition
-webstats_datasets_file = ${oozie_directory}/webstats/datasets.xml
-
-# Initial import time of the webstats dataset.
-start_time = 2014-04-01T00:00Z
-
-# Time to stop running this coordinator. Year 3000 == never!
-stop_time = 3000-01-01T00:00Z
-
-# HDFS path to workflow to mark a directory as done
-mark_directory_done_workflow_file =
${oozie_directory}/util/mark_directory_done/workflow.xml
-
-archive_job_output_workflow_file =
${oozie_directory}/util/archive_job_output/workflow.xml
-
-# HDFS path to hive-site.xml file. This is needed to run hive actions.
-hive_site_xml = ${oozie_directory}/util/hive/hive-site.xml
-
-# Table to write hourly pagecounts to (fully qualified)
-webstats_table = wmf.webstats
-
-# HDFS path to directory where webstats data is time bucketed.
-webstats_data_directory = ${name_node}/wmf/data/wmf/webstats
-
-# Temporary directory
-temporary_directory = ${name_node}/tmp
-
-# Archive base directory
-archive_directory = ${name_node}/wmf/data/archive
-
-# Archive directory for webstats
-webstats_archive_directory = ${archive_directory}/webstats
-
-# Coordintator to start.
-oozie.bundle.application.path =
${oozie_directory}/webstats/generate_hourly_files/bundle.xml
-oozie.use.system.libpath = true
-oozie.action.external.stats.write = true
diff --git a/oozie/webstats/insert_hourly_pagecounts/coordinator.properties
b/oozie/webstats/insert_hourly_pagecounts/coordinator.properties
deleted file mode 100644
index 6abcf52..0000000
--- a/oozie/webstats/insert_hourly_pagecounts/coordinator.properties
+++ /dev/null
@@ -1,56 +0,0 @@
-# Configures a coordinator to insert hourly webstats data
-# from webrequests table into the webstats table.
-#
-# Usage:
-# oozie job -run \
-# -config
oozie/webstats/insert_hourly_pagecounts/coordinator.properties
-#
-# NOTE: The $oozie_directory must be synced to HDFS so that all relevant
-# .xml files exist there when this job is submitted.
-
-
-name_node = hdfs://analytics-hadoop
-job_tracker = resourcemanager.analytics.eqiad.wmnet:8032
-queue_name = default
-
-# Base path in HDFS to oozie files.
-# Other files will be used relative to this path.
-oozie_directory = ${name_node}/wmf/refinery/current/oozie
-
-# HDFS path to workflow to run.
-workflow_file =
${oozie_directory}/webstats/insert_hourly_pagecounts/workflow.xml
-
-# HDFS path to webrequest dataset definition
-webrequest_datasets_file = ${oozie_directory}/webrequest/datasets.xml
-
-# HDFS path to webstats dataset definition
-webstats_datasets_file = ${oozie_directory}/webstats/datasets.xml
-
-# Initial import time of the webrequest dataset.
-start_time = 2014-04-01T00:00Z
-
-# Time to stop running this coordinator. Year 3000 == never!
-stop_time = 3000-01-01T00:00Z
-
-# HDFS path to workflow to mark a directory as done
-mark_directory_done_workflow_file =
${oozie_directory}/util/mark_directory_done/workflow.xml
-
-# HDFS path to hive-site.xml file. This is needed to run hive actions.
-hive_site_xml = ${oozie_directory}/util/hive/hive-site.xml
-
-# Table to read webrequests from (fully qualified)
-webrequest_table = wmf_raw.webrequest
-
-# Table to write hourly pagecounts to (fully qualified)
-webstats_table = wmf.webstats
-
-# HDFS paths to directories where webrequest data is time bucketed.
-webrequest_data_directory = ${name_node}/wmf/data/raw/webrequest
-
-# HDFS path to directory where webstats data is time bucketed.
-webstats_data_directory = ${name_node}/wmf/data/wmf/webstats
-
-# Coordintator to start.
-oozie.coord.application.path =
${oozie_directory}/webstats/insert_hourly_pagecounts/coordinator.xml
-oozie.use.system.libpath = true
-oozie.action.external.stats.write = true
--
To view, visit https://gerrit.wikimedia.org/r/182807
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I83bbfb2dd74181595a54ea5126412fe718c21282
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits