Hello Ottomata,
I'd like you to do a code review. Please visit
https://gerrit.wikimedia.org/r/185708
to review the following change.
Change subject: Add pagecounts-raw computation to pagecounts-all-sites
......................................................................
Add pagecounts-raw computation to pagecounts-all-sites
Change-Id: If32afcc082ae28248eca46b58dc2748811b00489
---
M oozie/pagecounts-all-sites/archive/archive_pagecounts.hql
M oozie/pagecounts-all-sites/archive/archive_projectcounts.hql
M oozie/pagecounts-all-sites/archive/bundle.properties
M oozie/pagecounts-all-sites/archive/bundle.xml
M oozie/pagecounts-all-sites/archive/coordinator.xml
M oozie/pagecounts-all-sites/archive/workflow.xml
6 files changed, 111 insertions(+), 10 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery
refs/changes/08/185708/1
diff --git a/oozie/pagecounts-all-sites/archive/archive_pagecounts.hql
b/oozie/pagecounts-all-sites/archive/archive_pagecounts.hql
index ae6bdf9..94cd202 100644
--- a/oozie/pagecounts-all-sites/archive/archive_pagecounts.hql
+++ b/oozie/pagecounts-all-sites/archive/archive_pagecounts.hql
@@ -13,6 +13,8 @@
-- month -- month of the to-be-generated hour
-- day -- day of the to-be-generated hour
-- hour -- hour of the to-be-generated-hour
+-- extra_filter -- additional condition by which to filter the
+-- selected rows
--
--
-- Usage:
@@ -40,5 +42,6 @@
AND month=${month}
AND day=${day}
AND hour=${hour}
+ ${extra_filter}
ORDER BY line
LIMIT 100000000;
diff --git a/oozie/pagecounts-all-sites/archive/archive_projectcounts.hql
b/oozie/pagecounts-all-sites/archive/archive_projectcounts.hql
index 36f31c0..8f1e5ca 100644
--- a/oozie/pagecounts-all-sites/archive/archive_projectcounts.hql
+++ b/oozie/pagecounts-all-sites/archive/archive_projectcounts.hql
@@ -12,6 +12,8 @@
-- month -- month of the to-be-generated hour
-- day -- day of the to-be-generated hour
-- hour -- hour of the to-be-generated-hour
+-- extra_filter -- additional condition by which to filter the
+-- selected rows
--
--
-- Usage:
@@ -39,6 +41,7 @@
AND month=${month}
AND day=${day}
AND hour=${hour}
+ ${extra_filter}
GROUP BY qualifier
ORDER BY line
LIMIT 100000;
diff --git a/oozie/pagecounts-all-sites/archive/bundle.properties
b/oozie/pagecounts-all-sites/archive/bundle.properties
index 7ef158f..d27c924 100644
--- a/oozie/pagecounts-all-sites/archive/bundle.properties
+++ b/oozie/pagecounts-all-sites/archive/bundle.properties
@@ -55,6 +55,12 @@
# Archive directory for pagecounts-all-sites
pagecounts_all_sites_archive_directory =
${archive_directory}/pagecounts-all-sites
+# Archive directory for pagecounts-raw
+pagecounts_raw_archive_directory = ${archive_directory}/pagecounts-raw
+
+# Extra filter for pagecounts-raw
+pagecounts_raw_extra_filter = ( NOT qualifier RLIKE
'\\\\.zero(\\\\.|$)' AND ( NOT qualifier RLIKE '\\\\.m(\\\\.|$)' ) OR qualifier
RLIKE
'^(commons|meta|incubator|species|strategy|outreach|usability|quality)\\\\.m$')
+
# Coordinator to start.
oozie.bundle.application.path =
${oozie_directory}/pagecounts-all-sites/archive/bundle.xml
oozie.use.system.libpath = true
diff --git a/oozie/pagecounts-all-sites/archive/bundle.xml
b/oozie/pagecounts-all-sites/archive/bundle.xml
index 901e820..99c428e 100644
--- a/oozie/pagecounts-all-sites/archive/bundle.xml
+++ b/oozie/pagecounts-all-sites/archive/bundle.xml
@@ -21,12 +21,17 @@
<property><name>mark_directory_done_workflow_file</name></property>
<property><name>temporary_directory</name></property>
<property><name>pagecounts_all_sites_archive_directory</name></property>
+ <property><name>pagecounts_raw_archive_directory</name></property>
<property><name>archive_job_output_workflow_file</name></property>
</parameters>
- <coordinator name="pagecounts_all_sites_archive-pagecounts">
+ <coordinator
name="pagecounts_all_sites_archive-pagecounts_all_sites-pagecounts">
<app-path>${coordinator_file}</app-path>
<configuration>
+ <property>
+ <name>dataset_name</name>
+ <value>pagecounts-all-sites</value>
+ </property>
<property>
<name>aspect_name</name>
<value>pagecounts</value>
@@ -35,12 +40,20 @@
<name>aspect_compression_ending</name>
<value>.gz</value>
</property>
+ <property>
+ <name>workflow_archive_directory</name>
+ <value>${pagecounts_all_sites_archive_directory}</value>
+ </property>
</configuration>
</coordinator>
- <coordinator name="pagecounts_all_sites_archive-projectcounts">
+ <coordinator
name="pagecounts_all_sites_archive-pagecounts_all_sites-projectcounts">
<app-path>${coordinator_file}</app-path>
<configuration>
+ <property>
+ <name>dataset_name</name>
+ <value>pagecounts-all-sites</value>
+ </property>
<property>
<name>aspect_name</name>
<value>projectcounts</value>
@@ -49,6 +62,62 @@
<name>aspect_compression_ending</name>
<value>EMPTY</value>
</property>
+ <property>
+ <name>workflow_archive_directory</name>
+ <value>${pagecounts_all_sites_archive_directory}</value>
+ </property>
+ </configuration>
+ </coordinator>
+
+ <coordinator name="pagecounts_all_sites_archive-pagecounts_raw-pagecounts">
+ <app-path>${coordinator_file}</app-path>
+ <configuration>
+ <property>
+ <name>dataset_name</name>
+ <value>pagecounts-raw</value>
+ </property>
+ <property>
+ <name>aspect_name</name>
+ <value>pagecounts</value>
+ </property>
+ <property>
+ <name>aspect_compression_ending</name>
+ <value>.gz</value>
+ </property>
+ <property>
+ <name>workflow_archive_directory</name>
+ <value>${pagecounts_raw_archive_directory}</value>
+ </property>
+ <property>
+ <name>extra_filter</name>
+ <value>${pagecounts_raw_extra_filter}</value>
+ </property>
+ </configuration>
+ </coordinator>
+
+ <coordinator
name="pagecounts_all_sites_archive-pagecounts_raw-projectcounts">
+ <app-path>${coordinator_file}</app-path>
+ <configuration>
+ <property>
+ <name>dataset_name</name>
+ <value>pagecounts-raw</value>
+ </property>
+ <property>
+ <name>aspect_name</name>
+ <value>projectcounts</value>
+ </property>
+ <property>
+ <name>aspect_compression_ending</name>
+ <value>EMPTY</value>
+ </property>
+ <property>
+ <name>workflow_archive_directory</name>
+ <value>${pagecounts_raw_archive_directory}</value>
+ </property>
+ <property>
+ <name>extra_filter</name>
+ <value>${pagecounts_raw_extra_filter}</value>
+ </property>
</configuration>
</coordinator>
diff --git a/oozie/pagecounts-all-sites/archive/coordinator.xml
b/oozie/pagecounts-all-sites/archive/coordinator.xml
index 88c0493..0aa06bb 100644
--- a/oozie/pagecounts-all-sites/archive/coordinator.xml
+++ b/oozie/pagecounts-all-sites/archive/coordinator.xml
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<coordinator-app xmlns="uri:oozie:coordinator:0.4"
- name="pagecounts_all_sites_archive-${aspect_name}-coord"
+ name="pagecounts_all_sites_archive-${dataset_name}-${aspect_name}-coord"
frequency="${coord:hours(1)}"
start="${start_time}"
end="${stop_time}"
@@ -11,8 +11,13 @@
<name>queue_name</name>
<value>default</value>
</property>
+ <property>
+ <name>extra_filter</name>
+ <value>0=0</value> <!-- no-op default: this value is always passed through to the workflow, which prepends "AND"; an empty value would leave a dangling AND in the Hive query -->
+ </property>
<!-- Required properties. -->
+ <property><name>dataset_name</name></property>
<property><name>name_node</name></property>
<property><name>job_tracker</name></property>
<property><name>start_time</name></property>
@@ -24,7 +29,7 @@
<property><name>pagecounts_all_sites_table</name></property>
<property><name>mark_directory_done_workflow_file</name></property>
<property><name>temporary_directory</name></property>
-
<property><name>pagecounts_all_sites_archive_directory</name></property>
+ <property><name>workflow_archive_directory</name></property>
<property><name>archive_job_output_workflow_file</name></property>
<property><name>aspect_name</name></property>
<property><name>aspect_compression_ending</name></property>
@@ -70,6 +75,7 @@
<configuration>
<!-- Pass these properties through to the workflow -->
+
<property><name>dataset_name</name><value>${dataset_name}</value></property>
<property><name>name_node</name><value>${name_node}</value></property>
<property><name>job_tracker</name><value>${job_tracker}</value></property>
<property><name>queue_name</name><value>${queue_name}</value></property>
@@ -123,8 +129,8 @@
<value>${temporary_directory}</value>
</property>
<property>
- <name>pagecounts_all_sites_archive_directory</name>
- <value>${pagecounts_all_sites_archive_directory}</value>
+ <name>workflow_archive_directory</name>
+ <value>${workflow_archive_directory}</value>
</property>
<property>
<name>archive_job_output_workflow_file</name>
@@ -138,6 +144,10 @@
<name>aspect_compression_ending</name>
<value>${aspect_compression_ending}</value>
</property>
+ <property>
+ <name>extra_filter</name>
+ <value>${extra_filter}</value>
+ </property>
</configuration>
</workflow>
</action>
diff --git a/oozie/pagecounts-all-sites/archive/workflow.xml
b/oozie/pagecounts-all-sites/archive/workflow.xml
index f067889..466c8f3 100644
--- a/oozie/pagecounts-all-sites/archive/workflow.xml
+++ b/oozie/pagecounts-all-sites/archive/workflow.xml
@@ -1,14 +1,23 @@
<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.4"
-
name="pagecounts_all_sites_archive-${aspect_name}-${year}-${month}-${day}-${hour}-wf">
+
name="pagecounts_all_sites_archive-${dataset_name}-${aspect_name}-${year}-${month}-${day}-${hour}-wf">
<parameters>
<property>
<name>queue_name</name>
<value>default</value>
</property>
+ <property>
+ <name>extra_filter</name>
+ <value>0=0</value> <!-- sadly enough, 'true' does not work here-->
+ <description>
+ Additional filter to apply when selecting data from the
+ pagecounts-all-sites table.
+ </description>
+ </property>
<!-- Required properties -->
+ <property><name>dataset_name</name></property>
<property><name>name_node</name></property>
<property><name>job_tracker</name></property>
@@ -63,8 +72,8 @@
<description>A directory in HDFS for temporary files</description>
</property>
<property>
- <name>pagecounts_all_sites_archive_directory</name>
- <description>Directory for archive of pagecounts-all-sites
files</description>
+ <name>workflow_archive_directory</name>
+ <description>Directory to archive the workflow output
to</description>
</property>
<property>
<name>archive_job_output_workflow_file</name>
@@ -102,6 +111,7 @@
<param>month=${month}</param>
<param>day=${day}</param>
<param>hour=${hour}</param>
+ <param>extra_filter= AND ${extra_filter}</param>
</hive>
<ok to="mark_dataset_done"/>
<error to="kill"/>
@@ -152,7 +162,7 @@
timestamp in the filename. To not break scripts of people,
we also name files that way.
-->
-
<value>${pagecounts_all_sites_archive_directory}/${year_plus_1_hour}/${year_plus_1_hour}-${month_plus_1_hour}/${aspect_name}-${year_plus_1_hour}${month_plus_1_hour}${day_plus_1_hour}-${hour_plus_1_hour}0000${aspect_compression_ending
eq 'EMPTY' ? '' : aspect_compression_ending}</value>
+
<value>${workflow_archive_directory}/${year_plus_1_hour}/${year_plus_1_hour}-${month_plus_1_hour}/${aspect_name}-${year_plus_1_hour}${month_plus_1_hour}${day_plus_1_hour}-${hour_plus_1_hour}0000${aspect_compression_ending
eq 'EMPTY' ? '' : aspect_compression_ending}</value>
</property>
</configuration>
</sub-workflow>
--
To view, visit https://gerrit.wikimedia.org/r/185708
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: If32afcc082ae28248eca46b58dc2748811b00489
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits