Hello Ottomata,
I'd like you to do a code review. Please visit
https://gerrit.wikimedia.org/r/189218
to review the following change.
Change subject: Deduplicate legacy-tsvs webrequest_source dependency information
......................................................................
Deduplicate legacy-tsvs webrequest_source dependency information
The information about which legacy-tsv depends on which
webrequest_source was duplicated: it was encoded both in the
coordinator being used and in the HiveQL files. This was not nice
maintenance-wise, and it also hinders reusing the HiveQL files for the
upcoming split of the 5xx tsvs per webrequest_source.
We now extract the needed webrequest_sources from the file name of the
coordinator (e.g. coordinator_misc_mobile.xml yields 'misc', 'mobile'),
and thereby ease reuse of HiveQL files for different
webrequest_sources.
But HiveQL files need not rely on this. They can still choose to
duplicate webrequest_source dependency information.
Change-Id: I5ea5b3c874b3f94763413535077126469a77a9b1
---
M oozie/webrequest/legacy_tsvs/bundle.xml
M oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
M oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql
M oozie/webrequest/legacy_tsvs/generate_edits_tsv.hql
M oozie/webrequest/legacy_tsvs/generate_glam_nara_tsv.hql
M oozie/webrequest/legacy_tsvs/generate_mobile-sampled-100_tsv.hql
M oozie/webrequest/legacy_tsvs/generate_sampled-1000_tsv.hql
M oozie/webrequest/legacy_tsvs/generate_zero_tsv.hql
M oozie/webrequest/legacy_tsvs/workflow.xml
9 files changed, 53 insertions(+), 11 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery
refs/changes/18/189218/1
diff --git a/oozie/webrequest/legacy_tsvs/bundle.xml
b/oozie/webrequest/legacy_tsvs/bundle.xml
index 97979ad..d179401 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.xml
+++ b/oozie/webrequest/legacy_tsvs/bundle.xml
@@ -30,6 +30,10 @@
</parameters>
<coordinator name="webrequest_legacy_tsvs-sampled-1000">
+ <!--
+ No 'bits', or 'misc', as the sampled-1000 was last produced on erbium's
+ udp2log stream, which did not receive those two webrequest_sources.
+ -->
<app-path>${coordinator_mobile_text_upload_file}</app-path>
<configuration>
<property>
@@ -138,6 +142,7 @@
</coordinator>
<coordinator name="webrequest_legacy_tsvs-5xx">
+ <!-- TODO: Add 'bits', once it's turned on again -->
<app-path>${coordinator_misc_mobile_text_file}</app-path>
<configuration>
<property>
diff --git a/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
b/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
index ecbcadf..69d5651 100644
--- a/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
+++ b/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
@@ -8,6 +8,8 @@
-- destination_directory -- Directory in HDFS where to store the generated
-- data in.
-- webrequest_table -- table containing webrequests
+-- webrequest_sources -- Comma separated list of quoted webrequest_sources
+-- to be used for generating data
-- year -- year of the to-be-generated hour
-- month -- month of the to-be-generated hour
-- day -- day of the to-be-generated hour
@@ -17,6 +19,7 @@
-- hive -f generate_5xx_tsv.hql \
-- -d destination_directory=/tmp/foo \
-- -d webrequest_table=wmf_raw.webrequest \
+-- -d webrequest_sources="'mobile'" \
-- -d year=2014 \
-- -d month=4 \
-- -d day=1
@@ -56,8 +59,7 @@
) line,
dt
FROM ${webrequest_table}
- WHERE webrequest_source IN ('misc', 'mobile', 'text')
- -- TODO: Add 'bits', once it's turned on again
+ WHERE webrequest_source IN (${webrequest_sources})
AND year=${year}
AND month=${month}
AND day=${day}
diff --git a/oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql
b/oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql
index 29012a9..e054713 100644
--- a/oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql
+++ b/oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql
@@ -8,6 +8,8 @@
-- destination_directory -- Directory in HDFS where to store the generated
-- data in.
-- webrequest_table -- table containing webrequests
+-- webrequest_sources -- Comma separated list of quoted webrequest_sources
+-- to be used for generating data
-- year -- year of the to-be-generated hour
-- month -- month of the to-be-generated hour
-- day -- day of the to-be-generated hour
@@ -17,6 +19,7 @@
-- hive -f generate_api-usage_tsv.hql \
-- -d destination_directory=/tmp/foo \
-- -d webrequest_table=wmf_raw.webrequest \
+-- -d webrequest_sources="'mobile'" \
-- -d year=2014 \
-- -d month=4 \
-- -d day=1
@@ -60,7 +63,7 @@
-- covers only an hour worth of data. Hence, we resort to BUCKET
-- sampling.
FROM ${webrequest_table} TABLESAMPLE(BUCKET 1 OUT OF 100 ON rand())
- WHERE webrequest_source IN ('mobile', 'text')
+ WHERE webrequest_source IN (${webrequest_sources})
AND year=${year}
AND month=${month}
AND day=${day}
diff --git a/oozie/webrequest/legacy_tsvs/generate_edits_tsv.hql
b/oozie/webrequest/legacy_tsvs/generate_edits_tsv.hql
index ff90f70..57efd0a 100644
--- a/oozie/webrequest/legacy_tsvs/generate_edits_tsv.hql
+++ b/oozie/webrequest/legacy_tsvs/generate_edits_tsv.hql
@@ -8,6 +8,8 @@
-- destination_directory -- Directory in HDFS where to store the generated
-- data in.
-- webrequest_table -- table containing webrequests
+-- webrequest_sources -- Comma separated list of quoted webrequest_sources
+-- to be used for generating data
-- year -- year of the to-be-generated hour
-- month -- month of the to-be-generated hour
-- day -- day of the to-be-generated hour
@@ -17,6 +19,7 @@
-- hive -f generate_edits_tsv.hql \
-- -d destination_directory=/tmp/foo \
-- -d webrequest_table=wmf_raw.webrequest \
+-- -d webrequest_sources="'text'" \
-- -d year=2014 \
-- -d month=4 \
-- -d day=1
@@ -56,7 +59,7 @@
) line,
dt
FROM ${webrequest_table}
- WHERE webrequest_source IN ('mobile', 'text')
+ WHERE webrequest_source IN (${webrequest_sources})
AND year=${year}
AND month=${month}
AND day=${day}
diff --git a/oozie/webrequest/legacy_tsvs/generate_glam_nara_tsv.hql
b/oozie/webrequest/legacy_tsvs/generate_glam_nara_tsv.hql
index 2d2c24f..18a3b41 100644
--- a/oozie/webrequest/legacy_tsvs/generate_glam_nara_tsv.hql
+++ b/oozie/webrequest/legacy_tsvs/generate_glam_nara_tsv.hql
@@ -11,6 +11,8 @@
-- -- Directory in HDFS where to store the generated
-- data in.
-- webrequest_table -- table containing webrequests
+-- webrequest_sources -- Comma separated list of quoted webrequest_sources
+-- to be used for generating data
-- year -- year of the to-be-generated hour
-- month -- month of the to-be-generated hour
-- day -- day of the to-be-generated hour
@@ -21,6 +23,7 @@
-- -d artifacts_directory=/path/to/refinery/artifacts \
-- -d destination_directory=/tmp/foo \
-- -d webrequest_table=wmf_raw.webrequest \
+-- -d webrequest_sources="'upload'" \
-- -d year=2014 \
-- -d month=4 \
-- -d day=1
@@ -73,7 +76,7 @@
-- covers only an hour worth of data. Hence, we resort to BUCKET
-- sampling.
FROM ${webrequest_table} TABLESAMPLE(BUCKET 1 OUT OF 10 ON rand())
- WHERE webrequest_source IN ('mobile', 'text', 'upload')
+ WHERE webrequest_source IN (${webrequest_sources})
AND year=${year}
AND month=${month}
AND day=${day}
diff --git a/oozie/webrequest/legacy_tsvs/generate_mobile-sampled-100_tsv.hql
b/oozie/webrequest/legacy_tsvs/generate_mobile-sampled-100_tsv.hql
index c86b42b..958724d 100644
--- a/oozie/webrequest/legacy_tsvs/generate_mobile-sampled-100_tsv.hql
+++ b/oozie/webrequest/legacy_tsvs/generate_mobile-sampled-100_tsv.hql
@@ -8,6 +8,8 @@
-- destination_directory -- Directory in HDFS where to store the generated
-- data in.
-- webrequest_table -- table containing webrequests
+-- webrequest_sources -- Comma separated list of quoted webrequest_sources
+-- to be used for generating data
-- year -- year of the to-be-generated hour
-- month -- month of the to-be-generated hour
-- day -- day of the to-be-generated hour
@@ -17,6 +19,7 @@
-- hive -f generate_mobile-sampled-100_tsv.hql \
-- -d destination_directory=/tmp/foo \
-- -d webrequest_table=wmf_raw.webrequest \
+-- -d webrequest_sources="'mobile'" \
-- -d year=2014 \
-- -d month=4 \
-- -d day=1
@@ -60,7 +63,7 @@
-- covers only an hour worth of data. Hence, we resort to BUCKET
-- sampling.
FROM ${webrequest_table} TABLESAMPLE(BUCKET 1 OUT OF 100 ON rand())
- WHERE webrequest_source = 'mobile'
+ WHERE webrequest_source IN (${webrequest_sources})
AND year=${year}
AND month=${month}
AND day=${day}
diff --git a/oozie/webrequest/legacy_tsvs/generate_sampled-1000_tsv.hql
b/oozie/webrequest/legacy_tsvs/generate_sampled-1000_tsv.hql
index b308643..92592c1 100644
--- a/oozie/webrequest/legacy_tsvs/generate_sampled-1000_tsv.hql
+++ b/oozie/webrequest/legacy_tsvs/generate_sampled-1000_tsv.hql
@@ -8,6 +8,8 @@
-- destination_directory -- Directory in HDFS where to store the generated
-- data in.
-- webrequest_table -- table containing webrequests
+-- webrequest_sources -- Comma separated list of quoted webrequest_sources
+-- to be used for generating data
-- year -- year of the to-be-generated hour
-- month -- month of the to-be-generated hour
-- day -- day of the to-be-generated hour
@@ -17,6 +19,7 @@
-- hive -f generate_sampled-1000_tsv.hql \
-- -d destination_directory=/tmp/foo \
-- -d webrequest_table=wmf_raw.webrequest \
+-- -d webrequest_sources="'mobile'" \
-- -d year=2014 \
-- -d month=4 \
-- -d day=1
@@ -60,10 +63,7 @@
-- covers only an hour worth of data. Hence, we resort to BUCKET
-- sampling.
FROM ${webrequest_table} TABLESAMPLE(BUCKET 1 OUT OF 1000 ON rand())
- WHERE webrequest_source IN ('mobile', 'text', 'upload')
- -- No 'bits', or 'misc', as the sampled-1000 was last produced on
- -- erbium's udp2log stream, which did not receive those two
- -- webrequest_sources.
+ WHERE webrequest_source IN (${webrequest_sources})
AND year=${year}
AND month=${month}
AND day=${day}
diff --git a/oozie/webrequest/legacy_tsvs/generate_zero_tsv.hql
b/oozie/webrequest/legacy_tsvs/generate_zero_tsv.hql
index 583e412..f52476b 100644
--- a/oozie/webrequest/legacy_tsvs/generate_zero_tsv.hql
+++ b/oozie/webrequest/legacy_tsvs/generate_zero_tsv.hql
@@ -8,6 +8,8 @@
-- destination_directory -- Directory in HDFS where to store the generated
-- data in.
-- webrequest_table -- table containing webrequests
+-- webrequest_sources -- Comma separated list of quoted webrequest_sources
+-- to be used for generating data
-- year -- year of the to-be-generated hour
-- month -- month of the to-be-generated hour
-- day -- day of the to-be-generated hour
@@ -17,6 +19,7 @@
-- hive -f generate_zero_tsv.hql \
-- -d destination_directory=/tmp/foo \
-- -d webrequest_table=wmf_raw.webrequest \
+-- -d webrequest_sources="'mobile'" \
-- -d year=2014 \
-- -d month=4 \
-- -d day=1
@@ -56,7 +59,7 @@
) line,
dt
FROM ${webrequest_table}
- WHERE webrequest_source = 'mobile'
+ WHERE webrequest_source IN (${webrequest_sources})
AND year=${year}
AND month=${month}
AND day=${day}
diff --git a/oozie/webrequest/legacy_tsvs/workflow.xml
b/oozie/webrequest/legacy_tsvs/workflow.xml
index f907909..64b01da 100644
--- a/oozie/webrequest/legacy_tsvs/workflow.xml
+++ b/oozie/webrequest/legacy_tsvs/workflow.xml
@@ -96,6 +96,26 @@
<param>year=${year}</param>
<param>month=${month}</param>
<param>day=${day}</param>
+ <!--
+ We extract the needed webrequest_sources from the coordinator path
+ (which always exists in our setting).
+ This variable holds quoted strings, separated by commas. So for
+ example a coordinator path of
+
+ /path/to/coordinator_misc_mobile.xml
+
+ will get turned into
+
+ 'misc', 'mobile'
+
+ . This allows using the variable straight in HiveQL via
+
+ [...] AND webrequest_source IN ( ${webrequest_sources} )
+
+ and not worry about duplicating the dependencies between the
+ coordinator and the HiveQL file.
+ -->
+
<param>webrequest_sources=${replaceAll(replaceAll(wf:conf('oozie.coord.application.path'),
'^.*/coordinator_([a-z_]*).xml', '\'$1\''),'_','\', \'')}</param>
</hive>
<ok to="mark_dataset_done"/>
<error to="kill"/>
--
To view, visit https://gerrit.wikimedia.org/r/189218
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I5ea5b3c874b3f94763413535077126469a77a9b1
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits