Hello Ottomata,
I'd like you to do a code review. Please visit
https://gerrit.wikimedia.org/r/184492
to review the following change.
Change subject: Add Oozie legacy_tsv job for bannerRequests TSVs
......................................................................
Add Oozie legacy_tsv job for bannerRequests TSVs
The udp2log filter for those TSVs lives at:
https://git.wikimedia.org/blob/operations%2Fpuppet.git/bc6ec933e0ee1000dd57cbd48cd788497ea584c2/templates%2Fudp2log%2Ffilters.erbium.erb#L14
Change-Id: I090bb852ac0634d1d29ef9d7267aed0d06b53416
---
M oozie/webrequest/legacy_tsvs/bundle.xml
A oozie/webrequest/legacy_tsvs/generate_bannerRequests-sampled100_tsv.hql
2 files changed, 92 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery
refs/changes/92/184492/1
diff --git a/oozie/webrequest/legacy_tsvs/bundle.xml
b/oozie/webrequest/legacy_tsvs/bundle.xml
index 6e3d5d7..e9451c8 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.xml
+++ b/oozie/webrequest/legacy_tsvs/bundle.xml
@@ -138,4 +138,26 @@
</property>
</configuration>
</coordinator>
+
+ <coordinator name="webrequest_legacy_tsvs-bannerRequests-sampled100">
+ <app-path>${coordinator_file}</app-path>
+ <configuration>
+ <property>
+ <name>aspect_name</name>
+ <value>bannerRequests-sampled100</value>
+ </property>
+ <property>
+ <name>aspect_tsv_archive_directory</name>
+ <value>${webrequest_archive_directory}/bannerRequests</value>
+ </property>
+ <property>
+ <name>hour_offset</name>
+ <value>12</value>
+ </property>
+ <property>
+ <name>needs_webrequest_upload</name>
+ <value>no</value>
+ </property>
+ </configuration>
+ </coordinator>
</bundle-app>
diff --git
a/oozie/webrequest/legacy_tsvs/generate_bannerRequests-sampled100_tsv.hql
b/oozie/webrequest/legacy_tsvs/generate_bannerRequests-sampled100_tsv.hql
new file mode 100644
index 0000000..2064634
--- /dev/null
+++ b/oozie/webrequest/legacy_tsvs/generate_bannerRequests-sampled100_tsv.hql
@@ -0,0 +1,70 @@
+SET hive.exec.compress.output=true;
+SET
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
+--^ To work around HIVE-3296, we have SETs before any comments
+
+-- Generates a TSV for 1:100 sampled banner requests
+--
+-- Parameters:
+-- destination_directory -- Directory in HDFS where to store the generated
+-- data in.
+-- webrequest_table -- table containing webrequests
+-- year -- year of the to-be-generated hour
+-- month -- month of the to-be-generated hour
+-- day -- day of the to-be-generated hour
+--
+--
+-- Usage:
+-- hive -f generate_bannerRequests-sampled100_tsv.hql \
+-- -d destination_directory=/tmp/foo \
+-- -d webrequest_table=wmf_raw.webrequest \
+-- -d year=2014 \
+-- -d month=4 \
+-- -d day=1
+--
+
+INSERT OVERWRITE DIRECTORY "${destination_directory}"
+ -- Since "ROW FORMAT DELIMITED DELIMITED FIELDS TERMINATED BY ' '" only
+ -- works for exports to local directories (see HIVE-5672), we have to
+ -- prepare the lines by hand through concatenation :-(
+ --
+ -- This concatenation however means that we cannot sort by dt without also
+ -- adding it to the SELECTed columns. Hence, we need to add the dt column
to
+ -- the select, then sort, and finally drop the dt column again. This issue
+ -- buys us the subquery :-(
+ SELECT
+ line
+ FROM (
+ SELECT
+ CONCAT_WS(
+ " ",
+ hostname,
+ CAST(sequence AS string),
+ dt,
+ CAST(time_firstbyte AS string),
+ ip,
+ CONCAT_WS('/', cache_status, http_status),
+ CAST(response_size AS string),
+ http_method,
+ CONCAT('http://', uri_host, uri_path, uri_query),
+ "-",
+ content_type,
+ referer,
+ x_forwarded_for,
+ user_agent,
+ accept_language,
+ x_analytics
+ ) line,
+ dt
+ -- It would be nice to be able to say TABLESAMPLE(1 PERCENT) in the
+ -- following line, but that would pull an unfair sample that typically
+ -- covers only an hour worth of data. Hence, we resort to BUCKET
+ -- sampling.
+ FROM ${webrequest_table} TABLESAMPLE(BUCKET 1 OUT OF 100 ON rand())
+ WHERE webrequest_source IN ('mobile', 'text')
+ AND year=${year}
+ AND month=${month}
+ AND day=${day}
+ AND uri_path LIKE '%Special:BannerRandom%'
+ ORDER BY dt
+ LIMIT 100000000
+ ) line_with_timing;
--
To view, visit https://gerrit.wikimedia.org/r/184492
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I090bb852ac0634d1d29ef9d7267aed0d06b53416
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits