Hello Ottomata,
I'd like you to do a code review. Please visit
https://gerrit.wikimedia.org/r/184490
to review the following change.
Change subject: Add Oozie legacy_tsv job for landingpages TSVs
......................................................................
Add Oozie legacy_tsv job for landingpages TSVs
The udp2log filter for those TSVs lives at:
https://git.wikimedia.org/blob/operations%2Fpuppet.git/bc6ec933e0ee1000dd57cbd48cd788497ea584c2/templates%2Fudp2log%2Ffilters.erbium.erb#L10
Change-Id: If693909e89d341f2301ec7b9da5361cb1f39b241
---
M oozie/webrequest/legacy_tsvs/bundle.xml
A oozie/webrequest/legacy_tsvs/generate_landingpages_tsv.hql
2 files changed, 91 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery
refs/changes/90/184490/1
diff --git a/oozie/webrequest/legacy_tsvs/bundle.xml
b/oozie/webrequest/legacy_tsvs/bundle.xml
index bb830ee..c14046f 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.xml
+++ b/oozie/webrequest/legacy_tsvs/bundle.xml
@@ -94,4 +94,26 @@
</property>
</configuration>
</coordinator>
+
+ <coordinator name="webrequest_legacy_tsvs-landingpages">
+ <app-path>${coordinator_file}</app-path>
+ <configuration>
+ <property>
+ <name>aspect_name</name>
+ <value>edits</value>
+ </property>
+ <property>
+ <name>aspect_tsv_archive_directory</name>
+ <value>${webrequest_archive_directory}/landingpages</value>
+ </property>
+ <property>
+ <name>hour_offset</name>
+ <value>8</value>
+ </property>
+ <property>
+ <name>needs_webrequest_upload</name>
+ <value>no</value>
+ </property>
+ </configuration>
+ </coordinator>
</bundle-app>
diff --git a/oozie/webrequest/legacy_tsvs/generate_landingpages_tsv.hql
b/oozie/webrequest/legacy_tsvs/generate_landingpages_tsv.hql
new file mode 100644
index 0000000..a6bcc4f
--- /dev/null
+++ b/oozie/webrequest/legacy_tsvs/generate_landingpages_tsv.hql
@@ -0,0 +1,69 @@
+SET hive.exec.compress.output=true;
+SET
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
+--^ To work around HIVE-3296, we have SETs before any comments
+
+-- Generates a TSV for wikimediafoundation.org and donate.wikimedia.org
+--
+-- Parameters:
+-- destination_directory -- Directory in HDFS where to store the generated
+-- data in.
+-- webrequest_table -- table containing webrequests
+-- year -- year of the to-be-generated hour
+-- month -- month of the to-be-generated hour
+-- day -- day of the to-be-generated hour
+--
+--
+-- Usage:
+-- hive -f generate_landingpages_tsv.hql \
+-- -d destination_directory=/tmp/foo \
+-- -d webrequest_table=wmf_raw.webrequest \
+-- -d year=2014 \
+-- -d month=4 \
+-- -d day=1
+--
+
+INSERT OVERWRITE DIRECTORY "${destination_directory}"
+ -- Since "ROW FORMAT DELIMITED DELIMITED FIELDS TERMINATED BY ' '" only
+ -- works for exports to local directories (see HIVE-5672), we have to
+ -- prepare the lines by hand through concatenation :-(
+ --
+ -- This concatenation however means that we cannot sort by dt without also
+ -- adding it to the SELECTed columns. Hence, we need to add the dt column
to
+ -- the select, then sort, and finally drop the dt column again. This issue
+ -- buys us the subquery :-(
+ SELECT
+ line
+ FROM (
+ SELECT
+ CONCAT_WS(
+ " ",
+ hostname,
+ CAST(sequence AS string),
+ dt,
+ CAST(time_firstbyte AS string),
+ ip,
+ CONCAT_WS('/', cache_status, http_status),
+ CAST(response_size AS string),
+ http_method,
+ CONCAT('http://', uri_host, uri_path, uri_query),
+ "-",
+ content_type,
+ referer,
+ x_forwarded_for,
+ user_agent,
+ accept_language,
+ x_analytics
+ ) line,
+ dt
+ FROM ${webrequest_table}
+ WHERE webrequest_source IN ('text', 'mobile')
+ AND year=${year}
+ AND month=${month}
+ AND day=${day}
+ AND (
+ uri_host LIKE '%wikimediafoundation.org'
+ OR uri_host LIKE '%donate.wikimedia.org'
+ )
+ ORDER BY dt
+ LIMIT 100000000
+ ) line_with_timing;
--
To view, visit https://gerrit.wikimedia.org/r/184490
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: If693909e89d341f2301ec7b9da5361cb1f39b241
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits