Hello Ottomata,
I'd like you to do a code review. Please visit
https://gerrit.wikimedia.org/r/184496
to review the following change.
Change subject: Add Oozie legacy_tsv job for 5xx TSVs
......................................................................
Add Oozie legacy_tsv job for 5xx TSVs
The udp2log filter for those TSVs lives at:
https://git.wikimedia.org/blob/operations%2Fpuppet.git/bc6ec933e0ee1000dd57cbd48cd788497ea584c2/templates%2Fudp2log%2Ffilters.oxygen.erb#L28
Change-Id: Ic16b8f6bad2b44529c9642132633d1b10c1dbef6
---
M oozie/webrequest/legacy_tsvs/bundle.xml
A oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
2 files changed, 90 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery
refs/changes/96/184496/1
diff --git a/oozie/webrequest/legacy_tsvs/bundle.xml
b/oozie/webrequest/legacy_tsvs/bundle.xml
index 7266633..6fdec59 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.xml
+++ b/oozie/webrequest/legacy_tsvs/bundle.xml
@@ -222,4 +222,26 @@
</property>
</configuration>
</coordinator>
+
+ <coordinator name="webrequest_legacy_tsvs-5xx">
+ <app-path>${coordinator_file}</app-path>
+ <configuration>
+ <property>
+ <name>aspect_name</name>
+ <value>5xx</value>
+ </property>
+ <property>
+ <name>aspect_tsv_archive_directory</name>
+ <value>${webrequest_archive_directory}/5xx</value>
+ </property>
+ <property>
+ <name>hour_offset</name>
+ <value>20</value>
+ </property>
+ <property>
+ <name>needs_webrequest_upload</name>
+ <value>no</value>
+ </property>
+ </configuration>
+ </coordinator>
</bundle-app>
diff --git a/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
b/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
new file mode 100644
index 0000000..bbfe8fb
--- /dev/null
+++ b/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
@@ -0,0 +1,68 @@
+SET hive.exec.compress.output=true;
+SET
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
+--^ To work around HIVE-3296, we have SETs before any comments
+
+-- Generates a TSV of server errors on the mobile, and text partitions
+--
+-- Parameters:
+-- destination_directory -- Directory in HDFS where to store the generated
+-- data in.
+-- webrequest_table -- table containing webrequests
+-- year -- year of the to-be-generated hour
+-- month -- month of the to-be-generated hour
+-- day -- day of the to-be-generated hour
+--
+--
+-- Usage:
+-- hive -f generate_5xx_tsv.hql \
+-- -d destination_directory=/tmp/foo \
+-- -d webrequest_table=wmf_raw.webrequest \
+-- -d year=2014 \
+-- -d month=4 \
+-- -d day=1
+--
+
+INSERT OVERWRITE DIRECTORY "${destination_directory}"
+ -- Since "ROW FORMAT DELIMITED DELIMITED FIELDS TERMINATED BY ' '" only
+ -- works for exports to local directories (see HIVE-5672), we have to
+ -- prepare the lines by hand through concatenation :-(
+ --
+ -- This concatenation however means that we cannot sort by dt without also
+ -- adding it to the SELECTed columns. Hence, we need to add the dt column
to
+ -- the select, then sort, and finally drop the dt column again. This issue
+ -- buys us the subquery :-(
+ SELECT
+ line
+ FROM (
+ SELECT
+ CONCAT_WS(
+ " ",
+ hostname,
+ CAST(sequence AS string),
+ dt,
+ CAST(time_firstbyte AS string),
+ ip,
+ CONCAT_WS('/', cache_status, http_status),
+ CAST(response_size AS string),
+ http_method,
+ CONCAT('http://', uri_host, uri_path, uri_query),
+ "-",
+ content_type,
+ referer,
+ x_forwarded_for,
+ user_agent,
+ accept_language,
+ x_analytics
+ ) line,
+ dt
+ FROM ${webrequest_table}
+ WHERE webrequest_source IN ('text', 'mobile')
+ -- TODO: Add 'bits', once it's turned on again
+ -- TODO: Add 'misc', once it's available
+ AND year=${year}
+ AND month=${month}
+ AND day=${day}
+ AND http_status BETWEEN 500 AND 599
+ ORDER BY dt
+ LIMIT 100000000
+ ) line_with_timing;
--
To view, visit https://gerrit.wikimedia.org/r/184496
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ic16b8f6bad2b44529c9642132633d1b10c1dbef6
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits