Hello Ottomata,

I'd like you to do a code review.  Please visit

    https://gerrit.wikimedia.org/r/184496

to review the following change.

Change subject: Add Oozie legacy_tsv job for 5xx TSVs
......................................................................

Add Oozie legacy_tsv job for 5xx TSVs

The udp2log filter for those TSVs lives at:

  
https://git.wikimedia.org/blob/operations%2Fpuppet.git/bc6ec933e0ee1000dd57cbd48cd788497ea584c2/templates%2Fudp2log%2Ffilters.oxygen.erb#L28

Change-Id: Ic16b8f6bad2b44529c9642132633d1b10c1dbef6
---
M oozie/webrequest/legacy_tsvs/bundle.xml
A oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
2 files changed, 90 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery 
refs/changes/96/184496/1

diff --git a/oozie/webrequest/legacy_tsvs/bundle.xml 
b/oozie/webrequest/legacy_tsvs/bundle.xml
index 7266633..6fdec59 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.xml
+++ b/oozie/webrequest/legacy_tsvs/bundle.xml
@@ -222,4 +222,26 @@
             </property>
         </configuration>
     </coordinator>
+
+    <coordinator name="webrequest_legacy_tsvs-5xx">
+        <app-path>${coordinator_file}</app-path>
+        <configuration>
+            <property>
+                <name>aspect_name</name>
+                <value>5xx</value>
+            </property>
+            <property>
+                <name>aspect_tsv_archive_directory</name>
+                <value>${webrequest_archive_directory}/5xx</value>
+            </property>
+            <property>
+                <name>hour_offset</name>
+                <value>20</value>
+            </property>
+            <property>
+                <name>needs_webrequest_upload</name>
+                <value>no</value>
+            </property>
+        </configuration>
+    </coordinator>
 </bundle-app>
diff --git a/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql 
b/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
new file mode 100644
index 0000000..bbfe8fb
--- /dev/null
+++ b/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
@@ -0,0 +1,68 @@
+SET hive.exec.compress.output=true;
+SET 
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
+--^ To work around HIVE-3296, we have SETs before any comments
+
+-- Generates a TSV of server errors on the mobile, and text partitions
+--
+-- Parameters:
+--     destination_directory -- Directory in HDFS where to store the generated
+--                          data in.
+--     webrequest_table  -- table containing webrequests
+--     year              -- year of the to-be-generated hour
+--     month             -- month of the to-be-generated hour
+--     day               -- day of the to-be-generated hour
+--
+--
+-- Usage:
+--     hive -f generate_5xx_tsv.hql                \
+--         -d destination_directory=/tmp/foo       \
+--         -d webrequest_table=wmf_raw.webrequest  \
+--         -d year=2014                            \
+--         -d month=4                              \
+--         -d day=1
+--
+
+INSERT OVERWRITE DIRECTORY "${destination_directory}"
+    -- Since "ROW FORMAT DELIMITED DELIMITED FIELDS TERMINATED BY ' '" only
+    -- works for exports to local directories (see HIVE-5672), we have to
+    -- prepare the lines by hand through concatenation :-(
+    --
+    -- This concatenation however means that we cannot sort by dt without also
+    -- adding it to the SELECTed columns. Hence, we need to add the dt column 
to
+    -- the select, then sort, and finally drop the dt column again. This issue
+    -- buys us the subquery :-(
+    SELECT
+        line
+    FROM (
+        SELECT
+            CONCAT_WS(
+                "      ",
+                hostname,
+                CAST(sequence AS string),
+                dt,
+                CAST(time_firstbyte AS string),
+                ip,
+                CONCAT_WS('/', cache_status, http_status),
+                CAST(response_size AS string),
+                http_method,
+                CONCAT('http://', uri_host, uri_path, uri_query),
+                "-",
+                content_type,
+                referer,
+                x_forwarded_for,
+                user_agent,
+                accept_language,
+                x_analytics
+            ) line,
+            dt
+        FROM ${webrequest_table}
+        WHERE webrequest_source IN ('text', 'mobile')
+            -- TODO: Add 'bits', once it's turned on again
+            -- TODO: Add 'misc', once it's available
+            AND year=${year}
+            AND month=${month}
+            AND day=${day}
+            AND http_status BETWEEN 500 AND 599
+        ORDER BY dt
+        LIMIT 100000000
+    ) line_with_timing;

-- 
To view, visit https://gerrit.wikimedia.org/r/184496
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ic16b8f6bad2b44529c9642132633d1b10c1dbef6
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to