Hello Ottomata,

I'd like you to do a code review.  Please visit

    https://gerrit.wikimedia.org/r/184493

to review the following change.

Change subject: Add Oozie legacy_tsv job for api-usage TSVs
......................................................................

Add Oozie legacy_tsv job for api-usage TSVs

The udp2log filter for those TSVs lives at:

  
https://git.wikimedia.org/blob/operations%2Fpuppet.git/bc6ec933e0ee1000dd57cbd48cd788497ea584c2/templates%2Fudp2log%2Ffilters.erbium.erb#L17

Change-Id: I5daae29e76a98044497fc67a7598af2e4c837636
---
M oozie/webrequest/legacy_tsvs/bundle.xml
A oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql
2 files changed, 92 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery 
refs/changes/93/184493/1

diff --git a/oozie/webrequest/legacy_tsvs/bundle.xml 
b/oozie/webrequest/legacy_tsvs/bundle.xml
index e9451c8..04713ae 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.xml
+++ b/oozie/webrequest/legacy_tsvs/bundle.xml
@@ -160,4 +160,26 @@
             </property>
         </configuration>
     </coordinator>
+
+    <coordinator name="webrequest_legacy_tsvs-api-usage">
+        <app-path>${coordinator_file}</app-path>
+        <configuration>
+            <property>
+                <name>aspect_name</name>
+                <value>api-usage</value>
+            </property>
+            <property>
+                <name>aspect_tsv_archive_directory</name>
+                <value>${webrequest_archive_directory}/api</value>
+            </property>
+            <property>
+                <name>hour_offset</name>
+                <value>14</value>
+            </property>
+            <property>
+                <name>needs_webrequest_upload</name>
+                <value>no</value>
+            </property>
+        </configuration>
+    </coordinator>
 </bundle-app>
diff --git a/oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql 
b/oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql
new file mode 100644
index 0000000..29012a9
--- /dev/null
+++ b/oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql
@@ -0,0 +1,70 @@
+SET hive.exec.compress.output=true;
+SET 
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
+--^ To work around HIVE-3296, we have SETs before any comments
+
+-- Generates a TSV for api requests
+--
+-- Parameters:
+--     destination_directory -- Directory in HDFS where to store the generated
+--                          data in.
+--     webrequest_table  -- table containing webrequests
+--     year              -- year of the to-be-generated hour
+--     month             -- month of the to-be-generated hour
+--     day               -- day of the to-be-generated hour
+--
+--
+-- Usage:
+--     hive -f generate_api-usage_tsv.hql          \
+--         -d destination_directory=/tmp/foo       \
+--         -d webrequest_table=wmf_raw.webrequest  \
+--         -d year=2014                            \
+--         -d month=4                              \
+--         -d day=1
+--
+
+INSERT OVERWRITE DIRECTORY "${destination_directory}"
+    -- Since "ROW FORMAT DELIMITED DELIMITED FIELDS TERMINATED BY ' '" only
+    -- works for exports to local directories (see HIVE-5672), we have to
+    -- prepare the lines by hand through concatenation :-(
+    --
+    -- This concatenation however means that we cannot sort by dt without also
+    -- adding it to the SELECTed columns. Hence, we need to add the dt column 
to
+    -- the select, then sort, and finally drop the dt column again. This issue
+    -- buys us the subquery :-(
+    SELECT
+        line
+    FROM (
+        SELECT
+            CONCAT_WS(
+                "      ",
+                hostname,
+                CAST(sequence AS string),
+                dt,
+                CAST(time_firstbyte AS string),
+                ip,
+                CONCAT_WS('/', cache_status, http_status),
+                CAST(response_size AS string),
+                http_method,
+                CONCAT('http://', uri_host, uri_path, uri_query),
+                "-",
+                content_type,
+                referer,
+                x_forwarded_for,
+                user_agent,
+                accept_language,
+                x_analytics
+            ) line,
+            dt
+        -- It would be nice to be able to say TABLESAMPLE(1 PERCENT) in the
+        -- following line, but that would pull an unfair sample that typically
+        -- covers only an hour worth of data. Hence, we resort to BUCKET
+        -- sampling.
+        FROM ${webrequest_table} TABLESAMPLE(BUCKET 1 OUT OF 100 ON rand())
+        WHERE webrequest_source IN ('mobile', 'text')
+            AND year=${year}
+            AND month=${month}
+            AND day=${day}
+            AND uri_path LIKE '%/w/api.php%'
+        ORDER BY dt
+        LIMIT 100000000
+    ) line_with_timing;

-- 
To view, visit https://gerrit.wikimedia.org/r/184493
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I5daae29e76a98044497fc67a7598af2e4c837636
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to