Hello Ottomata,

I'd like you to do a code review.  Please visit

    https://gerrit.wikimedia.org/r/188010

to review the following change.

Change subject: Geocode glam_nara files
......................................................................

Geocode glam_nara files

Geocoding happens directly on the client ip without resolving
X-Forwarded-For. While that's wrong from the start, it is what the
tsvs on the udp2log pipeline do and did. And as at this point the
objective is switching away from udp2log, fixing the conceptual issues
with the glam_nara tsvs is left for the future.

Change-Id: I54f5db3f61291b0d455e44fd20de090b40c1a3ef
---
M oozie/webrequest/legacy_tsvs/bundle.properties
M oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
M oozie/webrequest/legacy_tsvs/coordinator_mobile.xml
M oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml
M oozie/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml
M oozie/webrequest/legacy_tsvs/generate_glam_nara_tsv.hql
M oozie/webrequest/legacy_tsvs/workflow.xml
7 files changed, 57 insertions(+), 9 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery refs/changes/10/188010/1

diff --git a/oozie/webrequest/legacy_tsvs/bundle.properties 
b/oozie/webrequest/legacy_tsvs/bundle.properties
index 900fbd3..85eef77 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.properties
+++ b/oozie/webrequest/legacy_tsvs/bundle.properties
@@ -12,9 +12,20 @@
 job_tracker                         = resourcemanager.analytics.eqiad.wmnet:8032
 queue_name                          = default
 
+# Base path in HDFS to refinery.
+# When submitting this job for production, you should
+# override this to point directly at a deployed
+# directory name, and not the 'symbolic' 'current' directory.
+# E.g.  /wmf/refinery/2015-01-05T17.59.18Z--7bb7f07
+refinery_directory                  = ${name_node}/wmf/refinery/current
+
+# HDFS path to artifacts that will be used by this job.
+# E.g. refinery-hive.jar should exist here.
+artifacts_directory                 = ${refinery_directory}/artifacts
+
 # Base path in HDFS to oozie files.
 # Other files will be used relative to this path.
-oozie_directory                     = ${name_node}/wmf/refinery/current/oozie
+oozie_directory                     = ${refinery_directory}/oozie
 
 # HDFS paths to the coordinators to run.
 # All of them are essentially the same coordinator and differ only in the
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml 
b/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
index 56fbb84..9f17521 100644
--- a/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
+++ b/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
@@ -21,6 +21,7 @@
         <property><name>webrequest_datasets_file</name></property>
         <property><name>webrequest_data_directory</name></property>
         <property><name>hive_site_xml</name></property>
+        <property><name>artifacts_directory</name></property>
         <property><name>workflow_file</name></property>
         <property><name>webrequest_table</name></property>
         <property><name>mark_directory_done_workflow_file</name></property>
@@ -101,6 +102,10 @@
                     <value>${hive_site_xml}</value>
                 </property>
                 <property>
+                    <name>artifacts_directory</name>
+                    <value>${artifacts_directory}</value>
+                </property>
+                <property>
                     <name>webrequest_table</name>
                     <value>${webrequest_table}</value>
                 </property>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_mobile.xml 
b/oozie/webrequest/legacy_tsvs/coordinator_mobile.xml
index eb64da3..c5199aa 100644
--- a/oozie/webrequest/legacy_tsvs/coordinator_mobile.xml
+++ b/oozie/webrequest/legacy_tsvs/coordinator_mobile.xml
@@ -21,6 +21,7 @@
         <property><name>webrequest_datasets_file</name></property>
         <property><name>webrequest_data_directory</name></property>
         <property><name>hive_site_xml</name></property>
+        <property><name>artifacts_directory</name></property>
         <property><name>workflow_file</name></property>
         <property><name>webrequest_table</name></property>
         <property><name>mark_directory_done_workflow_file</name></property>
@@ -91,6 +92,10 @@
                     <value>${hive_site_xml}</value>
                 </property>
                 <property>
+                    <name>artifacts_directory</name>
+                    <value>${artifacts_directory}</value>
+                </property>
+                <property>
                     <name>webrequest_table</name>
                     <value>${webrequest_table}</value>
                 </property>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml 
b/oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml
index 942ec66..5abf779 100644
--- a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml
+++ b/oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml
@@ -21,6 +21,7 @@
         <property><name>webrequest_datasets_file</name></property>
         <property><name>webrequest_data_directory</name></property>
         <property><name>hive_site_xml</name></property>
+        <property><name>artifacts_directory</name></property>
         <property><name>workflow_file</name></property>
         <property><name>webrequest_table</name></property>
         <property><name>mark_directory_done_workflow_file</name></property>
@@ -96,6 +97,10 @@
                     <value>${hive_site_xml}</value>
                 </property>
                 <property>
+                    <name>artifacts_directory</name>
+                    <value>${artifacts_directory}</value>
+                </property>
+                <property>
                     <name>webrequest_table</name>
                     <value>${webrequest_table}</value>
                 </property>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml 
b/oozie/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml
index 32ab6ec..20dc69a 100644
--- a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml
+++ b/oozie/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml
@@ -21,6 +21,7 @@
         <property><name>webrequest_datasets_file</name></property>
         <property><name>webrequest_data_directory</name></property>
         <property><name>hive_site_xml</name></property>
+        <property><name>artifacts_directory</name></property>
         <property><name>workflow_file</name></property>
         <property><name>webrequest_table</name></property>
         <property><name>mark_directory_done_workflow_file</name></property>
@@ -101,6 +102,10 @@
                     <value>${hive_site_xml}</value>
                 </property>
                 <property>
+                    <name>artifacts_directory</name>
+                    <value>${artifacts_directory}</value>
+                </property>
+                <property>
                     <name>webrequest_table</name>
                     <value>${webrequest_table}</value>
                 </property>
diff --git a/oozie/webrequest/legacy_tsvs/generate_glam_nara_tsv.hql 
b/oozie/webrequest/legacy_tsvs/generate_glam_nara_tsv.hql
index 2e59a8e..becde2d 100644
--- a/oozie/webrequest/legacy_tsvs/generate_glam_nara_tsv.hql
+++ b/oozie/webrequest/legacy_tsvs/generate_glam_nara_tsv.hql
@@ -5,7 +5,10 @@
 -- Generates a TSV for GLAM with _NARA_ in the URL
 --
 -- Parameters:
---     destination_directory -- Directory in HDFS where to store the generated
+--     artifacts_directory
+--                       -- Directory in HDFS containing the refinery-hive.jar
+--     destination_directory
+--                       -- Directory in HDFS where to store the generated
 --                          data in.
 --     webrequest_table  -- table containing webrequests
 --     year              -- year of the to-be-generated hour
@@ -14,13 +17,17 @@
 --
 --
 -- Usage:
---     hive -f generate_glam-nara_tsv.hql          \
---         -d destination_directory=/tmp/foo       \
---         -d webrequest_table=wmf_raw.webrequest  \
---         -d year=2014                            \
---         -d month=4                              \
+--     hive -f generate_glam_nara_tsv.hql                     \
+--         -d artifacts_directory=/path/to/refinery/artifacts \
+--         -d destination_directory=/tmp/foo                  \
+--         -d webrequest_table=wmf_raw.webrequest             \
+--         -d year=2014                                       \
+--         -d month=4                                         \
 --         -d day=1
 --
+
+ADD JAR ${artifacts_directory}/org/wikimedia/analytics/refinery/refinery-hive-0.0.4.jar;
+CREATE TEMPORARY FUNCTION geocode_country as 'org.wikimedia.analytics.refinery.hive.GeocodedCountryUDF';
 
 INSERT OVERWRITE DIRECTORY "${destination_directory}"
     -- Since "ROW FORMAT DELIMITED DELIMITED FIELDS TERMINATED BY ' '" only
@@ -41,8 +48,13 @@
                 CAST(sequence AS string),
                 dt,
                 CAST(time_firstbyte AS string),
-                CONCAT_WS('|', ip, 'XX'), -- TODO: Put geocoding UDF here,
-                                          -- once it is available.
+                CONCAT_WS('|', ip, geocode_country(ip)), -- This geocoding
+                    -- happens directly on the client ip without resolving
+                    -- X-Forwarded-For. While this is known to be off, it is
+                    -- what udp-filter did for the glam_nara tsvs on the udp2log
+                    -- pipeline. We mimic this behaviour 1:1 for now and leave
+                    -- improvements to the future, when there is a proper UDF
+                    -- that handles X-Forwarded-For resolution.
                 CONCAT_WS('/', cache_status, http_status),
                 CAST(response_size AS string),
                 http_method,
diff --git a/oozie/webrequest/legacy_tsvs/workflow.xml 
b/oozie/webrequest/legacy_tsvs/workflow.xml
index b74e957..f907909 100644
--- a/oozie/webrequest/legacy_tsvs/workflow.xml
+++ b/oozie/webrequest/legacy_tsvs/workflow.xml
@@ -22,6 +22,10 @@
             <description>hive-site.xml file path in HDFS</description>
         </property>
         <property>
+            <name>artifacts_directory</name>
+            <description>Path in HDFS to artifacts.  refinery-hive.jar should be here.</description>
+        </property>
+        <property>
             <name>webrequest_table</name>
             <description>
                 Hive table to read webrequest data from.
@@ -86,6 +90,7 @@
 
             <script>generate_${aspect_name}_tsv.hql</script>
 
+            <param>artifacts_directory=${artifacts_directory}</param>
             <param>webrequest_table=${webrequest_table}</param>
             <param>destination_directory=${temporary_directory}/${wf:id()}-${aspect_name}</param>
             <param>year=${year}</param>

-- 
To view, visit https://gerrit.wikimedia.org/r/188010
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I54f5db3f61291b0d455e44fd20de090b40c1a3ef
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <christ...@quelltextlich.at>
Gerrit-Reviewer: Ottomata <o...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to