Hello Ottomata, I'd like you to do a code review. Please visit
https://gerrit.wikimedia.org/r/188010 to review the following change. Change subject: Geocode glam_nara files ...................................................................... Geocode glam_nara files Geocoding happens directly on the client ip without resolving X-Forwarded-For. While that's wrong from the start, it is what the tsvs on the udp2log pipeline do and did. And as at this point the objective is switching away from udp2log, fixing the conceptual issues with the glam_nara tsvs is left for the future. Change-Id: I54f5db3f61291b0d455e44fd20de090b40c1a3ef --- M oozie/webrequest/legacy_tsvs/bundle.properties M oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml M oozie/webrequest/legacy_tsvs/coordinator_mobile.xml M oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml M oozie/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml M oozie/webrequest/legacy_tsvs/generate_glam_nara_tsv.hql M oozie/webrequest/legacy_tsvs/workflow.xml 7 files changed, 57 insertions(+), 9 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery refs/changes/10/188010/1 diff --git a/oozie/webrequest/legacy_tsvs/bundle.properties b/oozie/webrequest/legacy_tsvs/bundle.properties index 900fbd3..85eef77 100644 --- a/oozie/webrequest/legacy_tsvs/bundle.properties +++ b/oozie/webrequest/legacy_tsvs/bundle.properties @@ -12,9 +12,20 @@ job_tracker = resourcemanager.analytics.eqiad.wmnet:8032 queue_name = default +# Base path in HDFS to refinery. +# When submitting this job for production, you should +# override this to point directly at a deployed +# directory name, and not the 'symbolic' 'current' directory. +# E.g. /wmf/refinery/2015-01-05T17.59.18Z--7bb7f07 +refinery_directory = ${name_node}/wmf/refinery/current + +# HDFS path to artifacts that will be used by this job. +# E.g. refinery-hive.jar should exist here. +artifacts_directory = ${refinery_directory}/artifacts + # Base path in HDFS to oozie files. 
# Other files will be used relative to this path. -oozie_directory = ${name_node}/wmf/refinery/current/oozie +oozie_directory = ${refinery_directory}/oozie # HDFS paths to the coordinators to run. # All of them are essentially the same coordinator and differ only in the diff --git a/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml b/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml index 56fbb84..9f17521 100644 --- a/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml +++ b/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml @@ -21,6 +21,7 @@ <property><name>webrequest_datasets_file</name></property> <property><name>webrequest_data_directory</name></property> <property><name>hive_site_xml</name></property> + <property><name>artifacts_directory</name></property> <property><name>workflow_file</name></property> <property><name>webrequest_table</name></property> <property><name>mark_directory_done_workflow_file</name></property> @@ -101,6 +102,10 @@ <value>${hive_site_xml}</value> </property> <property> + <name>artifacts_directory</name> + <value>${artifacts_directory}</value> + </property> + <property> <name>webrequest_table</name> <value>${webrequest_table}</value> </property> diff --git a/oozie/webrequest/legacy_tsvs/coordinator_mobile.xml b/oozie/webrequest/legacy_tsvs/coordinator_mobile.xml index eb64da3..c5199aa 100644 --- a/oozie/webrequest/legacy_tsvs/coordinator_mobile.xml +++ b/oozie/webrequest/legacy_tsvs/coordinator_mobile.xml @@ -21,6 +21,7 @@ <property><name>webrequest_datasets_file</name></property> <property><name>webrequest_data_directory</name></property> <property><name>hive_site_xml</name></property> + <property><name>artifacts_directory</name></property> <property><name>workflow_file</name></property> <property><name>webrequest_table</name></property> <property><name>mark_directory_done_workflow_file</name></property> @@ -91,6 +92,10 @@ <value>${hive_site_xml}</value> </property> <property> + 
<name>artifacts_directory</name> + <value>${artifacts_directory}</value> + </property> + <property> <name>webrequest_table</name> <value>${webrequest_table}</value> </property> diff --git a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml b/oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml index 942ec66..5abf779 100644 --- a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml +++ b/oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml @@ -21,6 +21,7 @@ <property><name>webrequest_datasets_file</name></property> <property><name>webrequest_data_directory</name></property> <property><name>hive_site_xml</name></property> + <property><name>artifacts_directory</name></property> <property><name>workflow_file</name></property> <property><name>webrequest_table</name></property> <property><name>mark_directory_done_workflow_file</name></property> @@ -96,6 +97,10 @@ <value>${hive_site_xml}</value> </property> <property> + <name>artifacts_directory</name> + <value>${artifacts_directory}</value> + </property> + <property> <name>webrequest_table</name> <value>${webrequest_table}</value> </property> diff --git a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml b/oozie/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml index 32ab6ec..20dc69a 100644 --- a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml +++ b/oozie/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml @@ -21,6 +21,7 @@ <property><name>webrequest_datasets_file</name></property> <property><name>webrequest_data_directory</name></property> <property><name>hive_site_xml</name></property> + <property><name>artifacts_directory</name></property> <property><name>workflow_file</name></property> <property><name>webrequest_table</name></property> <property><name>mark_directory_done_workflow_file</name></property> @@ -101,6 +102,10 @@ <value>${hive_site_xml}</value> </property> <property> + <name>artifacts_directory</name> + <value>${artifacts_directory}</value> + 
</property> + <property> <name>webrequest_table</name> <value>${webrequest_table}</value> </property> diff --git a/oozie/webrequest/legacy_tsvs/generate_glam_nara_tsv.hql b/oozie/webrequest/legacy_tsvs/generate_glam_nara_tsv.hql index 2e59a8e..becde2d 100644 --- a/oozie/webrequest/legacy_tsvs/generate_glam_nara_tsv.hql +++ b/oozie/webrequest/legacy_tsvs/generate_glam_nara_tsv.hql @@ -5,7 +5,10 @@ -- Generates a TSV for GLAM with _NARA_ in the URL -- -- Parameters: --- destination_directory -- Directory in HDFS where to store the generated +-- artifacts_directory +-- -- Directory in HDFS containing the refinery-hive.jar +-- destination_directory +-- -- Directory in HDFS where to store the generated -- data in. -- webrequest_table -- table containing webrequests -- year -- year of the to-be-generated hour @@ -14,13 +17,17 @@ -- -- -- Usage: --- hive -f generate_glam-nara_tsv.hql \ --- -d destination_directory=/tmp/foo \ --- -d webrequest_table=wmf_raw.webrequest \ --- -d year=2014 \ --- -d month=4 \ +-- hive -f generate_glam-nara_tsv.hql \ +-- -d artifacts_directory=/path/to/refinery/artifacts \ +-- -d destination_directory=/tmp/foo \ +-- -d webrequest_table=wmf_raw.webrequest \ +-- -d year=2014 \ +-- -d month=4 \ -- -d day=1 -- + +ADD JAR ${artifacts_directory}/org/wikimedia/analytics/refinery/refinery-hive-0.0.4.jar; +CREATE TEMPORARY FUNCTION geocode_country as 'org.wikimedia.analytics.refinery.hive.GeocodedCountryUDF'; INSERT OVERWRITE DIRECTORY "${destination_directory}" -- Since "ROW FORMAT DELIMITED DELIMITED FIELDS TERMINATED BY ' '" only @@ -41,8 +48,13 @@ CAST(sequence AS string), dt, CAST(time_firstbyte AS string), - CONCAT_WS('|', ip, 'XX'), -- TODO: Put geocoding UDF here, - -- once it is available. + CONCAT_WS('|', ip, geocode_country(ip)), -- This geocoding + -- happens directly on the client ip without resolving + -- X-Forwarded-For. While this is known to be off, it is + -- what udp-filter did for the glam_nara tsvs on the udp2log + -- pipeline. 
We mimic this behaviour 1:1 for now and leave + -- improvements to the future, when there is a proper UDF + -- that handles X-Forwarded-For resolution. CONCAT_WS('/', cache_status, http_status), CAST(response_size AS string), http_method, diff --git a/oozie/webrequest/legacy_tsvs/workflow.xml b/oozie/webrequest/legacy_tsvs/workflow.xml index b74e957..f907909 100644 --- a/oozie/webrequest/legacy_tsvs/workflow.xml +++ b/oozie/webrequest/legacy_tsvs/workflow.xml @@ -22,6 +22,10 @@ <description>hive-site.xml file path in HDFS</description> </property> <property> + <name>artifacts_directory</name> + <description>Path in HDFS to artifacts. refinery-hive.jar should be here.</description> + </property> + <property> <name>webrequest_table</name> <description> Hive table to read webrequest data from. @@ -86,6 +90,7 @@ <script>generate_${aspect_name}_tsv.hql</script> + <param>artifacts_directory=${artifacts_directory}</param> <param>webrequest_table=${webrequest_table}</param> <param>destination_directory=${temporary_directory}/${wf:id()}-${aspect_name}</param> <param>year=${year}</param> -- To view, visit https://gerrit.wikimedia.org/r/188010 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I54f5db3f61291b0d455e44fd20de090b40c1a3ef Gerrit-PatchSet: 1 Gerrit-Project: analytics/refinery Gerrit-Branch: master Gerrit-Owner: QChris <christ...@quelltextlich.at> Gerrit-Reviewer: Ottomata <o...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits