Nschaaf has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/311964

Change subject: Add Oozie job to extract data for WDQS research
......................................................................

Add Oozie job to extract data for WDQS research

Bug: T146064
Change-Id: I466649e8dfce4acd6c435da2cc00abb51d642526
---
A hive/wikidata/create_wdqs_extract_table.hql
A oozie/wikidata/wdqs_extract/coordinator.properties
A oozie/wikidata/wdqs_extract/coordinator.xml
A oozie/wikidata/wdqs_extract/wdqs_extract.hql
A oozie/wikidata/wdqs_extract/workflow.xml
5 files changed, 472 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery refs/changes/64/311964/1

diff --git a/hive/wikidata/create_wdqs_extract_table.hql b/hive/wikidata/create_wdqs_extract_table.hql
new file mode 100644
index 0000000..9b86ef4
--- /dev/null
+++ b/hive/wikidata/create_wdqs_extract_table.hql
@@ -0,0 +1,67 @@
+-- Creates table statement for wdqs_extract table.
+--
+-- NOTE:  When choosing partition field types,
+-- one should take into consideration Hive's
+-- insistence on storing partition values
+-- as strings.  See:
+-- https://wikitech.wikimedia.org/wiki/File:Hive_partition_formats.png
+-- and
+-- http://bots.wmflabs.org/~wm-bot/logs/%23wikimedia-analytics/20140721.txt
+--
+-- Parameters:
+--     <none>
+--
+-- Usage
+--     hive -f create_wdqs_extract_table.hql --database wmf
+--
+
+CREATE EXTERNAL TABLE IF NOT EXISTS `wdqs_extract`(
+    `hostname`          string  COMMENT 'Source node hostname',
+    `sequence`          bigint  COMMENT 'Per host sequence number',
+    `dt`                string  COMMENT 'Timestamp at cache in ISO 8601',
+    `time_firstbyte`    double  COMMENT 'Time to first byte',
+    `ip`                string  COMMENT 'IP of packet at cache',
+    `cache_status`      string  COMMENT 'Cache status',
+    `http_status`       string  COMMENT 'HTTP status of response',
+    `response_size`     bigint  COMMENT 'Response size',
+    `http_method`       string  COMMENT 'HTTP method of request',
+    `uri_host`          string  COMMENT 'Host of request',
+    `uri_path`          string  COMMENT 'Path of request',
+    `uri_query`         string  COMMENT 'Query of request',
+    `content_type`      string  COMMENT 'Content-Type header of response',
+    `referer`           string  COMMENT 'Referer header of request',
+    `x_forwarded_for`   string  COMMENT 'X-Forwarded-For header of request (deprecated)',
+    `user_agent`        string  COMMENT 'User-Agent header of request',
+    `accept_language`   string  COMMENT 'Accept-Language header of request',
+    `x_analytics`       string  COMMENT 'X-Analytics header of response',
+    `range`             string  COMMENT 'Range header of response',
+    `is_pageview`       boolean COMMENT 'Indicates if this record was marked as a pageview during refinement',
+    `record_version`    string  COMMENT 'Keeps track of changes in the table content definition - https://wikitech.wikimedia.org/wiki/Analytics/Data/Webrequest',
+    `client_ip`         string  COMMENT 'Client IP - DEPRECATED - Same as IP.',
+    `geocoded_data`     map<string, string>  COMMENT 'Geocoded map with continent, country_code, country, city, subdivision, postal_code, latitude, longitude, timezone keys and associated values.',
+    -- Waiting for x_cache format to change before parsing into a map
+    `x_cache`           string  COMMENT 'X-Cache header of response',
+    -- Next two fields are to replace original ua and x_analytics ones.
+    -- However such schema modification implies backward incompatibility.
+    -- We will replace once we feel confident enough that 'every' backward incompatible change is done.
+    `user_agent_map`    map<string, string>  COMMENT 'User-agent map with browser_name, browser_major, device, os_name, os_major, os_minor and wmf_app_version keys and associated values',
+    `x_analytics_map`   map<string, string>  COMMENT 'X_analytics map view of the x_analytics field',
+    `ts`                timestamp            COMMENT 'Unix timestamp in milliseconds extracted from dt',
+    `access_method`     string  COMMENT 'Method used to access the site (mobile app|mobile web|desktop)',
+    `agent_type`        string  COMMENT 'Categorise the agent making the webrequest as either user or spider (automata to be added).',
+    `is_zero`           boolean COMMENT 'Indicates if the webrequest is accessed through a zero provider',
+    `referer_class`     string  COMMENT 'Indicates if a referer is internal, external or unknown.',
+    `normalized_host`   struct<project_class: string, project:string, qualifiers: array<string>, tld: String>  COMMENT 'struct containing project_class (such as wikipedia or wikidata for instance), project (such as en or commons), qualifiers (a list of in-between values, such as m and/or zero) and tld (org most often)',
+    `pageview_info`     map<string, string>  COMMENT 'map containing project, language_variant and page_title values only when is_pageview = TRUE.',
+    `page_id`           int     COMMENT 'MediaWiki page_id for this page title. For redirects this could be the page_id of the redirect or the page_id of the target. This may not always be set, even if the page is actually a pageview.'
+)
+PARTITIONED BY (
+    `year`              int     COMMENT 'Unpadded year of request',
+    `month`             int     COMMENT 'Unpadded month of request',
+    `day`               int     COMMENT 'Unpadded day of request',
+    `hour`              int     COMMENT 'Unpadded hour of request'
+)
+STORED AS PARQUET
+LOCATION '/wmf/data/wmf/wdqs_extract'
+;
+
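
For context, once the extraction job below has populated an hourly partition,
the new table can be queried like any other webrequest-derived table. A
minimal HQL sketch, with purely illustrative partition values:

    -- Illustrative only: count WDQS requests per HTTP status for one hour.
    SELECT
        http_status,
        COUNT(*) AS requests
    FROM wmf.wdqs_extract
    WHERE year = 2016 AND month = 9 AND day = 1 AND hour = 0
    GROUP BY http_status
    ORDER BY requests DESC;
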
diff --git a/oozie/wikidata/wdqs_extract/coordinator.properties b/oozie/wikidata/wdqs_extract/coordinator.properties
new file mode 100644
index 0000000..49dca55
--- /dev/null
+++ b/oozie/wikidata/wdqs_extract/coordinator.properties
@@ -0,0 +1,58 @@
+# Configures a coordinator to manage automatically extracting wdqs data from
+# the refined webrequest table.
+# Any of the following properties are overridable with -D.
+# Usage:
+#   oozie job -Duser=$USER -Dstart_time=2015-01-05T00:00Z -submit -config oozie/wikidata/wdqs_extract/coordinator.properties
+#
+# NOTE:  The $oozie_directory must be synced to HDFS so that all relevant
+#        .xml files exist there when this job is submitted.
+
+
+name_node                         = hdfs://analytics-hadoop
+job_tracker                       = resourcemanager.analytics.eqiad.wmnet:8032
+queue_name                        = default
+
+user                              = hdfs
+
+# Base path in HDFS to refinery.
+# When submitting this job for production, you should
+# override this to point directly at a deployed
+# directory name, and not the 'symbolic' 'current' directory.
+# E.g.  /wmf/refinery/2015-01-05T17.59.18Z--7bb7f07
+refinery_directory                = ${name_node}/wmf/refinery/current
+
+# Base path in HDFS to oozie files.
+# Other files will be used relative to this path.
+oozie_directory                   = ${refinery_directory}/oozie
+
+# HDFS path to coordinator to run.
+coordinator_file                  = ${oozie_directory}/wikidata/wdqs_extract/coordinator.xml
+
+# HDFS path to workflow to run.
+workflow_file                     = ${oozie_directory}/wikidata/wdqs_extract/workflow.xml
+
+# HDFS path to refine webrequest dataset definitions
+webrequest_datasets_file          = ${oozie_directory}/webrequest/datasets.xml
+webrequest_data_directory         = ${name_node}/wmf/data/wmf/webrequest
+
+# Initial import time of the webrequest dataset.
+start_time                        = 2016-07-20T00:00Z
+
+# Time to stop running this coordinator.  Year 3000 == never!
+stop_time                         = 3000-01-01T00:00Z
+
+# Workflow to send an error email
+send_error_email_workflow_file    = ${oozie_directory}/util/send_error_email/workflow.xml
+
+# HDFS path to hive-site.xml file.  This is needed to run hive actions.
+# hive_site_xml                     = ${oozie_directory}/util/hive/hive-site.xml
+hive_site_xml                     = /etc/hive/conf/hive-site.xml
+
+# Fully qualified Hive table name.
+webrequest_table                  = wmf.webrequest
+wdqs_extract_table                = wmf.wdqs_extract
+
+# Coordinator to start.
+oozie.coord.application.path      = ${coordinator_file}
+oozie.use.system.libpath          = true
+oozie.action.external.stats.write = true
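
Before submitting the coordinator with the start_time above, it can help to
confirm that the source partitions actually contain WDQS traffic. A hedged
HQL sanity check, with partition values matching start_time purely for
illustration:

    -- Illustrative only: rough count of WDQS requests in the first hour
    -- covered by start_time; adjust the partition values as needed.
    SELECT COUNT(*) AS wdqs_requests
    FROM wmf.webrequest
    WHERE webrequest_source = 'misc'
        AND year = 2016 AND month = 7 AND day = 20 AND hour = 0
        AND uri_host = 'query.wikidata.org';
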
diff --git a/oozie/wikidata/wdqs_extract/coordinator.xml b/oozie/wikidata/wdqs_extract/coordinator.xml
new file mode 100644
index 0000000..3b9ce1e
--- /dev/null
+++ b/oozie/wikidata/wdqs_extract/coordinator.xml
@@ -0,0 +1,101 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<coordinator-app xmlns="uri:oozie:coordinator:0.4"
+    name="wikidata-wdqs_extract-coord"
+    frequency="${coord:hours(1)}"
+    start="${start_time}"
+    end="${stop_time}"
+    timezone="Universal">
+
+    <parameters>
+
+        <!-- Required properties. -->
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+        <property><name>queue_name</name></property>
+
+        <property><name>workflow_file</name></property>
+
+        <property><name>start_time</name></property>
+        <property><name>stop_time</name></property>
+
+        <property><name>webrequest_datasets_file</name></property>
+        <property><name>webrequest_data_directory</name></property>
+
+        <property><name>hive_site_xml</name></property>
+
+        <property><name>webrequest_table</name></property>
+        <property><name>wdqs_extract_table</name></property>
+
+        <property><name>send_error_email_workflow_file</name></property>
+    </parameters>
+
+    <controls>
+        <!--
+        By having materialized jobs not timeout, we ease backfilling incidents
+        after recoverable hiccups on the dataset producers.
+        -->
+        <timeout>-1</timeout>
+
+        <!--
+        This extraction is not too heavy, but we limit
+        concurrency for resource sharing.
+
+        Also note that back-filling is not limited by the
+        coordinator's frequency, so back-filling works nicely
+        even though the concurrency is low.
+        -->
+        <concurrency>4</concurrency>
+
+
+        <!--
+        Since we expect only one incarnation per hourly dataset, the
+        default throttle of 12 is way too high, and there is no need
+        to keep that many materialized jobs around.
+
+        By reducing it to 8, we keep the HDFS checks on the datasets
+        low, while still being able to easily feed the concurrency.
+        -->
+        <throttle>8</throttle>
+    </controls>
+
+    <datasets>
+        <!--
+        Include the refined webrequest datasets file.
+        $webrequest_datasets_file will be used as the input events.
+        -->
+        <include>${webrequest_datasets_file}</include>
+    </datasets>
+
+    <input-events>
+        <data-in name="misc_refined_input" dataset="webrequest_misc">
+            <instance>${coord:current(0)}</instance>
+        </data-in>
+    </input-events>
+
+    <action>
+        <workflow>
+            <app-path>${workflow_file}</app-path>
+            <configuration>
+
+                <property>
+                    <name>year</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "y")}</value>
+                </property>
+                <property>
+                    <name>month</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "M")}</value>
+                </property>
+                <property>
+                    <name>day</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "d")}</value>
+                </property>
+                <property>
+                    <name>hour</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "H")}</value>
+                </property>
+
+            </configuration>
+        </workflow>
+    </action>
+</coordinator-app>
diff --git a/oozie/wikidata/wdqs_extract/wdqs_extract.hql b/oozie/wikidata/wdqs_extract/wdqs_extract.hql
new file mode 100644
index 0000000..4b413c6
--- /dev/null
+++ b/oozie/wikidata/wdqs_extract/wdqs_extract.hql
@@ -0,0 +1,105 @@
+-- Parameters:
+--     source_table      -- Fully qualified table name to compute the
+--                          extraction for.
+--     destination_table -- Fully qualified table name to fill in
+--                          extracted values.
+--     year              -- year of partition to compute aggregation
+--                          for.
+--     month             -- month of partition to compute aggregation
+--                          for.
+--     day               -- day of partition to compute aggregation
+--                          for.
+--     hour              -- hour of partition to compute aggregation
+--                          for.
+--
+-- Usage:
+--     hive -f wdqs_extract.hql                                   \
+--         -d source_table=wmf.webrequest                         \
+--         -d destination_table=wmf.wdqs_extract                  \
+--         -d year=2016                                           \
+--         -d month=9                                             \
+--         -d day=1                                               \
+--         -d hour=0
+--
+
+SET parquet.compression              = SNAPPY;
+SET mapred.reduce.tasks              = 8;
+
+INSERT OVERWRITE TABLE ${destination_table}
+    PARTITION(year=${year},month=${month},day=${day},hour=${hour})
+    SELECT
+        hostname,
+        sequence,
+        dt,
+        time_firstbyte,
+        ip,
+        cache_status,
+        http_status,
+        response_size,
+        http_method,
+        uri_host,
+        uri_path,
+        uri_query,
+        content_type,
+        referer,
+        x_forwarded_for,
+        user_agent,
+        accept_language,
+        x_analytics,
+        range,
+        is_pageview,
+        record_version,
+        client_ip,
+        geocoded_data,
+        x_cache,
+        user_agent_map,
+        x_analytics_map,
+        ts,
+        access_method,
+        agent_type,
+        is_zero,
+        referer_class,
+        normalized_host,
+        pageview_info,
+        page_id
+    FROM
+        ${source_table}
+    WHERE webrequest_source = 'misc'
+        AND year = ${year} AND month = ${month} AND day = ${day} AND hour = ${hour}
+        AND uri_host = 'query.wikidata.org'
+    GROUP BY
+        hostname,
+        sequence,
+        dt,
+        time_firstbyte,
+        ip,
+        cache_status,
+        http_status,
+        response_size,
+        http_method,
+        uri_host,
+        uri_path,
+        uri_query,
+        content_type,
+        referer,
+        x_forwarded_for,
+        user_agent,
+        accept_language,
+        x_analytics,
+        range,
+        is_pageview,
+        record_version,
+        client_ip,
+        geocoded_data,
+        x_cache,
+        user_agent_map,
+        x_analytics_map,
+        ts,
+        access_method,
+        agent_type,
+        is_zero,
+        referer_class,
+        normalized_host,
+        pageview_info,
+        page_id
+;
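
After a coordinator run completes, a quick way to verify the hourly partition
is to compare row counts between source and destination. A minimal HQL sketch,
again with illustrative partition values; the two counts should match, modulo
any exact-duplicate rows collapsed by the GROUP BY above:

    -- Illustrative only: rows written to the extract for one hour ...
    SELECT COUNT(*) FROM wmf.wdqs_extract
    WHERE year = 2016 AND month = 9 AND day = 1 AND hour = 0;

    -- ... versus matching rows in the source webrequest partition.
    SELECT COUNT(*) FROM wmf.webrequest
    WHERE webrequest_source = 'misc'
        AND uri_host = 'query.wikidata.org'
        AND year = 2016 AND month = 9 AND day = 1 AND hour = 0;
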
diff --git a/oozie/wikidata/wdqs_extract/workflow.xml b/oozie/wikidata/wdqs_extract/workflow.xml
new file mode 100644
index 0000000..bf03a25
--- /dev/null
+++ b/oozie/wikidata/wdqs_extract/workflow.xml
@@ -0,0 +1,141 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<workflow-app xmlns="uri:oozie:workflow:0.4"
+    name="wikidata-wdqs_extract-wf-${year}-${month}-${day}-${hour}">
+
+    <parameters>
+
+        <!-- Default values for inner oozie settings -->
+        <property>
+            <name>oozie_launcher_queue_name</name>
+            <value>${queue_name}</value>
+        </property>
+        <property>
+            <name>oozie_launcher_memory</name>
+            <value>256</value>
+        </property>
+
+        <!-- Required properties -->
+        <property><name>queue_name</name></property>
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+
+
+        <!-- Aggregation related configuration properties-->
+        <property>
+            <name>hive_script</name>
+            <!-- This is relative to the containing directory of this file. -->
+            <value>wdqs_extract.hql</value>
+            <description>Hive script to run.</description>
+        </property>
+
+        <property>
+            <name>hive_site_xml</name>
+            <description>hive-site.xml file path in HDFS</description>
+        </property>
+        <property>
+            <name>webrequest_table</name>
+            <description>Hive table to extract from</description>
+        </property>
+        <property>
+            <name>wdqs_extract_table</name>
+            <description>The destination table to store extracted data in.</description>
+        </property>
+
+        <property>
+            <name>year</name>
+            <description>The partition's year</description>
+        </property>
+        <property>
+            <name>month</name>
+            <description>The partition's month</description>
+        </property>
+        <property>
+            <name>day</name>
+            <description>The partition's day</description>
+        </property>
+        <property>
+            <name>hour</name>
+            <description>The partition's hour</description>
+        </property>
+        <property>
+            <name>send_error_email_workflow_file</name>
+            <description>Workflow for sending an email</description>
+        </property>
+
+    </parameters>
+
+    <start to="extract"/>
+
+    <action name="extract">
+        <hive xmlns="uri:oozie:hive-action:0.2">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+                <!--make sure oozie:launcher runs in a low priority queue -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+                <property>
+                    <name>hive.exec.scratchdir</name>
+                    <value>/tmp/hive-${user}</value>
+                </property>
+            </configuration>
+            <script>${hive_script}</script>
+            <param>source_table=${webrequest_table}</param>
+            <param>destination_table=${wdqs_extract_table}</param>
+            <param>year=${year}</param>
+            <param>month=${month}</param>
+            <param>day=${day}</param>
+            <param>hour=${hour}</param>
+        </hive>
+
+        <ok to="end"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="send_error_email">
+        <sub-workflow>
+            <app-path>${send_error_email_workflow_file}</app-path>
+            <propagate-configuration/>
+            <configuration>
+                <property>
+                    <name>parent_name</name>
+                    <value>${wf:name()}</value>
+                </property>
+                <property>
+                    <name>parent_failed_action</name>
+                    <value>${wf:lastErrorNode()}</value>
+                </property>
+                <property>
+                    <name>parent_error_code</name>
+                    <value>${wf:errorCode(wf:lastErrorNode())}</value>
+                </property>
+                <property>
+                    <name>parent_error_message</name>
+                    <value>${wf:errorMessage(wf:lastErrorNode())}</value>
+                </property>
+                <property>
+                    <name>to</name>
+                    <value>analytics-ale...@wikimedia.org,nsch...@wikimedia.org,l...@wikimedia.org</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="kill"/>
+        <error to="kill"/>
+    </action>
+
+    <kill name="kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    <end name="end"/>
+</workflow-app>

-- 
To view, visit https://gerrit.wikimedia.org/r/311964
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I466649e8dfce4acd6c435da2cc00abb51d642526
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Nschaaf <nsch...@wikimedia.org>
