Ottomata has submitted this change and it was merged.

Change subject: Monthly last_access uniques oozie job
......................................................................


Monthly last_access uniques oozie job

Based on WMF-Last-Access cookie and the nocookie
header in X-Analytics.

Bug: 124678

Change-Id: I423c90a52e87a701d584fc12c88729bd564baf70
---
A hive/last_access_uniques/create_last_access_uniques_monthly_table.hql
A oozie/last_access_uniques/README.md
M oozie/last_access_uniques/datasets.xml
A oozie/last_access_uniques/monthly/coordinator.properties
A oozie/last_access_uniques/monthly/coordinator.xml
A oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
A oozie/last_access_uniques/monthly/workflow.xml
7 files changed, 422 insertions(+), 2 deletions(-)

Approvals:
  Ottomata: Verified; Looks good to me, approved



diff --git 
a/hive/last_access_uniques/create_last_access_uniques_monthly_table.hql 
b/hive/last_access_uniques/create_last_access_uniques_monthly_table.hql
new file mode 100644
index 0000000..b6c8a7f
--- /dev/null
+++ b/hive/last_access_uniques/create_last_access_uniques_monthly_table.hql
@@ -0,0 +1,24 @@
+-- Creates table statement for WMF-Last-Access cookie base uniques monthly 
table.
+--
+-- Parameters:
+--     <none>
+--
+-- Usage
+--     hive -f create_last_access_uniques_monthly_table.hql --database wmf
+
+
+CREATE EXTERNAL TABLE IF NOT EXISTS `last_access_uniques_monthly`(
+    `uri_host`             string  COMMENT 'The lower cased host defining a 
project (en.wikipedia.org for instance)',
+    `country`              string  COMMENT 'Country name of the accessing 
agents (computed using maxmind GeoIP database)',
+    `country_code`         string  COMMENT '2 letter country code',
+    `uniques_underestimate` int    COMMENT 'Under estimation of unique devices 
seen based on last access cookie, and the nocookies header',
+    `uniques_offset`       int     COMMENT 'Unique devices offset computed as 
1-action sessions without cookies',
+    `uniques_estimate`     int     COMMENT 'Estimate of unique devices seen as 
uniques_underestimate plus offset'
+)
+PARTITIONED BY (
+    `year`              int     COMMENT 'Unpadded year of requests',
+    `month`             int     COMMENT 'Unpadded month of requests'
+)
+STORED AS PARQUET
+LOCATION '/wmf/data/wmf/last_access_uniques/monthly'
+;
diff --git a/oozie/last_access_uniques/README.md 
b/oozie/last_access_uniques/README.md
new file mode 100644
index 0000000..ea5affd
--- /dev/null
+++ b/oozie/last_access_uniques/README.md
@@ -0,0 +1,23 @@
+Includes daily and monthly hive jobs to calculate the last access cookie based 
uniques.
+
+These are currently launched as:
+
+```
+sudo -u hdfs oozie job --oozie $OOZIE_URL \
+  -Drefinery_directory=hdfs://analytics-hadoop$(hdfs dfs -ls -d 
/wmf/refinery/2016* | tail -n 1 | awk '{print $NF}') \
+  -Dqueue_name=production \
+  -Doozie_launcher_queue_name=production \
+  -Doozie_launcher_memory=256 \
+  -Dstart_time=2015-12-01T00:00Z \
+  -config daily/coordinator.properties \
+  -run
+
+sudo -u hdfs oozie job --oozie $OOZIE_URL \
+  -Drefinery_directory=hdfs://analytics-hadoop$(hdfs dfs -ls -d 
/wmf/refinery/2016* | tail -n 1 | awk '{print $NF}') \
+  -Dqueue_name=production \
+  -Doozie_launcher_queue_name=production \
+  -Doozie_launcher_memory=256 \
+  -Dstart_time=2015-12-01T00:00Z \
+  -config monthly/coordinator.properties \
+  -run
+```
\ No newline at end of file
diff --git a/oozie/last_access_uniques/datasets.xml 
b/oozie/last_access_uniques/datasets.xml
index 2790db0..51d96e9 100644
--- a/oozie/last_access_uniques/datasets.xml
+++ b/oozie/last_access_uniques/datasets.xml
@@ -13,8 +13,8 @@
 <datasets>
 
     <!--
-    The last_access_uniques_daily dataset contains uniques devices per project
-    host and country, counted using the last_access method.
+    The last_access_uniques_[daily,monthly] datasets contain uniques devices
+    per project, host and country, counted using the last_access method.
 
     Note that we do not use “${...}” but “${"$"}{...}", as dataset files are
     passed to EL twice in cascade, and in the first EL level, ${MONTH}
@@ -31,4 +31,12 @@
         
<uri-template>${last_access_uniques_data_directory}/daily/year=${YEAR}/month=${"$"}{MONTH
 + 0}/day=${"$"}{DAY + 0}</uri-template>
         <done-flag>_SUCCESS</done-flag>
     </dataset>
+
+    <dataset name="last_access_uniques_monthly"
+             frequency="${coord:months(1)}"
+             initial-instance="${start_time}"
+             timezone="Universal">
+        
<uri-template>${last_access_uniques_data_directory}/monthly/year=${YEAR}/month=${"$"}{MONTH
 + 0}</uri-template>
+        <done-flag>_SUCCESS</done-flag>
+    </dataset>
 </datasets>
diff --git a/oozie/last_access_uniques/monthly/coordinator.properties 
b/oozie/last_access_uniques/monthly/coordinator.properties
new file mode 100644
index 0000000..d1adc82
--- /dev/null
+++ b/oozie/last_access_uniques/monthly/coordinator.properties
@@ -0,0 +1,44 @@
+
+# Configures a coordinator to generate daily uniques using last_access method
+# on webrequest table.
+# Any of the following properties are overidable with -D.
+# Usage:
+#   oozie job -Duser=$USER -Dstart_time=2015-01-05T00:00Z -submit -config 
oozie/last_access_uniques/daily/coordinator.properties
+#
+# NOTE:  The $oozie_directory must be synced to HDFS so that all relevant
+#        .xml files exist there when this job is submitted.
+
+name_node                         = hdfs://analytics-hadoop
+job_tracker                       = resourcemanager.analytics.eqiad.wmnet:8032
+queue_name                        = default
+
+refinery_directory                = ${name_node}/wmf/refinery/current
+oozie_directory                   = ${refinery_directory}/oozie
+hive_site_xml                     = 
${refinery_directory}/oozie/util/hive/hive-site.xml
+
+start_time                        = 2015-12-01T00:00Z
+# Time to stop running this coordinator.  Year 3000 == never!
+stop_time                         = 3000-01-01T00:00Z
+
+# Sub worflows path
+mark_directory_done_workflow_file = 
${oozie_directory}/util/mark_directory_done/workflow.xml
+send_error_email_workflow_file    = 
${oozie_directory}/util/send_error_email/workflow.xml
+
+source_table                      = wmf.webrequest
+destination_table                 = wmf.last_access_uniques_monthly
+
+# HDFS path to webrequest dataset definition
+webrequest_data_directory         = ${name_node}/wmf/data/wmf/webrequest
+webrequest_datasets_file          = ${oozie_directory}/webrequest/datasets.xml
+
+# HDFS path to last access uniques dataset definition
+last_access_uniques_data_directory = 
${name_node}/wmf/data/wmf/last_access_uniques
+last_access_uniques_datasets_file = 
${oozie_directory}/last_access_uniques/datasets.xml
+
+user                              = hdfs
+workflow_file                     = 
${oozie_directory}/last_access_uniques/monthly/workflow.xml
+
+# Coordinator app to run.
+oozie.use.system.libpath          = true
+oozie.action.external.stats.write = true
+oozie.coord.application.path      = 
${oozie_directory}/last_access_uniques/monthly/coordinator.xml
diff --git a/oozie/last_access_uniques/monthly/coordinator.xml 
b/oozie/last_access_uniques/monthly/coordinator.xml
new file mode 100644
index 0000000..896d609
--- /dev/null
+++ b/oozie/last_access_uniques/monthly/coordinator.xml
@@ -0,0 +1,84 @@
+<coordinator-app xmlns="uri:oozie:coordinator:0.4"
+    name="last_access_uniques_monthly-coord"
+    frequency="${coord:months(1)}"
+    start="${start_time}"
+    end="${stop_time}"
+    timezone="Universal">
+    <parameters>
+
+        <!-- Required properties. -->
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+        <property><name>mark_directory_done_workflow_file</name></property>
+        <property><name>send_error_email_workflow_file</name></property>
+        <property><name>workflow_file</name></property>
+        <property><name>start_time</name></property>
+        <property><name>stop_time</name></property>
+
+        <property><name>source_table</name></property>
+        <property><name>destination_table</name></property>
+
+        <property><name>webrequest_data_directory</name></property>
+        <property><name>webrequest_datasets_file</name></property>
+        <property><name>last_access_uniques_data_directory</name></property>
+        <property><name>last_access_uniques_datasets_file</name></property>
+    </parameters>
+
+    <controls>
+        <!--(timeout is measured in minutes)-->
+        <timeout>-1</timeout>
+
+        <!-- Heavy on resource so limit parallel execution to 2 -->
+        <concurrency>2</concurrency>
+
+        <throttle>2</throttle>
+    </controls>
+
+    <datasets>
+        <include>${webrequest_datasets_file}</include>
+        <include>${last_access_uniques_datasets_file}</include>
+    </datasets>
+
+    <input-events>
+        <!--
+            Please see datasets definition, the webrequest_mobile
+            and webrequest_text are refined datasets from the raw data.
+        -->
+        <data-in name="mobile" dataset="webrequest_mobile">
+            <start-instance>${coord:current(0)}</start-instance>
+            <end-instance>${coord:current(coord:daysInMonth(0) * 24 - 
1)}</end-instance>
+        </data-in>
+        <data-in name="text" dataset="webrequest_text">
+            <start-instance>${coord:current(0)}</start-instance>
+            <end-instance>${coord:current(coord:daysInMonth(0) * 24 - 
1)}</end-instance>
+        </data-in>
+    </input-events>
+
+    <output-events>
+        <data-out name="last_access_uniques_monthly_output" 
dataset="last_access_uniques_monthly">
+            <instance>${coord:current(0)}</instance>
+        </data-out>
+    </output-events>
+
+   <action>
+      <workflow>
+          <app-path>${workflow_file}</app-path>
+
+            <configuration>
+              <!-- Pass these properties through to the workflow -->
+              <property>
+                  <name>year</name>
+                  <value>${coord:formatTime(coord:nominalTime(), "y")}</value>
+              </property>
+              <property>
+                  <name>month</name>
+                  <value>${coord:formatTime(coord:nominalTime(), "M")}</value>
+              </property>
+              <property>
+                    <name>destination_dataset_directory</name>
+                    
<value>${coord:dataOut('last_access_uniques_monthly_output')}</value>
+                </property>
+          </configuration>
+      </workflow>
+  </action>
+  </coordinator-app>
\ No newline at end of file
diff --git a/oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql 
b/oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
new file mode 100644
index 0000000..6ead49d
--- /dev/null
+++ b/oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
@@ -0,0 +1,113 @@
+-- Generates monthly uniques based on WMF-Last-Access cookie
+--
+-- Parameters:
+--     source_table        -- Table containing source data
+--     destination_table   -- Table where to right newly computed data
+--     year                -- year of the to-be-generated
+--     month               -- month of the to-be-generated
+--
+-- Usage
+--     hive -f last_access_uniques_monthly.hql \
+--         -d source_table=wmf.webrequest \
+--         -d destination_table=wmf.last_access_uniques_monthly \
+--         -d year=2016 \
+--         -d month=1
+
+
+
+-- Set parquet compression codec
+SET parquet.compression              = SNAPPY;
+
+WITH last_access_dates AS (
+    SELECT
+        year,
+        month,
+        day,
+        lower(uri_host) as uri_host,,
+        geocoded_data['country'] AS country,
+        geocoded_data['country_code'] AS country_code,
+        unix_timestamp(x_analytics_map['WMF-Last-Access'], 'dd-MMM-yyyy') AS 
last_access,
+        x_analytics_map['nocookies'] AS nocookies,
+        ip,
+        user_agent,
+        accept_language
+    FROM ${source_table}
+    WHERE x_analytics_map IS NOT NULL
+      AND agent_type = 'user'
+      AND is_pageview = TRUE
+      AND webrequest_source IN ('mobile','text')
+      AND year = ${year}
+      AND month = ${month}
+),
+
+-- Only keeping clients having 1 event without cookies (fresh sessions)
+fresh_sessions_aggregated AS (
+    SELECT
+        uri_host,
+        country_code,
+        COUNT(1) AS uniques_offset
+    FROM (
+        SELECT
+            hash(ip, user_agent, accept_language, uri_host) AS id,
+            uri_host,
+            country_code,
+            COUNT(1) AS cardinal
+        FROM
+            last_access_dates
+        WHERE
+            -- Only keeping clients NOT having cookies (yet)
+            nocookies is NOT NULL
+        GROUP BY
+            hash(ip, user_agent, accept_language, uri_host),
+            uri_host,
+            country_code
+        -- Only keeping clients having done 1 event with no cookies
+        HAVING COUNT(1) = 1
+        ) fresh_sessions
+    GROUP BY
+        uri_host,
+        country_code
+)
+
+INSERT OVERWRITE TABLE ${destination_table}
+    PARTITION(year = ${year}, month = ${month})
+SELECT
+    la.uri_host,
+    la.country,
+    la.country_code,
+    SUM(CASE
+        -- Last access not set and client accept cookies --> first visit, count
+        WHEN (la.last_access IS NULL AND la.nocookies is NULL) THEN 1
+        -- Last access set and date before today --> First visit today, count
+        WHEN ((la.last_access IS NOT NULL)
+            AND (la.last_access < unix_timestamp(CONCAT('${year}-', 
LPAD(${month}, 2, '0'), '-01'), 'yyyy-MM-dd'))) THEN 1
+        -- Other cases, don't
+        ELSE 0
+    END) AS uniques_underestimate,
+    oenca.uniques_offset AS uniques_offset,
+    SUM(CASE
+        -- Last access not set and client accept cookies --> first visit, count
+        WHEN (la.last_access IS NULL AND la.nocookies is NULL) THEN 1
+        -- Last access set and date before today --> First visit today, count
+        WHEN ((la.last_access IS NOT NULL)
+            AND (la.last_access < unix_timestamp(CONCAT('${year}-', 
LPAD('${month}', 2, '0'), '-01'), 'yyyy-MM-dd'))) THEN 1
+        -- Other cases, don't
+        ELSE 0
+    END) + oenca.uniques_offset AS uniques_estimate
+FROM
+    last_access_dates AS la
+    INNER JOIN fresh_sessions_aggregated AS oenca
+        ON (oenca.uri_host = la.uri_host
+            AND oenca.country_code = la.country_code)
+GROUP BY
+    la.uri_host,
+    la.country,
+    la.country_code,
+    oenca.uniques_offset
+-- TODO
+-- Add HAVING clause to restrict on long tail (maybe ?)
+ORDER BY
+    uniques_estimate DESC
+-- Limit enforced by hive strict mapreduce setting.
+-- 1000000000 == NO LIMIT !
+LIMIT 1000000000;
diff --git a/oozie/last_access_uniques/monthly/workflow.xml 
b/oozie/last_access_uniques/monthly/workflow.xml
new file mode 100644
index 0000000..0de7763
--- /dev/null
+++ b/oozie/last_access_uniques/monthly/workflow.xml
@@ -0,0 +1,124 @@
+<workflow-app name="last_access_uniques_monthly-${year}-${month}-wf" 
xmlns="uri:oozie:workflow:0.4">
+    <parameters>
+
+        <!-- Default values for oozie settings -->
+        <property>
+            <name>oozie_launcher_queue_name</name>
+            <value>${queue_name}</value>
+        </property>
+        <property>
+            <name>oozie_launcher_memory</name>
+            <value>256</value>
+        </property>
+
+
+        <!-- Required properties -->
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+        <property><name>queue_name</name></property>
+        <property>
+            <name>hive_site_xml</name>
+            <description>hive-site.xml file path in HDFS</description>
+        </property>
+        <!-- specifying parameter values in file to test running -->
+        <property>
+            <name>source_table</name>
+            <description>Hive table to read data from.</description>
+        </property>
+        <property>
+            <name>destination_table</name>
+            <description>The destinaton table to store uniques data 
in.</description>
+        </property>
+        <property>
+            <name>year</name>
+            <description>The partition's year</description>
+        </property>
+        <property>
+            <name>month</name>
+            <description>The partition's month</description>
+        </property>
+        <property>
+            <name>mark_directory_done_workflow_file</name>
+            <description>Workflow for marking a directory done</description>
+        </property>
+        <property>
+            <name>send_error_email_workflow_file</name>
+            <description>Workflow for sending an email</description>
+        </property>
+    </parameters>
+
+    <start to="compute_last_access_uniques_monthly"/>
+
+    <action name="compute_last_access_uniques_monthly">
+        <hive xmlns="uri:oozie:hive-action:0.2">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <!--make sure oozie:launcher runs in a low priority queue -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+                <property>
+                    <name>hive.exec.scratchdir</name>
+                    <value>/tmp/hive-${user}</value>
+                </property>
+            </configuration>
+            <script>last_access_uniques_monthly.hql</script>
+            <param>source_table=${source_table}</param>
+            <param>destination_table=${destination_table}</param>
+            <param>year=${year}</param>
+            <param>month=${month}</param>
+        </hive>
+        <ok to="mark_dataset_done"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="mark_dataset_done">
+        <sub-workflow>
+            <app-path>${mark_directory_done_workflow_file}</app-path>
+            <configuration>
+                <property>
+                    <name>directory</name>
+                    <value>${destination_dataset_directory}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="end"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="send_error_email">
+        <sub-workflow>
+            <app-path>${send_error_email_workflow_file}</app-path>
+            <propagate-configuration/>
+            <configuration>
+                <property>
+                    <name>parent_name</name>
+                    <value>${wf:name()}</value>
+                </property>
+                <property>
+                    <name>parent_failed_action</name>
+                    <value>${wf:lastErrorNode()}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="kill"/>
+        <error to="kill"/>
+    </action>
+
+    <kill name="kill">
+        <message>Action failed, error 
message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <end name="end"/>
+</workflow-app>

-- 
To view, visit https://gerrit.wikimedia.org/r/268087
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I423c90a52e87a701d584fc12c88729bd564baf70
Gerrit-PatchSet: 2
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Joal <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to