Joal has uploaded a new change for review.
https://gerrit.wikimedia.org/r/268087
Change subject: Monthly last_access uniques oozie job
......................................................................
Monthly last_access uniques oozie job
Based on WMF-Last-Access cookie and the nocookie
header in X-Analytics.
Bug: 124678
Change-Id: I423c90a52e87a701d584fc12c88729bd564baf70
---
A hive/last_access_uniques/create_last_access_uniques_monthly_table.hql
A oozie/last_access_uniques/README.md
M oozie/last_access_uniques/datasets.xml
A oozie/last_access_uniques/monthly/coordinator.properties
A oozie/last_access_uniques/monthly/coordinator.xml
A oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
A oozie/last_access_uniques/monthly/workflow.xml
7 files changed, 422 insertions(+), 2 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery
refs/changes/87/268087/1
diff --git
a/hive/last_access_uniques/create_last_access_uniques_monthly_table.hql
b/hive/last_access_uniques/create_last_access_uniques_monthly_table.hql
new file mode 100644
index 0000000..b6c8a7f
--- /dev/null
+++ b/hive/last_access_uniques/create_last_access_uniques_monthly_table.hql
@@ -0,0 +1,24 @@
+-- Creates table statement for WMF-Last-Access cookie base uniques monthly
table.
+--
+-- Parameters:
+-- <none>
+--
+-- Usage
+-- hive -f create_last_access_uniques_monthly_table.hql --database wmf
+
+
+CREATE EXTERNAL TABLE IF NOT EXISTS `last_access_uniques_monthly`(
+ `uri_host` string COMMENT 'The lower cased host defining a
project (en.wikipedia.org for instance)',
+ `country` string COMMENT 'Country name of the accessing
agents (computed using maxmind GeoIP database)',
+ `country_code` string COMMENT '2 letter country code',
+ `uniques_underestimate` int COMMENT 'Under estimation of unique devices
seen based on last access cookie, and the nocookies header',
+ `uniques_offset` int COMMENT 'Unique devices offset computed as
1-action sessions without cookies',
+ `uniques_estimate` int COMMENT 'Estimate of unique devices seen as
uniques_underestimate plus offset'
+)
+PARTITIONED BY (
+ `year` int COMMENT 'Unpadded year of requests',
+ `month` int COMMENT 'Unpadded month of requests'
+)
+STORED AS PARQUET
+LOCATION '/wmf/data/wmf/last_access_uniques/monthly'
+;
diff --git a/oozie/last_access_uniques/README.md
b/oozie/last_access_uniques/README.md
new file mode 100644
index 0000000..ea5affd
--- /dev/null
+++ b/oozie/last_access_uniques/README.md
@@ -0,0 +1,23 @@
+Includes daily and monthly hive jobs to calculate the last access cookie based
uniques.
+
+These are currently launched as:
+
+```
+sudo -u hdfs oozie job --oozie $OOZIE_URL \
+ -Drefinery_directory=hdfs://analytics-hadoop$(hdfs dfs -ls -d
/wmf/refinery/2016* | tail -n 1 | awk '{print $NF}') \
+ -Dqueue_name=production \
+ -Doozie_launcher_queue_name=production \
+ -Doozie_launcher_memory=256 \
+ -Dstart_time=2015-12-01T00:00Z \
+ -config daily/coordinator.properties \
+ -run
+
+sudo -u hdfs oozie job --oozie $OOZIE_URL \
+ -Drefinery_directory=hdfs://analytics-hadoop$(hdfs dfs -ls -d
/wmf/refinery/2016* | tail -n 1 | awk '{print $NF}') \
+ -Dqueue_name=production \
+ -Doozie_launcher_queue_name=production \
+ -Doozie_launcher_memory=256 \
+ -Dstart_time=2015-12-01T00:00Z \
+ -config monthly/coordinator.properties \
+ -run
+```
\ No newline at end of file
diff --git a/oozie/last_access_uniques/datasets.xml
b/oozie/last_access_uniques/datasets.xml
index 2790db0..51d96e9 100644
--- a/oozie/last_access_uniques/datasets.xml
+++ b/oozie/last_access_uniques/datasets.xml
@@ -13,8 +13,8 @@
<datasets>
<!--
- The last_access_uniques_daily dataset contains uniques devices per project
- host and country, counted using the last_access method.
+ The last_access_uniques_[daily,monthly] datasets contain uniques devices
+ per project, host and country, counted using the last_access method.
Note that we do not use “${...}” but “${"$"}{...}", as dataset files are
passed to EL twice in cascade, and in the first EL level, ${MONTH}
@@ -31,4 +31,12 @@
<uri-template>${last_access_uniques_data_directory}/daily/year=${YEAR}/month=${"$"}{MONTH
+ 0}/day=${"$"}{DAY + 0}</uri-template>
<done-flag>_SUCCESS</done-flag>
</dataset>
+
+ <dataset name="last_access_uniques_monthly"
+ frequency="${coord:months(1)}"
+ initial-instance="${start_time}"
+ timezone="Universal">
+
<uri-template>${last_access_uniques_data_directory}/monthly/year=${YEAR}/month=${"$"}{MONTH
+ 0}</uri-template>
+ <done-flag>_SUCCESS</done-flag>
+ </dataset>
</datasets>
diff --git a/oozie/last_access_uniques/monthly/coordinator.properties
b/oozie/last_access_uniques/monthly/coordinator.properties
new file mode 100644
index 0000000..d1adc82
--- /dev/null
+++ b/oozie/last_access_uniques/monthly/coordinator.properties
@@ -0,0 +1,44 @@
+
+# Configures a coordinator to generate daily uniques using last_access method
+# on webrequest table.
+# Any of the following properties are overidable with -D.
+# Usage:
+# oozie job -Duser=$USER -Dstart_time=2015-01-05T00:00Z -submit -config
oozie/last_access_uniques/daily/coordinator.properties
+#
+# NOTE: The $oozie_directory must be synced to HDFS so that all relevant
+# .xml files exist there when this job is submitted.
+
+name_node = hdfs://analytics-hadoop
+job_tracker = resourcemanager.analytics.eqiad.wmnet:8032
+queue_name = default
+
+refinery_directory = ${name_node}/wmf/refinery/current
+oozie_directory = ${refinery_directory}/oozie
+hive_site_xml =
${refinery_directory}/oozie/util/hive/hive-site.xml
+
+start_time = 2015-12-01T00:00Z
+# Time to stop running this coordinator. Year 3000 == never!
+stop_time = 3000-01-01T00:00Z
+
+# Sub worflows path
+mark_directory_done_workflow_file =
${oozie_directory}/util/mark_directory_done/workflow.xml
+send_error_email_workflow_file =
${oozie_directory}/util/send_error_email/workflow.xml
+
+source_table = wmf.webrequest
+destination_table = wmf.last_access_uniques_monthly
+
+# HDFS path to webrequest dataset definition
+webrequest_data_directory = ${name_node}/wmf/data/wmf/webrequest
+webrequest_datasets_file = ${oozie_directory}/webrequest/datasets.xml
+
+# HDFS path to last access uniques dataset definition
+last_access_uniques_data_directory =
${name_node}/wmf/data/wmf/last_access_uniques
+last_access_uniques_datasets_file =
${oozie_directory}/last_access_uniques/datasets.xml
+
+user = hdfs
+workflow_file =
${oozie_directory}/last_access_uniques/monthly/workflow.xml
+
+# Coordinator app to run.
+oozie.use.system.libpath = true
+oozie.action.external.stats.write = true
+oozie.coord.application.path =
${oozie_directory}/last_access_uniques/monthly/coordinator.xml
diff --git a/oozie/last_access_uniques/monthly/coordinator.xml
b/oozie/last_access_uniques/monthly/coordinator.xml
new file mode 100644
index 0000000..896d609
--- /dev/null
+++ b/oozie/last_access_uniques/monthly/coordinator.xml
@@ -0,0 +1,84 @@
+<coordinator-app xmlns="uri:oozie:coordinator:0.4"
+ name="last_access_uniques_monthly-coord"
+ frequency="${coord:months(1)}"
+ start="${start_time}"
+ end="${stop_time}"
+ timezone="Universal">
+ <parameters>
+
+ <!-- Required properties. -->
+ <property><name>name_node</name></property>
+ <property><name>job_tracker</name></property>
+ <property><name>mark_directory_done_workflow_file</name></property>
+ <property><name>send_error_email_workflow_file</name></property>
+ <property><name>workflow_file</name></property>
+ <property><name>start_time</name></property>
+ <property><name>stop_time</name></property>
+
+ <property><name>source_table</name></property>
+ <property><name>destination_table</name></property>
+
+ <property><name>webrequest_data_directory</name></property>
+ <property><name>webrequest_datasets_file</name></property>
+ <property><name>last_access_uniques_data_directory</name></property>
+ <property><name>last_access_uniques_datasets_file</name></property>
+ </parameters>
+
+ <controls>
+ <!--(timeout is measured in minutes)-->
+ <timeout>-1</timeout>
+
+ <!-- Heavy on resource so limit parallel execution to 2 -->
+ <concurrency>2</concurrency>
+
+ <throttle>2</throttle>
+ </controls>
+
+ <datasets>
+ <include>${webrequest_datasets_file}</include>
+ <include>${last_access_uniques_datasets_file}</include>
+ </datasets>
+
+ <input-events>
+ <!--
+ Please see datasets definition, the webrequest_mobile
+ and webrequest_text are refined datasets from the raw data.
+ -->
+ <data-in name="mobile" dataset="webrequest_mobile">
+ <start-instance>${coord:current(0)}</start-instance>
+ <end-instance>${coord:current(coord:daysInMonth(0) * 24 -
1)}</end-instance>
+ </data-in>
+ <data-in name="text" dataset="webrequest_text">
+ <start-instance>${coord:current(0)}</start-instance>
+ <end-instance>${coord:current(coord:daysInMonth(0) * 24 -
1)}</end-instance>
+ </data-in>
+ </input-events>
+
+ <output-events>
+ <data-out name="last_access_uniques_monthly_output"
dataset="last_access_uniques_monthly">
+ <instance>${coord:current(0)}</instance>
+ </data-out>
+ </output-events>
+
+ <action>
+ <workflow>
+ <app-path>${workflow_file}</app-path>
+
+ <configuration>
+ <!-- Pass these properties through to the workflow -->
+ <property>
+ <name>year</name>
+ <value>${coord:formatTime(coord:nominalTime(), "y")}</value>
+ </property>
+ <property>
+ <name>month</name>
+ <value>${coord:formatTime(coord:nominalTime(), "M")}</value>
+ </property>
+ <property>
+ <name>destination_dataset_directory</name>
+
<value>${coord:dataOut('last_access_uniques_monthly_output')}</value>
+ </property>
+ </configuration>
+ </workflow>
+ </action>
+ </coordinator-app>
\ No newline at end of file
diff --git a/oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
b/oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
new file mode 100644
index 0000000..6ead49d
--- /dev/null
+++ b/oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
@@ -0,0 +1,113 @@
+-- Generates monthly uniques based on WMF-Last-Access cookie
+--
+-- Parameters:
+-- source_table -- Table containing source data
+-- destination_table -- Table where to right newly computed data
+-- year -- year of the to-be-generated
+-- month -- month of the to-be-generated
+--
+-- Usage
+-- hive -f last_access_uniques_monthly.hql \
+-- -d source_table=wmf.webrequest \
+-- -d destination_table=wmf.last_access_uniques_monthly \
+-- -d year=2016 \
+-- -d month=1
+
+
+
+-- Set parquet compression codec
+SET parquet.compression = SNAPPY;
+
+WITH last_access_dates AS (
+ SELECT
+ year,
+ month,
+ day,
+ lower(uri_host) as uri_host,,
+ geocoded_data['country'] AS country,
+ geocoded_data['country_code'] AS country_code,
+ unix_timestamp(x_analytics_map['WMF-Last-Access'], 'dd-MMM-yyyy') AS
last_access,
+ x_analytics_map['nocookies'] AS nocookies,
+ ip,
+ user_agent,
+ accept_language
+ FROM ${source_table}
+ WHERE x_analytics_map IS NOT NULL
+ AND agent_type = 'user'
+ AND is_pageview = TRUE
+ AND webrequest_source IN ('mobile','text')
+ AND year = ${year}
+ AND month = ${month}
+),
+
+-- Only keeping clients having 1 event without cookies (fresh sessions)
+fresh_sessions_aggregated AS (
+ SELECT
+ uri_host,
+ country_code,
+ COUNT(1) AS uniques_offset
+ FROM (
+ SELECT
+ hash(ip, user_agent, accept_language, uri_host) AS id,
+ uri_host,
+ country_code,
+ COUNT(1) AS cardinal
+ FROM
+ last_access_dates
+ WHERE
+ -- Only keeping clients NOT having cookies (yet)
+ nocookies is NOT NULL
+ GROUP BY
+ hash(ip, user_agent, accept_language, uri_host),
+ uri_host,
+ country_code
+ -- Only keeping clients having done 1 event with no cookies
+ HAVING COUNT(1) = 1
+ ) fresh_sessions
+ GROUP BY
+ uri_host,
+ country_code
+)
+
+INSERT OVERWRITE TABLE ${destination_table}
+ PARTITION(year = ${year}, month = ${month})
+SELECT
+ la.uri_host,
+ la.country,
+ la.country_code,
+ SUM(CASE
+ -- Last access not set and client accept cookies --> first visit, count
+ WHEN (la.last_access IS NULL AND la.nocookies is NULL) THEN 1
+ -- Last access set and date before today --> First visit today, count
+ WHEN ((la.last_access IS NOT NULL)
+ AND (la.last_access < unix_timestamp(CONCAT('${year}-',
LPAD(${month}, 2, '0'), '-01'), 'yyyy-MM-dd'))) THEN 1
+ -- Other cases, don't
+ ELSE 0
+ END) AS uniques_underestimate,
+ oenca.uniques_offset AS uniques_offset,
+ SUM(CASE
+ -- Last access not set and client accept cookies --> first visit, count
+ WHEN (la.last_access IS NULL AND la.nocookies is NULL) THEN 1
+ -- Last access set and date before today --> First visit today, count
+ WHEN ((la.last_access IS NOT NULL)
+ AND (la.last_access < unix_timestamp(CONCAT('${year}-',
LPAD('${month}', 2, '0'), '-01'), 'yyyy-MM-dd'))) THEN 1
+ -- Other cases, don't
+ ELSE 0
+ END) + oenca.uniques_offset AS uniques_estimate
+FROM
+ last_access_dates AS la
+ INNER JOIN fresh_sessions_aggregated AS oenca
+ ON (oenca.uri_host = la.uri_host
+ AND oenca.country_code = la.country_code)
+GROUP BY
+ la.uri_host,
+ la.country,
+ la.country_code,
+ oenca.uniques_offset
+-- TODO
+-- Add HAVING clause to restrict on long tail (maybe ?)
+ORDER BY
+ uniques_estimate DESC
+-- Limit enforced by hive strict mapreduce setting.
+-- 1000000000 == NO LIMIT !
+LIMIT 1000000000;
diff --git a/oozie/last_access_uniques/monthly/workflow.xml
b/oozie/last_access_uniques/monthly/workflow.xml
new file mode 100644
index 0000000..0de7763
--- /dev/null
+++ b/oozie/last_access_uniques/monthly/workflow.xml
@@ -0,0 +1,124 @@
+<workflow-app name="last_access_uniques_monthly-${year}-${month}-wf"
xmlns="uri:oozie:workflow:0.4">
+ <parameters>
+
+ <!-- Default values for oozie settings -->
+ <property>
+ <name>oozie_launcher_queue_name</name>
+ <value>${queue_name}</value>
+ </property>
+ <property>
+ <name>oozie_launcher_memory</name>
+ <value>256</value>
+ </property>
+
+
+ <!-- Required properties -->
+ <property><name>name_node</name></property>
+ <property><name>job_tracker</name></property>
+ <property><name>queue_name</name></property>
+ <property>
+ <name>hive_site_xml</name>
+ <description>hive-site.xml file path in HDFS</description>
+ </property>
+ <!-- specifying parameter values in file to test running -->
+ <property>
+ <name>source_table</name>
+ <description>Hive table to read data from.</description>
+ </property>
+ <property>
+ <name>destination_table</name>
+ <description>The destinaton table to store uniques data
in.</description>
+ </property>
+ <property>
+ <name>year</name>
+ <description>The partition's year</description>
+ </property>
+ <property>
+ <name>month</name>
+ <description>The partition's month</description>
+ </property>
+ <property>
+ <name>mark_directory_done_workflow_file</name>
+ <description>Workflow for marking a directory done</description>
+ </property>
+ <property>
+ <name>send_error_email_workflow_file</name>
+ <description>Workflow for sending an email</description>
+ </property>
+ </parameters>
+
+ <start to="compute_last_access_uniques_monthly"/>
+
+ <action name="compute_last_access_uniques_monthly">
+ <hive xmlns="uri:oozie:hive-action:0.2">
+ <job-tracker>${job_tracker}</job-tracker>
+ <name-node>${name_node}</name-node>
+ <job-xml>${hive_site_xml}</job-xml>
+ <configuration>
+ <!--make sure oozie:launcher runs in a low priority queue -->
+ <property>
+ <name>oozie.launcher.mapred.job.queue.name</name>
+ <value>${oozie_launcher_queue_name}</value>
+ </property>
+ <property>
+ <name>oozie.launcher.mapreduce.map.memory.mb</name>
+ <value>${oozie_launcher_memory}</value>
+ </property>
+ <property>
+ <name>mapreduce.job.queuename</name>
+ <value>${queue_name}</value>
+ </property>
+ <property>
+ <name>hive.exec.scratchdir</name>
+ <value>/tmp/hive-${user}</value>
+ </property>
+ </configuration>
+ <script>last_access_uniques_monthly.hql</script>
+ <param>source_table=${source_table}</param>
+ <param>destination_table=${destination_table}</param>
+ <param>year=${year}</param>
+ <param>month=${month}</param>
+ </hive>
+ <ok to="mark_dataset_done"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="mark_dataset_done">
+ <sub-workflow>
+ <app-path>${mark_directory_done_workflow_file}</app-path>
+ <configuration>
+ <property>
+ <name>directory</name>
+ <value>${destination_dataset_directory}</value>
+ </property>
+ </configuration>
+ </sub-workflow>
+ <ok to="end"/>
+ <error to="send_error_email"/>
+ </action>
+
+ <action name="send_error_email">
+ <sub-workflow>
+ <app-path>${send_error_email_workflow_file}</app-path>
+ <propagate-configuration/>
+ <configuration>
+ <property>
+ <name>parent_name</name>
+ <value>${wf:name()}</value>
+ </property>
+ <property>
+ <name>parent_failed_action</name>
+ <value>${wf:lastErrorNode()}</value>
+ </property>
+ </configuration>
+ </sub-workflow>
+ <ok to="kill"/>
+ <error to="kill"/>
+ </action>
+
+ <kill name="kill">
+ <message>Action failed, error
message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+ </kill>
+
+ <end name="end"/>
+</workflow-app>
--
To view, visit https://gerrit.wikimedia.org/r/268087
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I423c90a52e87a701d584fc12c88729bd564baf70
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Joal <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits