Mforns has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/246851

Change subject: [WIP] Add oozie job to compute browser usage reports
......................................................................

[WIP] Add oozie job to compute browser usage reports

Bug: T88504
Change-Id: I106a9853b07dac5a0efabaf92a9987de2b8a7ec2
---
A oozie/pageview/by_browser/README.md
A oozie/pageview/by_browser/coordinator.properties
A oozie/pageview/by_browser/coordinator.xml
A oozie/pageview/by_browser/pageview_by_browser.hql
A oozie/pageview/by_browser/workflow.xml
5 files changed, 304 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery 
refs/changes/51/246851/1

diff --git a/oozie/pageview/by_browser/README.md 
b/oozie/pageview/by_browser/README.md
new file mode 100644
index 0000000..0c15eb4
--- /dev/null
+++ b/oozie/pageview/by_browser/README.md
@@ -0,0 +1,12 @@
+# Browser usage
+
+This job aggregates pageviews from the pageview_hourly table
+into a new table that holds pageviews broken down by browser
+(and also broken down by access method) in weekly recurrence.
+
+Output is appended into (year, month, day) partitions
+in /wmf/data/wmf/pageview/by_browser
+
+# Outline
+
+...
diff --git a/oozie/pageview/by_browser/coordinator.properties 
b/oozie/pageview/by_browser/coordinator.properties
new file mode 100644
index 0000000..1751e9c
--- /dev/null
+++ b/oozie/pageview/by_browser/coordinator.properties
@@ -0,0 +1,53 @@
+# Configures a coordinator to generate pageviews by browser reports from
+# the pageview_houly data. Any of the following properties are overidable with 
-D.
+# Usage:
+#   oozie job -Duser=$USER -Dstart_time=2015-05-01T00:00Z -submit -config 
oozie/mobile_apps/session_metrics/coordinator.properties
+#
+# NOTE:  The $oozie_directory must be synced to HDFS so that all relevant
+#        .xml files exist there when this job is submitted.
+
+name_node                         = hdfs://analytics-hadoop
+job_tracker                       = resourcemanager.analytics.eqiad.wmnet:8032
+queue_name                        = default
+user                              = hdfs
+
+# Base path in HDFS to refinery.
+# When submitting this job for production, you should
+# override this to point directly at a deployed
+# directory name, and not the 'symbolic' 'current' directory.
+# E.g.  /wmf/refinery/2015-01-05T17.59.18Z--7bb7f07
+refinery_directory                = ${name_node}/wmf/refinery/current
+
+# Base path in HDFS to oozie files.
+# Other files will be used relative to this path.
+oozie_directory                   = ${refinery_directory}/oozie
+
+# HDFS path to coordinator to run for each webrequest_source.
+coordinator_file                  = 
${oozie_directory}/pageview/by_browser/coordinator.xml
+
+# HDFS path to workflow to run.
+workflow_file                     = 
${oozie_directory}/pageview/by_browser/workflow.xml
+
+# HDFS path to pageview dataset definitions
+pageview_datasets_file          = ${oozie_directory}/pageview/datasets.xml
+pageview_data_directory         = ${name_node}/wmf/data/wmf/pageview
+
+# Initial import time of the webrequest dataset.
+start_time                        = 2015-10-04T00:00Z
+
+# Time to stop running this coordinator.  Year 3000 == never!
+stop_time                         = 3000-01-01T00:00Z
+
+# HDFS path to hive-site.xml file.  This is needed to run hive actions.
+hive_site_xml                     = ${oozie_directory}/util/hive/hive-site.xml
+
+# Fully qualified Hive table name.
+source_table                      = wmf.webrequest
+
+# Record version to keep track of changes
+record_version                    = 0.0.1
+
+# Coordinator to start.
+oozie.coord.application.path      = ${coordinator_file}
+oozie.use.system.libpath          = true
+oozie.action.external.stats.write = true
diff --git a/oozie/pageview/by_browser/coordinator.xml 
b/oozie/pageview/by_browser/coordinator.xml
new file mode 100644
index 0000000..be1f776
--- /dev/null
+++ b/oozie/pageview/by_browser/coordinator.xml
@@ -0,0 +1,66 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<coordinator-app xmlns="uri:oozie:coordinator:0.4"
+    name="pageview_by_browser-coord"
+    frequency="${coord:days(7)}"
+    start="${start_time}"
+    end="${stop_time}"
+    timezone="Universal">
+
+    <parameters>
+        <!-- Required properties. -->
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+        <property><name>queue_name</name></property>
+
+        <property><name>workflow_file</name></property>
+        
+        <property><name>start_time</name></property>
+        <property><name>stop_time</name></property>
+
+        <property><name>pageview_datasets_file</name></property>
+        <property><name>pageview_data_directory</name></property>
+
+        <property><name>hive_site_xml</name></property>
+        <property><name>source_table</name></property>
+    </parameters>
+
+    <controls>
+        <timeout>-1</timeout>
+        <concurrency>2</concurrency>
+        <throttle>2</throttle>
+    </controls>
+
+    <datasets>
+        <include>${pageview_datasets_file}</include>
+    </datasets>
+
+    <input-events>
+        <data-in name="pageview_hourly_input" dataset="pageview_hourly">
+            <!-- 7 days of data in hours -->
+            <start-instance>${coord:current(0)}</start-instance>
+            <end-instance>${coord:current(7 * 30 - 1)}</end-instance>
+        </data-in>
+    </input-events>
+
+    <action>
+        <workflow>
+            <app-path>${workflow_file}</app-path>
+            <configuration>
+
+                <property>
+                    <name>year</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"y")}</value>
+                </property>
+                <property>
+                    <name>month</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"M")}</value>
+                </property>
+                <property>
+                    <name>day</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"d")}</value>
+                </property>
+
+            </configuration>
+        </workflow>
+    </action>
+</coordinator-app>
diff --git a/oozie/pageview/by_browser/pageview_by_browser.hql 
b/oozie/pageview/by_browser/pageview_by_browser.hql
new file mode 100644
index 0000000..9ba9025
--- /dev/null
+++ b/oozie/pageview/by_browser/pageview_by_browser.hql
@@ -0,0 +1,66 @@
+-- Parameters:
+--     source_table      -- Fully qualified table name to compute the
+--                          aggregation for.
+--     destination_table -- Fully qualified table name to fill in
+--                          aggregated values.
+--     record_version    -- record_version keeping track of changes
+--                          in the table content definition.
+--     year              -- year of partition to compute aggregation
+--                          for.
+--     month             -- month of partition to compute aggregation
+--                          for.
+--     day               -- day of partition to compute aggregation
+--                          for (always Sunday, weekly recurrence).
+--
+-- Usage:
+--     hive -f pageview_hourly.hql                                \
+--         -d source_table=wmf.webrequest                         \
+--         -d destination_table=wmf.pageview_hourly               \
+--         -d record_version=0.0.1                                \
+--         -d year=2015                                           \
+--         -d month=11                                            \
+--         -d day=1
+--
+
+SET parquet.compression              = SNAPPY;
+SET mapred.reduce.tasks              = 8;
+
+-------------------------
+
+WITH total AS (
+  SELECT
+    SUM(view_count) as view_count_total
+  FROM
+    wmf.projectview_hourly
+  WHERE
+    year = 2015
+    AND month = 10
+    AND day = 15
+    AND hour = 0
+    AND agent_type = 'user'
+)
+SELECT
+  -- Since the total table is small enough to fit in RAM,
+  -- it strongly improves efficiency to use a Map-side.
+  /*+ MAPJOIN(total) */
+  CONCAT(user_agent_map['os_family'], ' ', user_agent_map['os_major']) AS os,
+  CONCAT(user_agent_map['browser_family'], ' ', 
user_agent_map['browser_major']) AS browser,
+  SUM(view_count) * 100 / total.view_count_total AS percent
+FROM
+  wmf.pageview_hourly
+  JOIN total
+WHERE
+  year = 2015
+  AND month = 10
+  AND day = 15
+  AND hour = 0
+  AND agent_type = 'user'
+GROUP BY
+  CONCAT(user_agent_map['os_family'], ' ', user_agent_map['os_major']),
+  CONCAT(user_agent_map['browser_family'], ' ', 
user_agent_map['browser_major']),
+  total.view_count_total
+HAVING
+  (SUM(view_count) * 100 / total.view_count_total) > 0.5
+SORT BY percent DESC
+LIMIT 10
+;
\ No newline at end of file
diff --git a/oozie/pageview/by_browser/workflow.xml 
b/oozie/pageview/by_browser/workflow.xml
new file mode 100644
index 0000000..e41abe0
--- /dev/null
+++ b/oozie/pageview/by_browser/workflow.xml
@@ -0,0 +1,107 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<workflow-app xmlns="uri:oozie:workflow:0.4"
+    name="pageview_by_browser-${year}-${month}-${day}-wf">
+
+    <parameters>
+       <property>
+            <name>format_version</name>
+            <value>0</value>
+            <description>
+                Version number for the output format. Increase this number,
+                if you change the output format or meaning of the columns.
+            </description>
+        </property>
+
+        <!-- Default values for inner oozie settings -->
+        <property>
+            <name>oozie_launcher_queue_name</name>
+            <value>${queue_name}</value>
+        </property>
+        <property>
+            <name>oozie_launcher_memory</name>
+            <value>256</value>
+        </property>
+
+        <!-- Required properties -->
+        <property><name>queue_name</name></property>
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+
+        <!-- Aggregation related configuration properties-->
+        <property>
+            <name>hive_script</name>
+            <!-- This is relative to the containing directory of this file. -->
+            <value>pageview_by_browser.hql</value>
+            <description>Hive script to run.</description>
+        </property>
+        <property>
+            <name>hive_site_xml</name>
+            <description>hive-site.xml file path in HDFS.</description>
+        </property>
+        <property>
+            <name>source_table</name>
+            <description>Hive table to refine.</description>
+        </property>
+        <property>
+            <name>output_directory</name>
+            <description>Directory where the results of the job are written 
into.</description>
+        </property>
+        <property>
+            <name>year</name>
+            <description>Year of date from which last 7 days are 
calculated.</description>
+        </property>
+        <property>
+            <name>month</name>
+            <description>Month of date from which last 7 days are 
calculated.</description>
+        </property>
+        <property>
+            <name>day</name>
+            <description>Day of month from which last 7 days are 
calculated.</description>
+        </property>
+    </parameters>
+
+    <start to="aggregate"/>
+
+    <action name="aggregate">
+        <hive xmlns="uri:oozie:hive-action:0.2">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+                <!--make sure oozie:launcher runs in a low priority queue -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+                <property>
+                    <name>hive.exec.scratchdir</name>
+                    <value>/tmp/hive-${user}</value>
+                </property>
+            </configuration>
+
+            <script>${hive_script}</script>
+            <param>source_table=${source_table}</param>
+            <param>output_directory=${output_directory}</param>
+            <param>record_version=${record_version}</param>
+            <param>year=${year}</param>
+            <param>month=${month}</param>
+            <param>day=${day}</param>
+        </hive>
+
+        <ok to="end"/>
+        <error to="kill"/>
+    </action>
+
+    <kill name="kill">
+        <message>Action failed, error 
message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    <end name="end"/>
+</workflow-app>

-- 
To view, visit https://gerrit.wikimedia.org/r/246851
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I106a9853b07dac5a0efabaf92a9987de2b8a7ec2
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Mforns <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to