Ejegg has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/258058

Change subject: Oozie/Hive scripts to get email tracking data
......................................................................

Oozie/Hive scripts to get email tracking data

TODO: move files from hdfs, sync to someplace FR can see 'em

Bug: T114010
Change-Id: I460171a761c568a59aad27c55e650a7fbf2007a7
---
A email_links/coordinator.properties
A email_links/coordinator.xml
A email_links/requestparams.hql
A email_links/workflow.properties
A email_links/workflow.xml
5 files changed, 235 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/fundraising/tools 
refs/changes/58/258058/1

diff --git a/email_links/coordinator.properties 
b/email_links/coordinator.properties
new file mode 100644
index 0000000..caba8dd
--- /dev/null
+++ b/email_links/coordinator.properties
@@ -0,0 +1,10 @@
+# Oozie coordinator properties for the DonateWiki email-link extraction job.
+# NOTE(review): the app is deployed under hdfs /tmp; the commit-message TODO
+# says this should move to a permanent location.
+oozie.coord.application.path=hdfs://analytics-hadoop/tmp/extract-donatewiki-params
+# NOTE(review): ${refinery_directory} is not defined in this file; it must be
+# supplied externally (e.g. -Drefinery_directory=...) or submission will fail.
+hive_site_xml=${refinery_directory}/oozie/util/hive/hive-site.xml
+name_node=hdfs://analytics-hadoop
+oozie_directory=${name_node}/wmf/refinery/current/oozie
+oozie.use.system.libpath=true
+oozie.action.external.stats.write=true
+# Coordinator run window; frequency (hourly) is set in coordinator.xml.
+start_time=2015-12-08T22:00Z
+end_time=2050-01-01T06:00Z
+# Refinery-maintained webrequest dataset definitions drive the input events.
+webrequest_data_directory=${name_node}/wmf/data/wmf/webrequest
+webrequest_datasets_file=${oozie_directory}/webrequest/datasets.xml
diff --git a/email_links/coordinator.xml b/email_links/coordinator.xml
new file mode 100644
index 0000000..7581e17
--- /dev/null
+++ b/email_links/coordinator.xml
@@ -0,0 +1,51 @@
+<!-- Hourly coordinator: waits for each refined webrequest_text partition and
+     launches the DonateWiki email-link extraction workflow for that hour. -->
+<coordinator-app
+    xmlns="uri:oozie:coordinator:0.1"
+    name="extract-donatewiki-params-coord"
+    frequency="${coord:hours(1)}"
+    start="${start_time}"
+    end="${end_time}"
+    timezone="UTC">
+
+    <datasets>
+        <include>${webrequest_datasets_file}</include>
+    </datasets>
+
+    <!-- Block each run until the current hour's webrequest_text partition
+         exists, so the Hive query never reads an incomplete partition. -->
+    <input-events>
+        <data-in name="text_refined_input" dataset="webrequest_text">
+            <instance>${coord:current(0)}</instance>
+        </data-in>
+    </input-events>
+
+    <action>
+        <workflow>
+            <!-- NOTE(review): hard-coded /tmp deployment path; per the
+                 commit-message TODO this should move out of hdfs /tmp. -->
+            
<app-path>hdfs://analytics-hadoop/tmp/extract-donatewiki-params</app-path>
+            <configuration>
+                <!-- Partition selectors derived from the nominal run time;
+                     formats y/M/d/H produce unpadded values matching the
+                     int-typed webrequest partition columns. -->
+                <property>
+                    <name>year</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"y")}</value>
+                </property>
+                <property>
+                    <name>month</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"M")}</value>
+                </property>
+                <property>
+                    <name>day</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"d")}</value>
+                </property>
+                <property>
+                    <name>hour</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"H")}</value>
+                </property>
+                <!-- Cluster endpoints required by workflow.xml. NOTE(review):
+                     values are duplicated from the properties files rather
+                     than referenced -- keep them in sync. -->
+                <property>
+                    <name>name_node</name>
+                    <value>hdfs://analytics-hadoop</value>
+                </property>
+                <property>
+                    <name>job_tracker</name>
+                    <value>resourcemanager.analytics.eqiad.wmnet:8032</value>
+                </property>
+            </configuration>
+        </workflow>
+    </action>
+</coordinator-app>
+
diff --git a/email_links/requestparams.hql b/email_links/requestparams.hql
new file mode 100644
index 0000000..db62b8f
--- /dev/null
+++ b/email_links/requestparams.hql
@@ -0,0 +1,91 @@
+USE ejegg;
+
+-- Raw per-request extract of DonateWiki email-tracking parameters.
+-- year/month/day/hour mirror the webrequest partition each row came from.
+CREATE TABLE IF NOT EXISTS donatewiki (
+  `contact_id` string, 
+  `utm_campaign` string, 
+  `utm_source` string, 
+  `link_id` string, 
+  `year` int, 
+  `month` int, 
+  `day` int, 
+  `hour` int);
+
+-- Pull one hour of donate.wikimedia.org landing-page hits carrying a
+-- contact_id, parsing the query string (minus its leading '?') into a map.
+-- link_id is stored as an empty-string placeholder.
+-- NOTE(review): LIKE '%contact_id%' matches the substring anywhere in the
+-- query string, not only as a parameter name -- confirm that is acceptable.
+INSERT INTO TABLE donatewiki
+SELECT params['contact_id'], params['utm_campaign'], params['utm_source'], '', 
year, month, day, hour FROM (
+  SELECT str_to_map(substr(uri_query, 2), '&', '=') AS params, year, month, 
day, hour
+  FROM wmf.webrequest
+  WHERE uri_host='donate.wikimedia.org'
+  AND uri_path='/'
+  AND year=${year} AND month=${month} AND day=${day} AND hour=${hour}
+  AND webrequest_source='text'
+  AND uri_query LIKE '%contact_id%'
+) AS subselect;
+
+-- One row per (contact_id, utm_campaign): the first hour in which each
+-- contact was seen for a given campaign.
+CREATE TABLE IF NOT EXISTS donatewiki_unique (
+  `contact_id` string,
+  `utm_campaign` string,
+  `utm_source` string,
+  `link_id` string,
+  `year` int,
+  `month` int,
+  `day` int,
+  `hour` int);
+
+-- Append this hour's rows whose (contact_id, utm_campaign) pair is not yet
+-- recorded. The anti-join keys belong in ON; the partition restriction
+-- belongs in WHERE. Previously the ${year}/${month}/${day}/${hour} filters
+-- sat inside the LEFT OUTER JOIN's ON clause, which (a) never limited the
+-- scan of donatewiki to the current hour and (b) made rows from older
+-- partitions fail the join and be re-inserted as duplicates on every run.
+INSERT INTO TABLE donatewiki_unique
+SELECT donatewiki.contact_id, donatewiki.utm_campaign, donatewiki.utm_source,
+  donatewiki.link_id, donatewiki.year, donatewiki.month, donatewiki.day,
+  donatewiki.hour
+FROM donatewiki
+LEFT OUTER JOIN donatewiki_unique ON (
+  donatewiki.contact_id = donatewiki_unique.contact_id
+  AND donatewiki.utm_campaign = donatewiki_unique.utm_campaign
+)
+WHERE donatewiki_unique.utm_campaign IS NULL
+  AND donatewiki.year=${year}
+  AND donatewiki.month=${month}
+  AND donatewiki.day=${day}
+  AND donatewiki.hour=${hour}
+-- GROUP BY collapses duplicate requests within the hour to a single row.
+GROUP BY donatewiki.contact_id, donatewiki.utm_campaign,
+  donatewiki.utm_source, donatewiki.link_id, donatewiki.year,
+  donatewiki.month, donatewiki.day, donatewiki.hour;
+
+-- Per-hour counts of unique link opens, stored as CSV text so the files can
+-- be exported for Fundraising (see the commit-message TODO).
+-- NOTE(review): OpenCSVSerde reads every column back as a string; the int
+-- declarations are not enforced on read -- confirm downstream readers cope.
+CREATE TABLE IF NOT EXISTS donatewiki_counts (
+  `utm_campaign` string, 
+  `utm_source` string, 
+  `link_id` string, 
+  `year` int, 
+  `month` int, 
+  `day` int, 
+  `hour` int,
+  `total` int)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES (
+   "separatorChar" = ",",
+   "quoteChar"     = "'",
+   "escapeChar"    = "\\"
+)  
+STORED AS TEXTFILE;
+
+-- The counts table is fully rebuilt from donatewiki_unique on every run.
+TRUNCATE TABLE donatewiki_counts;
+
+INSERT INTO donatewiki_counts
+SELECT utm_campaign, utm_source, link_id, year, month, day, hour, COUNT(*) AS 
total
+FROM donatewiki_unique
+GROUP BY utm_campaign, utm_source, link_id, year, month, day, hour;
+
+-- All-time rollup per (utm_campaign, utm_source, link_id), rebuilt from the
+-- hourly counts on every run; CSV-formatted for export like donatewiki_counts.
+CREATE TABLE IF NOT EXISTS donatewiki_rollup_counts (
+  `utm_campaign` string,
+  `utm_source` string,
+  `link_id` string,
+  `total` int)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES (
+   "separatorChar" = ",",
+   "quoteChar"     = "'",
+   "escapeChar"    = "\\"
+)
+STORED AS TEXTFILE;
+
+TRUNCATE TABLE donatewiki_rollup_counts;
+
+-- Aggregate the hourly counts. The previous version selected FROM
+-- donatewiki_rollup_counts itself -- the table truncated one statement
+-- earlier -- so the rollup always came out empty; read donatewiki_counts.
+INSERT INTO donatewiki_rollup_counts
+SELECT utm_campaign, utm_source, link_id, SUM(total)
+FROM donatewiki_counts
+GROUP BY utm_campaign, utm_source, link_id;
+
+
diff --git a/email_links/workflow.properties b/email_links/workflow.properties
new file mode 100644
index 0000000..c2384e3
--- /dev/null
+++ b/email_links/workflow.properties
@@ -0,0 +1,17 @@
+# Properties for submitting the extraction workflow directly (outside the
+# coordinator), e.g. for local testing.
+name_node                         = hdfs://analytics-hadoop
+job_tracker                       = resourcemanager.analytics.eqiad.wmnet:8032
+queue_name                        = default
+oozie_directory                   = ${name_node}/wmf/refinery/current/oozie
+# for testing locally, this won't work:
+# hive_site_xml                   = ${oozie_directory}/util/hive/hive-site.xml
+# NOTE(review): ${refinery_directory} is not defined in this file; it must be
+# supplied externally (e.g. -Drefinery_directory=...).
+hive_site_xml                     = 
${refinery_directory}/oozie/util/hive/hive-site.xml
+# Workflow app to run.
+oozie.wf.application.path         = 
hdfs://analytics-hadoop/tmp/extract-donatewiki-params/workflow.xml
+oozie.use.system.libpath          = true
+oozie.action.external.stats.write = true
+# parameters
+# Fixed partition values for manual test runs; the coordinator supplies
+# these per hour when running on a schedule.
+year = 2015
+month = 11
+day = 30
+hour = 10
+# Used by workflow.xml to build the hive.exec.scratchdir path.
+user = ejegg
diff --git a/email_links/workflow.xml b/email_links/workflow.xml
new file mode 100644
index 0000000..7fcf9ce
--- /dev/null
+++ b/email_links/workflow.xml
@@ -0,0 +1,66 @@
+<!-- Workflow wrapping a single Hive action that runs requestparams.hql for
+     one webrequest partition (year/month/day/hour). -->
+<workflow-app name="extract-donatewiki-params" xmlns="uri:oozie:workflow:0.4">
+    <parameters>
+        <property>
+            <name>queue_name</name>
+            <value>default</value>
+        </property>
+
+        <!-- Required properties -->
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+
+        <property>
+            <name>hive_site_xml</name>
+            <description>hive-site.xml file path in HDFS</description>
+        </property>
+        <!-- specifying parameter values in file to test running -->
+        <property>
+            <name>year</name>
+            <description>The partition's year</description>
+        </property>
+        <property>
+            <name>month</name>
+            <description>The partition's month</description>
+        </property>
+        <property>
+            <name>day</name>
+            <description>The partition's day</description>
+        </property>
+        <property>
+            <name>hour</name>
+            <description>The partition's hour</description>
+        </property>
+        <!-- NOTE(review): ${user} is referenced in hive.exec.scratchdir
+             below but is not declared here -- it must come from the
+             submitting properties file; consider declaring it. -->
+    </parameters>
+
+    <start to="donatewiki-params"/>
+    <action name="donatewiki-params">
+        <hive xmlns="uri:oozie:hive-action:0.2">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+                <!-- Per-user scratch dir so concurrent runs don't collide. -->
+                <property>
+                    <name>hive.exec.scratchdir</name>
+                    <value>/tmp/hive-${user}</value>
+                </property>
+            </configuration>
+            <script>requestparams.hql</script>
+
+            <!-- NOTE(review): destination_directory is passed but the
+                 requestparams.hql in this change never references it --
+                 presumably intended for a future export step; verify. -->
+            
<param>destination_directory=/tmp/ejegg/${year}-${month}-${day}-${hour}</param>
+            <param>year=${year}</param>
+            <param>month=${month}</param>
+            <param>day=${day}</param>
+            <param>hour=${hour}</param>
+        </hive>
+        <ok to="end"/>
+        <error to="kill"/>
+    </action>
+    <kill name="kill">
+        <message>Action failed, error 
message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    <end name="end"/>
+</workflow-app>

-- 
To view, visit https://gerrit.wikimedia.org/r/258058
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I460171a761c568a59aad27c55e650a7fbf2007a7
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/fundraising/tools
Gerrit-Branch: master
Gerrit-Owner: Ejegg <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to