Joal has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/347611 )

Change subject: [WIP] Add oozie jobs loading uniques in druid
......................................................................

[WIP] Add oozie jobs loading uniques in druid

The oozie files are in the oozie/last_access_uniques/druid
folder, and provide both daily and monthly jobs.

Bug: T159471
Change-Id: I043f841a7f62cf1035aa09e84934bfe6aabe95e2
---
A oozie/last_access_uniques/druid/README.md
A oozie/last_access_uniques/druid/daily/coordinator.properties
A oozie/last_access_uniques/druid/daily/coordinator.xml
A oozie/last_access_uniques/druid/daily/generate_daily_druid_uniques.hql
A oozie/last_access_uniques/druid/daily/workflow.xml
A oozie/last_access_uniques/druid/monthly/coordinator.properties
A oozie/last_access_uniques/druid/monthly/coordinator.xml
A oozie/last_access_uniques/druid/monthly/generate_monthly_druid_uniques.hql
A oozie/last_access_uniques/druid/monthly/load_uniques_monthly.json.template
A oozie/last_access_uniques/druid/monthly/workflow.xml
10 files changed, 872 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery 
refs/changes/11/347611/1

diff --git a/oozie/last_access_uniques/druid/README.md 
b/oozie/last_access_uniques/druid/README.md
new file mode 100644
index 0000000..71a972a
--- /dev/null
+++ b/oozie/last_access_uniques/druid/README.md
@@ -0,0 +1,14 @@
+Oozie jobs to schedule importing last access uniques in druid.
+Daily and monthly folders contain coordinator to load daily
+and monthly uniques in druid.
+
+The workflow launches a hive action generating json data,
+then launches the druid indexation on these files.
+The script waits for indexation to finish, then deletes the json files.
+
+Example command for running the coordinator on command line:
+
+    oozie job -run \
+         -config daily/coordinator.properties \
+         -D refinery_directory=hdfs://analytics-hadoop/wmf/refinery/current \
+         -D 
spark_job_jar=hdfs://analytics-hadoop/wmf/refinery/current/artifacts/refinery-job.jar
diff --git a/oozie/last_access_uniques/druid/daily/coordinator.properties 
b/oozie/last_access_uniques/druid/daily/coordinator.properties
new file mode 100644
index 0000000..bc0e92f
--- /dev/null
+++ b/oozie/last_access_uniques/druid/daily/coordinator.properties
@@ -0,0 +1,64 @@
+# Configures a coordinator to generate a day of JSON daily uniques and load 
them in druid.
+# Any of the following properties are override-able with -D.
+# Usage:
+#   oozie job -Dstart_time=2016-06-01T00:00Z -submit -config 
oozie/last_access_uniques/druid/daily/coordinator.properties
+#
+# NOTE:  The $refinery_directory must be synced to HDFS so that all relevant
+#        .xml files exist there when this job is submitted.
+
+name_node                           = hdfs://analytics-hadoop
+job_tracker                         = 
resourcemanager.analytics.eqiad.wmnet:8032
+queue_name                          = default
+
+#Default user
+user                                = hdfs
+
+# Base path in HDFS to refinery.
+# When submitting this job for production, you should override this to point 
directly at a deployed
+# directory name, and not the 'symbolic' 'current' directory. E.g. 
/wmf/refinery/2015-01-05T17.59.18Z--7bb7f07
+refinery_directory                  = ${name_node}/wmf/refinery/current
+
+# Base path in HDFS to oozie files.
+# Other files will be used relative to this path.
+oozie_directory                     = ${refinery_directory}/oozie
+
+# HDFS path to the hive-site.xml file to use
+hive_site_xml                       = ${name_node}/user/hive/hive-site.xml
+
+# HDFS path to coordinator to run.
+coordinator_file                    = 
${oozie_directory}/last_access_uniques/druid/daily/coordinator.xml
+# HDFS path to workflow to run.
+workflow_file                       = 
${oozie_directory}/last_access_uniques/druid/daily/workflow.xml
+
+# HDFS path to last_access_uniques dataset definitions
+last_access_uniques_datasets_file   = 
${oozie_directory}/last_access_uniques/datasets.xml
+last_access_uniques_data_directory  = 
${name_node}/wmf/data/wmf/last_access_uniques
+
+# last_access_uniques_daily table name
+last_access_uniques_daily_table     = wmf.last_access_uniques_daily
+
+# Initial import time of the last_access_uniques_daily dataset.
+start_time                          = 2015-12-17T00:00Z
+
+# Time to stop running this coordinator.  Year 3000 == never!
+stop_time                           = 3000-01-01T00:00Z
+
+# Temporary directory
+temporary_directory                 = ${name_node}/tmp
+
+# HDFS path to template to use.
+druid_template_file                 = 
${oozie_directory}/last_access_uniques/druid/daily/load_uniques_daily.json.template
+# Druid overlord url
+druid_overlord_url                  = http://druid1001.eqiad.wmnet:8090
+
+# HDFS path to workflow to load druid
+load_druid_workflow_file            = 
${oozie_directory}/util/druid/load/workflow.xml
+# HDFS path to workflow to mark a directory as done
+mark_directory_done_workflow_file   = 
${oozie_directory}/util/mark_directory_done/workflow.xml
+# Workflow to send an error email
+send_error_email_workflow_file      = 
${oozie_directory}/util/send_error_email/workflow.xml
+
+# Coordinator to start.
+oozie.coord.application.path        = ${coordinator_file}
+oozie.use.system.libpath            = true
+oozie.action.external.stats.write   = true
diff --git a/oozie/last_access_uniques/druid/daily/coordinator.xml 
b/oozie/last_access_uniques/druid/daily/coordinator.xml
new file mode 100644
index 0000000..f98a725
--- /dev/null
+++ b/oozie/last_access_uniques/druid/daily/coordinator.xml
@@ -0,0 +1,82 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<coordinator-app xmlns="uri:oozie:coordinator:0.4"
+    name="last_access_uniques-druid-daily-coord"
+    frequency="${coord:days(1)}"
+    start="${start_time}"
+    end="${stop_time}"
+    timezone="Universal">
+
+    <parameters>
+
+        <!-- Required properties. -->
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+        <property><name>queue_name</name></property>
+
+        <property><name>hive_site_xml</name></property>
+        <property><name>workflow_file</name></property>
+        <property><name>last_access_uniques_datasets_file</name></property>
+        <property><name>last_access_uniques_data_directory</name></property>
+        <property><name>last_access_uniques_daily_table</name></property>
+
+        <property><name>start_time</name></property>
+        <property><name>stop_time</name></property>
+
+        <property><name>druid_template_file</name></property>
+        <property><name>druid_overlord_url</name></property>
+
+        <property><name>temporary_directory</name></property>
+
+        <property><name>load_druid_workflow_file</name></property>
+        <property><name>mark_directory_done_workflow_file</name></property>
+        <property><name>send_error_email_workflow_file</name></property>
+
+    </parameters>
+
+    <controls>
+        <!--(timeout is measured in minutes)-->
+        <timeout>-1</timeout>
+
+        <!-- Set low concurrency because the job is resource-hungry -->
+        <concurrency>1</concurrency>
+
+        <throttle>2</throttle>
+
+    </controls>
+
+    <datasets>
+        <include>${last_access_uniques_datasets_file}</include>
+    </datasets>
+
+    <input-events>
+        <data-in name="uniques_daily_input" 
dataset="last_access_uniques_daily">
+            <!-- Dataset uses day as frequency -->
+            <start-instance>${coord:current(0)}</start-instance>
+            <end-instance>${coord:current(1)}</end-instance>
+        </data-in>
+    </input-events>
+
+    <action>
+        <workflow>
+            <app-path>${workflow_file}</app-path>
+            <configuration>
+                <property>
+                    <name>year</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"y")}</value>
+                </property>
+                <property>
+                    <name>month</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"M")}</value>
+                </property>
+                <property>
+                    <name>day</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"d")}</value>
+                </property>
+                <property>
+                    <name>loaded_period</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"yyyy-MM-dd")}/${coord:formatTime(coord:dateOffset(coord:nominalTime(), 1, 
"DAY"), "yyyy-MM-dd")}</value>
+                </property>
+            </configuration>
+        </workflow>
+    </action>
+</coordinator-app>
\ No newline at end of file
diff --git 
a/oozie/last_access_uniques/druid/daily/generate_daily_druid_uniques.hql 
b/oozie/last_access_uniques/druid/daily/generate_daily_druid_uniques.hql
new file mode 100644
index 0000000..1d670d4
--- /dev/null
+++ b/oozie/last_access_uniques/druid/daily/generate_daily_druid_uniques.hql
@@ -0,0 +1,54 @@
+-- Extracts one day of json formatted daily uniques to be loaded in Druid
+--
+-- Usage:
+--     hive -f generate_daily_druid_uniques.hql \
+--         -d source_table=wmf.last_access_uniques_daily \
+--         -d destination_directory=/tmp/druid/daily_json_uniques \
+--         -d year=2016 \
+--         -d month=7 \
+--         -d day=10
+--
+
+SET hive.exec.compress.output=true;
+SET 
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
+
+
+ADD JAR /usr/lib/hive-hcatalog/share/hcatalog/hive-hcatalog-core.jar;
+
+
+DROP TABLE IF EXISTS tmp_daily_druid_uniques_${year}_${month}_${day};
+
+
+CREATE EXTERNAL TABLE IF NOT EXISTS 
tmp_daily_druid_uniques_${year}_${month}_${day} (
+    `dt`                     string,
+    `host`                   string,
+    `country`                string,
+    `country_code`           string,
+    `uniques_underestimate`  bigint,
+    `uniques_offset`         bigint,
+    `uniques_estimate`       bigint
+)
+ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
+STORED AS TEXTFILE
+LOCATION '${destination_directory}';
+
+
+INSERT OVERWRITE TABLE tmp_daily_druid_uniques_${year}_${month}_${day}
+SELECT
+    CONCAT(
+        LPAD(year, 4, '0'), '-',
+        LPAD(month, 2, '0'), '-',
+        LPAD(day, 2, '0'), 'T00:00:00Z') AS dt,
+    uri_host AS host,
+    country AS country,
+    country_code AS country_code,
+    uniques_underestimate as uniques_underestimate,
+    uniques_offset AS uniques_offset,
+    uniques_estimate AS uniques_estimate
+FROM ${source_table}
+WHERE year = ${year}
+    AND month = ${month}
+    AND day = ${day};
+
+
+DROP TABLE IF EXISTS tmp_daily_druid_uniques_${year}_${month}_${day};
diff --git a/oozie/last_access_uniques/druid/daily/workflow.xml 
b/oozie/last_access_uniques/druid/daily/workflow.xml
new file mode 100644
index 0000000..df0fe3a
--- /dev/null
+++ b/oozie/last_access_uniques/druid/daily/workflow.xml
@@ -0,0 +1,200 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<workflow-app xmlns="uri:oozie:workflow:0.4"
+    name="last_access_uniques-druid-daily-wf-${year}-${month}-${day}">
+
+    <parameters>
+
+        <!-- Default values for inner oozie settings -->
+        <property>
+            <name>oozie_launcher_queue_name</name>
+            <value>${queue_name}</value>
+        </property>
+        <property>
+            <name>oozie_launcher_memory</name>
+            <value>2048</value>
+        </property>
+
+        <!-- Required properties -->
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+        <property><name>queue_name</name></property>
+
+        <property>
+            <name>hive_site_xml</name>
+            <description>hive-site.xml file path in HDFS</description>
+        </property>
+
+        <property>
+            <name>last_access_uniques_daily_table</name>
+            <description>The hive last access uniques daily table to 
use</description>
+        </property>
+
+        <property>
+            <name>year</name>
+            <description>The partition's year</description>
+        </property>
+        <property>
+            <name>month</name>
+            <description>The partition's month</description>
+        </property>
+        <property>
+            <name>day</name>
+            <description>The partition's day</description>
+        </property>
+
+        <property>
+            <name>loaded_period</name>
+            <description>Period of the data loaded in interval format 
(yyyy-MM-dd/yyyy-MM-dd)</description>
+        </property>
+        <property>
+            <name>druid_template_file</name>
+            <description>File to use as a template to define druid loading 
(absolute since used by load_druid sub-workflow)</description>
+        </property>
+        <property>
+            <name>druid_overlord_url</name>
+            <description>The druid overlord url used for loading</description>
+        </property>
+        <property>
+            <name>temporary_directory</name>
+            <description>A directory in HDFS for temporary files</description>
+        </property>
+        <property>
+            <name>load_druid_workflow_file</name>
+            <description>Workflow for loading druid</description>
+        </property>
+        <property>
+            <name>mark_directory_done_workflow_file</name>
+            <description>Workflow for marking a directory done</description>
+        </property>
+        <property>
+            <name>send_error_email_workflow_file</name>
+            <description>Workflow for sending an email</description>
+        </property>
+
+    </parameters>
+
+    <start to="generate_json_uniques_daily"/>
+
+    <action name="generate_json_uniques_daily">
+        <hive xmlns="uri:oozie:hive-action:0.2">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <!--make sure oozie:launcher runs in a low priority queue -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+                <!--Let hive decide on the number of reducers -->
+                <property>
+                    <name>mapred.reduce.tasks</name>
+                    <value>-1</value>
+                </property>
+                <property>
+                    <name>hive.exec.scratchdir</name>
+                    <value>/tmp/hive-${user}</value>
+                </property>
+            </configuration>
+            <script>generate_daily_druid_uniques.hql</script>
+            <param>source_table=${last_access_uniques_daily_table}</param>
+            
<param>destination_directory=${temporary_directory}/${wf:id()}-daily-druid-uniques-${year}-${month}-${day}</param>
+            <param>year=${year}</param>
+            <param>month=${month}</param>
+            <param>day=${day}</param>
+        </hive>
+        <ok to="mark_json_uniques_daily_dataset_done" />
+        <error to="send_error_email" />
+    </action>
+
+    <action name="mark_json_uniques_daily_dataset_done">
+        <sub-workflow>
+            <app-path>${mark_directory_done_workflow_file}</app-path>
+            <configuration>
+                <property>
+                    <name>directory</name>
+                    
<value>${temporary_directory}/${wf:id()}-daily-druid-uniques-${year}-${month}-${day}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="index_druid"/>
+        <error to="send_error_email"/>
+    </action>
+
+
+    <action name="index_druid">
+        <sub-workflow>
+            <app-path>${load_druid_workflow_file}</app-path>
+            <propagate-configuration/>
+            <configuration>
+                <property>
+                    <name>source_directory</name>
+                    
<value>${temporary_directory}/${wf:id()}-daily-druid-uniques-${year}-${month}-${day}</value>
+                </property>
+                <property>
+                    <name>template_file</name>
+                    <value>${druid_template_file}</value>
+                </property>
+                <property>
+                    <name>loaded_period</name>
+                    <value>${loaded_period}</value>
+                </property>
+                <property>
+                    <name>druid_overlord_url</name>
+                    <value>${druid_overlord_url}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="remove_temporary_data"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="remove_temporary_data">
+        <fs>
+            <delete 
path="${temporary_directory}/${wf:id()}-daily-druid-uniques-${year}-${month}-${day}"/>
+        </fs>
+        <ok to="end"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="send_error_email">
+        <sub-workflow>
+            <app-path>${send_error_email_workflow_file}</app-path>
+            <propagate-configuration/>
+            <configuration>
+                <property>
+                    <name>parent_name</name>
+                    <value>${wf:name()}</value>
+                </property>
+                <property>
+                    <name>parent_failed_action</name>
+                    <value>${wf:lastErrorNode()}</value>
+                </property>
+                <property>
+                    <name>parent_error_code</name>
+                    <value>${wf:errorCode(wf:lastErrorNode())}</value>
+                </property>
+                <property>
+                    <name>parent_error_message</name>
+                    <value>${wf:errorMessage(wf:lastErrorNode())}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="kill"/>
+        <error to="kill"/>
+    </action>
+
+    <kill name="kill">
+        <message>Action failed, error 
message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    <end name="end"/>
+</workflow-app>
diff --git a/oozie/last_access_uniques/druid/monthly/coordinator.properties 
b/oozie/last_access_uniques/druid/monthly/coordinator.properties
new file mode 100644
index 0000000..fc85bb8
--- /dev/null
+++ b/oozie/last_access_uniques/druid/monthly/coordinator.properties
@@ -0,0 +1,64 @@
+# Configures a coordinator to generate a month of JSON monthly uniques and 
load them in druid.
+# Any of the following properties are override-able with -D.
+# Usage:
+#   oozie job -Dstart_time=2016-06-01T00:00Z -submit -config 
oozie/last_access_uniques/druid/monthly/coordinator.properties
+#
+# NOTE:  The $refinery_directory must be synced to HDFS so that all relevant
+#        .xml files exist there when this job is submitted.
+
+name_node                           = hdfs://analytics-hadoop
+job_tracker                         = 
resourcemanager.analytics.eqiad.wmnet:8032
+queue_name                          = default
+
+#Default user
+user                                = hdfs
+
+# Base path in HDFS to refinery.
+# When submitting this job for production, you should override this to point 
directly at a deployed
+# directory name, and not the 'symbolic' 'current' directory. E.g. 
/wmf/refinery/2015-01-05T17.59.18Z--7bb7f07
+refinery_directory                  = ${name_node}/wmf/refinery/current
+
+# Base path in HDFS to oozie files.
+# Other files will be used relative to this path.
+oozie_directory                     = ${refinery_directory}/oozie
+
+# HDFS path to the hive-site.xml file to use
+hive_site_xml                       = ${name_node}/user/hive/hive-site.xml
+
+# HDFS path to coordinator to run.
+coordinator_file                    = 
${oozie_directory}/last_access_uniques/druid/monthly/coordinator.xml
+# HDFS path to workflow to run.
+workflow_file                       = 
${oozie_directory}/last_access_uniques/druid/monthly/workflow.xml
+
+# HDFS path to last_access_uniques dataset definitions
+last_access_uniques_datasets_file   = 
${oozie_directory}/last_access_uniques/datasets.xml
+last_access_uniques_data_directory  = 
${name_node}/wmf/data/wmf/last_access_uniques
+
+# last_access_uniques_monthly table name
+last_access_uniques_monthly_table   = wmf.last_access_uniques_monthly
+
+# Initial import time of the last_access_uniques_monthly dataset.
+start_time                          = 2016-01-01T00:00Z
+
+# Time to stop running this coordinator.  Year 3000 == never!
+stop_time                           = 3000-01-01T00:00Z
+
+# Temporary directory
+temporary_directory                 = ${name_node}/tmp
+
+# HDFS path to template to use.
+druid_template_file                 = 
${oozie_directory}/last_access_uniques/druid/monthly/load_uniques_monthly.json.template
+# Druid overlord url
+druid_overlord_url                  = http://druid1001.eqiad.wmnet:8090
+
+# HDFS path to workflow to load druid
+load_druid_workflow_file            = 
${oozie_directory}/util/druid/load/workflow.xml
+# HDFS path to workflow to mark a directory as done
+mark_directory_done_workflow_file   = 
${oozie_directory}/util/mark_directory_done/workflow.xml
+# Workflow to send an error email
+send_error_email_workflow_file      = 
${oozie_directory}/util/send_error_email/workflow.xml
+
+# Coordinator to start.
+oozie.coord.application.path        = ${coordinator_file}
+oozie.use.system.libpath            = true
+oozie.action.external.stats.write   = true
diff --git a/oozie/last_access_uniques/druid/monthly/coordinator.xml 
b/oozie/last_access_uniques/druid/monthly/coordinator.xml
new file mode 100644
index 0000000..2172bd9
--- /dev/null
+++ b/oozie/last_access_uniques/druid/monthly/coordinator.xml
@@ -0,0 +1,78 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<coordinator-app xmlns="uri:oozie:coordinator:0.4"
+    name="last_access_uniques-druid-monthly-coord"
+    frequency="${coord:months(1)}"
+    start="${start_time}"
+    end="${stop_time}"
+    timezone="Universal">
+
+    <parameters>
+
+        <!-- Required properties. -->
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+        <property><name>queue_name</name></property>
+
+        <property><name>hive_site_xml</name></property>
+        <property><name>workflow_file</name></property>
+        <property><name>last_access_uniques_datasets_file</name></property>
+        <property><name>last_access_uniques_data_directory</name></property>
+        <property><name>last_access_uniques_monthly_table</name></property>
+
+        <property><name>start_time</name></property>
+        <property><name>stop_time</name></property>
+
+        <property><name>druid_template_file</name></property>
+        <property><name>druid_overlord_url</name></property>
+
+        <property><name>temporary_directory</name></property>
+
+        <property><name>load_druid_workflow_file</name></property>
+        <property><name>mark_directory_done_workflow_file</name></property>
+        <property><name>send_error_email_workflow_file</name></property>
+
+    </parameters>
+
+    <controls>
+        <!--(timeout is measured in minutes)-->
+        <timeout>-1</timeout>
+
+        <!-- Set low concurrency because the job is resource-hungry -->
+        <concurrency>1</concurrency>
+
+        <throttle>2</throttle>
+
+    </controls>
+
+    <datasets>
+        <include>${last_access_uniques_datasets_file}</include>
+    </datasets>
+
+    <input-events>
+        <data-in name="uniques_monthly_input" 
dataset="last_access_uniques_monthly">
+            <!-- Dataset uses month as frequency -->
+            <start-instance>${coord:current(0)}</start-instance>
+            <end-instance>${coord:current(1)}</end-instance>
+        </data-in>
+    </input-events>
+
+    <action>
+        <workflow>
+            <app-path>${workflow_file}</app-path>
+            <configuration>
+                <property>
+                    <name>year</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"y")}</value>
+                </property>
+                <property>
+                    <name>month</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"M")}</value>
+                </property>
+                <property>
+                    <name>loaded_period</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"yyyy-MM-dd")}/${coord:formatTime(coord:dateOffset(coord:nominalTime(), 1, 
"MONTH"), "yyyy-MM-dd")}</value>
+                </property>
+            </configuration>
+        </workflow>
+    </action>
+</coordinator-app>
\ No newline at end of file
diff --git 
a/oozie/last_access_uniques/druid/monthly/generate_monthly_druid_uniques.hql 
b/oozie/last_access_uniques/druid/monthly/generate_monthly_druid_uniques.hql
new file mode 100644
index 0000000..3912d68
--- /dev/null
+++ b/oozie/last_access_uniques/druid/monthly/generate_monthly_druid_uniques.hql
@@ -0,0 +1,51 @@
+-- Extracts one month of json formatted monthly uniques to be loaded in Druid
+--
+-- Usage:
+--     hive -f generate_monthly_druid_uniques.hql \
+--         -d source_table=wmf.last_access_uniques_monthly \
+--         -d destination_directory=/tmp/druid/monthly_json_uniques \
+--         -d year=2016 \
+--         -d month=7
+--
+
+SET hive.exec.compress.output=true;
+SET 
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
+
+
+ADD JAR /usr/lib/hive-hcatalog/share/hcatalog/hive-hcatalog-core.jar;
+
+
+DROP TABLE IF EXISTS tmp_monthly_druid_uniques_${year}_${month};
+
+
+CREATE EXTERNAL TABLE IF NOT EXISTS tmp_monthly_druid_uniques_${year}_${month} 
(
+    `dt`                     string,
+    `host`                   string,
+    `country`                string,
+    `country_code`           string,
+    `uniques_underestimate`  bigint,
+    `uniques_offset`         bigint,
+    `uniques_estimate`       bigint
+)
+ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
+STORED AS TEXTFILE
+LOCATION '${destination_directory}';
+
+
+INSERT OVERWRITE TABLE tmp_monthly_druid_uniques_${year}_${month}
+SELECT
+    CONCAT(
+        LPAD(year, 4, '0'), '-',
+        LPAD(month, 2, '0'), '-01T00:00:00Z') AS dt,
+    uri_host AS host,
+    country AS country,
+    country_code AS country_code,
+    uniques_underestimate as uniques_underestimate,
+    uniques_offset AS uniques_offset,
+    uniques_estimate AS uniques_estimate
+FROM ${source_table}
+WHERE year = ${year}
+    AND month = ${month};
+
+
+DROP TABLE IF EXISTS tmp_monthly_druid_uniques_${year}_${month};
diff --git 
a/oozie/last_access_uniques/druid/monthly/load_uniques_monthly.json.template 
b/oozie/last_access_uniques/druid/monthly/load_uniques_monthly.json.template
new file mode 100644
index 0000000..d057451
--- /dev/null
+++ b/oozie/last_access_uniques/druid/monthly/load_uniques_monthly.json.template
@@ -0,0 +1,70 @@
+{
+  "type" : "index_hadoop",
+  "spec" : {
+    "ioConfig" : {
+      "type" : "hadoop",
+      "inputSpec" : {
+        "type" : "static",
+        "paths" : "*INPUT_PATH*"
+      }
+    },
+    "dataSchema" : {
+      "dataSource" : "unique-devices-monthly",
+      "granularitySpec" : {
+        "type" : "uniform",
+        "segmentGranularity" : "month",
+        "queryGranularity" : "month",
+        "intervals" : *INTERVALS_ARRAY*
+      },
+      "parser" : {
+        "type" : "string",
+        "parseSpec" : {
+          "format" : "json",
+          "dimensionsSpec" : {
+            "dimensions" : [
+                "host",
+                "country",
+                "country_code"
+            ]
+          },
+          "timestampSpec" : {
+            "format" : "auto",
+            "column" : "dt"
+          }
+        }
+      },
+      "metricsSpec" : [
+        {
+          "name" : "uniques_underestimate",
+          "type" : "longSum",
+          "fieldName": "uniques_underestimate"
+        },
+        {
+          "name" : "uniques_offset",
+          "type" : "longSum",
+          "fieldName": "uniques_offset"
+        },
+        {
+          "name" : "uniques_estimate",
+          "type" : "longSum",
+          "fieldName": "uniques_estimate"
+        }
+      ]
+    },
+    "tuningConfig" : {
+      "type" : "hadoop",
+      "overwriteFiles": true,
+      "ignoreInvalidRows" : false,
+      "partitionsSpec" : {
+        "type" : "hashed",
+        "numShards" : 1
+      },
+      "jobProperties" : {
+        "mapreduce.reduce.memory.mb" : "8192",
+        "mapreduce.output.fileoutputformat.compress": 
"org.apache.hadoop.io.compress.GzipCodec",
+        "mapreduce.job.queuename": "*HADOOP_QUEUE*"
+      }
+    }
+  }
+}
+
diff --git a/oozie/last_access_uniques/druid/monthly/workflow.xml 
b/oozie/last_access_uniques/druid/monthly/workflow.xml
new file mode 100644
index 0000000..4c098e4
--- /dev/null
+++ b/oozie/last_access_uniques/druid/monthly/workflow.xml
@@ -0,0 +1,200 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<workflow-app xmlns="uri:oozie:workflow:0.4"
+    name="last_access_uniques-druid-monthly-wf-${year}-${month}">
+
+    <parameters>
+
+        <!-- Default values for inner oozie settings -->
+        <property>
+            <name>oozie_launcher_queue_name</name>
+            <value>${queue_name}</value>
+        </property>
+        <property>
+            <name>oozie_launcher_memory</name>
+            <value>2048</value>
+        </property>
+        <property>
+            <!-- Used to build the hive scratch dir below; defaults to the user running the workflow -->
+            <name>user</name>
+            <value>${wf:user()}</value>
+        </property>
+
+        <!-- Required properties -->
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+        <property><name>queue_name</name></property>
+
+        <property>
+            <name>hive_site_xml</name>
+            <description>hive-site.xml file path in HDFS</description>
+        </property>
+
+        <property>
+            <name>last_access_uniques_monthly_table</name>
+            <description>The hive monthly last-access uniques table to use</description>
+        </property>
+
+        <property>
+            <name>year</name>
+            <description>The partition's year</description>
+        </property>
+        <property>
+            <name>month</name>
+            <description>The partition's month</description>
+        </property>
+
+        <property>
+            <name>loaded_period</name>
+            <description>Period of the data loaded in interval format (yyyy-MM-dd/yyyy-MM-dd)</description>
+        </property>
+        <property>
+            <name>druid_template_file</name>
+            <description>File to use as a template to define druid loading (absolute since used by load_druid sub-workflow)</description>
+        </property>
+        <property>
+            <name>druid_overlord_url</name>
+            <description>The druid overlord url used for loading</description>
+        </property>
+        <property>
+            <name>temporary_directory</name>
+            <description>A directory in HDFS for temporary files</description>
+        </property>
+        <property>
+            <name>load_druid_workflow_file</name>
+            <description>Workflow for loading druid</description>
+        </property>
+        <property>
+            <name>mark_directory_done_workflow_file</name>
+            <description>Workflow for marking a directory done</description>
+        </property>
+        <property>
+            <name>send_error_email_workflow_file</name>
+            <description>Workflow for sending an email</description>
+        </property>
+
+    </parameters>
+
+    <start to="generate_json_uniques_monthly"/>
+
+    <!-- Generate the json files to be indexed by druid from the hive uniques table -->
+    <action name="generate_json_uniques_monthly">
+        <hive xmlns="uri:oozie:hive-action:0.2">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <!--make sure oozie:launcher runs in a low priority queue -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+                <!--Let hive decide on the number of reducers -->
+                <property>
+                    <name>mapred.reduce.tasks</name>
+                    <value>-1</value>
+                </property>
+                <property>
+                    <name>hive.exec.scratchdir</name>
+                    <value>/tmp/hive-${user}</value>
+                </property>
+            </configuration>
+            <script>generate_monthly_druid_uniques.hql</script>
+            <param>source_table=${last_access_uniques_monthly_table}</param>
+            <param>destination_directory=${temporary_directory}/${wf:id()}-monthly-druid-uniques-${year}-${month}</param>
+            <param>year=${year}</param>
+            <param>month=${month}</param>
+        </hive>
+        <ok to="mark_json_uniques_monthly_dataset_done" />
+        <error to="send_error_email" />
+    </action>
+
+    <action name="mark_json_uniques_monthly_dataset_done">
+        <sub-workflow>
+            <app-path>${mark_directory_done_workflow_file}</app-path>
+            <configuration>
+                <property>
+                    <name>directory</name>
+                    <value>${temporary_directory}/${wf:id()}-monthly-druid-uniques-${year}-${month}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="index_druid"/>
+        <error to="send_error_email"/>
+    </action>
+
+
+    <!-- Launch the druid indexation over the generated json files and wait for completion -->
+    <action name="index_druid">
+        <sub-workflow>
+            <app-path>${load_druid_workflow_file}</app-path>
+            <propagate-configuration/>
+            <configuration>
+                <property>
+                    <name>source_directory</name>
+                    <value>${temporary_directory}/${wf:id()}-monthly-druid-uniques-${year}-${month}</value>
+                </property>
+                <property>
+                    <name>template_file</name>
+                    <value>${druid_template_file}</value>
+                </property>
+                <property>
+                    <name>loaded_period</name>
+                    <value>${loaded_period}</value>
+                </property>
+                <property>
+                    <name>druid_overlord_url</name>
+                    <value>${druid_overlord_url}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="remove_temporary_data"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <!-- Json files are only needed during indexation; drop them once druid is done -->
+    <action name="remove_temporary_data">
+        <fs>
+            <delete path="${temporary_directory}/${wf:id()}-monthly-druid-uniques-${year}-${month}"/>
+        </fs>
+        <ok to="end"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="send_error_email">
+        <sub-workflow>
+            <app-path>${send_error_email_workflow_file}</app-path>
+            <propagate-configuration/>
+            <configuration>
+                <property>
+                    <name>parent_name</name>
+                    <value>${wf:name()}</value>
+                </property>
+                <property>
+                    <name>parent_failed_action</name>
+                    <value>${wf:lastErrorNode()}</value>
+                </property>
+                <property>
+                    <name>parent_error_code</name>
+                    <value>${wf:errorCode(wf:lastErrorNode())}</value>
+                </property>
+                <property>
+                    <name>parent_error_message</name>
+                    <value>${wf:errorMessage(wf:lastErrorNode())}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="kill"/>
+        <error to="kill"/>
+    </action>
+
+    <kill name="kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    <end name="end"/>
+</workflow-app>

-- 
To view, visit https://gerrit.wikimedia.org/r/347611
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I043f841a7f62cf1035aa09e84934bfe6aabe95e2
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Joal <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to