Addshore has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/301661

Change subject: WIP Create wikidata/specialentitydata_metrics coordinator
......................................................................

WIP Create wikidata/specialentitydata_metrics coordinator

Bug: T141525
Change-Id: Ic84ef563a30913be3857bb442a39a670102fd7a0
---
A oozie/wikidata/specialentitydata_metrics/README.md
A oozie/wikidata/specialentitydata_metrics/coordinator.properties
A oozie/wikidata/specialentitydata_metrics/coordinator.xml
A oozie/wikidata/specialentitydata_metrics/workflow.xml
4 files changed, 332 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery 
refs/changes/61/301661/1

diff --git a/oozie/wikidata/specialentitydata_metrics/README.md 
b/oozie/wikidata/specialentitydata_metrics/README.md
new file mode 100644
index 0000000..b5d8c42
--- /dev/null
+++ b/oozie/wikidata/specialentitydata_metrics/README.md
@@ -0,0 +1,16 @@
+Oozie job to schedule generating metrics for the Wikidata Special:EntityData 
extension.
+The job runs every day and reports metrics to Graphite.
+
+The oozie workflow launches a spark action that runs the
+WikidataSpecialEntityDataMetrics scala job in 
analytics-refinery-source/refinery-job here -
+https://phabricator.wikimedia.org/diffusion/ANRS/
+
+Example command for running the coordinator on command line:
+
+    oozie job -run \
+         -config coordinator.properties \
+         -D refinery_directory=hdfs://analytics-hadoop/wmf/refinery/current \
+         -D 
spark_job_jar=hdfs://analytics-hadoop/wmf/refinery/current/artifacts/refinery-job.jar
+
+The results of this job can be viewed in Graphite (graphite.wikimedia.org) 
under the
+daily.wikidata.entitydata namespace.
\ No newline at end of file
diff --git a/oozie/wikidata/specialentitydata_metrics/coordinator.properties 
b/oozie/wikidata/specialentitydata_metrics/coordinator.properties
new file mode 100644
index 0000000..cd43a75
--- /dev/null
+++ b/oozie/wikidata/specialentitydata_metrics/coordinator.properties
@@ -0,0 +1,69 @@
+# Configures a coordinator to automatically manage generating and sending 
Wikidata Special:EntityData
+# metrics to Graphite from the webrequest data.
+# Any of the following properties are override-able with -D.
+#
+# Usage:
+#   oozie job -Duser=$USER -Dstart_time=2015-08-01T00:00Z -submit -config 
oozie/wikidata/specialentitydata_metrics/coordinator.properties
+#
+# NOTE:  The $oozie_directory must be synced to HDFS so that all relevant
+#        .xml files exist there when this job is submitted.
+
+name_node                         = hdfs://analytics-hadoop
+job_tracker                       = resourcemanager.analytics.eqiad.wmnet:8032
+queue_name                        = default
+
+# Default user
+user                              = hdfs
+
+# Base path in HDFS to refinery.
+# When submitting this job for production, you should override this to point 
directly at a deployed
+# directory name, and not the 'symbolic' 'current' directory. E.g. 
/wmf/refinery/2015-01-05T17.59.18Z--7bb7f07
+refinery_directory                = ${name_node}/wmf/refinery/current
+
+# HDFS path to artifacts that will be used by this job.
+# E.g. refinery-job.jar should exist here.
+artifacts_directory               = ${refinery_directory}/artifacts
+
+# Base path in HDFS to oozie files.
+# Other files will be used relative to this path.
+oozie_directory                   = ${refinery_directory}/oozie
+
+# HDFS path to the coordinator to run.
+coordinator_file                  = 
${oozie_directory}/wikidata/specialentitydata_metrics/coordinator.xml
+
+# HDFS path to workflow to run.
+workflow_file                     = 
${oozie_directory}/wikidata/specialentitydata_metrics/workflow.xml
+
+# HDFS path to the refined webrequest dataset.
+webrequest_table                    = wmf.webrequest
+webrequest_dataset_file             = 
${oozie_directory}/webrequest/datasets.xml
+webrequest_data_directory           = ${name_node}/wmf/data/wmf/webrequest
+
+# Time to start running this coordinator (initial import time of the webrequest dataset).
+start_time                        = 2015-08-01T00:00Z
+
+# Time to stop running this coordinator.  Year 3000 == never!
+stop_time                         = 3000-01-01T00:00Z
+
+# Spark job parameters
+spark_master                      = yarn
+spark_deploy                      = cluster
+spark_assembly_jar                = 
${name_node}/user/spark/share/lib/spark-assembly.jar
+spark_job_jar                     = 
${artifacts_directory}/org/wikimedia/analytics/refinery/refinery-job-0.0.34.jar
+spark_job_class                   = 
org.wikimedia.analytics.refinery.job.WikidataSpecialEntityDataMetrics
+spark_job_name                    = wikidata_specialentitydata_metrics
+spark_additional_files            = 
${refinery_directory}/oozie/util/hive/hive-site.xml
+spark_executor_memory             = 2G
+spark_driver_memory               = 4G
+graphite_namespace                = daily.wikidata.entitydata
+graphite_host                     = graphite-in.eqiad.wmnet
+graphite_port                     = 2003
+
+# Workflow to send an error email
+error_emails_recipients           = 
[email protected],[email protected]
+send_error_email_workflow_file    = 
${oozie_directory}/util/send_error_email/workflow.xml
+
+# Coordinator to start.
+oozie.coord.application.path      = ${coordinator_file}
+oozie.use.system.libpath          = true
+oozie.action.external.stats.write = true
\ No newline at end of file
diff --git a/oozie/wikidata/specialentitydata_metrics/coordinator.xml 
b/oozie/wikidata/specialentitydata_metrics/coordinator.xml
new file mode 100644
index 0000000..3682c4f
--- /dev/null
+++ b/oozie/wikidata/specialentitydata_metrics/coordinator.xml
@@ -0,0 +1,85 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<coordinator-app xmlns="uri:oozie:coordinator:0.4"
+    name="wikidata-specialentitydata_metrics-coord"
+    frequency="${coord:days(1)}"
+    start="${start_time}"
+    end="${stop_time}"
+    timezone="Universal">
+
+    <parameters>
+
+        <!-- Required properties. -->
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+        <property><name>queue_name</name></property>
+
+        <property><name>workflow_file</name></property>
+        <property><name>webrequest_dataset_file</name></property>
+
+        <property><name>start_time</name></property>
+        <property><name>stop_time</name></property>
+
+        <property><name>spark_master</name></property>
+        <property><name>spark_job_jar</name></property>
+        <property><name>spark_job_class</name></property>
+        <property><name>spark_additional_files</name></property>
+        <property><name>spark_executor_memory</name></property>
+        <property><name>spark_driver_memory</name></property>
+        <property><name>graphite_host</name></property>
+        <property><name>graphite_port</name></property>
+        <property><name>graphite_namespace</name></property>
+        <property><name>webrequest_table</name></property>
+        <property><name>webrequest_data_directory</name></property>
+
+        <property><name>error_emails_recipients</name></property>
+        <property><name>send_error_email_workflow_file</name></property>
+    </parameters>
+
+    <controls>
+        <!--(timeout is measured in minutes)-->
+        <timeout>-1</timeout>
+
+        <!-- Setting low concurrency for resource sharing.
+             The job runs pretty fast (~1 minute), so keeping concurrency low 
should not cause any problems -->
+        <concurrency>2</concurrency>
+
+        <throttle>2</throttle>
+
+    </controls>
+
+    <datasets>
+        <!--
+        Include the webrequest dataset files.
+        $webrequest_dataset_file will be used as the input events.
+        -->
+        <include>${webrequest_dataset_file}</include>
+    </datasets>
+
+    <input-events>
+        <data-in name="webrequest_input" dataset="webrequest_text">
+            <start-instance>${coord:current(0)}</start-instance>
+            <end-instance>${coord:current(23)}</end-instance>
+        </data-in>
+    </input-events>
+
+    <action>
+        <workflow>
+            <app-path>${workflow_file}</app-path>
+            <configuration>
+
+                <property>
+                    <name>year</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"y")}</value>
+                </property>
+                <property>
+                    <name>month</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"M")}</value>
+                </property>
+                <property>
+                    <name>day</name>
+                    <value>${coord:formatTime(coord:nominalTime(), 
"d")}</value>
+                </property>
+            </configuration>
+        </workflow>
+    </action>
+</coordinator-app>
\ No newline at end of file
diff --git a/oozie/wikidata/specialentitydata_metrics/workflow.xml 
b/oozie/wikidata/specialentitydata_metrics/workflow.xml
new file mode 100644
index 0000000..28e2fb1
--- /dev/null
+++ b/oozie/wikidata/specialentitydata_metrics/workflow.xml
@@ -0,0 +1,162 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<workflow-app xmlns="uri:oozie:workflow:0.4"
+    name="wikidata-specialentitydata_metrics-wf-${year}-${month}-${day}">
+
+    <parameters>
+
+        <!-- Default values for inner oozie settings -->
+        <property>
+            <name>oozie_launcher_queue_name</name>
+            <value>${queue_name}</value>
+        </property>
+        <property>
+            <name>oozie_launcher_memory</name>
+            <value>256</value>
+        </property>
+
+        <!-- Required properties -->
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+        <property><name>queue_name</name></property>
+
+        <property>
+            <name>spark_master</name>
+            <description>Master to be used for Spark (yarn, local, 
other)</description>
+        </property>
+        <property>
+            <name>spark_job_jar</name>
+            <description>Path to the jar to be used to run spark 
job</description>
+        </property>
+        <property>
+            <name>spark_job_class</name>
+            <description>Class of the spark job to be run</description>
+        </property>
+        <property>
+            <name>spark_executor_memory</name>
+            <description>Memory to allocate for each spark 
executor</description>
+        </property>
+        <property>
+            <name>spark_driver_memory</name>
+            <description>Memory to allocate for spark driver 
process</description>
+        </property>
+        <property>
+            <name>spark_additional_files</name>
+            <description>Additional files for spark</description>
+        </property>
+        <property>
+            <name>webrequest_table</name>
+            <description>The webrequest table name</description>
+        </property>
+        <property>
+            <name>year</name>
+            <description>The partition's year</description>
+        </property>
+        <property>
+            <name>month</name>
+            <description>The partition's month</description>
+        </property>
+        <property>
+            <name>day</name>
+            <description>The partition's day</description>
+        </property>
+        <property>
+            <name>graphite_host</name>
+            <description>Graphite Host url</description>
+        </property>
+        <property>
+            <name>graphite_port</name>
+            <description>Graphite port</description>
+        </property>
+        <property>
+            <name>graphite_namespace</name>
+            <description>Namespace/prefix for metric in Graphite</description>
+        </property>
+        <property>
+            <name>error_emails_recipients</name>
+            <description>Emails to send errors to</description>
+        </property>
+        <property>
+            <name>send_error_email_workflow_file</name>
+            <description>Workflow for sending an email</description>
+        </property>
+    </parameters>
+
+    <start to="generate_wikidata_specialentitydata_metrics"/>
+
+    <action name="generate_wikidata_specialentitydata_metrics">
+        <spark xmlns="uri:oozie:spark-action:0.1">
+
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <configuration>
+                <!--make sure oozie:launcher runs in a low priority queue -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+            </configuration>
+            <master>${spark_master}</master>
+            <mode>${spark_deploy}</mode>
+            <name>${spark_job_name}-${year}-${month}-${day}</name>
+            <class>${spark_job_class}</class>
+            <jar>${spark_job_jar}</jar>
+            <spark-opts>--conf spark.yarn.jar=${spark_assembly_jar} --conf 
spark.dynamicAllocation.enabled=true --conf spark.shuffle.service.enabled=true 
--executor-memory ${spark_executor_memory} --driver-memory 
${spark_driver_memory} --queue ${queue_name} --files ${spark_additional_files} 
</spark-opts>
+            <arg>--year</arg>
+            <arg>${year}</arg>
+            <arg>--month</arg>
+            <arg>${month}</arg>
+            <arg>--day</arg>
+            <arg>${day}</arg>
+            <arg>--graphite-host</arg>
+            <arg>${graphite_host}</arg>
+            <arg>--graphite-port</arg>
+            <arg>${graphite_port}</arg>
+            <arg>--graphite-namespace</arg>
+            <arg>${graphite_namespace}</arg>
+            <arg>--webrequest-table</arg>
+            <arg>${webrequest_table}</arg>
+        </spark>
+        <ok to="end" />
+        <error to="kill" />
+    </action>
+
+    <action name="send_error_email">
+        <sub-workflow>
+            <app-path>${send_error_email_workflow_file}</app-path>
+            <propagate-configuration/>
+            <configuration>
+                <property>
+                    <name>to</name>
+                    <value>${error_emails_recipients}</value>
+                </property>
+                <property>
+                    <name>parent_name</name>
+                    <value>${wf:name()}</value>
+                </property>
+                <property>
+                    <name>parent_failed_action</name>
+                    <value>${wf:lastErrorNode()}</value>
+                </property>
+                <property>
+                    <name>parent_error_code</name>
+                    <value>${wf:errorCode(wf:lastErrorNode())}</value>
+                </property>
+                <property>
+                    <name>parent_error_message</name>
+                    <value>${wf:errorMessage(wf:lastErrorNode())}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="kill"/>
+        <error to="kill"/>
+    </action>
+
+    <kill name="kill">
+        <message>Action failed, error 
message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    <end name="end"/>
+</workflow-app>

-- 
To view, visit https://gerrit.wikimedia.org/r/301661
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ic84ef563a30913be3857bb442a39a670102fd7a0
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Addshore <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to