Addshore has uploaded a new change for review. https://gerrit.wikimedia.org/r/301661
Change subject: WIP Create wikidata/specialentitydata_metrics coordinator ...................................................................... WIP Create wikidata/specialentitydata_metrics coordinator Bug: T141525 Change-Id: Ic84ef563a30913be3857bb442a39a670102fd7a0 --- A oozie/wikidata/specialentitydata_metrics/README.md A oozie/wikidata/specialentitydata_metrics/coordinator.properties A oozie/wikidata/specialentitydata_metrics/coordinator.xml A oozie/wikidata/specialentitydata_metrics/workflow.xml 4 files changed, 332 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery refs/changes/61/301661/1 diff --git a/oozie/wikidata/specialentitydata_metrics/README.md b/oozie/wikidata/specialentitydata_metrics/README.md new file mode 100644 index 0000000..b5d8c42 --- /dev/null +++ b/oozie/wikidata/specialentitydata_metrics/README.md @@ -0,0 +1,16 @@ +Oozie job to schedule generating metrics for the Wikidata Special:EntityData extension. +The job runs every day and reports metrics to Graphite. 
+ +The oozie workflow launches a spark action that runs the +WikidataSpecialEntityDataMetrics scala job in analytics-refinery-source/refinery-job here - +https://phabricator.wikimedia.org/diffusion/ANRS/ + +Example command for running the coordinator on command line: + + oozie job -run \ + -config coordinator.properties \ + -D refinery_directory=hdfs://analytics-hadoop/wmf/refinery/current \ + -D spark_job_jar=hdfs://analytics-hadoop/wmf/refinery/current/artifacts/refinery-job.jar + +The results of this job can be viewed in Graphite (graphite.wikimedia.org) under the +daily.wikidata.entitydata namespace \ No newline at end of file diff --git a/oozie/wikidata/specialentitydata_metrics/coordinator.properties b/oozie/wikidata/specialentitydata_metrics/coordinator.properties new file mode 100644 index 0000000..cd43a75 --- /dev/null +++ b/oozie/wikidata/specialentitydata_metrics/coordinator.properties @@ -0,0 +1,69 @@ +# Configures a coordinator to automatically manage generating and sending Wikidata Special:EntityData +# metrics to Graphite from the webrequest data. +# Any of the following properties are override-able with -D. +# +# Usage: +# oozie job -Duser=$USER -Dstart_time=2015-08-01T00:00Z -submit -config oozie/wikidata/specialentitydata_metrics/coordinator.properties +# +# NOTE: The $oozie_directory must be synced to HDFS so that all relevant +# .xml files exist there when this job is submitted. + +name_node = hdfs://analytics-hadoop +job_tracker = resourcemanager.analytics.eqiad.wmnet:8032 +queue_name = default + +#Default user +user = hdfs + +# Base path in HDFS to refinery. +# When submitting this job for production, you should override this to point directly at a deployed +# directory name, and not the 'symbolic' 'current' directory. E.g. /wmf/refinery/2015-01-05T17.59.18Z--7bb7f07 +refinery_directory = ${name_node}/wmf/refinery/current + +# HDFS path to artifacts that will be used by this job. +# E.g. refinery-job.jar should exist here. 
+artifacts_directory = ${refinery_directory}/artifacts + +# Base path in HDFS to oozie files. +# Other files will be used relative to this path. +oozie_directory = ${refinery_directory}/oozie + +# HDFS path to the coordinator to run. +coordinator_file = ${oozie_directory}/wikidata/specialentitydata_metrics/coordinator.xml + +# HDFS path to workflow to run. +workflow_file = ${oozie_directory}/wikidata/specialentitydata_metrics/workflow.xml + +# HDFS path to the refined webrequest dataset +webrequest_table = wmf.webrequest +webrequest_dataset_file = ${oozie_directory}/webrequest/datasets.xml +webrequest_data_directory = ${name_node}/wmf/data/wmf/webrequest + +# Initial import time of the webrequest dataset. +start_time = 2015-08-01T00:00Z + +# Time to stop running this coordinator. Year 3000 == never! +stop_time = 3000-01-01T00:00Z + +# Spark job parameters +spark_master = yarn +spark_deploy = cluster +spark_assembly_jar = ${name_node}/user/spark/share/lib/spark-assembly.jar +spark_job_jar = ${artifacts_directory}/org/wikimedia/analytics/refinery/refinery-job-0.0.34.jar +spark_job_class = org.wikimedia.analytics.refinery.job.WikidataSpecialEntityDataMetrics +spark_job_name = wikidata_specialentitydata_metrics +spark_additional_files = ${refinery_directory}/oozie/util/hive/hive-site.xml +spark_executor_memory = 2G +spark_driver_memory = 4G +graphite_namespace = daily.wikidata.entitydata +graphite_host = graphite-in.eqiad.wmnet +graphite_port = 2003 + +# Workflow to send an error email +error_emails_recipients = [email protected],[email protected] +send_error_email_workflow_file = ${oozie_directory}/util/send_error_email/workflow.xml + +# Coordinator to start. 
+oozie.coord.application.path = ${coordinator_file} +oozie.use.system.libpath = true +oozie.action.external.stats.write = true \ No newline at end of file diff --git a/oozie/wikidata/specialentitydata_metrics/coordinator.xml b/oozie/wikidata/specialentitydata_metrics/coordinator.xml new file mode 100644 index 0000000..3682c4f --- /dev/null +++ b/oozie/wikidata/specialentitydata_metrics/coordinator.xml @@ -0,0 +1,85 @@ +<?xml version="1.0" encoding="UTF-8"?> +<coordinator-app xmlns="uri:oozie:coordinator:0.4" + name="wikidata-specialentitydata_metrics-coord" + frequency="${coord:days(1)}" + start="${start_time}" + end="${stop_time}" + timezone="Universal"> + + <parameters> + + <!-- Required properties. --> + <property><name>name_node</name></property> + <property><name>job_tracker</name></property> + <property><name>queue_name</name></property> + + <property><name>workflow_file</name></property> + <property><name>webrequest_dataset_file</name></property> + + <property><name>start_time</name></property> + <property><name>stop_time</name></property> + + <property><name>spark_master</name></property> + <property><name>spark_job_jar</name></property> + <property><name>spark_job_class</name></property> + <property><name>spark_additional_files</name></property> + <property><name>spark_executor_memory</name></property> + <property><name>spark_driver_memory</name></property> + <property><name>graphite_host</name></property> + <property><name>graphite_port</name></property> + <property><name>graphite_namespace</name></property> + <property><name>webrequest_table</name></property> + <property><name>webrequest_data_directory</name></property> + + <property><name>error_emails_recipients</name></property> + <property><name>send_error_email_workflow_file</name></property> + </parameters> + + <controls> + <!--(timeout is measured in minutes)--> + <timeout>-1</timeout> + + <!-- Setting low concurrency for resource sharing. 
+ The job runs pretty fast (~1 minute) and increasing concurrency should not cause any problems--> + <concurrency>2</concurrency> + + <throttle>2</throttle> + + </controls> + + <datasets> + <!-- + Include webrequest datasets files. + $webrequest_dataset_file will be used as the input events + --> + <include>${webrequest_dataset_file}</include> + </datasets> + + <input-events> + <data-in name="webrequest_input" dataset="webrequest_text"> + <start-instance>${coord:current(0)}</start-instance> + <end-instance>${coord:current(23)}</end-instance> + </data-in> + </input-events> + + <action> + <workflow> + <app-path>${workflow_file}</app-path> + <configuration> + + <property> + <name>year</name> + <value>${coord:formatTime(coord:nominalTime(), "y")}</value> + </property> + <property> + <name>month</name> + <value>${coord:formatTime(coord:nominalTime(), "M")}</value> + </property> + <property> + <name>day</name> + <value>${coord:formatTime(coord:nominalTime(), "d")}</value> + </property> + </configuration> + </workflow> + </action> +</coordinator-app> \ No newline at end of file diff --git a/oozie/wikidata/specialentitydata_metrics/workflow.xml b/oozie/wikidata/specialentitydata_metrics/workflow.xml new file mode 100644 index 0000000..28e2fb1 --- /dev/null +++ b/oozie/wikidata/specialentitydata_metrics/workflow.xml @@ -0,0 +1,162 @@ +<?xml version="1.0" encoding="UTF-8"?> +<workflow-app xmlns="uri:oozie:workflow:0.4" + name="wikidata-specialentitydata_metrics-wf-${year}-${month}-${day}"> + + <parameters> + + <!-- Default values for inner oozie settings --> + <property> + <name>oozie_launcher_queue_name</name> + <value>${queue_name}</value> + </property> + <property> + <name>oozie_launcher_memory</name> + <value>256</value> + </property> + + <!-- Required properties --> + <property><name>name_node</name></property> + <property><name>job_tracker</name></property> + <property><name>queue_name</name></property> + + <property> + <name>spark_master</name> + <description>Master 
to be used for Spark (yarn, local, other)</description> + </property> + <property> + <name>spark_job_jar</name> + <description>Path to the jar to be used to run spark job</description> + </property> + <property> + <name>spark_job_class</name> + <description>Class of the spark job to be run</description> + </property> + <property> + <name>spark_executor_memory</name> + <description>Memory to allocate for each spark executor</description> + </property> + <property> + <name>spark_driver_memory</name> + <description>Memory to allocate for spark driver process</description> + </property> + <property> + <name>spark_additional_files</name> + <description>Additional files for spark</description> + </property> + <property> + <name>webrequest_table</name> + <description>The webrequest table name</description> + </property> + <property> + <name>year</name> + <description>The partition's year</description> + </property> + <property> + <name>month</name> + <description>The partition's month</description> + </property> + <property> + <name>day</name> + <description>The partition's day</description> + </property> + <property> + <name>graphite_host</name> + <description>Graphite Host url</description> + </property> + <property> + <name>graphite_port</name> + <description>Graphite port</description> + </property> + <property> + <name>graphite_namespace</name> + <description>Namespace/prefix for metric in Graphite</description> + </property> + <property> + <name>error_emails_recipients</name> + <description>Emails to send errors to</description> + </property> + <property> + <name>send_error_email_workflow_file</name> + <description>Workflow for sending an email</description> + </property> + </parameters> + + <start to="generate_wikidata_specialentitydata_metrics"/> + + <action name="generate_wikidata_specialentitydata_metrics"> + <spark xmlns="uri:oozie:spark-action:0.1"> + + <job-tracker>${job_tracker}</job-tracker> + <name-node>${name_node}</name-node> + <configuration> + <!--make 
sure oozie:launcher runs in a low priority queue --> + <property> + <name>oozie.launcher.mapred.job.queue.name</name> + <value>${oozie_launcher_queue_name}</value> + </property> + <property> + <name>oozie.launcher.mapreduce.map.memory.mb</name> + <value>${oozie_launcher_memory}</value> + </property> + </configuration> + <master>${spark_master}</master> + <mode>${spark_deploy}</mode> + <name>${spark_job_name}-${year}-${month}-${day}</name> + <class>${spark_job_class}</class> + <jar>${spark_job_jar}</jar> + <spark-opts>--conf spark.yarn.jar=${spark_assembly_jar} --conf spark.dynamicAllocation.enabled=true --conf spark.shuffle.service.enabled=true --executor-memory ${spark_executor_memory} --driver-memory ${spark_driver_memory} --queue ${queue_name} --files ${spark_additional_files} </spark-opts> + <arg>--year</arg> + <arg>${year}</arg> + <arg>--month</arg> + <arg>${month}</arg> + <arg>--day</arg> + <arg>${day}</arg> + <arg>--graphite-host</arg> + <arg>${graphite_host}</arg> + <arg>--graphite-port</arg> + <arg>${graphite_port}</arg> + <arg>--graphite-namespace</arg> + <arg>${graphite_namespace}</arg> + <arg>--webrequest-table</arg> + <arg>${webrequest_table}</arg> + </spark> + <ok to="end" /> + <error to="send_error_email" /> + </action> + + <action name="send_error_email"> + <sub-workflow> + <app-path>${send_error_email_workflow_file}</app-path> + <propagate-configuration/> + <configuration> + <property> + <name>to</name> + <value>${error_emails_recipients}</value> + </property> + <property> + <name>parent_name</name> + <value>${wf:name()}</value> + </property> + <property> + <name>parent_failed_action</name> + <value>${wf:lastErrorNode()}</value> + </property> + <property> + <name>parent_error_code</name> + <value>${wf:errorCode(wf:lastErrorNode())}</value> + </property> + <property> + <name>parent_error_message</name> + <value>${wf:errorMessage(wf:lastErrorNode())}</value> + </property> + </configuration> + </sub-workflow> + <ok to="kill"/> + <error to="kill"/> + </action> + + 
<kill name="kill"> + <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> + </kill> + <end name="end"/> +</workflow-app> -- To view, visit https://gerrit.wikimedia.org/r/301661 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ic84ef563a30913be3857bb442a39a670102fd7a0 Gerrit-PatchSet: 1 Gerrit-Project: analytics/refinery Gerrit-Branch: master Gerrit-Owner: Addshore <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
