Elukey has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/395504 )
Change subject: role::analytics_cluster::coordinator: add a profile to restart
streaming jobs
......................................................................
role::analytics_cluster::coordinator: add a profile to restart streaming jobs
This patch adds a cron job checking and possibly relaunching banner
spark streaming job in Yarn.
Bug: T176983
Change-Id: Icdf583cce4dd6b6b69a145f57c8355bafc62aa08
---
A modules/profile/manifests/analytics/refinery/job/streams_check.pp
M modules/role/manifests/analytics_cluster/coordinator.pp
2 files changed, 35 insertions(+), 0 deletions(-)
Approvals:
Elukey: Looks good to me, approved
jenkins-bot: Verified
diff --git a/modules/profile/manifests/analytics/refinery/job/streams_check.pp
b/modules/profile/manifests/analytics/refinery/job/streams_check.pp
new file mode 100644
index 0000000..8afea43
--- /dev/null
+++ b/modules/profile/manifests/analytics/refinery/job/streams_check.pp
@@ -0,0 +1,34 @@
+# == Class profile::analytics::refinery::job::streams_check
+#
+# Installs a cron job, run as user hdfs, that checks whether the
+# BannerImpressionsStream Spark job is running in Yarn and resubmits it when
+# the check fails. It does not set up alerting; that must be done separately.
+#
+
+class profile::analytics::refinery::job::streams_check {
+ require ::profile::analytics::refinery
+
+ # Shortcut to the refinery path, reused throughout the cron command below.
+ $refinery_path = $role::analytics_cluster::refinery::path
+
+ $refinery_job_jar = "${refinery_path}/artifacts/refinery-job.jar"
+ $spark_num_executors = 4
+ $spark_executor_cores = 3
+ $spark_driver_memory = '2G'
+ $spark_executor_memory = '4G'
+ $druid_segment_gran = 'HOUR'
+ $tranq_window_period = 'PT10M'
+ $batch_duration_secs = '60'
+ $job_name = 'BannerImpressionsStream'
+
+ # Output is sent to /dev/null: no log is needed as the job is submitted with --deploy-mode cluster.
+ $command = "PYTHONPATH=${refinery_path}/python
${refinery_path}/bin/is-yarn-app-running ${job_name} || /usr/bin/spark2-submit
--master yarn --deploy-mode cluster --queue production --conf
spark.dynamicAllocation.enabled=false --driver-memory ${spark_driver_memory}
--executor-memory ${spark_executor_memory} --executor-cores
${spark_executor_cores} --num-executors ${spark_num_executors} --class
org.wikimedia.analytics.refinery.job.druid.BannerImpressionsStream --name
${job_name} ${refinery_job_jar} --druid-indexing-segment-granularity
${druid_segment_gran} --druid-indexing-window-period ${tranq_window_period}
--batch-duration-seconds ${batch_duration_secs} > /dev/null 2>&1"
+
+ # Every 5 minutes, check for the banner streaming job in Yarn and relaunch it if needed.
+ cron { 'refinery-relaunch-banner-streaming':
+ command => $command,
+ environment => '[email protected]',
+ user => 'hdfs',
+ minute => '*/5'
+ }
+}
diff --git a/modules/role/manifests/analytics_cluster/coordinator.pp
b/modules/role/manifests/analytics_cluster/coordinator.pp
index 2222a96..9f38259 100644
--- a/modules/role/manifests/analytics_cluster/coordinator.pp
+++ b/modules/role/manifests/analytics_cluster/coordinator.pp
@@ -58,6 +58,7 @@
include ::profile::analytics::refinery::job::project_namespace_map
include ::profile::analytics::refinery::job::sqoop_mediawiki
include ::profile::analytics::refinery::job::json_refine
+ include ::profile::analytics::refinery::job::streams_check
include standard
include ::profile::base::firewall
--
To view, visit https://gerrit.wikimedia.org/r/395504
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Icdf583cce4dd6b6b69a145f57c8355bafc62aa08
Gerrit-PatchSet: 6
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Joal <[email protected]>
Gerrit-Reviewer: Elukey <[email protected]>
Gerrit-Reviewer: Joal <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits