[MediaWiki-commits] [Gerrit] operations/puppet[production]: role::mediawiki::webserver: restart hhvm routinely

2016-10-18 Thread Giuseppe Lavagetto (Code Review)
Giuseppe Lavagetto has submitted this change and it was merged.

Change subject: role::mediawiki::webserver: restart hhvm routinely
..


role::mediawiki::webserver: restart hhvm routinely

This cron will restart HHVM if it has been running for more than 3 days.

Bug: T147773
Change-Id: I204331607ba80169fafdd205ba9bffeeabf9a443
---
A modules/hhvm/files/hhvm-needs-restart.sh
M modules/hhvm/manifests/init.pp
M modules/role/manifests/mediawiki/webserver.pp
3 files changed, 111 insertions(+), 0 deletions(-)

Approvals:
  Giuseppe Lavagetto: Verified; Looks good to me, approved



diff --git a/modules/hhvm/files/hhvm-needs-restart.sh b/modules/hhvm/files/hhvm-needs-restart.sh
new file mode 100755
index 000..cc9f062
--- /dev/null
+++ b/modules/hhvm/files/hhvm-needs-restart.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+# Script used to determine if HHVM needs to be restarted.
+# Will return an exit code of 0 if HHVM needs to be restarted,
+# and 1 if it doesn't.
+
+# Maximum number of days HHVM should run without being restarted
+MAX_RUN_DAYS=3
+# Maximum memory occupation from HHVM before being restarted
+MAX_MEM=50
+# Maximum queue size with respect to the load before being restarted.
+# This is very dangerous and should only be defined after very careful consideration
+MAX_QUEUE_RATIO=
+
+function usage {
+cat < ${MAX_RUN} )); then
+echo "HHVM needs restarting: running since ${RUN_TIME} seconds"
+exit 0
+fi
+
+# Used Memory
+/bin/ps -C hhvm -o pmem= | awk -v max_mem=${MAX_MEM} '{sum+=$1}
+END {
+  if (sum > max_mem) {
+print "HHVM needs restart: using " sum "% of available memory";
+exit 0;
+  }
+}'
+
+# Queue size
+# If not defined, just exit as if everything is fine
+test -z $MAX_QUEUE_RATIO && exit 1
+HIGH_RATIO=$(hhvmadm check-health | \
+jq "if (.queued > (${MAX_QUEUE_RATIO} * .load)) then 1 else 0 end")
+if (( $HIGH_RATIO )); then
+print "HHVM needs restart: queue > ${MAX_QUEUE_RATIO} * load"
+exit 0
+fi
+# No need for a restart
+exit 1
diff --git a/modules/hhvm/manifests/init.pp b/modules/hhvm/manifests/init.pp
index 74535cd..adc75ad 100644
--- a/modules/hhvm/manifests/init.pp
+++ b/modules/hhvm/manifests/init.pp
@@ -264,6 +264,13 @@
 mode   => '0555',
 }
 
+file {  '/usr/local/bin/hhvm-needs-restart':
+ensure => present,
+owner  => 'root',
+group  => 'root',
+mode   => '0555',
+source => 'puppet:///modules/hhvm/hhvm-needs-restart.sh',
+}
 
 ## Run-time data and logging
 
diff --git a/modules/role/manifests/mediawiki/webserver.pp b/modules/role/manifests/mediawiki/webserver.pp
index 6d47a9b..beefddf 100644
--- a/modules/role/manifests/mediawiki/webserver.pp
+++ b/modules/role/manifests/mediawiki/webserver.pp
@@ -12,10 +12,30 @@
 }
 
 if hiera('has_lvs', true) {
+include ::lvs::configuration
 include ::role::lvs::realserver
 
 # Conftool config
 include ::mediawiki::conftool
+
+# Restart HHVM if it is running since more than 3 days or
+# memory occupation exceeds 50% of the available RAM
+# This should prevent a series of cpu usage surges we've been seeing
+# on long-running HHVM processes. T147773
+$pool = $::role::lvs::realserver::lvs_pools['hhvm']['lvs_name']
+$lvs_service = pick($::lvs::configuration::lvs_services[$pool], {})
+$conftool_config = pick($lvs_service['conftool'], {'cluster' => 'appserver'})
+$module_path = get_module_path($module_name)
+$site_nodes = loadyaml("${module_path}/../../conftool-data/nodes/${::site}.yaml")
+$pool_nodes = keys($site_nodes[$conftool_config['cluster']])
+if member($pool_nodes, $::fqdn) {
+$times = cron_splay($pool_nodes, 'daily', 'hhvm-conditional-restarts')
+cron { 'hhvm-conditional-restart':
+command => '/usr/local/bin/hhvm-needs-restart && /usr/local/bin/run-no-puppet /usr/local/bin/restart-hhvm > /dev/null',
+hour=> $times['hour'],
+minute  => $times['minute'],
+}
+}
 }
 
 ferm::service { 'mediawiki-http':

-- 
To view, visit https://gerrit.wikimedia.org/r/315938
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I204331607ba80169fafdd205ba9bffeeabf9a443
Gerrit-PatchSet: 14
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Giuseppe Lavagetto 
Gerrit-Reviewer: BBlack 
Gerrit-Reviewer: Dzahn 
Gerrit-Reviewer: Giuseppe Lavagetto 
Gerrit-Reviewer: jenkins-bot <>

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] operations/puppet[production]: role::mediawiki::webserver: restart hhvm routinely

2016-10-14 Thread Giuseppe Lavagetto (Code Review)
Giuseppe Lavagetto has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/315938

Change subject: role::mediawiki::webserver: restart hhvm routinely
..

role::mediawiki::webserver: restart hhvm routinely

This cron will restart HHVM if it has been running for more than 3 days.

Bug: T147773
Change-Id: I204331607ba80169fafdd205ba9bffeeabf9a443
---
M modules/role/manifests/mediawiki/webserver.pp
1 file changed, 13 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/38/315938/1

diff --git a/modules/role/manifests/mediawiki/webserver.pp b/modules/role/manifests/mediawiki/webserver.pp
index bc270e2..774c9ae 100644
--- a/modules/role/manifests/mediawiki/webserver.pp
+++ b/modules/role/manifests/mediawiki/webserver.pp
@@ -26,6 +26,19 @@
 lvs_class_hosts => $lvs::configuration::lvs_class_hosts,
 lvs_services_config => $lvs::configuration::lvs_services
 }
+
+# Restart HHVM if it is running since more than 3 days.
+# This should prevent a series of cpu usage surges we've been seeing
+# on long-running HHVM processes. T147773
+$conftool_config = $lvs::configuration::lvs_services['conftool']
+$site_nodes = loadyaml("conftool-data/nodes/${::site}.yaml")
+$pool_nodes = keys($site_nodes[$conftool_config['cluster']])
+$times = cron_splay($pool_nodes, 'daily', 'hhvm-conditional-restarts')
+cron { 'hhvm-conditional-restart':
+command => '(( $(ps -C hhvm -o etimes= | head -n 1 ) > 259200 )) && /usr/local/bin/run-no-puppet /usr/local/bin/restart-hhvm > /dev/null',
+hour=> $times['hour'],
+minute  => $times['minute'],
+}
 }
 
 ferm::service { 'mediawiki-http':

-- 
To view, visit https://gerrit.wikimedia.org/r/315938
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I204331607ba80169fafdd205ba9bffeeabf9a443
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Giuseppe Lavagetto 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits