Giuseppe Lavagetto has uploaded a new change for review.
https://gerrit.wikimedia.org/r/264055
Change subject: jobrunner: contain gwt jobs to run on two specific hosts
......................................................................
jobrunner: contain gwt jobs to run on two specific hosts
Since GWT jobs are leaking memory, which has caused instability on the
jobrunners, we do the following:
* Remove the hack introduced in I44990808 that confined GWT runners
to odd-numbered jobrunners
* Confine GWT jobs to two specific servers, which will be running
those jobs exclusively. We use two newer, high-memory ones.
* Modify the cronjob hack to run every hour on those machines, and to
restart HHVM once it occupies more than 60% of the available memory
Bug: T122069
Change-Id: I3ff20b324ee98e93a926e0aa1ed568cc9343ae90
---
A hieradata/hosts/mw1161.yaml
A hieradata/hosts/mw1162.yaml
M hieradata/role/common/mediawiki/jobrunner.yaml
M modules/mediawiki/manifests/jobrunner.pp
M modules/mediawiki/templates/jobrunner/jobrunner.conf.erb
5 files changed, 20 insertions(+), 7 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet
refs/changes/55/264055/1
diff --git a/hieradata/hosts/mw1161.yaml b/hieradata/hosts/mw1161.yaml
new file mode 100644
index 0000000..48d3708
--- /dev/null
+++ b/hieradata/hosts/mw1161.yaml
@@ -0,0 +1,5 @@
+mediawiki::jobrunner::runners_basic: 0
+mediawiki::jobrunner::runners_gwt: 10
+mediawiki::jobrunner::runners_restbase: 0
+mediawiki::jobrunner::runners_translate: 0
+mediawiki::jobrunner::runners_upload: 0
diff --git a/hieradata/hosts/mw1162.yaml b/hieradata/hosts/mw1162.yaml
new file mode 100644
index 0000000..48d3708
--- /dev/null
+++ b/hieradata/hosts/mw1162.yaml
@@ -0,0 +1,5 @@
+mediawiki::jobrunner::runners_basic: 0
+mediawiki::jobrunner::runners_gwt: 10
+mediawiki::jobrunner::runners_restbase: 0
+mediawiki::jobrunner::runners_translate: 0
+mediawiki::jobrunner::runners_upload: 0
diff --git a/hieradata/role/common/mediawiki/jobrunner.yaml
b/hieradata/role/common/mediawiki/jobrunner.yaml
index 84d964c..f5c4dea 100644
--- a/hieradata/role/common/mediawiki/jobrunner.yaml
+++ b/hieradata/role/common/mediawiki/jobrunner.yaml
@@ -3,7 +3,7 @@
- deployment
- perf-roots
mediawiki::jobrunner::runners_basic: 20
-mediawiki::jobrunner::runners_gwt: 1
+mediawiki::jobrunner::runners_gwt: 0
mediawiki::jobrunner::runners_restbase: 3
mediawiki::jobrunner::runners_translate: 1
mediawiki::jobrunner::runners_upload: 7
diff --git a/modules/mediawiki/manifests/jobrunner.pp
b/modules/mediawiki/manifests/jobrunner.pp
index 3513bb5..dc156d5 100644
--- a/modules/mediawiki/manifests/jobrunner.pp
+++ b/modules/mediawiki/manifests/jobrunner.pp
@@ -107,11 +107,14 @@
content => template('mediawiki/jobrunner/site.conf.erb')
}
- # Hack for T122069: Once a day, check the uptime of HHVM. If HHVM has
- # been running for more than a day, restart it.
- cron { 'periodic_hhvm_restart':
- command => '/bin/ps -C hhvm -o etime= | /bin/grep -q - &&
/sbin/initctl restart hhvm >/dev/null 2>/dev/null',
- hour => fqdn_rand(23, 'periodic_hhvm_restart'),
+ # Hack for T122069: on servers running GWT jobs, restart HHVM
+ # once it occupies more than 60% of the available memory
+ if ($runners_gwt > 0) {
+ cron { 'periodic_hhvm_restart':
+ command => '/bin/ps -C hhvm -o pmem= | awk \'{sum+=$1} END {
if (sum > 60.0) print "restart" }\' | grep -q restart && /usr/sbin/service
hhvm restart >/dev/null 2>/dev/null',
+ minute => 0,
+ }
+
}
}
diff --git a/modules/mediawiki/templates/jobrunner/jobrunner.conf.erb
b/modules/mediawiki/templates/jobrunner/jobrunner.conf.erb
index 4f60685..365fd6b 100644
--- a/modules/mediawiki/templates/jobrunner/jobrunner.conf.erb
+++ b/modules/mediawiki/templates/jobrunner/jobrunner.conf.erb
@@ -26,7 +26,7 @@
]
},
"gwt": {
- "runners": <%= @hostname.split('.').first.slice(-1).to_i % 2 == 1
? @runners_gwt : 0 %>,
+ "runners": <%= @runners_gwt %>,
"include": [
"gwtoolsetUploadMetadataJob",
"gwtoolsetUploadMediafileJob",
--
To view, visit https://gerrit.wikimedia.org/r/264055
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I3ff20b324ee98e93a926e0aa1ed568cc9343ae90
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Giuseppe Lavagetto <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits