Volans has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/349737 )
Change subject: Mediawiki: refactor stop/start maintenance
......................................................................
Mediawiki: refactor stop/start maintenance
- move common calls for jobrunners, videoscalers and cronjobs to the
mediawiki library
- re-enable and run puppet also in dc_from, where it was disabled too
- add verification in t09_start_maintenance that all the services are
still stopped in dc_from after the puppet run
Bug: T163372
Change-Id: I4fc3a9632fcc6cd3737188e29c10a9bb30708907
---
M switchdc/lib/mediawiki.py
M switchdc/stages/t01_stop_maintenance.py
M switchdc/stages/t09_start_maintenance.py
3 files changed, 99 insertions(+), 53 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/switchdc
refs/changes/37/349737/1
diff --git a/switchdc/lib/mediawiki.py b/switchdc/lib/mediawiki.py
index 791f4a2..4d26489 100644
--- a/switchdc/lib/mediawiki.py
+++ b/switchdc/lib/mediawiki.py
@@ -3,7 +3,7 @@
import requests
from switchdc.log import logger
-from switchdc.lib.remote import Remote
+from switchdc.lib.remote import Remote, RemoteExecutionError
def check_config_line(filename, expected):
@@ -42,3 +42,82 @@
command = 'su - {user} -c \'scap sync-file --force
wmf-config/{filename}.php "{message}"\''.format(
user=os.getlogin(), filename=filename, message=message)
remote.sync(command)
+
+
+def jobrunners(dc, stop=False, verify_stopped=False):
+ """Manage and verify the MediaWiki jobrunners.
+
+ Arguments:
+ dc -- the name of the datacenter to filter for.
+ stop -- whether to stop the jobrunners (True) or leave them
untouched (False).
+ verify_stopped -- whether to verify that the jobrunners are running
(False) or are stopped (True).
+ """
+ remote = Remote(site=dc)
+ remote.select('R:class = role::mediawiki::jobrunner')
+
+ if stop:
+ logger.info('Stopping jobrunners in {dc}'.format(dc=dc))
+ # We wait for all jobs on HHVM on jobrunners to finish before
proceeding
+ remote.async('service jobrunner stop', 'service jobchron stop',
+ 'while [ "$(hhvmadm /check-load)" -gt 1 ]; do sleep 1;
done')
+
+ prefix = ''
+ if verify_stopped:
+ prefix = '! '
+
+ remote.async('{prefix}service jobrunner status >
/dev/null'.format(prefix=prefix),
+ '{prefix}service jobchron status >
/dev/null'.format(prefix=prefix), is_safe=True)
+
+
+def videoscalers(dc, stop=False, verify_stopped=False):
+ """Manage and verify the MediaWiki videoscalers.
+
+ Arguments:
+ dc -- the name of the datacenter to filter for.
+ stop -- whether to stop the videoscalers (True) or leave them
untouched (False).
+ verify_stopped -- whether to verify that the videoscalers are running
(False) or are stopped (True).
+ """
+ remote = Remote(site=dc)
+ remote.select('R:class = role::mediawiki::videoscaler')
+
+ if stop:
+ logger.info('Stopping videoscalers in {dc}'.format(dc=dc))
+ # On videoscalers we are forced to restart HHVM without waiting as
transcodes can take a long time
+ remote.async('stop jobrunner || exit 0', 'stop jobchron || exit 0',
'restart hhvm')
+
+ option = ''
+ if verify_stopped:
+ option = 'v'
+
+ remote.async('status jobrunner | grep -q{option}
running'.format(option=option),
+ 'status jobchron | grep -q{option}
running'.format(option=option), is_safe=True)
+
+
+def cronjobs(dc, stop=False, verify_stopped=False):
+ """Manage and verify the MediaWiki cronjobs.
+
+ Arguments:
+ dc -- the name of the datacenter to filter for.
+ stop -- whether to stop the cronjobs (True) or leave them
untouched (False).
+ verify_stopped -- whether to verify that the cronjobs are running (False)
or are stopped (True).
+ """
+ remote = Remote(site=dc)
+ remote.select('R:class = role::mediawiki::maintenance')
+
+ if stop:
+ logger.info('Disabling MediaWiki cronjobs in {dc}'.format(dc=dc))
+ remote.async('crontab -u www-data -r', 'killall -r php', 'sleep 5',
'killall -9 -r php')
+
+ option = ''
+ if verify_stopped:
+ option = '-z '
+
+ remote.sync('test {option}"$(crontab -u www-data -l | sed -r
\'/^(#|$)/d\')"'.format(option=option), is_safe=True)
+
+ if verify_stopped:
+ # We just log an error, don't actually report a failure to the system.
We can live with this.
+ try:
+ remote.sync('pgrep -c php', is_safe=True)
+ logger.error('Stray php processes still present on the maintenance
host, please check')
+ except RemoteExecutionError:
+ pass
diff --git a/switchdc/stages/t01_stop_maintenance.py
b/switchdc/stages/t01_stop_maintenance.py
index 6da2677..5e65501 100644
--- a/switchdc/stages/t01_stop_maintenance.py
+++ b/switchdc/stages/t01_stop_maintenance.py
@@ -1,40 +1,11 @@
-from switchdc.lib.remote import Remote, RemoteExecutionError
-from switchdc.log import logger
+from switchdc.lib import mediawiki
-__title__ = 'Stop MediaWiki jobrunners, videoscalers and maintenance in
{dc_from}'
+__title__ = 'Stop MediaWiki jobrunners, videoscalers and cronjobs in {dc_from}'
def execute(dc_from, dc_to):
- """Sets mediawiki-maintenance offline, stopping jobrunners and cronjobs."""
- # Note: the two steps here could be run in parallel; split the file here if
- # deemed necessary. Since this is pre-read-only, I didn't think it would be
- # an issue
+ """Sets mediawiki-maintenance offline, stopping jobrunners, videoscalers
and cronjobs."""
- # 1: Stop the jobrunners in dc_from
- remote = Remote(site=dc_from)
- logger.info('Stopping jobrunners in {dc}'.format(dc=dc_from))
- remote.select('R:class = role::mediawiki::jobrunner')
- # We wait for all jobs on HHVM on jobrunners to finish before proceeding
- remote.async('service jobrunner stop', 'service jobchron stop',
- 'while [ "$(hhvmadm /check-load)" -gt 1 ]; do sleep 1; done')
- remote.async('! service jobrunner status > /dev/null', '! service jobchron
status > /dev/null', is_safe=True)
-
- remote.select('R:class = role::mediawiki::videoscaler')
- # On videoscalers we are forced to restart HHVM as transcodes can take a
long time
- remote.async('stop jobrunner || exit 0', 'stop jobchron || exit 0',
'restart hhvm')
- remote.async('status jobrunner | grep -qv running', 'status jobchron |
grep -qv running')
-
- # 2: disable and kill cronjobs
- logger.info('Disabling MediaWiki cronjobs in {dc}'.format(dc=dc_from))
- remote.select('R:class = role::mediawiki::maintenance')
- remote.async('crontab -u www-data -r', 'killall -r php', 'sleep 5',
'killall -9 -r php')
-
- # Verify that the crontab has no entries
- remote.sync('test -z "$(crontab -u www-data -l | sed -r \'/^(#|$)/d\')"',
is_safe=True)
-
- # We just log an error, don't actually report a failure to the system. We
can live with this.
- try:
- remote.sync('pgrep -c php', is_safe=True)
- logger.error('Stray php processes still present on the maintenance
host, please check')
- except RemoteExecutionError:
- pass
+ mediawiki.jobrunners(dc_from, stop=True, verify_stopped=True)
+ mediawiki.videoscalers(dc_from, stop=True, verify_stopped=True)
+ mediawiki.cronjobs(dc_from, stop=True, verify_stopped=True)
diff --git a/switchdc/stages/t09_start_maintenance.py
b/switchdc/stages/t09_start_maintenance.py
index 045ccd8..33403b5 100644
--- a/switchdc/stages/t09_start_maintenance.py
+++ b/switchdc/stages/t09_start_maintenance.py
@@ -1,28 +1,24 @@
from switchdc import get_reason
+from switchdc.lib import mediawiki
from switchdc.lib.remote import Remote
__title__ = 'Start MediaWiki jobrunners, videoscalers and maintenance in
{dc_to}'
def execute(dc_from, dc_to):
- """Sets mediawiki-maintenance online, starting jobrunners and cronjobs."""
- remote = Remote(site=dc_to)
-
- # 1: Run puppet on all jobrunner and maintenace machines in dc_to
- jobrunners = Remote.query('R:class = role::mediawiki::jobrunner')
- videoscalers = Remote.query('R:class = role::mediawiki::videoscaler')
- maintenance = Remote.query('R:class = role::mediawiki::maintenance')
- all_jobs = videoscalers | jobrunners | maintenance
- remote.select(all_jobs)
+ """Sets mediawiki-maintenance online, starting jobrunners, videoscalers
and cronjobs."""
+ # Enable and run puppet on the hosts where it was disabled
+ remote = Remote()
+ remote.select('R:class = profile::mediawiki::jobrunner or R:class =
role::mediawiki::maintenance')
command = 'run-puppet-agent --enable
"{message}"'.format(message=get_reason())
remote.async(command, batch_size=30)
- # Verify
- remote.select(jobrunners)
- remote.async('service jobrunner status > /dev/null', 'service jobchron
status > /dev/null', is_safe=True)
+ # Verify all services are started in dc_to
+ mediawiki.jobrunners(dc_to)
+ mediawiki.videoscalers(dc_to)
+ mediawiki.cronjobs(dc_to)
- remote.select(videoscalers)
- remote.async('status jobrunner | grep -q running', 'status jobchron | grep
-q running')
- # Verify that the crontab has entries
- remote.select(maintenance)
- remote.sync('test "$(crontab -u www-data -l | sed -r \'/^(#|$)/d\')"',
is_safe=True)
+ # Verify all services are still stopped in dc_from
+ mediawiki.jobrunners(dc_from, verify_stopped=True)
+ mediawiki.videoscalers(dc_from, verify_stopped=True)
+ mediawiki.cronjobs(dc_from, verify_stopped=True)
--
To view, visit https://gerrit.wikimedia.org/r/349737
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I4fc3a9632fcc6cd3737188e29c10a9bb30708907
Gerrit-PatchSet: 1
Gerrit-Project: operations/switchdc
Gerrit-Branch: master
Gerrit-Owner: Volans <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits