Volans has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/349737 )

Change subject: Mediawiki: refactor stop/start maintenance
......................................................................

Mediawiki: refactor stop/start maintenance

- move common calls for jobrunners, videoscalers and cronjobs to the
  mediawiki library
- re-enable and run puppet also in dc_from, where it was disabled too
- add verification in t09_start_maintenance that all the services are
  still stopped in dc_from after the puppet run

Bug: T163372
Change-Id: I4fc3a9632fcc6cd3737188e29c10a9bb30708907
---
M switchdc/lib/mediawiki.py
M switchdc/stages/t01_stop_maintenance.py
M switchdc/stages/t09_start_maintenance.py
3 files changed, 99 insertions(+), 53 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/switchdc 
refs/changes/37/349737/1

diff --git a/switchdc/lib/mediawiki.py b/switchdc/lib/mediawiki.py
index 791f4a2..4d26489 100644
--- a/switchdc/lib/mediawiki.py
+++ b/switchdc/lib/mediawiki.py
@@ -3,7 +3,7 @@
 import requests
 
 from switchdc.log import logger
-from switchdc.lib.remote import Remote
+from switchdc.lib.remote import Remote, RemoteExecutionError
 
 
 def check_config_line(filename, expected):
@@ -42,3 +42,82 @@
     command = 'su - {user} -c \'scap sync-file --force 
wmf-config/{filename}.php "{message}"\''.format(
         user=os.getlogin(), filename=filename, message=message)
     remote.sync(command)
+
+
+def jobrunners(dc, stop=False, verify_stopped=False):
+    """Manage and verify the MediaWiki jobrunners.
+
+    Arguments:
+    dc             -- the name of the datacenter to filter for.
+    stop           -- whether to stop the jobrunners (True) or left them 
untouched (False).
+    verify_stopped -- whether to verify that the jobrunners are running 
(False) or are stopped (True).
+    """
+    remote = Remote(site=dc)
+    remote.select('R:class = role::mediawiki::jobrunner')
+
+    if stop:
+        logger.info('Stopping jobrunners in {dc}'.format(dc=dc))
+        # We wait for all jobs on HHVM on jobrunners to finish before 
proceeding
+        remote.async('service jobrunner stop', 'service jobchron stop',
+                     'while [ "$(hhvmadm /check-load)" -gt 1 ]; do sleep 1; 
done')
+
+    prefix = ''
+    if verify_stopped:
+        prefix = '! '
+
+    remote.async('{prefix}service jobrunner status > 
/dev/null'.format(prefix=prefix),
+                 '{prefix}service jobchron status > 
/dev/null'.format(prefix=prefix), is_safe=True)
+
+
+def videoscalers(dc, stop=False, verify_stopped=False):
+    """Manage and verify the MediaWiki videoscalers.
+
+    Arguments:
+    dc             -- the name of the datacenter to filter for.
+    stop           -- whether to stop the videoscalers (True) or left them 
untouched (False).
+    verify_stopped -- whether to verify that the videoscalers are running 
(False) or are stopped (True).
+    """
+    remote = Remote(site=dc)
+    remote.select('R:class = role::mediawiki::videoscaler')
+
+    if stop:
+        logger.info('Stopping videoscalers in {dc}'.format(dc=dc))
+        # On videoscalers we are forced to restart HHVM without waiting as 
transcodes can take a long time
+        remote.async('stop jobrunner || exit 0', 'stop jobchron || exit 0', 
'restart hhvm')
+
+    option = ''
+    if verify_stopped:
+        option = 'v'
+
+    remote.async('status jobrunner | grep -q{option} 
running'.format(option=option),
+                 'status jobchron | grep -q{option} 
running'.format(option=option), is_safe=True)
+
+
+def cronjobs(dc, stop=False, verify_stopped=False):
+    """Manage and verify the MediaWiki cronjobs.
+
+    Arguments:
+    dc             -- the name of the datacenter to filter for.
+    stop           -- whether to stop the cronjobs (True) or left them 
untouched (False).
+    verify_stopped -- whether to verify that the cronjobs are running (False) 
or are stopped (True).
+    """
+    remote = Remote(site=dc)
+    remote.select('R:class = role::mediawiki::maintenance')
+
+    if stop:
+        logger.info('Disabling MediaWiki cronjobs in {dc}'.format(dc=dc))
+        remote.async('crontab -u www-data -r', 'killall -r php', 'sleep 5', 
'killall -9 -r php')
+
+    option = ''
+    if verify_stopped:
+        option = '-z '
+
+    remote.sync('test {option}"$(crontab -u www-data -l | sed -r 
\'/^(#|$)/d\')"'.format(option=option), is_safe=True)
+
+    if verify_stopped:
+        # We just log an error, don't actually report a failure to the system. 
We can live with this.
+        try:
+            remote.sync('pgrep -c php', is_safe=True)
+            logger.error('Stray php processes still present on the maintenance 
host, please check')
+        except RemoteExecutionError:
+            pass
diff --git a/switchdc/stages/t01_stop_maintenance.py 
b/switchdc/stages/t01_stop_maintenance.py
index 6da2677..5e65501 100644
--- a/switchdc/stages/t01_stop_maintenance.py
+++ b/switchdc/stages/t01_stop_maintenance.py
@@ -1,40 +1,11 @@
-from switchdc.lib.remote import Remote, RemoteExecutionError
-from switchdc.log import logger
+from switchdc.lib import mediawiki
 
-__title__ = 'Stop MediaWiki jobrunners, videoscalers and maintenance in 
{dc_from}'
+__title__ = 'Stop MediaWiki jobrunners, videoscalers and cronjobs in {dc_from}'
 
 
 def execute(dc_from, dc_to):
-    """Sets mediawiki-maintenance offline, stopping jobrunners and cronjobs."""
-    # Note: the two steps here could be run in parallel; split the file here if
-    # deemed necessary. Since this is pre-read-only, I didn't think it would be
-    # an issue
+    """Sets mediawiki-maintenance offline, stopping jobrunners, videoscalers 
and cronjobs."""
 
-    # 1: Stop the jobrunners in dc_from
-    remote = Remote(site=dc_from)
-    logger.info('Stopping jobrunners in {dc}'.format(dc=dc_from))
-    remote.select('R:class = role::mediawiki::jobrunner')
-    # We wait for all jobs on HHVM on jobrunners to finish before proceeding
-    remote.async('service jobrunner stop', 'service jobchron stop',
-                 'while [ "$(hhvmadm /check-load)" -gt 1 ]; do sleep 1; done')
-    remote.async('! service jobrunner status > /dev/null', '! service jobchron 
status > /dev/null',  is_safe=True)
-
-    remote.select('R:class = role::mediawiki::videoscaler')
-    # On videoscalers we are forced to restart HHVM as transcodes can take a 
long time
-    remote.async('stop jobrunner || exit 0', 'stop jobchron || exit 0', 
'restart hhvm')
-    remote.async('status jobrunner | grep -qv running', 'status jobchron | 
grep -qv running')
-
-    # 2: disable and kill cronjobs
-    logger.info('Disabling MediaWiki cronjobs in {dc}'.format(dc=dc_from))
-    remote.select('R:class = role::mediawiki::maintenance')
-    remote.async('crontab -u www-data -r', 'killall -r php', 'sleep 5', 
'killall -9 -r php')
-
-    # Verify that the crontab has no entries
-    remote.sync('test -z "$(crontab -u www-data -l | sed -r  \'/^(#|$)/d\')"', 
is_safe=True)
-
-    # We just log an error, don't actually report a failure to the system. We 
can live with this.
-    try:
-        remote.sync('pgrep -c php', is_safe=True)
-        logger.error('Stray php processes still present on the maintenance 
host, please check')
-    except RemoteExecutionError:
-        pass
+    mediawiki.jobrunners(dc_from, stop=True, verify_stopped=True)
+    mediawiki.videoscalers(dc_from, stop=True, verify_stopped=True)
+    mediawiki.cronjobs(dc_from, stop=True, verify_stopped=True)
diff --git a/switchdc/stages/t09_start_maintenance.py 
b/switchdc/stages/t09_start_maintenance.py
index 045ccd8..33403b5 100644
--- a/switchdc/stages/t09_start_maintenance.py
+++ b/switchdc/stages/t09_start_maintenance.py
@@ -1,28 +1,24 @@
 from switchdc import get_reason
+from switchdc.lib import mediawiki
 from switchdc.lib.remote import Remote
 
 __title__ = 'Start MediaWiki jobrunners, videoscalers and maintenance in 
{dc_to}'
 
 
 def execute(dc_from, dc_to):
-    """Sets mediawiki-maintenance online, starting jobrunners and cronjobs."""
-    remote = Remote(site=dc_to)
-
-    # 1: Run puppet on all jobrunner and maintenace machines in dc_to
-    jobrunners = Remote.query('R:class = role::mediawiki::jobrunner')
-    videoscalers = Remote.query('R:class = role::mediawiki::videoscaler')
-    maintenance = Remote.query('R:class = role::mediawiki::maintenance')
-    all_jobs = videoscalers | jobrunners | maintenance
-    remote.select(all_jobs)
+    """Sets mediawiki-maintenance online, starting jobrunners, videoscalers 
and cronjobs."""
+    # Enable and run puppet on the hosts where it was disabled
+    remote = Remote()
+    remote.select('R:class = profile::mediawiki::jobrunner or R:class = 
role::mediawiki::maintenance')
     command = 'run-puppet-agent --enable 
"{message}"'.format(message=get_reason())
     remote.async(command, batch_size=30)
 
-    # Verify
-    remote.select(jobrunners)
-    remote.async('service jobrunner status > /dev/null', 'service jobchron 
status > /dev/null', is_safe=True)
+    # Verify all services are started in dc_to
+    mediawiki.jobrunners(dc_to)
+    mediawiki.videoscalers(dc_to)
+    mediawiki.cronjobs(dc_to)
 
-    remote.select(videoscalers)
-    remote.async('status jobrunner | grep -q running', 'status jobchron | grep 
-q running')
-    # Verify that the crontab has entries
-    remote.select(maintenance)
-    remote.sync('test "$(crontab -u www-data -l | sed -r \'/^(#|$)/d\')"', 
is_safe=True)
+    # Verify all services are still stopped in dc_from
+    mediawiki.jobrunners(dc_from, verify_stopped=True)
+    mediawiki.videoscalers(dc_from, verify_stopped=True)
+    mediawiki.cronjobs(dc_from, verify_stopped=True)

-- 
To view, visit https://gerrit.wikimedia.org/r/349737
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I4fc3a9632fcc6cd3737188e29c10a9bb30708907
Gerrit-PatchSet: 1
Gerrit-Project: operations/switchdc
Gerrit-Branch: master
Gerrit-Owner: Volans <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to