Volans has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/379518 )

Change subject: wmf-auto-reimage: small improvements
......................................................................

wmf-auto-reimage: small improvements

* add safety limit for multiple execution, at most 3 in parallel or 5 in
  sequence. It can be overridden by the --force parameter.
* Use line-buffered output for the cumin log
* use ArgumentParser.error instead of raising ValueError
* Add slightly more logging in the console while waiting for the Debian
  installer and the first Puppet run.
* Increased long timeouts to 2 hours

Bug: T148814
Change-Id: I465735896411053065aaec431f3bf09c9e1bb888
---
M modules/profile/files/cumin/wmf_auto_reimage.py
M modules/profile/files/cumin/wmf_auto_reimage_host.py
M modules/profile/files/cumin/wmf_auto_reimage_lib.py
3 files changed, 29 insertions(+), 19 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/18/379518/1

diff --git a/modules/profile/files/cumin/wmf_auto_reimage.py 
b/modules/profile/files/cumin/wmf_auto_reimage.py
index ce7f6c5..a137551 100644
--- a/modules/profile/files/cumin/wmf_auto_reimage.py
+++ b/modules/profile/files/cumin/wmf_auto_reimage.py
@@ -31,32 +31,41 @@
         help=('amount of seconds to sleep between one reimage and the next 
when --sequential '
               'is set. Has no effect if --sequential is not set. [default: 
0]'))
     parser.add_argument(
+        '--force', action='store_true',
+        help='override the default limit of hosts that can be reimaged: 3 in 
parallel, 5 in sequence.')
+    parser.add_argument(
         'hosts', metavar='HOST', nargs='+', action='store',
         help='FQDN of the host(s) to be reimaged')
 
     args = parser.parse_args()
 
+    # Safety limits
+    if not args.force:
+        if args.sequential and len(args.hosts) > 5:
+            parser.error('More than 5 sequential hosts specified and --force 
not set')
+        elif len(args.hosts) > 3:
+            parser.error(("More than 3 parallel hosts specified and --force 
not set. Before using "
+                          "the --force parameter, ensure that there aren't too 
many hosts in the "
+                          "same rack."))
+
     # Perform a quick sanity check on the hosts
     for host in args.hosts:
         if '.' not in host or not lib.HOSTS_PATTERN.match(host):
-            raise ValueError("Expected FQDN of hosts, got '{host}'".format(
-                host=host))
+            parser.error("Expected FQDN of hosts, got 
'{host}'".format(host=host))
 
         if not lib.is_hostname_valid(host):
-            raise ValueError(
-                "Unable to resolve host '{host}'".format(host=host))
+            parser.error("Unable to resolve host '{host}'".format(host=host))
 
     # Ensure there are no duplicates in the hosts list
     duplicates = {host for host in args.hosts if args.hosts.count(host) > 1}
     if len(duplicates) > 0:
-        raise ValueError("Duplicate hosts detected: {dup}".format(
-            dup=duplicates))
+        parser.error("Duplicate hosts detected: {dup}".format(dup=duplicates))
 
     # Ensure Phab task is properly formatted
     if (args.phab_task_id is not None and
             lib.PHAB_TASK_PATTERN.search(args.phab_task_id) is None):
-        raise ValueError(("Invalid Phabricator task ID '{task}', expected in "
-                          "the form T12345").format(task=args.phab_task_id))
+        parser.error(("Invalid Phabricator task ID '{task}', expected in "
+                      "the form T12345").format(task=args.phab_task_id))
 
     return args
 
diff --git a/modules/profile/files/cumin/wmf_auto_reimage_host.py 
b/modules/profile/files/cumin/wmf_auto_reimage_host.py
index 7ce9a7e..0052c8b 100644
--- a/modules/profile/files/cumin/wmf_auto_reimage_host.py
+++ b/modules/profile/files/cumin/wmf_auto_reimage_host.py
@@ -177,7 +177,7 @@
 
     try:
         # This is needed due to a bug in tqdm and a limitation in Cumin
-        with open(cumin_output_path, 'w') as cumin_output:
+        with open(cumin_output_path, 'w', 1) as cumin_output:
             stderr = sys.stderr
             stdout = sys.stdout
             sys.stderr = cumin_output
diff --git a/modules/profile/files/cumin/wmf_auto_reimage_lib.py 
b/modules/profile/files/cumin/wmf_auto_reimage_lib.py
index 0a50f74..61cb365 100644
--- a/modules/profile/files/cumin/wmf_auto_reimage_lib.py
+++ b/modules/profile/files/cumin/wmf_auto_reimage_lib.py
@@ -122,7 +122,7 @@
         help='do not set the host in downtime on Icinga. Included if --new is 
set.')
     parser.add_argument(
         '--no-pxe', action='store_true',
-        help=('do not reboot into PXE and reimage. To be used when the reimage 
has issue and was'
+        help=('do not reboot into PXE and reimage. To be used when the reimage 
had issues and was '
               'manually fixed.'))
     parser.add_argument(
         '--new', action='store_true',
@@ -522,15 +522,16 @@
     sign_command = "puppet cert -s '{host}'".format(host=host)
     puppetmaster_host = resolve_dns(PUPPET_DOMAIN, 'CNAME')
     start = datetime.now()
-    timeout = 3600  # 1 hour
+    timeout = 7200  # 2 hours
     retries = 0
 
+    print_line('Polling until a Puppet sign request appears', host=host)
     while True:
         retries += 1
         logger.debug('Waiting for Puppet cert to sign 
({retries})'.format(retries=retries))
         if retries % WATCHER_LOG_LOOPS == 0:
-            logger.info('Still waiting for Puppet cert to sign after {min} 
minutes'.format(
-                min=(retries * WATCHER_LONG_SLEEP) / 60.0))
+            print_line('Still waiting for Puppet cert to sign after {min} 
minutes'.format(
+                min=(retries * WATCHER_LONG_SLEEP) / 60.0), host=host)
 
         try:
             exit_code, worker = run_cumin(
@@ -574,7 +575,7 @@
                  '--ignorecache --no-usecacheonfailure')]
 
     print_line('Started first puppet run (sit back, relax, and enjoy the 
wait)', host=host)
-    run_cumin('puppet_first_run', host, commands, timeout=3600, installer=True)
+    run_cumin('puppet_first_run', host, commands, timeout=7200, installer=True)
     print_line('First Puppet run completed', host=host)
 
 
@@ -645,7 +646,7 @@
     if start is None:
         start = datetime.now()
 
-    timeout = 3600  # 1 hour
+    timeout = 7200  # 2 hours
     retries = 0
     command = ("source /usr/local/share/bash/puppet-common.sh && 
last_run_success && "
                "grep last_run \"${PUPPET_SUMMARY}\" | awk '{ print $2 }'")
@@ -654,8 +655,8 @@
         retries += 1
         logger.debug('Waiting for Puppet ({retries})'.format(retries=retries))
         if retries % WATCHER_LOG_LOOPS == 0:
-            logger.info('Still waiting for Puppet after {min} minutes'.format(
-                min=(retries * WATCHER_LONG_SLEEP) / 60.0))
+            print_line('Still waiting for Puppet after {min} minutes'.format(
+                min=(retries * WATCHER_LONG_SLEEP) / 60.0), host=host)
 
         try:
             exit_code, worker = run_cumin('wait_puppet_run', host, [command])
@@ -717,8 +718,8 @@
         retries += 1
         logger.debug('Waiting for reboot ({retries})'.format(retries=retries))
         if retries % WATCHER_LOG_LOOPS == 0:
-            logger.info('Still waiting for reboot after {min} minutes'.format(
-                min=(retries * WATCHER_LONG_SLEEP) / 60.0))
+            print_line('Still waiting for reboot after {min} minutes'.format(
+                min=(retries * WATCHER_LONG_SLEEP) / 60.0), host=host)
 
         try:
             check_uptime(host, maximum=(datetime.now() - 
start).total_seconds(),

-- 
To view, visit https://gerrit.wikimedia.org/r/379518
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I465735896411053065aaec431f3bf09c9e1bb888
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Volans <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to