Volans has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/399161 )
Change subject: wmf-auto-reimage: improve resume capabilities
......................................................................
wmf-auto-reimage: improve resume capabilities
* If the reimage has issues after the debian-installer, it's useful to
be able to resume it with the --no-pxe option, but there are still
some manual steps, depending on the status of the host's Puppet
certificate.
* Improve the resume capability when --no-pxe is set to auto-detect the
status of the Puppet certificate and automatically generate and sign
it if missing.
* Increased the timeout for the reboots to 1 hour to have more room to
manually fix any issue in the reboot process.
Bug: T182702
Change-Id: I41f92341ea9650c1a330492a8211d21b2a347978
---
M modules/profile/files/cumin/wmf_auto_reimage.py
M modules/profile/files/cumin/wmf_auto_reimage_host.py
M modules/profile/files/cumin/wmf_auto_reimage_lib.py
3 files changed, 51 insertions(+), 22 deletions(-)
Approvals:
jenkins-bot: Verified
Volans: Looks good to me, approved
diff --git a/modules/profile/files/cumin/wmf_auto_reimage.py b/modules/profile/files/cumin/wmf_auto_reimage.py
index ea2338d..70b9d95 100644
--- a/modules/profile/files/cumin/wmf_auto_reimage.py
+++ b/modules/profile/files/cumin/wmf_auto_reimage.py
@@ -136,7 +136,7 @@
# Validate hosts
if not args.new:
- lib.validate_hosts(args.hosts, args.no_verify)
+ lib.validate_hosts(args.hosts, no_raise=args.no_verify)
# Update the Phabricator task
if args.phab_task_id is not None:
diff --git a/modules/profile/files/cumin/wmf_auto_reimage_host.py b/modules/profile/files/cumin/wmf_auto_reimage_host.py
index 6cff051..a6373e6 100644
--- a/modules/profile/files/cumin/wmf_auto_reimage_host.py
+++ b/modules/profile/files/cumin/wmf_auto_reimage_host.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python
-"""Automated reimaging of a list of hosts."""
+"""Automated reimaging of a single host."""
import argparse
import logging
@@ -97,8 +97,8 @@
rename_from = None # In case of host rename, hold the previous hostname
# Validate hosts have a signed Puppet certificate
- if not args.new and not args.no_verify:
- lib.validate_hosts([args.host], args.no_verify)
+ if not args.new:
+ lib.validate_hosts([args.host], no_raise=args.no_verify)
# Set Icinga downtime
if not args.new and not args.no_downtime:
@@ -112,6 +112,10 @@
if args.no_pxe:
lib.print_line('Skipping PXE reboot', host=args.host)
+ if (not lib.validate_hosts([args.host], no_raise=True) and
+ lib.puppet_check_cert_to_sign(args.host) == 1):
+ # There is no signed or pending signing certificate for the host
+ lib.puppet_generate_cert(args.host)
else:
lib.puppet_remove_host(args.host) # Cleanup Puppet
diff --git a/modules/profile/files/cumin/wmf_auto_reimage_lib.py b/modules/profile/files/cumin/wmf_auto_reimage_lib.py
index 32d131c..8b8ef9a 100644
--- a/modules/profile/files/cumin/wmf_auto_reimage_lib.py
+++ b/modules/profile/files/cumin/wmf_auto_reimage_lib.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python
-"""Automated reimaging of a list of hosts."""
+"""Library for the wmf-auto-reimage and wmf-auto-reimage-host scripts."""
from __future__ import print_function
import argparse
@@ -487,6 +487,7 @@
if no_raise:
logger.warning(message)
+ return False
else:
raise RuntimeError(message)
else:
@@ -494,6 +495,8 @@
print_line('Validated host', host=host)
else:
print_line('Validated hosts: {hosts}'.format(hosts=hosts))
+
+ return True
def icinga_downtime(host, user, phab_task):
@@ -560,13 +563,43 @@
print_line('{message} on hosts: {hosts}'.format(message=message,
hosts=hosts))
+def puppet_check_cert_to_sign(host):
+ """Check if on the puppetmaster there is a new certificate to sign for the given host.
+
+ Return 0 if there is a pending certificate to be signed, 1 if there isn't and 2 if the
+ certificate is already signed.
+
+ Arguments:
+ host -- the host to check for a certificate pending signing.
+ """
+ command = "puppet cert list '{host}' 2> /dev/null".format(host=host)
+ puppetmaster_host = get_puppet_ca_master()
+
+ try:
+ exit_code, worker = run_cumin(
+ 'puppet_check_cert_to_sign', puppetmaster_host, [command])
+ except RuntimeError:
+ return 1
+
+ for _, output in worker.get_results():
+ if host in output.message():
+ break
+
+ if output.message().startswith(' "{host}"'.format(host=host)):
+ return 0
+ elif output.message().startswith('+ "{host}"'.format(host=host)):
+ print_line('Puppet cert already signed', host=host)
+ return 2
+ else:
+ raise RuntimeError('Unable to find cert to sign')
+
+
def puppet_wait_cert_and_sign(host):
"""Poll the puppetmaster looking for a new key to sign for the given host.
Arguments:
host -- the host to monitor for a complete Puppet run
"""
- wait_command = "puppet cert list '{host}' 2> /dev/null".format(host=host)
sign_command = "puppet cert -s '{host}'".format(host=host)
puppetmaster_host = get_puppet_ca_master()
start = datetime.utcnow()
@@ -581,28 +614,20 @@
print_line('Still waiting for Puppet cert to sign after {min} minutes'.format(
min=(retries * WATCHER_LONG_SLEEP) // 60.0), host=host)
- try:
- exit_code, worker = run_cumin(
- 'puppet_wait_cert_and_sign', puppetmaster_host, [wait_command])
- except RuntimeError:
+ check_cert = puppet_check_cert_to_sign(host)
+ if check_cert == 0: # Found Puppet cert to sign
+ break
+ elif check_cert == 1: # Puppet cert to sign still missing
if (datetime.utcnow() - start).total_seconds() > timeout:
logger.error('Timeout reached')
raise RuntimeError('Timeout reached')
time.sleep(WATCHER_LONG_SLEEP)
continue
-
- for _, output in worker.get_results():
- if host in output.message():
- break
-
- if output.message().startswith(' "{host}"'.format(host=host)):
- break
- elif output.message().startswith('+ "{host}"'.format(host=host)):
- print_line('Puppet cert already signed', host=host)
+ elif check_cert == 2: # Puppet cert already signed
return False
- else:
- raise RuntimeError('Unable to find cert to sign')
+ else: # Should never happen
+ raise RuntimeError('Unable to check Puppet certificate status on puppetmaster')
run_cumin('puppet_wait_cert_and_sign', puppetmaster_host, [sign_command])
print_line('Signed Puppet cert', host=host)
@@ -755,7 +780,7 @@
if start is None:
start = datetime.utcnow()
check_start = datetime.utcnow()
- timeout = 1800 # 30 minutes
+ timeout = 3600 # 1 hour
retries = 0
while True:
--
To view, visit https://gerrit.wikimedia.org/r/399161
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I41f92341ea9650c1a330492a8211d21b2a347978
Gerrit-PatchSet: 3
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Volans <[email protected]>
Gerrit-Reviewer: Elukey <[email protected]>
Gerrit-Reviewer: Giuseppe Lavagetto <[email protected]>
Gerrit-Reviewer: Marostegui <[email protected]>
Gerrit-Reviewer: Muehlenhoff <[email protected]>
Gerrit-Reviewer: Volans <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits