Volans has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/379518 )
Change subject: wmf-auto-reimage: small improvements
......................................................................
wmf-auto-reimage: small improvements
* add a safety limit for multiple executions: at most 3 in parallel or 5 in
sequence. It can be overridden by the --force parameter.
* Use line-buffered output for the cumin log
* use ArgumentParser.error instead of raising ValueError
* Add slightly more logging in the console while waiting for the Debian
installer and the first Puppet run.
* Increased long timeouts to 2 hours
Bug: T148814
Change-Id: I465735896411053065aaec431f3bf09c9e1bb888
---
M modules/profile/files/cumin/wmf_auto_reimage.py
M modules/profile/files/cumin/wmf_auto_reimage_host.py
M modules/profile/files/cumin/wmf_auto_reimage_lib.py
3 files changed, 29 insertions(+), 19 deletions(-)
Approvals:
jenkins-bot: Verified
Volans: Looks good to me, approved
diff --git a/modules/profile/files/cumin/wmf_auto_reimage.py
b/modules/profile/files/cumin/wmf_auto_reimage.py
index ce7f6c5..a137551 100644
--- a/modules/profile/files/cumin/wmf_auto_reimage.py
+++ b/modules/profile/files/cumin/wmf_auto_reimage.py
@@ -31,32 +31,41 @@
help=('amount of seconds to sleep between one reimage and the next
when --sequential '
'is set. Has no effect if --sequential is not set. [default:
0]'))
parser.add_argument(
+ '--force', action='store_true',
+ help='override the default limit of that can be reimaged: 3 in
parallel, 5 in sequence.')
+ parser.add_argument(
'hosts', metavar='HOST', nargs='+', action='store',
help='FQDN of the host(s) to be reimaged')
args = parser.parse_args()
+ # Safety limits
+ if not args.force:
+ if args.sequential and len(args.hosts) > 5:
+ parser.error('More than 5 sequential hosts specified and --force
not set')
+ elif len(args.hosts) > 3:
+ parser.error(("More than 3 parallel hosts specified and --force
not set. Before using "
+ "the --force parameter, ensure that there aren't too
many hosts in the "
+ "same rack."))
+
# Perform a quick sanity check on the hosts
for host in args.hosts:
if '.' not in host or not lib.HOSTS_PATTERN.match(host):
- raise ValueError("Expected FQDN of hosts, got '{host}'".format(
- host=host))
+ parser.error("Expected FQDN of hosts, got
'{host}'".format(host=host))
if not lib.is_hostname_valid(host):
- raise ValueError(
- "Unable to resolve host '{host}'".format(host=host))
+ parser.error("Unable to resolve host '{host}'".format(host=host))
# Ensure there are no duplicates in the hosts list
duplicates = {host for host in args.hosts if args.hosts.count(host) > 1}
if len(duplicates) > 0:
- raise ValueError("Duplicate hosts detected: {dup}".format(
- dup=duplicates))
+ parser.error("Duplicate hosts detected: {dup}".format(dup=duplicates))
# Ensure Phab task is properly formatted
if (args.phab_task_id is not None and
lib.PHAB_TASK_PATTERN.search(args.phab_task_id) is None):
- raise ValueError(("Invalid Phabricator task ID '{task}', expected in "
- "the form T12345").format(task=args.phab_task_id))
+ parser.error(("Invalid Phabricator task ID '{task}', expected in "
+ "the form T12345").format(task=args.phab_task_id))
return args
diff --git a/modules/profile/files/cumin/wmf_auto_reimage_host.py
b/modules/profile/files/cumin/wmf_auto_reimage_host.py
index 7ce9a7e..0052c8b 100644
--- a/modules/profile/files/cumin/wmf_auto_reimage_host.py
+++ b/modules/profile/files/cumin/wmf_auto_reimage_host.py
@@ -177,7 +177,7 @@
try:
# This is needed due to a bug in tqdm and a limitation in Cumin
- with open(cumin_output_path, 'w') as cumin_output:
+ with open(cumin_output_path, 'w', 1) as cumin_output:
stderr = sys.stderr
stdout = sys.stdout
sys.stderr = cumin_output
diff --git a/modules/profile/files/cumin/wmf_auto_reimage_lib.py
b/modules/profile/files/cumin/wmf_auto_reimage_lib.py
index 0a50f74..61cb365 100644
--- a/modules/profile/files/cumin/wmf_auto_reimage_lib.py
+++ b/modules/profile/files/cumin/wmf_auto_reimage_lib.py
@@ -122,7 +122,7 @@
help='do not set the host in downtime on Icinga. Included if --new is
set.')
parser.add_argument(
'--no-pxe', action='store_true',
- help=('do not reboot into PXE and reimage. To be used when the reimage
has issue and was'
+ help=('do not reboot into PXE and reimage. To be used when the reimage
had issues and was '
'manually fixed.'))
parser.add_argument(
'--new', action='store_true',
@@ -522,15 +522,16 @@
sign_command = "puppet cert -s '{host}'".format(host=host)
puppetmaster_host = resolve_dns(PUPPET_DOMAIN, 'CNAME')
start = datetime.now()
- timeout = 3600 # 1 hour
+ timeout = 7200 # 2 hours
retries = 0
+ print_line('Polling until a Puppet sign request appears', host=host)
while True:
retries += 1
logger.debug('Waiting for Puppet cert to sign
({retries})'.format(retries=retries))
if retries % WATCHER_LOG_LOOPS == 0:
- logger.info('Still waiting for Puppet cert to sign after {min}
minutes'.format(
- min=(retries * WATCHER_LONG_SLEEP) / 60.0))
+ print_line('Still waiting for Puppet cert to sign after {min}
minutes'.format(
+ min=(retries * WATCHER_LONG_SLEEP) / 60.0), host=host)
try:
exit_code, worker = run_cumin(
@@ -574,7 +575,7 @@
'--ignorecache --no-usecacheonfailure')]
print_line('Started first puppet run (sit back, relax, and enjoy the
wait)', host=host)
- run_cumin('puppet_first_run', host, commands, timeout=3600, installer=True)
+ run_cumin('puppet_first_run', host, commands, timeout=7200, installer=True)
print_line('First Puppet run completed', host=host)
@@ -645,7 +646,7 @@
if start is None:
start = datetime.now()
- timeout = 3600 # 1 hour
+ timeout = 7200 # 2 hours
retries = 0
command = ("source /usr/local/share/bash/puppet-common.sh &&
last_run_success && "
"grep last_run \"${PUPPET_SUMMARY}\" | awk '{ print $2 }'")
@@ -654,8 +655,8 @@
retries += 1
logger.debug('Waiting for Puppet ({retries})'.format(retries=retries))
if retries % WATCHER_LOG_LOOPS == 0:
- logger.info('Still waiting for Puppet after {min} minutes'.format(
- min=(retries * WATCHER_LONG_SLEEP) / 60.0))
+ print_line('Still waiting for Puppet after {min} minutes'.format(
+ min=(retries * WATCHER_LONG_SLEEP) / 60.0), host=host)
try:
exit_code, worker = run_cumin('wait_puppet_run', host, [command])
@@ -717,8 +718,8 @@
retries += 1
logger.debug('Waiting for reboot ({retries})'.format(retries=retries))
if retries % WATCHER_LOG_LOOPS == 0:
- logger.info('Still waiting for reboot after {min} minutes'.format(
- min=(retries * WATCHER_LONG_SLEEP) / 60.0))
+ print_line('Still waiting for reboot after {min} minutes'.format(
+ min=(retries * WATCHER_LONG_SLEEP) / 60.0), host=host)
try:
check_uptime(host, maximum=(datetime.now() -
start).total_seconds(),
--
To view, visit https://gerrit.wikimedia.org/r/379518
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I465735896411053065aaec431f3bf09c9e1bb888
Gerrit-PatchSet: 2
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Volans <[email protected]>
Gerrit-Reviewer: Elukey <[email protected]>
Gerrit-Reviewer: Giuseppe Lavagetto <[email protected]>
Gerrit-Reviewer: Volans <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits