[MediaWiki-commits] [Gerrit] operations/puppet[production]: Reimage: minor improvements
Volans has submitted this change and it was merged. Change subject: Reimage: minor improvements .. Reimage: minor improvements * moved logging to a directory with one log file per run named with datetime, user and PID * improved logging while waiting for Jobs completion * reduced timeout for normal puppet runs Bug: T143536 Change-Id: Ic1c6a06d8b12473cec6aebdfc8e5eb1a2dc13084 --- M modules/salt/files/wmf_auto_reimage.py M modules/salt/manifests/orchestration.pp 2 files changed, 34 insertions(+), 20 deletions(-) Approvals: Elukey: Looks good to me, but someone else must approve Muehlenhoff: Looks good to me, but someone else must approve Volans: Looks good to me, approved jenkins-bot: Verified diff --git a/modules/salt/files/wmf_auto_reimage.py b/modules/salt/files/wmf_auto_reimage.py index 460d935..57cc621 100644 --- a/modules/salt/files/wmf_auto_reimage.py +++ b/modules/salt/files/wmf_auto_reimage.py @@ -13,7 +13,7 @@ import time from datetime import datetime -from logging.handlers import RotatingFileHandler +from logging.handlers import FileHandler import dns.resolver import salt.client @@ -26,7 +26,7 @@ INTERNAL_TLD = 'wmnet' MANAGEMENT_DOMAIN = 'mgmt' -LOG_PATH = '/var/log/wmf_auto_reimage.log' +LOG_PATTERN = '/var/log/wmf-auto-reimage/{start}_{user}_{pid}.log' # TODO: move it to a dedicated ops-orchestration-bot PHABRICATOR_CONFIG_FILE = '/etc/phabricator_ops-monitoring-bot.conf' @@ -138,21 +138,24 @@ def setup_logging(user): -""" Setup the logger instance +""" Setup the logger instance and return the log file path Arguments: user -- the real user to use in the logging formatter for auditing """ +log_path = LOG_PATTERN.format(start=datetime.now().strftime('%Y%m%d%H%M'), + user=user, pid=os.getpid()) log_formatter = logging.Formatter( fmt=('%(asctime)s [%(levelname)s] ({user}) %(name)s::%(funcName)s: ' '%(message)s').format(user=user), datefmt='%F %T') -log_handler = RotatingFileHandler( -LOG_PATH, maxBytes=5*(1024**2), backupCount=10) +log_handler = FileHandler(log_path) log_handler.setFormatter(log_formatter) logger.addHandler(log_handler) logger.raiseExceptions = False logger.setLevel(logging.INFO) + +return log_path def get_mgmt(host): @@ -430,7 +433,8 @@ sleep = WATCHER_SHORT_SLEEP log_loops = 0 while True: -logger.debug('Watching for jobs...') +logger.debug('Watching for jobs: {jobs}'.format( +jobs=(running - completed))) log_loops += 1 if log_loops == WATCHER_SLEEP_THRESHOLD: @@ -478,11 +482,13 @@ if log_loops == WATCHER_LOG_LOOPS and sleep == WATCHER_LONG_SLEEP: log_loops = 0 -logger.info('Job completion progress: {done}/{total}'.format( -done=len(completed), total=len(running))) +logger.info('Job done ({done}/{total}), waiting for {jobs}'.format( +done=len(completed), total=len(running), +jobs=(running - completed))) if timeout > 0 and (datetime.now() - start).total_seconds() > timeout: -logger.warning('Timeout reached') +logger.warning('Timeout reached for jobs: {jobs}'.format( +jobs=(running - completed))) raise StopIteration() time.sleep(sleep) @@ -618,7 +624,7 @@ """ success_hosts = [] -for result in run_command_on_hosts(hosts, 'wmfpuppet.run', timeout=1800): +for result in run_command_on_hosts(hosts, 'wmfpuppet.run', timeout=300): if result['success'] and result['return']['retcode'] == 0: success_hosts.append(result['id']) @@ -933,16 +939,14 @@ return message -def run(args, user): +def run(args, user, log_path): """ Run the WMF auto reimage according to command line arguments Arguments: -args -- parsed command line arguments -user -- the user that launched the script, for auditing purposes +args -- parsed command line arguments +user -- the user that launched the script, for auditing purposes +log_path -- the path of the logfile """ -print('START') -print('To monitor the full log:\ntail -F {log}'.format(log=LOG_PATH)) - # Get additional informations ipmi_password = get_ipmi_password() custom_mgmts = get_custom_mgmts(args.hosts) @@ -960,7 +964,7 @@ phabricator_task_update( phab_client, args.phab_task_id, PHAB_COMMENT_PRE.format( user=user, hostname=socket.getfqdn(), hosts=hosts, -log=LOG_PATH)) +log=log_path)) # Set downtime on Icinga hosts = icinga_downtime(icinga_host, hosts, user, args.phab_task_id) @@ -1016,7 +1020,6 @@ logger.info(("Auto reimaging of hosts '{hosts}' compl
[MediaWiki-commits] [Gerrit] operations/puppet[production]: Reimage: minor improvements
Volans has uploaded a new change for review. https://gerrit.wikimedia.org/r/311701 Change subject: Reimage: minor improvements .. Reimage: minor improvements * moved logging to a directory with one log file per run named with datetime, user and PID * improved logging while waiting for Jobs completion * reduced timeout for normal puppet runs Bug: T143536 Change-Id: Ic1c6a06d8b12473cec6aebdfc8e5eb1a2dc13084 --- M modules/salt/files/wmf_auto_reimage.py M modules/salt/manifests/orchestration.pp 2 files changed, 34 insertions(+), 20 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/01/311701/1 diff --git a/modules/salt/files/wmf_auto_reimage.py b/modules/salt/files/wmf_auto_reimage.py index 460d935..57cc621 100644 --- a/modules/salt/files/wmf_auto_reimage.py +++ b/modules/salt/files/wmf_auto_reimage.py @@ -13,7 +13,7 @@ import time from datetime import datetime -from logging.handlers import RotatingFileHandler +from logging.handlers import FileHandler import dns.resolver import salt.client @@ -26,7 +26,7 @@ INTERNAL_TLD = 'wmnet' MANAGEMENT_DOMAIN = 'mgmt' -LOG_PATH = '/var/log/wmf_auto_reimage.log' +LOG_PATTERN = '/var/log/wmf-auto-reimage/{start}_{user}_{pid}.log' # TODO: move it to a dedicated ops-orchestration-bot PHABRICATOR_CONFIG_FILE = '/etc/phabricator_ops-monitoring-bot.conf' @@ -138,21 +138,24 @@ def setup_logging(user): -""" Setup the logger instance +""" Setup the logger instance and return the log file path Arguments: user -- the real user to use in the logging formatter for auditing """ +log_path = LOG_PATTERN.format(start=datetime.now().strftime('%Y%m%d%H%M'), + user=user, pid=os.getpid()) log_formatter = logging.Formatter( fmt=('%(asctime)s [%(levelname)s] ({user}) %(name)s::%(funcName)s: ' '%(message)s').format(user=user), datefmt='%F %T') -log_handler = RotatingFileHandler( -LOG_PATH, maxBytes=5*(1024**2), backupCount=10) +log_handler = FileHandler(log_path) log_handler.setFormatter(log_formatter) logger.addHandler(log_handler) logger.raiseExceptions = False logger.setLevel(logging.INFO) + +return log_path def get_mgmt(host): @@ -430,7 +433,8 @@ sleep = WATCHER_SHORT_SLEEP log_loops = 0 while True: -logger.debug('Watching for jobs...') +logger.debug('Watching for jobs: {jobs}'.format( +jobs=(running - completed))) log_loops += 1 if log_loops == WATCHER_SLEEP_THRESHOLD: @@ -478,11 +482,13 @@ if log_loops == WATCHER_LOG_LOOPS and sleep == WATCHER_LONG_SLEEP: log_loops = 0 -logger.info('Job completion progress: {done}/{total}'.format( -done=len(completed), total=len(running))) +logger.info('Job done ({done}/{total}), waiting for {jobs}'.format( +done=len(completed), total=len(running), +jobs=(running - completed))) if timeout > 0 and (datetime.now() - start).total_seconds() > timeout: -logger.warning('Timeout reached') +logger.warning('Timeout reached for jobs: {jobs}'.format( +jobs=(running - completed))) raise StopIteration() time.sleep(sleep) @@ -618,7 +624,7 @@ """ success_hosts = [] -for result in run_command_on_hosts(hosts, 'wmfpuppet.run', timeout=1800): +for result in run_command_on_hosts(hosts, 'wmfpuppet.run', timeout=300): if result['success'] and result['return']['retcode'] == 0: success_hosts.append(result['id']) @@ -933,16 +939,14 @@ return message -def run(args, user): +def run(args, user, log_path): """ Run the WMF auto reimage according to command line arguments Arguments: -args -- parsed command line arguments -user -- the user that launched the script, for auditing purposes +args -- parsed command line arguments +user -- the user that launched the script, for auditing purposes +log_path -- the path of the logfile """ -print('START') -print('To monitor the full log:\ntail -F {log}'.format(log=LOG_PATH)) - # Get additional informations ipmi_password = get_ipmi_password() custom_mgmts = get_custom_mgmts(args.hosts) @@ -960,7 +964,7 @@ phabricator_task_update( phab_client, args.phab_task_id, PHAB_COMMENT_PRE.format( user=user, hostname=socket.getfqdn(), hosts=hosts, -log=LOG_PATH)) +log=log_path)) # Set downtime on Icinga hosts = icinga_downtime(icinga_host, hosts, user, args.phab_task_id) @@ -1016,7 +1020,6 @@ logger.info(("Auto reimaging of hosts '{hosts}' completed, hosts " "'{successful}' were successful.").format