[MediaWiki-commits] [Gerrit] operations/puppet[production]: Reimage: minor improvements

2016-09-21 Thread Volans (Code Review)
Volans has submitted this change and it was merged.

Change subject: Reimage: minor improvements
......................................................................


Reimage: minor improvements

* moved logging to a directory with one log file per run named with
  datetime, user and PID
* improved logging while waiting for Jobs completion
* reduced timeout for normal puppet runs

Bug: T143536
Change-Id: Ic1c6a06d8b12473cec6aebdfc8e5eb1a2dc13084
---
M modules/salt/files/wmf_auto_reimage.py
M modules/salt/manifests/orchestration.pp
2 files changed, 34 insertions(+), 20 deletions(-)

Approvals:
  Elukey: Looks good to me, but someone else must approve
  Muehlenhoff: Looks good to me, but someone else must approve
  Volans: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/modules/salt/files/wmf_auto_reimage.py b/modules/salt/files/wmf_auto_reimage.py
index 460d935..57cc621 100644
--- a/modules/salt/files/wmf_auto_reimage.py
+++ b/modules/salt/files/wmf_auto_reimage.py
@@ -13,7 +13,7 @@
 import time
 
 from datetime import datetime
-from logging.handlers import RotatingFileHandler
+from logging.handlers import FileHandler
 
 import dns.resolver
 import salt.client
@@ -26,7 +26,7 @@
 INTERNAL_TLD = 'wmnet'
 MANAGEMENT_DOMAIN = 'mgmt'
 
-LOG_PATH = '/var/log/wmf_auto_reimage.log'
+LOG_PATTERN = '/var/log/wmf-auto-reimage/{start}_{user}_{pid}.log'
 # TODO: move it to a dedicated ops-orchestration-bot
 PHABRICATOR_CONFIG_FILE = '/etc/phabricator_ops-monitoring-bot.conf'
 
@@ -138,21 +138,24 @@
 
 
 def setup_logging(user):
-""" Setup the logger instance
+""" Setup the logger instance and return the log file path
 
 Arguments:
 user -- the real user to use in the logging formatter for auditing
 """
+log_path = LOG_PATTERN.format(start=datetime.now().strftime('%Y%m%d%H%M'),
+  user=user, pid=os.getpid())
 log_formatter = logging.Formatter(
 fmt=('%(asctime)s [%(levelname)s] ({user}) %(name)s::%(funcName)s: '
  '%(message)s').format(user=user),
 datefmt='%F %T')
-log_handler = RotatingFileHandler(
-LOG_PATH, maxBytes=5*(1024**2), backupCount=10)
+log_handler = FileHandler(log_path)
 log_handler.setFormatter(log_formatter)
 logger.addHandler(log_handler)
 logger.raiseExceptions = False
 logger.setLevel(logging.INFO)
+
+return log_path
 
 
 def get_mgmt(host):
@@ -430,7 +433,8 @@
 sleep = WATCHER_SHORT_SLEEP
 log_loops = 0
 while True:
-logger.debug('Watching for jobs...')
+logger.debug('Watching for jobs: {jobs}'.format(
+jobs=(running - completed)))
 
 log_loops += 1
 if log_loops == WATCHER_SLEEP_THRESHOLD:
@@ -478,11 +482,13 @@
 
 if log_loops == WATCHER_LOG_LOOPS and sleep == WATCHER_LONG_SLEEP:
 log_loops = 0
-logger.info('Job completion progress: {done}/{total}'.format(
-done=len(completed), total=len(running)))
+logger.info('Job done ({done}/{total}), waiting for {jobs}'.format(
+done=len(completed), total=len(running),
+jobs=(running - completed)))
 
 if timeout > 0 and (datetime.now() - start).total_seconds() > timeout:
-logger.warning('Timeout reached')
+logger.warning('Timeout reached for jobs: {jobs}'.format(
+jobs=(running - completed)))
 raise StopIteration()
 
 time.sleep(sleep)
@@ -618,7 +624,7 @@
 """
 success_hosts = []
 
-for result in run_command_on_hosts(hosts, 'wmfpuppet.run', timeout=1800):
+for result in run_command_on_hosts(hosts, 'wmfpuppet.run', timeout=300):
 if result['success'] and result['return']['retcode'] == 0:
 success_hosts.append(result['id'])
 
@@ -933,16 +939,14 @@
 return message
 
 
-def run(args, user):
+def run(args, user, log_path):
 """ Run the WMF auto reimage according to command line arguments
 
 Arguments:
-args -- parsed command line arguments
-user -- the user that launched the script, for auditing purposes
+args -- parsed command line arguments
+user -- the user that launched the script, for auditing purposes
+log_path -- the path of the logfile
 """
-print('START')
-print('To monitor the full log:\ntail -F {log}'.format(log=LOG_PATH))
-
 # Get additional informations
 ipmi_password = get_ipmi_password()
 custom_mgmts = get_custom_mgmts(args.hosts)
@@ -960,7 +964,7 @@
 phabricator_task_update(
 phab_client, args.phab_task_id, PHAB_COMMENT_PRE.format(
 user=user, hostname=socket.getfqdn(), hosts=hosts,
-log=LOG_PATH))
+log=log_path))
 
 # Set downtime on Icinga
 hosts = icinga_downtime(icinga_host, hosts, user, args.phab_task_id)
@@ -1016,7 +1020,6 @@
 logger.info(("Auto reimaging of hosts '{hosts}' compl

[MediaWiki-commits] [Gerrit] operations/puppet[production]: Reimage: minor improvements

2016-09-20 Thread Volans (Code Review)
Volans has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/311701

Change subject: Reimage: minor improvements
......................................................................

Reimage: minor improvements

* moved logging to a directory with one log file per run named with
  datetime, user and PID
* improved logging while waiting for Jobs completion
* reduced timeout for normal puppet runs

Bug: T143536
Change-Id: Ic1c6a06d8b12473cec6aebdfc8e5eb1a2dc13084
---
M modules/salt/files/wmf_auto_reimage.py
M modules/salt/manifests/orchestration.pp
2 files changed, 34 insertions(+), 20 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/01/311701/1

diff --git a/modules/salt/files/wmf_auto_reimage.py b/modules/salt/files/wmf_auto_reimage.py
index 460d935..57cc621 100644
--- a/modules/salt/files/wmf_auto_reimage.py
+++ b/modules/salt/files/wmf_auto_reimage.py
@@ -13,7 +13,7 @@
 import time
 
 from datetime import datetime
-from logging.handlers import RotatingFileHandler
+from logging.handlers import FileHandler
 
 import dns.resolver
 import salt.client
@@ -26,7 +26,7 @@
 INTERNAL_TLD = 'wmnet'
 MANAGEMENT_DOMAIN = 'mgmt'
 
-LOG_PATH = '/var/log/wmf_auto_reimage.log'
+LOG_PATTERN = '/var/log/wmf-auto-reimage/{start}_{user}_{pid}.log'
 # TODO: move it to a dedicated ops-orchestration-bot
 PHABRICATOR_CONFIG_FILE = '/etc/phabricator_ops-monitoring-bot.conf'
 
@@ -138,21 +138,24 @@
 
 
 def setup_logging(user):
-""" Setup the logger instance
+""" Setup the logger instance and return the log file path
 
 Arguments:
 user -- the real user to use in the logging formatter for auditing
 """
+log_path = LOG_PATTERN.format(start=datetime.now().strftime('%Y%m%d%H%M'),
+  user=user, pid=os.getpid())
 log_formatter = logging.Formatter(
 fmt=('%(asctime)s [%(levelname)s] ({user}) %(name)s::%(funcName)s: '
  '%(message)s').format(user=user),
 datefmt='%F %T')
-log_handler = RotatingFileHandler(
-LOG_PATH, maxBytes=5*(1024**2), backupCount=10)
+log_handler = FileHandler(log_path)
 log_handler.setFormatter(log_formatter)
 logger.addHandler(log_handler)
 logger.raiseExceptions = False
 logger.setLevel(logging.INFO)
+
+return log_path
 
 
 def get_mgmt(host):
@@ -430,7 +433,8 @@
 sleep = WATCHER_SHORT_SLEEP
 log_loops = 0
 while True:
-logger.debug('Watching for jobs...')
+logger.debug('Watching for jobs: {jobs}'.format(
+jobs=(running - completed)))
 
 log_loops += 1
 if log_loops == WATCHER_SLEEP_THRESHOLD:
@@ -478,11 +482,13 @@
 
 if log_loops == WATCHER_LOG_LOOPS and sleep == WATCHER_LONG_SLEEP:
 log_loops = 0
-logger.info('Job completion progress: {done}/{total}'.format(
-done=len(completed), total=len(running)))
+logger.info('Job done ({done}/{total}), waiting for {jobs}'.format(
+done=len(completed), total=len(running),
+jobs=(running - completed)))
 
 if timeout > 0 and (datetime.now() - start).total_seconds() > timeout:
-logger.warning('Timeout reached')
+logger.warning('Timeout reached for jobs: {jobs}'.format(
+jobs=(running - completed)))
 raise StopIteration()
 
 time.sleep(sleep)
@@ -618,7 +624,7 @@
 """
 success_hosts = []
 
-for result in run_command_on_hosts(hosts, 'wmfpuppet.run', timeout=1800):
+for result in run_command_on_hosts(hosts, 'wmfpuppet.run', timeout=300):
 if result['success'] and result['return']['retcode'] == 0:
 success_hosts.append(result['id'])
 
@@ -933,16 +939,14 @@
 return message
 
 
-def run(args, user):
+def run(args, user, log_path):
 """ Run the WMF auto reimage according to command line arguments
 
 Arguments:
-args -- parsed command line arguments
-user -- the user that launched the script, for auditing purposes
+args -- parsed command line arguments
+user -- the user that launched the script, for auditing purposes
+log_path -- the path of the logfile
 """
-print('START')
-print('To monitor the full log:\ntail -F {log}'.format(log=LOG_PATH))
-
 # Get additional informations
 ipmi_password = get_ipmi_password()
 custom_mgmts = get_custom_mgmts(args.hosts)
@@ -960,7 +964,7 @@
 phabricator_task_update(
 phab_client, args.phab_task_id, PHAB_COMMENT_PRE.format(
 user=user, hostname=socket.getfqdn(), hosts=hosts,
-log=LOG_PATH))
+log=log_path))
 
 # Set downtime on Icinga
 hosts = icinga_downtime(icinga_host, hosts, user, args.phab_task_id)
@@ -1016,7 +1020,6 @@
 logger.info(("Auto reimaging of hosts '{hosts}' completed, hosts "
  "'{successful}' were successful.").format