Yuvipanda has uploaded a new change for review.
https://gerrit.wikimedia.org/r/202342
Change subject: Handle webservice calls erroring out
......................................................................
Handle webservice calls erroring out
- Log errors and also push them to graphite
- Add a 15s timeout to each webservice call
Change-Id: Ie53664655def94ebd924d56dec1e00040be7ecd1
---
M tools/manifest/servicemonitor.py
1 file changed, 19 insertions(+), 12 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/software/tools-manifest refs/changes/42/202342/1
diff --git a/tools/manifest/servicemonitor.py b/tools/manifest/servicemonitor.py
index 6b65571..121bb17 100644
--- a/tools/manifest/servicemonitor.py
+++ b/tools/manifest/servicemonitor.py
@@ -11,14 +11,23 @@
def _start_webservice(self, manifest):
self.log.info('Starting webservice for tool %s', manifest.tool.name)
- return subprocess.check_output([
- '/usr/bin/sudo',
- '-i', '-u', manifest.tool.username,
- '/usr/local/bin/webservice',
- '--release', manifest.webservice_release,
- manifest.webservice_server,
- 'start',
- ])
+ try:
+ subprocess.check_output([
+ '/usr/bin/sudo',
+ '-i', '-u', manifest.tool.username,
+ '/usr/local/bin/webservice',
+ '--release', manifest.webservice_release,
+ manifest.webservice_server,
+ 'start',
+ ], timeout=15) # 15 second timeout!
+ self.log.info('Started webservice for %s', manifest.tool.name)
+ return True
+ except subprocess.CalledProcessError:
+ self.log.exception('Could not start webservice for tool %s',
manifest.tool.name)
+ self.stats.incr('webservice_startfailed')
+ except subprocess.TimeoutExpired:
+ self.log.exception('Could not start webservice in time for tool
%s', manifest.tool.name)
+ self.stats.incr('webservice_startfailed')
def run(self):
qstat_xml = ET.fromstring(subprocess.check_output(['/usr/bin/qstat',
'-u', '*', '-xml']))
@@ -28,11 +37,9 @@
continue
job = qstat_xml.find('.//job_list[JB_name="%s"]' %
self._webjob_name(manifest))
if job is None or 'r' not in job.findtext('.//state'):
- self._start_webservice(manifest)
manifest.tool.log('No running webservice job found, starting
it')
- self.log.info('Started webservice for %s', manifest.tool.name)
- self.stats.incr('webservice.%s.restarted' % manifest.tool.name)
- restarts_count += 1
+ if self._start_webservice(manifest):
+ restarts_count += 1
self.log.info('Service monitor run completed, %s webservices
restarted', restarts_count)
self.stats.incr('webservices_restarted', restarts_count)
--
To view, visit https://gerrit.wikimedia.org/r/202342
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie53664655def94ebd924d56dec1e00040be7ecd1
Gerrit-PatchSet: 1
Gerrit-Project: operations/software/tools-manifest
Gerrit-Branch: master
Gerrit-Owner: Yuvipanda <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits