[MediaWiki-commits] [Gerrit] operations...pybal[master]: Add monitoring specific metric to RunCommand
Ema has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/371105 ) Change subject: Add monitoring specific metric to RunCommand .. Add monitoring specific metric to RunCommand pybal_monitor_runcommand_run_duration_seconds is a gauge that represents the run latency, differentiated by the 'result' label (successful/failed). Bug: T171710 Change-Id: I2aeea030955c1b2b1abe872e6ed8f5b334c09069 --- M pybal/monitors/runcommand.py 1 file changed, 30 insertions(+), 0 deletions(-) Approvals: Ema: Verified; Looks good to me, approved Mark Bergsma: Looks good to me, but someone else must approve diff --git a/pybal/monitors/runcommand.py b/pybal/monitors/runcommand.py index a0ca0ec..b10afce 100644 --- a/pybal/monitors/runcommand.py +++ b/pybal/monitors/runcommand.py @@ -7,11 +7,13 @@ from pybal import monitor from pybal.util import log +from pybal.metrics import Gauge import os, sys, signal, errno import logging from twisted.internet import reactor, process, error +from twisted.python.runtime import seconds class ProcessGroupProcess(process.Process, object): """ @@ -89,6 +91,20 @@ TIMEOUT_RUN = 20 +metric_labelnames = ('service', 'host', 'monitor') +metric_keywords = { +'namespace': 'pybal', +'subsystem': 'monitor_' + __name__.lower() +} + +runcommand_metrics = { +'run_duration_seconds': Gauge( +'run_duration_seconds', +'Command duration', +labelnames=metric_labelnames + ('result', 'exitcode'), +**metric_keywords) +} + def __init__(self, coordinator, server, configuration={}): """Constructor""" @@ -131,6 +147,7 @@ def runCommand(self): """Periodically called method that does a single uptime check.""" +self.checkStartTime = seconds() self.runningProcess = self._spawnProcess(self, self.command, [self.command] + self.arguments, sessionLeader=True, timeout=(self.timeout or None)) @@ -157,10 +174,23 @@ Called when the process has ended """ +duration = seconds() - self.checkStartTime if reason.check(error.ProcessDone): self._resultUp() +result = 'successful' 
+exitcode = 0 elif reason.check(error.ProcessTerminated): self._resultDown(reason.getErrorMessage()) +result = 'failed' +exitcode = reason.value.exitCode +else: +result = None +exitcode = None + +self.runcommand_metrics['run_duration_seconds'].labels( +result=result, exitcode=exitcode, +**self.metric_labels +).set(duration) # Schedule the next check if self.active: -- To view, visit https://gerrit.wikimedia.org/r/371105 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I2aeea030955c1b2b1abe872e6ed8f5b334c09069 Gerrit-PatchSet: 2 Gerrit-Project: operations/debs/pybal Gerrit-Branch: master Gerrit-Owner: Mark Bergsma Gerrit-Reviewer: Ema Gerrit-Reviewer: Giuseppe Lavagetto Gerrit-Reviewer: Mark Bergsma Gerrit-Reviewer: Volans ___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] operations...pybal[master]: Add monitoring specific metric to RunCommand
Mark Bergsma has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/371105 ) Change subject: Add monitoring specific metric to RunCommand .. Add monitoring specific metric to RunCommand pybal_monitor_runcommand_run_duration_seconds is a gauge that represents the run latency, differentiated by the 'result' label (successful/failed). Bug: T171710 Change-Id: I2aeea030955c1b2b1abe872e6ed8f5b334c09069 --- M pybal/monitors/runcommand.py 1 file changed, 30 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/debs/pybal refs/changes/05/371105/1 diff --git a/pybal/monitors/runcommand.py b/pybal/monitors/runcommand.py index a0ca0ec..b10afce 100644 --- a/pybal/monitors/runcommand.py +++ b/pybal/monitors/runcommand.py @@ -7,11 +7,13 @@ from pybal import monitor from pybal.util import log +from pybal.metrics import Gauge import os, sys, signal, errno import logging from twisted.internet import reactor, process, error +from twisted.python.runtime import seconds class ProcessGroupProcess(process.Process, object): """ @@ -89,6 +91,20 @@ TIMEOUT_RUN = 20 +metric_labelnames = ('service', 'host', 'monitor') +metric_keywords = { +'namespace': 'pybal', +'subsystem': 'monitor_' + __name__.lower() +} + +runcommand_metrics = { +'run_duration_seconds': Gauge( +'run_duration_seconds', +'Command duration', +labelnames=metric_labelnames + ('result', 'exitcode'), +**metric_keywords) +} + def __init__(self, coordinator, server, configuration={}): """Constructor""" @@ -131,6 +147,7 @@ def runCommand(self): """Periodically called method that does a single uptime check.""" +self.checkStartTime = seconds() self.runningProcess = self._spawnProcess(self, self.command, [self.command] + self.arguments, sessionLeader=True, timeout=(self.timeout or None)) @@ -157,10 +174,23 @@ Called when the process has ended """ +duration = seconds() - self.checkStartTime if reason.check(error.ProcessDone): self._resultUp() +result = 'successful' +exitcode = 0 elif 
reason.check(error.ProcessTerminated): self._resultDown(reason.getErrorMessage()) +result = 'failed' +exitcode = reason.value.exitCode +else: +result = None +exitcode = None + +self.runcommand_metrics['run_duration_seconds'].labels( +result=result, exitcode=exitcode, +**self.metric_labels +).set(duration) # Schedule the next check if self.active: -- To view, visit https://gerrit.wikimedia.org/r/371105 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I2aeea030955c1b2b1abe872e6ed8f5b334c09069 Gerrit-PatchSet: 1 Gerrit-Project: operations/debs/pybal Gerrit-Branch: master Gerrit-Owner: Mark Bergsma ___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits