[MediaWiki-commits] [Gerrit] operations...pybal[master]: Add monitoring specific metric to RunCommand

2017-08-10 Thread Ema (Code Review)
Ema has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/371105 )

Change subject: Add monitoring specific metric to RunCommand
..


Add monitoring specific metric to RunCommand

pybal_monitor_runcommand_run_duration_seconds is a gauge that records how
long each monitoring command took to run, in seconds. Samples are
differentiated by the 'result' label (successful/failed) and by the
'exitcode' label.

Bug: T171710
Change-Id: I2aeea030955c1b2b1abe872e6ed8f5b334c09069
---
M pybal/monitors/runcommand.py
1 file changed, 30 insertions(+), 0 deletions(-)

Approvals:
  Ema: Verified; Looks good to me, approved
  Mark Bergsma: Looks good to me, but someone else must approve



diff --git a/pybal/monitors/runcommand.py b/pybal/monitors/runcommand.py
index a0ca0ec..b10afce 100644
--- a/pybal/monitors/runcommand.py
+++ b/pybal/monitors/runcommand.py
@@ -7,11 +7,13 @@
 
 from pybal import monitor
 from pybal.util import log
+from pybal.metrics import Gauge
 
 import os, sys, signal, errno
 import logging
 
 from twisted.internet import reactor, process, error
+from twisted.python.runtime import seconds
 
 class ProcessGroupProcess(process.Process, object):
 """
@@ -89,6 +91,20 @@
 
 TIMEOUT_RUN = 20
 
+metric_labelnames = ('service', 'host', 'monitor')
+metric_keywords = {
+'namespace': 'pybal',
+'subsystem': 'monitor_' + __name__.lower()
+}
+
+runcommand_metrics = {
+'run_duration_seconds': Gauge(
+'run_duration_seconds',
+'Command duration',
+labelnames=metric_labelnames + ('result', 'exitcode'),
+**metric_keywords)
+}
+
 def __init__(self, coordinator, server, configuration={}):
 """Constructor"""
 
@@ -131,6 +147,7 @@
 def runCommand(self):
 """Periodically called method that does a single uptime check."""
 
+self.checkStartTime = seconds()
 self.runningProcess = self._spawnProcess(self, self.command, 
[self.command] + self.arguments,
  sessionLeader=True, 
timeout=(self.timeout or None))
 
@@ -157,10 +174,23 @@
 Called when the process has ended
 """
 
+duration = seconds() - self.checkStartTime
 if reason.check(error.ProcessDone):
 self._resultUp()
+result = 'successful'
+exitcode = 0
 elif reason.check(error.ProcessTerminated):
 self._resultDown(reason.getErrorMessage())
+result = 'failed'
+exitcode = reason.value.exitCode
+else:
+result = None
+exitcode = None
+
+self.runcommand_metrics['run_duration_seconds'].labels(
+result=result, exitcode=exitcode,
+**self.metric_labels
+).set(duration)
 
 # Schedule the next check
 if self.active:

-- 
To view, visit https://gerrit.wikimedia.org/r/371105
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I2aeea030955c1b2b1abe872e6ed8f5b334c09069
Gerrit-PatchSet: 2
Gerrit-Project: operations/debs/pybal
Gerrit-Branch: master
Gerrit-Owner: Mark Bergsma 
Gerrit-Reviewer: Ema 
Gerrit-Reviewer: Giuseppe Lavagetto 
Gerrit-Reviewer: Mark Bergsma 
Gerrit-Reviewer: Volans 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] operations...pybal[master]: Add monitoring specific metric to RunCommand

2017-08-10 Thread Mark Bergsma (Code Review)
Mark Bergsma has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/371105 )

Change subject: Add monitoring specific metric to RunCommand
..

Add monitoring specific metric to RunCommand

pybal_monitor_runcommand_run_duration_seconds is a gauge that records how
long each monitoring command took to run, in seconds. Samples are
differentiated by the 'result' label (successful/failed) and by the
'exitcode' label.

Bug: T171710
Change-Id: I2aeea030955c1b2b1abe872e6ed8f5b334c09069
---
M pybal/monitors/runcommand.py
1 file changed, 30 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/debs/pybal refs/changes/05/371105/1

diff --git a/pybal/monitors/runcommand.py b/pybal/monitors/runcommand.py
index a0ca0ec..b10afce 100644
--- a/pybal/monitors/runcommand.py
+++ b/pybal/monitors/runcommand.py
@@ -7,11 +7,13 @@
 
 from pybal import monitor
 from pybal.util import log
+from pybal.metrics import Gauge
 
 import os, sys, signal, errno
 import logging
 
 from twisted.internet import reactor, process, error
+from twisted.python.runtime import seconds
 
 class ProcessGroupProcess(process.Process, object):
 """
@@ -89,6 +91,20 @@
 
 TIMEOUT_RUN = 20
 
+metric_labelnames = ('service', 'host', 'monitor')
+metric_keywords = {
+'namespace': 'pybal',
+'subsystem': 'monitor_' + __name__.lower()
+}
+
+runcommand_metrics = {
+'run_duration_seconds': Gauge(
+'run_duration_seconds',
+'Command duration',
+labelnames=metric_labelnames + ('result', 'exitcode'),
+**metric_keywords)
+}
+
 def __init__(self, coordinator, server, configuration={}):
 """Constructor"""
 
@@ -131,6 +147,7 @@
 def runCommand(self):
 """Periodically called method that does a single uptime check."""
 
+self.checkStartTime = seconds()
 self.runningProcess = self._spawnProcess(self, self.command, 
[self.command] + self.arguments,
  sessionLeader=True, 
timeout=(self.timeout or None))
 
@@ -157,10 +174,23 @@
 Called when the process has ended
 """
 
+duration = seconds() - self.checkStartTime
 if reason.check(error.ProcessDone):
 self._resultUp()
+result = 'successful'
+exitcode = 0
 elif reason.check(error.ProcessTerminated):
 self._resultDown(reason.getErrorMessage())
+result = 'failed'
+exitcode = reason.value.exitCode
+else:
+result = None
+exitcode = None
+
+self.runcommand_metrics['run_duration_seconds'].labels(
+result=result, exitcode=exitcode,
+**self.metric_labels
+).set(duration)
 
 # Schedule the next check
 if self.active:

-- 
To view, visit https://gerrit.wikimedia.org/r/371105
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I2aeea030955c1b2b1abe872e6ed8f5b334c09069
Gerrit-PatchSet: 1
Gerrit-Project: operations/debs/pybal
Gerrit-Branch: master
Gerrit-Owner: Mark Bergsma 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits