Mark Bergsma has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/371126 )

Change subject: Add pybal service (coordinator) metrics
......................................................................

Add pybal service (coordinator) metrics

Change-Id: I4a4e067a0bb56a77cb3123b14942e5e470702998
---
M pybal/coordinator.py
1 file changed, 75 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/debs/pybal refs/changes/26/371126/1

diff --git a/pybal/coordinator.py b/pybal/coordinator.py
index c828ca3..401c9a5 100755
--- a/pybal/coordinator.py
+++ b/pybal/coordinator.py
@@ -15,6 +15,7 @@
 from twisted.python import failure
 
 from pybal import config, util
+from pybal.metrics import Counter, Gauge
 
 log = util.log
 
@@ -280,6 +281,43 @@
 
     intvLoadServers = 60
 
+    # Keyword arguments passed to every metric constructor below.
+    metric_keywords = {
+        'labelnames': ('service', ),
+        'namespace': 'pybal',
+        'subsystem': 'service'
+    }
+
+    # Class-level metric objects; callers select a series via labels().
+    metrics = {
+        'servers': Gauge(
+            'servers', 'Amount of servers',
+            **metric_keywords),
+        'servers_enabled': Gauge(
+            'servers_enabled',
+            'Amount of enabled servers',
+            **metric_keywords),
+        'servers_up': Gauge(
+            'servers_up', 'Amount of up servers',
+            **metric_keywords),
+        'servers_pooled': Gauge(
+            'servers_pooled',
+            'Amount of pooled servers',
+            **metric_keywords),
+        'can_depool': Gauge(
+            'can_depool', 'Can depool more servers',
+            **metric_keywords),
+        # Assigned with set() and can decrease, so a Gauge, not a Counter.
+        'pooled_down_servers': Gauge(
+            'pooled_down_servers',
+            'Amount of down servers pooled because too many down',
+            **metric_keywords),
+        'could_not_depool_total': Counter(
+            'could_not_depool_total',
+            'Pybal could not depool a server because too many down',
+            **metric_keywords),
+    }
+
     def __init__(self, lvsservice, configUrl):
         """Constructor"""
 
@@ -291,6 +329,10 @@
         self.serverInitDeferredList = defer.Deferred()
         self.configObserver = config.ConfigurationObserver.fromUrl(self, configUrl)
         self.configObserver.startObserving()
+
+        self.metric_labels = {
+            'service': self.lvsservice.name
+        }
 
     def __str__(self):
         return "[%s]" % self.lvsservice.name
@@ -358,11 +400,14 @@
         if self.canDepool():
             self.lvsservice.removeServer(server)
             self.pooledDownServers.discard(server)
+            self.metrics['servers_pooled'].labels(**self.metric_labels).dec()
         else:
             self.pooledDownServers.add(server)
             msg = "Could not depool server " \
                   "{} because of too many down!".format(server.host)
             log.error(msg, system=self.lvsservice.name)
+            self.metrics['could_not_depool_total'].labels(**self.metric_labels).inc()
+        self._updatePooledDownMetrics()
 
     def repool(self, server):
         """
@@ -374,16 +419,19 @@
 
         if not server.pooled:
             self.lvsservice.addServer(server)
+            self.metrics['servers_pooled'].labels(**self.metric_labels).inc()
         else:
             msg = "Leaving previously pooled but down server {} pooled"
             log.info(msg.format(server.host), system=self.lvsservice.name)
 
         # If it had been pooled in down state before, remove it from the list
         self.pooledDownServers.discard(server)
+        self._updatePooledDownMetrics()
 
         # See if we can depool any servers that could not be depooled before
         while len(self.pooledDownServers) > 0 and self.canDepool():
             self.depool(self.pooledDownServers.pop())
+            self.metrics['servers_pooled'].labels(**self.metric_labels).dec()
 
     def canDepool(self):
         """Returns a boolean denoting whether another server can be depooled"""
@@ -438,6 +486,9 @@
         # Wait for all new servers to finish initializing
         self.serverInitDeferredList = defer.DeferredList(initList).addCallback(self._serverInitDone)
 
+        # Update metrics
+        self._updateServerMetrics()
+
     def _serverInitDone(self, result):
         """Called when all (new) servers have finished initializing"""
 
@@ -445,3 +496,27 @@
 
         # Assign the updated list of enabled servers to the LVSService instance
         self.assignServers()
+
+    def _updateServerMetrics(self):
+        """Update gauge metrics for servers on config change
+
+        Sets the servers, servers_enabled and servers_up gauges for this
+        service from the current contents of self.servers.
+        """
+        labels = self.metric_labels
+        self.metrics['servers'].labels(**labels).set(len(self.servers))
+        # Filter on the comprehension variable s; no name 'server' is
+        # bound anywhere in this method.
+        self.metrics['servers_enabled'].labels(**labels).set(
+            len([s for s in self.servers.itervalues() if s.enabled]))
+        self.metrics['servers_up'].labels(**labels).set(
+            len([s for s in self.servers.itervalues() if s.up]))
+
+    def _updatePooledDownMetrics(self):
+        """Refresh the pooled-but-down count and the can-depool flag"""
+        labels = self.metric_labels
+        pooledDown = self.metrics['pooled_down_servers'].labels(**labels)
+        pooledDown.set(len(self.pooledDownServers))
+        # canDepool() yields a boolean, which is stored as the metric value
+        depoolable = self.metrics['can_depool'].labels(**labels)
+        depoolable.set(self.canDepool())

-- 
To view, visit https://gerrit.wikimedia.org/r/371126
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I4a4e067a0bb56a77cb3123b14942e5e470702998
Gerrit-PatchSet: 1
Gerrit-Project: operations/debs/pybal
Gerrit-Branch: master
Gerrit-Owner: Mark Bergsma <m...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to