Mark Bergsma has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/371126 )
Change subject: Add pybal service (coordinator) metrics
......................................................................
Add pybal service (coordinator) metrics
Change-Id: I4a4e067a0bb56a77cb3123b14942e5e470702998
---
M pybal/coordinator.py
1 file changed, 75 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/debs/pybal refs/changes/26/371126/1
diff --git a/pybal/coordinator.py b/pybal/coordinator.py
index c828ca3..401c9a5 100755
--- a/pybal/coordinator.py
+++ b/pybal/coordinator.py
@@ -15,6 +15,7 @@
from twisted.python import failure
from pybal import config, util
+from pybal.metrics import Counter, Gauge
log = util.log
@@ -280,6 +281,43 @@
intvLoadServers = 60
+ metric_keywords = {
+ 'labelnames': ('service', ),
+ 'namespace': 'pybal',
+ 'subsystem': 'service'
+ }
+ # Every metric below carries a single 'service' label, bound via .labels(service=...)
+ metrics = {
+ 'servers': Gauge(
+ 'servers',
+ 'Amount of servers',
+ **metric_keywords),
+ 'servers_enabled': Gauge(
+ 'servers_enabled',
+ 'Amount of enabled servers',
+ **metric_keywords),
+ 'servers_up': Gauge(
+ 'servers_up',
+ 'Amount of up servers',
+ **metric_keywords),
+ 'servers_pooled': Gauge(
+ 'servers_pooled',
+ 'Amount of pooled servers',
+ **metric_keywords),
+ 'can_depool': Gauge(
+ 'can_depool',
+ 'Can depool more servers',
+ **metric_keywords),
+ 'pooled_down_servers': Gauge(  # updated with set() to an absolute count, so not a Counter
+ 'pooled_down_servers',
+ 'Amount of down servers pooled because too many down',
+ **metric_keywords),
+ 'could_not_depool_total': Counter(
+ 'could_not_depool_total',
+ 'Pybal could not depool a server because too many down',
+ **metric_keywords),
+ }
+
def __init__(self, lvsservice, configUrl):
"""Constructor"""
@@ -291,6 +329,10 @@
self.serverInitDeferredList = defer.Deferred()
self.configObserver = config.ConfigurationObserver.fromUrl(self,
configUrl)
self.configObserver.startObserving()
+
+ self.metric_labels = {
+ 'service': self.lvsservice.name
+ }
def __str__(self):
return "[%s]" % self.lvsservice.name
@@ -358,11 +400,14 @@
if self.canDepool():
self.lvsservice.removeServer(server)
self.pooledDownServers.discard(server)
+ self.metrics['servers_pooled'].labels(**self.metric_labels).dec()
else:
self.pooledDownServers.add(server)
msg = "Could not depool server " \
"{} because of too many down!".format(server.host)
log.error(msg, system=self.lvsservice.name)
+ self.metrics['could_not_depool_total'].labels(**self.metric_labels).inc()
+ self._updatePooledDownMetrics()
def repool(self, server):
"""
@@ -374,16 +419,19 @@
if not server.pooled:
self.lvsservice.addServer(server)
+ self.metrics['servers_pooled'].labels(**self.metric_labels).inc()
else:
msg = "Leaving previously pooled but down server {} pooled"
log.info(msg.format(server.host), system=self.lvsservice.name)
# If it had been pooled in down state before, remove it from the list
self.pooledDownServers.discard(server)
+ self._updatePooledDownMetrics()
# See if we can depool any servers that could not be depooled before
while len(self.pooledDownServers) > 0 and self.canDepool():
self.depool(self.pooledDownServers.pop())
+ self.metrics['servers_pooled'].labels(**self.metric_labels).dec()
def canDepool(self):
"""Returns a boolean denoting whether another server can be depooled"""
@@ -438,6 +486,9 @@
# Wait for all new servers to finish initializing
self.serverInitDeferredList = defer.DeferredList(initList).addCallback(self._serverInitDone)
+ # Update metrics
+ self._updateServerMetrics()
+
def _serverInitDone(self, result):
"""Called when all (new) servers have finished initializing"""
@@ -445,3 +496,27 @@
# Assign the updated list of enabled servers to the LVSService instance
self.assignServers()
+
+ def _updateServerMetrics(self):
+ """Update the servers, servers_enabled and servers_up gauges on config change"""
+ self.metrics['servers'].labels(
+ **self.metric_labels
+ ).set(
+ len(self.servers))
+ self.metrics['servers_enabled'].labels(
+ **self.metric_labels
+ ).set(
+ len([s for s in self.servers.itervalues() if s.enabled]))
+ self.metrics['servers_up'].labels(
+ **self.metric_labels
+ ).set(
+ len([s for s in self.servers.itervalues() if s.up]))
+
+ def _updatePooledDownMetrics(self):
+ """Refresh the pooled_down_servers and can_depool metrics"""
+ # Look the label set up once and reuse it for both metrics
+ labels = self.metric_labels
+ pooledDownMetric = self.metrics['pooled_down_servers'].labels(**labels)
+ pooledDownMetric.set(len(self.pooledDownServers))
+ canDepoolMetric = self.metrics['can_depool'].labels(**labels)
+ canDepoolMetric.set(self.canDepool())
--
To view, visit https://gerrit.wikimedia.org/r/371126
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I4a4e067a0bb56a77cb3123b14942e5e470702998
Gerrit-PatchSet: 1
Gerrit-Project: operations/debs/pybal
Gerrit-Branch: master
Gerrit-Owner: Mark Bergsma <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits