Elukey has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/396051 )
Change subject: Fix metric rendering
......................................................................
Fix metric rendering
Change-Id: I792dfc34f5fd0185c5ec379e861aa50d28e88869
---
M druid_exporter/collector.py
M test/test_collector.py
M tox.ini
3 files changed, 317 insertions(+), 57 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/software/druid_exporter
refs/changes/51/396051/1
diff --git a/druid_exporter/collector.py b/druid_exporter/collector.py
index fc4affd..f3a6ead 100644
--- a/druid_exporter/collector.py
+++ b/druid_exporter/collector.py
@@ -34,44 +34,62 @@
# List of supported metrics and their fields of the JSON dictionary
# sent by a Druid daemon. These fields will be added as labels
# when returning the available metrics in @collect.
+ # Due to the fact that metric names are not unique (like segment/used),
+ # it is necessary to split the data structure by daemon.
self.supported_metric_names = {
- # Broker, Historical
- 'query/time': ['dataSource'],
- 'query/bytes': ['dataSource'],
- 'query/cache/total/numEntries': None,
- 'query/cache/total/sizeBytes': None,
- 'query/cache/total/hits': None,
- 'query/cache/total/misses': None,
- 'query/cache/total/evictions': None,
- 'query/cache/total/timeouts': None,
- 'query/cache/total/errors': None,
- # Historical + Coordinator
- 'segment/count': ['dataSource'],
- # Historical
- 'segment/max': None,
- 'segment/used': ['tier', 'dataSource'],
- 'segment/scan/pending': None,
- # Coordinator
- 'segment/assigned/count': ['tier'],
- 'segment/moved/count': ['tier'],
- 'segment/dropped/count': ['tier'],
- 'segment/deleted/count': ['tier'],
- 'segment/unneeded/count': ['tier'],
- 'segment/overShadowed/count': None,
- 'segment/loadQueue/failed': ['server'],
- 'segment/loadQueue/count': ['server'],
- 'segment/dropQueue/count': ['server'],
- 'segment/size': ['dataSource'],
- 'segment/unavailable/count': ['dataSource'],
- 'segment/underReplicated/count': ['tier', 'dataSource'],
- 'ingest/events/thrownAway': ['dataSource'],
- 'ingest/events/unparseable': ['dataSource'],
- 'ingest/events/processed': ['dataSource'],
- 'ingest/rows/output': ['dataSource'],
- 'ingest/persists/count': ['dataSource'],
- 'ingest/persists/failed': ['dataSource'],
- 'ingest/handoff/failed': ['dataSource'],
- 'ingest/handoff/count': ['dataSource'],
+ 'broker': {
+ 'query/time': ['dataSource'],
+ 'query/bytes': ['dataSource'],
+ 'query/cache/total/numEntries': None,
+ 'query/cache/total/sizeBytes': None,
+ 'query/cache/total/hits': None,
+ 'query/cache/total/misses': None,
+ 'query/cache/total/evictions': None,
+ 'query/cache/total/timeouts': None,
+ 'query/cache/total/errors': None,
+ },
+ 'historical': {
+ 'query/time': ['dataSource'],
+ 'query/bytes': ['dataSource'],
+ 'query/cache/total/numEntries': None,
+ 'query/cache/total/sizeBytes': None,
+ 'query/cache/total/hits': None,
+ 'query/cache/total/misses': None,
+ 'query/cache/total/evictions': None,
+ 'query/cache/total/timeouts': None,
+ 'query/cache/total/errors': None,
+ 'segment/count': ['tier', 'dataSource'],
+ 'segment/max': None,
+ 'segment/used': ['tier', 'dataSource'],
+ 'segment/scan/pending': None,
+ },
+ 'coordinator': {
+ 'segment/count': ['dataSource'],
+ 'segment/assigned/count': ['tier'],
+ 'segment/moved/count': ['tier'],
+ 'segment/dropped/count': ['tier'],
+ 'segment/deleted/count': ['tier'],
+ 'segment/unneeded/count': ['tier'],
+ 'segment/overShadowed/count': None,
+ 'segment/loadQueue/failed': ['server'],
+ 'segment/loadQueue/count': ['server'],
+ 'segment/dropQueue/count': ['server'],
+ 'segment/size': ['dataSource'],
+ 'segment/unavailable/count': ['dataSource'],
+ 'segment/underReplicated/count': ['tier', 'dataSource'],
+ },
+ 'peon': {
+ 'query/time': ['dataSource'],
+ 'query/bytes': ['dataSource'],
+ 'ingest/events/thrownAway': ['dataSource'],
+ 'ingest/events/unparseable': ['dataSource'],
+ 'ingest/events/processed': ['dataSource'],
+ 'ingest/rows/output': ['dataSource'],
+ 'ingest/persists/count': ['dataSource'],
+ 'ingest/persists/failed': ['dataSource'],
+ 'ingest/handoff/failed': ['dataSource'],
+ 'ingest/handoff/count': ['dataSource'],
+ },
}
# Buckets used when storing histogram metrics.
@@ -167,7 +185,7 @@
'Number of times handoff failed.',
labels=['datasource']),
'ingest/handoff/count': GaugeMetricFamily(
- 'druid_realtime_ingest_handoff_count',
+ 'druid_realtime_ingest_handoff_failed_count',
'Number of times handoff has happened.',
labels=['datasource']),
}
@@ -217,11 +235,11 @@
'segment/count': GaugeMetricFamily(
'druid_historical_segment_count',
'Number of served segments.',
- labels=['datasource']),
+ labels=['tier', 'datasource']),
'segment/used': GaugeMetricFamily(
'druid_historical_segment_used_bytes',
'Bytes used for served segments.',
- labels=['datasource']),
+ labels=['tier', 'datasource']),
'segment/scan/pending': GaugeMetricFamily(
'druid_historical_segment_scan_pending',
'Number of segments in queue waiting to be scanned.'),
@@ -229,6 +247,10 @@
def _get_coordinator_counters(self):
return {
+ 'segment/count': GaugeMetricFamily(
+ 'druid_coordinator_segment_count',
+ 'Number of available segments.',
+ labels=['datasource']),
'segment/assigned/count': GaugeMetricFamily(
'druid_coordinator_segment_assigned_count',
'Number of segments assigned to be loaded in the cluster.',
@@ -265,12 +287,8 @@
'Number of segments to drop.',
labels=['server']),
'segment/size': GaugeMetricFamily(
- 'druid_coordinator_segment_size_bytes',
+ 'druid_coordinator_segment_size',
'Size in bytes of available segments.',
- labels=['datasource']),
- 'segment/count': GaugeMetricFamily(
- 'druid_coordinator_segment_count',
- 'Number of served segments.',
labels=['datasource']),
'segment/unavailable/count': GaugeMetricFamily(
'druid_coordinator_segment_unavailable_count',
@@ -304,7 +322,7 @@
metric_value = float(datapoint['value'])
metrics_storage = self.counters[metric_name]
- metric_labels = self.supported_metric_names[metric_name]
+ metric_labels = self.supported_metric_names[daemon_name][metric_name]
metrics_storage.setdefault(daemon_name, {})
@@ -381,7 +399,7 @@
for metric in cache_metrics:
if not self.counters[metric] or daemon not in
self.counters[metric]:
- if not self.supported_metric_names[metric]:
+ if not self.supported_metric_names[daemon][metric]:
cache_metrics[metric].add_metric([], float('nan'))
else:
continue
@@ -397,12 +415,12 @@
('peon', realtime_metrics)]:
for metric in metrics:
if not self.counters[metric] or daemon not in
self.counters[metric]:
- if not self.supported_metric_names[metric]:
+ if not self.supported_metric_names[daemon][metric]:
metrics[metric].add_metric([], float('nan'))
else:
continue
else:
- labels = self.supported_metric_names[metric]
+ labels = self.supported_metric_names[daemon][metric]
if not labels:
metrics[metric].add_metric(
[], self.counters[metric][daemon])
@@ -425,8 +443,10 @@
yield registered
def register_datapoint(self, datapoint):
+ daemon = DruidCollector.sanitize_field(str(datapoint['service']))
+ daemon = DruidCollector.sanitize_field(str(datapoint['service']))
if (datapoint['feed'] != 'metrics' or
- datapoint['metric'] not in self.supported_metric_names):
+ datapoint['metric'] not in
self.supported_metric_names[daemon]):
log.debug("The following datapoint is not supported, either "
"because the 'feed' field is not 'metrics' or "
"the metric itself is not supported: {}"
diff --git a/test/test_collector.py b/test/test_collector.py
index 8a60d66..118f76a 100644
--- a/test/test_collector.py
+++ b/test/test_collector.py
@@ -23,6 +23,199 @@
def setUp(self):
self.collector = DruidCollector()
+
+ # List of metric names as emitted by Druid, coupled with their
+ # Prometheus metric name and labels.
+ self.supported_metric_names = {
+ 'broker': {
+ 'query/time': {
+ 'metric_name': 'druid_broker_query_time_ms',
+ 'labels': ['dataSource']
+ },
+ 'query/bytes': {
+ 'metric_name': 'druid_broker_query_bytes',
+ 'labels': ['dataSource']
+ },
+ 'query/cache/total/numEntries': {
+ 'metric_name': 'druid_broker_query_cache_numentries_count',
+ 'labels': None
+ },
+ 'query/cache/total/sizeBytes': {
+ 'metric_name': 'druid_broker_query_cache_sizebytes_count',
+ 'labels': None
+ },
+ 'query/cache/total/hits': {
+ 'metric_name': 'druid_broker_query_cache_hits_count',
+ 'labels': None
+ },
+ 'query/cache/total/misses': {
+ 'metric_name':'druid_broker_query_cache_hits_count',
+ 'labels': None
+ },
+ 'query/cache/total/evictions': {
+ 'metric_name':'druid_broker_query_cache_evictions_count',
+ 'labels': None
+ },
+ 'query/cache/total/timeouts': {
+ 'metric_name':'druid_broker_query_cache_timeouts_count',
+ 'labels': None
+ },
+ 'query/cache/total/errors': {
+ 'metric_name':'druid_broker_query_cache_errors_count',
+ 'labels': None
+ }
+ },
+ 'historical': {
+ 'query/time': {
+ 'metric_name': 'druid_historical_query_time_ms',
+ 'labels': ['dataSource']
+ },
+ 'query/bytes': {
+ 'metric_name': 'druid_historical_query_bytes',
+ 'labels': ['dataSource']
+ },
+ 'query/cache/total/numEntries': {
+ 'metric_name':
'druid_historical_query_cache_numentries_count',
+ 'labels': None
+ },
+ 'query/cache/total/sizeBytes': {
+ 'metric_name':
'druid_historical_query_cache_sizebytes_count',
+ 'labels': None
+ },
+ 'query/cache/total/hits': {
+ 'metric_name': 'druid_historical_query_cache_hits_count',
+ 'labels': None
+ },
+ 'query/cache/total/misses': {
+ 'metric_name':'druid_historical_query_cache_hits_count',
+ 'labels': None
+ },
+ 'query/cache/total/evictions': {
+
'metric_name':'druid_historical_query_cache_evictions_count',
+ 'labels': None
+ },
+ 'query/cache/total/timeouts': {
+
'metric_name':'druid_historical_query_cache_timeouts_count',
+ 'labels': None
+ },
+ 'query/cache/total/errors': {
+ 'metric_name':'druid_historical_query_cache_errors_count',
+ 'labels': None
+ },
+ 'segment/count': {
+ 'metric_name': 'druid_historical_segment_count',
+ 'labels': ['tier', 'dataSource']
+ },
+ 'segment/max': {
+ 'metric_name': 'druid_historical_max_segment_bytes',
+ 'labels': None
+ },
+ 'segment/used': {
+ 'metric_name': 'druid_historical_segment_used_bytes',
+ 'labels': ['tier', 'dataSource']
+ },
+ 'segment/scan/pending': {
+ 'metric_name': 'druid_historical_segment_scan_pending',
+ 'labels': None
+ }
+ },
+ 'coordinator': {
+ 'segment/assigned/count': {
+ 'metric_name': 'druid_coordinator_segment_assigned_count',
+ 'labels': ['tier'],
+ },
+ 'segment/moved/count': {
+ 'metric_name': 'druid_coordinator_segment_moved_count',
+ 'labels': ['tier']
+ },
+ 'segment/dropped/count': {
+ 'metric_name': 'druid_coordinator_segment_dropped_count',
+ 'labels': ['tier']
+ },
+ 'segment/deleted/count': {
+ 'metric_name': 'druid_coordinator_segment_deleted_count',
+ 'labels': ['tier']
+ },
+ 'segment/unneeded/count': {
+ 'metric_name': 'druid_coordinator_segment_unneeded_count',
+ 'labels': ['tier']
+ },
+ 'segment/overShadowed/count': {
+ 'metric_name':
'druid_coordinator_segment_overshadowed_count',
+ 'labels': None
+ },
+ 'segment/loadQueue/failed': {
+ 'metric_name':
'druid_coordinator_segment_loadqueue_failed_count',
+ 'labels': ['server']
+ },
+ 'segment/loadQueue/count': {
+ 'metric_name': 'druid_coordinator_segment_loadqueue_count',
+ 'labels': ['server']
+ },
+ 'segment/dropQueue/count': {
+ 'metric_name': 'druid_coordinator_segment_dropqueue_count',
+ 'labels': ['server']
+ },
+ 'segment/size': {
+ 'metric_name': 'druid_coordinator_segment_size',
+ 'labels': ['dataSource']
+ },
+ 'segment/count': {
+ 'metric_name': 'druid_coordinator_segment_count',
+ 'labels': ['dataSource']
+ },
+ 'segment/unavailable/count': {
+ 'metric_name':
'druid_coordinator_segment_unavailable_count',
+ 'labels': ['dataSource']
+ },
+ 'segment/underReplicated/count': {
+ 'metric_name':
'druid_coordinator_segment_under_replicated_count',
+ 'labels': ['tier', 'dataSource']
+ }
+ },
+ 'peon': {
+ 'query/time': {
+ 'metric_name': 'druid_peon_query_time_ms',
+ 'labels': ['dataSource']
+ },
+ 'query/bytes': {
+ 'metric_name': 'druid_peon_query_bytes',
+ 'labels': ['dataSource']
+ },
+ 'ingest/events/thrownAway': {
+ 'metric_name':
'druid_realtime_ingest_events_thrown_away_count',
+ 'labels': ['dataSource'],
+ },
+ 'ingest/events/unparseable': {
+ 'metric_name':
'druid_realtime_ingest_events_unparseable_count',
+ 'labels': ['dataSource'],
+ },
+ 'ingest/events/processed': {
+ 'metric_name':
'druid_realtime_ingest_events_processed_count',
+ 'labels': ['dataSource'],
+ },
+ 'ingest/rows/output': {
+ 'metric_name': 'druid_realtime_ingest_rows_output_count',
+ 'labels': ['dataSource'],
+ },
+ 'ingest/persists/count': {
+ 'metric_name': 'druid_realtime_ingest_persists_count',
+ 'labels': ['dataSource'],
+ },
+ 'ingest/persists/failed': {
+ 'metric_name':
'druid_realtime_ingest_persists_failed_count',
+ 'labels': ['dataSource'],
+ },
+ 'ingest/handoff/failed': {
+ 'metric_name':
'druid_realtime_ingest_handoff_failed_count',
+ 'labels': ['dataSource'],
+ },
+ 'ingest/handoff/count': {
+ 'metric_name':
'druid_realtime_ingest_handoff_failed_count',
+ 'labels': ['dataSource'],
+ }
+ }
+ }
self.metrics_without_labels = [
'druid_historical_segment_scan_pending',
'druid_historical_max_segment_bytes',
@@ -122,16 +315,16 @@
# Third datapoint for the same metric as used in the first test, should
# add a key to the already existent dictionary.
datapoint = {'feed': 'metrics', 'service': 'druid/historical',
'dataSource': 'test2',
- 'metric': 'segment/used', 'tier': '_default_tier',
'value': 543}
+ 'metric': 'segment/count', 'tier': '_default_tier',
'value': 543}
self.collector.register_datapoint(datapoint)
-
expected_result['segment/used']['historical']['_default_tier']['test2'] = 543.0
+ expected_result['segment/count'] = {'historical': {'_default_tier':
{'test2': 543.0}}}
self.assertEqual(self.collector.counters, expected_result)
- # Fourth datapoint for an already seen metric but differen broker
- datapoint = {'feed': 'metrics', 'service': 'druid/broker',
'dataSource': 'test',
- 'metric': 'segment/used', 'tier': '_default_tier',
'value': 111}
+ # Fourth datapoint for an already seen metric but different daemon
+ datapoint = {'feed': 'metrics', 'service': 'druid/coordinator',
'dataSource': 'test',
+ 'metric': 'segment/count', 'value': 111}
self.collector.register_datapoint(datapoint)
- expected_result['segment/used']['broker'] = {'_default_tier': {'test':
111.0}}
+ expected_result['segment/count']['coordinator'] = {'test': 111.0}
self.assertEqual(self.collector.counters, expected_result)
# Fifth datapoint should override a pre-existent value
@@ -303,6 +496,12 @@
"service": "druid/coordinator", "host": "druid1001.eqiad.wmnet:
8081",
"metric": "segment/count", "value": 56, "dataSource": "netflow"},
+ {"feed": "metrics", "timestamp": "2017-12-07T09:55:04.937Z",
+ "service": "druid/historical", "host":
"druid1001.eqiad.wmnet:8083",
+ "metric": "segment/used", "value": 3252671142,
+ "dataSource": "banner_activity_minutely",
+ "priority": "0", "tier": "_default_tier"},
+
{"feed": "metrics", "timestamp": "2017-11-14T13:08:20.820Z",
"service": "druid/historical", "host":
"druid1001.eqiad.wmnet:8083",
"metric": "segment/max", "value": 2748779069440},
@@ -440,13 +639,20 @@
for datapoint in datapoints:
self.collector.register_datapoint(datapoint)
+ # Running it twice should not produce more metrics
+ for datapoint in datapoints:
+ self.collector.register_datapoint(datapoint)
+
collected_metrics = 0
+ prometheus_metric_samples = []
for metric in self.collector.collect():
# Metrics should not be returned if no sample is associated
# (not even a 'nan')
self.assertNotEqual(metric.samples, [])
if metric.samples and metric.samples[0][0].startswith('druid_'):
+ print(metric.samples)
collected_metrics += 1
+ prometheus_metric_samples.append(metric.samples)
# Number of metrics pushed using register_datapoint plus the ones
# generated by the exporter for bookeeping,
@@ -454,6 +660,40 @@
expected_druid_metrics_len = len(datapoints) + 1
self.assertEqual(collected_metrics, expected_druid_metrics_len)
+ for datapoint in datapoints:
+ metric = datapoint['metric']
+ daemon = datapoint['service'].split('/')[1]
+ prometheus_metric_name =
self.supported_metric_names[daemon][metric]['metric_name']
+ prometheus_metric_labels =
self.supported_metric_names[daemon][metric]['labels']
+
+ # The prometheus metric samples are in two forms:
+ # 1) histograms:
+ # [('druid_broker_query_time_ms_bucket', {'datasource':
'NavigationTiming', 'le': '10'}, 2),
+ # [...]
+ # ('druid_broker_query_time_ms_bucket', {'datasource':
'NavigationTiming', 'le': 'inf'}, 2),
+ # ('druid_broker_query_time_ms_count', {'datasource':
'NavigationTiming'}, 2),
+ # ('druid_broker_query_time_ms_sum', {'datasource':
'NavigationTiming'}, 20.0)]
+ #
+ # 2) counter/gauge
+ # [('druid_coordinator_segment_unneeded_count', {'tier':
'_default_tier'}, 0.0)]
+ #
+ # The idea of the following test is to make sure that after sending
+ # one data point for each metric, the sample contains the
information
+ # needed.
+ #
+ if "query" not in metric:
+ for sample in prometheus_metric_samples:
+ if prometheus_metric_name == sample[0][0]:
+ if prometheus_metric_labels:
+ for label in prometheus_metric_labels:
+ self.assertTrue(label.lower() in sample[0][1])
+ else:
+ self.assertTrue(sample[0][1] == {})
+ break
+ else:
+ # TODO
+ pass
+
def test_register_datapoints_count(self):
datapoints = [
diff --git a/tox.ini b/tox.ini
index 8b8b169..bf33a45 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,5 +4,5 @@
[testenv]
deps=nose
commands=
- nosetests \
+ nosetests --logging-level=INFO \
[] # substitute with tox' positional arguments
--
To view, visit https://gerrit.wikimedia.org/r/396051
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I792dfc34f5fd0185c5ec379e861aa50d28e88869
Gerrit-PatchSet: 1
Gerrit-Project: operations/software/druid_exporter
Gerrit-Branch: master
Gerrit-Owner: Elukey <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits