This is an automated email from the ASF dual-hosted git repository.
chengpan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/celeborn.git
The following commit(s) were added to refs/heads/main by this push:
new e89ebe1e8 [CELEBORN-1977] Add help/type on prometheus exposed metrics
e89ebe1e8 is described below
commit e89ebe1e8f0d05c1a5a33bb796dea9542a7c3e3d
Author: Nicolas Fraison <[email protected]>
AuthorDate: Mon Apr 28 16:03:36 2025 +0800
[CELEBORN-1977] Add help/type on prometheus exposed metrics
### What changes were proposed in this pull request?
Add `# HELP` and `# TYPE` annotations to the metrics exposed in Prometheus format:
```
# HELP metrics_UpdateResourceConsumptionTime_Count
# TYPE metrics_UpdateResourceConsumptionTime_Count counter
metrics_UpdateResourceConsumptionTime_Count{instance="192.168.192.143:9098",role="master"} 1 1745390288743
```
### Why are the changes needed?
The Datadog agent relies on this type annotation to discover the type of the
exposed Prometheus metrics:
https://docs.datadoghq.com/integrations/openmetrics/#missing-untyped-metrics
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Started one Celeborn master instance and one worker instance with the
following metrics.properties config:
```
*.sink.prometheusServlet.class=org.apache.celeborn.common.metrics.sink.PrometheusServlet
*.sink.jsonServlet.class=org.apache.celeborn.common.metrics.sink.JsonServlet
```
Then connected to the Prometheus metrics endpoints of the master and the worker.
All the metrics now have the `# HELP` and `# TYPE` annotations.
Closes #3223 from ashangit/nfraison/prometheus.
Authored-by: Nicolas Fraison <[email protected]>
Signed-off-by: Cheng Pan <[email protected]>
---
.../common/metrics/source/AbstractSource.scala | 221 +++++++++++++++++----
1 file changed, 178 insertions(+), 43 deletions(-)
diff --git
a/common/src/main/scala/org/apache/celeborn/common/metrics/source/AbstractSource.scala
b/common/src/main/scala/org/apache/celeborn/common/metrics/source/AbstractSource.scala
index d9aa630d1..705344381 100644
---
a/common/src/main/scala/org/apache/celeborn/common/metrics/source/AbstractSource.scala
+++
b/common/src/main/scala/org/apache/celeborn/common/metrics/source/AbstractSource.scala
@@ -403,33 +403,80 @@ abstract class AbstractSource(conf: CelebornConf, role:
String)
metricsCleaner.scheduleWithFixedDelay(cleanTask, 10, 10, TimeUnit.MINUTES)
}
+ private def addMetricsWithPrometheusHelpType(
+ metricName: String,
+ label: String,
+ value: Any,
+ timestamp: Long,
+ promType: String): String = {
+ val sb = new StringBuilder
+ sb.append(s"# HELP ${metricName}\n")
+ sb.append(s"# TYPE ${metricName} ${promType}\n")
+ sb.append(s"${metricName}$label ${value} $timestamp\n")
+ sb.toString()
+ }
+
def getCounterMetrics(nc: NamedCounter): String = {
val timestamp = System.currentTimeMillis
+ val sb = new StringBuilder
val label = nc.labelString
- val str = s"${normalizeKey(nc.name)}Count$label ${nc.counter.getCount}
$timestamp\n"
- str
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${normalizeKey(nc.name)}Count",
+ label,
+ nc.counter.getCount,
+ timestamp,
+ "counter"))
+ sb.toString()
}
def getGaugeMetrics(ng: NamedGauge[_]): String = {
val timestamp = System.currentTimeMillis
val sb = new StringBuilder
val label = ng.labelString
- sb.append(s"${normalizeKey(ng.name)}Value$label ${ng.gauge.getValue}
$timestamp\n")
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${normalizeKey(ng.name)}Value",
+ label,
+ ng.gauge.getValue,
+ timestamp,
+ "gauge"))
sb.toString()
}
def getMeterMetrics(nm: NamedMeter): String = {
val timestamp = System.currentTimeMillis
val sb = new StringBuilder
+ val prefix = normalizeKey(nm.name)
val label = nm.labelString
- sb.append(s"${normalizeKey(nm.name)}Count$label ${nm.meter.getCount}
$timestamp\n")
- sb.append(s"${normalizeKey(nm.name)}MeanRate$label ${nm.meter.getMeanRate}
$timestamp\n")
- sb.append(
- s"${normalizeKey(nm.name)}OneMinuteRate$label
${nm.meter.getOneMinuteRate} $timestamp\n")
- sb.append(
- s"${normalizeKey(nm.name)}FiveMinuteRate$label
${nm.meter.getFiveMinuteRate} $timestamp\n")
- sb.append(
- s"${normalizeKey(nm.name)}FifteenMinuteRate$label
${nm.meter.getFifteenMinuteRate} $timestamp\n")
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}Count",
+ label,
+ nm.meter.getCount,
+ timestamp,
+ "counter"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}MeanRate",
+ label,
+ nm.meter.getMeanRate,
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}OneMinuteRate",
+ label,
+ nm.meter.getOneMinuteRate,
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}FiveMinuteRate",
+ label,
+ nm.meter.getFiveMinuteRate,
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}FifteenMinuteRate",
+ label,
+ nm.meter.getFifteenMinuteRate,
+ timestamp,
+ "gauge"))
sb.toString()
}
@@ -439,22 +486,66 @@ abstract class AbstractSource(conf: CelebornConf, role:
String)
val snapshot = nh.histogram.getSnapshot
val prefix = normalizeKey(nh.name)
val label = nh.labelString
- sb.append(s"${prefix}Count$label ${nh.histogram.getCount} $timestamp\n")
- sb.append(s"${prefix}Max$label ${(snapshot.getMax)} $timestamp\n")
- sb.append(s"${prefix}Mean$label ${(snapshot.getMean)} $timestamp\n")
- sb.append(s"${prefix}Min$label ${(snapshot.getMin)} $timestamp\n")
- sb.append(s"${prefix}50thPercentile$label" +
- s" ${snapshot.getMedian} $timestamp\n")
- sb.append(s"${prefix}75thPercentile$label" +
- s" ${snapshot.get75thPercentile} $timestamp\n")
- sb.append(s"${prefix}95thPercentile$label" +
- s" ${snapshot.get95thPercentile} $timestamp\n")
- sb.append(s"${prefix}98thPercentile$label" +
- s" ${snapshot.get98thPercentile} $timestamp\n")
- sb.append(s"${prefix}99thPercentile$label" +
- s" ${snapshot.get99thPercentile} $timestamp\n")
- sb.append(s"${prefix}999thPercentile$label" +
- s" ${snapshot.get999thPercentile} $timestamp\n")
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}Count",
+ label,
+ nh.histogram.getCount,
+ timestamp,
+ "counter"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}Max",
+ label,
+ snapshot.getMax,
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}Mean",
+ label,
+ snapshot.getMean,
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}Min",
+ label,
+ snapshot.getMin,
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}50thPercentile",
+ label,
+ snapshot.getMedian,
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}75thPercentile",
+ label,
+ snapshot.get75thPercentile,
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}95thPercentile",
+ label,
+ snapshot.get95thPercentile,
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}98thPercentile",
+ label,
+ snapshot.get98thPercentile,
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}99thPercentile",
+ label,
+ snapshot.get99thPercentile,
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}999thPercentile",
+ label,
+ snapshot.get999thPercentile,
+ timestamp,
+ "gauge"))
sb.toString()
}
@@ -464,22 +555,66 @@ abstract class AbstractSource(conf: CelebornConf, role:
String)
val snapshot = nt.timer.getSnapshot
val prefix = normalizeKey(nt.name)
val label = nt.labelString
- sb.append(s"${prefix}Count$label ${nt.timer.getCount} $timestamp\n")
- sb.append(s"${prefix}Max$label ${reportNanosAsMills(snapshot.getMax)}
$timestamp\n")
- sb.append(s"${prefix}Mean$label ${reportNanosAsMills(snapshot.getMean)}
$timestamp\n")
- sb.append(s"${prefix}Min$label ${reportNanosAsMills(snapshot.getMin)}
$timestamp\n")
- sb.append(s"${prefix}50thPercentile$label" +
- s" ${reportNanosAsMills(snapshot.getMedian)} $timestamp\n")
- sb.append(s"${prefix}75thPercentile$label" +
- s" ${reportNanosAsMills(snapshot.get75thPercentile)} $timestamp\n")
- sb.append(s"${prefix}95thPercentile$label" +
- s" ${reportNanosAsMills(snapshot.get95thPercentile)} $timestamp\n")
- sb.append(s"${prefix}98thPercentile$label" +
- s" ${reportNanosAsMills(snapshot.get98thPercentile)} $timestamp\n")
- sb.append(s"${prefix}99thPercentile$label" +
- s" ${reportNanosAsMills(snapshot.get99thPercentile)} $timestamp\n")
- sb.append(s"${prefix}999thPercentile$label" +
- s" ${reportNanosAsMills(snapshot.get999thPercentile)} $timestamp\n")
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}Count",
+ label,
+ nt.timer.getCount,
+ timestamp,
+ "counter"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}Max",
+ label,
+ reportNanosAsMills(snapshot.getMax),
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}Mean",
+ label,
+ reportNanosAsMills(snapshot.getMean),
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}Min",
+ label,
+ reportNanosAsMills(snapshot.getMin),
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}50thPercentile",
+ label,
+ reportNanosAsMills(snapshot.getMedian),
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}75thPercentile",
+ label,
+ reportNanosAsMills(snapshot.get75thPercentile),
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}95thPercentile",
+ label,
+ reportNanosAsMills(snapshot.get95thPercentile),
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}98thPercentile",
+ label,
+ reportNanosAsMills(snapshot.get98thPercentile),
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}99thPercentile",
+ label,
+ reportNanosAsMills(snapshot.get99thPercentile),
+ timestamp,
+ "gauge"))
+ sb.append(addMetricsWithPrometheusHelpType(
+ s"${prefix}999thPercentile",
+ label,
+ reportNanosAsMills(snapshot.get999thPercentile),
+ timestamp,
+ "gauge"))
sb.toString()
}