This is an automated email from the ASF dual-hosted git repository.

chengpan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/celeborn.git


The following commit(s) were added to refs/heads/main by this push:
     new e89ebe1e8 [CELEBORN-1977] Add help/type on prometheus exposed metrics
e89ebe1e8 is described below

commit e89ebe1e8f0d05c1a5a33bb796dea9542a7c3e3d
Author: Nicolas Fraison <[email protected]>
AuthorDate: Mon Apr 28 16:03:36 2025 +0800

    [CELEBORN-1977] Add help/type on prometheus exposed metrics
    
    ### What changes were proposed in this pull request?
    
    Add HELP/TYPE annotations to the metrics exposed to Prometheus:
    ```
    # HELP metrics_UpdateResourceConsumptionTime_Count
    # TYPE metrics_UpdateResourceConsumptionTime_Count counter
    
    metrics_UpdateResourceConsumptionTime_Count{instance="192.168.192.143:9098",role="master"} 1 1745390288743
    ```
    
    ### Why are the changes needed?
    
    The Datadog agent relies on the TYPE annotation to determine the type of
    each exposed Prometheus metric:
https://docs.datadoghq.com/integrations/openmetrics/#missing-untyped-metrics
    
    ### Does this PR introduce _any_ user-facing change?
    
    No
    
    ### How was this patch tested?
    
    Started one Celeborn master instance and one worker instance with the
    metrics.properties config below:
    ```
    
    *.sink.prometheusServlet.class=org.apache.celeborn.common.metrics.sink.PrometheusServlet
    *.sink.jsonServlet.class=org.apache.celeborn.common.metrics.sink.JsonServlet
    ```
    
    Then connected to the Prometheus metrics endpoints of the master and the worker.
    All the metrics now carry the HELP/TYPE annotations.
    
    Closes #3223 from ashangit/nfraison/prometheus.
    
    Authored-by: Nicolas Fraison <[email protected]>
    Signed-off-by: Cheng Pan <[email protected]>
---
 .../common/metrics/source/AbstractSource.scala     | 221 +++++++++++++++++----
 1 file changed, 178 insertions(+), 43 deletions(-)

diff --git a/common/src/main/scala/org/apache/celeborn/common/metrics/source/AbstractSource.scala b/common/src/main/scala/org/apache/celeborn/common/metrics/source/AbstractSource.scala
index d9aa630d1..705344381 100644
--- a/common/src/main/scala/org/apache/celeborn/common/metrics/source/AbstractSource.scala
+++ b/common/src/main/scala/org/apache/celeborn/common/metrics/source/AbstractSource.scala
@@ -403,33 +403,80 @@ abstract class AbstractSource(conf: CelebornConf, role: String)
     metricsCleaner.scheduleWithFixedDelay(cleanTask, 10, 10, TimeUnit.MINUTES)
   }
 
+  private def addMetricsWithPrometheusHelpType(
+      metricName: String,
+      label: String,
+      value: Any,
+      timestamp: Long,
+      promType: String): String = {
+    val sb = new StringBuilder
+    sb.append(s"# HELP ${metricName}\n")
+    sb.append(s"# TYPE ${metricName} ${promType}\n")
+    sb.append(s"${metricName}$label ${value} $timestamp\n")
+    sb.toString()
+  }
+
   def getCounterMetrics(nc: NamedCounter): String = {
     val timestamp = System.currentTimeMillis
+    val sb = new StringBuilder
     val label = nc.labelString
-    val str = s"${normalizeKey(nc.name)}Count$label ${nc.counter.getCount} $timestamp\n"
-    str
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${normalizeKey(nc.name)}Count",
+      label,
+      nc.counter.getCount,
+      timestamp,
+      "counter"))
+    sb.toString()
   }
 
   def getGaugeMetrics(ng: NamedGauge[_]): String = {
     val timestamp = System.currentTimeMillis
     val sb = new StringBuilder
     val label = ng.labelString
-    sb.append(s"${normalizeKey(ng.name)}Value$label ${ng.gauge.getValue} $timestamp\n")
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${normalizeKey(ng.name)}Value",
+      label,
+      ng.gauge.getValue,
+      timestamp,
+      "gauge"))
     sb.toString()
   }
 
   def getMeterMetrics(nm: NamedMeter): String = {
     val timestamp = System.currentTimeMillis
     val sb = new StringBuilder
+    val prefix = normalizeKey(nm.name)
     val label = nm.labelString
-    sb.append(s"${normalizeKey(nm.name)}Count$label ${nm.meter.getCount} $timestamp\n")
-    sb.append(s"${normalizeKey(nm.name)}MeanRate$label ${nm.meter.getMeanRate} $timestamp\n")
-    sb.append(
-      s"${normalizeKey(nm.name)}OneMinuteRate$label ${nm.meter.getOneMinuteRate} $timestamp\n")
-    sb.append(
-      s"${normalizeKey(nm.name)}FiveMinuteRate$label ${nm.meter.getFiveMinuteRate} $timestamp\n")
-    sb.append(
-      s"${normalizeKey(nm.name)}FifteenMinuteRate$label ${nm.meter.getFifteenMinuteRate} $timestamp\n")
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}Count",
+      label,
+      nm.meter.getCount,
+      timestamp,
+      "counter"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}MeanRate",
+      label,
+      nm.meter.getMeanRate,
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}OneMinuteRate",
+      label,
+      nm.meter.getOneMinuteRate,
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}FiveMinuteRate",
+      label,
+      nm.meter.getFiveMinuteRate,
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}FifteenMinuteRate",
+      label,
+      nm.meter.getFifteenMinuteRate,
+      timestamp,
+      "gauge"))
     sb.toString()
   }
 
@@ -439,22 +486,66 @@ abstract class AbstractSource(conf: CelebornConf, role: String)
     val snapshot = nh.histogram.getSnapshot
     val prefix = normalizeKey(nh.name)
     val label = nh.labelString
-    sb.append(s"${prefix}Count$label ${nh.histogram.getCount} $timestamp\n")
-    sb.append(s"${prefix}Max$label ${(snapshot.getMax)} $timestamp\n")
-    sb.append(s"${prefix}Mean$label ${(snapshot.getMean)} $timestamp\n")
-    sb.append(s"${prefix}Min$label ${(snapshot.getMin)} $timestamp\n")
-    sb.append(s"${prefix}50thPercentile$label" +
-      s" ${snapshot.getMedian} $timestamp\n")
-    sb.append(s"${prefix}75thPercentile$label" +
-      s" ${snapshot.get75thPercentile} $timestamp\n")
-    sb.append(s"${prefix}95thPercentile$label" +
-      s" ${snapshot.get95thPercentile} $timestamp\n")
-    sb.append(s"${prefix}98thPercentile$label" +
-      s" ${snapshot.get98thPercentile} $timestamp\n")
-    sb.append(s"${prefix}99thPercentile$label" +
-      s" ${snapshot.get99thPercentile} $timestamp\n")
-    sb.append(s"${prefix}999thPercentile$label" +
-      s" ${snapshot.get999thPercentile} $timestamp\n")
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}Count",
+      label,
+      nh.histogram.getCount,
+      timestamp,
+      "counter"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}Max",
+      label,
+      snapshot.getMax,
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}Mean",
+      label,
+      snapshot.getMean,
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}Min",
+      label,
+      snapshot.getMin,
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}50thPercentile",
+      label,
+      snapshot.getMedian,
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}75thPercentile",
+      label,
+      snapshot.get75thPercentile,
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}95thPercentile",
+      label,
+      snapshot.get95thPercentile,
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}98thPercentile",
+      label,
+      snapshot.get98thPercentile,
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}99thPercentile",
+      label,
+      snapshot.get99thPercentile,
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}999thPercentile",
+      label,
+      snapshot.get999thPercentile,
+      timestamp,
+      "gauge"))
     sb.toString()
   }
 
@@ -464,22 +555,66 @@ abstract class AbstractSource(conf: CelebornConf, role: String)
     val snapshot = nt.timer.getSnapshot
     val prefix = normalizeKey(nt.name)
     val label = nt.labelString
-    sb.append(s"${prefix}Count$label ${nt.timer.getCount} $timestamp\n")
-    sb.append(s"${prefix}Max$label ${reportNanosAsMills(snapshot.getMax)} $timestamp\n")
-    sb.append(s"${prefix}Mean$label ${reportNanosAsMills(snapshot.getMean)} $timestamp\n")
-    sb.append(s"${prefix}Min$label ${reportNanosAsMills(snapshot.getMin)} $timestamp\n")
-    sb.append(s"${prefix}50thPercentile$label" +
-      s" ${reportNanosAsMills(snapshot.getMedian)} $timestamp\n")
-    sb.append(s"${prefix}75thPercentile$label" +
-      s" ${reportNanosAsMills(snapshot.get75thPercentile)} $timestamp\n")
-    sb.append(s"${prefix}95thPercentile$label" +
-      s" ${reportNanosAsMills(snapshot.get95thPercentile)} $timestamp\n")
-    sb.append(s"${prefix}98thPercentile$label" +
-      s" ${reportNanosAsMills(snapshot.get98thPercentile)} $timestamp\n")
-    sb.append(s"${prefix}99thPercentile$label" +
-      s" ${reportNanosAsMills(snapshot.get99thPercentile)} $timestamp\n")
-    sb.append(s"${prefix}999thPercentile$label" +
-      s" ${reportNanosAsMills(snapshot.get999thPercentile)} $timestamp\n")
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}Count",
+      label,
+      nt.timer.getCount,
+      timestamp,
+      "counter"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}Max",
+      label,
+      reportNanosAsMills(snapshot.getMax),
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}Mean",
+      label,
+      reportNanosAsMills(snapshot.getMean),
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}Min",
+      label,
+      reportNanosAsMills(snapshot.getMin),
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}50thPercentile",
+      label,
+      reportNanosAsMills(snapshot.getMedian),
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}75thPercentile",
+      label,
+      reportNanosAsMills(snapshot.get75thPercentile),
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}95thPercentile",
+      label,
+      reportNanosAsMills(snapshot.get95thPercentile),
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}98thPercentile",
+      label,
+      reportNanosAsMills(snapshot.get98thPercentile),
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}99thPercentile",
+      label,
+      reportNanosAsMills(snapshot.get99thPercentile),
+      timestamp,
+      "gauge"))
+    sb.append(addMetricsWithPrometheusHelpType(
+      s"${prefix}999thPercentile",
+      label,
+      reportNanosAsMills(snapshot.get999thPercentile),
+      timestamp,
+      "gauge"))
     sb.toString()
   }
 

Reply via email to