This is an automated email from the ASF dual-hosted git repository.

zhouky pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-celeborn.git


The following commit(s) were added to refs/heads/main by this push:
     new b4dfc09dc [CELEBORN-1007] Improve JVM metrics naming and add 
ThreadStates metrics
b4dfc09dc is described below

commit b4dfc09dcf528f61da22bea01e66dc3c450ec0b6
Author: onebox-li <[email protected]>
AuthorDate: Thu Sep 28 10:18:37 2023 +0800

    [CELEBORN-1007] Improve JVM metrics naming and add ThreadStates metrics
    
    ### What changes were proposed in this pull request?
    Since we use codahale metrics to expose JVM metrics, names without a 
prefix are unclear, and it's not easy to build a Grafana template for these 
metrics because the collector name or pool name is embedded in the metric name rather than in labels.
    
    So this PR adds JVM metric prefixes, removes the pool info from the name, and 
exposes the pool name as a label where needed.
    It also adds ThreadStates metrics.
    
    ### Why are the changes needed?
    Make JVM metrics easier to understand and easier to build Grafana templates for.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes, JVM metric naming is changed, and thread-state metrics are additionally exposed.
    
    Examples of the naming changes:
    For GarbageCollectorMetricSet, G1-Old-Generation.time -> 
jvm.gc.time{name="G1-Old-Generation"}
    For MemoryUsageGaugeSet, total.init -> jvm.memory.total.init ; 
pools.Metaspace.usage -> jvm.memory.pools.usage{name="Metaspace"}
    For BufferPoolMetricSet, direct.count -> jvm.direct.count
    For ThreadStatesGaugeSet, add jvm.thread.count.
    
    For G1, the jvm metrics exposed now:
    metrics_jvm_gc_time_Value{name="G1-Old-Generation",role="Worker"} 0 
1695731141588
    metrics_jvm_gc_count_Value{name="G1-Young-Generation",role="Worker"} 2 
1695731141588
    metrics_jvm_gc_time_Value{name="G1-Young-Generation",role="Worker"} 74 
1695731141588
    metrics_jvm_gc_count_Value{name="G1-Old-Generation",role="Worker"} 0 
1695731141588
    
    metrics_jvm_heap_committed_Value{role="Worker"} 2109734912 1695731141588
    metrics_jvm_non_heap_used_Value{role="Worker"} 47700056 1695731141588
    metrics_jvm_heap_used_Value{role="Worker"} 82801184 1695731141588
    metrics_jvm_total_committed_Value{role="Worker"} 2160263168 1695731141588
    metrics_jvm_total_init_Value{role="Worker"} 2112290816 1695731141588
    metrics_jvm_non_heap_max_Value{role="Worker"} -1 1695731141588
    metrics_jvm_heap_usage_Value{role="Worker"} 0.009639326483011246 
1695731141588
    metrics_jvm_total_used_Value{role="Worker"} 130502480 1695731141589
    metrics_jvm_heap_init_Value{role="Worker"} 2109734912 1695731141589
    metrics_jvm_non_heap_committed_Value{role="Worker"} 50528256 1695731141589
    metrics_jvm_non_heap_init_Value{role="Worker"} 2555904 1695731141589
    metrics_jvm_non_heap_usage_Value{role="Worker"} -4.7701296E7 1695731141589
    metrics_jvm_heap_max_Value{role="Worker"} 8589934592 1695731141589
    metrics_jvm_total_max_Value{role="Worker"} 8589934591 1695731141589
    metrics_jvm_memory_pool_used_Value{name="Code-Cache",role="Worker"} 
10314368 1695731141588
    metrics_jvm_memory_pool_committed_Value{name="Code-Cache",role="Worker"} 
10944512 1695731141588
    metrics_jvm_memory_pool_init_Value{name="G1-Eden-Space",role="Worker"} 
111149056 1695731141588
    metrics_jvm_memory_pool_max_Value{name="G1-Old-Gen",role="Worker"} 
8589934592 1695731141588
    
metrics_jvm_memory_pool_used_after_gc_Value{name="G1-Survivor-Space",role="Worker"}
 14680064 1695731141588
    
metrics_jvm_memory_pool_used_Value{name="Compressed-Class-Space",role="Worker"} 
4440192 1695731141588
    metrics_jvm_memory_pool_usage_Value{name="Metaspace",role="Worker"} 
0.9449504192610433 1695731141588
    metrics_jvm_memory_pool_max_Value{name="Metaspace",role="Worker"} -1 
1695731141588
    metrics_jvm_memory_pool_init_Value{name="G1-Survivor-Space",role="Worker"} 
0 1695731141588
    metrics_jvm_memory_pool_committed_Value{name="G1-Old-Gen",role="Worker"} 
1998585856 1695731141588
    
metrics_jvm_memory_pool_committed_Value{name="G1-Survivor-Space",role="Worker"} 
14680064 1695731141588
    metrics_jvm_memory_pool_committed_Value{name="G1-Eden-Space",role="Worker"} 
96468992 1695731141588
    metrics_jvm_memory_pool_max_Value{name="G1-Survivor-Space",role="Worker"} 
-1 1695731141588
    
metrics_jvm_memory_pool_usage_Value{name="Compressed-Class-Space",role="Worker"}
 0.004135251045227051 1695731141588
    metrics_jvm_memory_pool_usage_Value{name="G1-Survivor-Space",role="Worker"} 
1.0 1695731141588
    metrics_jvm_memory_pool_max_Value{name="Code-Cache",role="Worker"} 
251658240 1695731141588
    
metrics_jvm_memory_pool_init_Value{name="Compressed-Class-Space",role="Worker"} 
0 1695731141589
    metrics_jvm_memory_pool_usage_Value{name="G1-Eden-Space",role="Worker"} 
0.34782608695652173 1695731141589
    metrics_jvm_memory_pool_init_Value{name="Metaspace",role="Worker"} 0 
1695731141589
    metrics_jvm_memory_pool_max_Value{name="G1-Eden-Space",role="Worker"} -1 
1695731141589
    metrics_jvm_memory_pool_usage_Value{name="Code-Cache",role="Worker"} 
0.04098917643229167 1695731141589
    
metrics_jvm_memory_pool_used_after_gc_Value{name="G1-Eden-Space",role="Worker"} 
0 1695731141589
    metrics_jvm_memory_pool_init_Value{name="Code-Cache",role="Worker"} 2555904 
1695731141589
    metrics_jvm_memory_pool_used_Value{name="G1-Survivor-Space",role="Worker"} 
14680064 1695731141589
    
metrics_jvm_memory_pool_committed_Value{name="Compressed-Class-Space",role="Worker"}
 4718592 1695731141589
    metrics_jvm_memory_pool_used_Value{name="G1-Eden-Space",role="Worker"} 
33554432 1695731141589
    metrics_jvm_memory_pool_used_Value{name="G1-Old-Gen",role="Worker"} 
34566688 1695731141589
    metrics_jvm_memory_pool_usage_Value{name="G1-Old-Gen",role="Worker"} 
0.004024092108011246 1695731141589
    
metrics_jvm_memory_pool_used_after_gc_Value{name="G1-Old-Gen",role="Worker"} 0 
1695731141589
    metrics_jvm_memory_pool_committed_Value{name="Metaspace",role="Worker"} 
34865152 1695731141589
    metrics_jvm_memory_pool_init_Value{name="G1-Old-Gen",role="Worker"} 
1998585856 1695731141589
    metrics_jvm_memory_pool_used_Value{name="Metaspace",role="Worker"} 32945840 
1695731141589
    
metrics_jvm_memory_pool_max_Value{name="Compressed-Class-Space",role="Worker"} 
1073741824 1695731141589
    
    metrics_jvm_direct_count_Value{role="Worker"} 8 1695731141589
    metrics_jvm_direct_capacity_Value{role="Worker"} 1036 1695731141589
    metrics_jvm_direct_used_Value{role="Worker"} 1037 1695731141589
    metrics_jvm_mapped_used_Value{role="Worker"} 0 1695731141589
    metrics_jvm_mapped_capacity_Value{role="Worker"} 0 1695731141589
    metrics_jvm_mapped_count_Value{role="Worker"} 0 1695731141589
    
    metrics_jvm_thread_timed_waiting_count_Value{role="Worker"} 23 1695731141589
    metrics_jvm_thread_deadlock_count_Value{role="Worker"} 0 1695731141589
    metrics_jvm_thread_count_Value{role="Worker"} 78 1695731141589
    metrics_jvm_thread_waiting_count_Value{role="Worker"} 45 1695731141589
    metrics_jvm_thread_daemon_count_Value{role="Worker"} 75 1695731141589
    metrics_jvm_thread_new_count_Value{role="Worker"} 0 1695731141589
    metrics_jvm_thread_blocked_count_Value{role="Worker"} 0 1695731141590
    metrics_jvm_thread_deadlocks_Value{role="Worker"} [] 1695731141590
    metrics_jvm_thread_runnable_count_Value{role="Worker"} 10 1695731141590
    metrics_jvm_thread_terminated_count_Value{role="Worker"} 0 1695731141590
    
    ### How was this patch tested?
    Unit tests and cluster tests with G1, PS-Scavenge/PS-MarkSweep, and ParNew/CMS.
    
    Closes #1939 from onebox-li/improve-jvm-metrics.
    
    Authored-by: onebox-li <[email protected]>
    Signed-off-by: zky.zhoukeyong <[email protected]>
---
 .../celeborn/common/metrics/source/JVMSource.scala | 82 +++++++++++++++++++---
 .../common/metrics/source/JVMSourceSuite.scala     | 55 +++++++++++++++
 2 files changed, 126 insertions(+), 11 deletions(-)

diff --git 
a/common/src/main/scala/org/apache/celeborn/common/metrics/source/JVMSource.scala
 
b/common/src/main/scala/org/apache/celeborn/common/metrics/source/JVMSource.scala
index 374024dd2..a87924868 100644
--- 
a/common/src/main/scala/org/apache/celeborn/common/metrics/source/JVMSource.scala
+++ 
b/common/src/main/scala/org/apache/celeborn/common/metrics/source/JVMSource.scala
@@ -20,26 +20,86 @@ package org.apache.celeborn.common.metrics.source
 import java.lang.management.ManagementFactory
 
 import scala.collection.JavaConverters._
+import scala.collection.mutable
 
-import com.codahale.metrics.Gauge
-import com.codahale.metrics.jvm.{BufferPoolMetricSet, 
GarbageCollectorMetricSet, MemoryUsageGaugeSet}
+import com.codahale.metrics.{Gauge, MetricRegistry}
+import com.codahale.metrics.jvm.{BufferPoolMetricSet, 
GarbageCollectorMetricSet, MemoryUsageGaugeSet, ThreadStatesGaugeSet}
 
 import org.apache.celeborn.common.CelebornConf
 
 class JVMSource(conf: CelebornConf, role: String) extends AbstractSource(conf, 
role) {
   override val sourceName = "JVM"
 
-  // all of metrics of GCMetricSet and BufferPoolMetricSet are Gauge
-  Seq(
-    new GarbageCollectorMetricSet(),
-    new MemoryUsageGaugeSet(),
-    new BufferPoolMetricSet(ManagementFactory.getPlatformMBeanServer))
-    .map { x =>
-      x.getMetrics.asScala.map {
-        case (name: String, metric: Gauge[_]) => addGauge(name, metric)
-        case (name, metric) => new IllegalArgumentException(s"Unknown metric 
type: $name: $metric")
+  import JVMSource._
+
+  private val gcNames = 
ManagementFactory.getGarbageCollectorMXBeans.asScala.map(bean =>
+    WHITESPACE.matcher(bean.getName).replaceAll("-"))
+  private val poolNames = 
ManagementFactory.getMemoryPoolMXBeans.asScala.map(bean =>
+    WHITESPACE.matcher(bean.getName).replaceAll("-"))
+
+  /**
+   * Add jvm metric prefix, remove pool info from name and obtain the pool 
name as labels if needed
+   * @param metricName metric name from MetricSet
+   * @param targets keywords need to be replaced
+   * @param prefix prefix for new metric name
+   * @param replacement replacement for pool name
+   * @return new metric without target, labels if exists
+   */
+  def handleJVMMetricName(
+      metricName: String,
+      targets: mutable.Buffer[String],
+      prefix: String,
+      replacement: String): (String, Map[String, String]) = {
+    for (target <- targets) {
+      if (metricName.contains(target)) {
+        val labels = Map("name" -> target)
+        var replaceTarget = target
+        if (replacement.isEmpty) {
+          replaceTarget = target + "."
+        }
+        return (MetricRegistry.name(prefix, metricName.replace(replaceTarget, 
replacement)), labels)
       }
     }
+    (MetricRegistry.name(prefix, metricName), Map.empty[String, String])
+  }
+
+  // all metrics in MetricSet are gauges
+  Seq(new GarbageCollectorMetricSet()).map(_.getMetrics.asScala.map {
+    case (name: String, metric: Gauge[_]) =>
+      val newMetrics = handleJVMMetricName(name, gcNames, JVM_METRIC_PREFIX, 
"gc")
+      addGauge(newMetrics._1, newMetrics._2, metric)
+    case (name, metric) => new IllegalArgumentException(s"Unknown metric type: 
$name: $metric")
+  })
+
+  Seq(new MemoryUsageGaugeSet()).map(_.getMetrics.asScala.map {
+    case (name: String, metric: Gauge[_]) =>
+      val newMetrics = handleJVMMetricName(name, poolNames, 
JVM_METRIC_MEMORY_PREFIX, "")
+      addGauge(newMetrics._1, newMetrics._2, metric)
+    case (name, metric) => new IllegalArgumentException(s"Unknown metric type: 
$name: $metric")
+  })
+
+  Seq(
+    new BufferPoolMetricSet(ManagementFactory.getPlatformMBeanServer)).map(
+    _.getMetrics.asScala.map {
+      case (name: String, metric: Gauge[_]) =>
+        addGauge(MetricRegistry.name(JVM_METRIC_PREFIX, name), metric)
+      case (name, metric) => new IllegalArgumentException(s"Unknown metric 
type: $name: $metric")
+    })
+
+  Seq(new ThreadStatesGaugeSet()).map(_.getMetrics.asScala.map {
+    case (name: String, metric: Gauge[_]) =>
+      addGauge(MetricRegistry.name(JVM_METRIC_THREAD_PREFIX, name), metric)
+    case (name, metric) => new IllegalArgumentException(s"Unknown metric type: 
$name: $metric")
+  })
+
   // start cleaner
   startCleaner()
 }
+
+object JVMSource {
+  private val JVM_METRIC_PREFIX = "jvm"
+  private val JVM_METRIC_MEMORY_PREFIX = JVM_METRIC_PREFIX + ".memory"
+  private val JVM_METRIC_THREAD_PREFIX = JVM_METRIC_PREFIX + ".thread"
+
+  private val WHITESPACE = "\\s+".r.pattern
+}
diff --git 
a/common/src/test/scala/org/apache/celeborn/common/metrics/source/JVMSourceSuite.scala
 
b/common/src/test/scala/org/apache/celeborn/common/metrics/source/JVMSourceSuite.scala
new file mode 100644
index 000000000..6f4f8cac7
--- /dev/null
+++ 
b/common/src/test/scala/org/apache/celeborn/common/metrics/source/JVMSourceSuite.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.celeborn.common.metrics.source
+
+import org.apache.celeborn.CelebornFunSuite
+import org.apache.celeborn.common.CelebornConf
+
+class JVMSourceSuite extends CelebornFunSuite {
+
+  val gcNames = Seq("G1-Young-Generation", "G1-Old-Generation").toBuffer
+  val poolNames = Seq("G1-Eden-Space", "G1-Survivor-Space", 
"G1-Old-Gen").toBuffer
+
+  val JVM_METRIC_PREFIX = "jvm"
+  val JVM_MEMORY_PREFIX = "jvm.memory"
+
+  test("Test handleJVMMetricName") {
+
+    val jvmSource = new JVMSource(new CelebornConf(), "test")
+
+    val gcMetric1 = "G1-Old-Generation.time"
+    val gcMetric2 = "G1-Young-Generation.count"
+    val gcResult1 = jvmSource.handleJVMMetricName(gcMetric1, gcNames, 
JVM_METRIC_PREFIX, "gc")
+    val gcResult2 = jvmSource.handleJVMMetricName(gcMetric2, gcNames, 
JVM_METRIC_PREFIX, "gc")
+    assert(gcResult1._1 == "jvm.gc.time")
+    assert(gcResult1._2 == Map("name" -> "G1-Old-Generation"))
+    assert(gcResult2._1 == "jvm.gc.count")
+    assert(gcResult2._2 == Map("name" -> "G1-Young-Generation"))
+
+    val memoryMetric1 = "total.init"
+    val memoryMetrics = "pools.G1-Eden-Space.init"
+    val memoryResult1 =
+      jvmSource.handleJVMMetricName(memoryMetric1, poolNames, 
JVM_MEMORY_PREFIX, "")
+    val memoryResult2 =
+      jvmSource.handleJVMMetricName(memoryMetrics, poolNames, 
JVM_MEMORY_PREFIX, "")
+    assert(memoryResult1._1 == "jvm.memory.total.init")
+    assert(memoryResult1._2 == Map.empty[String, String])
+    assert(memoryResult2._1 == "jvm.memory.pools.init")
+    assert(memoryResult2._2 == Map("name" -> "G1-Eden-Space"))
+  }
+}

Reply via email to