This is an automated email from the ASF dual-hosted git repository.
zhouky pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-celeborn.git
The following commit(s) were added to refs/heads/main by this push:
new b4dfc09dc [CELEBORN-1007] Improve JVM metrics naming and add
ThreadStates metrics
b4dfc09dc is described below
commit b4dfc09dcf528f61da22bea01e66dc3c450ec0b6
Author: onebox-li <[email protected]>
AuthorDate: Thu Sep 28 10:18:37 2023 +0800
[CELEBORN-1007] Improve JVM metrics naming and add ThreadStates metrics
### What changes were proposed in this pull request?
Since we use Codahale metrics to expose JVM metrics, metric names without a
prefix are unclear, and it's not easy to build a Grafana template for these
metrics because the collector name or pool name is embedded in the metric
name rather than exposed as a label.
So this change adds JVM metric prefixes, removes the pool info from the name,
and exposes the pool name as a label where needed.
It also adds ThreadStates metrics.
### Why are the changes needed?
Make JVM metrics easier to understand and easier to build dashboard templates for.
### Does this PR introduce _any_ user-facing change?
Yes. JVM metric names are changed, and thread state metrics are now exposed as well.
Examples of the changes:
For GarbageCollectorMetricSet, G1-Old-Generation.time ->
jvm.gc.time{name="G1-Old-Generation"}
For MemoryUsageGaugeSet, total.init -> jvm.memory.total.init ;
pools.Metaspace.usage -> jvm.memory.pools.usage{name="Metaspace"}
For BufferPoolMetricSet, direct.count -> jvm.direct.count
For ThreadStatesGaugeSet (newly added), count -> jvm.thread.count
For G1, the jvm metrics exposed now:
metrics_jvm_gc_time_Value{name="G1-Old-Generation",role="Worker"} 0
1695731141588
metrics_jvm_gc_count_Value{name="G1-Young-Generation",role="Worker"} 2
1695731141588
metrics_jvm_gc_time_Value{name="G1-Young-Generation",role="Worker"} 74
1695731141588
metrics_jvm_gc_count_Value{name="G1-Old-Generation",role="Worker"} 0
1695731141588
metrics_jvm_heap_committed_Value{role="Worker"} 2109734912 1695731141588
metrics_jvm_non_heap_used_Value{role="Worker"} 47700056 1695731141588
metrics_jvm_heap_used_Value{role="Worker"} 82801184 1695731141588
metrics_jvm_total_committed_Value{role="Worker"} 2160263168 1695731141588
metrics_jvm_total_init_Value{role="Worker"} 2112290816 1695731141588
metrics_jvm_non_heap_max_Value{role="Worker"} -1 1695731141588
metrics_jvm_heap_usage_Value{role="Worker"} 0.009639326483011246
1695731141588
metrics_jvm_total_used_Value{role="Worker"} 130502480 1695731141589
metrics_jvm_heap_init_Value{role="Worker"} 2109734912 1695731141589
metrics_jvm_non_heap_committed_Value{role="Worker"} 50528256 1695731141589
metrics_jvm_non_heap_init_Value{role="Worker"} 2555904 1695731141589
metrics_jvm_non_heap_usage_Value{role="Worker"} -4.7701296E7 1695731141589
metrics_jvm_heap_max_Value{role="Worker"} 8589934592 1695731141589
metrics_jvm_total_max_Value{role="Worker"} 8589934591 1695731141589
metrics_jvm_memory_pool_used_Value{name="Code-Cache",role="Worker"}
10314368 1695731141588
metrics_jvm_memory_pool_committed_Value{name="Code-Cache",role="Worker"}
10944512 1695731141588
metrics_jvm_memory_pool_init_Value{name="G1-Eden-Space",role="Worker"}
111149056 1695731141588
metrics_jvm_memory_pool_max_Value{name="G1-Old-Gen",role="Worker"}
8589934592 1695731141588
metrics_jvm_memory_pool_used_after_gc_Value{name="G1-Survivor-Space",role="Worker"}
14680064 1695731141588
metrics_jvm_memory_pool_used_Value{name="Compressed-Class-Space",role="Worker"}
4440192 1695731141588
metrics_jvm_memory_pool_usage_Value{name="Metaspace",role="Worker"}
0.9449504192610433 1695731141588
metrics_jvm_memory_pool_max_Value{name="Metaspace",role="Worker"} -1
1695731141588
metrics_jvm_memory_pool_init_Value{name="G1-Survivor-Space",role="Worker"}
0 1695731141588
metrics_jvm_memory_pool_committed_Value{name="G1-Old-Gen",role="Worker"}
1998585856 1695731141588
metrics_jvm_memory_pool_committed_Value{name="G1-Survivor-Space",role="Worker"}
14680064 1695731141588
metrics_jvm_memory_pool_committed_Value{name="G1-Eden-Space",role="Worker"}
96468992 1695731141588
metrics_jvm_memory_pool_max_Value{name="G1-Survivor-Space",role="Worker"}
-1 1695731141588
metrics_jvm_memory_pool_usage_Value{name="Compressed-Class-Space",role="Worker"}
0.004135251045227051 1695731141588
metrics_jvm_memory_pool_usage_Value{name="G1-Survivor-Space",role="Worker"}
1.0 1695731141588
metrics_jvm_memory_pool_max_Value{name="Code-Cache",role="Worker"}
251658240 1695731141588
metrics_jvm_memory_pool_init_Value{name="Compressed-Class-Space",role="Worker"}
0 1695731141589
metrics_jvm_memory_pool_usage_Value{name="G1-Eden-Space",role="Worker"}
0.34782608695652173 1695731141589
metrics_jvm_memory_pool_init_Value{name="Metaspace",role="Worker"} 0
1695731141589
metrics_jvm_memory_pool_max_Value{name="G1-Eden-Space",role="Worker"} -1
1695731141589
metrics_jvm_memory_pool_usage_Value{name="Code-Cache",role="Worker"}
0.04098917643229167 1695731141589
metrics_jvm_memory_pool_used_after_gc_Value{name="G1-Eden-Space",role="Worker"}
0 1695731141589
metrics_jvm_memory_pool_init_Value{name="Code-Cache",role="Worker"} 2555904
1695731141589
metrics_jvm_memory_pool_used_Value{name="G1-Survivor-Space",role="Worker"}
14680064 1695731141589
metrics_jvm_memory_pool_committed_Value{name="Compressed-Class-Space",role="Worker"}
4718592 1695731141589
metrics_jvm_memory_pool_used_Value{name="G1-Eden-Space",role="Worker"}
33554432 1695731141589
metrics_jvm_memory_pool_used_Value{name="G1-Old-Gen",role="Worker"}
34566688 1695731141589
metrics_jvm_memory_pool_usage_Value{name="G1-Old-Gen",role="Worker"}
0.004024092108011246 1695731141589
metrics_jvm_memory_pool_used_after_gc_Value{name="G1-Old-Gen",role="Worker"} 0
1695731141589
metrics_jvm_memory_pool_committed_Value{name="Metaspace",role="Worker"}
34865152 1695731141589
metrics_jvm_memory_pool_init_Value{name="G1-Old-Gen",role="Worker"}
1998585856 1695731141589
metrics_jvm_memory_pool_used_Value{name="Metaspace",role="Worker"} 32945840
1695731141589
metrics_jvm_memory_pool_max_Value{name="Compressed-Class-Space",role="Worker"}
1073741824 1695731141589
metrics_jvm_direct_count_Value{role="Worker"} 8 1695731141589
metrics_jvm_direct_capacity_Value{role="Worker"} 1036 1695731141589
metrics_jvm_direct_used_Value{role="Worker"} 1037 1695731141589
metrics_jvm_mapped_used_Value{role="Worker"} 0 1695731141589
metrics_jvm_mapped_capacity_Value{role="Worker"} 0 1695731141589
metrics_jvm_mapped_count_Value{role="Worker"} 0 1695731141589
metrics_jvm_thread_timed_waiting_count_Value{role="Worker"} 23 1695731141589
metrics_jvm_thread_deadlock_count_Value{role="Worker"} 0 1695731141589
metrics_jvm_thread_count_Value{role="Worker"} 78 1695731141589
metrics_jvm_thread_waiting_count_Value{role="Worker"} 45 1695731141589
metrics_jvm_thread_daemon_count_Value{role="Worker"} 75 1695731141589
metrics_jvm_thread_new_count_Value{role="Worker"} 0 1695731141589
metrics_jvm_thread_blocked_count_Value{role="Worker"} 0 1695731141590
metrics_jvm_thread_deadlocks_Value{role="Worker"} [] 1695731141590
metrics_jvm_thread_runnable_count_Value{role="Worker"} 10 1695731141590
metrics_jvm_thread_terminated_count_Value{role="Worker"} 0 1695731141590
### How was this patch tested?
UT and cluster tests with G1, PS-Scavenge/PS-MarkSweep and ParNew/CMS.
Closes #1939 from onebox-li/improve-jvm-metrics.
Authored-by: onebox-li <[email protected]>
Signed-off-by: zky.zhoukeyong <[email protected]>
---
.../celeborn/common/metrics/source/JVMSource.scala | 82 +++++++++++++++++++---
.../common/metrics/source/JVMSourceSuite.scala | 55 +++++++++++++++
2 files changed, 126 insertions(+), 11 deletions(-)
diff --git
a/common/src/main/scala/org/apache/celeborn/common/metrics/source/JVMSource.scala
b/common/src/main/scala/org/apache/celeborn/common/metrics/source/JVMSource.scala
index 374024dd2..a87924868 100644
---
a/common/src/main/scala/org/apache/celeborn/common/metrics/source/JVMSource.scala
+++
b/common/src/main/scala/org/apache/celeborn/common/metrics/source/JVMSource.scala
@@ -20,26 +20,86 @@ package org.apache.celeborn.common.metrics.source
import java.lang.management.ManagementFactory
import scala.collection.JavaConverters._
+import scala.collection.mutable
-import com.codahale.metrics.Gauge
-import com.codahale.metrics.jvm.{BufferPoolMetricSet,
GarbageCollectorMetricSet, MemoryUsageGaugeSet}
+import com.codahale.metrics.{Gauge, MetricRegistry}
+import com.codahale.metrics.jvm.{BufferPoolMetricSet,
GarbageCollectorMetricSet, MemoryUsageGaugeSet, ThreadStatesGaugeSet}
import org.apache.celeborn.common.CelebornConf
class JVMSource(conf: CelebornConf, role: String) extends AbstractSource(conf,
role) {
override val sourceName = "JVM"
- // all of metrics of GCMetricSet and BufferPoolMetricSet are Gauge
- Seq(
- new GarbageCollectorMetricSet(),
- new MemoryUsageGaugeSet(),
- new BufferPoolMetricSet(ManagementFactory.getPlatformMBeanServer))
- .map { x =>
- x.getMetrics.asScala.map {
- case (name: String, metric: Gauge[_]) => addGauge(name, metric)
- case (name, metric) => new IllegalArgumentException(s"Unknown metric
type: $name: $metric")
+ import JVMSource._
+
+ private val gcNames =
ManagementFactory.getGarbageCollectorMXBeans.asScala.map(bean =>
+ WHITESPACE.matcher(bean.getName).replaceAll("-"))
+ private val poolNames =
ManagementFactory.getMemoryPoolMXBeans.asScala.map(bean =>
+ WHITESPACE.matcher(bean.getName).replaceAll("-"))
+
+ /**
+ * Add jvm metric prefix, remove pool info from name and obtain the pool
name as labels if needed
+ * @param metricName metric name from MetricSet
+ * @param targets keywords need to be replaced
+ * @param prefix prefix for new metric name
+ * @param replacement replacement for pool name
+ * @return new metric without target, labels if exists
+ */
+ def handleJVMMetricName(
+ metricName: String,
+ targets: mutable.Buffer[String],
+ prefix: String,
+ replacement: String): (String, Map[String, String]) = {
+ for (target <- targets) {
+ if (metricName.contains(target)) {
+ val labels = Map("name" -> target)
+ var replaceTarget = target
+ if (replacement.isEmpty) {
+ replaceTarget = target + "."
+ }
+ return (MetricRegistry.name(prefix, metricName.replace(replaceTarget,
replacement)), labels)
}
}
+ (MetricRegistry.name(prefix, metricName), Map.empty[String, String])
+ }
+
+ // all metrics in MetricSet are gauges
+ Seq(new GarbageCollectorMetricSet()).map(_.getMetrics.asScala.map {
+ case (name: String, metric: Gauge[_]) =>
+ val newMetrics = handleJVMMetricName(name, gcNames, JVM_METRIC_PREFIX,
"gc")
+ addGauge(newMetrics._1, newMetrics._2, metric)
+ case (name, metric) => new IllegalArgumentException(s"Unknown metric type:
$name: $metric")
+ })
+
+ Seq(new MemoryUsageGaugeSet()).map(_.getMetrics.asScala.map {
+ case (name: String, metric: Gauge[_]) =>
+ val newMetrics = handleJVMMetricName(name, poolNames,
JVM_METRIC_MEMORY_PREFIX, "")
+ addGauge(newMetrics._1, newMetrics._2, metric)
+ case (name, metric) => new IllegalArgumentException(s"Unknown metric type:
$name: $metric")
+ })
+
+ Seq(
+ new BufferPoolMetricSet(ManagementFactory.getPlatformMBeanServer)).map(
+ _.getMetrics.asScala.map {
+ case (name: String, metric: Gauge[_]) =>
+ addGauge(MetricRegistry.name(JVM_METRIC_PREFIX, name), metric)
+ case (name, metric) => new IllegalArgumentException(s"Unknown metric
type: $name: $metric")
+ })
+
+ Seq(new ThreadStatesGaugeSet()).map(_.getMetrics.asScala.map {
+ case (name: String, metric: Gauge[_]) =>
+ addGauge(MetricRegistry.name(JVM_METRIC_THREAD_PREFIX, name), metric)
+ case (name, metric) => new IllegalArgumentException(s"Unknown metric type:
$name: $metric")
+ })
+
// start cleaner
startCleaner()
}
+
+object JVMSource {
+ private val JVM_METRIC_PREFIX = "jvm"
+ private val JVM_METRIC_MEMORY_PREFIX = JVM_METRIC_PREFIX + ".memory"
+ private val JVM_METRIC_THREAD_PREFIX = JVM_METRIC_PREFIX + ".thread"
+
+ private val WHITESPACE = "\\s+".r.pattern
+}
diff --git
a/common/src/test/scala/org/apache/celeborn/common/metrics/source/JVMSourceSuite.scala
b/common/src/test/scala/org/apache/celeborn/common/metrics/source/JVMSourceSuite.scala
new file mode 100644
index 000000000..6f4f8cac7
--- /dev/null
+++
b/common/src/test/scala/org/apache/celeborn/common/metrics/source/JVMSourceSuite.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.celeborn.common.metrics.source
+
+import org.apache.celeborn.CelebornFunSuite
+import org.apache.celeborn.common.CelebornConf
+
+class JVMSourceSuite extends CelebornFunSuite {
+
+ val gcNames = Seq("G1-Young-Generation", "G1-Old-Generation").toBuffer
+ val poolNames = Seq("G1-Eden-Space", "G1-Survivor-Space",
"G1-Old-Gen").toBuffer
+
+ val JVM_METRIC_PREFIX = "jvm"
+ val JVM_MEMORY_PREFIX = "jvm.memory"
+
+ test("Test handleJVMMetricName") {
+
+ val jvmSource = new JVMSource(new CelebornConf(), "test")
+
+ val gcMetric1 = "G1-Old-Generation.time"
+ val gcMetric2 = "G1-Young-Generation.count"
+ val gcResult1 = jvmSource.handleJVMMetricName(gcMetric1, gcNames,
JVM_METRIC_PREFIX, "gc")
+ val gcResult2 = jvmSource.handleJVMMetricName(gcMetric2, gcNames,
JVM_METRIC_PREFIX, "gc")
+ assert(gcResult1._1 == "jvm.gc.time")
+ assert(gcResult1._2 == Map("name" -> "G1-Old-Generation"))
+ assert(gcResult2._1 == "jvm.gc.count")
+ assert(gcResult2._2 == Map("name" -> "G1-Young-Generation"))
+
+ val memoryMetric1 = "total.init"
+ val memoryMetrics = "pools.G1-Eden-Space.init"
+ val memoryResult1 =
+ jvmSource.handleJVMMetricName(memoryMetric1, poolNames,
JVM_MEMORY_PREFIX, "")
+ val memoryResult2 =
+ jvmSource.handleJVMMetricName(memoryMetrics, poolNames,
JVM_MEMORY_PREFIX, "")
+ assert(memoryResult1._1 == "jvm.memory.total.init")
+ assert(memoryResult1._2 == Map.empty[String, String])
+ assert(memoryResult2._1 == "jvm.memory.pools.init")
+ assert(memoryResult2._2 == Map("name" -> "G1-Eden-Space"))
+ }
+}