This is an automated email from the ASF dual-hosted git repository.
roryqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-uniffle.git
The following commit(s) were added to refs/heads/master by this push:
new 3a1b4d23e [#1598] fix(server) Fix inaccurate used_direct_memory_size
metric (#1599)
3a1b4d23e is described below
commit 3a1b4d23e58d772f57733179025f53e819671344
Author: RickyMa <[email protected]>
AuthorDate: Mon Mar 25 19:03:23 2024 +0800
[#1598] fix(server) Fix inaccurate used_direct_memory_size metric (#1599)
### What changes were proposed in this pull request?
Fix the inaccurate `used_direct_memory_size` metric: it now reports the sum of
the direct memory used by Netty and by gRPC's shaded Netty.
The `used_direct_memory_size_by_netty` and
`used_direct_memory_size_by_grpc_netty` metrics are also added to provide more
detailed indicators for locating and analyzing memory issues in production.
### Why are the changes needed?
Fix https://github.com/apache/incubator-uniffle/issues/1598.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Tested in our environment.
---
.../apache/uniffle/server/NettyDirectMemoryTracker.java | 15 ++++++++++++---
.../org/apache/uniffle/server/ShuffleServerMetrics.java | 11 +++++++++--
2 files changed, 21 insertions(+), 5 deletions(-)
diff --git
a/server/src/main/java/org/apache/uniffle/server/NettyDirectMemoryTracker.java
b/server/src/main/java/org/apache/uniffle/server/NettyDirectMemoryTracker.java
index 96206cc65..e9eb17060 100644
---
a/server/src/main/java/org/apache/uniffle/server/NettyDirectMemoryTracker.java
+++
b/server/src/main/java/org/apache/uniffle/server/NettyDirectMemoryTracker.java
@@ -53,11 +53,20 @@ public class NettyDirectMemoryTracker {
service.scheduleAtFixedRate(
() -> {
try {
- long usedDirectMemory = PlatformDependent.usedDirectMemory();
+ long usedDirectMemoryByNetty =
PlatformDependent.usedDirectMemory();
+ long usedDirectMemoryByGrpcNetty =
+
io.grpc.netty.shaded.io.netty.util.internal.PlatformDependent.usedDirectMemory();
if (LOG.isDebugEnabled()) {
- LOG.debug("Current usedDirectMemory:{}", usedDirectMemory);
+ LOG.debug(
+ "Current usedDirectMemoryByNetty:{},
usedDirectMemoryByGrpcNetty:{}",
+ usedDirectMemoryByNetty,
+ usedDirectMemoryByGrpcNetty);
}
-
ShuffleServerMetrics.gaugeUsedDirectMemorySize.set(usedDirectMemory);
+
ShuffleServerMetrics.gaugeUsedDirectMemorySizeByNetty.set(usedDirectMemoryByNetty);
+ ShuffleServerMetrics.gaugeUsedDirectMemorySizeByGrpcNetty.set(
+ usedDirectMemoryByGrpcNetty);
+ ShuffleServerMetrics.gaugeUsedDirectMemorySize.set(
+ usedDirectMemoryByNetty + usedDirectMemoryByGrpcNetty);
} catch (Throwable t) {
LOG.error("Failed to report direct memory.", t);
}
diff --git
a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
index 274cde008..f1f37369e 100644
--- a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
+++ b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
@@ -79,8 +79,9 @@ public class ShuffleServerMetrics {
private static final String USED_BUFFER_SIZE = "used_buffer_size";
private static final String READ_USED_BUFFER_SIZE = "read_used_buffer_size";
private static final String USED_DIRECT_MEMORY_SIZE =
"used_direct_memory_size";
- private static final String ALLOCATED_DIRECT_MEMORY_SIZE =
"allocated_direct_memory_size";
- private static final String PINNED_DIRECT_MEMORY_SIZE =
"pinned_direct_memory_size";
+ private static final String USED_DIRECT_MEMORY_SIZE_BY_NETTY =
"used_direct_memory_size_by_netty";
+ private static final String USED_DIRECT_MEMORY_SIZE_BY_GRPC_NETTY =
+ "used_direct_memory_size_by_grpc_netty";
private static final String TOTAL_FAILED_WRITTEN_EVENT_NUM =
"total_failed_written_event_num";
private static final String TOTAL_DROPPED_EVENT_NUM =
"total_dropped_event_num";
private static final String TOTAL_HADOOP_WRITE_DATA =
"total_hadoop_write_data";
@@ -186,6 +187,8 @@ public class ShuffleServerMetrics {
public static Gauge.Child gaugeUsedBufferSize;
public static Gauge.Child gaugeReadBufferUsedSize;
public static Gauge.Child gaugeUsedDirectMemorySize;
+ public static Gauge.Child gaugeUsedDirectMemorySizeByNetty;
+ public static Gauge.Child gaugeUsedDirectMemorySizeByGrpcNetty;
public static Gauge.Child gaugeWriteHandler;
public static Gauge.Child gaugeEventQueueSize;
public static Gauge.Child gaugeHadoopFlushThreadPoolQueueSize;
@@ -382,6 +385,10 @@ public class ShuffleServerMetrics {
gaugeUsedBufferSize = metricsManager.addLabeledGauge(USED_BUFFER_SIZE);
gaugeReadBufferUsedSize =
metricsManager.addLabeledGauge(READ_USED_BUFFER_SIZE);
gaugeUsedDirectMemorySize =
metricsManager.addLabeledGauge(USED_DIRECT_MEMORY_SIZE);
+ gaugeUsedDirectMemorySizeByNetty =
+ metricsManager.addLabeledGauge(USED_DIRECT_MEMORY_SIZE_BY_NETTY);
+ gaugeUsedDirectMemorySizeByGrpcNetty =
+ metricsManager.addLabeledGauge(USED_DIRECT_MEMORY_SIZE_BY_GRPC_NETTY);
gaugeWriteHandler = metricsManager.addLabeledGauge(TOTAL_WRITE_HANDLER);
gaugeEventQueueSize = metricsManager.addLabeledGauge(EVENT_QUEUE_SIZE);
gaugeHadoopFlushThreadPoolQueueSize =