This is an automated email from the ASF dual-hosted git repository.

xianjingfeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/uniffle.git


The following commit(s) were added to refs/heads/master by this push:
     new 03f42c542 [#2525] improvement(server): add some metrics for LAB (#2553)
03f42c542 is described below

commit 03f42c542ba5245d9a70283c5adc0e99eb40d298
Author: xianjingfeng <xianjingfeng...@gmail.com>
AuthorDate: Fri Jul 25 14:53:04 2025 +0800

    [#2525] improvement(server): add some metrics for LAB (#2553)
    
    ### What changes were proposed in this pull request?
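    Add metrics for the LAB (local allocation buffer): counters for created and
    reused chunks, gauges for the reclaimed chunk count and the chunk pool
    remaining percentage, counters for blocks stored on and off the LAB, and a
    summary of buffer block sizes. Also lower the "chunk pool is full" log
    message from WARN to DEBUG.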
    
    ### Why are the changes needed?
    These metrics help us tune LAB-related parameters.
    Fix: #2525
    
    ### Does this PR introduce any user-facing change?
    No.
    
    ### How was this patch tested?
    Verified in a production environment.
---
 .../uniffle/server/ShuffleServerMetrics.java       | 24 ++++++++++++++++++++--
 .../server/buffer/ShuffleBufferWithLinkedList.java |  2 ++
 .../server/buffer/ShuffleBufferWithSkipList.java   |  2 ++
 .../uniffle/server/buffer/lab/ChunkCreator.java    | 20 ++++++++++++++----
 .../org/apache/uniffle/server/buffer/lab/LAB.java  |  3 +++
 5 files changed, 45 insertions(+), 6 deletions(-)

diff --git 
a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java 
b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
index 9b6c548e1..f8689553c 100644
--- a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
+++ b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
@@ -149,7 +149,13 @@ public class ShuffleServerMetrics {
   private static final String TOTAL_REMOVE_RESOURCE_TIME = 
"total_remove_resource_time";
   private static final String TOTAL_REMOVE_RESOURCE_BY_SHUFFLE_IDS_TIME =
       "total_remove_resource_by_shuffle_ids_time";
-
+  private static final String LAB_CREATED_CHUNK_COUNT = 
"lab_created_chunk_count";
+  private static final String LAB_REUSED_CHUNK_COUNT = 
"lab_reused_chunk_count";
+  private static final String LAB_RECLAIMED_CHUNK_COUNT = 
"lab_reclaimed_chunk_count";
+  private static final String LAB_CHUNK_POOL_REMAIN_PERCENT = 
"lab_chunk_pool_remain_percent";
+  private static final String NOT_ON_LAB_BLOCK_COUNT = 
"not_on_lab_block_count";
+  private static final String ON_LAB_BLOCK_COUNT = "on_lab_block_count";
+  private static final String BUFFER_BLOCK_SIZE = "buffer_block_size";
   public static final String TOPN_OF_TOTAL_DATA_SIZE_FOR_APP = 
"topN_of_total_data_size_for_app";
   public static final String TOPN_OF_IN_MEMORY_DATA_SIZE_FOR_APP =
       "topN_of_in_memory_data_size_for_app";
@@ -268,6 +274,13 @@ public class ShuffleServerMetrics {
   public static Counter counterHadoopEventFlush;
   public static Counter counterPreAllocatedBufferExpired;
   public static Counter counterAppNotFound;
+  public static Counter counterLABChunkCreated;
+  public static Counter counterLABChunkReused;
+  public static Gauge gaugeLABChunkReclaimed;
+  public static Gauge gaugeLABChunkPoolRemainPercent;
+  public static Counter counterBlockNotOnLAB;
+  public static Counter counterBlockOnLAB;
+  public static Summary summaryBufferBlockSize;
 
   private static MetricsManager metricsManager;
   private static boolean isRegister = false;
@@ -504,7 +517,14 @@ public class ShuffleServerMetrics {
     summaryTotalRemoveResourceTime = 
metricsManager.addSummary(TOTAL_REMOVE_RESOURCE_TIME);
     summaryTotalRemoveResourceByShuffleIdsTime =
         metricsManager.addSummary(TOTAL_REMOVE_RESOURCE_BY_SHUFFLE_IDS_TIME);
-
+    counterLABChunkCreated = 
metricsManager.addCounter(LAB_CREATED_CHUNK_COUNT);
+    counterLABChunkReused = metricsManager.addCounter(LAB_REUSED_CHUNK_COUNT);
+    gaugeLABChunkReclaimed = 
metricsManager.addGauge(LAB_RECLAIMED_CHUNK_COUNT);
+    gaugeLABChunkPoolRemainPercent = 
metricsManager.addGauge(LAB_CHUNK_POOL_REMAIN_PERCENT);
+
+    counterBlockNotOnLAB = metricsManager.addCounter(NOT_ON_LAB_BLOCK_COUNT);
+    counterBlockOnLAB = metricsManager.addCounter(ON_LAB_BLOCK_COUNT);
+    summaryBufferBlockSize = metricsManager.addSummary(BUFFER_BLOCK_SIZE);
     gaugeTotalDataSizeUsage =
         Gauge.build()
             .name(TOPN_OF_TOTAL_DATA_SIZE_FOR_APP)
diff --git 
a/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithLinkedList.java
 
b/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithLinkedList.java
index 2216e2907..9d4a5a57f 100644
--- 
a/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithLinkedList.java
+++ 
b/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithLinkedList.java
@@ -37,6 +37,7 @@ import org.apache.uniffle.common.util.Constants;
 import org.apache.uniffle.common.util.JavaUtils;
 import org.apache.uniffle.server.ShuffleDataFlushEvent;
 import org.apache.uniffle.server.ShuffleFlushManager;
+import org.apache.uniffle.server.ShuffleServerMetrics;
 
 public class ShuffleBufferWithLinkedList extends AbstractShuffleBuffer {
   // blocks will be added to inFlushBlockMap as <eventId, blocks> pair
@@ -59,6 +60,7 @@ public class ShuffleBufferWithLinkedList extends 
AbstractShuffleBuffer {
     long currentDataLength = 0;
 
     for (ShufflePartitionedBlock block : data.getBlockList()) {
+      
ShuffleServerMetrics.summaryBufferBlockSize.observe(block.getDataLength());
       // If sendShuffleData retried, we may receive duplicate block. The 
duplicate
       // block would gc without release. Here we must release the duplicated 
block.
       if (addBlock(block)) {
diff --git 
a/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithSkipList.java
 
b/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithSkipList.java
index 33ba313ce..83f526a5c 100644
--- 
a/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithSkipList.java
+++ 
b/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithSkipList.java
@@ -39,6 +39,7 @@ import org.apache.uniffle.common.util.Constants;
 import org.apache.uniffle.common.util.JavaUtils;
 import org.apache.uniffle.server.ShuffleDataFlushEvent;
 import org.apache.uniffle.server.ShuffleFlushManager;
+import org.apache.uniffle.server.ShuffleServerMetrics;
 
 public class ShuffleBufferWithSkipList extends AbstractShuffleBuffer {
   private ConcurrentSkipListMap<Long, ShufflePartitionedBlock> blocksMap;
@@ -66,6 +67,7 @@ public class ShuffleBufferWithSkipList extends 
AbstractShuffleBuffer {
     long currentDataLength = 0;
 
     for (ShufflePartitionedBlock block : data.getBlockList()) {
+      
ShuffleServerMetrics.summaryBufferBlockSize.observe(block.getDataLength());
       // If sendShuffleData retried, we may receive duplicate block. The 
duplicate
       // block would gc without release. Here we must release the duplicated 
block.
       if (!blocksMap.containsKey(block.getBlockId())) {
diff --git 
a/server/src/main/java/org/apache/uniffle/server/buffer/lab/ChunkCreator.java 
b/server/src/main/java/org/apache/uniffle/server/buffer/lab/ChunkCreator.java
index 36aa2b115..f20d90521 100644
--- 
a/server/src/main/java/org/apache/uniffle/server/buffer/lab/ChunkCreator.java
+++ 
b/server/src/main/java/org/apache/uniffle/server/buffer/lab/ChunkCreator.java
@@ -35,6 +35,8 @@ import org.apache.hadoop.util.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.uniffle.server.ShuffleServerMetrics;
+
 /**
  * Does the management of LAB chunk creations. A monotonically incrementing id 
is associated with
  * every chunk
@@ -102,10 +104,12 @@ public class ChunkCreator {
     if (pool != null) {
       chunk = pool.getChunk();
       if (chunk == null) {
-        LOG.warn(
-            "The chunk pool is full. Reached maxCount= "
-                + pool.getMaxCount()
-                + ". Creating chunk outside of the pool.");
+        if (LOG.isDebugEnabled()) {
+          LOG.debug(
+              "The chunk pool is full. Reached maxCount= "
+                  + pool.getMaxCount()
+                  + ". Creating chunk outside of the pool.");
+        }
       }
     }
 
@@ -134,6 +138,7 @@ public class ChunkCreator {
     Preconditions.checkArgument(id > 0, "chunkId should be positive.");
     chunk = new OffheapChunk(size, id, pool);
     this.chunkIdMap.put(chunk.getId(), chunk);
+    ShuffleServerMetrics.counterLABChunkCreated.inc();
     return chunk;
   }
 
@@ -207,6 +212,10 @@ public class ChunkCreator {
       if (chunk != null) {
         chunk.reset();
         reusedChunkCount.increment();
+        
ShuffleServerMetrics.gaugeLABChunkReclaimed.set(reclaimedChunks.size());
+        ShuffleServerMetrics.gaugeLABChunkPoolRemainPercent.set(
+            reclaimedChunks.size() * 100d / chunkCount.get());
+        ShuffleServerMetrics.counterLABChunkReused.inc();
       } else {
         // Make a chunk if we have not yet created the maxCount chunks
         while (true) {
@@ -314,5 +323,8 @@ public class ChunkCreator {
         LOG.warn("Chunk {} can not be found in chunkIdMap, ignore it", 
chunkID);
       }
     }
+    
ShuffleServerMetrics.gaugeLABChunkReclaimed.set(chunksPool.reclaimedChunks.size());
+    ShuffleServerMetrics.gaugeLABChunkPoolRemainPercent.set(
+        chunksPool.reclaimedChunks.size() * 100d / 
chunksPool.chunkCount.get());
   }
 }
diff --git a/server/src/main/java/org/apache/uniffle/server/buffer/lab/LAB.java 
b/server/src/main/java/org/apache/uniffle/server/buffer/lab/LAB.java
index 5d3b7000a..4da4ce4af 100644
--- a/server/src/main/java/org/apache/uniffle/server/buffer/lab/LAB.java
+++ b/server/src/main/java/org/apache/uniffle/server/buffer/lab/LAB.java
@@ -21,6 +21,7 @@ import java.util.LinkedList;
 import java.util.List;
 
 import org.apache.uniffle.common.ShufflePartitionedBlock;
+import org.apache.uniffle.server.ShuffleServerMetrics;
 
 /**
  * Local allocation buffer.
@@ -61,6 +62,7 @@ public class LAB {
   public ShufflePartitionedBlock tryCopyBlockToChunk(ShufflePartitionedBlock 
block) {
     int size = block.getDataLength();
     if (size > maxAlloc) {
+      ShuffleServerMetrics.counterBlockNotOnLAB.inc();
       return block;
     }
     Chunk c;
@@ -77,6 +79,7 @@ public class LAB {
       currChunk = null;
     }
     c.getData().writeBytes(block.getData());
+    ShuffleServerMetrics.counterBlockOnLAB.inc();
     return new LABShufflePartitionedBlock(
         block.getDataLength(),
         block.getUncompressLength(),

Reply via email to