This is an automated email from the ASF dual-hosted git repository.

xianjingfeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/uniffle.git
The following commit(s) were added to refs/heads/master by this push:
     new 03f42c542 [#2525] improvement(server): add some metrics for LAB (#2553)
03f42c542 is described below

commit 03f42c542ba5245d9a70283c5adc0e99eb40d298
Author: xianjingfeng <xianjingfeng...@gmail.com>
AuthorDate: Fri Jul 25 14:53:04 2025 +0800

    [#2525] improvement(server): add some metrics for LAB (#2553)

    ### What changes were proposed in this pull request?
    Add some metrics for LAB.

    ### Why are the changes needed?
    These metrics can help us to adjust LAB related parameters.

    Fix: #2525

    ### Does this PR introduce any user-facing change?
    No.

    ### How was this patch tested?
    Verify in production environment.
---
 .../uniffle/server/ShuffleServerMetrics.java       | 24 ++++++++++++++++++++--
 .../server/buffer/ShuffleBufferWithLinkedList.java |  2 ++
 .../server/buffer/ShuffleBufferWithSkipList.java   |  2 ++
 .../uniffle/server/buffer/lab/ChunkCreator.java    | 20 ++++++++++++++----
 .../org/apache/uniffle/server/buffer/lab/LAB.java  |  3 +++
 5 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java index 9b6c548e1..f8689553c 100644 --- a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java +++ b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java @@ -149,7 +149,13 @@ public class ShuffleServerMetrics { private static final String TOTAL_REMOVE_RESOURCE_TIME = "total_remove_resource_time"; private static final String TOTAL_REMOVE_RESOURCE_BY_SHUFFLE_IDS_TIME = "total_remove_resource_by_shuffle_ids_time"; - + private static final String LAB_CREATED_CHUNK_COUNT = "lab_created_chunk_count"; + private static final String LAB_REUSED_CHUNK_COUNT = "lab_reused_chunk_count"; + private static final String LAB_RECLAIMED_CHUNK_COUNT = "lab_reclaimed_chunk_count"; + private static final String LAB_CHUNK_POOL_REMAIN_PERCENT =
"lab_chunk_pool_remain_percent"; + private static final String NOT_ON_LAB_BLOCK_COUNT = "not_on_lab_block_count"; + private static final String ON_LAB_BLOCK_COUNT = "on_lab_block_count"; + private static final String BUFFER_BLOCK_SIZE = "buffer_block_size"; public static final String TOPN_OF_TOTAL_DATA_SIZE_FOR_APP = "topN_of_total_data_size_for_app"; public static final String TOPN_OF_IN_MEMORY_DATA_SIZE_FOR_APP = "topN_of_in_memory_data_size_for_app"; @@ -268,6 +274,13 @@ public class ShuffleServerMetrics { public static Counter counterHadoopEventFlush; public static Counter counterPreAllocatedBufferExpired; public static Counter counterAppNotFound; + public static Counter counterLABChunkCreated; + public static Counter counterLABChunkReused; + public static Gauge gaugeLABChunkReclaimed; + public static Gauge gaugeLABChunkPoolRemainPercent; + public static Counter counterBlockNotOnLAB; + public static Counter counterBlockOnLAB; + public static Summary summaryBufferBlockSize; private static MetricsManager metricsManager; private static boolean isRegister = false; @@ -504,7 +517,14 @@ public class ShuffleServerMetrics { summaryTotalRemoveResourceTime = metricsManager.addSummary(TOTAL_REMOVE_RESOURCE_TIME); summaryTotalRemoveResourceByShuffleIdsTime = metricsManager.addSummary(TOTAL_REMOVE_RESOURCE_BY_SHUFFLE_IDS_TIME); - + counterLABChunkCreated = metricsManager.addCounter(LAB_CREATED_CHUNK_COUNT); + counterLABChunkReused = metricsManager.addCounter(LAB_REUSED_CHUNK_COUNT); + gaugeLABChunkReclaimed = metricsManager.addGauge(LAB_RECLAIMED_CHUNK_COUNT); + gaugeLABChunkPoolRemainPercent = metricsManager.addGauge(LAB_CHUNK_POOL_REMAIN_PERCENT); + + counterBlockNotOnLAB = metricsManager.addCounter(NOT_ON_LAB_BLOCK_COUNT); + counterBlockOnLAB = metricsManager.addCounter(ON_LAB_BLOCK_COUNT); + summaryBufferBlockSize = metricsManager.addSummary(BUFFER_BLOCK_SIZE); gaugeTotalDataSizeUsage = Gauge.build() .name(TOPN_OF_TOTAL_DATA_SIZE_FOR_APP) diff --git 
a/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithLinkedList.java b/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithLinkedList.java index 2216e2907..9d4a5a57f 100644 --- a/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithLinkedList.java +++ b/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithLinkedList.java @@ -37,6 +37,7 @@ import org.apache.uniffle.common.util.Constants; import org.apache.uniffle.common.util.JavaUtils; import org.apache.uniffle.server.ShuffleDataFlushEvent; import org.apache.uniffle.server.ShuffleFlushManager; +import org.apache.uniffle.server.ShuffleServerMetrics; public class ShuffleBufferWithLinkedList extends AbstractShuffleBuffer { // blocks will be added to inFlushBlockMap as <eventId, blocks> pair @@ -59,6 +60,7 @@ public class ShuffleBufferWithLinkedList extends AbstractShuffleBuffer { long currentDataLength = 0; for (ShufflePartitionedBlock block : data.getBlockList()) { + ShuffleServerMetrics.summaryBufferBlockSize.observe(block.getDataLength()); // If sendShuffleData retried, we may receive duplicate block. The duplicate // block would gc without release. Here we must release the duplicated block. 
if (addBlock(block)) { diff --git a/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithSkipList.java b/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithSkipList.java index 33ba313ce..83f526a5c 100644 --- a/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithSkipList.java +++ b/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferWithSkipList.java @@ -39,6 +39,7 @@ import org.apache.uniffle.common.util.Constants; import org.apache.uniffle.common.util.JavaUtils; import org.apache.uniffle.server.ShuffleDataFlushEvent; import org.apache.uniffle.server.ShuffleFlushManager; +import org.apache.uniffle.server.ShuffleServerMetrics; public class ShuffleBufferWithSkipList extends AbstractShuffleBuffer { private ConcurrentSkipListMap<Long, ShufflePartitionedBlock> blocksMap; @@ -66,6 +67,7 @@ public class ShuffleBufferWithSkipList extends AbstractShuffleBuffer { long currentDataLength = 0; for (ShufflePartitionedBlock block : data.getBlockList()) { + ShuffleServerMetrics.summaryBufferBlockSize.observe(block.getDataLength()); // If sendShuffleData retried, we may receive duplicate block. The duplicate // block would gc without release. Here we must release the duplicated block. if (!blocksMap.containsKey(block.getBlockId())) { diff --git a/server/src/main/java/org/apache/uniffle/server/buffer/lab/ChunkCreator.java b/server/src/main/java/org/apache/uniffle/server/buffer/lab/ChunkCreator.java index 36aa2b115..f20d90521 100644 --- a/server/src/main/java/org/apache/uniffle/server/buffer/lab/ChunkCreator.java +++ b/server/src/main/java/org/apache/uniffle/server/buffer/lab/ChunkCreator.java @@ -35,6 +35,8 @@ import org.apache.hadoop.util.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.uniffle.server.ShuffleServerMetrics; + /** * Does the management of LAB chunk creations. 
A monotonically incrementing id is associated with * every chunk @@ -102,10 +104,12 @@ public class ChunkCreator { if (pool != null) { chunk = pool.getChunk(); if (chunk == null) { - LOG.warn( - "The chunk pool is full. Reached maxCount= " - + pool.getMaxCount() - + ". Creating chunk outside of the pool."); + if (LOG.isDebugEnabled()) { + LOG.debug( + "The chunk pool is full. Reached maxCount= " + + pool.getMaxCount() + + ". Creating chunk outside of the pool."); + } } } @@ -134,6 +138,7 @@ public class ChunkCreator { Preconditions.checkArgument(id > 0, "chunkId should be positive."); chunk = new OffheapChunk(size, id, pool); this.chunkIdMap.put(chunk.getId(), chunk); + ShuffleServerMetrics.counterLABChunkCreated.inc(); return chunk; } @@ -207,6 +212,10 @@ public class ChunkCreator { if (chunk != null) { chunk.reset(); reusedChunkCount.increment(); + ShuffleServerMetrics.gaugeLABChunkReclaimed.set(reclaimedChunks.size()); + ShuffleServerMetrics.gaugeLABChunkPoolRemainPercent.set( + reclaimedChunks.size() * 100d / chunkCount.get()); + ShuffleServerMetrics.counterLABChunkReused.inc(); } else { // Make a chunk if we have not yet created the maxCount chunks while (true) { @@ -314,5 +323,8 @@ public class ChunkCreator { LOG.warn("Chunk {} can not be found in chunkIdMap, ignore it", chunkID); } } + ShuffleServerMetrics.gaugeLABChunkReclaimed.set(chunksPool.reclaimedChunks.size()); + ShuffleServerMetrics.gaugeLABChunkPoolRemainPercent.set( + chunksPool.reclaimedChunks.size() * 100d / chunksPool.chunkCount.get()); } } diff --git a/server/src/main/java/org/apache/uniffle/server/buffer/lab/LAB.java b/server/src/main/java/org/apache/uniffle/server/buffer/lab/LAB.java index 5d3b7000a..4da4ce4af 100644 --- a/server/src/main/java/org/apache/uniffle/server/buffer/lab/LAB.java +++ b/server/src/main/java/org/apache/uniffle/server/buffer/lab/LAB.java @@ -21,6 +21,7 @@ import java.util.LinkedList; import java.util.List; import org.apache.uniffle.common.ShufflePartitionedBlock; 
+import org.apache.uniffle.server.ShuffleServerMetrics; /** * Local allocation buffer. @@ -61,6 +62,7 @@ public class LAB { public ShufflePartitionedBlock tryCopyBlockToChunk(ShufflePartitionedBlock block) { int size = block.getDataLength(); if (size > maxAlloc) { + ShuffleServerMetrics.counterBlockNotOnLAB.inc(); return block; } Chunk c; @@ -77,6 +79,7 @@ public class LAB { currChunk = null; } c.getData().writeBytes(block.getData()); + ShuffleServerMetrics.counterBlockOnLAB.inc(); return new LABShufflePartitionedBlock( block.getDataLength(), block.getUncompressLength(),