This is an automated email from the ASF dual-hosted git repository.

roryqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-uniffle.git


The following commit(s) were added to refs/heads/master by this push:
     new ec4b4a47c [#1286] improvement(server): Add RemoveResourceTime Metric 
(#1288)
ec4b4a47c is described below

commit ec4b4a47cb252604d4cc31fdb440418929f19551
Author: Qing <[email protected]>
AuthorDate: Wed Nov 1 19:02:51 2023 +0800

    [#1286] improvement(server): Add RemoveResourceTime Metric (#1288)
    
    ### What changes were proposed in this pull request?
    
    Add RemoveResourceTime Metric
    
    
    ### Why are the changes needed?
    
    To monitor the time used by the resource-removal methods.
    
    Fix: https://github.com/apache/incubator-uniffle/issues/1286
    
    ### Does this PR introduce _any_ user-facing change?
    No.
---
 .../org/apache/uniffle/server/DefaultFlushEventHandler.java  |  5 +++++
 .../java/org/apache/uniffle/server/ShuffleServerMetrics.java | 12 ++++++++++++
 .../java/org/apache/uniffle/server/ShuffleTaskManager.java   |  7 +++++++
 3 files changed, 24 insertions(+)

diff --git 
a/server/src/main/java/org/apache/uniffle/server/DefaultFlushEventHandler.java 
b/server/src/main/java/org/apache/uniffle/server/DefaultFlushEventHandler.java
index 5136caafd..e6123d1f4 100644
--- 
a/server/src/main/java/org/apache/uniffle/server/DefaultFlushEventHandler.java
+++ 
b/server/src/main/java/org/apache/uniffle/server/DefaultFlushEventHandler.java
@@ -137,6 +137,11 @@ public class DefaultFlushEventHandler implements 
FlushEventHandler {
         
shuffleServerConf.getInteger(ShuffleServerConf.SERVER_FLUSH_THREAD_POOL_QUEUE_SIZE);
     BlockingQueue<Runnable> waitQueue = 
Queues.newLinkedBlockingQueue(waitQueueSize);
     long keepAliveTime = 
shuffleServerConf.getLong(ShuffleServerConf.SERVER_FLUSH_THREAD_ALIVE);
+    LOG.info(
+        "CreateFlushPool, poolSize:{}, keepAliveTime:{}, queueSize:{}",
+        poolSize,
+        keepAliveTime,
+        waitQueueSize);
     return new ThreadPoolExecutor(
         poolSize,
         poolSize,
diff --git 
a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java 
b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
index dbbe36a69..8f6f177b1 100644
--- a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
+++ b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
@@ -24,6 +24,7 @@ import com.google.common.collect.Maps;
 import io.prometheus.client.CollectorRegistry;
 import io.prometheus.client.Counter;
 import io.prometheus.client.Gauge;
+import io.prometheus.client.Summary;
 import org.apache.commons.lang3.StringUtils;
 
 import org.apache.uniffle.common.metrics.MetricsManager;
@@ -105,6 +106,10 @@ public class ShuffleServerMetrics {
   private static final String LOCAL_FILE_EVENT_FLUSH_NUM = 
"local_file_event_flush_num";
   private static final String HADOOP_EVENT_FLUSH_NUM = 
"hadoop_event_flush_num";
 
+  private static final String TOTAL_REMOVE_RESOURCE_TIME = 
"total_remove_resource_time";
+  private static final String TOTAL_REMOVE_RESOURCE_BY_SHUFFLE_IDS_TIME =
+      "total_remove_resource_by_shuffle_ids_time";
+
   public static Counter.Child counterTotalAppNum;
   public static Counter.Child counterTotalAppWithHugePartitionNum;
   public static Counter.Child counterTotalPartitionNum;
@@ -140,6 +145,9 @@ public class ShuffleServerMetrics {
   public static Counter.Child counterTotalRequireReadMemoryRetryNum;
   public static Counter.Child counterTotalRequireReadMemoryFailedNum;
 
+  public static Summary summaryTotalRemoveResourceTime;
+  public static Summary summaryTotalRemoveResourceByShuffleIdsTime;
+
   public static Gauge.Child gaugeHugePartitionNum;
   public static Gauge.Child gaugeAppWithHugePartitionNum;
 
@@ -330,5 +338,9 @@ public class ShuffleServerMetrics {
 
     counterLocalFileEventFlush = 
metricsManager.addCounter(LOCAL_FILE_EVENT_FLUSH_NUM);
     counterHadoopEventFlush = 
metricsManager.addCounter(HADOOP_EVENT_FLUSH_NUM);
+
+    summaryTotalRemoveResourceTime = 
metricsManager.addSummary(TOTAL_REMOVE_RESOURCE_TIME);
+    summaryTotalRemoveResourceByShuffleIdsTime =
+        metricsManager.addSummary(TOTAL_REMOVE_RESOURCE_BY_SHUFFLE_IDS_TIME);
   }
 }
diff --git 
a/server/src/main/java/org/apache/uniffle/server/ShuffleTaskManager.java 
b/server/src/main/java/org/apache/uniffle/server/ShuffleTaskManager.java
index 9e15e3892..4021c7d33 100644
--- a/server/src/main/java/org/apache/uniffle/server/ShuffleTaskManager.java
+++ b/server/src/main/java/org/apache/uniffle/server/ShuffleTaskManager.java
@@ -171,11 +171,18 @@ public class ShuffleTaskManager {
           while (true) {
             try {
               PurgeEvent event = expiredAppIdQueue.take();
+              long startTime = System.currentTimeMillis();
               if (event instanceof AppPurgeEvent) {
                 removeResources(event.getAppId(), true);
+                double usedTime =
+                    (System.currentTimeMillis() - startTime) / 
Constants.MILLION_SECONDS_PER_SECOND;
+                
ShuffleServerMetrics.summaryTotalRemoveResourceTime.observe(usedTime);
               }
               if (event instanceof ShufflePurgeEvent) {
                 removeResourcesByShuffleIds(event.getAppId(), 
event.getShuffleIds());
+                double usedTime =
+                    (System.currentTimeMillis() - startTime) / 
Constants.MILLION_SECONDS_PER_SECOND;
+                
ShuffleServerMetrics.summaryTotalRemoveResourceByShuffleIdsTime.observe(usedTime);
               }
             } catch (Exception e) {
               LOG.error("Exception happened when clear resource for expired 
application", e);

Reply via email to