This is an automated email from the ASF dual-hosted git repository.
roryqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-uniffle.git
The following commit(s) were added to refs/heads/master by this push:
new ec4b4a47c [#1286] improvement(server): Add RemoveResourceTime Metric
(#1288)
ec4b4a47c is described below
commit ec4b4a47cb252604d4cc31fdb440418929f19551
Author: Qing <[email protected]>
AuthorDate: Wed Nov 1 19:02:51 2023 +0800
[#1286] improvement(server): Add RemoveResourceTime Metric (#1288)
### What changes were proposed in this pull request?
Add RemoveResourceTime Metric
### Why are the changes needed?
To monitor the time used by the removeResources and removeResourcesByShuffleIds methods.
Fix: https://github.com/apache/incubator-uniffle/issues/1286
### Does this PR introduce _any_ user-facing change?
No.
---
.../org/apache/uniffle/server/DefaultFlushEventHandler.java | 5 +++++
.../java/org/apache/uniffle/server/ShuffleServerMetrics.java | 12 ++++++++++++
.../java/org/apache/uniffle/server/ShuffleTaskManager.java | 7 +++++++
3 files changed, 24 insertions(+)
diff --git
a/server/src/main/java/org/apache/uniffle/server/DefaultFlushEventHandler.java
b/server/src/main/java/org/apache/uniffle/server/DefaultFlushEventHandler.java
index 5136caafd..e6123d1f4 100644
---
a/server/src/main/java/org/apache/uniffle/server/DefaultFlushEventHandler.java
+++
b/server/src/main/java/org/apache/uniffle/server/DefaultFlushEventHandler.java
@@ -137,6 +137,11 @@ public class DefaultFlushEventHandler implements
FlushEventHandler {
shuffleServerConf.getInteger(ShuffleServerConf.SERVER_FLUSH_THREAD_POOL_QUEUE_SIZE);
BlockingQueue<Runnable> waitQueue =
Queues.newLinkedBlockingQueue(waitQueueSize);
long keepAliveTime =
shuffleServerConf.getLong(ShuffleServerConf.SERVER_FLUSH_THREAD_ALIVE);
+ LOG.info(
+ "CreateFlushPool, poolSize:{}, keepAliveTime:{}, queueSize:{}",
+ poolSize,
+ keepAliveTime,
+ waitQueueSize);
return new ThreadPoolExecutor(
poolSize,
poolSize,
diff --git
a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
index dbbe36a69..8f6f177b1 100644
--- a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
+++ b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
@@ -24,6 +24,7 @@ import com.google.common.collect.Maps;
import io.prometheus.client.CollectorRegistry;
import io.prometheus.client.Counter;
import io.prometheus.client.Gauge;
+import io.prometheus.client.Summary;
import org.apache.commons.lang3.StringUtils;
import org.apache.uniffle.common.metrics.MetricsManager;
@@ -105,6 +106,10 @@ public class ShuffleServerMetrics {
private static final String LOCAL_FILE_EVENT_FLUSH_NUM =
"local_file_event_flush_num";
private static final String HADOOP_EVENT_FLUSH_NUM =
"hadoop_event_flush_num";
+ private static final String TOTAL_REMOVE_RESOURCE_TIME =
"total_remove_resource_time";
+ private static final String TOTAL_REMOVE_RESOURCE_BY_SHUFFLE_IDS_TIME =
+ "total_remove_resource_by_shuffle_ids_time";
+
public static Counter.Child counterTotalAppNum;
public static Counter.Child counterTotalAppWithHugePartitionNum;
public static Counter.Child counterTotalPartitionNum;
@@ -140,6 +145,9 @@ public class ShuffleServerMetrics {
public static Counter.Child counterTotalRequireReadMemoryRetryNum;
public static Counter.Child counterTotalRequireReadMemoryFailedNum;
+ public static Summary summaryTotalRemoveResourceTime;
+ public static Summary summaryTotalRemoveResourceByShuffleIdsTime;
+
public static Gauge.Child gaugeHugePartitionNum;
public static Gauge.Child gaugeAppWithHugePartitionNum;
@@ -330,5 +338,9 @@ public class ShuffleServerMetrics {
counterLocalFileEventFlush =
metricsManager.addCounter(LOCAL_FILE_EVENT_FLUSH_NUM);
counterHadoopEventFlush =
metricsManager.addCounter(HADOOP_EVENT_FLUSH_NUM);
+
+ summaryTotalRemoveResourceTime =
metricsManager.addSummary(TOTAL_REMOVE_RESOURCE_TIME);
+ summaryTotalRemoveResourceByShuffleIdsTime =
+ metricsManager.addSummary(TOTAL_REMOVE_RESOURCE_BY_SHUFFLE_IDS_TIME);
}
}
diff --git
a/server/src/main/java/org/apache/uniffle/server/ShuffleTaskManager.java
b/server/src/main/java/org/apache/uniffle/server/ShuffleTaskManager.java
index 9e15e3892..4021c7d33 100644
--- a/server/src/main/java/org/apache/uniffle/server/ShuffleTaskManager.java
+++ b/server/src/main/java/org/apache/uniffle/server/ShuffleTaskManager.java
@@ -171,11 +171,18 @@ public class ShuffleTaskManager {
while (true) {
try {
PurgeEvent event = expiredAppIdQueue.take();
+ long startTime = System.currentTimeMillis();
if (event instanceof AppPurgeEvent) {
removeResources(event.getAppId(), true);
+ double usedTime =
+ (System.currentTimeMillis() - startTime) /
Constants.MILLION_SECONDS_PER_SECOND;
+
ShuffleServerMetrics.summaryTotalRemoveResourceTime.observe(usedTime);
}
if (event instanceof ShufflePurgeEvent) {
removeResourcesByShuffleIds(event.getAppId(),
event.getShuffleIds());
+ double usedTime =
+ (System.currentTimeMillis() - startTime) /
Constants.MILLION_SECONDS_PER_SECOND;
+
ShuffleServerMetrics.summaryTotalRemoveResourceByShuffleIdsTime.observe(usedTime);
}
} catch (Exception e) {
LOG.error("Exception happened when clear resource for expired
application", e);