This is an automated email from the ASF dual-hosted git repository.
zuston pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-uniffle.git
The following commit(s) were added to refs/heads/master by this push:
new d47ec222f [#1728] feat(server): Introduce disks timeout metrics (#1729)
d47ec222f is described below
commit d47ec222f83970c43793b0f75e68bd8015b037a2
Author: RickyMa <[email protected]>
AuthorDate: Wed May 22 17:32:43 2024 +0800
[#1728] feat(server): Introduce disks timeout metrics (#1729)
### What changes were proposed in this pull request?
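Introduce a per-disk timeout metric (`local_storage_is_timeout`) for the local storage checker. The gauge is set to 1 when a disk check times out and reset to 0 when the check completes; a timed-out check is now logged as a warning and no longer marks the disk as corrupted.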
### Why are the changes needed?
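Addresses https://github.com/apache/incubator-uniffle/issues/1728. A timeout while checking a local disk may only mean that the disk's IO load is very high, so it is exposed as a metric instead of being treated as disk corruption.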
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Existing tests.
---
.../java/org/apache/uniffle/server/LocalStorageChecker.java | 11 ++++++++---
.../java/org/apache/uniffle/server/ShuffleServerMetrics.java | 4 ++++
2 files changed, 12 insertions(+), 3 deletions(-)
diff --git
a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
index c73346fad..c15f2e038 100644
--- a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
+++ b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
@@ -122,6 +122,9 @@ public class LocalStorageChecker extends Checker {
ShuffleServerMetrics.gaugeLocalStorageIsWritable
.labels(storageInfo.storage.getBasePath())
.set(isWritable ? 0 : 1);
+ ShuffleServerMetrics.gaugeLocalStorageIsTimeout
+ .labels(storageInfo.storage.getBasePath())
+ .set(0);
if (storageInfo.checkIsSpaceEnough(total, availableBytes)) {
num.incrementAndGet();
@@ -145,10 +148,12 @@ public class LocalStorageChecker extends Checker {
} catch (Exception e) {
if (e instanceof ExecutionException) {
if (e.getCause() instanceof TimeoutException) {
- storageInfo.markCorrupted();
- LOG.error(
- "Timeout of checking local storage: {}. This should not happen
and mark this disk corrupted.",
+ LOG.warn(
+ "Timeout of checking local storage: {}. The current disk's IO
load may be very high.",
storageInfo.storage.getBasePath());
+ ShuffleServerMetrics.gaugeLocalStorageIsTimeout
+ .labels(storageInfo.storage.getBasePath())
+ .set(1);
continue;
}
}
diff --git
a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
index 140f76566..f97978c0f 100644
--- a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
+++ b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
@@ -78,6 +78,7 @@ public class ShuffleServerMetrics {
"total_require_read_memory_failed_num";
private static final String LOCAL_STORAGE_IS_WRITABLE =
"local_storage_is_writable";
+ private static final String LOCAL_STORAGE_IS_TIMEOUT =
"local_storage_is_timeout";
private static final String LOCAL_STORAGE_TOTAL_DIRS_NUM =
"local_storage_total_dirs_num";
private static final String LOCAL_STORAGE_CORRUPTED_DIRS_NUM =
"local_storage_corrupted_dirs_num";
private static final String LOCAL_STORAGE_TOTAL_SPACE =
"local_storage_total_space";
@@ -189,6 +190,7 @@ public class ShuffleServerMetrics {
public static Gauge.Child gaugeAppWithHugePartitionNum;
public static Gauge gaugeLocalStorageIsWritable;
+ public static Gauge gaugeLocalStorageIsTimeout;
public static Gauge.Child gaugeLocalStorageTotalDirsNum;
public static Gauge.Child gaugeLocalStorageCorruptedDirsNum;
public static Gauge.Child gaugeLocalStorageTotalSpace;
@@ -405,6 +407,8 @@ public class ShuffleServerMetrics {
gaugeLocalStorageIsWritable =
metricsManager.addGauge(LOCAL_STORAGE_IS_WRITABLE,
LOCAL_DISK_PATH_LABEL);
+ gaugeLocalStorageIsTimeout =
+ metricsManager.addGauge(LOCAL_STORAGE_IS_TIMEOUT,
LOCAL_DISK_PATH_LABEL);
gaugeLocalStorageTotalDirsNum =
metricsManager.addLabeledGauge(LOCAL_STORAGE_TOTAL_DIRS_NUM);
gaugeLocalStorageCorruptedDirsNum =
metricsManager.addLabeledGauge(LOCAL_STORAGE_CORRUPTED_DIRS_NUM);