This is an automated email from the ASF dual-hosted git repository.

zuston pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-uniffle.git


The following commit(s) were added to refs/heads/master by this push:
     new d47ec222f [#1728] feat(server): Introduce disks timeout metrics (#1729)
d47ec222f is described below

commit d47ec222f83970c43793b0f75e68bd8015b037a2
Author: RickyMa <[email protected]>
AuthorDate: Wed May 22 17:32:43 2024 +0800

    [#1728] feat(server): Introduce disks timeout metrics (#1729)
    
    ### What changes were proposed in this pull request?
    
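    Introduce a per-disk timeout metric: a new `local_storage_is_timeout` gauge,
    labeled by disk path, is set to 1 when the local storage check times out and
    to 0 when the check completes. A timed-out disk is now logged as a warning
    and is no longer marked as corrupted.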
    
    ### Why are the changes needed?
    
    For https://github.com/apache/incubator-uniffle/issues/1728.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Existing tests.
---
 .../java/org/apache/uniffle/server/LocalStorageChecker.java   | 11 ++++++++---
 .../java/org/apache/uniffle/server/ShuffleServerMetrics.java  |  4 ++++
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
index c73346fad..c15f2e038 100644
--- a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
+++ b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
@@ -122,6 +122,9 @@ public class LocalStorageChecker extends Checker {
                 ShuffleServerMetrics.gaugeLocalStorageIsWritable
                     .labels(storageInfo.storage.getBasePath())
                     .set(isWritable ? 0 : 1);
+                ShuffleServerMetrics.gaugeLocalStorageIsTimeout
+                    .labels(storageInfo.storage.getBasePath())
+                    .set(0);
 
                 if (storageInfo.checkIsSpaceEnough(total, availableBytes)) {
                   num.incrementAndGet();
@@ -145,10 +148,12 @@ public class LocalStorageChecker extends Checker {
       } catch (Exception e) {
         if (e instanceof ExecutionException) {
           if (e.getCause() instanceof TimeoutException) {
-            storageInfo.markCorrupted();
-            LOG.error(
-                "Timeout of checking local storage: {}. This should not happen 
and mark this disk corrupted.",
+            LOG.warn(
+                "Timeout of checking local storage: {}. The current disk's IO 
load may be very high.",
                 storageInfo.storage.getBasePath());
+            ShuffleServerMetrics.gaugeLocalStorageIsTimeout
+                .labels(storageInfo.storage.getBasePath())
+                .set(1);
             continue;
           }
         }
diff --git a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
index 140f76566..f97978c0f 100644
--- a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
+++ b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
@@ -78,6 +78,7 @@ public class ShuffleServerMetrics {
       "total_require_read_memory_failed_num";
 
   private static final String LOCAL_STORAGE_IS_WRITABLE = 
"local_storage_is_writable";
+  private static final String LOCAL_STORAGE_IS_TIMEOUT = 
"local_storage_is_timeout";
   private static final String LOCAL_STORAGE_TOTAL_DIRS_NUM = 
"local_storage_total_dirs_num";
   private static final String LOCAL_STORAGE_CORRUPTED_DIRS_NUM = 
"local_storage_corrupted_dirs_num";
   private static final String LOCAL_STORAGE_TOTAL_SPACE = 
"local_storage_total_space";
@@ -189,6 +190,7 @@ public class ShuffleServerMetrics {
   public static Gauge.Child gaugeAppWithHugePartitionNum;
 
   public static Gauge gaugeLocalStorageIsWritable;
+  public static Gauge gaugeLocalStorageIsTimeout;
   public static Gauge.Child gaugeLocalStorageTotalDirsNum;
   public static Gauge.Child gaugeLocalStorageCorruptedDirsNum;
   public static Gauge.Child gaugeLocalStorageTotalSpace;
@@ -405,6 +407,8 @@ public class ShuffleServerMetrics {
 
     gaugeLocalStorageIsWritable =
         metricsManager.addGauge(LOCAL_STORAGE_IS_WRITABLE, 
LOCAL_DISK_PATH_LABEL);
+    gaugeLocalStorageIsTimeout =
+        metricsManager.addGauge(LOCAL_STORAGE_IS_TIMEOUT, 
LOCAL_DISK_PATH_LABEL);
     gaugeLocalStorageTotalDirsNum = 
metricsManager.addLabeledGauge(LOCAL_STORAGE_TOTAL_DIRS_NUM);
     gaugeLocalStorageCorruptedDirsNum =
         metricsManager.addLabeledGauge(LOCAL_STORAGE_CORRUPTED_DIRS_NUM);

Reply via email to