This is an automated email from the ASF dual-hosted git repository.

erose pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 4ffae705c7 HDDS-12239. Volume should not be marked as unhealthy when disk full (#7830)
4ffae705c7 is described below

commit 4ffae705c72124ab11d1941f529c4a8bf30f2012
Author: Ashish Kumar <[email protected]>
AuthorDate: Wed Mar 26 19:38:18 2025 +0530

    HDDS-12239. Volume should not be marked as unhealthy when disk full (#7830)
    
    Co-authored-by: ashishk <[email protected]>
---
 .../container/common/volume/StorageVolume.java     | 17 ++++++++
 .../volume/TestStorageVolumeHealthChecks.java      | 47 ++++++++++++++++++++++
 2 files changed, 64 insertions(+)

diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java
index 318fa0ab9d..639317af88 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java
@@ -623,6 +623,15 @@ public synchronized VolumeCheckResult check(@Nullable Boolean unused)
       return VolumeCheckResult.HEALTHY;
     }
 
+    // At least some space required to check disk read/write
+    // If there are not enough space remaining,
+    // to avoid volume failure we can ignore checking disk read/write
+    int minimumDiskSpace = healthCheckFileSize * 2;
+    if (volumeInfo.get().getCurrentUsage().getAvailable() < minimumDiskSpace) {
+      ioTestSlidingWindow.add(true);
+      return VolumeCheckResult.HEALTHY;
+    }
+
     // Since IO errors may be intermittent, volume remains healthy until the
     // threshold of failures is crossed.
     boolean diskChecksPassed = DiskCheckUtil.checkReadWrite(storageDir,
@@ -634,6 +643,14 @@ public synchronized VolumeCheckResult check(@Nullable Boolean unused)
           " interrupted.");
     }
 
+    // As WRITE keeps happening there is probability, disk has become full during above check.
+    // We can check again if disk is full. If it is full,
+    // in this case keep volume as healthy so that READ can still be served
+    if (!diskChecksPassed && volumeInfo.get().getCurrentUsage().getAvailable() < minimumDiskSpace) {
+      ioTestSlidingWindow.add(true);
+      return VolumeCheckResult.HEALTHY;
+    }
+
     // Move the sliding window of IO test results forward 1 by adding the
     // latest entry and removing the oldest entry from the window.
     // Update the failure counter for the new window.
diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeHealthChecks.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeHealthChecks.java
index eddf80ef42..9e16e4f9b7 100644
--- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeHealthChecks.java
+++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeHealthChecks.java
@@ -110,6 +110,53 @@ public boolean checkExistence(File storageDir) {
     assertEquals(VolumeCheckResult.FAILED, result);
   }
 
+  @ParameterizedTest
+  @MethodSource("volumeBuilders")
+  public void testVolumeFullHealth(StorageVolume.Builder<?> builder) throws Exception {
+    verifyFullVolumeHealthWithDiskReadWriteStatus(builder, true, false);
+  }
+
+
+  public void verifyFullVolumeHealthWithDiskReadWriteStatus(StorageVolume.Builder<?> builder, boolean... checkResult)
+      throws Exception {
+
+    for (boolean result : checkResult) {
+      StorageVolume volume = builder.build();
+
+      VolumeUsage usage = volume.getVolumeInfo().get().getUsageForTesting();
+      DatanodeConfiguration dnConf = CONF.getObject(DatanodeConfiguration.class);
+      int minimumDiskSpace = dnConf.getVolumeHealthCheckFileSize() * 2;
+      // Keep remaining space as just less than double of VolumeHealthCheckFileSize.
+      usage.incrementUsedSpace(usage.getCurrentUsage().getAvailable() - minimumDiskSpace + 1);
+      usage.realUsage();
+      DiskCheckUtil.DiskChecks ioFailure = new DiskCheckUtil.DiskChecks() {
+        @Override
+        public boolean checkReadWrite(File storageDir, File testFileDir,
+                                      int numBytesToWrite) {
+          return result;
+        }
+      };
+      DiskCheckUtil.setTestImpl(ioFailure);
+      // Volume will remain healthy as volume don't have enough space to check READ/WRITE
+      assertEquals(VolumeCheckResult.HEALTHY, volume.check(false));
+      // Even in second try volume will remain HEALTHY
+      assertEquals(VolumeCheckResult.HEALTHY, volume.check(false));
+
+      // Now keep enough space for read/write check to go through
+      usage.decrementUsedSpace(minimumDiskSpace + 1);
+
+      // volumeIOFailureTolerance is 1, so first time it will be HEALTHY always
+      assertEquals(VolumeCheckResult.HEALTHY, volume.check(false));
+      if (result) {
+        // Volume will remain as healthy as READ/WRITE check is fine
+        assertEquals(VolumeCheckResult.HEALTHY, volume.check(false));
+      } else {
+        // Second time volume will fail as READ/WRITE check has failed
+        assertEquals(VolumeCheckResult.FAILED, volume.check(false));
+      }
+    }
+  }
+
   @ParameterizedTest
   @MethodSource("volumeBuilders")
   public void testCheckPermissions(StorageVolume.Builder<?> builder)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to