This is an automated email from the ASF dual-hosted git repository.

ChenSammi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new fff83a12616 HDDS-15150. Container scanner should not mark container as UNHEALTHY when FD exhausted (#10214)
fff83a12616 is described below

commit fff83a1261664cc984456427296d129ee30da281
Author: Sarveksha Yeshavantha Raju <[email protected]>
AuthorDate: Tue May 26 12:00:15 2026 +0530

    HDDS-15150. Container scanner should not mark container as UNHEALTHY when FD exhausted (#10214)
---
 .../container/ozoneimpl/ContainerScanHelper.java   | 62 ++++++++++++----
 .../container/ozoneimpl/ScanTransientIOUtil.java   | 85 ++++++++++++++++++++++
 .../TestBackgroundContainerDataScanner.java        | 27 +++++++
 .../TestBackgroundContainerMetadataScanner.java    | 24 ++++++
 .../ozoneimpl/TestScanTransientIOUtil.java         | 82 +++++++++++++++++++++
 5 files changed, 265 insertions(+), 15 deletions(-)

diff --git 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerScanHelper.java
 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerScanHelper.java
index 4c4a45c55d4..65a7a1371d3 100644
--- 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerScanHelper.java
+++ 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerScanHelper.java
@@ -66,27 +66,35 @@ public void scanData(Container<?> container, 
DataTransferThrottler throttler, Ca
     long containerId = containerData.getContainerID();
     logScanStart(containerData, "data");
     DataScanResult result = container.scanData(throttler, canceler);
-
+    Instant now = Instant.now();
+    
     if (result.isDeleted()) {
       log.debug("Container [{}] has been deleted during the data scan.", 
containerId);
-    } else {
+      logScanCompleted(containerData, now);
+      return;
+    }
+
+    boolean isTransientFailure = 
ScanTransientIOUtil.scanErrorsAreOnlyTooManyOpenFiles(result);
+
+    if (!isTransientFailure) {
       try {
         controller.updateContainerChecksum(containerId, result.getDataTree());
       } catch (IOException ex) {
         log.warn("Failed to update container checksum after scan of container 
{}", containerId, ex);
       }
-      if (result.hasErrors()) {
-        handleUnhealthyScanResult(containerData, result);
-      }
-      metrics.incNumContainersScanned();
     }
 
-    Instant now = Instant.now();
-    if (!result.isDeleted()) {
+    if (result.hasErrors()) {
+      handleUnhealthyScanResult(containerData, result, isTransientFailure);
+    }
+
+    if (!isTransientFailure) {
+      metrics.incNumContainersScanned();
       controller.updateDataScanTimestamp(containerId, now);
+      logScanCompleted(containerData, now);
+    } else {
+      logScanIncomplete(containerData, now, "data");
     }
-    // Even if the container was deleted, mark the scan as completed since we 
already logged it as starting.
-    logScanCompleted(containerData, now);
   }
 
   public void scanMetadata(Container<?> container)
@@ -103,20 +111,37 @@ public void scanMetadata(Container<?> container)
       log.debug("Container [{}] has been deleted during metadata scan.", 
containerId);
       return;
     }
+
+    boolean isTransientFailure = 
ScanTransientIOUtil.scanErrorsAreOnlyTooManyOpenFiles(result);
+
     if (result.hasErrors()) {
-      handleUnhealthyScanResult(containerData, result);
+      handleUnhealthyScanResult(containerData, result, isTransientFailure);
     }
 
     Instant now = Instant.now();
     // Do not update the scan timestamp after the scan since this was just a
     // metadata scan, not a full data scan.
-    metrics.incNumContainersScanned();
-    // Even if the container was deleted, mark the scan as completed since we 
already logged it as starting.
-    logScanCompleted(containerData, now);
+    if (!isTransientFailure) {
+      metrics.incNumContainersScanned();
+      // Even if the container was deleted, mark the scan as completed since 
we already logged it as starting.
+      logScanCompleted(containerData, now);
+    } else {
+      logScanIncomplete(containerData, now, "metadata");
+    }
   }
 
-  public void handleUnhealthyScanResult(ContainerData containerData, 
ScanResult result) throws IOException {
+  /**
+   * Marks container UNHEALTHY when the scan reports real errors.
+   * If every scan error is related to file-descriptor exhaustion, return 
without marking container unhealthy.
+   */
+  public void handleUnhealthyScanResult(ContainerData containerData, 
ScanResult result,
+      boolean isTransientFailure) throws IOException {
     long containerID = containerData.getContainerID();
+    if (isTransientFailure) {
+      log.warn("Skipped marking container UNHEALTHY [{}]: scan failed due to 
transient " +
+          "file descriptor exhaustion ('Too many open files'). {}", 
containerID, result);
+      return;
+    }
     log.error("Corruption detected in container [{}]. Marking it UNHEALTHY. 
{}", containerID, result);
     if (log.isDebugEnabled()) {
       StringBuilder allErrorString = new StringBuilder();
@@ -205,4 +230,11 @@ private void logScanCompleted(
     log.debug("Completed scan of container {} at {}",
         containerData.getContainerID(), timestamp);
   }
+
+  private void logScanIncomplete(ContainerData containerData, Instant 
timestamp, String scanType) {
+    if (log.isDebugEnabled()) {
+      log.debug("Incomplete {} scan of container {} at {} due to transient 
file descriptor exhaustion",
+          scanType, containerData.getContainerID(), timestamp);
+    }
+  }
 }
diff --git 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ScanTransientIOUtil.java
 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ScanTransientIOUtil.java
new file mode 100644
index 00000000000..1be9acf3816
--- /dev/null
+++ 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ScanTransientIOUtil.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.container.ozoneimpl;
+
+import java.nio.file.FileSystemException;
+import java.util.Collections;
+import java.util.IdentityHashMap;
+import java.util.Locale;
+import java.util.Set;
+import org.apache.hadoop.ozone.container.common.interfaces.ScanResult;
+
+/**
+ * Utility to catch transient scan failures (typically related to 
file-descriptor exhaustion)
+ * that should not be treated as container data corruption.
+ */
+public final class ScanTransientIOUtil {
+
+  private static final int MAX_CAUSE_CHAIN_DEPTH = 64;
+
+  private static final String TOO_MANY_OPEN_FILES = "too many open files";
+
+  private ScanTransientIOUtil() {
+  }
+
+  /**
+   * Returns true when every scan error is related to file-descriptor 
exhaustion.
+   * Each error's exception chain is checked via {@link 
#isTooManyOpenFiles(Throwable)}.
+   */
+  public static boolean scanErrorsAreOnlyTooManyOpenFiles(ScanResult 
scanResult) {
+    if (!scanResult.hasErrors()) {
+      return false;
+    }
+    return scanResult.getErrors().stream()
+        .allMatch(scanError -> isTooManyOpenFiles(scanError.getException()));
+  }
+
+  public static boolean isTooManyOpenFiles(Throwable throwable) {
+    if (throwable == null) {
+      return false;
+    }
+    Set<Throwable> visited = Collections.newSetFromMap(new 
IdentityHashMap<>());
+    int depth = 0;
+    for (Throwable cause = throwable;
+        cause != null && depth < MAX_CAUSE_CHAIN_DEPTH;
+        cause = cause.getCause(), depth++) {
+      if (!visited.add(cause)) {
+        break;
+      }
+      if (matchesTooManyOpenFiles(cause)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  private static boolean matchesTooManyOpenFiles(Throwable throwable) {
+    if (throwable instanceof FileSystemException) {
+      String reason = ((FileSystemException) throwable).getReason();
+      if (reason != null && containsTooManyOpenFiles(reason)) {
+        return true;
+      }
+    }
+    String message = throwable.getMessage();
+    return message != null && containsTooManyOpenFiles(message);
+  }
+
+  private static boolean containsTooManyOpenFiles(String text) {
+    return text.toLowerCase(Locale.ROOT).contains(TOO_MANY_OPEN_FILES);
+  }
+}
diff --git 
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java
 
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java
index 53598242254..4180a7fb34d 100644
--- 
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java
+++ 
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java
@@ -44,6 +44,7 @@
 import java.io.IOException;
 import java.time.Duration;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.Optional;
 import java.util.concurrent.Callable;
 import java.util.concurrent.CountDownLatch;
@@ -56,12 +57,14 @@
 import org.apache.hadoop.hdfs.util.Canceler;
 import org.apache.hadoop.hdfs.util.DataTransferThrottler;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.ozone.container.checksum.ContainerMerkleTreeWriter;
 import org.apache.hadoop.ozone.container.common.impl.ContainerData;
 import org.apache.hadoop.ozone.container.common.interfaces.Container;
 import org.apache.hadoop.ozone.container.common.interfaces.ScanResult;
 import org.apache.hadoop.ozone.container.common.utils.StorageVolumeUtil;
 import 
org.apache.hadoop.ozone.container.metadata.DatanodeSchemaThreeDBDefinition;
 import org.apache.hadoop.ozone.container.metadata.DatanodeStoreSchemaThreeImpl;
+import 
org.apache.hadoop.ozone.container.ozoneimpl.ContainerScanError.FailureType;
 import org.apache.ozone.test.GenericTestUtils;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -402,4 +405,28 @@ public void testMerkleTreeWritten() throws Exception {
           
.updateContainerChecksum(eq(container.getContainerData().getContainerID()), 
any());
     }
   }
+
+  /**
+   * When data scan reports only "too many open files" errors due to 
file-descriptor exhaustion,
+   * the container must not be marked UNHEALTHY.
+   */
+  @Test
+  public void testDataScanOnlyTooManyOpenFilesDoesNotMarkUnhealthy() throws 
Exception {
+    Container<?> container = mockKeyValueContainer();
+    IOException ex = new IOException("Too many open files");
+    DataScanResult scanResult = 
DataScanResult.fromErrors(Collections.singletonList(
+                    new ContainerScanError(FailureType.CORRUPT_CHUNK, new 
File("."), ex)),
+            new ContainerMerkleTreeWriter());
+    when(container.scanData(any(DataTransferThrottler.class), 
any(Canceler.class))).thenReturn(scanResult);
+
+    setContainers(container);
+    scanner.runIteration();
+
+    verify(controller, never()).markContainerUnhealthy(anyLong(), 
any(ScanResult.class));
+    verify(controller, 
never()).updateContainerChecksum(eq(container.getContainerData().getContainerID()),
 any());
+    verify(controller, 
never()).updateDataScanTimestamp(eq(container.getContainerData().getContainerID()),
 any());
+    assertEquals(1, scanner.getMetrics().getNumScanIterations());
+    assertEquals(0, scanner.getMetrics().getNumContainersScanned());
+    assertEquals(0, scanner.getMetrics().getNumUnHealthyContainers());
+  }
 }
diff --git 
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java
 
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java
index 9b6c6aed3f0..f71b9b26bd7 100644
--- 
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java
+++ 
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java
@@ -38,8 +38,10 @@
 import static org.mockito.Mockito.verify;
 import static org.mockito.Mockito.when;
 
+import java.io.File;
 import java.io.IOException;
 import java.time.Duration;
+import java.util.Collections;
 import java.util.Optional;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.TimeUnit;
@@ -49,6 +51,7 @@
 import org.apache.hadoop.ozone.container.common.interfaces.Container;
 import org.apache.hadoop.ozone.container.common.interfaces.ScanResult;
 import org.apache.hadoop.ozone.container.common.utils.StorageVolumeUtil;
+import 
org.apache.hadoop.ozone.container.ozoneimpl.ContainerScanError.FailureType;
 import org.apache.ozone.test.GenericTestUtils;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -256,4 +259,25 @@ public void testShutdownDuringScan() throws Exception {
     // The container should remain healthy.
     verifyContainerMarkedUnhealthy(healthy, never());
   }
+
+  /**
+   * When metadata scan reports only "too many open files" errors due to 
file-descriptor exhaustion, 
+   * the container must not be marked UNHEALTHY.
+   */
+  @Test
+  public void testMetadataScanOnlyTooManyOpenFilesDoesNotMarkUnhealthy() 
throws Exception {
+    Container<?> container = mockKeyValueContainer();
+    IOException emf = new IOException("Too many open files");
+    MetadataScanResult scanResult = 
MetadataScanResult.fromErrors(Collections.singletonList(
+            new ContainerScanError(FailureType.CORRUPT_CONTAINER_FILE, new 
File("."), emf)));
+    when(container.scanMetaData()).thenReturn(scanResult);
+
+    setContainers(container);
+    scanner.runIteration();
+
+    verify(controller, never()).markContainerUnhealthy(anyLong(), 
any(ScanResult.class));
+    assertEquals(1, scanner.getMetrics().getNumScanIterations());
+    assertEquals(0, scanner.getMetrics().getNumContainersScanned());
+    assertEquals(0, scanner.getMetrics().getNumUnHealthyContainers());
+  }
 }
diff --git 
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestScanTransientIOUtil.java
 
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestScanTransientIOUtil.java
new file mode 100644
index 00000000000..a92603eb21a
--- /dev/null
+++ 
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestScanTransientIOUtil.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.container.ozoneimpl;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.file.FileSystemException;
+import java.util.Arrays;
+import java.util.Collections;
+import 
org.apache.hadoop.ozone.container.ozoneimpl.ContainerScanError.FailureType;
+import org.junit.jupiter.api.Test;
+
+/** 
+ * Unit tests for {@link ScanTransientIOUtil}.
+ */
+public class TestScanTransientIOUtil {
+
+  @Test
+  public void detectsTooManyOpenFilesInFileSystemException() {
+    assertTrue(ScanTransientIOUtil.isTooManyOpenFiles(new 
FileSystemException(null, null, "Too many open files")));
+  }
+
+  @Test
+  public void detectsTooManyOpenFilesInFileNotFoundExceptionMessage() {
+    String msg = "/data/container/metadata/16341719.container (Too many open 
files)";
+    assertTrue(ScanTransientIOUtil.isTooManyOpenFiles(new 
FileNotFoundException(msg)));
+  }
+
+  @Test
+  public void detectsTooManyOpenFilesInMessageCauseChain() {
+    IOException throwable = new IOException("Too many open files");
+    assertTrue(ScanTransientIOUtil.isTooManyOpenFiles(new 
IOException(throwable)));
+  }
+
+  @Test
+  public void rejectsUnrelatedIOException() {
+    assertFalse(ScanTransientIOUtil.isTooManyOpenFiles(new IOException("disk 
full")));
+  }
+
+  @Test
+  public void scanErrorsOnlyTooManyOpenFilesReturnsTrue() {
+    IOException ex = new IOException("Too many open files");
+    MetadataScanResult scanResult = 
MetadataScanResult.fromErrors(Collections.singletonList(
+        new ContainerScanError(FailureType.CORRUPT_CONTAINER_FILE, new 
File("."), ex)));
+    
assertTrue(ScanTransientIOUtil.scanErrorsAreOnlyTooManyOpenFiles(scanResult));
+  }
+
+  @Test
+  public void scanErrorsMixedReturnsFalse() {
+    IOException ioException = new IOException("Too many open files");
+    FileNotFoundException fileNotFoundException = new 
FileNotFoundException("missing");
+    MetadataScanResult scanResult = 
MetadataScanResult.fromErrors(Arrays.asList(
+        new ContainerScanError(FailureType.CORRUPT_CHUNK, new File("."), 
ioException),
+        new ContainerScanError(FailureType.MISSING_CONTAINER_FILE, new 
File("."), fileNotFoundException)));
+    
assertFalse(ScanTransientIOUtil.scanErrorsAreOnlyTooManyOpenFiles(scanResult));
+  }
+
+  @Test
+  public void emptyScanResult() {
+    assertFalse(ScanTransientIOUtil.scanErrorsAreOnlyTooManyOpenFiles(
+        MetadataScanResult.fromErrors(Collections.emptyList())));
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to