This is an automated email from the ASF dual-hosted git repository.

smengcl pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new e03d8623211 HDDS-14859. Use RocksDb secondary instance for validating volumes. (#9947)
e03d8623211 is described below

commit e03d862321171d9fc6a40387e295846fa91b9857
Author: Rishabh Patel <[email protected]>
AuthorDate: Tue Jun 16 16:27:20 2026 -0700

    HDDS-14859. Use RocksDb secondary instance for validating volumes. (#9947)
---
 .../ozone/container/common/volume/HddsVolume.java  | 66 +++++++++++++++++-
 .../container/common/volume/StorageVolume.java     | 11 ++-
 .../hdds/utils/db/managed/ManagedRocksDB.java      | 78 ++++++++++++++++++++++
 3 files changed, 151 insertions(+), 4 deletions(-)

diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java
index f1deedc8d33..8827960248d 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java
@@ -25,6 +25,7 @@
 import jakarta.annotation.Nullable;
 import java.io.File;
 import java.io.IOException;
+import java.nio.file.Files;
 import java.util.Iterator;
 import java.util.List;
 import java.util.concurrent.ConcurrentSkipListSet;
@@ -48,6 +49,7 @@
 import org.apache.hadoop.ozone.container.common.utils.RawDB;
 import org.apache.hadoop.ozone.container.common.utils.StorageVolumeUtil;
 import org.apache.hadoop.ozone.container.ozoneimpl.ContainerController;
+import org.apache.hadoop.ozone.container.ozoneimpl.ScanTransientIOUtil;
 import org.apache.hadoop.ozone.container.upgrade.VersionedDatanodeFeatures;
 import 
org.apache.hadoop.ozone.container.upgrade.VersionedDatanodeFeatures.SchemaV3;
 import org.apache.hadoop.util.Time;
@@ -304,24 +306,82 @@ public synchronized VolumeCheckResult check(@Nullable Boolean unused)
     return checkDbHealth(dbFile);
   }
 
+  /**
+   * Verifies the per-volume RocksDB's global state files (CURRENT, MANIFEST,
+   * OPTIONS) by opening the DB in secondary mode. A successful open implies
+   * those files are readable and internally consistent and that the
+   * referenced SST file names match what RocksDB expects.
+   *
+   * <p>This check intentionally does <b>not</b> read or checksum SST file
+   * contents or any individual key/value. Per-block / per-key integrity is
+   * verified by the container data scanner, which scans containers (and
+   * their RocksDB rows) on its own schedule.
+   *
+   * <p>The volume is only marked {@link VolumeCheckResult#FAILED} once the
+   * configured threshold of failures is exceeded, matching the parent class's
+   * intermittent-error tolerance. Open failures whose underlying RocksDB
+   * status is {@code IOError(NoSpace)} are not counted: {@code openAsSecondary}
+   * writes its info LOG into the disk-check directory, so an out-of-space
+   * failure there is unrelated to DB integrity. Any other status — permission
+   * denied, missing path, corruption, generic IO error — is still counted as
+   * a real failure.
+   */
   @VisibleForTesting
   public VolumeCheckResult checkDbHealth(File dbFile) throws 
InterruptedException {
     if (!(getDiskCheckEnabled() && 
getDatanodeConfig().isRocksDbDiskCheckEnabled())) {
       return VolumeCheckResult.HEALTHY;
     }
 
+    File secondaryDir = new File(getDiskCheckDir(), "rocksdb-secondary-" + 
Time.now());
+    try {
+      Files.createDirectories(secondaryDir.toPath());
+    } catch (IOException e) {
+      LOG.error("Failed to create secondary instance dir {} for volume {}", 
secondaryDir, getStorageDir(), e);
+
+      if (!isNoSpaceAvailable(e) && 
!ScanTransientIOUtil.isTooManyOpenFiles(e)) {
+        getIoTestSlidingWindow().add();
+      }
+
+      return getIoTestSlidingWindow().isExceeded()
+          ? VolumeCheckResult.FAILED
+          : VolumeCheckResult.HEALTHY;
+    }
+
     try (ManagedOptions managedOptions = new ManagedOptions();
-         ManagedRocksDB ignored = ManagedRocksDB.openReadOnly(managedOptions, 
dbFile.toString())) {
+         ManagedRocksDB ignored =
+             ManagedRocksDB.openAsSecondary(managedOptions, dbFile.toString(), 
secondaryDir.getPath())) {
       // Do nothing. Only check if rocksdb is accessible.
       LOG.debug("Successfully opened the database at \"{}\" for HDDS volume 
{}.", dbFile, getStorageDir());
     } catch (Exception e) {
       if (Thread.currentThread().isInterrupted()) {
         throw new InterruptedException("Check of database for volume " + this 
+ " interrupted.");
       }
-      LOG.warn("Could not open Volume DB located at {}", dbFile, e);
-      getIoTestSlidingWindow().add();
+
+      // openAsSecondary writes its info LOG into secondaryDir. If that write
+      // fails because the disk is full, RocksDB surfaces the failure as
+      // IOError(NoSpace) (mapped from ENOSPC). That is unrelated to DB
+      // integrity, so don't count it against the sliding window. Any other
+      // status (permission denied, missing path, corruption, generic IO
+      // error) is still treated as a real failure.
+      if (ManagedRocksDB.isNoSpaceFailure(e)) {
+        LOG.warn("Skipping RocksDB health-check failure accounting for volume 
{}: " +
+            "secondary open returned IOError(NoSpace) for {}.", this, 
secondaryDir, e);
+      } else if (ScanTransientIOUtil.isTooManyOpenFiles(e)) {
+        LOG.warn("Skipping RocksDB health-check failure accounting for volume 
{}: " +
+            "secondary open hit file descriptor exhaustion for {}.", this, 
secondaryDir, e);
+      } else {
+        LOG.error("Could not open Volume DB located at {}", dbFile, e);
+        getIoTestSlidingWindow().add();
+      }
+    } finally {
+      try {
+        FileUtils.deleteDirectory(secondaryDir);
+      } catch (IOException e) {
+        LOG.warn("Failed to delete RocksDB secondary instance dir {}", 
secondaryDir, e);
+      }
     }
 
+
     if (getIoTestSlidingWindow().isExceeded()) {
       LOG.error("Failed to open the database at \"{}\" for HDDS volume {}: " +
               "encountered more than the {} tolerated failures.",
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java
index d9424b76a13..389ef2558a3 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java
@@ -534,7 +534,6 @@ public File getTmpDir() {
     return this.tmpDir;
   }
 
-  @VisibleForTesting
   public File getDiskCheckDir() {
     return this.diskCheckDir;
   }
@@ -851,4 +850,14 @@ private void setStorageDirPermissions() {
           ScmConfigKeys.HDDS_DATANODE_DATA_DIR_PERMISSIONS);
     }
   }
+
+  public static boolean isNoSpaceAvailable(Throwable t) {
+    for (Throwable cause = t; cause != null; cause = cause.getCause()) {
+      String msg = cause.getMessage();
+      if (msg != null && msg.contains("No space left on device")) {
+        return true;
+      }
+    }
+    return false;
+  }
 }
diff --git a/hadoop-hdds/managed-rocksdb/src/main/java/org/apache/hadoop/hdds/utils/db/managed/ManagedRocksDB.java b/hadoop-hdds/managed-rocksdb/src/main/java/org/apache/hadoop/hdds/utils/db/managed/ManagedRocksDB.java
index 3401469f682..3e19e95bca2 100644
--- a/hadoop-hdds/managed-rocksdb/src/main/java/org/apache/hadoop/hdds/utils/db/managed/ManagedRocksDB.java
+++ b/hadoop-hdds/managed-rocksdb/src/main/java/org/apache/hadoop/hdds/utils/db/managed/ManagedRocksDB.java
@@ -31,6 +31,7 @@
 import org.rocksdb.OptionsUtil;
 import org.rocksdb.RocksDB;
 import org.rocksdb.RocksDBException;
+import org.rocksdb.Status;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -75,6 +76,83 @@ public static ManagedRocksDB openReadOnly(
     );
   }
 
+  /**
+   * Opens a RocksDB at {@code dbPath} as a <b>secondary</b> instance.
+   * It is safe to use a secondary instance while a primary writer
+   * is active on the same DB.
+   *
+   * <p>Secondary mode is RocksDB's supported way to attach an extra reader
+   * to a DB that has a live primary writer. If a DB is simultaneously opened
+   * by the primary writer and as a read-only instance,
+   * it has <i>undefined</i> behavior. It often succeeds if the read-only instance
+   * closes quickly, but the contract is unsafe.
+   *
+   * <p><b>Catch-up semantics.</b> A secondary's view does not auto-refresh; it
+   * stays at the snapshot captured at open time. The only way to advance it
+   * is to call {@code tryCatchUpWithPrimary()}, a user-triggered operation
+   * that rebuilds the in-memory memtable from new MANIFEST / WAL entries and
+   * never writes anything to disk.
+   *
+   * <p><b>The secondary log directory.</b> Secondary mode requires its own
+   * directory at {@code secondaryDbLogFilePath} for the RocksDB info
+   * {@code LOG} file. That directory is used <i>only</i> for log files. No
+   * important data lives there. The previous {@code LOG} file is rotated to
+   * {@code LOG.old.<ts>} on each subsequent open, so callers that reopen the
+   * secondary repeatedly should periodically clean these up. Note that the
+   * open will <b>fail</b> if the {@code LOG} cannot be created or written
+   * (directory missing, not writable, or out of space).
+   *
+   * @param options                DB options for the secondary instance.
+   * @param dbPath                 path to the primary DB.
+   * @param secondaryDbLogFilePath directory for the secondary's info log
+   *                               files; must be writable and on a
+   *                               filesystem with at least a small amount
+   *                               of free space.
+   * @return an open secondary {@link ManagedRocksDB}.
+   * @throws RocksDBException if the underlying native open fails for any
+   *                          reason, including an unwritable / full
+   *                          {@code secondaryDbLogFilePath}.
+   */
+  public static ManagedRocksDB openAsSecondary(
+      final ManagedOptions options,
+      final String dbPath,
+      final String secondaryDbLogFilePath)
+      throws RocksDBException {
+    return new ManagedRocksDB(RocksDB.openAsSecondary(options, dbPath, 
secondaryDbLogFilePath));
+  }
+
+  /**
+   * True iff the throwable (or any cause in its chain) is a
+   * {@link RocksDBException} whose status is {@code IOError(NoSpace)}.
+   * RocksDB sets that subcode specifically when the underlying syscall
+   * returns {@code ENOSPC}, so this is a precise signal that the failed
+   * operation hit a full disk — distinct from {@code IOError} causes such
+   * as permission denied, missing path, or DB corruption.
+   *
+   * <p>Callers wanting to consult the {@link Status} on a
+   * {@link RocksDBException} from outside this module would otherwise have
+   * to import {@code org.rocksdb.Status} directly, which is restricted by
+   * the project's {@code banned-rocksdb-imports} enforcer rule. Use this
+   * helper instead.
+   *
+   * @param t the throwable to inspect; the entire cause chain is walked.
+   * @return {@code true} iff a {@code RocksDBException} with status
+   * {@code IOError(NoSpace)} is found.
+   */
+  public static boolean isNoSpaceFailure(Throwable t) {
+    for (Throwable cur = t; cur != null; cur = cur.getCause()) {
+      if (cur instanceof RocksDBException) {
+        Status status = ((RocksDBException) cur).getStatus();
+        if (status != null
+            && status.getCode() == Status.Code.IOError
+            && status.getSubCode() == Status.SubCode.NoSpace) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
   public static ManagedRocksDB open(
       final DBOptions options, final String path,
       final List<ColumnFamilyDescriptor> columnFamilyDescriptors,


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to