This is an automated email from the ASF dual-hosted git repository.
smengcl pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new e03d8623211 HDDS-14859. Use RocksDb secondary instance for validating
volumes. (#9947)
e03d8623211 is described below
commit e03d862321171d9fc6a40387e295846fa91b9857
Author: Rishabh Patel <[email protected]>
AuthorDate: Tue Jun 16 16:27:20 2026 -0700
HDDS-14859. Use RocksDb secondary instance for validating volumes. (#9947)
---
.../ozone/container/common/volume/HddsVolume.java | 66 +++++++++++++++++-
.../container/common/volume/StorageVolume.java | 11 ++-
.../hdds/utils/db/managed/ManagedRocksDB.java | 78 ++++++++++++++++++++++
3 files changed, 151 insertions(+), 4 deletions(-)
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java
index f1deedc8d33..8827960248d 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java
@@ -25,6 +25,7 @@
import jakarta.annotation.Nullable;
import java.io.File;
import java.io.IOException;
+import java.nio.file.Files;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ConcurrentSkipListSet;
@@ -48,6 +49,7 @@
import org.apache.hadoop.ozone.container.common.utils.RawDB;
import org.apache.hadoop.ozone.container.common.utils.StorageVolumeUtil;
import org.apache.hadoop.ozone.container.ozoneimpl.ContainerController;
+import org.apache.hadoop.ozone.container.ozoneimpl.ScanTransientIOUtil;
import org.apache.hadoop.ozone.container.upgrade.VersionedDatanodeFeatures;
import
org.apache.hadoop.ozone.container.upgrade.VersionedDatanodeFeatures.SchemaV3;
import org.apache.hadoop.util.Time;
@@ -304,24 +306,82 @@ public synchronized VolumeCheckResult check(@Nullable
Boolean unused)
return checkDbHealth(dbFile);
}
+ /**
+ * Verifies the per-volume RocksDB's global state files (CURRENT, MANIFEST,
+ * OPTIONS) by opening the DB in secondary mode. A successful open implies
+ * those files are readable and internally consistent and that the
+ * referenced SST file names match what RocksDB expects.
+ *
+ * <p>This check intentionally does <b>not</b> read or checksum SST file
+ * contents or any individual key/value. Per-block / per-key integrity is
+ * verified by the container data scanner, which scans containers (and
+ * their RocksDB rows) on its own schedule.
+ *
+ * <p>The volume is only marked {@link VolumeCheckResult#FAILED} once the
+ * configured threshold of failures is exceeded, matching the parent class's
+ * intermittent-error tolerance. Open failures whose underlying RocksDB
+ * status is {@code IOError(NoSpace)} are not counted: {@code
openAsSecondary}
+ * writes its info LOG into the disk-check directory, so an out-of-space
+ * failure there is unrelated to DB integrity; file-descriptor exhaustion is
+ * likewise not counted. Any other status — permission denied, missing path,
+ * corruption, generic IO error — is still counted as a real failure.
+ */
@VisibleForTesting
public VolumeCheckResult checkDbHealth(File dbFile) throws
InterruptedException {
if (!(getDiskCheckEnabled() &&
getDatanodeConfig().isRocksDbDiskCheckEnabled())) {
return VolumeCheckResult.HEALTHY;
}
+ File secondaryDir = new File(getDiskCheckDir(), "rocksdb-secondary-" +
Time.now());
+ try {
+ Files.createDirectories(secondaryDir.toPath());
+ } catch (IOException e) {
+ LOG.error("Failed to create secondary instance dir {} for volume {}",
secondaryDir, getStorageDir(), e);
+
+ if (!isNoSpaceAvailable(e) &&
!ScanTransientIOUtil.isTooManyOpenFiles(e)) {
+ getIoTestSlidingWindow().add();
+ }
+
+ return getIoTestSlidingWindow().isExceeded()
+ ? VolumeCheckResult.FAILED
+ : VolumeCheckResult.HEALTHY;
+ }
+
try (ManagedOptions managedOptions = new ManagedOptions();
- ManagedRocksDB ignored = ManagedRocksDB.openReadOnly(managedOptions,
dbFile.toString())) {
+ ManagedRocksDB ignored =
+ ManagedRocksDB.openAsSecondary(managedOptions, dbFile.toString(),
secondaryDir.getPath())) {
// Do nothing. Only check if rocksdb is accessible.
LOG.debug("Successfully opened the database at \"{}\" for HDDS volume
{}.", dbFile, getStorageDir());
} catch (Exception e) {
if (Thread.currentThread().isInterrupted()) {
throw new InterruptedException("Check of database for volume " + this
+ " interrupted.");
}
- LOG.warn("Could not open Volume DB located at {}", dbFile, e);
- getIoTestSlidingWindow().add();
+
+ // openAsSecondary writes its info LOG into secondaryDir. If that write
+ // fails because the disk is full, RocksDB surfaces the failure as
+ // IOError(NoSpace) (mapped from ENOSPC). That is unrelated to DB
+ // integrity, so don't count it against the sliding window. Any other
+ // status (permission denied, missing path, corruption, generic IO
+ // error) is still treated as a real failure.
+ if (ManagedRocksDB.isNoSpaceFailure(e)) {
+ LOG.warn("Skipping RocksDB health-check failure accounting for volume
{}: " +
+ "secondary open returned IOError(NoSpace) for {}.", this,
secondaryDir, e);
+ } else if (ScanTransientIOUtil.isTooManyOpenFiles(e)) {
+ LOG.warn("Skipping RocksDB health-check failure accounting for volume
{}: " +
+ "secondary open hit file descriptor exhaustion for {}.", this,
secondaryDir, e);
+ } else {
+ LOG.error("Could not open Volume DB located at {}", dbFile, e);
+ getIoTestSlidingWindow().add();
+ }
+ } finally {
+ try {
+ FileUtils.deleteDirectory(secondaryDir);
+ } catch (IOException e) {
+ LOG.warn("Failed to delete RocksDB secondary instance dir {}",
secondaryDir, e);
+ }
}
+
if (getIoTestSlidingWindow().isExceeded()) {
LOG.error("Failed to open the database at \"{}\" for HDDS volume {}: " +
"encountered more than the {} tolerated failures.",
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java
index d9424b76a13..389ef2558a3 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java
@@ -534,7 +534,6 @@ public File getTmpDir() {
return this.tmpDir;
}
- @VisibleForTesting
public File getDiskCheckDir() {
return this.diskCheckDir;
}
@@ -851,4 +850,14 @@ private void setStorageDirPermissions() {
ScmConfigKeys.HDDS_DATANODE_DATA_DIR_PERMISSIONS);
}
}
+
+ public static boolean isNoSpaceAvailable(Throwable t) {
+ for (Throwable cause = t; cause != null; cause = cause.getCause()) {
+ String msg = cause.getMessage();
+ if (msg != null && msg.contains("No space left on device")) {
+ return true;
+ }
+ }
+ return false;
+ }
}
diff --git
a/hadoop-hdds/managed-rocksdb/src/main/java/org/apache/hadoop/hdds/utils/db/managed/ManagedRocksDB.java
b/hadoop-hdds/managed-rocksdb/src/main/java/org/apache/hadoop/hdds/utils/db/managed/ManagedRocksDB.java
index 3401469f682..3e19e95bca2 100644
---
a/hadoop-hdds/managed-rocksdb/src/main/java/org/apache/hadoop/hdds/utils/db/managed/ManagedRocksDB.java
+++
b/hadoop-hdds/managed-rocksdb/src/main/java/org/apache/hadoop/hdds/utils/db/managed/ManagedRocksDB.java
@@ -31,6 +31,7 @@
import org.rocksdb.OptionsUtil;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;
+import org.rocksdb.Status;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -75,6 +76,83 @@ public static ManagedRocksDB openReadOnly(
);
}
+ /**
+ * Opens a RocksDB at {@code dbPath} as a <b>secondary</b> instance.
+ * It is safe to use a secondary instance while a primary writer
+ * is active on the same DB.
+ *
+ * <p>Secondary mode is RocksDB's supported way to attach an extra reader
+ * to a DB that has a live primary writer. If a DB is simultaneously opened
+ * by the primary writer and as a read-only instance,
+ * the behavior is <i>undefined</i>. It often succeeds if the read-only
instance
+ * closes quickly, but the contract is unsafe.
+ *
+ * <p><b>Catch-up semantics.</b> A secondary's view does not auto-refresh; it
+ * stays at the snapshot captured at open time. The only way to advance it
+ * is to call {@code tryCatchUpWithPrimary()}, a user-triggered operation
+ * that rebuilds the in-memory memtable from new MANIFEST / WAL entries and
+ * never writes anything to disk.
+ *
+ * <p><b>The secondary log directory.</b> Secondary mode requires its own
+ * directory at {@code secondaryDbLogFilePath} for the RocksDB info
+ * {@code LOG} file. That directory is used <i>only</i> for log files. No
+ * important data lives there. The previous {@code LOG} file is rotated to
+ * {@code LOG.old.<ts>} on each subsequent open, so callers that reopen the
+ * secondary repeatedly should periodically clean these up. Note that the
+ * open will <b>fail</b> if the {@code LOG} cannot be created or written
+ * (directory missing, not writable, or out of space).
+ *
+ * @param options DB options for the secondary instance.
+ * @param dbPath path to the primary DB.
+ * @param secondaryDbLogFilePath directory for the secondary's info log
+ * files; must be writable and on a
+ * filesystem with at least a small amount
+ * of free space.
+ * @return an open secondary {@link ManagedRocksDB}.
+ * @throws RocksDBException if the underlying native open fails for any
+ * reason, including an unwritable / full
+ * {@code secondaryDbLogFilePath}.
+ */
+ public static ManagedRocksDB openAsSecondary(
+ final ManagedOptions options,
+ final String dbPath,
+ final String secondaryDbLogFilePath)
+ throws RocksDBException {
+ return new ManagedRocksDB(RocksDB.openAsSecondary(options, dbPath,
secondaryDbLogFilePath));
+ }
+
+ /**
+ * True iff the throwable (or any cause in its chain) is a
+ * {@link RocksDBException} whose status is {@code IOError(NoSpace)}.
+ * RocksDB sets that subcode specifically when the underlying syscall
+ * returns {@code ENOSPC}, so this is a precise signal that the failed
+ * operation hit a full disk — distinct from {@code IOError} causes such
+ * as permission denied, missing path, or DB corruption.
+ *
+ * <p>Callers wanting to consult the {@link Status} on a
+ * {@link RocksDBException} from outside this module would otherwise have
+ * to import {@code org.rocksdb.Status} directly, which is restricted by
+ * the project's {@code banned-rocksdb-imports} enforcer rule. Use this
+ * helper instead.
+ *
+ * @param t the throwable to inspect; the entire cause chain is walked.
+ * @return {@code true} iff a {@code RocksDBException} with status
+ * {@code IOError(NoSpace)} is found.
+ */
+ public static boolean isNoSpaceFailure(Throwable t) {
+ for (Throwable cur = t; cur != null; cur = cur.getCause()) {
+ if (cur instanceof RocksDBException) {
+ Status status = ((RocksDBException) cur).getStatus();
+ if (status != null
+ && status.getCode() == Status.Code.IOError
+ && status.getSubCode() == Status.SubCode.NoSpace) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
public static ManagedRocksDB open(
final DBOptions options, final String path,
final List<ColumnFamilyDescriptor> columnFamilyDescriptors,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]