This is an automated email from the ASF dual-hosted git repository.

smengcl pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new d01b3aefc77 HDDS-15314. Disable defrag DB metrics due to crash during snapshot defrag (#10301)
d01b3aefc77 is described below

commit d01b3aefc77e6f7216041b28bbb76827ffef6644
Author: Siyao Meng <[email protected]>
AuthorDate: Thu May 21 02:52:52 2026 -0700

    HDDS-15314. Disable defrag DB metrics due to crash during snapshot defrag (#10301)
---
 .../hadoop/ozone/om/OmMetadataManagerImpl.java     | 29 +++++++++++++-
 .../om/snapshot/defrag/SnapshotDefragService.java  | 16 ++++++--
 .../snapshot/defrag/TestSnapshotDefragService.java | 44 ++++++++++++++++++++++
 3 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java
index 1797acefa28..b76e5aa5262 100644
--- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java
+++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java
@@ -242,6 +242,12 @@ public static OmMetadataManagerImpl createCheckpointMetadataManager(
 
   public static OmMetadataManagerImpl createCheckpointMetadataManager(
       OzoneConfiguration conf, DBCheckpoint checkpoint, boolean readOnly) throws IOException {
+    return createCheckpointMetadataManager(conf, checkpoint, readOnly, true);
+  }
+
+  public static OmMetadataManagerImpl createCheckpointMetadataManager(
+      OzoneConfiguration conf, DBCheckpoint checkpoint, boolean readOnly,
+      boolean enableRocksDbMetrics) throws IOException {
     Path path = checkpoint.getCheckpointLocation();
     Path parent = path.getParent();
     if (parent == null) {
@@ -254,7 +260,8 @@ public static OmMetadataManagerImpl createCheckpointMetadataManager(
       throw new IllegalStateException("DB checkpoint dir name should not "
           + "have been null. Checkpoint path is " + path);
     }
-    return new OmMetadataManagerImpl(conf, dir, name.toString(), readOnly);
+    return new OmMetadataManagerImpl(
+        conf, dir, name.toString(), readOnly, enableRocksDbMetrics);
   }
 
   protected OmMetadataManagerImpl(OzoneConfiguration conf, File dir, String name) throws IOException {
@@ -271,6 +278,24 @@ protected OmMetadataManagerImpl(OzoneConfiguration conf, File dir, String name)
    */
   public OmMetadataManagerImpl(OzoneConfiguration conf, File dir, String name, boolean readOnly)
       throws IOException {
+    this(conf, dir, name, readOnly, true);
+  }
+
+  /**
+   * Metadata constructor for checkpoints.
+   *
+   * @param conf - Ozone conf.
+   * @param dir - Checkpoint parent directory.
+   * @param name - Checkpoint directory name.
+   * @param readOnly - Whether to open the checkpoint DB read-only.
+   * @param enableRocksDbMetrics - Whether to register generic RocksDB metrics.
+   *     Pass false for transient checkpoint DBs whose column families may be
+   *     dropped or recreated while the DB is open.
+   * @throws IOException
+   */
+  protected OmMetadataManagerImpl(OzoneConfiguration conf, File dir,
+      String name, boolean readOnly, boolean enableRocksDbMetrics)
+      throws IOException {
     lock = new OmReadOnlyLock();
     hierarchicalLockManager = new ReadOnlyHierarchicalResourceLockManager();
     omEpoch = 0;
@@ -282,7 +307,7 @@ public OmMetadataManagerImpl(OzoneConfiguration conf, File dir, String name, boo
         .setMaxNumberOfOpenFiles(maxOpenFiles)
         .setEnableCompactionDag(false, null)
         .setCreateCheckpointDirs(false)
-        .setEnableRocksDbMetrics(true)
+        .setEnableRocksDbMetrics(enableRocksDbMetrics)
         .build();
     initializeOmTables(CacheType.PARTIAL_CACHE, false);
     perfMetrics = null;
diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/defrag/SnapshotDefragService.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/defrag/SnapshotDefragService.java
index cd3f845dcbe..4f018ed4641 100644
--- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/defrag/SnapshotDefragService.java
+++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/defrag/SnapshotDefragService.java
@@ -527,7 +527,7 @@ int atomicSwitchSnapshotDB(UUID snapshotId, Path checkpointPath) throws IOExcept
       RocksDBCheckpoint dbCheckpoint = new RocksDBCheckpoint(nextVersionPath);
       // Add a new version to the local data file.
       try (OmMetadataManagerImpl newVersionCheckpointMetadataManager =
-               OmMetadataManagerImpl.createCheckpointMetadataManager(conf, dbCheckpoint, true)) {
+               createDefragCheckpointMetadataManager(dbCheckpoint, true)) {
         RDBStore newVersionCheckpointStore = (RDBStore) newVersionCheckpointMetadataManager.getStore();
         snapshotLocalDataProvider.addSnapshotVersion(newVersionCheckpointStore);
         snapshotLocalDataProvider.commit();
@@ -549,6 +549,16 @@ public BackgroundTaskResult call() throws Exception {
     }
   }
 
+  @VisibleForTesting
+  OmMetadataManagerImpl createDefragCheckpointMetadataManager(
+      DBCheckpoint checkpoint, boolean readOnly) throws IOException {
+    // Defrag checkpoint DBs are transient and drop/recreate column families.
+    // Generic RocksDB metrics are not useful for them and can race with CF handle
+    // lifetime changes while the checkpoint is being rewritten.
+    return OmMetadataManagerImpl.createCheckpointMetadataManager(
+        conf, checkpoint, readOnly, false);
+  }
+
   /**
    * Creates a new checkpoint by modifying the metadata manager from a snapshot.
    * This involves generating a temporary checkpoint and truncating specified
@@ -570,7 +580,7 @@ OmMetadataManagerImpl createCheckpoint(SnapshotInfo snapshotInfo,
         snapshotInfo.getVolumeName(), snapshotInfo.getBucketName(), snapshotInfo.getName())) {
       DBCheckpoint checkpoint = snapshot.get().getMetadataManager().getStore().getCheckpoint(tmpDefragDir, true);
       try (OmMetadataManagerImpl metadataManagerBeforeTruncate =
-               OmMetadataManagerImpl.createCheckpointMetadataManager(conf, checkpoint, false)) {
+               createDefragCheckpointMetadataManager(checkpoint, false)) {
         DBStore dbStore = metadataManagerBeforeTruncate.getStore();
         for (String table : metadataManagerBeforeTruncate.listTableNames()) {
           if (!incrementalColumnFamilies.contains(table)) {
@@ -581,7 +591,7 @@ OmMetadataManagerImpl createCheckpoint(SnapshotInfo snapshotInfo,
         throw new IOException("Failed to close checkpoint of snapshot: " + snapshotInfo.getSnapshotId(), e);
       }
       // This will recreate the column families in the checkpoint.
-      return OmMetadataManagerImpl.createCheckpointMetadataManager(conf, checkpoint, false);
+      return createDefragCheckpointMetadataManager(checkpoint, false);
     }
   }
 
diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/snapshot/defrag/TestSnapshotDefragService.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/snapshot/defrag/TestSnapshotDefragService.java
index 224de13840a..f93d95d6ac9 100644
--- a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/snapshot/defrag/TestSnapshotDefragService.java
+++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/snapshot/defrag/TestSnapshotDefragService.java
@@ -76,6 +76,7 @@
 import org.apache.commons.lang3.tuple.Pair;
 import org.apache.hadoop.hdds.StringUtils;
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.utils.RocksDBStoreMetrics;
 import org.apache.hadoop.hdds.utils.db.CodecBuffer;
 import org.apache.hadoop.hdds.utils.db.CodecBufferCodec;
 import org.apache.hadoop.hdds.utils.db.CodecException;
@@ -91,6 +92,7 @@
 import org.apache.hadoop.hdds.utils.db.StringInMemoryTestTable;
 import org.apache.hadoop.hdds.utils.db.Table;
 import org.apache.hadoop.hdds.utils.db.TablePrefixInfo;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
 import org.apache.hadoop.ozone.om.OMMetadataManager;
 import org.apache.hadoop.ozone.om.OMPerformanceMetrics;
 import org.apache.hadoop.ozone.om.OmMetadataManagerImpl;
@@ -458,6 +460,20 @@ private static Stream<Arguments> testCreateCheckpointCases() {
     );
   }
 
+  private static String rocksDBMetricsSourceName(Path dbLocation) {
+    return RocksDBStoreMetrics.ROCKSDB_CONTEXT_PREFIX + dbLocation.getFileName();
+  }
+
+  private static void assertNoRocksDBMetrics(Path dbLocation) {
+    assertNull(DefaultMetricsSystem.instance().getSource(
+        rocksDBMetricsSourceName(dbLocation)));
+  }
+
+  private static void assertRocksDBMetricsRegistered(Path dbLocation) {
+    assertNotNull(DefaultMetricsSystem.instance().getSource(
+        rocksDBMetricsSourceName(dbLocation)));
+  }
+
   private Map<String, Map<String, String>> createTableContents(Path path, String keyPrefix) throws IOException {
     DBCheckpoint snapshotCheckpointLocation = new RocksDBCheckpoint(path);
     Map<String, Map<String, String>> tableContents = new HashMap<>();
@@ -525,7 +541,35 @@ public void close() {
           .filter(e -> !incrementalTables.contains(e.getKey()))
           .forEach(e -> e.getValue().clear());
       assertContents(tableContents, result.getStore());
+      assertNoRocksDBMetrics(result.getStore().getDbLocation().toPath());
+    }
+  }
+
+  @Test
+  public void testDefragCheckpointMetadataManagerSkipsRocksDBMetrics() throws Exception {
+    Path checkpointPath = tempDir.resolve("defrag-metrics-" + UUID.randomUUID());
+    createTableContents(checkpointPath, "_metrics_");
+
+    assertNoRocksDBMetrics(checkpointPath);
+    // The generic checkpoint path should keep the existing behavior and
+    // register RocksDB metrics.
+    try (OmMetadataManagerImpl defaultCheckpointMetadataManager =
+             OmMetadataManagerImpl.createCheckpointMetadataManager(
+                 configuration, new RocksDBCheckpoint(checkpointPath), false)) {
+      assertRocksDBMetricsRegistered(
+          defaultCheckpointMetadataManager.getStore().getDbLocation().toPath());
+    }
+    assertNoRocksDBMetrics(checkpointPath);
+
+    // Defrag checkpoint DBs are transient and must not register generic
+    // RocksDB metrics.
+    try (OmMetadataManagerImpl defragCheckpointMetadataManager =
+             defragService.createDefragCheckpointMetadataManager(
+                 new RocksDBCheckpoint(checkpointPath), false)) {
+      assertNoRocksDBMetrics(
+          defragCheckpointMetadataManager.getStore().getDbLocation().toPath());
     }
+    assertNoRocksDBMetrics(checkpointPath);
   }
 
   private void assertContents(Map<String, Map<String, String>> contents, Path path) throws IOException {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to