This is an automated email from the ASF dual-hosted git repository.

weichiu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 56b9b8015e8 HDDS-14699. Fix orphan snapshot versions handling when 
snapshot chain tableKey mapping is stale (#9810)
56b9b8015e8 is described below

commit 56b9b8015e8cbfc17a1161a5226be5b747ffbff7
Author: Siyao Meng <[email protected]>
AuthorDate: Tue Feb 24 14:40:33 2026 -0800

    HDDS-14699. Fix orphan snapshot versions handling when snapshot chain 
tableKey mapping is stale (#9810)
---
 .../apache/hadoop/ozone/om/OmSnapshotManager.java  | 23 ++++++++++--
 .../hadoop/ozone/om/SnapshotChainManager.java      | 16 +++++++-
 .../snapshot/TestOmSnapshotLocalDataManager.java   | 43 ++++++++++++++++++++++
 3 files changed, 76 insertions(+), 6 deletions(-)

diff --git 
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java
 
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java
index b488997b522..1608d1a8cef 100644
--- 
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java
+++ 
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java
@@ -284,12 +284,23 @@ public OmSnapshotManager(OzoneManager ozoneManager) 
throws IOException {
 
   public static boolean isSnapshotPurged(SnapshotChainManager chainManager, 
OMMetadataManager omMetadataManager,
       UUID snapshotId, TransactionInfo transactionInfo) throws IOException {
+    boolean purgeFlushed = transactionInfo != null &&
+        isTransactionFlushedToDisk(omMetadataManager, transactionInfo);
     String tableKey = chainManager.getTableKey(snapshotId);
     if (tableKey == null) {
-      return true;
+      // Snapshot chain is rebuilt from DB on every OM restart 
(loadFromSnapshotInfoTable),
+      // but entries committed to the Raft log (but not yet flushed) after the 
restart
+      // are applied in addToDBBatch (skipping validateAndUpdateCache), so 
they never call
+      // addSnapshot(). This can affect any OM (leader or follower) after a 
restart.
+      //
+      // Need to fall back to transactionInfo. null means no purge has been 
recorded. Treat as active.
+      LOG.debug("snapshotId {} has null tableKey in SnapshotChainManager. "
+              + "transactionInfo={} purgeFlushed={}. Returning {}",
+          snapshotId, transactionInfo, purgeFlushed, purgeFlushed);
+      return purgeFlushed;
     }
-    return !omMetadataManager.getSnapshotInfoTable().isExist(tableKey) && 
transactionInfo != null &&
-        isTransactionFlushedToDisk(omMetadataManager, transactionInfo);
+    boolean inDB = omMetadataManager.getSnapshotInfoTable().isExist(tableKey);
+    return !inDB && purgeFlushed;
   }
 
   /**
@@ -371,8 +382,12 @@ public OmSnapshot load(@Nonnull UUID snapshotId) throws 
IOException {
           }
           try 
(OmSnapshotLocalDataManager.ReadableOmSnapshotLocalDataMetaProvider 
snapshotLocalDataProvider =
                    
snapshotLocalDataManager.getOmSnapshotLocalDataMeta(snapshotInfo)) {
+            final OmSnapshotLocalDataManager.SnapshotVersionsMeta snapshotMeta 
= snapshotLocalDataProvider.getMeta();
+            if (snapshotMeta == null) {
+              throw new OMException("Snapshot local metadata is missing for 
snapshotId: " + snapshotId, FILE_NOT_FOUND);
+            }
             snapshotMetadataManager = 
getSnapshotOmMetadataManager(snapshotInfo,
-                snapshotLocalDataProvider.getMeta().getVersion(), 
maxOpenSstFilesInSnapshotDb, conf);
+                snapshotMeta.getVersion(), maxOpenSstFilesInSnapshotDb, conf);
           }
         } catch (IOException e) {
           LOG.error("Failed to retrieve snapshot: {}", snapshotTableKey, e);
diff --git 
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
 
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
index 65074bb8379..c4d8f18637a 100644
--- 
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
+++ 
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
@@ -296,6 +296,10 @@ private boolean 
loadFromSnapshotInfoTable(OMMetadataManager metadataManager) {
       globalSnapshotChain.clear();
       snapshotChainByPath.clear();
       latestSnapshotIdByPath.clear();
+      if (!snapshotIdToTableKey.isEmpty()) {
+        LOG.debug("Clearing snapshotIdToTableKey (had {} entries) on thread 
{}",
+            snapshotIdToTableKey.size(), Thread.currentThread().getName());
+      }
       snapshotIdToTableKey.clear();
 
       while (keyIter.hasNext()) {
@@ -353,6 +357,8 @@ public synchronized void addSnapshot(SnapshotInfo 
snapshotInfo)
     // store snapshot ID to snapshot DB table key in the map
     snapshotIdToTableKey.put(snapshotInfo.getSnapshotId(),
         snapshotInfo.getTableKey());
+    LOG.debug("Added to snapshotIdToTableKey: snapshotId={} tableKey={}",
+        snapshotInfo.getSnapshotId(), snapshotInfo.getTableKey());
   }
 
   /**
@@ -378,7 +384,8 @@ public synchronized boolean deleteSnapshot(SnapshotInfo 
snapshotInfo)
    */
   public synchronized void removeFromSnapshotIdToTable(UUID snapshotId) throws 
IOException {
     validateSnapshotChain();
-    snapshotIdToTableKey.remove(snapshotId);
+    String tableKey = snapshotIdToTableKey.remove(snapshotId);
+    LOG.debug("Removed from snapshotIdToTableKey: snapshotId={} tableKey={}", 
snapshotId, tableKey);
   }
 
   /**
@@ -570,7 +577,12 @@ public UUID previousPathSnapshot(String snapshotPath,
   }
 
   public String getTableKey(UUID snapshotId) {
-    return snapshotIdToTableKey.get(snapshotId);
+    String tableKey = snapshotIdToTableKey.get(snapshotId);
+    if (tableKey == null) {
+      LOG.debug("getTableKey returned null for snapshotId={}. 
snapshotIdToTableKey has {} entries",
+          snapshotId, snapshotIdToTableKey.size());
+    }
+    return tableKey;
   }
 
   public LinkedHashMap<UUID, SnapshotChainInfo> getSnapshotChainPath(
diff --git 
a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOmSnapshotLocalDataManager.java
 
b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOmSnapshotLocalDataManager.java
index 7cacfd2bc9e..f9d8fba45a8 100644
--- 
a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOmSnapshotLocalDataManager.java
+++ 
b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOmSnapshotLocalDataManager.java
@@ -82,6 +82,7 @@
 import org.apache.hadoop.ozone.om.OmSnapshotLocalData;
 import org.apache.hadoop.ozone.om.OmSnapshotLocalDataYaml;
 import org.apache.hadoop.ozone.om.OmSnapshotManager;
+import org.apache.hadoop.ozone.om.SnapshotChainManager;
 import org.apache.hadoop.ozone.om.helpers.SnapshotInfo;
 import org.apache.hadoop.ozone.om.lock.DAGLeveledResource;
 import org.apache.hadoop.ozone.om.lock.HierarchicalResourceLockManager;
@@ -1027,6 +1028,48 @@ public void testInitWithMissingYamlFiles(boolean 
needsUpgrade) throws IOExceptio
     }
   }
 
+  /**
+   * Regression test for NullPointerException in 
OmSnapshotManager#createCacheLoader.
+   * <p>
+   * isSnapshotPurged() now falls back to transactionInfo when getTableKey() 
returns null.
+   * A null transactionInfo means no purge was ever recorded for this snapshot 
in its YAML,
+   * so the snapshot is treated as active and the orphan check should 
correctly skip it.
+   */
+  @Test
+  public void testCheckOrphanSnapshotVersionsWithStaleSnapshotChain() throws 
IOException {
+    localDataManager = getNewOmSnapshotLocalDataManager();
+    UUID snapshotId = createSnapshotLocalData(localDataManager, 1).get(0);
+
+    // Snapshot must be in versionNodeMap before the orphan check.
+    
assertNotNull(localDataManager.getVersionNodeMapUnmodifiable().get(snapshotId));
+
+    // Use the real isSnapshotPurged
+    snapshotUtilMock.when(() -> OmSnapshotManager.isSnapshotPurged(any(), 
any(), any(), any()))
+        .thenCallRealMethod();
+
+    // Simulate a stale SnapshotChainManager: getTableKey returns null for the
+    // snapshot because the snapshot chain has not been correctly updated
+    SnapshotChainManager staleChain = mock(SnapshotChainManager.class);
+    when(staleChain.getTableKey(snapshotId)).thenReturn(null);
+
+    localDataManager.checkOrphanSnapshotVersions(omMetadataManager, 
staleChain, snapshotId);
+
+    // Before the fix: isSnapshotPurged returned true for any null tableKey, 
so the snapshot
+    // was removed from versionNodeMap. getMeta() then returned null, causing 
NullPointerException
+
+    // After the fix: null transactionInfo means no purge has been recorded, 
assuming active snapshot.
+    // versionNodeMap entry will survive the orphan check. getMeta() will be 
non-null.
+
+    
assertNotNull(localDataManager.getVersionNodeMapUnmodifiable().get(snapshotId),
+        "Active snapshot was removed erroneously from versionNodeMap due to 
stale SnapshotChainManager");
+
+    try (OmSnapshotLocalDataManager.ReadableOmSnapshotLocalDataMetaProvider 
provider =
+             localDataManager.getOmSnapshotLocalDataMeta(snapshotId)) {
+      assertNotNull(provider.getMeta(),
+          "getMeta() returned null. Calling getVersion() on it throws 
NullPointerException");
+    }
+  }
+
   @Test
   public void testInitWithInvalidPathThrowsException() throws IOException {
     UUID snapshotId = UUID.randomUUID();


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to