This is an automated email from the ASF dual-hosted git repository.
weichiu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 56b9b8015e8 HDDS-14699. Fix orphan snapshot versions handling when
snapshot chain tableKey mapping is stale (#9810)
56b9b8015e8 is described below
commit 56b9b8015e8cbfc17a1161a5226be5b747ffbff7
Author: Siyao Meng <[email protected]>
AuthorDate: Tue Feb 24 14:40:33 2026 -0800
HDDS-14699. Fix orphan snapshot versions handling when snapshot chain
tableKey mapping is stale (#9810)
---
.../apache/hadoop/ozone/om/OmSnapshotManager.java | 23 ++++++++++--
.../hadoop/ozone/om/SnapshotChainManager.java | 16 +++++++-
.../snapshot/TestOmSnapshotLocalDataManager.java | 43 ++++++++++++++++++++++
3 files changed, 76 insertions(+), 6 deletions(-)
diff --git
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java
index b488997b522..1608d1a8cef 100644
---
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java
+++
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java
@@ -284,12 +284,23 @@ public OmSnapshotManager(OzoneManager ozoneManager)
throws IOException {
public static boolean isSnapshotPurged(SnapshotChainManager chainManager,
OMMetadataManager omMetadataManager,
UUID snapshotId, TransactionInfo transactionInfo) throws IOException {
+ boolean purgeFlushed = transactionInfo != null &&
+ isTransactionFlushedToDisk(omMetadataManager, transactionInfo);
String tableKey = chainManager.getTableKey(snapshotId);
if (tableKey == null) {
- return true;
+ // Snapshot chain is rebuilt from DB on every OM restart
(loadFromSnapshotInfoTable),
+ // but entries committed to the Raft log (but not yet flushed) after the
restart
+ // are applied in addToDBBatch (skipping validateAndUpdateCache), so
they never call
+ // addSnapshot(). This can affect any OM (leader or follower) after a
restart.
+ //
+ // Need to fall back to transactionInfo. null means no purge has been
recorded. Treat as active.
+ LOG.debug("snapshotId {} has null tableKey in SnapshotChainManager. "
+ + "transactionInfo={} purgeFlushed={}. Returning {}",
+ snapshotId, transactionInfo, purgeFlushed, purgeFlushed);
+ return purgeFlushed;
}
- return !omMetadataManager.getSnapshotInfoTable().isExist(tableKey) &&
transactionInfo != null &&
- isTransactionFlushedToDisk(omMetadataManager, transactionInfo);
+ boolean inDB = omMetadataManager.getSnapshotInfoTable().isExist(tableKey);
+ return !inDB && purgeFlushed;
}
/**
@@ -371,8 +382,12 @@ public OmSnapshot load(@Nonnull UUID snapshotId) throws
IOException {
}
try
(OmSnapshotLocalDataManager.ReadableOmSnapshotLocalDataMetaProvider
snapshotLocalDataProvider =
snapshotLocalDataManager.getOmSnapshotLocalDataMeta(snapshotInfo)) {
+ final OmSnapshotLocalDataManager.SnapshotVersionsMeta snapshotMeta
= snapshotLocalDataProvider.getMeta();
+ if (snapshotMeta == null) {
+ throw new OMException("Snapshot local metadata is missing for
snapshotId: " + snapshotId, FILE_NOT_FOUND);
+ }
snapshotMetadataManager =
getSnapshotOmMetadataManager(snapshotInfo,
- snapshotLocalDataProvider.getMeta().getVersion(),
maxOpenSstFilesInSnapshotDb, conf);
+ snapshotMeta.getVersion(), maxOpenSstFilesInSnapshotDb, conf);
}
} catch (IOException e) {
LOG.error("Failed to retrieve snapshot: {}", snapshotTableKey, e);
diff --git
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
index 65074bb8379..c4d8f18637a 100644
---
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
+++
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
@@ -296,6 +296,10 @@ private boolean
loadFromSnapshotInfoTable(OMMetadataManager metadataManager) {
globalSnapshotChain.clear();
snapshotChainByPath.clear();
latestSnapshotIdByPath.clear();
+ if (!snapshotIdToTableKey.isEmpty()) {
+ LOG.debug("Clearing snapshotIdToTableKey (had {} entries) on thread
{}",
+ snapshotIdToTableKey.size(), Thread.currentThread().getName());
+ }
snapshotIdToTableKey.clear();
while (keyIter.hasNext()) {
@@ -353,6 +357,8 @@ public synchronized void addSnapshot(SnapshotInfo
snapshotInfo)
// store snapshot ID to snapshot DB table key in the map
snapshotIdToTableKey.put(snapshotInfo.getSnapshotId(),
snapshotInfo.getTableKey());
+ LOG.debug("Added to snapshotIdToTableKey: snapshotId={} tableKey={}",
+ snapshotInfo.getSnapshotId(), snapshotInfo.getTableKey());
}
/**
@@ -378,7 +384,8 @@ public synchronized boolean deleteSnapshot(SnapshotInfo
snapshotInfo)
*/
public synchronized void removeFromSnapshotIdToTable(UUID snapshotId) throws
IOException {
validateSnapshotChain();
- snapshotIdToTableKey.remove(snapshotId);
+ String tableKey = snapshotIdToTableKey.remove(snapshotId);
+ LOG.debug("Removed from snapshotIdToTableKey: snapshotId={} tableKey={}",
snapshotId, tableKey);
}
/**
@@ -570,7 +577,12 @@ public UUID previousPathSnapshot(String snapshotPath,
}
public String getTableKey(UUID snapshotId) {
- return snapshotIdToTableKey.get(snapshotId);
+ String tableKey = snapshotIdToTableKey.get(snapshotId);
+ if (tableKey == null) {
+ LOG.debug("getTableKey returned null for snapshotId={}.
snapshotIdToTableKey has {} entries",
+ snapshotId, snapshotIdToTableKey.size());
+ }
+ return tableKey;
}
public LinkedHashMap<UUID, SnapshotChainInfo> getSnapshotChainPath(
diff --git
a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOmSnapshotLocalDataManager.java
b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOmSnapshotLocalDataManager.java
index 7cacfd2bc9e..f9d8fba45a8 100644
---
a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOmSnapshotLocalDataManager.java
+++
b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOmSnapshotLocalDataManager.java
@@ -82,6 +82,7 @@
import org.apache.hadoop.ozone.om.OmSnapshotLocalData;
import org.apache.hadoop.ozone.om.OmSnapshotLocalDataYaml;
import org.apache.hadoop.ozone.om.OmSnapshotManager;
+import org.apache.hadoop.ozone.om.SnapshotChainManager;
import org.apache.hadoop.ozone.om.helpers.SnapshotInfo;
import org.apache.hadoop.ozone.om.lock.DAGLeveledResource;
import org.apache.hadoop.ozone.om.lock.HierarchicalResourceLockManager;
@@ -1027,6 +1028,48 @@ public void testInitWithMissingYamlFiles(boolean
needsUpgrade) throws IOExceptio
}
}
+ /**
+ * Regression test for NullPointerException in
OmSnapshotManager#createCacheLoader.
+ * <p>
+ * isSnapshotPurged() now falls back to transactionInfo when getTableKey()
returns null.
+ * A null transactionInfo means no purge was ever recorded for this snapshot
in its YAML,
+ * so the snapshot is treated as active and the orphan check should
correctly skip it.
+ */
+ @Test
+ public void testCheckOrphanSnapshotVersionsWithStaleSnapshotChain() throws
IOException {
+ localDataManager = getNewOmSnapshotLocalDataManager();
+ UUID snapshotId = createSnapshotLocalData(localDataManager, 1).get(0);
+
+ // Snapshot must be in versionNodeMap before the orphan check.
+
assertNotNull(localDataManager.getVersionNodeMapUnmodifiable().get(snapshotId));
+
+ // Use the real isSnapshotPurged
+ snapshotUtilMock.when(() -> OmSnapshotManager.isSnapshotPurged(any(),
any(), any(), any()))
+ .thenCallRealMethod();
+
+ // Simulate a stale SnapshotChainManager: getTableKey returns null for the
+ // snapshot because the snapshot chain has not been correctly updated
+ SnapshotChainManager staleChain = mock(SnapshotChainManager.class);
+ when(staleChain.getTableKey(snapshotId)).thenReturn(null);
+
+ localDataManager.checkOrphanSnapshotVersions(omMetadataManager,
staleChain, snapshotId);
+
+ // Before the fix: isSnapshotPurged returned true for any null tableKey,
so the snapshot
+ // was removed from versionNodeMap. getMeta() then returned null, causing
NullPointerException
+
+ // After the fix: null transactionInfo means no purge has been recorded,
assuming active snapshot.
+ // versionNodeMap entry will survive the orphan check. getMeta() will be
non-null.
+
+
assertNotNull(localDataManager.getVersionNodeMapUnmodifiable().get(snapshotId),
+ "Active snapshot was removed erroneously from versionNodeMap due to
stale SnapshotChainManager");
+
+ try (OmSnapshotLocalDataManager.ReadableOmSnapshotLocalDataMetaProvider
provider =
+ localDataManager.getOmSnapshotLocalDataMeta(snapshotId)) {
+ assertNotNull(provider.getMeta(),
+ "getMeta() returned null. Calling getVersion() on it throws
NullPointerException");
+ }
+ }
+
@Test
public void testInitWithInvalidPathThrowsException() throws IOException {
UUID snapshotId = UUID.randomUUID();
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]