Siyao Meng created HDDS-14699:
---------------------------------

             Summary: Fix orphan snapshot versions handling when snapshot chain 
tableKey mapping is stale
                 Key: HDDS-14699
                 URL: https://issues.apache.org/jira/browse/HDDS-14699
             Project: Apache Ozone
          Issue Type: Bug
          Components: Ozone Manager
            Reporter: Siyao Meng
            Assignee: Siyao Meng


In the isSnapshotPurged() check, a null tableKey from the snapshot chain should 
not be the sole indicator of whether the snapshot is still active.

When isSnapshotPurged() incorrectly returns true, it causes 
checkOrphanSnapshotVersions() to incorrectly remove an active snapshot's YAML 
metadata. This in turn causes an NPE in CacheLoader when attempting to load the 
snapshot.

OM log:

{code}
2026-02-16 09:47:40,047 INFO [IPC Server handler 92 on 
9862]-org.apache.hadoop.ozone.om.snapshot.SnapshotCache: Loading SnapshotId: 
'28d99c74-13d8-4a9d-91fe-7d0530ca84a3'
2026-02-16 09:47:40,050 WARN [IPC Server handler 92 on 
9862]-org.apache.hadoop.ipc.Server: IPC Server handler 92 on 9862, call Call#2 
Retry#0 org.apache.hadoop.ozone.om.protocol.OzoneManagerProtocol.submitRequest 
from 10.65.50.249:55484
java.lang.IllegalStateException: java.lang.NullPointerException
        at 
org.apache.hadoop.ozone.om.snapshot.SnapshotCache.lambda$2(SnapshotCache.java:217)
        at 
java.base/java.util.concurrent.ConcurrentHashMap.compute(ConcurrentHashMap.java:1947)
        at 
org.apache.hadoop.ozone.om.snapshot.SnapshotCache.get(SnapshotCache.java:202)
        at 
org.apache.hadoop.ozone.om.OmSnapshotManager.getSnapshot(OmSnapshotManager.java:693)
        at 
org.apache.hadoop.ozone.om.OmSnapshotManager.getSnapshot(OmSnapshotManager.java:681)
        at 
org.apache.hadoop.ozone.om.OmSnapshotManager.getActiveSnapshot(OmSnapshotManager.java:648)
        at 
org.apache.hadoop.ozone.om.OmSnapshotManager.getActiveFsMetadataOrSnapshot(OmSnapshotManager.java:638)
        at 
org.apache.hadoop.ozone.om.OzoneManager.getReader(OzoneManager.java:5013)
        at 
org.apache.hadoop.ozone.om.OzoneManager.listStatus(OzoneManager.java:3874)
        at 
org.apache.hadoop.ozone.om.OzoneManager.listStatus(OzoneManager.java:3865)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.listStatus(OzoneManagerRequestHandler.java:1196)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.handleReadRequest(OzoneManagerRequestHandler.java:269)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitReadRequestToOM(OzoneManagerProtocolServerSideTranslatorPB.java:245)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.internalProcessRequest(OzoneManagerProtocolServerSideTranslatorPB.java:198)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.processRequest(OzoneManagerProtocolServerSideTranslatorPB.java:158)
        at 
org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher.processRequest(OzoneProtocolMessageDispatcher.java:87)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitRequest(OzoneManagerProtocolServerSideTranslatorPB.java:148)
        at 
org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos$OzoneManagerService$2.callBlockingMethod(OzoneManagerProtocolProtos.java)
        at 
org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:533)
        at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070)
        at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:995)
        at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:923)
        at java.base/java.security.AccessController.doPrivileged(Native Method)
        at java.base/javax.security.auth.Subject.doAs(Subject.java:423)
        at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1910)
        at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2905)
Caused by: java.lang.NullPointerException
        at 
org.apache.hadoop.ozone.om.OmSnapshotManager$1.load(OmSnapshotManager.java:404)
        at 
org.apache.hadoop.ozone.om.OmSnapshotManager$1.load(OmSnapshotManager.java:1)
        at 
org.apache.hadoop.ozone.om.snapshot.SnapshotCache.lambda$2(SnapshotCache.java:206)
        ... 25 more
{code}

Client log (the client keeps retrying but keeps hitting the same NPE):

{code}
$ ozone fs -ls 
ofs://ozone1771677192/vol-test-workload-om-decommission-recommission-1771689727/buck-test-workload-om-decommission-recommission-1771689727/.snapshot/snap-79xbc/
26/02/21 17:42:47 INFO retry.RetryInvocationHandler: 
com.google.protobuf.ServiceException: 
org.apache.hadoop.ipc.RemoteException(java.lang.IllegalStateException): 
java.lang.NullPointerException
        at 
org.apache.hadoop.ozone.om.snapshot.SnapshotCache.lambda$2(SnapshotCache.java:217)
        at 
java.base/java.util.concurrent.ConcurrentHashMap.compute(ConcurrentHashMap.java:1908)
        at 
org.apache.hadoop.ozone.om.snapshot.SnapshotCache.get(SnapshotCache.java:202)
        at 
org.apache.hadoop.ozone.om.OmSnapshotManager.getSnapshot(OmSnapshotManager.java:693)
        at 
org.apache.hadoop.ozone.om.OmSnapshotManager.getSnapshot(OmSnapshotManager.java:681)
        at 
org.apache.hadoop.ozone.om.OmSnapshotManager.getActiveSnapshot(OmSnapshotManager.java:648)
        at 
org.apache.hadoop.ozone.om.OmSnapshotManager.getActiveFsMetadataOrSnapshot(OmSnapshotManager.java:638)
        at 
org.apache.hadoop.ozone.om.OzoneManager.getReader(OzoneManager.java:5013)
        at 
org.apache.hadoop.ozone.om.OzoneManager.getFileStatus(OzoneManager.java:3817)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.getOzoneFileStatus(OzoneManagerRequestHandler.java:1024)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.handleReadRequest(OzoneManagerRequestHandler.java:258)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitReadRequestToOM(OzoneManagerProtocolServerSideTranslatorPB.java:245)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.internalProcessRequest(OzoneManagerProtocolServerSideTranslatorPB.java:198)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.processRequest(OzoneManagerProtocolServerSideTranslatorPB.java:158)
        at 
org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher.processRequest(OzoneProtocolMessageDispatcher.java:87)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitRequest(OzoneManagerProtocolServerSideTranslatorPB.java:148)
        at 
org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos$OzoneManagerService$2.callBlockingMethod(OzoneManagerProtocolProtos.java)
        at 
org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:533)
        at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070)
        at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:995)
        at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:923)
        at java.base/java.security.AccessController.doPrivileged(Native Method)
        at java.base/javax.security.auth.Subject.doAs(Subject.java:423)
        at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1910)
        at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2905)
Caused by: java.lang.NullPointerException
        at 
org.apache.hadoop.ozone.om.OmSnapshotManager$1.load(OmSnapshotManager.java:404)
        at 
org.apache.hadoop.ozone.om.OmSnapshotManager$1.load(OmSnapshotManager.java:1)
        at 
org.apache.hadoop.ozone.om.snapshot.SnapshotCache.lambda$2(SnapshotCache.java:206)
        ... 24 more
{code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to