Jyotirmoy Sinha created HDDS-8449:
-------------------------------------

             Summary: [snapshot] OM shuts down on trying to delete same 
snapshot twice before reclamation
                 Key: HDDS-8449
                 URL: https://issues.apache.org/jira/browse/HDDS-8449
             Project: Apache Ozone
          Issue Type: Bug
          Components: Ozone Manager
            Reporter: Jyotirmoy Sinha


Scenario -
 # Create volume/bucket/key and create snapshot
 # Delete above snapshot
 # Delete snapshot of step 1  again before its reclaimed in step 2

OM error stacktrace -
{code:java}
2023-04-18 14:02:32,098 INFO 
SecurityLogger.org.apache.hadoop.security.authorize.ServiceAuthorizationManager:
 Authorization successful for [email protected] (auth:KERBEROS) for 
protocol=interface org.apache.hadoop.ozone.om.protocol.OzoneManagerProtocol
2023-04-18 14:02:32,682 INFO 
org.apache.hadoop.ozone.om.request.snapshot.OMSnapshotDeleteRequest: Deleted 
snapshot 'snap-jsceh' under path 'vol-jitl6/buck-zcqkr'
2023-04-18 14:02:35,010 INFO SecurityLogger.org.apache.hadoop.ipc.Server: Auth 
successful for [email protected] (auth:KERBEROS)
2023-04-18 14:02:35,019 INFO 
SecurityLogger.org.apache.hadoop.security.authorize.ServiceAuthorizationManager:
 Authorization successful for [email protected] (auth:KERBEROS) for 
protocol=interface org.apache.hadoop.ozone.om.protocol.OzoneManagerProtocol
2023-04-18 14:02:35,663 ERROR 
org.apache.hadoop.ozone.om.request.snapshot.OMSnapshotDeleteRequest: Failed to 
delete snapshot 'snap-jsceh' under path 'vol-jitl6/buck-zcqkr'
2023-04-18 14:02:45,979 ERROR 
org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine: Terminating with 
exit status 1: OM Ratis Server has received unrecoverable error, to avoid 
further DB corruption, terminating OM. Error Response received is:cmdType: 
SnapshotMoveDeletedKeys
traceID: ""
success: false
message: "java.io.IOException: FILE_NOT_FOUND 
org.apache.hadoop.ozone.om.exceptions.OMException: Unable to load snapshot. 
Snapshot with table key \'/vol-jitl6/buck-zcqkr/snap-jsceh\' is no longer 
active\n\tat 
org.apache.hadoop.ozone.om.OmSnapshotManager.checkForSnapshot(OmSnapshotManager.java:441)\n\tat
 
org.apache.hadoop.ozone.om.request.snapshot.OMSnapshotMoveDeletedKeysRequest.validateAndUpdateCache(OMSnapshotMoveDeletedKeysRequest.java:80)\n\tat
 
org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.handleWriteRequest(OzoneManagerRequestHandler.java:337)\n\tat
 
org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.runCommand(OzoneManagerStateMachine.java:541)\n\tat
 
org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.lambda$1(OzoneManagerStateMachine.java:332)\n\tat
 
java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604)\n\tat
 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat
 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\tat
 java.lang.Thread.run(Thread.java:748)\nCaused by: FILE_NOT_FOUND 
org.apache.hadoop.ozone.om.exceptions.OMException: Unable to load snapshot. 
Snapshot with table key \'/vol-jitl6/buck-zcqkr/snap-jsceh\' is no longer 
active\n\tat 
org.apache.hadoop.ozone.om.OmSnapshotManager$1.load(OmSnapshotManager.java:197)\n\tat
 
org.apache.hadoop.ozone.om.OmSnapshotManager$1.load(OmSnapshotManager.java:1)\n\tat
 
com.google.common.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3533)\n\tat
 
com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2282)\n\tat 
com.google.common.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2159)\n\tat
 com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2049)\n\tat 
com.google.common.cache.LocalCache.get(LocalCache.java:3966)\n\tat 
com.google.common.cache.LocalCache.getOrLoad(LocalCache.java:3989)\n\tat 
com.google.common.cache.LocalCache$LocalLoadingCache.get(LocalCache.java:4950)\n\tat
 
org.apache.hadoop.ozone.om.OmSnapshotManager.checkForSnapshot(OmSnapshotManager.java:439)\n\t...
 8 more\n"
status: INTERNAL_ERROR

INTERNAL_ERROR org.apache.hadoop.ozone.om.exceptions.OMException: 
java.io.IOException: FILE_NOT_FOUND 
org.apache.hadoop.ozone.om.exceptions.OMException: Unable to load snapshot. 
Snapshot with table key '/vol-jitl6/buck-zcqkr/snap-jsceh' is no longer active
        at 
org.apache.hadoop.ozone.om.OmSnapshotManager.checkForSnapshot(OmSnapshotManager.java:441)
        at 
org.apache.hadoop.ozone.om.request.snapshot.OMSnapshotMoveDeletedKeysRequest.validateAndUpdateCache(OMSnapshotMoveDeletedKeysRequest.java:80)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.handleWriteRequest(OzoneManagerRequestHandler.java:337)
        at 
org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.runCommand(OzoneManagerStateMachine.java:541)
        at 
org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.lambda$1(OzoneManagerStateMachine.java:332)
        at 
java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)
Caused by: FILE_NOT_FOUND org.apache.hadoop.ozone.om.exceptions.OMException: 
Unable to load snapshot. Snapshot with table key 
'/vol-jitl6/buck-zcqkr/snap-jsceh' is no longer active
        at 
org.apache.hadoop.ozone.om.OmSnapshotManager$1.load(OmSnapshotManager.java:197)
        at 
org.apache.hadoop.ozone.om.OmSnapshotManager$1.load(OmSnapshotManager.java:1)
        at 
com.google.common.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3533)
        at 
com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2282)
        at 
com.google.common.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2159)
        at com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2049)
        at com.google.common.cache.LocalCache.get(LocalCache.java:3966)
        at com.google.common.cache.LocalCache.getOrLoad(LocalCache.java:3989)
        at 
com.google.common.cache.LocalCache$LocalLoadingCache.get(LocalCache.java:4950)
        at 
org.apache.hadoop.ozone.om.OmSnapshotManager.checkForSnapshot(OmSnapshotManager.java:439)
        ... 8 more        at 
org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.terminate(OzoneManagerStateMachine.java:373)
        at 
org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.lambda$2(OzoneManagerStateMachine.java:348)
        at 
java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616)
        at 
java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591)
        at 
java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
        at 
java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1609)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)
2023-04-18 14:02:45,986 INFO 
org.apache.ranger.audit.provider.AuditProviderFactory: ==> JVMShutdownHook.run()
2023-04-18 14:02:45,987 INFO 
org.apache.ranger.audit.provider.AuditProviderFactory: JVMShutdownHook: 
Signalling async audit cleanup to start.
2023-04-18 14:02:45,987 INFO 
org.apache.ranger.audit.provider.AuditProviderFactory: RangerAsyncAuditCleanup: 
Starting cleanup
2023-04-18 14:02:45,987 INFO 
org.apache.ranger.audit.provider.AuditProviderFactory: JVMShutdownHook: Waiting 
up to 30 seconds for audit cleanup to finish.
2023-04-18 14:02:45,988 INFO org.apache.ranger.audit.queue.AuditAsyncQueue: 
Stop called. name=ozone.async
2023-04-18 14:02:45,988 INFO org.apache.ranger.audit.queue.AuditAsyncQueue: 
Interrupting consumerThread. name=ozone.async, consumer=ozone.async.summary
2023-04-18 14:02:45,988 INFO 
org.apache.ranger.audit.provider.AuditProviderFactory: RangerAsyncAuditCleanup: 
Done cleanup
2023-04-18 14:02:45,988 INFO org.apache.ranger.audit.queue.AuditAsyncQueue: 
Caught exception in consumer thread. Shutdown might be in progress
2023-04-18 14:02:45,988 INFO 
org.apache.ranger.audit.provider.AuditProviderFactory: JVMShutdownHook: Audit 
cleanup finished after 1 milli seconds
2023-04-18 14:02:45,989 INFO org.apache.ranger.audit.queue.AuditAsyncQueue: 
Exiting polling loop. name=ozone.async
2023-04-18 14:02:45,989 INFO 
org.apache.ranger.audit.provider.AuditProviderFactory: RangerAsyncAuditCleanup: 
Waiting to audit cleanup start signal
2023-04-18 14:02:45,989 INFO org.apache.ranger.audit.queue.AuditAsyncQueue: 
Calling to stop consumer. name=ozone.async, consumer.name=ozone.async.summary
2023-04-18 14:02:45,989 INFO 
org.apache.ranger.audit.provider.AuditProviderFactory: JVMShutdownHook: 
Interrupting ranger async audit cleanup thread
2023-04-18 14:02:45,989 INFO org.apache.ranger.audit.queue.AuditSummaryQueue: 
Stop called. name=ozone.async.summary
2023-04-18 14:02:45,989 INFO org.apache.hadoop.ozone.om.OzoneManager: 
om1[jspriv05-1.jspriv05.root.hwx.site:9862]: Stopping Ozone Manager
2023-04-18 14:02:45,989 INFO 
org.apache.ranger.audit.provider.AuditProviderFactory: <== JVMShutdownHook.run()
2023-04-18 14:02:45,990 INFO org.apache.hadoop.ipc.Server: Stopping server on 
9862
2023-04-18 14:02:45,989 INFO 
org.apache.ranger.audit.provider.AuditProviderFactory: RangerAsyncAuditCleanup: 
Interrupted while waiting for audit startCleanup signal!  Exiting the thread...
java.lang.InterruptedException
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireSharedInterruptibly(AbstractQueuedSynchronizer.java:998)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireSharedInterruptibly(AbstractQueuedSynchronizer.java:1304)
        at java.util.concurrent.Semaphore.acquire(Semaphore.java:312)
        at 
org.apache.ranger.audit.provider.AuditProviderFactory$RangerAsyncAuditCleanup.run(AuditProviderFactory.java:531)
        at java.lang.Thread.run(Thread.java:748)
2023-04-18 14:02:45,989 INFO org.apache.ranger.audit.queue.AuditSummaryQueue: 
Interrupting consumerThread. name=ozone.async.summary, 
consumer=ozone.async.summary.batch
2023-04-18 14:02:45,993 INFO org.apache.ranger.audit.queue.AuditAsyncQueue: 
Exiting consumerThread.run() method. name=ozone.async
2023-04-18 14:02:45,993 INFO org.apache.ranger.audit.queue.AuditSummaryQueue: 
Caught exception in consumer thread. Shutdown might be in progress
2023-04-18 14:02:45,994 INFO org.apache.ranger.audit.queue.AuditSummaryQueue: 
Exiting polling loop. name=ozone.async.summary
2023-04-18 14:02:45,994 INFO org.apache.ranger.audit.queue.AuditSummaryQueue: 
Calling to stop consumer. name=ozone.async.summary, 
consumer.name=ozone.async.summary.batch
2023-04-18 14:02:45,995 INFO org.apache.hadoop.ipc.Server: Stopping IPC Server 
listener on 9862
2023-04-18 14:02:45,996 INFO org.apache.hadoop.ozone.om.OzoneManagerStarter: 
SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down OzoneManager at 
jspriv05-1.jspriv05.root.hwx.site/172.27.34.194
************************************************************/ {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to