[
https://issues.apache.org/jira/browse/HDDS-11068?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Attila Doroszlai resolved HDDS-11068.
-------------------------------------
Fix Version/s: 1.5.0
Resolution: Fixed
> OM down to Snapshot Chain Corruption
> ------------------------------------
>
> Key: HDDS-11068
> URL: https://issues.apache.org/jira/browse/HDDS-11068
> Project: Apache Ozone
> Issue Type: Bug
> Components: Ozone Manager, Snapshot
> Reporter: Jyotirmoy Sinha
> Assignee: Swaminathan Balachandran
> Priority: Critical
> Labels: ozone-snapshot, pull-request-available
> Fix For: 1.5.0
>
>
> OM is down due to snapshot chain corruption.
> OM error stack traces:
> {code:java}
> 2024-06-25 14:51:14,293 ERROR
> [main]-org.apache.hadoop.ozone.om.SnapshotChainManager: Failure while loading
> snapshot chain.
> java.io.IOException: Snapshot chain corruption. All snapshots have not been
> added to the snapshot chain. Last snapshot added to chain :
> 750ae2ca-3f8d-4a1f-8655-9d15bd5ed84b
> at
> org.apache.hadoop.ozone.om.SnapshotChainManager.loadFromSnapshotInfoTable(SnapshotChainManager.java:324)
> at
> org.apache.hadoop.ozone.om.SnapshotChainManager.<init>(SnapshotChainManager.java:66)
> at
> org.apache.hadoop.ozone.om.OmMetadataManagerImpl.start(OmMetadataManagerImpl.java:565)
> at
> org.apache.hadoop.ozone.om.OmMetadataManagerImpl.<init>(OmMetadataManagerImpl.java:342)
> at
> org.apache.hadoop.ozone.om.OzoneManager.instantiateServices(OzoneManager.java:809)
> at
> org.apache.hadoop.ozone.om.OzoneManager.<init>(OzoneManager.java:687)
> at
> org.apache.hadoop.ozone.om.OzoneManager.createOm(OzoneManager.java:774)
> at
> org.apache.hadoop.ozone.om.OzoneManagerStarter$OMStarterHelper.start(OzoneManagerStarter.java:189)
> at
> org.apache.hadoop.ozone.om.OzoneManagerStarter.startOm(OzoneManagerStarter.java:86)
> at
> org.apache.hadoop.ozone.om.OzoneManagerStarter.call(OzoneManagerStarter.java:74)
> at org.apache.hadoop.hdds.cli.GenericCli.call(GenericCli.java:38)
> at picocli.CommandLine.executeUserObject(CommandLine.java:2041)
> at picocli.CommandLine.access$1500(CommandLine.java:148)
> at
> picocli.CommandLine$RunLast.executeUserObjectOfLastSubcommandWithSameParent(CommandLine.java:2461)
> at picocli.CommandLine$RunLast.handle(CommandLine.java:2453)
> at picocli.CommandLine$RunLast.handle(CommandLine.java:2415)
> at
> picocli.CommandLine$AbstractParseResultHandler.execute(CommandLine.java:2273)
> at picocli.CommandLine$RunLast.execute(CommandLine.java:2417)
> at picocli.CommandLine.execute(CommandLine.java:2170)
> at org.apache.hadoop.hdds.cli.GenericCli.execute(GenericCli.java:100)
> at org.apache.hadoop.hdds.cli.GenericCli.run(GenericCli.java:91)
> at
> org.apache.hadoop.ozone.om.OzoneManagerStarter.main(OzoneManagerStarter.java:58)
> {code}
> {code:java}
> 2024-06-25 14:51:16,854 INFO
> [main]-org.apache.hadoop.hdds.utils.NativeLibraryLoader: Loading Library:
> ozone_rocksdb_tools
> 2024-06-25 14:51:16,857 ERROR
> [main]-org.apache.hadoop.ozone.om.snapshot.SnapshotDiffManager: Native
> Library for raw sst file reading loading failed.
> org.apache.hadoop.hdds.utils.NativeLibraryNotLoadedException: Unable to load
> library ozone_rocksdb_tools from both java.library.path & resource file
> libozone_rocksdb_tools.so from jar.
> at
> org.apache.hadoop.hdds.utils.db.managed.ManagedRawSSTFileReader.loadLibrary(ManagedRawSSTFileReader.java:40)
> at
> org.apache.hadoop.ozone.om.snapshot.SnapshotDiffManager.initNativeLibraryForEfficientDiff(SnapshotDiffManager.java:285)
> at
> org.apache.hadoop.ozone.om.snapshot.SnapshotDiffManager.<init>(SnapshotDiffManager.java:259)
> at
> org.apache.hadoop.ozone.om.OmSnapshotManager.<init>(OmSnapshotManager.java:286)
> at
> org.apache.hadoop.ozone.om.OzoneManager.instantiateServices(OzoneManager.java:863)
> at
> org.apache.hadoop.ozone.om.OzoneManager.<init>(OzoneManager.java:687)
> at
> org.apache.hadoop.ozone.om.OzoneManager.createOm(OzoneManager.java:774)
> at
> org.apache.hadoop.ozone.om.OzoneManagerStarter$OMStarterHelper.start(OzoneManagerStarter.java:189)
> at
> org.apache.hadoop.ozone.om.OzoneManagerStarter.startOm(OzoneManagerStarter.java:86)
> at
> org.apache.hadoop.ozone.om.OzoneManagerStarter.call(OzoneManagerStarter.java:74)
> at org.apache.hadoop.hdds.cli.GenericCli.call(GenericCli.java:38)
> at picocli.CommandLine.executeUserObject(CommandLine.java:2041)
> at picocli.CommandLine.access$1500(CommandLine.java:148)
> at
> picocli.CommandLine$RunLast.executeUserObjectOfLastSubcommandWithSameParent(CommandLine.java:2461)
> at picocli.CommandLine$RunLast.handle(CommandLine.java:2453)
> at picocli.CommandLine$RunLast.handle(CommandLine.java:2415)
> at
> picocli.CommandLine$AbstractParseResultHandler.execute(CommandLine.java:2273)
> at picocli.CommandLine$RunLast.execute(CommandLine.java:2417)
> at picocli.CommandLine.execute(CommandLine.java:2170)
> at org.apache.hadoop.hdds.cli.GenericCli.execute(GenericCli.java:100)
> at org.apache.hadoop.hdds.cli.GenericCli.run(GenericCli.java:91)
> at
> org.apache.hadoop.ozone.om.OzoneManagerStarter.main(OzoneManagerStarter.java:58)
> {code}
> {code:java}
> 2024-06-25 14:51:19,572 WARN [om229-OMStateMachineApplyTransactionThread -
> 0]-org.apache.hadoop.metrics2.util.MBeans: Error creating MBean object name:
> Hadoop:service=LayoutVersionManager,name=OMLayoutVersionManager
> org.apache.hadoop.metrics2.MetricsException:
> org.apache.hadoop.metrics2.MetricsException:
> Hadoop:service=LayoutVersionManager,name=OMLayoutVersionManager already
> exists!
> at
> org.apache.hadoop.metrics2.lib.DefaultMetricsSystem.newObjectName(DefaultMetricsSystem.java:135)
> at
> org.apache.hadoop.metrics2.lib.DefaultMetricsSystem.newMBeanName(DefaultMetricsSystem.java:110)
> at
> org.apache.hadoop.metrics2.util.MBeans.getMBeanName(MBeans.java:163)
> at org.apache.hadoop.metrics2.util.MBeans.register(MBeans.java:95)
> at org.apache.hadoop.metrics2.util.MBeans.register(MBeans.java:72)
> at
> org.apache.hadoop.ozone.upgrade.AbstractLayoutVersionManager.init(AbstractLayoutVersionManager.java:88)
> at
> org.apache.hadoop.ozone.om.upgrade.OMLayoutVersionManager.<init>(OMLayoutVersionManager.java:69)
> at
> org.apache.hadoop.ozone.om.upgrade.OMLayoutFeatureAspect.checkLayoutFeature(OMLayoutFeatureAspect.java:75)
> at
> org.apache.hadoop.ozone.om.request.snapshot.OMSnapshotMoveDeletedKeysRequest.validateAndUpdateCache(OMSnapshotMoveDeletedKeysRequest.java:63)
> at
> org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.lambda$0(OzoneManagerRequestHandler.java:397)
> at
> org.apache.hadoop.util.MetricUtil.captureLatencyNs(MetricUtil.java:45)
> at
> org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.handleWriteRequestImpl(OzoneManagerRequestHandler.java:395)
> at
> org.apache.hadoop.ozone.protocolPB.RequestHandler.handleWriteRequest(RequestHandler.java:63)
> at
> org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.runCommand(OzoneManagerStateMachine.java:539)
> at
> org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.lambda$1(OzoneManagerStateMachine.java:357)
> at
> java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> Caused by: org.apache.hadoop.metrics2.MetricsException:
> Hadoop:service=LayoutVersionManager,name=OMLayoutVersionManager already
> exists!
> at
> org.apache.hadoop.metrics2.lib.DefaultMetricsSystem.newObjectName(DefaultMetricsSystem.java:131)
> ... 18 more
> 2024-06-25 14:51:19,579 INFO [main]-org.apache.ratis.grpc.server.GrpcService:
> om229: GrpcService started, listening on 9872
> 2024-06-25 14:51:19,582 INFO
> [JvmPauseMonitor0]-org.apache.ratis.util.JvmPauseMonitor:
> JvmPauseMonitor-om229: Started
> 2024-06-25 14:51:19,594 INFO [main]-org.apache.hadoop.ozone.om.OzoneManager:
> Starting secret key client.
> 2024-06-25 14:51:19,678 ERROR
> [om229-OMDoubleBufferFlushThread]-org.apache.hadoop.ozone.om.ratis.OzoneManagerDoubleBuffer:
> Terminating with exit status 2: During flush to DB encountered error in
> OMDoubleBuffer flush thread om229-OMDoubleBufferFlushThread when handling
> OMRequest: cmdType: PurgeKeys
> traceID: ""
> success: true
> status: OKjava.lang.IllegalStateException: java.io.IOException: No snapshot
> exist with snapshotId: a7efa54d-8beb-4fd0-808c-400e173ca6e9
> at
> org.apache.hadoop.ozone.om.snapshot.SnapshotCache.lambda$2(SnapshotCache.java:166)
> at
> java.util.concurrent.ConcurrentHashMap.compute(ConcurrentHashMap.java:1853)
> at
> org.apache.hadoop.ozone.om.snapshot.SnapshotCache.get(SnapshotCache.java:154)
> at
> org.apache.hadoop.ozone.om.OmSnapshotManager.getSnapshot(OmSnapshotManager.java:690)
> at
> org.apache.hadoop.ozone.om.OmSnapshotManager.getSnapshot(OmSnapshotManager.java:678)
> at
> org.apache.hadoop.ozone.om.OmSnapshotManager.getSnapshot(OmSnapshotManager.java:661)
> at
> org.apache.hadoop.ozone.om.response.key.OMKeyPurgeResponse.addToDBBatch(OMKeyPurgeResponse.java:82)
> at
> org.apache.hadoop.ozone.om.response.OMClientResponse.checkAndUpdateDB(OMClientResponse.java:66)
> at
> org.apache.hadoop.ozone.om.ratis.OzoneManagerDoubleBuffer.lambda$8(OzoneManagerDoubleBuffer.java:408)
> at
> org.apache.hadoop.ozone.om.ratis.OzoneManagerDoubleBuffer.addToBatchWithTrace(OzoneManagerDoubleBuffer.java:253)
> at
> org.apache.hadoop.ozone.om.ratis.OzoneManagerDoubleBuffer.addToBatch(OzoneManagerDoubleBuffer.java:407)
> at
> org.apache.hadoop.ozone.om.ratis.OzoneManagerDoubleBuffer.flushBatch(OzoneManagerDoubleBuffer.java:353)
> at
> org.apache.hadoop.ozone.om.ratis.OzoneManagerDoubleBuffer.flushCurrentBuffer(OzoneManagerDoubleBuffer.java:328)
> at
> org.apache.hadoop.ozone.om.ratis.OzoneManagerDoubleBuffer.flushTransactions(OzoneManagerDoubleBuffer.java:295)
> at java.lang.Thread.run(Thread.java:748)
> Caused by: java.io.IOException: No snapshot exist with snapshotId:
> a7efa54d-8beb-4fd0-808c-400e173ca6e9
> at
> org.apache.hadoop.ozone.om.OmSnapshotManager$1.load(OmSnapshotManager.java:351)
> at
> org.apache.hadoop.ozone.om.OmSnapshotManager$1.load(OmSnapshotManager.java:1)
> at
> org.apache.hadoop.ozone.om.snapshot.SnapshotCache.lambda$2(SnapshotCache.java:158)
> ... 14 more
> 2024-06-25 14:51:19,678 ERROR [om229-OMStateMachineApplyTransactionThread -
> 0]-org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine: Terminating
> with exit status 1: OM Ratis Server has received unrecoverable error, to
> avoid further DB corruption, terminating OM. Error Response received
> is:cmdType: SnapshotMoveDeletedKeys
> traceID: ""
> success: false
> message: "java.io.IOException: Snapshot chain is corrupted.\n\tat
> org.apache.hadoop.ozone.om.SnapshotChainManager.validateSnapshotChain(SnapshotChainManager.java:558)\n\tat
>
> org.apache.hadoop.ozone.om.SnapshotChainManager.hasNextPathSnapshot(SnapshotChainManager.java:453)\n\tat
>
> org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.getNextActiveSnapshot(SnapshotUtils.java:157)\n\tat
>
> org.apache.hadoop.ozone.om.request.snapshot.OMSnapshotMoveDeletedKeysRequest.validateAndUpdateCache(OMSnapshotMoveDeletedKeysRequest.java:81)\n\tat
>
> org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.lambda$0(OzoneManagerRequestHandler.java:397)\n\tat
> org.apache.hadoop.util.MetricUtil.captureLatencyNs(MetricUtil.java:45)\n\tat
> org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.handleWriteRequestImpl(OzoneManagerRequestHandler.java:395)\n\tat
>
> org.apache.hadoop.ozone.protocolPB.RequestHandler.handleWriteRequest(RequestHandler.java:63)\n\tat
>
> org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.runCommand(OzoneManagerStateMachine.java:539)\n\tat
>
> org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.lambda$1(OzoneManagerStateMachine.java:357)\n\tat
>
> java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604)\n\tat
>
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat
>
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\tat
> java.lang.Thread.run(Thread.java:748)\n"
> status: INTERNAL_ERRORINTERNAL_ERROR
> org.apache.hadoop.ozone.om.exceptions.OMException: java.io.IOException:
> Snapshot chain is corrupted.
> at
> org.apache.hadoop.ozone.om.SnapshotChainManager.validateSnapshotChain(SnapshotChainManager.java:558)
> at
> org.apache.hadoop.ozone.om.SnapshotChainManager.hasNextPathSnapshot(SnapshotChainManager.java:453)
> at
> org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.getNextActiveSnapshot(SnapshotUtils.java:157)
> at
> org.apache.hadoop.ozone.om.request.snapshot.OMSnapshotMoveDeletedKeysRequest.validateAndUpdateCache(OMSnapshotMoveDeletedKeysRequest.java:81)
> at
> org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.lambda$0(OzoneManagerRequestHandler.java:397)
> at
> org.apache.hadoop.util.MetricUtil.captureLatencyNs(MetricUtil.java:45)
> at
> org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.handleWriteRequestImpl(OzoneManagerRequestHandler.java:395)
> at
> org.apache.hadoop.ozone.protocolPB.RequestHandler.handleWriteRequest(RequestHandler.java:63)
> at
> org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.runCommand(OzoneManagerStateMachine.java:539)
> at
> org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.lambda$1(OzoneManagerStateMachine.java:357)
> at
> java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748) at
> org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.terminate(OzoneManagerStateMachine.java:381)
> at
> org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.processResponse(OzoneManagerStateMachine.java:370)
> at
> java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616)
> at
> java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591)
> at
> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
> at
> java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1609)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748) {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]