Jyotirmoy Sinha created HDDS-8345:
-------------------------------------

             Summary: [snapshot] OM process crash on restart due to Snapshot 
Chain corruption
                 Key: HDDS-8345
                 URL: https://issues.apache.org/jira/browse/HDDS-8345
             Project: Apache Ozone
          Issue Type: Bug
          Components: Ozone Manager
            Reporter: Jyotirmoy Sinha


Scenario - Create 13k+ snapshots on a cluster and then restart the Ozone 
services.

The OM process crashed with a Snapshot Chain corruption exception because it is 
not able to find any of the SST files.

Stacktrace - 
{code:java}
2023-03-31 07:43:33,133 INFO 
org.apache.ozone.rocksdiff.RocksDBCheckpointDiffer: Can't find SST '097467'
java.io.FileNotFoundException: Can't find SST file: 097467.sst
        at 
org.apache.ozone.rocksdiff.RocksDBCheckpointDiffer.getAbsoluteSstFilePath(RocksDBCheckpointDiffer.java:562)
        at 
org.apache.ozone.rocksdiff.RocksDBCheckpointDiffer.getSSTFileSummary(RocksDBCheckpointDiffer.java:541)
        at 
org.apache.ozone.rocksdiff.RocksDBCheckpointDiffer.addNodeToDAG(RocksDBCheckpointDiffer.java:989)
        at 
org.apache.ozone.rocksdiff.RocksDBCheckpointDiffer.lambda$populateCompactionDAG$2(RocksDBCheckpointDiffer.java:1020)
        at 
java.util.concurrent.ConcurrentHashMap.computeIfAbsent(ConcurrentHashMap.java:1688)
        at 
org.apache.ozone.rocksdiff.RocksDBCheckpointDiffer.populateCompactionDAG(RocksDBCheckpointDiffer.java:1019)
        at 
org.apache.ozone.rocksdiff.RocksDBCheckpointDiffer.processCompactionLogLine(RocksDBCheckpointDiffer.java:678)
        at java.util.Iterator.forEachRemaining(Iterator.java:116)
        at 
java.util.Spliterators$IteratorSpliterator.forEachRemaining(Spliterators.java:1801)
        at 
java.util.stream.ReferencePipeline$Head.forEach(ReferencePipeline.java:647)
        at 
org.apache.ozone.rocksdiff.RocksDBCheckpointDiffer.readCompactionLogToDAG(RocksDBCheckpointDiffer.java:692)
        at 
org.apache.ozone.rocksdiff.RocksDBCheckpointDiffer.loadAllCompactionLogs(RocksDBCheckpointDiffer.java:713)
        at org.apache.hadoop.hdds.utils.db.RDBStore.<init>(RDBStore.java:167)
        at 
org.apache.hadoop.hdds.utils.db.DBStoreBuilder.build(DBStoreBuilder.java:219)
        at 
org.apache.hadoop.ozone.om.OmMetadataManagerImpl.loadDB(OmMetadataManagerImpl.java:507)
        at 
org.apache.hadoop.ozone.om.OmMetadataManagerImpl.loadDB(OmMetadataManagerImpl.java:486)
        at 
org.apache.hadoop.ozone.om.OmMetadataManagerImpl.start(OmMetadataManagerImpl.java:476)
        at 
org.apache.hadoop.ozone.om.OmMetadataManagerImpl.<init>(OmMetadataManagerImpl.java:320)
        at 
org.apache.hadoop.ozone.om.OzoneManager.instantiateServices(OzoneManager.java:747)
        at org.apache.hadoop.ozone.om.OzoneManager.<init>(OzoneManager.java:627)
        at 
org.apache.hadoop.ozone.om.OzoneManager.createOm(OzoneManager.java:712)
        at 
org.apache.hadoop.ozone.om.OzoneManagerStarter$OMStarterHelper.start(OzoneManagerStarter.java:189)
        at 
org.apache.hadoop.ozone.om.OzoneManagerStarter.startOm(OzoneManagerStarter.java:86)
        at 
org.apache.hadoop.ozone.om.OzoneManagerStarter.call(OzoneManagerStarter.java:74)
        at org.apache.hadoop.hdds.cli.GenericCli.call(GenericCli.java:38)
        at picocli.CommandLine.executeUserObject(CommandLine.java:1953)
        at picocli.CommandLine.access$1300(CommandLine.java:145)
        at 
picocli.CommandLine$RunLast.executeUserObjectOfLastSubcommandWithSameParent(CommandLine.java:2352)
        at picocli.CommandLine$RunLast.handle(CommandLine.java:2346)
        at picocli.CommandLine$RunLast.handle(CommandLine.java:2311)
        at 
picocli.CommandLine$AbstractParseResultHandler.execute(CommandLine.java:2179)
        at picocli.CommandLine.execute(CommandLine.java:2078)
        at org.apache.hadoop.hdds.cli.GenericCli.execute(GenericCli.java:100)
        at org.apache.hadoop.hdds.cli.GenericCli.run(GenericCli.java:91)
        at 
org.apache.hadoop.ozone.om.OzoneManagerStarter.main(OzoneManagerStarter.java:58)
2023-03-31 07:43:33,133 INFO 
org.apache.ozone.rocksdiff.RocksDBCheckpointDiffer: Can't find SST '097484'
java.io.FileNotFoundException: Can't find SST file: 097484.sst
        at 
org.apache.ozone.rocksdiff.RocksDBCheckpointDiffer.getAbsoluteSstFilePath(RocksDBCheckpointDiffer.java:562)
        at 
org.apache.ozone.rocksdiff.RocksDBCheckpointDiffer.getSSTFileSummary(RocksDBCheckpointDiffer.java:541)
        at 
org.apache.ozone.rocksdiff.RocksDBCheckpointDiffer.addNodeToDAG(RocksDBCheckpointDiffer.java:989)
        at 
org.apache.ozone.rocksdiff.RocksDBCheckpointDiffer.lambda$populateCompactionDAG$2(RocksDBCheckpointDiffer.java:1020)
        at 
java.util.concurrent.ConcurrentHashMap.computeIfAbsent(ConcurrentHashMap.java:1660)
        at 
org.apache.ozone.rocksdiff.RocksDBCheckpointDiffer.populateCompactionDAG(RocksDBCheckpointDiffer.java:1019)
        at 
org.apache.ozone.rocksdiff.RocksDBCheckpointDiffer.processCompactionLogLine(RocksDBCheckpointDiffer.java:678)
        at java.util.Iterator.forEachRemaining(Iterator.java:116)
        at 
java.util.Spliterators$IteratorSpliterator.forEachRemaining(Spliterators.java:1801)
        at 
java.util.stream.ReferencePipeline$Head.forEach(ReferencePipeline.java:647)
        at 
org.apache.ozone.rocksdiff.RocksDBCheckpointDiffer.readCompactionLogToDAG(RocksDBCheckpointDiffer.java:692)
        at 
org.apache.ozone.rocksdiff.RocksDBCheckpointDiffer.loadAllCompactionLogs(RocksDBCheckpointDiffer.java:713)
        at org.apache.hadoop.hdds.utils.db.RDBStore.<init>(RDBStore.java:167)
        at 
org.apache.hadoop.hdds.utils.db.DBStoreBuilder.build(DBStoreBuilder.java:219)
        at 
org.apache.hadoop.ozone.om.OmMetadataManagerImpl.loadDB(OmMetadataManagerImpl.java:507)
        at 
org.apache.hadoop.ozone.om.OmMetadataManagerImpl.loadDB(OmMetadataManagerImpl.java:486)
        at 
org.apache.hadoop.ozone.om.OmMetadataManagerImpl.start(OmMetadataManagerImpl.java:476)
        at 
org.apache.hadoop.ozone.om.OmMetadataManagerImpl.<init>(OmMetadataManagerImpl.java:320)
        at 
org.apache.hadoop.ozone.om.OzoneManager.instantiateServices(OzoneManager.java:747)
        at org.apache.hadoop.ozone.om.OzoneManager.<init>(OzoneManager.java:627)
        at 
org.apache.hadoop.ozone.om.OzoneManager.createOm(OzoneManager.java:712)
        at 
org.apache.hadoop.ozone.om.OzoneManagerStarter$OMStarterHelper.start(OzoneManagerStarter.java:189)
        at 
org.apache.hadoop.ozone.om.OzoneManagerStarter.startOm(OzoneManagerStarter.java:86)
        at 
org.apache.hadoop.ozone.om.OzoneManagerStarter.call(OzoneManagerStarter.java:74)
        at org.apache.hadoop.hdds.cli.GenericCli.call(GenericCli.java:38)
        at picocli.CommandLine.executeUserObject(CommandLine.java:1953)
        at picocli.CommandLine.access$1300(CommandLine.java:145)
        at 
picocli.CommandLine$RunLast.executeUserObjectOfLastSubcommandWithSameParent(CommandLine.java:2352)
        at picocli.CommandLine$RunLast.handle(CommandLine.java:2346)
        at picocli.CommandLine$RunLast.handle(CommandLine.java:2311)
        at 
picocli.CommandLine$AbstractParseResultHandler.execute(CommandLine.java:2179)
        at picocli.CommandLine.execute(CommandLine.java:2078)
        at org.apache.hadoop.hdds.cli.GenericCli.execute(GenericCli.java:100)
        at org.apache.hadoop.hdds.cli.GenericCli.run(GenericCli.java:91)
        at 
org.apache.hadoop.ozone.om.OzoneManagerStarter.main(OzoneManagerStarter.java:58)
2023-03-31 07:43:34,951 ERROR org.apache.hadoop.ozone.om.OzoneManagerStarter: 
OM start failed with exception
java.io.IOException: Snapshot Chain corruption:  previous snapshotID given but 
no associated snapshot found in snapshot chain: SnapshotID 
1197e0c1-99d1-43b9-9b33-424a6c09b35a
        at 
org.apache.hadoop.ozone.om.SnapshotChainManager.addSnapshotGlobal(SnapshotChainManager.java:86)
        at 
org.apache.hadoop.ozone.om.SnapshotChainManager.addSnapshot(SnapshotChainManager.java:288)
        at 
org.apache.hadoop.ozone.om.SnapshotChainManager.loadFromSnapshotInfoTable(SnapshotChainManager.java:279)
        at 
org.apache.hadoop.ozone.om.SnapshotChainManager.<init>(SnapshotChainManager.java:63)
        at 
org.apache.hadoop.ozone.om.OmMetadataManagerImpl.start(OmMetadataManagerImpl.java:481)
        at 
org.apache.hadoop.ozone.om.OmMetadataManagerImpl.<init>(OmMetadataManagerImpl.java:320)
        at 
org.apache.hadoop.ozone.om.OzoneManager.instantiateServices(OzoneManager.java:747)
        at org.apache.hadoop.ozone.om.OzoneManager.<init>(OzoneManager.java:627)
        at 
org.apache.hadoop.ozone.om.OzoneManager.createOm(OzoneManager.java:712)
        at 
org.apache.hadoop.ozone.om.OzoneManagerStarter$OMStarterHelper.start(OzoneManagerStarter.java:189)
        at 
org.apache.hadoop.ozone.om.OzoneManagerStarter.startOm(OzoneManagerStarter.java:86)
        at 
org.apache.hadoop.ozone.om.OzoneManagerStarter.call(OzoneManagerStarter.java:74)
        at org.apache.hadoop.hdds.cli.GenericCli.call(GenericCli.java:38)
        at picocli.CommandLine.executeUserObject(CommandLine.java:1953)
        at picocli.CommandLine.access$1300(CommandLine.java:145)
        at 
picocli.CommandLine$RunLast.executeUserObjectOfLastSubcommandWithSameParent(CommandLine.java:2352)
        at picocli.CommandLine$RunLast.handle(CommandLine.java:2346)
        at picocli.CommandLine$RunLast.handle(CommandLine.java:2311)
        at 
picocli.CommandLine$AbstractParseResultHandler.execute(CommandLine.java:2179)
        at picocli.CommandLine.execute(CommandLine.java:2078)
        at org.apache.hadoop.hdds.cli.GenericCli.execute(GenericCli.java:100)
        at org.apache.hadoop.hdds.cli.GenericCli.run(GenericCli.java:91)
        at 
org.apache.hadoop.ozone.om.OzoneManagerStarter.main(OzoneManagerStarter.java:58)
2023-03-31 07:43:34,955 INFO org.apache.hadoop.ozone.om.OzoneManagerStarter: 
SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down OzoneManager at 
jspriv02-8.jspriv02.root.hwx.site/172.27.115.2
************************************************************/  {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to