[ https://issues.apache.org/jira/browse/HDDS-7787?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Stephen O'Donnell updated HDDS-7787:
------------------------------------
Parent: HDDS-6462
Issue Type: Sub-task (was: Bug)
> GetChecksum for EC files can fail intermittently with IndexOutOfBounds exception
> --------------------------------------------------------------------------------
>
> Key: HDDS-7787
> URL: https://issues.apache.org/jira/browse/HDDS-7787
> Project: Apache Ozone
> Issue Type: Sub-task
> Components: Ozone Datanode
> Reporter: Varsha Ravi
> Priority: Major
> Labels: pull-request-available
>
> When calculating a checksum for an EC file with Rack Topology enabled, you
> can get the following error intermittently:
> {code}
> ERROR : Failed with exception null
> java.lang.IndexOutOfBoundsException
> at java.nio.ByteBuffer.wrap(ByteBuffer.java:375)
> at org.apache.hadoop.ozone.client.checksum.ECBlockChecksumComputer.computeCompositeCrc(ECBlockChecksumComputer.java:163)
> at org.apache.hadoop.ozone.client.checksum.ECBlockChecksumComputer.compute(ECBlockChecksumComputer.java:65)
> at org.apache.hadoop.ozone.client.checksum.ECFileChecksumHelper.getBlockChecksumFromChunkChecksums(ECFileChecksumHelper.java:148)
> at org.apache.hadoop.ozone.client.checksum.ECFileChecksumHelper.checksumBlock(ECFileChecksumHelper.java:106)
> at org.apache.hadoop.ozone.client.checksum.ECFileChecksumHelper.checksumBlocks(ECFileChecksumHelper.java:73)
> at org.apache.hadoop.ozone.client.checksum.BaseFileChecksumHelper.compute(BaseFileChecksumHelper.java:220)
> at org.apache.hadoop.fs.ozone.OzoneClientUtils.getFileChecksumWithCombineMode(OzoneClientUtils.java:223)
> at org.apache.hadoop.fs.ozone.BasicRootedOzoneClientAdapterImpl.getFileChecksum(BasicRootedOzoneClientAdapterImpl.java:1123)
> at org.apache.hadoop.fs.ozone.BasicRootedOzoneFileSystem.getFileChecksum(BasicRootedOzoneFileSystem.java:955)
> at org.apache.hadoop.fs.FileSystem.getFileChecksum(FileSystem.java:2831)
> at org.apache.hadoop.hive.ql.metadata.Hive.addInsertNonDirectoryInformation(Hive.java:3659)
> at org.apache.hadoop.hive.ql.metadata.Hive.addInsertFileInformation(Hive.java:3632)
> at org.apache.hadoop.hive.ql.metadata.Hive.addWriteNotificationLog(Hive.java:3578)
> at org.apache.hadoop.hive.ql.metadata.Hive.addWriteNotificationLog(Hive.java:3563)
> at org.apache.hadoop.hive.ql.metadata.Hive.loadTable(Hive.java:3224)
> at org.apache.hadoop.hive.ql.exec.MoveTask.execute(MoveTask.java:418)
> at org.apache.hadoop.hive.ql.exec.Task.executeTask(Task.java:213)
> at org.apache.hadoop.hive.ql.exec.TaskRunner.runSequential(TaskRunner.java:105)
> at org.apache.hadoop.hive.ql.Executor.launchTask(Executor.java:357)
> at org.apache.hadoop.hive.ql.Executor.launchTasks(Executor.java:330)
> at org.apache.hadoop.hive.ql.Executor.runTasks(Executor.java:246)
> at org.apache.hadoop.hive.ql.Executor.execute(Executor.java:109)
> at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:769)
> at org.apache.hadoop.hive.ql.Driver.run(Driver.java:504)
> at org.apache.hadoop.hive.ql.Driver.run(Driver.java:498)
> at org.apache.hadoop.hive.ql.reexec.ReExecDriver.run(ReExecDriver.java:166)
> at org.apache.hive.service.cli.operation.SQLOperation.runQuery(SQLOperation.java:226)
> at org.apache.hive.service.cli.operation.SQLOperation.access$700(SQLOperation.java:88)
> at org.apache.hive.service.cli.operation.SQLOperation$BackgroundWork$1.run(SQLOperation.java:327)
> at java.security.AccessController.doPrivileged(Native Method)
> at javax.security.auth.Subject.doAs(Subject.java:422)
> at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1898)
> at org.apache.hive.service.cli.operation.SQLOperation$BackgroundWork.run(SQLOperation.java:345)
> at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
> at java.util.concurrent.FutureTask.run(FutureTask.java:266)
> at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
> at java.util.concurrent.FutureTask.run(FutureTask.java:266)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> ERROR : FAILED: Execution Error, return code 40000 from org.apache.hadoop.hive.ql.exec.MoveTask. java.lang.IndexOutOfBoundsException
> ... (same stack trace as above)
> INFO : Completed executing command(queryId=hive_20221214035652_bc45477d-98df-408e-b945-a63b4ac6896a); Time taken: 22.167 seconds
> INFO : OK
> Error: Error while compiling statement: FAILED: Execution Error, return code 40000 from org.apache.hadoop.hive.ql.exec.MoveTask. java.lang.IndexOutOfBoundsException
> ... (same stack trace as above)
> (state=08S01,code=40000)
> {code}
> This happens because the wrong nodes are sometimes used to obtain the stripe
> checksum: the client does not correctly use the replicaIndex in the pipeline
> to order the nodes.
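> As a minimal sketch of the ordering problem (the names below are hypothetical
> stand-ins, not the actual Ozone client types): the fix amounts to sorting the
> pipeline's nodes by their replicaIndex before reading stripe checksums, rather
> than trusting the order in which the pipeline returns them.
> {code}
> import java.util.ArrayList;
> import java.util.Comparator;
> import java.util.List;
>
> // Minimal sketch, not the actual Ozone client API: order EC block replicas
> // by their replicaIndex from the pipeline, instead of relying on the order
> // in which the pipeline happens to return the nodes.
> public class ReplicaOrderingSketch {
>
>   // Hypothetical stand-in for one datanode holding one EC internal block.
>   record Replica(String datanode, int replicaIndex) {}
>
>   public static void main(String[] args) {
>     // Pipeline order is not guaranteed to match the EC stripe layout; with
>     // rack topology the nodes can come back shuffled, which is what makes
>     // the failure intermittent.
>     List<Replica> replicas = new ArrayList<>(List.of(
>         new Replica("dn3", 3),
>         new Replica("dn1", 1),
>         new Replica("dn2", 2)));
>
>     // Sort by replicaIndex so stripe checksums are read from the node that
>     // actually holds the matching EC index.
>     replicas.sort(Comparator.comparingInt(Replica::replicaIndex));
>
>     replicas.forEach(r ->
>         System.out.println("EC index " + r.replicaIndex() + " -> " + r.datanode()));
>   }
> }
> {code}
> Reading checksum data from a node whose replicaIndex does not match the
> expected EC index would plausibly yield data of an unexpected length, which
> is consistent with the ByteBuffer.wrap IndexOutOfBoundsException above.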