[
https://issues.apache.org/jira/browse/HDFS-17179?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Bryan Beaudreault updated HDFS-17179:
-------------------------------------
Description:
We've been running into some data corruption issues recently. When a
ChecksumException is thrown, DFSInputStream correctly reports the block to the
NameNode, which triggers deletion and re-replication of the replica. It's also
possible that we fail to even read the meta header needed to construct the
checksum. That failure is thrown as CorruptMetaHeaderException, which is not
handled by DFSInputStream. We should handle it the same way as
ChecksumException, i.e. report the block as corrupt to the NameNode. See the
stack trace:
{code:java}
WARN org.apache.hadoop.hdfs.client.impl.BlockReaderFactory:
BlockReaderFactory(fileName=/hbase/data/default/table-c1/5a76502c2c7be37b2d92057baa8a3d81/0/24ddc16e2d824a3bb9bf242ad950a589,
block=BP-154245500-xxxxxx-1657570070866:blk_1362550389_288818622): error creating
ShortCircuitReplica.
org.apache.hadoop.hdfs.server.datanode.CorruptMetaHeaderException: The block
meta file header is corrupt
at
org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader.preadHeader(BlockMetadataHeader.java:133)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.shortcircuit.ShortCircuitReplica.<init>(ShortCircuitReplica.java:129)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.requestFileDescriptors(BlockReaderFactory.java:618)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.createShortCircuitReplicaInfo(BlockReaderFactory.java:545)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.shortcircuit.ShortCircuitCache.create(ShortCircuitCache.java:786)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.shortcircuit.ShortCircuitCache.fetchOrCreate(ShortCircuitCache.java:723)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getBlockReaderLocal(BlockReaderFactory.java:483)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:360)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:715)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.DFSInputStream.actualGetFromOneDataNode(DFSInputStream.java:1160)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.DFSInputStream$2.call(DFSInputStream.java:1132)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.DFSInputStream$2.call(DFSInputStream.java:1128)
~[hadoop-hdfs-client-3.3.1.jar:?]
at java.util.concurrent.FutureTask.run(FutureTask.java:264) ~[?:?]
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515) ~[?:?]
at java.util.concurrent.FutureTask.run(FutureTask.java:264) ~[?:?]
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
~[?:?]
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
~[?:?]
at java.lang.Thread.run(Thread.java:829) ~[?:?]
Caused by: org.apache.hadoop.util.InvalidChecksumSizeException: The value -75
does not map to a valid checksum Type
at
org.apache.hadoop.util.DataChecksum.mapByteToChecksumType(DataChecksum.java:190)
~[hadoop-common-3.3.1.jar:?]
at
org.apache.hadoop.util.DataChecksum.newDataChecksum(DataChecksum.java:159)
~[hadoop-common-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader.preadHeader(BlockMetadataHeader.java:131)
~[hadoop-hdfs-client-3.3.1.jar:?]
WARN org.apache.hadoop.hdfs.shortcircuit.ShortCircuitCache:
ShortCircuitCache(0x4da4703d): failed to load
1362550389_BP-154245500-xxxxxxx-1657570070866{code}
was:
We've been running into some data corruption issues recently. When a
ChecksumException is thrown, DFSInputStream correctly reports the block to the
NameNode which triggers deletion and re-replication of the replica. It's also
possible that we fail to even read the meta header for constructing the
checksum. This gets thrown as CorruptMetaHeaderException which is not handled
by DFSInputStream. We should handle this similarly to ChecksumException. See
stacktrace:
{code:java}
org.apache.hadoop.hdfs.server.datanode.CorruptMetaHeaderException: The block
meta file header is corrupt
at
org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader.preadHeader(BlockMetadataHeader.java:133)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.shortcircuit.ShortCircuitReplica.<init>(ShortCircuitReplica.java:129)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.requestFileDescriptors(BlockReaderFactory.java:618)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.createShortCircuitReplicaInfo(BlockReaderFactory.java:545)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.shortcircuit.ShortCircuitCache.create(ShortCircuitCache.java:786)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.shortcircuit.ShortCircuitCache.fetchOrCreate(ShortCircuitCache.java:723)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getBlockReaderLocal(BlockReaderFactory.java:483)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:360)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:715)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.DFSInputStream.actualGetFromOneDataNode(DFSInputStream.java:1160)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.DFSInputStream$2.call(DFSInputStream.java:1132)
~[hadoop-hdfs-client-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.DFSInputStream$2.call(DFSInputStream.java:1128)
~[hadoop-hdfs-client-3.3.1.jar:?]
at java.util.concurrent.FutureTask.run(FutureTask.java:264) ~[?:?]
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515) ~[?:?]
at java.util.concurrent.FutureTask.run(FutureTask.java:264) ~[?:?]
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
~[?:?]
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
~[?:?]
at java.lang.Thread.run(Thread.java:829) ~[?:?]
Caused by: org.apache.hadoop.util.InvalidChecksumSizeException: The value -75
does not map to a valid checksum Type
at
org.apache.hadoop.util.DataChecksum.mapByteToChecksumType(DataChecksum.java:190)
~[hadoop-common-3.3.1.jar:?]
at
org.apache.hadoop.util.DataChecksum.newDataChecksum(DataChecksum.java:159)
~[hadoop-common-3.3.1.jar:?]
at
org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader.preadHeader(BlockMetadataHeader.java:131)
~[hadoop-hdfs-client-3.3.1.jar:?] {code}
> DFSInputStream should report CorruptMetaHeaderException as corruptBlock to
> NameNode
> -----------------------------------------------------------------------------------
>
> Key: HDFS-17179
> URL: https://issues.apache.org/jira/browse/HDFS-17179
> Project: Hadoop HDFS
> Issue Type: Improvement
> Reporter: Bryan Beaudreault
> Assignee: Bryan Beaudreault
> Priority: Major
>
> We've been running into some data corruption issues recently. When a
> ChecksumException is thrown, DFSInputStream correctly reports the block to
> the NameNode, which triggers deletion and re-replication of the replica. It's
> also possible that we fail to even read the meta header needed to construct
> the checksum. That failure is thrown as CorruptMetaHeaderException, which is
> not handled by DFSInputStream. We should handle it the same way as
> ChecksumException, i.e. report the block as corrupt to the NameNode. See the
> stack trace:
>
> {code:java}
> WARN org.apache.hadoop.hdfs.client.impl.BlockReaderFactory:
> BlockReaderFactory(fileName=/hbase/data/default/table-c1/5a76502c2c7be37b2d92057baa8a3d81/0/24ddc16e2d824a3bb9bf242ad950a589,
> block=BP-154245500-xxxxxx-1657570070866:blk_1362550389_288818622): error creating
> ShortCircuitReplica.
> org.apache.hadoop.hdfs.server.datanode.CorruptMetaHeaderException: The block
> meta file header is corrupt
> at
> org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader.preadHeader(BlockMetadataHeader.java:133)
> ~[hadoop-hdfs-client-3.3.1.jar:?]
> at
> org.apache.hadoop.hdfs.shortcircuit.ShortCircuitReplica.<init>(ShortCircuitReplica.java:129)
> ~[hadoop-hdfs-client-3.3.1.jar:?]
> at
> org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.requestFileDescriptors(BlockReaderFactory.java:618)
> ~[hadoop-hdfs-client-3.3.1.jar:?]
> at
> org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.createShortCircuitReplicaInfo(BlockReaderFactory.java:545)
> ~[hadoop-hdfs-client-3.3.1.jar:?]
> at
> org.apache.hadoop.hdfs.shortcircuit.ShortCircuitCache.create(ShortCircuitCache.java:786)
> ~[hadoop-hdfs-client-3.3.1.jar:?]
> at
> org.apache.hadoop.hdfs.shortcircuit.ShortCircuitCache.fetchOrCreate(ShortCircuitCache.java:723)
> ~[hadoop-hdfs-client-3.3.1.jar:?]
> at
> org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getBlockReaderLocal(BlockReaderFactory.java:483)
> ~[hadoop-hdfs-client-3.3.1.jar:?]
> at
> org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:360)
> ~[hadoop-hdfs-client-3.3.1.jar:?]
> at
> org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:715)
> ~[hadoop-hdfs-client-3.3.1.jar:?]
> at
> org.apache.hadoop.hdfs.DFSInputStream.actualGetFromOneDataNode(DFSInputStream.java:1160)
> ~[hadoop-hdfs-client-3.3.1.jar:?]
> at
> org.apache.hadoop.hdfs.DFSInputStream$2.call(DFSInputStream.java:1132)
> ~[hadoop-hdfs-client-3.3.1.jar:?]
> at
> org.apache.hadoop.hdfs.DFSInputStream$2.call(DFSInputStream.java:1128)
> ~[hadoop-hdfs-client-3.3.1.jar:?]
> at java.util.concurrent.FutureTask.run(FutureTask.java:264) ~[?:?]
> at
> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515) ~[?:?]
> at java.util.concurrent.FutureTask.run(FutureTask.java:264) ~[?:?]
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
> ~[?:?]
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
> ~[?:?]
> at java.lang.Thread.run(Thread.java:829) ~[?:?]
> Caused by: org.apache.hadoop.util.InvalidChecksumSizeException: The value -75
> does not map to a valid checksum Type
> at
> org.apache.hadoop.util.DataChecksum.mapByteToChecksumType(DataChecksum.java:190)
> ~[hadoop-common-3.3.1.jar:?]
> at
> org.apache.hadoop.util.DataChecksum.newDataChecksum(DataChecksum.java:159)
> ~[hadoop-common-3.3.1.jar:?]
> at
> org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader.preadHeader(BlockMetadataHeader.java:131)
> ~[hadoop-hdfs-client-3.3.1.jar:?]
> WARN org.apache.hadoop.hdfs.shortcircuit.ShortCircuitCache:
> ShortCircuitCache(0x4da4703d): failed to load
> 1362550389_BP-154245500-xxxxxxx-1657570070866{code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]