[ 
https://issues.apache.org/jira/browse/HDFS-13056?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16433401#comment-16433401
 ] 

Hudson commented on HDFS-13056:
-------------------------------

SUCCESS: Integrated in Jenkins build Hadoop-trunk-Commit #13966 (See 
[https://builds.apache.org/job/Hadoop-trunk-Commit/13966/])
HDFS-13056. Expose file-level composite CRCs in HDFS which are (xiao: rev 
7c9cdad6d04c98db5a83e2108219bf6e6c903daf)
* (edit) 
hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/Sender.java
* (edit) 
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileChecksum.java
* (add) 
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/CrcUtil.java
* (edit) 
hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/impl/DfsClientConf.java
* (add) 
hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/BlockChecksumType.java
* (add) 
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestCrcUtil.java
* (add) 
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestCrcComposer.java
* (edit) 
hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/fs/Hdfs.java
* (add) 
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileChecksumCompositeCrc.java
* (add) 
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CompositeCrcFileChecksum.java
* (edit) 
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockChecksumHelper.java
* (edit) 
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/protocolPB/TestPBHelper.java
* (edit) 
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java
* (add) 
hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/BlockChecksumOptions.java
* (edit) 
hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelperClient.java
* (edit) 
hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/FileChecksumHelper.java
* (edit) 
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/DFSTestUtil.java
* (edit) hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
* (edit) 
hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/DataTransferProtocol.java
* (edit) 
hadoop-hdfs-project/hadoop-hdfs-client/src/main/proto/datatransfer.proto
* (edit) 
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/Receiver.java
* (add) 
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapperCompositeCrc.java
* (edit) 
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataXceiver.java
* (edit) 
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/erasurecode/StripedBlockChecksumReconstructor.java
* (edit) 
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Options.java
* (edit) 
hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSClient.java
* (add) 
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/CrcComposer.java
* (edit) 
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/erasurecode/StripedBlockReconstructor.java
* (edit) 
hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/HdfsClientConfigKeys.java
* (edit) 
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DataChecksum.java
* (add) 
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/erasurecode/StripedBlockChecksumCompositeCrcReconstructor.java
* (edit) 
hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java
* (edit) hadoop-hdfs-project/hadoop-hdfs-client/src/main/proto/hdfs.proto
* (add) 
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/erasurecode/StripedBlockChecksumMd5CrcReconstructor.java


> Expose file-level composite CRCs in HDFS which are comparable across 
> different instances/layouts
> ------------------------------------------------------------------------------------------------
>
>                 Key: HDFS-13056
>                 URL: https://issues.apache.org/jira/browse/HDFS-13056
>             Project: Hadoop HDFS
>          Issue Type: New Feature
>          Components: datanode, distcp, erasure-coding, federation, hdfs
>    Affects Versions: 3.0.0
>            Reporter: Dennis Huo
>            Assignee: Dennis Huo
>            Priority: Major
>             Fix For: 3.2.0
>
>         Attachments: HDFS-13056-branch-2.8.001.patch, 
> HDFS-13056-branch-2.8.002.patch, HDFS-13056-branch-2.8.003.patch, 
> HDFS-13056-branch-2.8.004.patch, HDFS-13056-branch-2.8.005.patch, 
> HDFS-13056-branch-2.8.poc1.patch, HDFS-13056.001.patch, HDFS-13056.002.patch, 
> HDFS-13056.003.patch, HDFS-13056.003.patch, HDFS-13056.004.patch, 
> HDFS-13056.005.patch, HDFS-13056.006.patch, HDFS-13056.007.patch, 
> HDFS-13056.008.patch, HDFS-13056.009.patch, HDFS-13056.010.patch, 
> HDFS-13056.011.patch, HDFS-13056.012.patch, HDFS-13056.013.patch, 
> HDFS-13056.014.patch, Reference_only_zhen_PPOC_hadoop2.6.X.diff, 
> hdfs-file-composite-crc32-v1.pdf, hdfs-file-composite-crc32-v2.pdf, 
> hdfs-file-composite-crc32-v3.pdf
>
>
> FileChecksum was first introduced in 
> [https://issues-test.apache.org/jira/browse/HADOOP-3981] and ever since then 
> has remained defined as MD5-of-MD5-of-CRC, where per-512-byte chunk CRCs are 
> already stored as part of datanode metadata, and the MD5 approach is used to 
> compute an aggregate value in a distributed manner, with individual datanodes 
> computing the MD5-of-CRCs per-block in parallel, and the HDFS client 
> computing the second-level MD5.
>  
> A shortcoming of this approach which is often brought up is the fact that 
> this FileChecksum is sensitive to the internal block-size and chunk-size 
> configuration, and thus different HDFS files with different block/chunk 
> settings cannot be compared. More commonly, one might have different HDFS 
> clusters which use different block sizes, in which case any data migration 
> won't be able to use the FileChecksum for distcp's rsync functionality or for 
> verifying end-to-end data integrity (on top of low-level data integrity 
> checks applied at data transfer time).
>  
> This was also revisited in https://issues.apache.org/jira/browse/HDFS-8430 
> during the addition of checksum support for striped erasure-coded files; 
> while there was some discussion of using CRC composability, it still 
> ultimately settled on hierarchical MD5 approach, which also adds the problem 
> that checksums of basic replicated files are not comparable to striped files.
>  
> This feature proposes to add a "COMPOSITE-CRC" FileChecksum type which uses 
> CRC composition to remain completely chunk/block agnostic, and allows 
> comparison between striped vs replicated files, between different HDFS 
> instances, and possible even between HDFS and other external storage systems. 
> This feature can also be added in-place to be compatible with existing block 
> metadata, and doesn't need to change the normal path of chunk verification, 
> so is minimally invasive. This also means even large preexisting HDFS 
> deployments could adopt this feature to retroactively sync data. A detailed 
> design document can be found here: 
> https://storage.googleapis.com/dennishuo/hdfs-file-composite-crc32-v1.pdf



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

---------------------------------------------------------------------
To unsubscribe, e-mail: hdfs-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: hdfs-issues-h...@hadoop.apache.org

Reply via email to