jojochuang commented on code in PR #4369:
URL: https://github.com/apache/hadoop/pull/4369#discussion_r930415320


##########
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestClientProtocolForPipelineRecovery.java:
##########
@@ -761,6 +762,131 @@ public void failPipeline(ReplicaInPipeline replicaInfo,
     }
   }
 
+
+  @Test
+  public void testPipelineRecoveryWithFailedTransferBlock() throws Exception {
+    final int chunkSize = 512;
+    final int oneWriteSize = 5000;
+    final int totalSize = 1024 * 1024;
+    final int errorInjectionPos = 512;
+    Configuration conf = new HdfsConfiguration();
+    // Need 5 datanodes to verify the replaceDatanode during pipeline recovery.
+    final MiniDFSCluster cluster =
+        new MiniDFSCluster.Builder(conf).numDataNodes(5).build();
+    DataNodeFaultInjector old = DataNodeFaultInjector.get();
+
+    try {
+      DistributedFileSystem fs = cluster.getFileSystem();
+      Path fileName = new Path("/f");
+      FSDataOutputStream o = fs.create(fileName);
+      int count = 0;
+      // Flush to get the pipeline created.
+      o.writeBytes("hello");
+      o.hflush();
+      DFSOutputStream dfsO = (DFSOutputStream) o.getWrappedStream();
+      final DatanodeInfo[] pipeline = dfsO.getStreamer().getNodes();
+      final String firstDn = pipeline[0].getXferAddr(false);
+      final String secondDn = pipeline[1].getXferAddr(false);
+      final AtomicBoolean pipelineFailed = new AtomicBoolean(false);
+      final AtomicBoolean transferFailed = new AtomicBoolean(false);
+
+      DataNodeFaultInjector.set(new DataNodeFaultInjector() {
+        @Override
+        public void failPipeline(ReplicaInPipeline replicaInfo,
+            String mirror) throws IOException {
+          if (!secondDn.equals(mirror)) {
+            // Only fail for first DN
+            return;
+          }
+          if (!pipelineFailed.get() &&
+              (replicaInfo.getBytesAcked() > errorInjectionPos) &&
+              (replicaInfo.getBytesAcked() % chunkSize != 0)) {
+            int count = 0;
+            while (count < 10) {
+              // Fail the pipeline (throw an exception) when:
+              // 1. bytesAcked is not at a chunk boundary (checked in the if
+              //    statement above).
+              // 2. bytesOnDisk is bigger than bytesAcked and at least
+              //    reaches (or goes beyond) the end of the chunk that
+              //    bytesAcked is in (checked in the if statement below).
+              // At this condition, the transferBlock that happens during
+              // pipeline recovery would transfer extra bytes to make up to
+              // the end of the chunk. And this is when the block corruption
+              // described in HDFS-4660 would occur.

Review Comment:
   Oh HDFS-4660 brought back my worst nightmare when I spent a month chasing this bug.
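
Editor's note: below is a minimal, standalone sketch of the chunk-boundary arithmetic the injected failure waits for. It assumes only the constants visible in the diff above (chunkSize = 512; the test's oneWriteSize = 5000 is reused purely as an example offset); the class and helper names are hypothetical and there are no Hadoop dependencies. It only illustrates the window the test drives the replica into: bytesAcked stopping mid-chunk while bytesOnDisk has already reached the end of that chunk, which is when transferBlock during pipeline recovery pads the replica and the HDFS-4660 style corruption can occur.

    /** Illustrative sketch only; names are hypothetical, not part of the patch. */
    public class ChunkBoundarySketch {

      // True when bytesAcked does not land exactly on a chunk boundary,
      // i.e. the last acknowledged chunk is only partially acked.
      static boolean ackedMidChunk(long bytesAcked, int chunkSize) {
        return bytesAcked % chunkSize != 0;
      }

      // End offset of the chunk that contains bytesAcked; during pipeline
      // recovery, transferBlock may send data up to this offset.
      static long endOfAckedChunk(long bytesAcked, int chunkSize) {
        return ((bytesAcked / chunkSize) + 1) * chunkSize;
      }

      public static void main(String[] args) {
        final int chunkSize = 512;     // same as the test constant
        final long bytesAcked = 5000;  // example offset (the test's oneWriteSize)
        final long bytesOnDisk = 5120; // hypothetical value at the chunk end

        boolean midChunk = ackedMidChunk(bytesAcked, chunkSize);
        long chunkEnd = endOfAckedChunk(bytesAcked, chunkSize);

        // 5000 % 512 == 392, so the ack is mid-chunk and the chunk ends at 5120.
        System.out.println("acked mid-chunk: " + midChunk);
        System.out.println("end of acked chunk: " + chunkEnd);
        System.out.println("inject failure: " + (midChunk && bytesOnDisk >= chunkEnd));
      }
    }

With bytesAcked = 5000 and chunkSize = 512, the acked offset sits 392 bytes into its chunk and that chunk ends at offset 5120, so the injector's condition would be satisfied once bytesOnDisk reaches 5120.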
########## hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestClientProtocolForPipelineRecovery.java: ########## @@ -761,6 +762,131 @@ public void failPipeline(ReplicaInPipeline replicaInfo, } } + + @Test + public void testPipelineRecoveryWithFailedTransferBlock() throws Exception { + final int chunkSize = 512; + final int oneWriteSize = 5000; + final int totalSize = 1024 * 1024; + final int errorInjectionPos = 512; + Configuration conf = new HdfsConfiguration(); + // Need 5 datanodes to verify the replaceDatanode during pipeline recovery + final MiniDFSCluster cluster = + new MiniDFSCluster.Builder(conf).numDataNodes(5).build(); + DataNodeFaultInjector old = DataNodeFaultInjector.get(); + + try { + DistributedFileSystem fs = cluster.getFileSystem(); + Path fileName = new Path("/f"); + FSDataOutputStream o = fs.create(fileName); + int count = 0; + // Flush to get the pipeline created. + o.writeBytes("hello"); + o.hflush(); + DFSOutputStream dfsO = (DFSOutputStream) o.getWrappedStream(); + final DatanodeInfo[] pipeline = dfsO.getStreamer().getNodes(); + final String firstDn = pipeline[0].getXferAddr(false); + final String secondDn = pipeline[1].getXferAddr(false); + final AtomicBoolean pipelineFailed = new AtomicBoolean(false); + final AtomicBoolean transferFailed = new AtomicBoolean(false); + + DataNodeFaultInjector.set(new DataNodeFaultInjector() { + @Override + public void failPipeline(ReplicaInPipeline replicaInfo, + String mirror) throws IOException { + if (!secondDn.equals(mirror)) { + // Only fail for first DN + return; + } + if (!pipelineFailed.get() && + (replicaInfo.getBytesAcked() > errorInjectionPos) && + (replicaInfo.getBytesAcked() % chunkSize != 0)) { + int count = 0; + while (count < 10) { + // Fail the pipeline (Throw exception) when: + // 1. bytsAcked is not at chunk boundary (checked in the if + // statement above) + // 2. bytesOnDisk is bigger than bytesAcked and at least + // reaches (or go beyond) the end of the chunk that + // bytesAcked is in (checked in the if statement below). + // At this condition, transferBlock that happens during + // pipeline recovery would transfer extra bytes to make up to the + // end of the chunk. And this is when the block corruption + // described in HDFS-4660 would occur. Review Comment: Oh HDFS-4660 brought back my worst nightmare when I spent a month chasing this bug. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org For additional commands, e-mail: common-issues-h...@hadoop.apache.org