Author: cutting
Date: Tue Jul 17 12:56:25 2007
New Revision: 557039

URL: http://svn.apache.org/viewvc?view=rev&rev=557039
Log:
HADOOP-1564. Add unit tests for HDFS block-level checksums. Contributed by Dhruba.
Added:
    lucene/hadoop/trunk/src/test/org/apache/hadoop/dfs/TestCrcCorruption.java
Modified:
    lucene/hadoop/trunk/CHANGES.txt
    lucene/hadoop/trunk/src/test/org/apache/hadoop/dfs/DFSTestUtil.java

Modified: lucene/hadoop/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?view=diff&rev=557039&r1=557038&r2=557039
==============================================================================
--- lucene/hadoop/trunk/CHANGES.txt (original)
+++ lucene/hadoop/trunk/CHANGES.txt Tue Jul 17 12:56:25 2007
@@ -370,6 +370,9 @@
 115. HADOOP-1400. Make JobClient retry requests, so that clients can
      survive jobtracker problems. (omalley via cutting)
 
+116. HADOOP-1564. Add unit tests for HDFS block-level checksums.
+     (Dhruba Borthakur via cutting)
+
 Release 0.13.0 - 2007-06-08

Modified: lucene/hadoop/trunk/src/test/org/apache/hadoop/dfs/DFSTestUtil.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/test/org/apache/hadoop/dfs/DFSTestUtil.java?view=diff&rev=557039&r1=557038&r2=557039
==============================================================================
--- lucene/hadoop/trunk/src/test/org/apache/hadoop/dfs/DFSTestUtil.java (original)
+++ lucene/hadoop/trunk/src/test/org/apache/hadoop/dfs/DFSTestUtil.java Tue Jul 17 12:56:25 2007
@@ -90,11 +90,16 @@
     int getSize() { return size; }
     long getSeed() { return seed; }
   }
+
+  void createFiles(FileSystem fs, String topdir) throws IOException {
+    createFiles(fs, topdir, (short)3);
+  }
 
   /** create nFiles with random names and directory hierarchies
-   * with random (but reproducible) data in them.
+   * with random (but reproducible) data in them. */
-  void createFiles(FileSystem fs, String topdir) throws IOException {
+  void createFiles(FileSystem fs, String topdir,
+                   short replicationFactor) throws IOException {
     files = new MyFile[nFiles];
 
     for (int idx = 0; idx < nFiles; idx++) {
@@ -109,7 +114,7 @@
           throw new IOException("Mkdirs failed to create " +
                                 fPath.getParent().toString());
       }
-      FSDataOutputStream out = fs.create(fPath);
+      FSDataOutputStream out = fs.create(fPath, replicationFactor);
       byte[] toWrite = new byte[files[idx].getSize()];
       Random rb = new Random(files[idx].getSeed());
       rb.nextBytes(toWrite);
@@ -132,7 +137,7 @@
       byte[] toCompare = new byte[files[idx].getSize()];
       Random rb = new Random(files[idx].getSeed());
       rb.nextBytes(toCompare);
-      assertEquals("Cannnot read file.", toRead.length, in.read(toRead));
+      in.readFully(0, toRead);
       in.close();
       for (int i = 0; i < toRead.length; i++) {
         if (toRead[i] != toCompare[i]) {
@@ -144,6 +149,52 @@
     }
 
     return true;
+  }
+
+  void setReplication(FileSystem fs, String topdir, short value)
+                                      throws IOException {
+    Path root = new Path(topdir);
+    for (int idx = 0; idx < nFiles; idx++) {
+      Path fPath = new Path(root, files[idx].getName());
+      fs.setReplication(fPath, value);
+    }
+  }
+
+  // waits for the replication factor of all files to reach the
+  // specified target
+  //
+  void waitReplication(FileSystem fs, String topdir, short value)
+                                      throws IOException {
+    Path root = new Path(topdir);
+
+    /** wait for the replication factor to settle down */
+    while (true) {
+      boolean good = true;
+      for (int idx = 0; idx < nFiles; idx++) {
+        Path fPath = new Path(root, files[idx].getName());
+        String locs[][] = fs.getFileCacheHints(fPath, 0, Long.MAX_VALUE);
+        for (int j = 0; j < locs.length; j++) {
+          String[] loc = locs[j];
+          if (loc.length != value) {
+            System.out.println("File " + fPath + " has replication factor " +
+                               loc.length);
+            good = false;
+            break;
+          }
+        }
+        if (!good) {
+          break;
+        }
+      }
+      if (!good) {
+        try {
+          System.out.println("Waiting for replication factor to drain");
+          Thread.sleep(1000);
+        } catch (InterruptedException e) {}
+        continue;
+      }
+      break;
+    }
   }
 
   /** delete directory and everything underneath it.*/

Added: lucene/hadoop/trunk/src/test/org/apache/hadoop/dfs/TestCrcCorruption.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/test/org/apache/hadoop/dfs/TestCrcCorruption.java?view=auto&rev=557039
==============================================================================
--- lucene/hadoop/trunk/src/test/org/apache/hadoop/dfs/TestCrcCorruption.java (added)
+++ lucene/hadoop/trunk/src/test/org/apache/hadoop/dfs/TestCrcCorruption.java Tue Jul 17 12:56:25 2007
@@ -0,0 +1,225 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.dfs;
+
+import java.io.*;
+import java.nio.channels.FileChannel;
+import java.nio.ByteBuffer;
+import java.util.Random;
+import junit.framework.*;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocalFileSystem;
+import org.apache.hadoop.fs.ChecksumException;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * A JUnit test for corrupted file handling.
+ * This test creates a bunch of files/directories with a replication
+ * factor of 2, then verifies that a client can automatically
+ * access the remaining valid replica in spite of the following
+ * types of simulated errors:
+ *
+ *  1. Delete the meta file on one replica
+ *  2. Truncate the meta file on one replica
+ *  3. Corrupt the meta file header on one replica
+ *  4. Corrupt a random offset and portion of the meta file
+ *  5. Swap two meta files, i.e. the format of the meta files
+ *     is valid but their CRCs do not match their corresponding
+ *     data blocks
+ *
+ * The above tests are run for varied values of io.bytes.per.checksum
+ * and dfs.block.size, which covers the case where a meta file spans
+ * multiple blocks.
+ *
+ * Another portion of the test is commented out till HADOOP-1557
+ * is addressed:
+ *  1. Create a file with 2 replicas, corrupt the meta file of one
+ *     replica, and decrease the replication factor from 2 to 1.
+ *     Validate that the remaining replica is the good one.
+ *  2. Create a file with 2 replicas, corrupt the meta file of one
+ *     replica, and increase the replication factor of the file to 3.
+ *     Verify that the new replica was created from the non-corrupted
+ *     replica.
+ */
+public class TestCrcCorruption extends TestCase {
+
+  public TestCrcCorruption(String testName) {
+    super(testName);
+  }
+
+  protected void setUp() throws Exception {
+  }
+
+  protected void tearDown() throws Exception {
+  }
+
+  /**
+   * check if DFS can handle corrupted CRC blocks
+   */
+  private void thistest(Configuration conf, DFSTestUtil util) throws Exception {
+    MiniDFSCluster cluster = null;
+    int numDataNodes = 2;
+    short replFactor = 2;
+    Random random = new Random();
+
+    try {
+      cluster = new MiniDFSCluster(conf, numDataNodes, true, null);
+      cluster.waitActive();
+      FileSystem fs = cluster.getFileSystem();
+      util.createFiles(fs, "/srcdat", replFactor);
+      util.waitReplication(fs, "/srcdat", (short)2);
+
+      // Now deliberately remove/truncate meta blocks from the first
+      // directory of the first datanode. The complete absence of a meta
+      // file prevents this Datanode from sending data to another datanode.
+      // However, a client is still allowed access to this block.
+      //
+      File data_dir = new File(System.getProperty("test.build.data"),
+                               "dfs/data/data1/current");
+      assertTrue("data directory does not exist", data_dir.exists());
+      File[] blocks = data_dir.listFiles();
+      assertTrue("Blocks do not exist in data-dir", (blocks != null) && (blocks.length > 0));
+      int num = 0;
+      for (int idx = 0; idx < blocks.length; idx++) {
+        if (blocks[idx].getName().startsWith("blk_") &&
+            blocks[idx].getName().endsWith(".meta")) {
+          num++;
+          if (num % 3 == 0) {
+            //
+            // remove .meta file
+            //
+            System.out.println("Deliberately removing file " + blocks[idx].getName());
+            assertTrue("Cannot remove file.", blocks[idx].delete());
+          } else if (num % 3 == 1) {
+            //
+            // shorten .meta file
+            //
+            RandomAccessFile file = new RandomAccessFile(blocks[idx], "rw");
+            FileChannel channel = file.getChannel();
+            int newsize = random.nextInt((int)channel.size()/2);
+            System.out.println("Deliberately truncating file " +
+                               blocks[idx].getName() +
+                               " to size " + newsize + " bytes.");
+            channel.truncate(newsize);
+            file.close();
+          } else {
+            //
+            // corrupt a few bytes of the metafile
+            //
+            RandomAccessFile file = new RandomAccessFile(blocks[idx], "rw");
+            FileChannel channel = file.getChannel();
+            long position = 0;
+            //
+            // The very first time, corrupt the meta header at offset 0
+            //
+            if (num != 2) {
+              position = (long)random.nextInt((int)channel.size());
+            }
+            int length = random.nextInt((int)(channel.size() - position + 1));
+            byte[] buffer = new byte[length];
+            random.nextBytes(buffer);
+            channel.write(ByteBuffer.wrap(buffer), position);
+            System.out.println("Deliberately corrupting file " +
+                               blocks[idx].getName() +
+                               " at offset " + position +
+                               " length " + length);
+            file.close();
+          }
+        }
+      }
+      //
+      // Now deliberately corrupt all meta blocks from the second
+      // directory of the first datanode
+      //
+      data_dir = new File(System.getProperty("test.build.data"),
+                          "dfs/data/data2/current");
+      assertTrue("data directory does not exist", data_dir.exists());
+      blocks = data_dir.listFiles();
+      assertTrue("Blocks do not exist in data-dir", (blocks != null) && (blocks.length > 0));
+
+      int count = 0;
+      File previous = null;
+      for (int idx = 0; idx < blocks.length; idx++) {
+        if (blocks[idx].getName().startsWith("blk_") &&
+            blocks[idx].getName().endsWith(".meta")) {
+          //
+          // Move the previous metafile into the current one.
+          //
+          count++;
+          if (count % 2 == 0) {
+            System.out.println("Deliberately inserting bad crc into files " +
+                               blocks[idx].getName() + " " + previous.getName());
+            assertTrue("Cannot remove file.", blocks[idx].delete());
+            assertTrue("Cannot corrupt meta file.", previous.renameTo(blocks[idx]));
+            assertTrue("Cannot recreate empty meta file.", previous.createNewFile());
+            previous = null;
+          } else {
+            previous = blocks[idx];
+          }
+        }
+      }
+
+      //
+      // Only one replica is possibly corrupted. The other replica should still
+      // be good. Verify.
+      //
+      assertTrue("Corrupted replicas not handled properly.",
+                 util.checkFiles(fs, "/srcdat"));
+      System.out.println("All files still have a valid replica");
+
+      //
+      // Set the replication factor down to 1. This causes only one replica
+      // of each block to remain in HDFS. The check is to make sure that
+      // the corrupted replica generated above is the one that gets deleted.
+      // This test is currently disabled until HADOOP-1557 is solved.
+      //
+      util.setReplication(fs, "/srcdat", (short)1);
+      //util.waitReplication(fs, "/srcdat", (short)1);
+      //System.out.println("All Files done with removing replicas");
+      //assertTrue("Excess replicas deleted. Corrupted replicas found.",
+      //           util.checkFiles(fs, "/srcdat"));
+      System.out.println("The excess-corrupted-replica test is disabled " +
                         "pending HADOOP-1557");
+
+      util.cleanup(fs, "/srcdat");
+    } finally {
+      if (cluster != null) { cluster.shutdown(); }
+    }
+  }
+
+  public void testCrcCorruption() throws Exception {
+    //
+    // default parameters
+    //
+    System.out.println("TestCrcCorruption with default parameters");
+    Configuration conf1 = new Configuration();
+    conf1.setInt("dfs.blockreport.intervalMsec", 3 * 1000);
+    DFSTestUtil util1 = new DFSTestUtil("TestCrcCorruption", 40, 3, 8*1024);
+    thistest(conf1, util1);
+
+    //
+    // specific parameters
+    //
+    System.out.println("TestCrcCorruption with specific parameters");
+    Configuration conf2 = new Configuration();
+    conf2.setInt("io.bytes.per.checksum", 17);
+    conf2.setInt("dfs.block.size", 34);
+    DFSTestUtil util2 = new DFSTestUtil("TestCrcCorruption", 40, 3, 400);
+    thistest(conf2, util2);
+  }
+}
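
For anyone studying this change, the standalone sketch below (not part of revision 557039) isolates the corruption step that thistest() applies to blk_*.meta replicas: it opens a checksum meta file through a FileChannel and overwrites a random slice with random bytes so the stored CRCs no longer match the block data. The class name and the default path are hypothetical, chosen only for illustration; in the test itself the meta files are located under ${test.build.data}/dfs/data/dataN/current.

// Standalone illustration only -- not part of the committed test.
// Mirrors the FileChannel-based corruption performed in thistest().
// The default path below is a hypothetical example.
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.Random;

public class CorruptMetaFileSketch {
  public static void main(String[] args) throws IOException {
    File meta = new File(args.length > 0 ? args[0]
        : "build/test/data/dfs/data/data1/current/blk_1234.meta");
    Random random = new Random();
    RandomAccessFile file = new RandomAccessFile(meta, "rw");
    try {
      FileChannel channel = file.getChannel();
      if (channel.size() == 0) {
        throw new IOException("Meta file is empty, nothing to corrupt: " + meta);
      }
      // Pick a random offset, then overwrite at least one byte from that
      // offset onward with random data so the stored CRCs no longer match
      // the corresponding block data.
      long position = random.nextInt((int)channel.size());
      int length = 1 + random.nextInt((int)(channel.size() - position));
      byte[] buffer = new byte[length];
      random.nextBytes(buffer);
      channel.write(ByteBuffer.wrap(buffer), position);
      System.out.println("Corrupted " + meta + " at offset " + position +
                         " for " + length + " bytes");
    } finally {
      file.close();
    }
  }
}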