Repository: bigtop
Updated Branches:
  refs/heads/master fce161888 -> e7646c67d
BIGTOP-1560. Add a test case for performing block corruption recovery

Signed-off-by: Konstantin Boudnik <[email protected]>

Project: http://git-wip-us.apache.org/repos/asf/bigtop/repo
Commit: http://git-wip-us.apache.org/repos/asf/bigtop/commit/e7646c67
Tree: http://git-wip-us.apache.org/repos/asf/bigtop/tree/e7646c67
Diff: http://git-wip-us.apache.org/repos/asf/bigtop/diff/e7646c67

Branch: refs/heads/master
Commit: e7646c67d18643cd87b20e31f4b2ef68c62b3156
Parents: fce1618
Author: Dasha Boudnik <[email protected]>
Authored: Mon Dec 15 07:57:31 2014 -0800
Committer: Konstantin Boudnik <[email protected]>
Committed: Mon Dec 15 07:57:31 2014 -0800

----------------------------------------------------------------------
 bigtop-tests/test-artifacts/hadoop/pom.xml      |   6 +
 .../itest/hadoop/hdfs/TestBlockRecovery.groovy  | 205 +++++++++++++++++++
 2 files changed, 211 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/bigtop/blob/e7646c67/bigtop-tests/test-artifacts/hadoop/pom.xml
----------------------------------------------------------------------
diff --git a/bigtop-tests/test-artifacts/hadoop/pom.xml b/bigtop-tests/test-artifacts/hadoop/pom.xml
index 748a66c..315edfe 100644
--- a/bigtop-tests/test-artifacts/hadoop/pom.xml
+++ b/bigtop-tests/test-artifacts/hadoop/pom.xml
@@ -44,6 +44,12 @@
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-hdfs</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-hdfs</artifactId>
+      <version>2.4.1</version>
+      <type>test-jar</type>
+    </dependency>
   </dependencies>
 </project>


http://git-wip-us.apache.org/repos/asf/bigtop/blob/e7646c67/bigtop-tests/test-artifacts/hadoop/src/main/groovy/org/apache/bigtop/itest/hadoop/hdfs/TestBlockRecovery.groovy
----------------------------------------------------------------------
diff --git a/bigtop-tests/test-artifacts/hadoop/src/main/groovy/org/apache/bigtop/itest/hadoop/hdfs/TestBlockRecovery.groovy b/bigtop-tests/test-artifacts/hadoop/src/main/groovy/org/apache/bigtop/itest/hadoop/hdfs/TestBlockRecovery.groovy
new file mode 100644
index 0000000..a75f016
--- /dev/null
+++ b/bigtop-tests/test-artifacts/hadoop/src/main/groovy/org/apache/bigtop/itest/hadoop/hdfs/TestBlockRecovery.groovy
@@ -0,0 +1,205 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.bigtop.itest.hadoop.hdfs

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.hdfs.DistributedFileSystem
import org.apache.hadoop.hdfs.HdfsConfiguration
import org.junit.Assume

import static org.junit.Assert.assertNotNull
import static org.junit.Assert.assertTrue;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.apache.bigtop.itest.shell.Shell;
import org.apache.hadoop.hdfs.DFSTestUtil;


/**
 * This test checks block recovery after a block is corrupted.
 * The test must be performed on a cluster with at least
 * three datanodes to allow block recovery.
 * The test must be run as the hdfs user.
 * Block replication must be set to a minimum value of 2
 * for this test to work properly.
 */
public class TestBlockRecovery {

  private static Shell sh = new Shell("/bin/bash");

  private static Configuration conf;

  // USER_DIR must be initialized before fsFilePath, which is derived from it
  private static final String USER_DIR = "/user/hdfs";
  private static final String corruptContent = "0123456789";
  private static final String fsFilePath = USER_DIR + "/file0";
  private static final String grepIP = "grep -o '\\[[^]]*\\]' | " +
      "grep -o '[0-9]*\\.[0-9]*\\.[0-9]*\\.[0-9]*'";
  private static final String localTestDir = "/tmp/test";
  private static final String outputFile = localTestDir + "/fsckOutput.txt";

  private static final int sleepTime = 60 * 1000;
  private static final int TIMEOUT = 5000;

  private static String blockToTest;
  private static String blockLocation;
  private static String blockRecoveryNode;
  private static String cksumError;
  private static String initialBlockChecksum;
  private static String fileContent;
  private static String USERNAME;

  private static def dataDirs = [];
  private static def nodesBeforeRecovery = [];
  private static def nodesAfterRecovery = [];

  private static short numberOfDataNodes;
  private static short repFactor;

  private static final long fileLen = 10;
  private static final long SEED = 0;

  @BeforeClass
  public static void setUp() {
    /* Find the datanode data directories, create a test file with content,
     * set replication so that any chosen datanode is guaranteed to hold the
     * block, then record the block, its on-disk location, and its checksum
     * before corrupting it -- all on the client side.
     */
    conf = new HdfsConfiguration();
    conf.addResource("hdfs-site.xml");
    FileSystem fileSys = DistributedFileSystem.get(conf);
    // dfs.data.dir is the pre-Hadoop-2 name of dfs.datanode.data.dir; accept either
    String dataDirProperty = conf.get("dfs.data.dir") ?: conf.get("dfs.datanode.data.dir");
    assertNotNull("Could not determine datanode data directories", dataDirProperty);
    dataDirs = dataDirProperty.split(",");

    USERNAME = System.getProperty("user.name");
    Assume.assumeTrue(USERNAME == "hdfs");

    numberOfDataNodes = sh.exec("hdfs dfsadmin -report | grep ^Name | wc -l").getOut()[0] as short;
    Assume.assumeTrue(numberOfDataNodes >= 3);

    sh.exec("rm -rf $localTestDir");
    sh.exec("mkdir $localTestDir");
    sh.exec("hadoop fs -rm -r $fsFilePath");
    Thread.sleep(TIMEOUT);
    sh.exec("hadoop fs -mkdir -p $USER_DIR");
    assertTrue("Failed to create input directory", sh.getRet() == 0);

    // replicate to all but one datanode so any node picked later holds the block
    repFactor = (numberOfDataNodes - 1);

    DFSTestUtil.createFile(fileSys, new Path(fsFilePath), fileLen, repFactor, SEED);

    fileContent = sh.exec("hadoop fs -cat $fsFilePath").getOut()[0];

    sh.exec("hdfs fsck $fsFilePath -blocks -locations -files > $outputFile");
    assertTrue("Could not write output to file", sh.getRet() == 0);

    nodesBeforeRecovery = sh.exec("grep -o '\\[[^]]*\\]' $outputFile | " +
        "grep -o '[0-9]*\\.[0-9]*\\.[0-9]*\\.[0-9]*'").getOut();
    assertTrue("Could not obtain datanode addresses", sh.getRet() == 0);

    blockToTest = sh.exec("grep -o 'blk_[0-9]*' $outputFile").getOut()[0];
    assertTrue("Could not obtain block number", sh.getRet() == 0);

    // locate the on-disk replica of the block in one of the data directories
    for (int i = 0; i < dataDirs.length; i++) {
      def dataDir = dataDirs[i]
      blockLocation = sh.exec("find $dataDir -name $blockToTest | grep $dataDir").getOut()[0];
      if (blockLocation != null) break;
    }
    assertNotNull("Could not find specified block", blockLocation);

    initialBlockChecksum = sh.exec("cksum $blockLocation").getOut()[0].split(" ")[0];
    assertTrue("Could not obtain checksum for block $blockToTest", sh.getRet() == 0);
  }

  @AfterClass
  public static void tearDown() {
    // delete the test files
    sh.exec("hadoop fs -rm -r -skipTrash $fsFilePath");
    assertTrue("Could not delete file $fsFilePath", sh.getRet() == 0);
    sh.exec("rm -rf $localTestDir");
    assertTrue("Could not delete test directory $localTestDir", sh.getRet() == 0);
  }

  @Test
  public void testBlockRecovery() {
    // corrupt the block replica on disk
    sh.exec("echo $corruptContent > $blockLocation");
    assertTrue("Could not write to file", sh.getRet() == 0);

    // checksum the replica after corrupting it
    String corruptBlockChecksum = sh.exec("cksum $blockLocation").getOut()[0].split(" ")[0];
    assertTrue("Could not obtain checksum for block $blockToTest", sh.getRet() == 0);

    // trigger block recovery by trying to read the file
    sh.exec("hadoop fs -cat $fsFilePath");

    // wait until reading the file no longer reports a checksum error,
    // which indicates the corrupt replica has been replaced
    for (int j = 0; j < 3; j++) {
      // give the block some time to recover
      sleep(sleepTime);
      // the checksum error, if any, is reported on stderr of the read
      cksumError = sh.exec("hadoop fs -cat $fsFilePath | grep -o 'Checksum error'").getErr().join("\n");
      if (!cksumError.contains("Checksum error")) break;
    }
    assertNotNull("Block has not been successfully triggered for recovery.", cksumError);

    nodesAfterRecovery = sh.exec("hdfs fsck $fsFilePath -blocks -locations -files | $grepIP").getOut();
    assertTrue("Could not obtain datanode addresses", sh.getRet() == 0);

    blockRecoveryNode = (nodesBeforeRecovery.intersect(nodesAfterRecovery))[0];

    if (blockRecoveryNode == null) {
      sleep(sleepTime);

      nodesAfterRecovery = sh.exec("hdfs fsck $fsFilePath -blocks -locations -files | $grepIP").getOut();
      assertTrue("Could not obtain datanode addresses", sh.getRet() == 0);

      blockRecoveryNode = (nodesBeforeRecovery.intersect(nodesAfterRecovery))[0];
      assertNotNull("Block has not been successfully triggered for recovery.", blockRecoveryNode);
    }

    int cksumAttempt = 0;

    boolean success = false;

    // verify the block has recovered; if not, give it a few more tries
    while (cksumAttempt < 3) {
      if (corruptBlockChecksum != initialBlockChecksum) {
        sleep(sleepTime);
        // re-read the checksum of the replica on the recovery node
        corruptBlockChecksum = sh.exec("ssh -o StrictHostKeyChecking=no -i ~/.ssh/id_hdfsuser " +
            "$blockRecoveryNode 'cksum `find ${dataDirs.join(' ')}" +
            " -name $blockToTest 2>/dev/null | grep $blockToTest` '").getOut()[0].split(" ")[0];
        ++cksumAttempt;
      } else {
        // checksum matches the original again; verify the file content is unchanged
        if (sh.exec("hadoop fs -cat $fsFilePath").getOut()[0] == fileContent) {
          assertTrue("Could not read file $fsFilePath", sh.getRet() == 0);
          success = true;
        }
        break;
      }
    }
    assertTrue("Block has not recovered", success);
  }

}
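
For reviewers who want to try the scenario by hand, the commands below are a minimal sketch of what the test automates, using only stock HDFS shell tools. Run them as the hdfs user, on a datanode that holds a replica, on a cluster with at least three datanodes. The path /user/hdfs/file0 matches the test; /data/1/dfs/dn is only a placeholder for whatever dfs.datanode.data.dir is configured on that node.

  # create a small test file replicated to fewer nodes than the cluster has
  hadoop fs -mkdir -p /user/hdfs
  hadoop fs -put /etc/hosts /user/hdfs/file0
  hadoop fs -setrep 2 /user/hdfs/file0

  # find the block id and the on-disk replica on this datanode, then checksum it
  BLOCK=$(hdfs fsck /user/hdfs/file0 -files -blocks -locations | grep -o 'blk_[0-9]*' | head -1)
  REPLICA=$(find /data/1/dfs/dn -name "$BLOCK" 2>/dev/null | head -1)   # placeholder data dir
  cksum "$REPLICA"

  # corrupt the replica and read the file so the client reports a checksum error
  echo 0123456789 > "$REPLICA"
  hadoop fs -cat /user/hdfs/file0 > /dev/null

  # after HDFS re-replicates a healthy copy, the read succeeds again
  sleep 60
  hadoop fs -cat /user/hdfs/file0 > /dev/null
  hdfs fsck /user/hdfs/file0 -files -blocks -locations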
