Author: arp Date: Wed Apr 9 16:14:35 2014 New Revision: 1586008 URL: http://svn.apache.org/r1586008 Log: HDFS-6160: Merging 1586007 from trunk to branch-2.
Modified: hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestSafeMode.java Modified: hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt?rev=1586008&r1=1586007&r2=1586008&view=diff ============================================================================== --- hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt (original) +++ hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt Wed Apr 9 16:14:35 2014 @@ -74,6 +74,8 @@ Release 2.5.0 - UNRELEASED HDFS-6169. Move the address in WebImageViewer. (Akira Ajisaka via wheat9) + HDFS-6160. TestSafeMode occasionally fails. (Arpit Agarwal) + Release 2.4.1 - UNRELEASED INCOMPATIBLE CHANGES Modified: hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java?rev=1586008&r1=1586007&r2=1586008&view=diff ============================================================================== --- hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java (original) +++ hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java Wed Apr 9 16:14:35 2014 @@ -1017,6 +1017,7 @@ class NameNodeRpcServer implements Namen for(StorageBlockReport r : reports) { final BlockListAsLongs blocks = new BlockListAsLongs(r.getBlocks()); hasStaleStorages = bm.processReport(nodeReg, r.getStorage(), poolId, blocks); + metrics.incrStorageBlockReportOps(); } if (nn.getFSImage().isUpgradeFinalized() && Modified: hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java?rev=1586008&r1=1586007&r2=1586008&view=diff ============================================================================== --- hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java (original) +++ hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java Wed Apr 9 16:14:35 2014 @@ -73,6 +73,8 @@ public class NameNodeMetrics { MutableCounterLong snapshotDiffReportOps; @Metric("Number of blockReceivedAndDeleted calls") MutableCounterLong blockReceivedAndDeletedOps; + @Metric("Number of blockReports from individual storages") + MutableCounterLong storageBlockReportOps; @Metric("Journal transactions") MutableRate transactions; @Metric("Journal syncs") MutableRate syncs; @@ -221,6 +223,10 @@ public class NameNodeMetrics { public void incrBlockReceivedAndDeletedOps() { blockReceivedAndDeletedOps.incr(); } + + public void incrStorageBlockReportOps() { + storageBlockReportOps.incr(); + } public void addTransaction(long latency) { transactions.add(latency); Modified: hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestSafeMode.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestSafeMode.java?rev=1586008&r1=1586007&r2=1586008&view=diff ============================================================================== --- hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestSafeMode.java (original) +++ hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestSafeMode.java Wed Apr 9 16:14:35 2014 @@ -18,6 +18,8 @@ package org.apache.hadoop.hdfs; +import static org.apache.hadoop.test.MetricsAsserts.getLongCounter; +import static org.apache.hadoop.test.MetricsAsserts.getMetrics; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -26,6 +28,8 @@ import static org.junit.Assert.fail; import java.io.IOException; import java.util.List; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; @@ -55,12 +59,14 @@ import com.google.common.collect.Lists; * Tests to verify safe mode correctness. */ public class TestSafeMode { + public static final Log LOG = LogFactory.getLog(TestSafeMode.class); private static final Path TEST_PATH = new Path("/test"); private static final int BLOCK_SIZE = 1024; Configuration conf; MiniDFSCluster cluster; FileSystem fs; DistributedFileSystem dfs; + private static final String NN_METRICS = "NameNodeActivity"; @Before public void startUp() throws IOException { @@ -158,6 +164,7 @@ public class TestSafeMode { */ @Test(timeout=45000) public void testInitializeReplQueuesEarly() throws Exception { + LOG.info("Starting testInitializeReplQueuesEarly"); // Spray the blocks around the cluster when we add DNs instead of // concentrating all blocks on the first node. BlockManagerTestUtil.setWritingPrefersLocalNode( @@ -165,9 +172,11 @@ public class TestSafeMode { cluster.startDataNodes(conf, 2, true, StartupOption.REGULAR, null); cluster.waitActive(); + + LOG.info("Creating files"); DFSTestUtil.createFile(fs, TEST_PATH, 15*BLOCK_SIZE, (short)1, 1L); - + LOG.info("Stopping all DataNodes"); List<DataNodeProperties> dnprops = Lists.newLinkedList(); dnprops.add(cluster.stopDataNode(0)); dnprops.add(cluster.stopDataNode(0)); @@ -176,6 +185,7 @@ public class TestSafeMode { cluster.getConfiguration(0).setFloat( DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY, 1f/15f); + LOG.info("Restarting NameNode"); cluster.restartNameNode(); final NameNode nn = cluster.getNameNode(); @@ -189,27 +199,37 @@ public class TestSafeMode { "until threshold is crossed", NameNodeAdapter.safeModeInitializedReplQueues(nn)); + LOG.info("Restarting one DataNode"); cluster.restartDataNode(dnprops.remove(0)); - // Wait for the block report from the restarted DN to come in. + // Wait for block reports from all attached storages of + // the restarted DN to come in. GenericTestUtils.waitFor(new Supplier<Boolean>() { @Override public Boolean get() { - return NameNodeAdapter.getSafeModeSafeBlocks(nn) > 0; + return getLongCounter("StorageBlockReportOps", getMetrics(NN_METRICS)) == + MiniDFSCluster.DIRS_PER_DATANODE; } }, 10, 10000); - // SafeMode is fine-grain synchronized, so the processMisReplicatedBlocks - // call is still going on at this point - wait until it's done by grabbing - // the lock. - nn.getNamesystem().writeLock(); - nn.getNamesystem().writeUnlock(); - int safe = NameNodeAdapter.getSafeModeSafeBlocks(nn); - assertTrue("Expected first block report to make some but not all blocks " + - "safe. Got: " + safe, safe >= 1 && safe < 15); - BlockManagerTestUtil.updateState(nn.getNamesystem().getBlockManager()); - + + final int safe = NameNodeAdapter.getSafeModeSafeBlocks(nn); + assertTrue("Expected first block report to make some blocks safe.", safe > 0); + assertTrue("Did not expect first block report to make all blocks safe.", safe < 15); + assertTrue(NameNodeAdapter.safeModeInitializedReplQueues(nn)); - assertEquals(15 - safe, nn.getNamesystem().getUnderReplicatedBlocks()); + + // Ensure that UnderReplicatedBlocks goes up to 15 - safe. Misreplicated + // blocks are processed asynchronously so this may take a few seconds. + // Failure here will manifest as a test timeout. + BlockManagerTestUtil.updateState(nn.getNamesystem().getBlockManager()); + long underReplicatedBlocks = nn.getNamesystem().getUnderReplicatedBlocks(); + while (underReplicatedBlocks != (15 - safe)) { + LOG.info("UnderReplicatedBlocks expected=" + (15 - safe) + + ", actual=" + underReplicatedBlocks); + Thread.sleep(100); + BlockManagerTestUtil.updateState(nn.getNamesystem().getBlockManager()); + underReplicatedBlocks = nn.getNamesystem().getUnderReplicatedBlocks(); + } cluster.restartDataNodes(); }