Repository: hadoop
Updated Branches:
  refs/heads/branch-2 a1b8b9ca5 -> b70c0c295
HDFS-6425. Large postponedMisreplicatedBlocks has impact on blockReport
latency. Contributed by Ming Ma.
(cherry picked from commit b7923a356e9f111619375b94d12749d634069347)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/b70c0c29
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/b70c0c29
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/b70c0c29

Branch: refs/heads/branch-2
Commit: b70c0c295cab04073d7889b6d7c893c29ba5b7ba
Parents: a1b8b9c
Author: Kihwal Lee <kih...@apache.org>
Authored: Tue Dec 16 10:31:57 2014 -0600
Committer: Kihwal Lee <kih...@apache.org>
Committed: Tue Dec 16 10:31:57 2014 -0600

----------------------------------------------------------------------
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt      |   3 +
 .../org/apache/hadoop/hdfs/DFSConfigKeys.java    |   4 +
 .../server/blockmanagement/BlockManager.java     | 113 +++++++++++--------
 .../server/blockmanagement/DatanodeManager.java  |  14 ++-
 .../src/main/resources/hdfs-default.xml          |   8 ++
 .../blockmanagement/BlockManagerTestUtil.java    |   8 ++
 .../hdfs/server/namenode/ha/TestDNFencing.java   |  19 +++-
 .../ha/TestDNFencingWithReplication.java         |   4 +
 8 files changed, 124 insertions(+), 49 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/b70c0c29/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 49188a8..8a2c115 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -338,6 +338,9 @@ Release 2.7.0 - UNRELEASED
 
     HDFS-7516. Fix findbugs warnings in hdfs-nfs project. (brandonli)
 
+    HDFS-6425. Large postponedMisreplicatedBlocks has impact on blockReport
+    latency. (Ming Ma via kihwal)
+
 Release 2.6.1 - UNRELEASED
 
   INCOMPATIBLE CHANGES

http://git-wip-us.apache.org/repos/asf/hadoop/blob/b70c0c29/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
index 590ba2a..5305d2c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
@@ -326,6 +326,10 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
   public static final String  DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_KEY = "dfs.namenode.write.stale.datanode.ratio";
   public static final float   DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_DEFAULT = 0.5f;
 
+  // Number of blocks to rescan for each iteration of postponedMisreplicatedBlocks.
+  public static final String  DFS_NAMENODE_BLOCKS_PER_POSTPONEDBLOCKS_RESCAN_KEY = "dfs.namenode.blocks.per.postponedblocks.rescan";
+  public static final long    DFS_NAMENODE_BLOCKS_PER_POSTPONEDBLOCKS_RESCAN_KEY_DEFAULT = 10000;
+
   // Replication monitoring related keys
   public static final String DFS_NAMENODE_INVALIDATE_WORK_PCT_PER_ITERATION =
       "dfs.namenode.invalidate.work.pct.per.iteration";

http://git-wip-us.apache.org/repos/asf/hadoop/blob/b70c0c29/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
index 1ff9674..23ba10a 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
@@ -1050,21 +1050,6 @@ public class BlockManager {
 
     node.resetBlocks();
     invalidateBlocks.remove(node);
-
-    // If the DN hasn't block-reported since the most recent
-    // failover, then we may have been holding up on processing
-    // over-replicated blocks because of it. But we can now
-    // process those blocks.
-    boolean stale = false;
-    for(DatanodeStorageInfo storage : node.getStorageInfos()) {
-      if (storage.areBlockContentsStale()) {
-        stale = true;
-        break;
-      }
-    }
-    if (stale) {
-      rescanPostponedMisreplicatedBlocks();
-    }
   }
 
   /** Remove the blocks associated to the given DatanodeStorageInfo. */
@@ -1821,17 +1806,7 @@ public class BlockManager {
         invalidatedBlocks = processReport(storageInfo, newReport);
       }
 
-      // Now that we have an up-to-date block report, we know that any
-      // deletions from a previous NN iteration have been accounted for.
-      boolean staleBefore = storageInfo.areBlockContentsStale();
       storageInfo.receivedBlockReport();
-      if (staleBefore && !storageInfo.areBlockContentsStale()) {
-        LOG.info("BLOCK* processReport: Received first block report from "
-            + storage + " after starting up or becoming active. Its block "
-            + "contents are no longer considered stale");
-        rescanPostponedMisreplicatedBlocks();
-      }
-
     } finally {
       endTime = Time.now();
       namesystem.writeUnlock();
@@ -1860,31 +1835,74 @@ public class BlockManager {
   /**
    * Rescan the list of blocks which were previously postponed.
    */
-  private void rescanPostponedMisreplicatedBlocks() {
-    for (Iterator<Block> it = postponedMisreplicatedBlocks.iterator();
-         it.hasNext();) {
-      Block b = it.next();
-
-      BlockInfo bi = blocksMap.getStoredBlock(b);
-      if (bi == null) {
-        if (LOG.isDebugEnabled()) {
-          LOG.debug("BLOCK* rescanPostponedMisreplicatedBlocks: " +
-              "Postponed mis-replicated block " + b + " no longer found " +
-              "in block map.");
+  void rescanPostponedMisreplicatedBlocks() {
+    if (getPostponedMisreplicatedBlocksCount() == 0) {
+      return;
+    }
+    long startTimeRescanPostponedMisReplicatedBlocks = Time.now();
+    long startPostponedMisReplicatedBlocksCount =
+        getPostponedMisreplicatedBlocksCount();
+    namesystem.writeLock();
+    try {
+      // blocksPerRescan is the configured number of blocks per rescan.
+      // Randomly select blocksPerRescan consecutive blocks from the HashSet
+      // when the number of blocks remaining is larger than blocksPerRescan.
+      // The reason we don't always pick the first blocksPerRescan blocks is to
+      // handle the case if for some reason some datanodes remain in
+      // content stale state for a long time and only impact the first
+      // blocksPerRescan blocks.
+      int i = 0;
+      long startIndex = 0;
+      long blocksPerRescan =
+        datanodeManager.getBlocksPerPostponedMisreplicatedBlocksRescan();
+      long base = getPostponedMisreplicatedBlocksCount() - blocksPerRescan;
+      if (base > 0) {
+        startIndex = DFSUtil.getRandom().nextLong() % (base+1);
+        if (startIndex < 0) {
+          startIndex += (base+1);
         }
-        it.remove();
-        postponedMisreplicatedBlocksCount.decrementAndGet();
-        continue;
+      }
-      MisReplicationResult res = processMisReplicatedBlock(bi);
-      if (LOG.isDebugEnabled()) {
-        LOG.debug("BLOCK* rescanPostponedMisreplicatedBlocks: " +
-            "Re-scanned block " + b + ", result is " + res);
+      Iterator<Block> it = postponedMisreplicatedBlocks.iterator();
+      for (int tmp = 0; tmp < startIndex; tmp++) {
+        it.next();
       }
-      if (res != MisReplicationResult.POSTPONE) {
-        it.remove();
-        postponedMisreplicatedBlocksCount.decrementAndGet();
+
+      for (;it.hasNext(); i++) {
+        Block b = it.next();
+        if (i >= blocksPerRescan) {
+          break;
+        }
+
+        BlockInfo bi = blocksMap.getStoredBlock(b);
+        if (bi == null) {
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("BLOCK* rescanPostponedMisreplicatedBlocks: " +
+                "Postponed mis-replicated block " + b + " no longer found " +
+                "in block map.");
+          }
+          it.remove();
+          postponedMisreplicatedBlocksCount.decrementAndGet();
+          continue;
+        }
+        MisReplicationResult res = processMisReplicatedBlock(bi);
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("BLOCK* rescanPostponedMisreplicatedBlocks: " +
+              "Re-scanned block " + b + ", result is " + res);
+        }
+        if (res != MisReplicationResult.POSTPONE) {
+          it.remove();
+          postponedMisreplicatedBlocksCount.decrementAndGet();
+        }
       }
+    } finally {
+      namesystem.writeUnlock();
+      long endPostponedMisReplicatedBlocksCount =
+          getPostponedMisreplicatedBlocksCount();
+      LOG.info("Rescan of postponedMisreplicatedBlocks completed in " +
+          (Time.now() - startTimeRescanPostponedMisReplicatedBlocks) +
+          " msecs. " + endPostponedMisReplicatedBlocksCount +
+          " blocks are left. " + (startPostponedMisReplicatedBlocksCount -
+          endPostponedMisReplicatedBlocksCount) + " blocks are removed.");
     }
   }
 
@@ -3583,6 +3601,7 @@ public class BlockManager {
           if (namesystem.isPopulatingReplQueues()) {
             computeDatanodeWork();
             processPendingReplications();
+            rescanPostponedMisreplicatedBlocks();
           }
           Thread.sleep(replicationRecheckInterval);
         } catch (Throwable t) {
@@ -3651,6 +3670,8 @@ public class BlockManager {
     excessReplicateMap.clear();
     invalidateBlocks.clear();
     datanodeManager.clearPendingQueues();
+    postponedMisreplicatedBlocks.clear();
+    postponedMisreplicatedBlocksCount.set(0);
   };
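The comment block in rescanPostponedMisreplicatedBlocks above is the core of the change: instead of walking the whole postponedMisreplicatedBlocks set, each ReplicationMonitor pass now rescans a bounded window of at most blocksPerRescan blocks, starting at a random offset so one stubborn prefix of the set cannot dominate every pass. The following is a minimal, self-contained sketch of that selection strategy only; it is not part of the patch, the class and variable names are made up, plain java.util.Random stands in for DFSUtil.getRandom(), and a println stands in for processMisReplicatedBlock():

import java.util.HashSet;
import java.util.Iterator;
import java.util.Random;
import java.util.Set;

public class WindowedRescanSketch {
  // Scan at most blocksPerRescan elements, starting at a random offset so a
  // window that keeps getting postponed does not starve the rest of the set.
  static void rescanWindow(Set<Long> postponed, long blocksPerRescan, Random r) {
    long base = postponed.size() - blocksPerRescan;
    long startIndex = 0;
    if (base > 0) {
      // nextLong() % (base+1) can be negative; shift it back into [0, base].
      startIndex = r.nextLong() % (base + 1);
      if (startIndex < 0) {
        startIndex += (base + 1);
      }
    }
    Iterator<Long> it = postponed.iterator();
    for (long skipped = 0; skipped < startIndex; skipped++) {
      it.next();                        // advance to the random start offset
    }
    for (long i = 0; it.hasNext() && i < blocksPerRescan; i++) {
      Long block = it.next();
      // The real code calls processMisReplicatedBlock() here and removes the
      // entry via it.remove() once the result is no longer POSTPONE.
      System.out.println("would rescan block " + block);
    }
  }

  public static void main(String[] args) {
    Set<Long> postponed = new HashSet<>();
    for (long b = 0; b < 100; b++) {
      postponed.add(b);
    }
    rescanWindow(postponed, 10, new Random());
  }
}

The window is consecutive in the HashSet's (arbitrary) iteration order; randomizing only the starting offset keeps the bookkeeping cheap while, per the comment above, still giving blocks outside the first window a chance to be revisited.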
" + (startPostponedMisReplicatedBlocksCount - + endPostponedMisReplicatedBlocksCount) + " blocks are removed."); } } @@ -3583,6 +3601,7 @@ public class BlockManager { if (namesystem.isPopulatingReplQueues()) { computeDatanodeWork(); processPendingReplications(); + rescanPostponedMisreplicatedBlocks(); } Thread.sleep(replicationRecheckInterval); } catch (Throwable t) { @@ -3651,6 +3670,8 @@ public class BlockManager { excessReplicateMap.clear(); invalidateBlocks.clear(); datanodeManager.clearPendingQueues(); + postponedMisreplicatedBlocks.clear(); + postponedMisreplicatedBlocksCount.set(0); }; http://git-wip-us.apache.org/repos/asf/hadoop/blob/b70c0c29/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java index 6438466..ba33db4 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java @@ -133,7 +133,7 @@ public class DatanodeManager { * writing to stale datanodes, i.e., continue using stale nodes for writing. */ private final float ratioUseStaleDataNodesForWrite; - + /** The number of stale DataNodes */ private volatile int numStaleNodes; @@ -141,6 +141,11 @@ public class DatanodeManager { private volatile int numStaleStorages; /** + * Number of blocks to check for each postponedMisreplicatedBlocks iteration + */ + private final long blocksPerPostponedMisreplicatedBlocksRescan; + + /** * Whether or not this cluster has ever consisted of more than 1 rack, * according to the NetworkTopology. */ @@ -259,6 +264,9 @@ public class DatanodeManager { this.timeBetweenResendingCachingDirectivesMs = conf.getLong( DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_RETRY_INTERVAL_MS, DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_RETRY_INTERVAL_MS_DEFAULT); + this.blocksPerPostponedMisreplicatedBlocksRescan = conf.getLong( + DFSConfigKeys.DFS_NAMENODE_BLOCKS_PER_POSTPONEDBLOCKS_RESCAN_KEY, + DFSConfigKeys.DFS_NAMENODE_BLOCKS_PER_POSTPONEDBLOCKS_RESCAN_KEY_DEFAULT); } private static long getStaleIntervalFromConf(Configuration conf, @@ -1132,6 +1140,10 @@ public class DatanodeManager { * ratioUseStaleDataNodesForWrite); } + public long getBlocksPerPostponedMisreplicatedBlocksRescan() { + return blocksPerPostponedMisreplicatedBlocksRescan; + } + /** * @return The time interval used to mark DataNodes as stale. */ http://git-wip-us.apache.org/repos/asf/hadoop/blob/b70c0c29/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml index 83840db..51d9ed6 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml @@ -2253,4 +2253,12 @@ </description> </property> +<property> + <name>dfs.namenode.blocks.per.postponedblocks.rescan</name> + <value>10000</value> + <description>Number of blocks to rescan for each iteration of + postponedMisreplicatedBlocks. 

http://git-wip-us.apache.org/repos/asf/hadoop/blob/b70c0c29/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java
index 2755b29..fccd308 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java
@@ -238,6 +238,14 @@
     return dn.updateStorage(s);
   }
 
+  /**
+   * Rescan the postponedMisreplicatedBlocks list of the given BlockManager.
+   * @param bm the BlockManager to manipulate
+   */
+  public static void rescanPostponedMisreplicatedBlocks(BlockManager bm) {
+    bm.rescanPostponedMisreplicatedBlocks();
+  }
+
   public static DatanodeDescriptor getLocalDatanodeDescriptor(
       boolean initializeStorage) {
     DatanodeDescriptor dn = new DatanodeDescriptor(DFSTestUtil.getLocalDatanodeID());

http://git-wip-us.apache.org/repos/asf/hadoop/blob/b70c0c29/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencing.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencing.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencing.java
index 75d5b70..85864f7 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencing.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencing.java
@@ -165,7 +165,12 @@ public class TestDNFencing {
 
     banner("Metadata after nodes have all block-reported");
     doMetasave(nn2);
-    
+
+    // Force a rescan of postponedMisreplicatedBlocks.
+    BlockManager nn2BM = nn2.getNamesystem().getBlockManager();
+    BlockManagerTestUtil.checkHeartbeat(nn2BM);
+    BlockManagerTestUtil.rescanPostponedMisreplicatedBlocks(nn2BM);
+
     // The blocks should no longer be postponed.
     assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());
 
@@ -251,7 +256,12 @@
 
     banner("Metadata after nodes have all block-reported");
     doMetasave(nn2);
-    
+
+    // Force a rescan of postponedMisreplicatedBlocks.
+    BlockManager nn2BM = nn2.getNamesystem().getBlockManager();
+    BlockManagerTestUtil.checkHeartbeat(nn2BM);
+    BlockManagerTestUtil.rescanPostponedMisreplicatedBlocks(nn2BM);
+
     // The block should no longer be postponed.
     assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());
 
@@ -347,6 +357,11 @@
     banner("Metadata after nodes have all block-reported");
     doMetasave(nn2);
 
+    // Force a rescan of postponedMisreplicatedBlocks.
+    BlockManager nn2BM = nn2.getNamesystem().getBlockManager();
+    BlockManagerTestUtil.checkHeartbeat(nn2BM);
+    BlockManagerTestUtil.rescanPostponedMisreplicatedBlocks(nn2BM);
+
     // The block should no longer be postponed.
     assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());

http://git-wip-us.apache.org/repos/asf/hadoop/blob/b70c0c29/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencingWithReplication.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencingWithReplication.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencingWithReplication.java
index 93830c1..e7cba75 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencingWithReplication.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencingWithReplication.java
@@ -109,6 +109,10 @@ public class TestDNFencingWithReplication {
     HAStressTestHarness harness = new HAStressTestHarness();
     harness.conf.setInt(
         DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000);
+    harness.conf.setInt(
+        DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY, 1);
+    harness.conf.setInt(
+        DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY, 1);
     final MiniDFSCluster cluster = harness.startCluster();
     try {