Repository: hadoop Updated Branches: refs/heads/branch-2 d8a5d2b2f -> a6166aa55
HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp via Colin P. McCabe) (cherry-picked from 4e7c6a653f108d44589f84d78a03d92ee0e8a3c3) Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/a6166aa5 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/a6166aa5 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/a6166aa5 Branch: refs/heads/branch-2 Commit: a6166aa5523916f002b70ec9c731fcfe3389228f Parents: d8a5d2b Author: Colin Patrick Mccabe <[email protected]> Authored: Fri Sep 25 15:25:42 2015 -0700 Committer: Colin Patrick Mccabe <[email protected]> Committed: Fri Sep 25 15:33:02 2015 -0700 ---------------------------------------------------------------------- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 ++ .../blockmanagement/HeartbeatManager.java | 30 ++++++++++++++++++-- .../blockmanagement/TestHeartbeatHandling.java | 27 ++++++++++++++++++ 3 files changed, 58 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/a6166aa5/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 3df32dc00..682f37c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -1071,6 +1071,9 @@ Release 2.8.0 - UNRELEASED HDFS-9123. Copying from the root to a subdirectory should be forbidden. (Wei-Chiu Chuang via Yongjun Zhang) + HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn + Sharp via Colin P. McCabe) + Release 2.7.2 - UNRELEASED INCOMPATIBLE CHANGES http://git-wip-us.apache.org/repos/asf/hadoop/blob/a6166aa5/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java index cc9365d..f2e9827 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java @@ -24,6 +24,7 @@ import java.util.IdentityHashMap; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.TimeUnit; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.StorageType; @@ -34,10 +35,13 @@ import org.apache.hadoop.hdfs.server.namenode.Namesystem; import org.apache.hadoop.hdfs.server.protocol.StorageReport; import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary; import org.apache.hadoop.util.Daemon; +import org.apache.hadoop.util.StopWatch; import org.apache.hadoop.util.Time; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.annotations.VisibleForTesting; + /** * Manage the heartbeats received from datanodes. * The datanode list and statistics are synchronized @@ -62,8 +66,8 @@ class HeartbeatManager implements DatanodeStatistics { private final long heartbeatRecheckInterval; /** Heartbeat monitor thread */ private final Daemon heartbeatThread = new Daemon(new Monitor()); + private final StopWatch heartbeatStopWatch = new StopWatch(); - final Namesystem namesystem; final BlockManager blockManager; @@ -260,7 +264,18 @@ class HeartbeatManager implements DatanodeStatistics { stats.add(node); } } - + + @VisibleForTesting + void restartHeartbeatStopWatch() { + heartbeatStopWatch.reset().start(); + } + + @VisibleForTesting + boolean shouldAbortHeartbeatCheck(long offset) { + long elapsed = heartbeatStopWatch.now(TimeUnit.MILLISECONDS); + return elapsed + offset > heartbeatRecheckInterval; + } + /** * Check if there are any expired heartbeats, and if so, * whether any blocks have to be re-replicated. @@ -307,6 +322,10 @@ class HeartbeatManager implements DatanodeStatistics { int numOfStaleStorages = 0; synchronized(this) { for (DatanodeDescriptor d : datanodes) { + // check if an excessive GC pause has occurred + if (shouldAbortHeartbeatCheck(0)) { + return; + } if (dead == null && dm.isDatanodeDead(d)) { stats.incrExpiredHeartbeats(); dead = d; @@ -375,6 +394,7 @@ class HeartbeatManager implements DatanodeStatistics { @Override public void run() { while(namesystem.isRunning()) { + restartHeartbeatStopWatch(); try { final long now = Time.monotonicNow(); if (lastHeartbeatCheck + heartbeatRecheckInterval < now) { @@ -396,6 +416,12 @@ class HeartbeatManager implements DatanodeStatistics { Thread.sleep(5000); // 5 seconds } catch (InterruptedException ie) { } + // avoid declaring nodes dead for another cycle if a GC pause lasts + // longer than the node recheck interval + if (shouldAbortHeartbeatCheck(-5000)) { + LOG.warn("Skipping next heartbeat scan due to excessive pause"); + lastHeartbeatCheck = Time.monotonicNow(); + } } } } http://git-wip-us.apache.org/repos/asf/hadoop/blob/a6166aa5/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java index 3e233c6..b77c413 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java @@ -18,6 +18,8 @@ package org.apache.hadoop.hdfs.server.blockmanagement; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; import java.util.ArrayList; @@ -33,6 +35,7 @@ import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState; import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils; import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; +import org.apache.hadoop.hdfs.server.namenode.Namesystem; import org.apache.hadoop.hdfs.server.protocol.BlockCommand; import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand; import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand; @@ -40,6 +43,7 @@ import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol; import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage; import org.junit.Test; +import org.mockito.Mockito; /** * Test if FSNamesystem handles heartbeat right @@ -243,4 +247,27 @@ public class TestHeartbeatHandling { cluster.shutdown(); } } + + @Test + public void testHeartbeatStopWatch() throws Exception { + Namesystem ns = Mockito.mock(Namesystem.class); + BlockManager bm = Mockito.mock(BlockManager.class); + Configuration conf = new Configuration(); + long recheck = 2000; + conf.setLong( + DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY, recheck); + HeartbeatManager monitor = new HeartbeatManager(ns, bm, conf); + monitor.restartHeartbeatStopWatch(); + assertFalse(monitor.shouldAbortHeartbeatCheck(0)); + // sleep shorter than recheck and verify shouldn't abort + Thread.sleep(100); + assertFalse(monitor.shouldAbortHeartbeatCheck(0)); + // sleep longer than recheck and verify should abort unless ignore delay + Thread.sleep(recheck); + assertTrue(monitor.shouldAbortHeartbeatCheck(0)); + assertFalse(monitor.shouldAbortHeartbeatCheck(-recheck*3)); + // ensure it resets properly + monitor.restartHeartbeatStopWatch(); + assertFalse(monitor.shouldAbortHeartbeatCheck(0)); + } }
