Repository: hadoop
Updated Branches:
  refs/heads/branch-2.7 03892df21 -> 0dca198f0
HDFS-12131. Add some of the FSNamesystem JMX values as metrics. Contributed by Erik Krogen.

(cherry picked from commit f4c6b00a9f48ae7667db4035b641769efc3bb7cf)

Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/0dca198f
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/0dca198f
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/0dca198f

Branch: refs/heads/branch-2.7
Commit: 0dca198f097276bbe32dcdcdc041417312608bd7
Parents: 03892df
Author: Andrew Wang <[email protected]>
Authored: Thu Aug 3 15:45:47 2017 -0700
Committer: Konstantin V Shvachko <[email protected]>
Committed: Wed Sep 6 15:18:28 2017 -0700

----------------------------------------------------------------------
 .../hadoop-common/src/site/markdown/Metrics.md  |   8 ++
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt     |   3 +
 .../hdfs/server/namenode/FSNamesystem.java      |  14 ++
 .../org/apache/hadoop/hdfs/MiniDFSCluster.java  |   6 +-
 .../namenode/metrics/TestNameNodeMetrics.java   | 128 ++++++++++++++++++-
 5 files changed, 157 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/0dca198f/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
----------------------------------------------------------------------
diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
index 88ed6f6..81539a9 100644
--- a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
@@ -230,7 +230,15 @@ Each metrics record contains tags such as HAState and Hostname as additional inf
 | `PendingDataNodeMessageCourt` | (HA-only) Current number of pending block-related messages for later processing in the standby NameNode |
 | `MillisSinceLastLoadedEdits` | (HA-only) Time in milliseconds since the last time standby NameNode load edit log. In active NameNode, set to 0 |
 | `BlockCapacity` | Current number of block capacity |
+| `NumLiveDataNodes` | Number of datanodes which are currently live |
+| `NumDeadDataNodes` | Number of datanodes which are currently dead |
+| `NumDecomLiveDataNodes` | Number of datanodes which have been decommissioned and are now live |
+| `NumDecomDeadDataNodes` | Number of datanodes which have been decommissioned and are now dead |
+| `NumDecommissioningDataNodes` | Number of datanodes in decommissioning state |
+| `VolumeFailuresTotal` | Total number of volume failures across all Datanodes |
+| `EstimatedCapacityLostTotal` | An estimate of the total capacity lost due to volume failures |
 | `StaleDataNodes` | Current number of DataNodes marked stale due to delayed heartbeat |
+| `NumStaleStorages` | Number of storages marked as content stale (after NameNode restart/failover before first block report is received) |
 | `TotalFiles` | Current number of files and directories (same as FilesTotal) |
 | `LockQueueLength` | Number of threads waiting to acquire FSNameSystem lock |
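The rows added above are gauges in the NameNode's `FSNamesystem` metrics record, so they can be read through any configured metrics2 sink or checked with the `MetricsAsserts` test helpers, just like the existing entries in this table. A minimal sketch, assuming a running NameNode with exactly one live DataNode and no failed volumes (those values are illustrative, not part of this change):

import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
import static org.apache.hadoop.test.MetricsAsserts.getMetrics;

import org.apache.hadoop.metrics2.MetricsRecordBuilder;

public class FsNamesystemGaugeCheck {
  static void check() {
    // "FSNamesystem" is the record name the FSNamesystemMBean getters feed.
    MetricsRecordBuilder rb = getMetrics("FSNamesystem");
    assertGauge("NumLiveDataNodes", 1, rb);            // assumed: one live DN
    assertGauge("VolumeFailuresTotal", 0, rb);         // assumed: no failed volumes
    assertGauge("EstimatedCapacityLostTotal", 0L, rb);
  }
}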
http://git-wip-us.apache.org/repos/asf/hadoop/blob/0dca198f/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index d1ebdcd..3ce47d7 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -12,6 +12,9 @@ Release 2.7.5 - UNRELEASED
     HDFS-8797. WebHdfsFileSystem creates too many connections for pread.
     (jing9)
 
+    HDFS-12131. Add some of the FSNamesystem JMX values as metrics.
+    (Erik Krogen via wang, shv)
+
   OPTIMIZATIONS
 
   BUG FIXES


http://git-wip-us.apache.org/repos/asf/hadoop/blob/0dca198f/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
index ce8878f..cb0c7a3 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
@@ -6030,16 +6030,20 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumLiveDataNodes", "Number of datanodes which are currently live"})
   public int getNumLiveDataNodes() {
     return getBlockManager().getDatanodeManager().getNumLiveDataNodes();
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumDeadDataNodes", "Number of datanodes which are currently dead"})
   public int getNumDeadDataNodes() {
     return getBlockManager().getDatanodeManager().getNumDeadDataNodes();
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumDecomLiveDataNodes",
+      "Number of datanodes which have been decommissioned and are now live"})
   public int getNumDecomLiveDataNodes() {
     final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
     getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
@@ -6051,6 +6055,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumDecomDeadDataNodes",
+      "Number of datanodes which have been decommissioned and are now dead"})
   public int getNumDecomDeadDataNodes() {
     final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
     getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, false);
@@ -6062,6 +6068,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"VolumeFailuresTotal",
+      "Total number of volume failures across all Datanodes"})
   public int getVolumeFailuresTotal() {
     List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
     getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
@@ -6073,6 +6081,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"EstimatedCapacityLostTotal",
+      "An estimate of the total capacity lost due to volume failures"})
   public long getEstimatedCapacityLostTotal() {
     List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
     getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
@@ -6088,6 +6098,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumDecommissioningDataNodes",
+      "Number of datanodes in decommissioning state"})
   public int getNumDecommissioningDataNodes() {
     return getBlockManager().getDatanodeManager().getDecommissioningNodes()
         .size();
@@ -6105,6 +6117,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
    * before NN receives the first Heartbeat followed by the first Blockreport.
    */
   @Override // FSNamesystemMBean
+  @Metric({"NumStaleStorages",
+      "Number of storages marked as content stale"})
   public int getNumStaleStorages() {
     return getBlockManager().getDatanodeManager().getNumStaleStorages();
   }
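Nothing beyond the `@Metric` annotation is needed to export these existing MBean getters as metrics2 gauges, since FSNamesystem is already registered as a metrics source. A minimal sketch of the same pattern on a stand-alone source; the class name, context, and registration call below are illustrative and not taken from this change:

import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.annotation.Metrics;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;

@Metrics(context = "dfs")
public class ExampleLivenessSource {
  private volatile int liveNodes;

  // The two strings become the gauge name and description; the metrics
  // system calls this getter whenever the source is snapshotted.
  @Metric({"NumLiveDataNodes", "Number of datanodes which are currently live"})
  public int getNumLiveDataNodes() {
    return liveNodes;
  }

  public static void main(String[] args) {
    DefaultMetricsSystem.initialize("ExampleDaemon");
    DefaultMetricsSystem.instance().register(
        "ExampleLiveness", "Illustrative liveness source",
        new ExampleLivenessSource());
  }
}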
http://git-wip-us.apache.org/repos/asf/hadoop/blob/0dca198f/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
index 1739821..15ecf0197 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
@@ -142,6 +142,8 @@ public class MiniDFSCluster {
   public static final String HDFS_MINIDFS_BASEDIR = "hdfs.minidfs.basedir";
   public static final String DFS_NAMENODE_SAFEMODE_EXTENSION_TESTING_KEY
       = DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + ".testing";
+  public static final String DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY
+      = DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY + ".testing";
 
   // Changing this default may break some tests that assume it is 2.
   private static final int DEFAULT_STORAGES_PER_DATANODE = 2;
@@ -788,7 +790,9 @@ public class MiniDFSCluster {
       int safemodeExtension = conf.getInt(
           DFS_NAMENODE_SAFEMODE_EXTENSION_TESTING_KEY, 0);
       conf.setInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, safemodeExtension);
-      conf.setInt(DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY, 3); // 3 second
+      int decommissionInterval = conf.getInt(
+          DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY, 3);
+      conf.setInt(DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY, decommissionInterval);
       conf.setClass(NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY,
           StaticMapping.class, DNSToSwitchMapping.class);
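The new `.testing` key mirrors the existing safemode-extension override: a test can set it before the cluster copies the value into `DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY`, instead of always getting the hard-coded 3 seconds. A brief sketch of how a test might use it, mirroring what the new test below does; the datanode count is an illustrative choice, and the large interval simply pushes the decommission monitor far enough out that the test drives rechecks by hand:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;

public class SlowDecomMonitorExample {
  static MiniDFSCluster startCluster() throws Exception {
    Configuration conf = new HdfsConfiguration();
    // Effectively disable the periodic decommission monitor; decommission
    // state is then rechecked explicitly (e.g. via BlockManagerTestUtil).
    conf.setInt(MiniDFSCluster.DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY,
        9999999);
    return new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
  }
}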
http://git-wip-us.apache.org/repos/asf/hadoop/blob/0dca198f/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
index ad4c171..8665834 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
@@ -24,10 +24,15 @@ import static org.apache.hadoop.test.MetricsAsserts.assertQuantileGauges;
 import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
 import static org.junit.Assert.assertTrue;
 
+import com.google.common.base.Joiner;
 import java.io.DataInputStream;
+import java.io.File;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Random;
 
+import org.apache.commons.io.FileUtils;
 import org.apache.commons.logging.LogFactory;
 import org.apache.commons.logging.impl.Log4JLogger;
 import org.apache.hadoop.conf.Configuration;
@@ -46,6 +51,8 @@ import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
 import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
 import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsVolumeImpl;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
 import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
 import org.apache.hadoop.hdfs.server.namenode.top.TopAuditLogger;
@@ -84,6 +91,13 @@ public class TestNameNodeMetrics {
         DFS_REPLICATION_INTERVAL);
     CONF.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY,
         DFS_REPLICATION_INTERVAL);
+    // Set it long enough to essentially disable unless we manually call it
+    // Used for decommissioning DataNode metrics
+    CONF.setInt(MiniDFSCluster.DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY,
+        9999999);
+    // For checking failed volume metrics
+    CONF.setInt(DFSConfigKeys.DFS_DF_INTERVAL_KEY, 1000);
+    CONF.setInt(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY, 1);
     CONF.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY,
         "" + PERCENTILES_INTERVAL);
     // Enable stale DataNodes checking
@@ -97,6 +111,8 @@ public class TestNameNodeMetrics {
   private final Random rand = new Random();
   private FSNamesystem namesystem;
   private BlockManager bm;
+  // List of temporary files on local FileSystem to be cleaned up
+  private List<Path> tempFiles;
 
   private static Path getTestPath(String fileName) {
     return new Path(TEST_ROOT_DIR_PATH, fileName);
@@ -109,6 +125,7 @@ public class TestNameNodeMetrics {
     namesystem = cluster.getNamesystem();
     bm = namesystem.getBlockManager();
     fs = cluster.getFileSystem();
+    tempFiles = new ArrayList<>();
   }
 
   @After
@@ -120,6 +137,9 @@ public class TestNameNodeMetrics {
       assertQuantileGauges("GetGroups1s", rb);
     }
     cluster.shutdown();
+    for (Path p : tempFiles) {
+      FileUtils.deleteQuietly(new File(p.toUri().getPath()));
+    }
   }
 
   /** create a file with a length of <code>fileLen</code> */
@@ -196,7 +216,113 @@ public class TestNameNodeMetrics {
         .getBlockManager());
     assertGauge("StaleDataNodes", 0, getMetrics(NS_METRICS));
   }
-  
+
+  /**
+   * Test metrics associated with volume failures.
+   */
+  @Test
+  public void testVolumeFailures() throws Exception {
+    assertGauge("VolumeFailuresTotal", 0, getMetrics(NS_METRICS));
+    assertGauge("EstimatedCapacityLostTotal", 0L, getMetrics(NS_METRICS));
+    DataNode dn = cluster.getDataNodes().get(0);
+    FsVolumeSpi fsVolume =
+        DataNodeTestUtils.getFSDataset(dn).getVolumes().get(0);
+    File dataDir = new File(fsVolume.getBasePath());
+    long capacity = ((FsVolumeImpl) fsVolume).getCapacity();
+    DataNodeTestUtils.injectDataDirFailure(dataDir);
+    long lastDiskErrorCheck = dn.getLastDiskErrorCheck();
+    dn.checkDiskErrorAsync();
+    while (dn.getLastDiskErrorCheck() == lastDiskErrorCheck) {
+      Thread.sleep(100);
+    }
+    DataNodeTestUtils.triggerHeartbeat(dn);
+    BlockManagerTestUtil.checkHeartbeat(bm);
+    assertGauge("VolumeFailuresTotal", 1, getMetrics(NS_METRICS));
+    assertGauge("EstimatedCapacityLostTotal", capacity, getMetrics(NS_METRICS));
+  }
+
+  /**
+   * Test metrics associated with liveness and decommission status of DataNodes.
+   */
+  @Test
+  public void testDataNodeLivenessAndDecom() throws Exception {
+    Path hostFileDir = new Path(MiniDFSCluster.getBaseDirectory(), "hosts");
+    FileSystem localFs = FileSystem.getLocal(CONF);
+    localFs.mkdirs(hostFileDir);
+    Path includeFile = new Path(hostFileDir, "include");
+    Path excludeFile = new Path(hostFileDir, "exclude");
+    tempFiles.add(includeFile);
+    tempFiles.add(excludeFile);
+    CONF.set(DFSConfigKeys.DFS_HOSTS, includeFile.toUri().getPath());
+    CONF.set(DFSConfigKeys.DFS_HOSTS_EXCLUDE, excludeFile.toUri().getPath());
+
+    List<DataNode> dataNodes = cluster.getDataNodes();
+    DatanodeDescriptor[] dnDescriptors = new DatanodeDescriptor[DATANODE_COUNT];
+    String[] dnAddresses = new String[DATANODE_COUNT];
+    for (int i = 0; i < DATANODE_COUNT; i++) {
+      dnDescriptors[i] = bm.getDatanodeManager()
+          .getDatanode(dataNodes.get(i).getDatanodeId());
+      dnAddresses[i] = dnDescriptors[i].getXferAddr();
+    }
+    // First put all DNs into include
+    DFSTestUtil.writeFile(localFs, includeFile,
+        Joiner.on("\n").join(dnAddresses));
+    DFSTestUtil.writeFile(localFs, excludeFile, "");
+    bm.getDatanodeManager().refreshNodes(CONF);
+    assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));
+
+    // Now decommission one DN
+    DFSTestUtil.writeFile(localFs, excludeFile, dnAddresses[0]);
+    bm.getDatanodeManager().refreshNodes(CONF);
+    assertGauge("NumDecommissioningDataNodes", 1, getMetrics(NS_METRICS));
+    BlockManagerTestUtil.recheckDecommissionState(bm.getDatanodeManager());
+    assertGauge("NumDecommissioningDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDecomLiveDataNodes", 1, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));
+
+    // Now kill all DNs by expiring their heartbeats
+    for (int i = 0; i < DATANODE_COUNT; i++) {
+      DataNodeTestUtils.setHeartbeatsDisabledForTests(dataNodes.get(i), true);
+      long expireInterval = CONF.getLong(
+          DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
+          DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_DEFAULT) * 2L
+          + CONF.getLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
+          DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_DEFAULT) * 10 * 1000L;
+      DFSTestUtil.resetLastUpdatesWithOffset(dnDescriptors[i],
+          -(expireInterval + 1));
+    }
+    BlockManagerTestUtil.checkHeartbeat(bm);
+    assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDecomDeadDataNodes", 1, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDeadDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));
+
+    // Now remove the decommissioned DN altogether
+    String[] includeHosts = new String[dnAddresses.length - 1];
+    for (int i = 0; i < includeHosts.length; i++) {
+      includeHosts[i] = dnAddresses[i + 1];
+    }
+    DFSTestUtil.writeFile(localFs, includeFile,
+        Joiner.on("\n").join(includeHosts));
+    // Just init to a nonexistent host to clear out the previous exclusion
+    DFSTestUtil.writeFile(localFs, excludeFile, "");
+    bm.getDatanodeManager().refreshNodes(CONF);
+    assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDecomDeadDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDeadDataNodes", DATANODE_COUNT - 1, getMetrics(NS_METRICS));
+
+    // Finally mark the remaining DNs as live again
+    for (int i = 1; i < dataNodes.size(); i++) {
+      DataNodeTestUtils.setHeartbeatsDisabledForTests(dataNodes.get(i), false);
+      DFSTestUtil.resetLastUpdatesWithOffset(dnDescriptors[i], 0);
+    }
+    BlockManagerTestUtil.checkHeartbeat(bm);
+    assertGauge("NumLiveDataNodes", DATANODE_COUNT - 1, getMetrics(NS_METRICS));
+    assertGauge("NumDeadDataNodes", 0, getMetrics(NS_METRICS));
+  }
+
   /** Test metrics associated with addition of a file */
   @Test
   public void testFileAdd() throws Exception {
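One step in testDataNodeLivenessAndDecom above is worth unpacking: the offset used to backdate heartbeats is built from the same two settings the NameNode uses for its dead-node threshold, 2 * recheck interval + 10 * heartbeat interval. A small worked computation with the default values assumed here (300000 ms recheck, 3 s heartbeat), just to show the magnitude being exceeded:

public class HeartbeatExpiryMath {
  public static void main(String[] args) {
    long heartbeatRecheckMs = 300000L; // dfs.namenode.heartbeat.recheck-interval default
    long heartbeatIntervalSec = 3L;    // dfs.heartbeat.interval default
    // Same formula the test assembles from the two config keys:
    long expireIntervalMs = 2L * heartbeatRecheckMs
        + 10L * heartbeatIntervalSec * 1000L;
    System.out.println(expireIntervalMs + " ms"); // 630000 ms, i.e. 10.5 minutes
  }
}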
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
