Repository: hadoop
Updated Branches:
  refs/heads/branch-2.8 8943cf9d3 -> 22ef7c6bf
HDFS-12131. Add some of the FSNamesystem JMX values as metrics. Contributed by Erik Krogen.

Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/22ef7c6b
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/22ef7c6b
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/22ef7c6b

Branch: refs/heads/branch-2.8
Commit: 22ef7c6bf287a80494893a109a76da04522db267
Parents: 8943cf9
Author: Andrew Wang <[email protected]>
Authored: Thu Aug 3 15:45:47 2017 -0700
Committer: Andrew Wang <[email protected]>
Committed: Thu Aug 3 15:45:47 2017 -0700

----------------------------------------------------------------------
 .../hadoop-common/src/site/markdown/Metrics.md  |   8 ++
 .../hdfs/server/namenode/FSNamesystem.java      |  14 +++
 .../org/apache/hadoop/hdfs/MiniDFSCluster.java  |   6 +-
 .../namenode/metrics/TestNameNodeMetrics.java   | 116 ++++++++++++++++++-
 4 files changed, 142 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/22ef7c6b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
----------------------------------------------------------------------
diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
index 4d4311a..24bb9fe 100644
--- a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
@@ -214,7 +214,15 @@ Each metrics record contains tags such as HAState and Hostname as additional inf
 | `PendingDataNodeMessageCount` | (HA-only) Current number of pending block-related messages for later processing in the standby NameNode |
 | `MillisSinceLastLoadedEdits` | (HA-only) Time in milliseconds since the last time standby NameNode load edit log. In active NameNode, set to 0 |
 | `BlockCapacity` | Current number of block capacity |
+| `NumLiveDataNodes` | Number of datanodes which are currently live |
+| `NumDeadDataNodes` | Number of datanodes which are currently dead |
+| `NumDecomLiveDataNodes` | Number of datanodes which have been decommissioned and are now live |
+| `NumDecomDeadDataNodes` | Number of datanodes which have been decommissioned and are now dead |
+| `NumDecommissioningDataNodes` | Number of datanodes in decommissioning state |
+| `VolumeFailuresTotal` | Total number of volume failures across all Datanodes |
+| `EstimatedCapacityLostTotal` | An estimate of the total capacity lost due to volume failures |
 | `StaleDataNodes` | Current number of DataNodes marked stale due to delayed heartbeat |
+| `NumStaleStorages` | Number of storages marked as content stale (after NameNode restart/failover before first block report is received) |
 | `TotalFiles` | Deprecated: Use FilesTotal instead |
 | `MissingReplOneBlocks` | Current number of missing blocks with replication factor 1 |
 | `NumFilesUnderConstruction` | Current number of files under construction |

http://git-wip-us.apache.org/repos/asf/hadoop/blob/22ef7c6b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
index 385612f..23f7f54 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
@@ -5346,16 +5346,20 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumLiveDataNodes", "Number of datanodes which are currently live"})
   public int getNumLiveDataNodes() {
     return getBlockManager().getDatanodeManager().getNumLiveDataNodes();
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumDeadDataNodes", "Number of datanodes which are currently dead"})
   public int getNumDeadDataNodes() {
     return getBlockManager().getDatanodeManager().getNumDeadDataNodes();
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumDecomLiveDataNodes",
+      "Number of datanodes which have been decommissioned and are now live"})
   public int getNumDecomLiveDataNodes() {
     final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
     getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
@@ -5367,6 +5371,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumDecomDeadDataNodes",
+      "Number of datanodes which have been decommissioned and are now dead"})
   public int getNumDecomDeadDataNodes() {
     final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
     getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, false);
@@ -5378,6 +5384,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"VolumeFailuresTotal",
+      "Total number of volume failures across all Datanodes"})
   public int getVolumeFailuresTotal() {
     List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
     getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
@@ -5389,6 +5397,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"EstimatedCapacityLostTotal",
+      "An estimate of the total capacity lost due to volume failures"})
   public long getEstimatedCapacityLostTotal() {
     List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
     getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
@@ -5404,6 +5414,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumDecommissioningDataNodes",
+      "Number of datanodes in decommissioning state"})
   public int getNumDecommissioningDataNodes() {
     return getBlockManager().getDatanodeManager().getDecommissioningNodes()
         .size();
@@ -5421,6 +5433,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
    * before NN receives the first Heartbeat followed by the first Blockreport.
    */
   @Override // FSNamesystemMBean
+  @Metric({"NumStaleStorages",
+      "Number of storages marked as content stale"})
   public int getNumStaleStorages() {
     return getBlockManager().getDatanodeManager().getNumStaleStorages();
   }
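[Editor's note] For context, the `@Metric` annotations added above rely on Hadoop's metrics2 annotation support: when an object is registered as a metrics source, annotated getter methods are discovered by reflection and published as gauges. Below is a minimal sketch of that mechanism, using a hypothetical `ExampleMetricsSource` class that is not part of this patch:

```java
import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.annotation.Metrics;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;

// Hypothetical source class, not part of this patch.
@Metrics(context = "dfs")
public class ExampleMetricsSource {
  @Metric({"NumLiveDataNodes", "Number of datanodes which are currently live"})
  public int getNumLiveDataNodes() {
    return 3; // placeholder; FSNamesystem delegates to the DatanodeManager
  }

  public static void main(String[] args) {
    // Registering the object makes the metrics system walk its @Metric
    // methods by reflection and publish each one as a gauge.
    DefaultMetricsSystem.initialize("Example");
    DefaultMetricsSystem.instance().register(
        "ExampleMetricsSource", "Example metrics source",
        new ExampleMetricsSource());
  }
}
```

This is why the patch needs no extra registration code in FSNamesystem: the class is already a registered metrics source, so annotating the existing MBean getters is enough to expose them as metrics.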
http://git-wip-us.apache.org/repos/asf/hadoop/blob/22ef7c6b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
index 3437805..6929d09 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
@@ -148,6 +148,8 @@ public class MiniDFSCluster implements AutoCloseable {
   public static final String HDFS_MINIDFS_BASEDIR = "hdfs.minidfs.basedir";
   public static final String DFS_NAMENODE_SAFEMODE_EXTENSION_TESTING_KEY
       = DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + ".testing";
+  public static final String DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY
+      = DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY + ".testing";
 
   // Changing this default may break some tests that assume it is 2.
   private static final int DEFAULT_STORAGES_PER_DATANODE = 2;
@@ -799,7 +801,9 @@ public class MiniDFSCluster implements AutoCloseable {
     int safemodeExtension = conf.getInt(
         DFS_NAMENODE_SAFEMODE_EXTENSION_TESTING_KEY, 0);
     conf.setInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, safemodeExtension);
-    conf.setInt(DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY, 3); // 3 second
+    int decommissionInterval = conf.getInt(
+        DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY, 3);
+    conf.setInt(DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY, decommissionInterval);
     conf.setClass(NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY,
         StaticMapping.class, DNSToSwitchMapping.class);
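[Editor's note] The new `.testing` key mirrors the existing `DFS_NAMENODE_SAFEMODE_EXTENSION_TESTING_KEY` pattern: a test can pre-set it to override the decommission recheck interval that `MiniDFSCluster` previously hard-coded to 3 seconds. A minimal sketch of how a test might use it (hypothetical class, not part of this patch):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;

// Hypothetical test scaffolding, not part of this patch.
public class DecomIntervalExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new HdfsConfiguration();
    // With the change above, MiniDFSCluster reads this ".testing" key and
    // passes its value through instead of forcing a 3-second interval.
    conf.setInt(MiniDFSCluster.DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY,
        9999999); // large enough to effectively disable the periodic recheck
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).build();
    try {
      cluster.waitActive();
      // ... drive decommission state transitions manually, as the test below does
    } finally {
      cluster.shutdown();
    }
  }
}
```

Disabling the background recheck lets a test trigger decommission processing deterministically (via `BlockManagerTestUtil.recheckDecommissionState`) instead of racing a timer, which is exactly what the new tests rely on.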
http://git-wip-us.apache.org/repos/asf/hadoop/blob/22ef7c6b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
index c22cd11..b07e1eb 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
@@ -36,6 +36,7 @@ import java.io.File;
 import java.io.IOException;
 import java.security.NoSuchAlgorithmException;
 import java.util.EnumSet;
+import java.util.List;
 import java.util.Random;
 
 import com.google.common.collect.ImmutableList;
@@ -59,10 +60,13 @@ import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
 import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
 import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsVolumeImpl;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
 import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
 import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil;
+import org.apache.hadoop.hdfs.util.HostsFileWriter;
 import org.apache.hadoop.metrics2.MetricsRecordBuilder;
 import org.apache.hadoop.metrics2.MetricsSource;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
@@ -99,6 +103,13 @@ public class TestNameNodeMetrics {
       DFS_REPLICATION_INTERVAL);
     CONF.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY,
       DFS_REPLICATION_INTERVAL);
+    // Set it long enough to essentially disable unless we manually call it
+    // Used for decommissioning DataNode metrics
+    CONF.setInt(MiniDFSCluster.DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY,
+        9999999);
+    // For checking failed volume metrics
+    CONF.setInt(DFSConfigKeys.DFS_DF_INTERVAL_KEY, 1000);
+    CONF.setInt(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY, 1);
     CONF.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY,
       "" + PERCENTILES_INTERVAL);
     // Enable stale DataNodes checking
@@ -111,6 +122,7 @@ public class TestNameNodeMetrics {
   private DistributedFileSystem fs;
   private final Random rand = new Random();
   private FSNamesystem namesystem;
+  private HostsFileWriter hostsFileWriter;
   private BlockManager bm;
 
   private static Path getTestPath(String fileName) {
@@ -119,7 +131,10 @@ public class TestNameNodeMetrics {
   @Before
   public void setUp() throws Exception {
-    cluster = new MiniDFSCluster.Builder(CONF).numDataNodes(DATANODE_COUNT).build();
+    hostsFileWriter = new HostsFileWriter();
+    hostsFileWriter.initialize(CONF, "temp/decommission");
+    cluster = new MiniDFSCluster.Builder(CONF).numDataNodes(DATANODE_COUNT)
+        .build();
     cluster.waitActive();
     namesystem = cluster.getNamesystem();
     bm = namesystem.getBlockManager();
@@ -134,6 +149,10 @@ public class TestNameNodeMetrics {
       MetricsRecordBuilder rb = getMetrics(source);
       assertQuantileGauges("GetGroups1s", rb);
     }
+    if (hostsFileWriter != null) {
+      hostsFileWriter.cleanup();
+      hostsFileWriter = null;
+    }
     if (cluster != null) {
       cluster.shutdown();
       cluster = null;
@@ -214,6 +233,101 @@ public class TestNameNodeMetrics {
         .getBlockManager());
     assertGauge("StaleDataNodes", 0, getMetrics(NS_METRICS));
   }
+
+  /**
+   * Test metrics associated with volume failures.
+   */
+  @Test
+  public void testVolumeFailures() throws Exception {
+    assertGauge("VolumeFailuresTotal", 0, getMetrics(NS_METRICS));
+    assertGauge("EstimatedCapacityLostTotal", 0L, getMetrics(NS_METRICS));
+    DataNode dn = cluster.getDataNodes().get(0);
+    FsDatasetSpi.FsVolumeReferences volumeReferences =
+        DataNodeTestUtils.getFSDataset(dn).getFsVolumeReferences();
+    FsVolumeImpl fsVolume = (FsVolumeImpl) volumeReferences.get(0);
+    File dataDir = new File(fsVolume.getBasePath());
+    long capacity = fsVolume.getCapacity();
+    volumeReferences.close();
+    DataNodeTestUtils.injectDataDirFailure(dataDir);
+    long lastDiskErrorCheck = dn.getLastDiskErrorCheck();
+    dn.checkDiskErrorAsync();
+    while (dn.getLastDiskErrorCheck() == lastDiskErrorCheck) {
+      Thread.sleep(100);
+    }
+    DataNodeTestUtils.triggerHeartbeat(dn);
+    BlockManagerTestUtil.checkHeartbeat(bm);
+    assertGauge("VolumeFailuresTotal", 1, getMetrics(NS_METRICS));
+    assertGauge("EstimatedCapacityLostTotal", capacity, getMetrics(NS_METRICS));
+  }
+
+  /**
+   * Test metrics associated with liveness and decommission status of DataNodes.
+   */
+  @Test
+  public void testDataNodeLivenessAndDecom() throws Exception {
+    List<DataNode> dataNodes = cluster.getDataNodes();
+    DatanodeDescriptor[] dnDescriptors = new DatanodeDescriptor[DATANODE_COUNT];
+    String[] dnAddresses = new String[DATANODE_COUNT];
+    for (int i = 0; i < DATANODE_COUNT; i++) {
+      dnDescriptors[i] = bm.getDatanodeManager()
+          .getDatanode(dataNodes.get(i).getDatanodeId());
+      dnAddresses[i] = dnDescriptors[i].getXferAddr();
+    }
+    // First put all DNs into include
+    hostsFileWriter.initIncludeHosts(dnAddresses);
+    bm.getDatanodeManager().refreshNodes(CONF);
+    assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));
+
+    // Now decommission one DN
+    hostsFileWriter.initExcludeHost(dnAddresses[0]);
+    bm.getDatanodeManager().refreshNodes(CONF);
+    assertGauge("NumDecommissioningDataNodes", 1, getMetrics(NS_METRICS));
+    BlockManagerTestUtil.recheckDecommissionState(bm.getDatanodeManager());
+    assertGauge("NumDecommissioningDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDecomLiveDataNodes", 1, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));
+
+    // Now kill all DNs by expiring their heartbeats
+    for (int i = 0; i < DATANODE_COUNT; i++) {
+      DataNodeTestUtils.setHeartbeatsDisabledForTests(dataNodes.get(i), true);
+      long expireInterval = CONF.getLong(
+          DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
+          DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_DEFAULT) * 2L
+          + CONF.getLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
+          DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_DEFAULT) * 10 * 1000L;
+      DFSTestUtil.resetLastUpdatesWithOffset(dnDescriptors[i],
+          -(expireInterval + 1));
+    }
+    BlockManagerTestUtil.checkHeartbeat(bm);
+    assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDecomDeadDataNodes", 1, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDeadDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));
+
+    // Now remove the decommissioned DN altogether
+    String[] includeHosts = new String[dnAddresses.length - 1];
+    for (int i = 0; i < includeHosts.length; i++) {
+      includeHosts[i] = dnAddresses[i + 1];
+    }
+    hostsFileWriter.initIncludeHosts(includeHosts);
+    // Just init to a nonexistent host to clear out the previous exclusion
+    hostsFileWriter.initExcludeHost("");
+    bm.getDatanodeManager().refreshNodes(CONF);
+    assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDecomDeadDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDeadDataNodes", DATANODE_COUNT - 1, getMetrics(NS_METRICS));
+
+    // Finally mark the remaining DNs as live again
+    for (int i = 1; i < dataNodes.size(); i++) {
+      DataNodeTestUtils.setHeartbeatsDisabledForTests(dataNodes.get(i), false);
+      DFSTestUtil.resetLastUpdatesWithOffset(dnDescriptors[i], 0);
+    }
+    BlockManagerTestUtil.checkHeartbeat(bm);
+    assertGauge("NumLiveDataNodes", DATANODE_COUNT - 1, getMetrics(NS_METRICS));
+    assertGauge("NumDeadDataNodes", 0, getMetrics(NS_METRICS));
+  }
 
   /** Test metrics associated with addition of a file */
   @Test
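[Editor's note] With the patch applied, these values are readable both as metrics (as the `assertGauge` calls above exercise) and through the pre-existing `FSNamesystemState` MBean. A minimal in-process JMX read, assuming the conventional NameNode object name, which may differ by deployment (hypothetical class, not part of this patch):

```java
import java.lang.management.ManagementFactory;
import javax.management.MBeanServer;
import javax.management.ObjectName;

// Hypothetical reader, not part of this patch; run inside the NameNode JVM
// (e.g., from a test) or adapt to a remote JMX connection.
public class FsNamesystemStateReader {
  public static void main(String[] args) throws Exception {
    MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
    // Assumed object name; the NameNode conventionally publishes its
    // FSNamesystemMBean attributes here.
    ObjectName fsState =
        new ObjectName("Hadoop:service=NameNode,name=FSNamesystemState");
    Object liveNodes = mbs.getAttribute(fsState, "NumLiveDataNodes");
    System.out.println("NumLiveDataNodes = " + liveNodes);
  }
}
```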
