HDFS-7501. TransactionsSinceLastCheckpoint can be negative on SBNs. Contributed by Gautam Gopalakrishnan.
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/7d4d6150 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/7d4d6150 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/7d4d6150 Branch: refs/heads/YARN-2928 Commit: 7d4d6150f8c81a242f7676e27d65db9f31136007 Parents: 74e941d Author: Harsh J <ha...@cloudera.com> Authored: Sun Mar 29 00:45:01 2015 +0530 Committer: Zhijie Shen <zjs...@apache.org> Committed: Mon Mar 30 12:10:47 2015 -0700 ---------------------------------------------------------------------- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 + .../hdfs/server/namenode/FSNamesystem.java | 2 +- .../namenode/metrics/TestNameNodeMetrics.java | 84 ++++++++++++++++++++ 3 files changed, 88 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/7d4d6150/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index f7cc2bc..496db06 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -351,6 +351,9 @@ Release 2.8.0 - UNRELEASED BUG FIXES + HDFS-7501. TransactionsSinceLastCheckpoint can be negative on SBNs. + (Gautam Gopalakrishnan via harsh) + HDFS-5356. MiniDFSCluster should close all open FileSystems when shutdown() (Rakesh R via vinayakumarb) http://git-wip-us.apache.org/repos/asf/hadoop/blob/7d4d6150/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index d0999b8..0e0f484 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -4784,7 +4784,7 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean, @Metric({"TransactionsSinceLastCheckpoint", "Number of transactions since last checkpoint"}) public long getTransactionsSinceLastCheckpoint() { - return getEditLog().getLastWrittenTxId() - + return getFSImage().getLastAppliedOrWrittenTxId() - getFSImage().getStorage().getMostRecentCheckpointTxId(); } http://git-wip-us.apache.org/repos/asf/hadoop/blob/7d4d6150/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java index 011db3c..64ea1e4 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java @@ -22,12 +22,16 @@ import static org.apache.hadoop.test.MetricsAsserts.assertCounter; import static org.apache.hadoop.test.MetricsAsserts.assertGauge; import static org.apache.hadoop.test.MetricsAsserts.assertQuantileGauges; import static org.apache.hadoop.test.MetricsAsserts.getMetrics; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.DataInputStream; import java.io.IOException; import java.util.Random; +import com.google.common.collect.ImmutableList; +import com.google.common.io.Files; +import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.impl.Log4JLogger; import org.apache.hadoop.conf.Configuration; @@ -39,6 +43,7 @@ import org.apache.hadoop.hdfs.DFSTestUtil; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction; import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager; @@ -47,7 +52,9 @@ import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor; import org.apache.hadoop.hdfs.server.datanode.DataNode; import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils; import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; +import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; +import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil; import org.apache.hadoop.hdfs.server.namenode.top.TopAuditLogger; import org.apache.hadoop.metrics2.MetricsRecordBuilder; import org.apache.hadoop.metrics2.MetricsSource; @@ -69,6 +76,7 @@ public class TestNameNodeMetrics { new Path("/testNameNodeMetrics"); private static final String NN_METRICS = "NameNodeActivity"; private static final String NS_METRICS = "FSNamesystem"; + public static final Log LOG = LogFactory.getLog(TestNameNodeMetrics.class); // Number of datanodes in the cluster private static final int DATANODE_COUNT = 3; @@ -400,6 +408,82 @@ public class TestNameNodeMetrics { } /** + * Testing TransactionsSinceLastCheckpoint. Need a new cluster as + * the other tests in here don't use HA. See HDFS-7501. + */ + @Test(timeout = 300000) + public void testTransactionSinceLastCheckpointMetrics() throws Exception { + Random random = new Random(); + int retryCount = 0; + while (retryCount < 5) { + try { + int basePort = 10060 + random.nextInt(100) * 2; + MiniDFSNNTopology topology = new MiniDFSNNTopology() + .addNameservice(new MiniDFSNNTopology.NSConf("ns1") + .addNN(new MiniDFSNNTopology.NNConf("nn1").setHttpPort(basePort)) + .addNN(new MiniDFSNNTopology.NNConf("nn2").setHttpPort(basePort + 1))); + + HdfsConfiguration conf2 = new HdfsConfiguration(); + // Lower the checkpoint condition for purpose of testing. + conf2.setInt( + DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, + 100); + // Check for checkpoint condition very often, for purpose of testing. + conf2.setInt( + DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY, + 1); + // Poll and follow ANN txns very often, for purpose of testing. + conf2.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1); + MiniDFSCluster cluster2 = new MiniDFSCluster.Builder(conf2) + .nnTopology(topology).numDataNodes(1).build(); + cluster2.waitActive(); + DistributedFileSystem fs2 = cluster2.getFileSystem(0); + NameNode nn0 = cluster2.getNameNode(0); + NameNode nn1 = cluster2.getNameNode(1); + cluster2.transitionToActive(0); + fs2.mkdirs(new Path("/tmp-t1")); + fs2.mkdirs(new Path("/tmp-t2")); + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + // Test to ensure tracking works before the first-ever + // checkpoint. + assertEquals("SBN failed to track 2 transactions pre-checkpoint.", + 4L, // 2 txns added further when catch-up is called. + cluster2.getNameNode(1).getNamesystem() + .getTransactionsSinceLastCheckpoint()); + // Complete up to the boundary required for + // an auto-checkpoint. Using 94 to expect fsimage + // rounded at 100, as 4 + 94 + 2 (catch-up call) = 100. + for (int i = 1; i <= 94; i++) { + fs2.mkdirs(new Path("/tmp-" + i)); + } + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + // Assert 100 transactions in checkpoint. + HATestUtil.waitForCheckpoint(cluster2, 1, ImmutableList.of(100)); + // Test to ensure number tracks the right state of + // uncheckpointed edits, and does not go negative + // (as fixed in HDFS-7501). + assertEquals("Should be zero right after the checkpoint.", + 0L, + cluster2.getNameNode(1).getNamesystem() + .getTransactionsSinceLastCheckpoint()); + fs2.mkdirs(new Path("/tmp-t3")); + fs2.mkdirs(new Path("/tmp-t4")); + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + // Test to ensure we track the right numbers after + // the checkpoint resets it to zero again. + assertEquals("SBN failed to track 2 added txns after the ckpt.", + 4L, + cluster2.getNameNode(1).getNamesystem() + .getTransactionsSinceLastCheckpoint()); + cluster2.shutdown(); + break; + } catch (Exception e) { + LOG.warn("Unable to set up HA cluster, exception thrown: " + e); + retryCount++; + } + } + } + /** * Test NN checkpoint and transaction-related metrics. */ @Test