HDFS-13641. Add metrics for edit log tailing. Contributed by Chao Sun.

Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/8e7548d3
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/8e7548d3
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/8e7548d3

Branch: refs/heads/HDDS-48
Commit: 8e7548d33be9c4874daab18b2e774bdc2ed216d3
Parents: 6307962
Author: Yiqun Lin <yq...@apache.org>
Authored: Wed Jun 13 20:05:55 2018 +0800
Committer: Yiqun Lin <yq...@apache.org>
Committed: Wed Jun 13 20:05:55 2018 +0800

----------------------------------------------------------------------
 .../hadoop-common/src/site/markdown/Metrics.md  | 12 ++++
 .../org/apache/hadoop/test/MetricsAsserts.java  | 22 ++++++--
 .../hdfs/server/namenode/ha/EditLogTailer.java  | 11 ++++
 .../namenode/metrics/NameNodeMetrics.java       | 59 +++++++++++++++++++-
 .../namenode/metrics/TestNameNodeMetrics.java   | 41 ++++++++++++++
 5 files changed, 140 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/8e7548d3/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
----------------------------------------------------------------------
diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md 
b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
index 2538491..676ab0b 100644
--- a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
@@ -181,6 +181,18 @@ Each metrics record contains tags such as ProcessName, 
SessionId, and Hostname a
 | `WarmUpEDEKTimeAvgTime` | Average time of warming up EDEK in milliseconds |
 | `ResourceCheckTime`*num*`s(50/75/90/95/99)thPercentileLatency` | The 
50/75/90/95/99th percentile of NameNode resource check latency in milliseconds. 
Percentile measurement is off by default, by watching no intervals. The 
intervals are specified by `dfs.metrics.percentiles.intervals`. |
 | `StorageBlockReport`*num*`s(50/75/90/95/99)thPercentileLatency` | The 
50/75/90/95/99th percentile of storage block report latency in milliseconds. 
Percentile measurement is off by default, by watching no intervals. The 
intervals are specified by `dfs.metrics.percentiles.intervals`. |
+| `EditLogTailTimeNumOps` | Total number of times the standby NameNode tailed 
the edit log |
+| `EditLogTailTimeAvgTime` | Average time (in milliseconds) spent by standby 
NameNode in tailing edit log |
+| `EditLogTailTime`*num*`s(50/75/90/95/99)thPercentileLatency` | The 
50/75/90/95/99th percentile of time spent in tailing edit logs by standby 
NameNode, in milliseconds. Percentile measurement is off by default, by 
watching no intervals. The intervals are specified by 
`dfs.metrics.percentiles.intervals`. |
+| `EditLogFetchTimeNumOps` | Total number of times the standby NameNode 
fetched remote edit streams from journal nodes |
+| `EditLogFetchTimeAvgTime` | Average time (in milliseconds) spent by standby 
NameNode in fetching remote edit streams from journal nodes |
+| `EditLogFetchTime`*num*`s(50/75/90/95/99)thPercentileLatency` | The 
50/75/90/95/99th percentile of time spent in fetching edit streams from journal 
nodes by standby NameNode, in milliseconds. Percentile measurement is off by 
default, by watching no intervals. The intervals are specified by 
`dfs.metrics.percentiles.intervals`. |
+| `NumEditLogLoadedNumOps` | Total number of times edits were loaded by 
standby NameNode |
+| `NumEditLogLoadedAvgCount` | Average number of edits loaded by standby 
NameNode in each edit log tailing |
+| `NumEditLogLoaded`*num*`s(50/75/90/95/99)thPercentileCount` | The 
50/75/90/95/99th percentile of number of edits loaded by standby NameNode in 
each edit log tailing. Percentile measurement is off by default, by watching no 
intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
+| `EditLogTailIntervalNumOps` | Total number of intervals between edit log 
tailings by standby NameNode |
+| `EditLogTailIntervalAvgTime` | Average time of intervals between edit log 
tailings by standby NameNode in milliseconds |
+| `EditLogTailInterval`*num*`s(50/75/90/95/99)thPercentileLatency` | The 
50/75/90/95/99th percentile of time between edit log tailings by standby 
NameNode, in milliseconds. Percentile measurement is off by default, by 
watching no intervals. The intervals are specified by 
`dfs.metrics.percentiles.intervals`. |
 
 FSNamesystem
 ------------

http://git-wip-us.apache.org/repos/asf/hadoop/blob/8e7548d3/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/MetricsAsserts.java
----------------------------------------------------------------------
diff --git 
a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/MetricsAsserts.java
 
b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/MetricsAsserts.java
index caa65c5..d384834 100644
--- 
a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/MetricsAsserts.java
+++ 
b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/MetricsAsserts.java
@@ -367,17 +367,31 @@ public class MetricsAsserts {
   }
   
   /**
-   * Asserts that the NumOps and quantiles for a metric have been changed at
-   * some point to a non-zero value.
+   * Asserts that the NumOps and quantiles for a metric with value name
+   * "Latency" have been changed at some point to a non-zero value.
    * 
    * @param prefix of the metric
    * @param rb MetricsRecordBuilder with the metric
    */
-  public static void assertQuantileGauges(String prefix, 
+  public static void assertQuantileGauges(String prefix,
       MetricsRecordBuilder rb) {
+    assertQuantileGauges(prefix, rb, "Latency");
+  }
+
+  /**
+   * Asserts that the NumOps and quantiles for a metric have been changed at
+   * some point to a non-zero value, for the specified value name of the
+   * metrics (e.g., "Latency", "Count").
+   *
+   * @param prefix of the metric
+   * @param rb MetricsRecordBuilder with the metric
+   * @param valueName the value name for the metric
+   */
+  public static void assertQuantileGauges(String prefix,
+      MetricsRecordBuilder rb, String valueName) {
     verify(rb).addGauge(eqName(info(prefix + "NumOps", "")), geq(0l));
     for (Quantile q : MutableQuantiles.quantiles) {
-      String nameTemplate = prefix + "%dthPercentileLatency";
+      String nameTemplate = prefix + "%dthPercentile" + valueName;
       int percentile = (int) (100 * q.quantile);
       verify(rb).addGauge(
           eqName(info(String.format(nameTemplate, percentile), "")),

http://git-wip-us.apache.org/repos/asf/hadoop/blob/8e7548d3/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/EditLogTailer.java
----------------------------------------------------------------------
diff --git 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/EditLogTailer.java
 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/EditLogTailer.java
index 73a111e..2003f94 100644
--- 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/EditLogTailer.java
+++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/EditLogTailer.java
@@ -62,6 +62,7 @@ import static org.apache.hadoop.util.ExitUtil.terminate;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
+import org.apache.hadoop.util.Time;
 
 
 /**
@@ -292,6 +293,7 @@ public class EditLogTailer {
         LOG.debug("lastTxnId: " + lastTxnId);
       }
       Collection<EditLogInputStream> streams;
+      long startTime = Time.monotonicNow();
       try {
         streams = editLog.selectInputStreams(lastTxnId + 1, 0,
             null, inProgressOk, true);
@@ -302,6 +304,9 @@ public class EditLogTailer {
         LOG.warn("Edits tailer failed to find any streams. Will try again " +
             "later.", ioe);
         return;
+      } finally {
+        NameNode.getNameNodeMetrics().addEditLogFetchTime(
+            Time.monotonicNow() - startTime);
       }
       if (LOG.isDebugEnabled()) {
         LOG.debug("edit streams to load from: " + streams.size());
@@ -322,6 +327,7 @@ public class EditLogTailer {
           LOG.debug(String.format("Loaded %d edits starting from txid %d ",
               editsLoaded, lastTxnId));
         }
+        NameNode.getNameNodeMetrics().addNumEditLogLoaded(editsLoaded);
       }
 
       if (editsLoaded > 0) {
@@ -446,10 +452,15 @@ public class EditLogTailer {
           // name system lock will be acquired to further block even the block
           // state updates.
           namesystem.cpLockInterruptibly();
+          long startTime = Time.monotonicNow();
           try {
+            NameNode.getNameNodeMetrics().addEditLogTailInterval(
+                startTime - lastLoadTimeMs);
             doTailEdits();
           } finally {
             namesystem.cpUnlock();
+            NameNode.getNameNodeMetrics().addEditLogTailTime(
+                Time.monotonicNow() - startTime);
           }
           //Update NameDirSize Metric
           namesystem.getFSImage().getStorage().updateNameDirSize();

http://git-wip-us.apache.org/repos/asf/hadoop/blob/8e7548d3/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java
----------------------------------------------------------------------
diff --git 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java
 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java
index 94c5e9e..5d43ebf 100644
--- 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java
+++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java
@@ -32,6 +32,7 @@ import org.apache.hadoop.metrics2.lib.MutableCounterLong;
 import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
 import org.apache.hadoop.metrics2.lib.MutableQuantiles;
 import org.apache.hadoop.metrics2.lib.MutableRate;
+import org.apache.hadoop.metrics2.lib.MutableStat;
 import org.apache.hadoop.metrics2.source.JvmMetrics;
 
 /**
@@ -134,6 +135,18 @@ public class NameNodeMetrics {
   @Metric("Time loading FS Image at startup in msec")
   MutableGaugeInt fsImageLoadTime;
 
+  @Metric("Time tailing edit logs in msec")
+  MutableRate editLogTailTime;
+  private final MutableQuantiles[] editLogTailTimeQuantiles;
+  @Metric MutableRate editLogFetchTime;
+  private final MutableQuantiles[] editLogFetchTimeQuantiles;
+  @Metric(value = "Number of edits loaded", valueName = "Count")
+  MutableStat numEditLogLoaded;
+  private final MutableQuantiles[] numEditLogLoadedQuantiles;
+  @Metric("Time between edit log tailing in msec")
+  MutableRate editLogTailInterval;
+  private final MutableQuantiles[] editLogTailIntervalQuantiles;
+
   @Metric("GetImageServlet getEdit")
   MutableRate getEdit;
   @Metric("GetImageServlet getImage")
@@ -156,7 +169,11 @@ public class NameNodeMetrics {
     generateEDEKTimeQuantiles = new MutableQuantiles[len];
     warmUpEDEKTimeQuantiles = new MutableQuantiles[len];
     resourceCheckTimeQuantiles = new MutableQuantiles[len];
-    
+    editLogTailTimeQuantiles = new MutableQuantiles[len];
+    editLogFetchTimeQuantiles = new MutableQuantiles[len];
+    numEditLogLoadedQuantiles = new MutableQuantiles[len];
+    editLogTailIntervalQuantiles = new MutableQuantiles[len];
+
     for (int i = 0; i < len; i++) {
       int interval = intervals[i];
       syncsQuantiles[i] = registry.newQuantiles(
@@ -181,6 +198,18 @@ public class NameNodeMetrics {
       resourceCheckTimeQuantiles[i] = registry.newQuantiles(
           "resourceCheckTime" + interval + "s",
           "resource check time", "ops", "latency", interval);
+      editLogTailTimeQuantiles[i] = registry.newQuantiles(
+          "editLogTailTime" + interval + "s",
+          "Edit log tailing time", "ops", "latency", interval);
+      editLogFetchTimeQuantiles[i] = registry.newQuantiles(
+          "editLogFetchTime" + interval + "s",
+          "Edit log fetch time", "ops", "latency", interval);
+      numEditLogLoadedQuantiles[i] = registry.newQuantiles(
+          "numEditLogLoaded" + interval + "s",
+          "Number of edits loaded", "ops", "count", interval);
+      editLogTailIntervalQuantiles[i] = registry.newQuantiles(
+          "editLogTailInterval" + interval + "s",
+          "Edit log tailing interval", "ops", "latency", interval);
     }
   }
 
@@ -389,4 +418,32 @@ public class NameNodeMetrics {
       q.add(latency);
     }
   }
+
+  public void addEditLogTailTime(long elapsed) {
+    editLogTailTime.add(elapsed);
+    for (MutableQuantiles q : editLogTailTimeQuantiles) {
+      q.add(elapsed);
+    }
+  }
+
+  public void addEditLogFetchTime(long elapsed) {
+    editLogFetchTime.add(elapsed);
+    for (MutableQuantiles q : editLogFetchTimeQuantiles) {
+      q.add(elapsed);
+    }
+  }
+
+  public void addNumEditLogLoaded(long loaded) {
+    numEditLogLoaded.add(loaded);
+    for (MutableQuantiles q : numEditLogLoadedQuantiles) {
+      q.add(loaded);
+    }
+  }
+
+  public void addEditLogTailInterval(long elapsed) {
+    editLogTailInterval.add(elapsed);
+    for (MutableQuantiles q : editLogTailIntervalQuantiles) {
+      q.add(elapsed);
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/8e7548d3/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
----------------------------------------------------------------------
diff --git 
a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
 
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
index 92bc51c..e34deea 100644
--- 
a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
+++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
@@ -32,6 +32,7 @@ import org.apache.hadoop.hdfs.client.HdfsAdmin;
 import static 
org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_DEFAULT;
 import static 
org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_KEY;
 import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
+import static org.apache.hadoop.test.MetricsAsserts.assertCounterGt;
 import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
 import static org.apache.hadoop.test.MetricsAsserts.assertQuantileGauges;
 import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
@@ -1059,4 +1060,44 @@ public class TestNameNodeMetrics {
       }
     }
   }
+
+  @Test
+  public void testEditLogTailing() throws Exception {
+    HdfsConfiguration conf = new HdfsConfiguration();
+    conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+    conf.setInt(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY, 60);
+    MiniDFSCluster dfsCluster = null;
+    try {
+      dfsCluster = new MiniDFSCluster.Builder(conf)
+          .numDataNodes(0)
+          .nnTopology(MiniDFSNNTopology.simpleHATopology())
+          .build();
+      DistributedFileSystem dfs = dfsCluster.getFileSystem(0);
+      dfsCluster.transitionToActive(0);
+      dfsCluster.waitActive();
+
+      Path testDir = new Path("/testdir");
+      dfs.mkdir(testDir, FsPermission.getDefault());
+
+      dfsCluster.getNameNodeRpc(0).rollEditLog();
+      Thread.sleep(2 * 1000);
+
+      // We need to get the metrics for the SBN (excluding the NN from dfs
+      // cluster created in setUp() and the ANN).
+      MetricsRecordBuilder rb = getMetrics(NN_METRICS+"-2");
+      assertQuantileGauges("EditLogTailTime60s", rb);
+      assertQuantileGauges("EditLogFetchTime60s", rb);
+      assertQuantileGauges("NumEditLogLoaded60s", rb, "Count");
+      assertQuantileGauges("EditLogTailInterval60s", rb);
+      assertCounterGt("EditLogTailTimeNumOps", 0L, rb);
+      assertCounterGt("EditLogFetchTimeNumOps", 0L, rb);
+      assertCounterGt("NumEditLogLoadedNumOps", 0L, rb);
+      assertCounterGt("EditLogTailIntervalNumOps", 0L, rb);
+    } finally {
+      if (dfsCluster != null) {
+        dfsCluster.shutdown();
+      }
+    }
+
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-commits-h...@hadoop.apache.org

Reply via email to