This is an automated email from the ASF dual-hosted git repository.
inigoiri pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/hadoop.git
The following commit(s) were added to refs/heads/trunk by this push:
new dee6dc2 YARN-10893. Adding metrics for getClusterMetrics and
getApplications APIs in FederationClientInterceptor (#3325)
dee6dc2 is described below
commit dee6dc2f89609bd35a18e40188103671a38efb89
Author: Akshat Bordia <[email protected]>
AuthorDate: Thu Sep 9 21:50:57 2021 +0530
YARN-10893. Adding metrics for getClusterMetrics and getApplications APIs
in FederationClientInterceptor (#3325)
---
.../hadoop/yarn/server/router/RouterMetrics.java | 33 +++++++++++++++++
.../clientrm/FederationClientInterceptor.java | 35 ++++++++++++++----
.../yarn/server/router/TestRouterMetrics.java | 42 ++++++++++++++++++++++
3 files changed, 104 insertions(+), 6 deletions(-)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/RouterMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/RouterMetrics.java
index 24fdbb90..37e7419 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/RouterMetrics.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/RouterMetrics.java
@@ -53,6 +53,8 @@ public final class RouterMetrics {
private MutableGaugeInt numMultipleAppsFailedRetrieved;
@Metric("# of applicationAttempt reports failed to be retrieved")
private MutableGaugeInt numAppAttemptsFailedRetrieved;
+ @Metric("# of getClusterMetrics failed to be retrieved")
+ private MutableGaugeInt numGetClusterMetricsFailedRetrieved;
// Aggregate metrics are shared, and don't have to be looked up per call
@Metric("Total number of successful Submitted apps and latency(ms)")
@@ -69,6 +71,9 @@ public final class RouterMetrics {
@Metric("Total number of successful Retrieved " +
"appAttempt reports and latency(ms)")
private MutableRate totalSucceededAppAttemptsRetrieved;
+ @Metric("Total number of successful Retrieved getClusterMetrics and "
+ + "latency(ms)")
+ private MutableRate totalSucceededGetClusterMetricsRetrieved;
/**
@@ -80,6 +85,7 @@ public final class RouterMetrics {
private MutableQuantiles getApplicationReportLatency;
private MutableQuantiles getApplicationsReportLatency;
private MutableQuantiles getApplicationAttemptReportLatency;
+ private MutableQuantiles getClusterMetricsLatency;
private static volatile RouterMetrics INSTANCE = null;
private static MetricsRegistry registry;
@@ -103,6 +109,9 @@ public final class RouterMetrics {
registry.newQuantiles("getApplicationAttemptReportLatency",
"latency of get applicationattempt " +
"report", "ops", "latency", 10);
+ getClusterMetricsLatency =
+ registry.newQuantiles("getClusterMetricsLatency",
+ "latency of get cluster metrics", "ops", "latency", 10);
}
public static RouterMetrics getMetrics() {
@@ -155,6 +164,11 @@ public final class RouterMetrics {
}
@VisibleForTesting
+ public long getNumSucceededGetClusterMetricsRetrieved(){
+ return totalSucceededGetClusterMetricsRetrieved.lastStat().numSamples();
+ }
+
+ @VisibleForTesting
public double getLatencySucceededAppsCreated() {
return totalSucceededAppsCreated.lastStat().mean();
}
@@ -185,6 +199,11 @@ public final class RouterMetrics {
}
@VisibleForTesting
+ public double getLatencySucceededGetClusterMetricsRetrieved() {
+ return totalSucceededGetClusterMetricsRetrieved.lastStat().mean();
+ }
+
+ @VisibleForTesting
public int getAppsFailedCreated() {
return numAppsFailedCreated.value();
}
@@ -214,6 +233,11 @@ public final class RouterMetrics {
return numMultipleAppsFailedRetrieved.value();
}
+ @VisibleForTesting
+ public int getClusterMetricsFailedRetrieved() {
+ return numGetClusterMetricsFailedRetrieved.value();
+ }
+
public void succeededAppsCreated(long duration) {
totalSucceededAppsCreated.add(duration);
getNewApplicationLatency.add(duration);
@@ -244,6 +268,11 @@ public final class RouterMetrics {
getApplicationAttemptReportLatency.add(duration);
}
+ public void succeededGetClusterMetricsRetrieved(long duration) {
+ totalSucceededGetClusterMetricsRetrieved.add(duration);
+ getClusterMetricsLatency.add(duration);
+ }
+
public void incrAppsFailedCreated() {
numAppsFailedCreated.incr();
}
@@ -268,4 +297,8 @@ public final class RouterMetrics {
numAppAttemptsFailedRetrieved.incr();
}
+ public void incrGetClusterMetricsFailedRetrieved() {
+ numGetClusterMetricsFailedRetrieved.incr();
+ }
+
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/clientrm/FederationClientInterceptor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/clientrm/FederationClientInterceptor.java
index 08636bb..391bc1c 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/clientrm/FederationClientInterceptor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/clientrm/FederationClientInterceptor.java
@@ -628,18 +628,29 @@ public class FederationClientInterceptor
public GetApplicationsResponse getApplications(GetApplicationsRequest request)
throws YarnException, IOException {
if (request == null) {
+ routerMetrics.incrMultipleAppsFailedRetrieved();
RouterServerUtil.logAndThrowException(
"Missing getApplications request.",
null);
}
+ long startTime = clock.getTime();
Map<SubClusterId, SubClusterInfo> subclusters =
federationFacade.getSubClusters(true);
ClientMethod remoteMethod = new ClientMethod("getApplications",
new Class[] {GetApplicationsRequest.class}, new Object[] {request});
- Map<SubClusterId, GetApplicationsResponse> applications =
- invokeConcurrent(subclusters.keySet(), remoteMethod,
- GetApplicationsResponse.class);
+ Map<SubClusterId, GetApplicationsResponse> applications;
+
+ try {
+ applications = invokeConcurrent(subclusters.keySet(), remoteMethod,
+ GetApplicationsResponse.class);
+ } catch (Exception ex) {
+ routerMetrics.incrMultipleAppsFailedRetrieved();
+ LOG.error("Unable to get applications due to exception.", ex);
+ throw ex;
+ }
+ long stopTime = clock.getTime();
+ routerMetrics.succeededMultipleAppsRetrieved(stopTime - startTime);
// Merge the Application Reports
return RouterYarnClientUtils.mergeApplications(applications.values(),
returnPartialReport);
@@ -648,14 +659,26 @@ public class FederationClientInterceptor
@Override
public GetClusterMetricsResponse getClusterMetrics(
GetClusterMetricsRequest request) throws YarnException, IOException {
+ long startTime = clock.getTime();
Map<SubClusterId, SubClusterInfo> subclusters =
federationFacade.getSubClusters(true);
ClientMethod remoteMethod = new ClientMethod("getClusterMetrics",
new Class[] {GetClusterMetricsRequest.class}, new Object[] {request});
ArrayList<SubClusterId> clusterList = new ArrayList<>(subclusters.keySet());
- Map<SubClusterId, GetClusterMetricsResponse> clusterMetrics =
- invokeConcurrent(clusterList, remoteMethod,
- GetClusterMetricsResponse.class);
+ Map<SubClusterId, GetClusterMetricsResponse> clusterMetrics;
+
+ try {
+ clusterMetrics = invokeConcurrent(clusterList, remoteMethod,
+ GetClusterMetricsResponse.class);
+
+ } catch (Exception ex) {
+ routerMetrics.incrGetClusterMetricsFailedRetrieved();
+ LOG.error("Unable to get cluster metrics due to exception.", ex);
+ throw ex;
+ }
+
+ long stopTime = clock.getTime();
+ routerMetrics.succeededGetClusterMetricsRetrieved(stopTime - startTime);
return RouterYarnClientUtils.merge(clusterMetrics.values());
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/TestRouterMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/TestRouterMetrics.java
index 1456a42..4d4838a 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/TestRouterMetrics.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/TestRouterMetrics.java
@@ -279,6 +279,37 @@ public class TestRouterMetrics {
metrics.getMultipleAppsFailedRetrieved());
}
+ /**
+ * This test validates the correctness of the metric: Retrieved getClusterMetrics
+ * multiple times successfully.
+ */
+ @Test
+ public void testSucceededGetClusterMetrics() {
+ long totalGoodBefore = metrics.getNumSucceededGetClusterMetricsRetrieved();
+ goodSubCluster.getClusterMetrics(100);
+ Assert.assertEquals(totalGoodBefore + 1,
+ metrics.getNumSucceededGetClusterMetricsRetrieved());
+ Assert.assertEquals(100, metrics.getLatencySucceededGetClusterMetricsRetrieved(),
+ 0);
+ goodSubCluster.getClusterMetrics(200);
+ Assert.assertEquals(totalGoodBefore + 2,
+ metrics.getNumSucceededGetClusterMetricsRetrieved());
+ Assert.assertEquals(150, metrics.getLatencySucceededGetClusterMetricsRetrieved(),
+ 0);
+ }
+
+ /**
+ * This test validates the correctness of the metric: Failed to
+ * retrieve getClusterMetrics.
+ */
+ @Test
+ public void testGetClusterMetricsFailed() {
+ long totalBadbefore = metrics.getClusterMetricsFailedRetrieved();
+ badSubCluster.getClusterMetrics();
+ Assert.assertEquals(totalBadbefore + 1,
+ metrics.getClusterMetricsFailedRetrieved());
+ }
+
// Records failures for all calls
private class MockBadSubCluster {
public void getNewApplication() {
@@ -310,6 +341,11 @@ public class TestRouterMetrics {
LOG.info("Mocked: failed getApplicationsReport call");
metrics.incrMultipleAppsFailedRetrieved();
}
+
+ public void getClusterMetrics() {
+ LOG.info("Mocked: failed getClusterMetrics call");
+ metrics.incrGetClusterMetricsFailedRetrieved();
+ }
}
// Records successes for all calls
@@ -350,5 +386,11 @@ public class TestRouterMetrics {
duration);
metrics.succeededMultipleAppsRetrieved(duration);
}
+
+ public void getClusterMetrics(long duration){
+ LOG.info("Mocked: successful getClusterMetrics call with duration {}",
+ duration);
+ metrics.succeededGetClusterMetricsRetrieved(duration);
+ }
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]