Repository: incubator-slider
Updated Branches:
  refs/heads/develop bc7073e75 -> 4995e98fe


SLIDER-1194 If an app fails due to "Too many recent failures" - provide the list of containers which counted towards this


Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/4995e98f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/4995e98f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/4995e98f

Branch: refs/heads/develop
Commit: 4995e98fe6e2cfbac6386b7be43e7cb06c10a24c
Parents: bc7073e
Author: Gour Saha <gourks...@apache.org>
Authored: Thu Feb 9 20:00:19 2017 -0800
Committer: Gour Saha <gourks...@apache.org>
Committed: Thu Feb 9 20:00:19 2017 -0800

----------------------------------------------------------------------
 .../api/types/ApplicationDiagnostics.java       | 18 +++++++++++
 .../slider/core/exceptions/ErrorStrings.java    |  1 +
 .../server/appmaster/SliderAppMaster.java       |  3 +-
 .../slider/server/appmaster/state/AppState.java | 34 +++++++++++---------
 .../server/appmaster/state/RoleStatus.java      | 21 +++++++++---
 .../TestMockAppStateContainerFailure.groovy     |  8 ++---
 .../appmaster/web/view/TestIndexBlock.groovy    |  4 +--
 7 files changed, 63 insertions(+), 26 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/4995e98f/slider-core/src/main/java/org/apache/slider/api/types/ApplicationDiagnostics.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/api/types/ApplicationDiagnostics.java b/slider-core/src/main/java/org/apache/slider/api/types/ApplicationDiagnostics.java
index 2b563e2..c28c11b 100644
--- a/slider-core/src/main/java/org/apache/slider/api/types/ApplicationDiagnostics.java
+++ b/slider-core/src/main/java/org/apache/slider/api/types/ApplicationDiagnostics.java
@@ -47,6 +47,7 @@ public class ApplicationDiagnostics {
   private FinalApplicationStatus finalStatus;
   private String finalMessage;
   private Set<ContainerInformation> containers = new HashSet<>();
+  private Set<String> recentFailedContainers = new HashSet<>();
 
   public Collection<ContainerInformation> getContainers() {
     return Collections.unmodifiableCollection(containers);
@@ -64,6 +65,23 @@ public class ApplicationDiagnostics {
     containers.add(container);
   }
 
+  public Collection<String> getRecentFailedContainers() {
+    return Collections.unmodifiableCollection(recentFailedContainers);
+  }
+
+  public void setRecentFailedContainers(Collection<String> containerIds) {
+    if (containerIds != null) {
+      recentFailedContainers = new HashSet<>(containerIds);
+    }
+  }
+
+  public void addRecentFailedContainer(String containerId) {
+    if (containerId == null) {
+      return;
+    }
+    recentFailedContainers.add(containerId);
+  }
+
   public FinalApplicationStatus getFinalStatus() {
     return finalStatus;
   }

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/4995e98f/slider-core/src/main/java/org/apache/slider/core/exceptions/ErrorStrings.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/core/exceptions/ErrorStrings.java b/slider-core/src/main/java/org/apache/slider/core/exceptions/ErrorStrings.java
index 8b04969..38e7c5c 100644
--- a/slider-core/src/main/java/org/apache/slider/core/exceptions/ErrorStrings.java
+++ b/slider-core/src/main/java/org/apache/slider/core/exceptions/ErrorStrings.java
@@ -54,4 +54,5 @@ public interface ErrorStrings {
     "Both application image path and home dir have been provided";
   String E_CONFIGURATION_DIRECTORY_NOT_FOUND =
     "Configuration directory \"%s\" not found";
+  String E_MISSING_DIAGNOSTICS_FROM_YARN = "Container failure info not available from Yarn";
 }

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/4995e98f/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
index c6f99c6..3d63254 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
@@ -1606,7 +1606,8 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
     launchService.stop();
 
     //now release all containers
-    releaseAllContainers(finalMessage);
+    String containerReleaseMessage = "Application stop triggered";
+    releaseAllContainers(containerReleaseMessage);
 
     // When the application completes, it should send a finish application
     // signal to the RM

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/4995e98f/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
index 4700a66..5d588a5 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
@@ -22,6 +22,8 @@ import com.codahale.metrics.Metric;
 import com.codahale.metrics.MetricRegistry;
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
+
+import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.yarn.api.records.Container;
@@ -1540,7 +1542,7 @@ public class AppState {
     if (null != instance) {
       RoleStatus roleStatus = lookupRoleStatus(instance.roleId);
       instance.diagnostics = text;
-      roleStatus.noteFailed(true, text, ContainerOutcome.Failed);
+      roleStatus.noteFailed(true, text, ContainerOutcome.Failed, containerId);
       getFailedContainers().put(containerId, instance);
       roleHistory.onNodeManagerContainerStartFailed(instance.container);
     }
@@ -1707,7 +1709,8 @@ public class AppState {
           } else {
             message = String.format("Failure %s (%d)", containerId, exitStatus);
           }
-          roleStatus.noteFailed(shortLived, message, result.outcome);
+          roleStatus.noteFailed(shortLived, message, result.outcome,
+              containerId);
           long failed = roleStatus.getFailed();
           log.info("Current count of failed role[{}] {} =  {}",
               roleId, rolename, failed);
@@ -2027,6 +2030,10 @@ public class AppState {
     }
 
     if (threshold > 0 && failures > threshold) {
+      // populate recent failed containers
+      for (ContainerId cId : role.getFailedContainers()) {
+        getApplicationDiagnostics().addRecentFailedContainer(cId.toString());
+      }
       throw new TriggerClusterTeardownException(
         SliderExitCodes.EXIT_DEPLOYMENT_FAILED,
           FinalApplicationStatus.FAILED, ErrorStrings.E_UNSTABLE_CLUSTER +
@@ -2035,7 +2042,7 @@ public class AppState {
           role.getName(),
         role.getFailed(),
         role.getStartFailed(),
-          threshold,
+        threshold,
         role.getFailureMessage());
     }
   }
@@ -2358,17 +2365,10 @@ public class AppState {
       ContainerId id = possible.getId();
       if (!instance.released) {
         String url = getLogsURLForContainer(possible);
-        // Add the completed container log link (overwrites log link for live
-        // container). Mark container stopped as well.
-        ContainerInformation ci = getApplicationDiagnostics()
-            .getContainer(id.toString());
-        if (ci != null) {
-          ci.logLink = url;
-          ci.state = StateValues.STATE_STOPPED;
-          ci.exitCode = ContainerExitStatus.SUCCESS;
-          ci.diagnostics = releaseMessage;
-          ci.completionTime = containerCompletionTime;
-        }
+        // Store container diagnostics on release
+        storeContainerDiagnostics(id.toString(), ContainerExitStatus.SUCCESS,
+            releaseMessage, StateValues.STATE_STOPPED, url,
+            containerCompletionTime);
         log.info("Releasing container. Log: " + url);
         try {
           containerReleaseSubmitted(possible);
@@ -2587,7 +2587,11 @@ public class AppState {
         .getContainer(containerId);
     if (containerInfo != null) {
       containerInfo.exitCode = exitCode;
-      containerInfo.diagnostics = diagnostics;
+      if (StringUtils.isNotBlank(diagnostics)) {
+        containerInfo.diagnostics = diagnostics;
+      } else {
+        containerInfo.diagnostics = ErrorStrings.E_MISSING_DIAGNOSTICS_FROM_YARN;
+      }
       containerInfo.state = state;
       if (logLink != null) {
         containerInfo.logLink = logLink;

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/4995e98f/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
index 0a3a3c9..07a5cf9 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
@@ -21,6 +21,7 @@ package org.apache.slider.server.appmaster.state;
 import com.codahale.metrics.Metric;
 import com.codahale.metrics.MetricSet;
 import com.google.common.base.Preconditions;
+import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.slider.api.types.ComponentInformation;
 import org.apache.slider.api.types.RoleStatistics;
@@ -30,9 +31,12 @@ import org.apache.slider.server.appmaster.management.BoolMetricPredicate;
 import org.apache.slider.server.appmaster.management.LongGauge;
 
 import java.io.Serializable;
+import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Map;
+import java.util.Set;
 
 /**
  * Models the ongoing status of all nodes in an application.
@@ -72,12 +76,11 @@ public final class RoleStatus implements Cloneable, MetricSet {
   /** resource requirements */
   private Resource resourceRequirements;
 
-
   /** any pending AA request */
   private volatile OutstandingRequest outstandingAArequest = null;
 
-
   private String failureMessage = "";
+  private final Set<ContainerId> failedContainers = new HashSet<>();
 
   public RoleStatus(ProviderRole providerRole) {
     this.providerRole = providerRole;
@@ -239,7 +242,9 @@ public final class RoleStatus implements Cloneable, MetricSet {
    * Reset the recent failure
    * @return the number of failures in the "recent" window
    */
-  public long resetFailedRecently() {
+  public synchronized long resetFailedRecently() {
+    // clear failedContainers
+    failedContainers.clear();
     return failedRecently.getAndSet(0);
   }
 
@@ -276,10 +281,13 @@ public final class RoleStatus implements Cloneable, MetricSet {
    * @param outcome outcome of the container
    */
   public synchronized void noteFailed(boolean startupFailure, String text,
-      ContainerOutcome outcome) {
+      ContainerOutcome outcome, ContainerId containerId) {
     if (text != null) {
       failureMessage = text;
     }
+    if (containerId != null) {
+      failedContainers.add(containerId);
+    }
     switch (outcome) {
       case Preempted:
         preempted.incrementAndGet();
@@ -317,6 +325,10 @@ public final class RoleStatus implements Cloneable, MetricSet {
     return failureMessage;
   }
 
+  public synchronized Set<ContainerId> getFailedContainers() {
+    return Collections.unmodifiableSet(failedContainers);
+  }
+
   public long getCompleted() {
     return completed.get();
   }
@@ -441,6 +453,7 @@ public final class RoleStatus implements Cloneable, MetricSet {
     }
     sb.append(", failureMessage='").append(failureMessage).append('\'');
     sb.append(", providerRole=").append(providerRole);
+    sb.append(", failedContainers=").append(failedContainers);
     sb.append('}');
     return sb.toString();
   }

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/4995e98f/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy
index f1a4027..f6314b0 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy
@@ -247,7 +247,7 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest
   public void testRoleStatusFailed() throws Throwable {
     def status = role0Status
     // limits exceeded
-    status.noteFailed(false, "text",ContainerOutcome.Failed)
+    status.noteFailed(false, "text",ContainerOutcome.Failed, null)
     assert 1 == status.failed
     assert 1L == status.failedRecently
     assert 0L == status.limitsExceeded
@@ -264,7 +264,7 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest
   public void testRoleStatusFailedLimitsExceeded() throws Throwable {
     def status = role0Status
     // limits exceeded
-    status.noteFailed(false, "text",ContainerOutcome.Failed_limits_exceeded)
+    status.noteFailed(false, "text",ContainerOutcome.Failed_limits_exceeded, null)
     assert 1 == status.failed
     assert 1L == status.failedRecently
     assert 1L == status.limitsExceeded
@@ -283,7 +283,7 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest
   public void testRoleStatusFailedPrempted() throws Throwable {
     def status = role0Status
     // limits exceeded
-    status.noteFailed(false, "text", ContainerOutcome.Preempted)
+    status.noteFailed(false, "text", ContainerOutcome.Preempted, null)
     assert 0 == status.failed
     assert 1L == status.preempted
     assert 0L == status.failedRecently
@@ -299,7 +299,7 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest
   public void testRoleStatusFailedNode() throws Throwable {
     def status = role0Status
     // limits exceeded
-    status.noteFailed(false, "text", ContainerOutcome.Node_failure)
+    status.noteFailed(false, "text", ContainerOutcome.Node_failure, null)
     assert 1 == status.failed
     assert 0L == status.failedRecently
     assert 0L == status.limitsExceeded

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/4995e98f/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy
index a818e53..014a850 100644
--- 
a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy
+++ 
b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy
@@ -101,8 +101,8 @@ public class TestIndexBlock extends BaseMockAppStateAATest {
 
     def role0_failures = 2
 
-    role0.noteFailed(false, "", ContainerOutcome.Failed)
-    role0.noteFailed(true,  "", ContainerOutcome.Failed)
+    role0.noteFailed(false, "", ContainerOutcome.Failed, null)
+    role0.noteFailed(true,  "", ContainerOutcome.Failed, null)
 
     // all aa roles fields are in the
     def aarole_desired = 200

Reply via email to