Repository: incubator-slider Updated Branches: refs/heads/develop bc7073e75 -> 4995e98fe
SLIDER-1194 If an app fails due to "Too many recent failures" - provide the list of containers which counted towards this Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/4995e98f Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/4995e98f Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/4995e98f Branch: refs/heads/develop Commit: 4995e98fe6e2cfbac6386b7be43e7cb06c10a24c Parents: bc7073e Author: Gour Saha <gourks...@apache.org> Authored: Thu Feb 9 20:00:19 2017 -0800 Committer: Gour Saha <gourks...@apache.org> Committed: Thu Feb 9 20:00:19 2017 -0800 ---------------------------------------------------------------------- .../api/types/ApplicationDiagnostics.java | 18 +++++++++++ .../slider/core/exceptions/ErrorStrings.java | 1 + .../server/appmaster/SliderAppMaster.java | 3 +- .../slider/server/appmaster/state/AppState.java | 34 +++++++++++--------- .../server/appmaster/state/RoleStatus.java | 21 +++++++++--- .../TestMockAppStateContainerFailure.groovy | 8 ++--- .../appmaster/web/view/TestIndexBlock.groovy | 4 +-- 7 files changed, 63 insertions(+), 26 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/4995e98f/slider-core/src/main/java/org/apache/slider/api/types/ApplicationDiagnostics.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/api/types/ApplicationDiagnostics.java b/slider-core/src/main/java/org/apache/slider/api/types/ApplicationDiagnostics.java index 2b563e2..c28c11b 100644 --- a/slider-core/src/main/java/org/apache/slider/api/types/ApplicationDiagnostics.java +++ b/slider-core/src/main/java/org/apache/slider/api/types/ApplicationDiagnostics.java @@ -47,6 +47,7 @@ public class ApplicationDiagnostics { private FinalApplicationStatus finalStatus; private String finalMessage; private Set<ContainerInformation> containers = new HashSet<>(); + private Set<String> recentFailedContainers = new HashSet<>(); public Collection<ContainerInformation> getContainers() { return Collections.unmodifiableCollection(containers); @@ -64,6 +65,23 @@ public class ApplicationDiagnostics { containers.add(container); } + public Collection<String> getRecentFailedContainers() { + return Collections.unmodifiableCollection(recentFailedContainers); + } + + public void setRecentFailedContainers(Collection<String> containerIds) { + if (containerIds != null) { + recentFailedContainers = new HashSet<>(containerIds); + } + } + + public void addRecentFailedContainer(String containerId) { + if (containerId == null) { + return; + } + recentFailedContainers.add(containerId); + } + public FinalApplicationStatus getFinalStatus() { return finalStatus; } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/4995e98f/slider-core/src/main/java/org/apache/slider/core/exceptions/ErrorStrings.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/core/exceptions/ErrorStrings.java b/slider-core/src/main/java/org/apache/slider/core/exceptions/ErrorStrings.java index 8b04969..38e7c5c 100644 --- a/slider-core/src/main/java/org/apache/slider/core/exceptions/ErrorStrings.java +++ b/slider-core/src/main/java/org/apache/slider/core/exceptions/ErrorStrings.java @@ -54,4 +54,5 @@ public interface ErrorStrings { "Both application image path and home dir have been provided"; String E_CONFIGURATION_DIRECTORY_NOT_FOUND = "Configuration directory \"%s\" not found"; + String E_MISSING_DIAGNOSTICS_FROM_YARN = "Container failure info not available from Yarn"; } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/4995e98f/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java index c6f99c6..3d63254 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java @@ -1606,7 +1606,8 @@ public class SliderAppMaster extends AbstractSliderLaunchedService launchService.stop(); //now release all containers - releaseAllContainers(finalMessage); + String containerReleaseMessage = "Application stop triggered"; + releaseAllContainers(containerReleaseMessage); // When the application completes, it should send a finish application // signal to the RM http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/4995e98f/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java index 4700a66..5d588a5 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java @@ -22,6 +22,8 @@ import com.codahale.metrics.Metric; import com.codahale.metrics.MetricRegistry; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; + +import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.yarn.api.records.Container; @@ -1540,7 +1542,7 @@ public class AppState { if (null != instance) { RoleStatus roleStatus = lookupRoleStatus(instance.roleId); instance.diagnostics = text; - roleStatus.noteFailed(true, text, ContainerOutcome.Failed); + roleStatus.noteFailed(true, text, ContainerOutcome.Failed, containerId); getFailedContainers().put(containerId, instance); roleHistory.onNodeManagerContainerStartFailed(instance.container); } @@ -1707,7 +1709,8 @@ public class AppState { } else { message = String.format("Failure %s (%d)", containerId, exitStatus); } - roleStatus.noteFailed(shortLived, message, result.outcome); + roleStatus.noteFailed(shortLived, message, result.outcome, + containerId); long failed = roleStatus.getFailed(); log.info("Current count of failed role[{}] {} = {}", roleId, rolename, failed); @@ -2027,6 +2030,10 @@ public class AppState { } if (threshold > 0 && failures > threshold) { + // populate recent failed containers + for (ContainerId cId : role.getFailedContainers()) { + getApplicationDiagnostics().addRecentFailedContainer(cId.toString()); + } throw new TriggerClusterTeardownException( SliderExitCodes.EXIT_DEPLOYMENT_FAILED, FinalApplicationStatus.FAILED, ErrorStrings.E_UNSTABLE_CLUSTER + @@ -2035,7 +2042,7 @@ public class AppState { role.getName(), role.getFailed(), role.getStartFailed(), - threshold, + threshold, role.getFailureMessage()); } } @@ -2358,17 +2365,10 @@ public class AppState { ContainerId id = possible.getId(); if (!instance.released) { String url = getLogsURLForContainer(possible); - // Add the completed container log link (overwrites log link for live - // container). Mark container stopped as well. - ContainerInformation ci = getApplicationDiagnostics() - .getContainer(id.toString()); - if (ci != null) { - ci.logLink = url; - ci.state = StateValues.STATE_STOPPED; - ci.exitCode = ContainerExitStatus.SUCCESS; - ci.diagnostics = releaseMessage; - ci.completionTime = containerCompletionTime; - } + // Store container diagnostics on release + storeContainerDiagnostics(id.toString(), ContainerExitStatus.SUCCESS, + releaseMessage, StateValues.STATE_STOPPED, url, + containerCompletionTime); log.info("Releasing container. Log: " + url); try { containerReleaseSubmitted(possible); @@ -2587,7 +2587,11 @@ public class AppState { .getContainer(containerId); if (containerInfo != null) { containerInfo.exitCode = exitCode; - containerInfo.diagnostics = diagnostics; + if (StringUtils.isNotBlank(diagnostics)) { + containerInfo.diagnostics = diagnostics; + } else { + containerInfo.diagnostics = ErrorStrings.E_MISSING_DIAGNOSTICS_FROM_YARN; + } containerInfo.state = state; if (logLink != null) { containerInfo.logLink = logLink; http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/4995e98f/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java index 0a3a3c9..07a5cf9 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java @@ -21,6 +21,7 @@ package org.apache.slider.server.appmaster.state; import com.codahale.metrics.Metric; import com.codahale.metrics.MetricSet; import com.google.common.base.Preconditions; +import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.slider.api.types.ComponentInformation; import org.apache.slider.api.types.RoleStatistics; @@ -30,9 +31,12 @@ import org.apache.slider.server.appmaster.management.BoolMetricPredicate; import org.apache.slider.server.appmaster.management.LongGauge; import java.io.Serializable; +import java.util.Collections; import java.util.Comparator; import java.util.HashMap; +import java.util.HashSet; import java.util.Map; +import java.util.Set; /** * Models the ongoing status of all nodes in an application. @@ -72,12 +76,11 @@ public final class RoleStatus implements Cloneable, MetricSet { /** resource requirements */ private Resource resourceRequirements; - /** any pending AA request */ private volatile OutstandingRequest outstandingAArequest = null; - private String failureMessage = ""; + private final Set<ContainerId> failedContainers = new HashSet<>(); public RoleStatus(ProviderRole providerRole) { this.providerRole = providerRole; @@ -239,7 +242,9 @@ public final class RoleStatus implements Cloneable, MetricSet { * Reset the recent failure * @return the number of failures in the "recent" window */ - public long resetFailedRecently() { + public synchronized long resetFailedRecently() { + // clear failedContainers + failedContainers.clear(); return failedRecently.getAndSet(0); } @@ -276,10 +281,13 @@ public final class RoleStatus implements Cloneable, MetricSet { * @param outcome outcome of the container */ public synchronized void noteFailed(boolean startupFailure, String text, - ContainerOutcome outcome) { + ContainerOutcome outcome, ContainerId containerId) { if (text != null) { failureMessage = text; } + if (containerId != null) { + failedContainers.add(containerId); + } switch (outcome) { case Preempted: preempted.incrementAndGet(); @@ -317,6 +325,10 @@ public final class RoleStatus implements Cloneable, MetricSet { return failureMessage; } + public synchronized Set<ContainerId> getFailedContainers() { + return Collections.unmodifiableSet(failedContainers); + } + public long getCompleted() { return completed.get(); } @@ -441,6 +453,7 @@ public final class RoleStatus implements Cloneable, MetricSet { } sb.append(", failureMessage='").append(failureMessage).append('\''); sb.append(", providerRole=").append(providerRole); + sb.append(", failedContainers=").append(failedContainers); sb.append('}'); return sb.toString(); } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/4995e98f/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy index f1a4027..f6314b0 100644 --- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy +++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy @@ -247,7 +247,7 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest public void testRoleStatusFailed() throws Throwable { def status = role0Status // limits exceeded - status.noteFailed(false, "text",ContainerOutcome.Failed) + status.noteFailed(false, "text",ContainerOutcome.Failed, null) assert 1 == status.failed assert 1L == status.failedRecently assert 0L == status.limitsExceeded @@ -264,7 +264,7 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest public void testRoleStatusFailedLimitsExceeded() throws Throwable { def status = role0Status // limits exceeded - status.noteFailed(false, "text",ContainerOutcome.Failed_limits_exceeded) + status.noteFailed(false, "text",ContainerOutcome.Failed_limits_exceeded, null) assert 1 == status.failed assert 1L == status.failedRecently assert 1L == status.limitsExceeded @@ -283,7 +283,7 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest public void testRoleStatusFailedPrempted() throws Throwable { def status = role0Status // limits exceeded - status.noteFailed(false, "text", ContainerOutcome.Preempted) + status.noteFailed(false, "text", ContainerOutcome.Preempted, null) assert 0 == status.failed assert 1L == status.preempted assert 0L == status.failedRecently @@ -299,7 +299,7 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest public void testRoleStatusFailedNode() throws Throwable { def status = role0Status // limits exceeded - status.noteFailed(false, "text", ContainerOutcome.Node_failure) + status.noteFailed(false, "text", ContainerOutcome.Node_failure, null) assert 1 == status.failed assert 0L == status.failedRecently assert 0L == status.limitsExceeded http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/4995e98f/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy index a818e53..014a850 100644 --- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy +++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy @@ -101,8 +101,8 @@ public class TestIndexBlock extends BaseMockAppStateAATest { def role0_failures = 2 - role0.noteFailed(false, "", ContainerOutcome.Failed) - role0.noteFailed(true, "", ContainerOutcome.Failed) + role0.noteFailed(false, "", ContainerOutcome.Failed, null) + role0.noteFailed(true, "", ContainerOutcome.Failed, null) // all aa roles fields are in the def aarole_desired = 200