Repository: incubator-slider
Updated Branches:
  refs/heads/develop 46396410b -> ef5954ded
SLIDER-1233 Lost nodes should not contribute to container failures


Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/ef5954de
Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/ef5954de
Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/ef5954de

Branch: refs/heads/develop
Commit: ef5954dedf4f8503d2104987ffc061cb99e906f4
Parents: 4639641
Author: Billie Rinaldi <bil...@apache.org>
Authored: Tue Jul 25 14:23:09 2017 -0700
Committer: Billie Rinaldi <bil...@apache.org>
Committed: Tue Jul 25 14:23:09 2017 -0700

----------------------------------------------------------------------
 .../server/appmaster/state/RoleStatus.java  |  4 +++
 .../TestMockAppStateContainerFailure.groovy | 35 ++++++++++++++++++++
 2 files changed, 39 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/ef5954de/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
index 07a5cf9..694f5cf 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
@@ -289,6 +289,10 @@ public final class RoleStatus implements Cloneable, MetricSet {
       failedContainers.add(containerId);
     }
     switch (outcome) {
+      case Completed:
+        // don't increment failure counts
+        break;
+
       case Preempted:
         preempted.incrementAndGet();
         break;


http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/ef5954de/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy
index f6314b0..87095ad 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy
@@ -20,6 +20,7 @@ package org.apache.slider.server.appmaster.model.appstate
 
 import groovy.transform.CompileStatic
 import groovy.util.logging.Slf4j
+import org.apache.hadoop.yarn.api.records.ContainerExitStatus
 import org.apache.hadoop.yarn.api.records.ContainerId
 import org.apache.slider.api.ResourceKeys
 import org.apache.slider.core.conf.AggregateConf
@@ -216,6 +217,28 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest
   }
 
   @Test
+  public void testRecurrentNodeLost() throws Throwable {
+    role0Status.desired = 1
+    try {
+      for (int i = 0; i < 100; i++) {
+        List<RoleInstance> instances = createAndSubmitNodes()
+        assert instances.size() == 1
+
+        List<ContainerId> ids = extractContainerIds(instances, 0)
+
+        ContainerId cid = ids[0]
+        log.info("$i instance $instances[0] $cid")
+        assert cid
+        AppState.NodeCompletionResult result = appState.onCompletedNode(containerStatus(cid, ContainerExitStatus.ABORTED))
+        assert result.containerFailed
+      }
+    } catch (TriggerClusterTeardownException teardown) {
+      log.info("Exception $teardown.exitCode : $teardown")
+      fail("Cluster failed despite aborted/killed container status")
+    }
+  }
+
+  @Test
   public void testRoleStatusFailureWindow() throws Throwable {
 
     ResetFailureWindow resetter = new ResetFailureWindow(operationHandler);
@@ -308,6 +331,18 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest
   }
 
   @Test
+  public void testRoleStatusCompleted() throws Throwable {
+    def status = role0Status
+    // aborted or killed
+    status.noteFailed(false, "text", ContainerOutcome.Completed, null)
+    assert 0 == status.failed
+    assert 0L == status.failedRecently
+    assert 0L == status.limitsExceeded
+    assert 0L == status.preempted
+    assert 0L == status.nodeFailed
+  }
+
+  @Test
   public void testNodeEntryCompleted() throws Throwable {
     NodeEntry nodeEntry = new NodeEntry(1)
     nodeEntry.containerCompleted(true, ContainerOutcome.Completed);