SLIDER-832 scanning for available nodes to keep failed nodes on list - just skip them. Tests to show this & strict placement logic
Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/fc7f7364 Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/fc7f7364 Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/fc7f7364 Branch: refs/heads/develop Commit: fc7f7364e732c86127e28d0853bf5e4d3e4d6bc9 Parents: 63e2b80 Author: Steve Loughran <[email protected]> Authored: Thu Mar 26 15:11:36 2015 +0000 Committer: Steve Loughran <[email protected]> Committed: Thu Mar 26 15:11:36 2015 +0000 ---------------------------------------------------------------------- .../server/appmaster/state/RoleHistory.java | 26 ++++---- .../TestRoleHistoryRequestTracking.groovy | 64 ++++++++++++++++---- .../model/mock/BaseMockAppStateTest.groovy | 17 +++++- .../appmaster/model/mock/MockFactory.groovy | 2 +- 4 files changed, 85 insertions(+), 24 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/fc7f7364/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java index 98cf4e4..c2a741c 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java @@ -488,30 +488,34 @@ public class RoleHistory { int roleId = role.getKey(); boolean strictPlacement = role.isStrictPlacement(); NodeInstance nodeInstance = null; - // get the list of possible targets + // Get the list of possible targets. 
+ // This is a live list: changes here are preserved List<NodeInstance> targets = getNodesForRoleId(roleId); if (targets == null) { - // add an empty list here for ease downstream - targets = new ArrayList<>(0); + // nothing to allocate on + return null; } + int cnt = targets.size(); log.debug("There are {} node(s) to consider for {}", cnt, role.getName()); - // spin until there's a candidate - while (!targets.isEmpty() && nodeInstance == null) { - NodeInstance head = targets.remove(0); - if (head.getActiveRoleInstances(roleId) == 0) { + for (int i = 0; i < cnt && nodeInstance == null; i++) { + NodeInstance candidate = targets.get(i); + if (candidate.getActiveRoleInstances(roleId) == 0) { // no active instances: check failure statistics - if (strictPlacement || !head.exceedsFailureThreshold(role)) { - nodeInstance = head; + if (strictPlacement || !candidate.exceedsFailureThreshold(role)) { + targets.remove(i); + // exit criteria for loop is now met + nodeInstance = candidate; } else { // too many failures for this node log.info("Recent node failures is higher than threshold {}. 
Not requesting host {}", - role.getNodeFailureThreshold(), head.hostname); + role.getNodeFailureThreshold(), candidate.hostname); } } } + if (nodeInstance == null) { - log.info("No historical node found for {}", role.getName()); + log.info("No node found for {}", role.getName()); } return nodeInstance; } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/fc7f7364/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy index 82750a3..9847992 100644 --- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy +++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy @@ -18,14 +18,17 @@ package org.apache.slider.server.appmaster.model.history +import groovy.util.logging.Slf4j import org.apache.hadoop.yarn.api.records.Container import org.apache.hadoop.yarn.api.records.Resource import org.apache.hadoop.yarn.client.api.AMRMClient +import org.apache.slider.providers.PlacementPolicy import org.apache.slider.providers.ProviderRole import org.apache.slider.server.appmaster.model.mock.BaseMockAppStateTest import org.apache.slider.server.appmaster.model.mock.MockContainer import org.apache.slider.server.appmaster.model.mock.MockFactory import org.apache.slider.server.appmaster.state.ContainerAllocationOutcome +import org.apache.slider.server.appmaster.state.NodeEntry import org.apache.slider.server.appmaster.state.NodeInstance import org.apache.slider.server.appmaster.state.OutstandingRequest import org.apache.slider.server.appmaster.state.RoleHistory @@ -37,15 +40,16 @@ import 
org.junit.Test * Test the RH availability list and request tracking: that hosts * get removed and added */ +@Slf4j class TestRoleHistoryRequestTracking extends BaseMockAppStateTest { String roleName = "test" NodeInstance age1Active4 = nodeInstance(1, 4, 0, 0) NodeInstance age2Active2 = nodeInstance(2, 2, 0, 1) + NodeInstance age2Active0 = nodeInstance(2, 0, 0, 0) NodeInstance age3Active0 = nodeInstance(3, 0, 0, 0) NodeInstance age4Active1 = nodeInstance(4, 1, 0, 0) - NodeInstance age2Active0 = nodeInstance(2, 0, 0, 0) NodeInstance empty = new NodeInstance("empty", MockFactory.ROLE_COUNT) List<NodeInstance> nodes = [age2Active2, age2Active0, age4Active1, age1Active4, age3Active0] @@ -87,24 +91,34 @@ class TestRoleHistoryRequestTracking extends BaseMockAppStateTest { @Test public void testRequestedNodeOffListWithFailures() throws Throwable { + assert 0 == roleStatus.key + assert !roleHistory.cloneAvailableList(0).isEmpty() + + NodeEntry age3role0 = recordAsFailed(age3Active0, 0, 4) + assert age3Active0.isConsideredUnreliable(0, roleStatus.nodeFailureThreshold) + recordAsFailed(age2Active0, 0, 4) + assert age2Active0.isConsideredUnreliable(0, roleStatus.nodeFailureThreshold) + // expect to get a null node back NodeInstance ni = roleHistory.findNodeForNewInstance(roleStatus) - assert age3Active0 == ni + assert !ni + + // which is translated to a no-location request AMRMClient.ContainerRequest req = roleHistory.requestInstanceOnNode(ni, roleStatus, resource, "") - assert 1 == req.nodes.size() - List<NodeInstance> a2 = roleHistory.cloneAvailableList(0) - assertListEquals([age2Active0], a2) - age3Active0.get(0).failedRecently = 4 - req = roleHistory.requestInstanceOnNode(ni, - roleStatus, - resource, - "") assertNull(req.nodes) - age3Active0.get(0).failedRecently = 0 + log.info "resetting failure count" + age3role0.resetFailedRecently() + roleHistory.dump() + assert 0 == age3role0.failedRecently + assert !age3Active0.isConsideredUnreliable(0, 
roleStatus.nodeFailureThreshold) + assert !roleHistory.cloneAvailableList(0).isEmpty() + // looking for a node should now find one + ni = roleHistory.findNodeForNewInstance(roleStatus) + assert ni == age3Active0 req = roleHistory.requestInstanceOnNode(ni, roleStatus, resource, @@ -113,6 +127,34 @@ class TestRoleHistoryRequestTracking extends BaseMockAppStateTest { } @Test + public void testStrictPlacementIgnoresFailures() throws Throwable { + + def targetRole = role1Status + final ProviderRole providerRole1 = targetRole.providerRole + assert providerRole1.placementPolicy == PlacementPolicy.STRICT + int key = targetRole.key + + recordAsFailed(age1Active4, key, 4) + recordAsFailed(age2Active0, key, 4) + recordAsFailed(age2Active2, key, 4) + recordAsFailed(age3Active0, key, 4) + recordAsFailed(age4Active1, key, 4) + + // trigger a list rebuild + roleHistory.buildAvailableNodeLists(); + + assert !roleHistory.cloneAvailableList(key).isEmpty() + + + NodeInstance ni = roleHistory.findNodeForNewInstance(targetRole) + assert ni == age4Active1!= null + // next lookup returns next node + ni = roleHistory.findNodeForNewInstance(roleStatus) + assert ni == age3Active0 + } + + + @Test public void testFindAndRequestNode() throws Throwable { AMRMClient.ContainerRequest req = roleHistory.requestNode(roleStatus, resource) http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/fc7f7364/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy index f30fce6..3e5494f 100644 --- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy +++ 
b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy @@ -35,6 +35,7 @@ import org.apache.slider.core.main.LauncherExitCodes import org.apache.slider.server.appmaster.operations.AbstractRMOperation import org.apache.slider.server.appmaster.state.AppState import org.apache.slider.server.appmaster.state.ContainerAssignment +import org.apache.slider.server.appmaster.state.NodeEntry import org.apache.slider.server.appmaster.state.NodeInstance import org.apache.slider.server.appmaster.state.RoleInstance import org.apache.slider.server.appmaster.state.RoleStatus @@ -147,7 +148,7 @@ abstract class BaseMockAppStateTest extends SliderTestBase implements MockRoles public NodeInstance nodeInstance(long age, int live0, int live1=0, int live2=0) { - NodeInstance ni = new NodeInstance("age${age}live[${live0},${live1},$live2]", + NodeInstance ni = new NodeInstance("age${age}-[${live0},${live1},$live2]", MockFactory.ROLE_COUNT) ni.getOrCreate(0).lastUsed = age ni.getOrCreate(0).live = live0; @@ -333,4 +334,18 @@ abstract class BaseMockAppStateTest extends SliderTestBase implements MockRoles return cids } + /** + * Record a node as failing + * @param node + * @param id + * @param count + * @return the entry + */ + public NodeEntry recordAsFailed(NodeInstance node, int id, int count) { + def entry = node.getOrCreate(id) + 1.upto(count) { + entry.containerCompleted(false) + } + entry + } } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/fc7f7364/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/MockFactory.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/MockFactory.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/MockFactory.groovy index 06bc10c..fca3376 100644 --- 
a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/MockFactory.groovy +++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/MockFactory.groovy @@ -184,7 +184,7 @@ class MockFactory implements MockRoles { def roleMap(int count) { return [ - (ResourceKeys.COMPONENT_INSTANCES):count.toString(), + (ResourceKeys.COMPONENT_INSTANCES): count.toString(), ] }
