SLIDER-830 sort target hosts by reliability before choosing
Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/6bcffb43 Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/6bcffb43 Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/6bcffb43 Branch: refs/heads/develop Commit: 6bcffb43155f4f6291f732a5a7ad402cedf604b8 Parents: b952b64 Author: Steve Loughran <[email protected]> Authored: Wed Mar 25 12:56:58 2015 +0000 Committer: Steve Loughran <[email protected]> Committed: Wed Mar 25 12:56:58 2015 +0000 ---------------------------------------------------------------------- .../org/apache/slider/api/ResourceKeys.java | 4 +++ .../apache/slider/providers/ProviderRole.java | 4 +-- .../server/appmaster/state/NodeInstance.java | 32 +++++++++++--------- .../server/appmaster/state/RoleHistory.java | 16 +++++++--- ...stRoleHistoryFindNodesForNewInstances.groovy | 22 ++++++++++++++ 5 files changed, 57 insertions(+), 21 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/6bcffb43/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java b/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java index 9066a52..f481c6a 100644 --- a/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java +++ b/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java @@ -140,6 +140,10 @@ public interface ResourceKeys { */ int DEFAULT_NODE_FAILURE_THRESHOLD = 3; + /** + * Failure threshold is unlimited: {@value} + */ + int NODE_FAILURE_THRESHOLD_UNLIMITED = -1; /** * Time in seconds to escalate placement delay http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/6bcffb43/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java 
---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java b/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java index 294fe89..3009f50 100644 --- a/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java +++ b/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java @@ -29,8 +29,8 @@ import org.apache.slider.api.ResourceKeys; public final class ProviderRole { public final String name; public final int id; - public final int placementPolicy; - public final int nodeFailureThreshold; + public int placementPolicy; + public int nodeFailureThreshold; public final long placementTimeoutSeconds; public ProviderRole(String name, int id) { http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/6bcffb43/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeInstance.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeInstance.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeInstance.java index ed039f9..fb80b5f 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeInstance.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeInstance.java @@ -198,6 +198,22 @@ public class NodeInstance { return hostname.hashCode(); } + + /** + * Predicate to query if the number of recent failures of a role + * on this node exceeds that role's failure threshold. + * If there is no record of a deployment of that role on this + * node, the failure count is taken as "0". + * @param role role to look up + * @return true if the failure rate is above the threshold. + */ + public boolean exceedsFailureThreshold(RoleStatus role) { + NodeEntry entry = get(role.getKey()); + int numFailuresOnLastHost = entry != null ? 
entry.getFailedRecently() : 0; + int failureThreshold = role.getNodeFailureThreshold(); + return failureThreshold < 0 || numFailuresOnLastHost > failureThreshold; + } + /** * A comparator for sorting entries where the node is preferred over another. * <p> @@ -218,20 +234,6 @@ public class NodeInstance { public int compare(NodeInstance o1, NodeInstance o2) { NodeEntry left = o1.get(role); NodeEntry right = o2.get(role); - -/* - // sort by failure count - int failL = left != null ? left.getFailedRecently() : -1; - int failR = right != null ? right.getFailedRecently() : -1; - - if (failL < failR) { - return 1; - } - if (failR > failL) { - return -1; - } - */ - // failure counts are equal: compare age long ageL = left != null ? left.getLastUsed() : 0; long ageR = right != null ? right.getLastUsed() : 0; @@ -244,7 +246,7 @@ public class NodeInstance { return 0; } } - + /** * A comparator for sorting entries where the role is newer than * the other. http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/6bcffb43/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java index 9ab40bd..98cf4e4 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java @@ -485,10 +485,11 @@ public class RoleHistory { // no data locality policy return null; } - int roleKey = role.getKey(); + int roleId = role.getKey(); + boolean strictPlacement = role.isStrictPlacement(); NodeInstance nodeInstance = null; // get the list of possible targets - List<NodeInstance> targets = getNodesForRoleId(roleKey); + List<NodeInstance> targets = getNodesForRoleId(roleId); if (targets == null) { // add 
an empty list here for ease downstream targets = new ArrayList<>(0); @@ -498,8 +499,15 @@ public class RoleHistory { // spin until there's a candidate while (!targets.isEmpty() && nodeInstance == null) { NodeInstance head = targets.remove(0); - if (head.getActiveRoleInstances(roleKey) == 0) { - nodeInstance = head; + if (head.getActiveRoleInstances(roleId) == 0) { + // no active instances: check failure statistics + if (strictPlacement || !head.exceedsFailureThreshold(role)) { + nodeInstance = head; + } else { + // too many failures for this node + log.info("Recent node failures is higher than threshold {}. Not requesting host {}", + role.getNodeFailureThreshold(), head.hostname); + } } } if (nodeInstance == null) { http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/6bcffb43/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy index 79cd348..b29a0b5 100644 --- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy +++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy @@ -23,6 +23,7 @@ import groovy.util.logging.Slf4j import org.apache.slider.providers.ProviderRole import org.apache.slider.server.appmaster.model.mock.BaseMockAppStateTest import org.apache.slider.server.appmaster.model.mock.MockFactory +import org.apache.slider.server.appmaster.state.NodeEntry import org.apache.slider.server.appmaster.state.NodeInstance import org.apache.slider.server.appmaster.state.RoleHistory import 
org.apache.slider.server.appmaster.state.RoleStatus @@ -94,6 +95,7 @@ class TestRoleHistoryFindNodesForNewInstances extends BaseMockAppStateTest { assert [age2Active0, age3Active0].contains(found2) assert found != found2; } + @Test public void testFind3NodeR0ReturnsNull() throws Throwable { assert 2== findNodes(2).size() @@ -125,4 +127,24 @@ class TestRoleHistoryFindNodesForNewInstances extends BaseMockAppStateTest { log.info(found ?.toFullString()) assert found == null } + @Test + public void testFindNodesSkipsFailingNode() throws Throwable { + // fail two containers on age2Active0, then expect age3Active0 to be chosen instead + + def entry0 = age2Active0.get(0) + entry0.containerCompleted(false) + assert entry0.failed + assert entry0.failedRecently + entry0.containerCompleted(false) + assert !age2Active0.exceedsFailureThreshold(roleStat) + // set failure threshold to 1 + roleStat.providerRole.nodeFailureThreshold = 1 + // threshold is now exceeded + assert age2Active0.exceedsFailureThreshold(roleStat) + + // get the role & expect age3 to be picked up, even though it is older + NodeInstance found = roleHistory.findNodeForNewInstance(roleStat) + assert age3Active0.is(found) + } + }
