SLIDER-830 sort target hosts by reliability before choosing

Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/6bcffb43
Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/6bcffb43
Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/6bcffb43

Branch: refs/heads/develop
Commit: 6bcffb43155f4f6291f732a5a7ad402cedf604b8
Parents: b952b64
Author: Steve Loughran <[email protected]>
Authored: Wed Mar 25 12:56:58 2015 +0000
Committer: Steve Loughran <[email protected]>
Committed: Wed Mar 25 12:56:58 2015 +0000

----------------------------------------------------------------------
 .../org/apache/slider/api/ResourceKeys.java     |  4 +++
 .../apache/slider/providers/ProviderRole.java   |  4 +--
 .../server/appmaster/state/NodeInstance.java    | 32 +++++++++++---------
 .../server/appmaster/state/RoleHistory.java     | 16 +++++++---
 ...stRoleHistoryFindNodesForNewInstances.groovy | 22 ++++++++++++++
 5 files changed, 57 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/6bcffb43/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java b/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
index 9066a52..f481c6a 100644
--- a/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
+++ b/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
@@ -140,6 +140,10 @@ public interface ResourceKeys {
    */
   int DEFAULT_NODE_FAILURE_THRESHOLD = 3;
 
+  /**
+   * Failure threshold is unlimited: {@value}
+   */
+  int NODE_FAILURE_THRESHOLD_UNLIMITED = -1;
 
   /**
    * Time in seconds to escalate placement delay

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/6bcffb43/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java b/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
index 294fe89..3009f50 100644
--- a/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
+++ b/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
@@ -29,8 +29,8 @@ import org.apache.slider.api.ResourceKeys;
 public final class ProviderRole {
   public final String name;
   public final int id;
-  public final int placementPolicy;
-  public final int nodeFailureThreshold;
+  public int placementPolicy;
+  public int nodeFailureThreshold;
   public final long placementTimeoutSeconds;
 
   public ProviderRole(String name, int id) {

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/6bcffb43/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeInstance.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeInstance.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeInstance.java
index ed039f9..fb80b5f 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeInstance.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeInstance.java
@@ -198,6 +198,22 @@ public class NodeInstance {
     return hostname.hashCode();
   }
 
+
+  /**
+   * Predicate to query if the number of recent failures of a role
+   * on this node exceeds that role's failure threshold. A node with no
+   * entry for the role counts as 0 failures; a negative threshold
+   * is treated as "unlimited", so it can never be exceeded.
+   * @param role role to look up
+   * @return true if the failure count is above a non-negative threshold.
+   */
+  public boolean exceedsFailureThreshold(RoleStatus role) {
+    NodeEntry entry = get(role.getKey());
+    int numFailuresOnLastHost = entry != null ? entry.getFailedRecently() : 0;
+    int failureThreshold = role.getNodeFailureThreshold();
+    return failureThreshold >= 0 && numFailuresOnLastHost > failureThreshold;
+  }
+
   /**
    * A comparator for sorting entries where the node is preferred over another.
    * <p>
@@ -218,20 +234,6 @@ public class NodeInstance {
     public int compare(NodeInstance o1, NodeInstance o2) {
       NodeEntry left = o1.get(role);
       NodeEntry right = o2.get(role);
-
-/*      
-      // sort by failure count 
-      int failL = left != null ? left.getFailedRecently() : -1;
-      int failR = right != null ? right.getFailedRecently() : -1;
-      
-      if (failL < failR) {
-        return 1;
-      }
-      if (failR > failL) {
-        return -1;
-      }
-    */  
-      // failure counts are equal: compare age
       long ageL = left != null ? left.getLastUsed() : 0;
       long ageR = right != null ? right.getLastUsed() : 0;
       
@@ -244,7 +246,7 @@ public class NodeInstance {
       return 0;
     }
   }
-  
+
   /**
    * A comparator for sorting entries where the role is newer than
    * the other. 

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/6bcffb43/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
index 9ab40bd..98cf4e4 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
@@ -485,10 +485,11 @@ public class RoleHistory {
       // no data locality policy
       return null;
     }
-    int roleKey = role.getKey();
+    int roleId = role.getKey();
+    boolean strictPlacement = role.isStrictPlacement();
     NodeInstance nodeInstance = null;
     // get the list of possible targets
-    List<NodeInstance> targets = getNodesForRoleId(roleKey);
+    List<NodeInstance> targets = getNodesForRoleId(roleId);
     if (targets == null) {
       // add an empty list here for ease downstream
       targets = new ArrayList<>(0);
@@ -498,8 +499,15 @@ public class RoleHistory {
     // spin until there's a candidate
     while (!targets.isEmpty() && nodeInstance == null) {
       NodeInstance head = targets.remove(0);
-      if (head.getActiveRoleInstances(roleKey) == 0) {
-        nodeInstance = head;
+      if (head.getActiveRoleInstances(roleId) == 0) {
+        // no active instances: check failure statistics
+        if (strictPlacement || !head.exceedsFailureThreshold(role)) {
+          nodeInstance = head;
+        } else {
+          // too many failures for this node
+          log.info("Recent node failures is higher than threshold {}. Not 
requesting host {}",
+              role.getNodeFailureThreshold(), head.hostname);
+        }
       }
     }
     if (nodeInstance == null) {

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/6bcffb43/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy
index 79cd348..b29a0b5 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy
@@ -23,6 +23,7 @@ import groovy.util.logging.Slf4j
 import org.apache.slider.providers.ProviderRole
 import org.apache.slider.server.appmaster.model.mock.BaseMockAppStateTest
 import org.apache.slider.server.appmaster.model.mock.MockFactory
+import org.apache.slider.server.appmaster.state.NodeEntry
 import org.apache.slider.server.appmaster.state.NodeInstance
 import org.apache.slider.server.appmaster.state.RoleHistory
 import org.apache.slider.server.appmaster.state.RoleStatus
@@ -94,6 +95,7 @@ class TestRoleHistoryFindNodesForNewInstances extends BaseMockAppStateTest {
     assert [age2Active0, age3Active0].contains(found2)
     assert found != found2;
   }
+
   @Test
   public void testFind3NodeR0ReturnsNull() throws Throwable {
     assert 2== findNodes(2).size()
@@ -125,4 +127,24 @@ class TestRoleHistoryFindNodesForNewInstances extends BaseMockAppStateTest {
     log.info(found ?.toFullString())
     assert found == null
   }
+  @Test
+  public void testFindNodesSkipsFailingNode() throws Throwable {
+    // fail two containers on age2Active0; the current threshold is not yet exceeded
+
+    def entry0 = age2Active0.get(0)
+    entry0.containerCompleted(false)
+    assert entry0.failed
+    assert entry0.failedRecently
+    entry0.containerCompleted(false)
+    assert !age2Active0.exceedsFailureThreshold(roleStat)
+    // lower the role's node failure threshold to 1
+    roleStat.providerRole.nodeFailureThreshold = 1
+    // threshold is now exceeded
+    assert age2Active0.exceedsFailureThreshold(roleStat)
+
+    // find a node & expect age3Active0 to be picked over the failing age2Active0, even though it is older
+    NodeInstance found = roleHistory.findNodeForNewInstance(roleStat)
+    assert age3Active0.is(found)
+  }
+
 }

Reply via email to