Repository: incubator-slider
Updated Branches:
  refs/heads/develop 5696c7de3 -> 0f436c865


SLIDER-1246 Application health should not be affected by faulty nodes (health 
monitor based on percent threshold)


Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/0f436c86
Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/0f436c86
Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/0f436c86

Branch: refs/heads/develop
Commit: 0f436c865a90aba5b427d1c0571183c6fcbded1e
Parents: 5696c7d
Author: Gour Saha <gourks...@apache.org>
Authored: Sun Oct 1 22:15:07 2017 -0700
Committer: Gour Saha <gourks...@apache.org>
Committed: Sun Oct 1 22:15:07 2017 -0700

----------------------------------------------------------------------
 .../org/apache/slider/api/ResourceKeys.java     |  46 ++++++
 .../slider/core/conf/ConfTreeOperations.java    |  15 ++
 .../apache/slider/providers/ProviderRole.java   |   4 +-
 .../server/appmaster/SliderAppMaster.java       |  77 +++++++++-
 .../actions/MonitorHealthThreshold.java         | 146 +++++++++++++++++++
 .../slider/server/appmaster/state/AppState.java | 103 ++++++++++++-
 .../server/appmaster/state/RoleStatus.java      |  12 ++
 7 files changed, 398 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java b/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
index 295f7cd..29ef8ea 100644
--- a/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
+++ b/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
@@ -159,6 +159,52 @@ public interface ResourceKeys {
   int DEFAULT_CONTAINER_FAILURE_THRESHOLD = 5;
 
   /**
+   * The container health threshold when explicitly set for a specific role or
+   * globally for all roles, will schedule a health check monitor to
+   * periodically check for the percentage of healthy containers. It runs the
+   * check at a specified/default poll frequency. It allows a role to be below
+   * the health threshold for a specified/default window after which it
+   * considers the application to be unhealthy and triggers an app stop.
+   */
+  String CONTAINER_HEALTH_THRESHOLD_PERCENT =
+      "yarn.container.health.threshold.percent";
+  /**
+   * Health check monitor poll frequency. It is an advanced setting and does not
+   * need to be set unless the app owner understands the implication and does
+   * not want the default.
+   */
+  String CONTAINER_HEALTH_THRESHOLD_POLL_FREQUENCY_SEC =
+      "yarn.container.health.threshold.poll.frequency.secs";
+  /**
+   * The amount of time the health check monitor allows a specific role to be
+   * below the health threshold after which it considers the app to be
+   * unhealthy.
+   */
+  String CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC =
+      "yarn.container.health.threshold.window.secs";
+  /**
+   * The amount of initial time the health check monitor waits before the first
+   * check kicks in. It gives a lead time for the app containers to come up for
+   * the first time.
+   */
+  String CONTAINER_HEALTH_THRESHOLD_INIT_DELAY_SEC =
+      "yarn.container.health.threshold.init.delay.secs";
+  /**
+   * By default the health threshold percent does not come into play until it is
+   * explicitly set in resource config for a specific role or globally for all
+   * roles. -1 signifies disabled.
+   */
+  int CONTAINER_HEALTH_THRESHOLD_PERCENT_DISABLED = -1;
+
+  int DEFAULT_CONTAINER_HEALTH_THRESHOLD_PERCENT =
+      CONTAINER_HEALTH_THRESHOLD_PERCENT_DISABLED;
+  long DEFAULT_CONTAINER_HEALTH_THRESHOLD_POLL_FREQUENCY_SEC = 10;
+  long DEFAULT_CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC = 600;
+  // the default for init delay is same as default health window
+  long DEFAULT_CONTAINER_HEALTH_THRESHOLD_INIT_DELAY_SEC =
+      DEFAULT_CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC;
+
+  /**
    * Default node failure threshold for a component instance: {@value}
    * Should to be lower than default component failure threshold to allow
    * the component to start elsewhere

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/core/conf/ConfTreeOperations.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/core/conf/ConfTreeOperations.java b/slider-core/src/main/java/org/apache/slider/core/conf/ConfTreeOperations.java
index 526e17d..c8a7720 100644
--- a/slider-core/src/main/java/org/apache/slider/core/conf/ConfTreeOperations.java
+++ b/slider-core/src/main/java/org/apache/slider/core/conf/ConfTreeOperations.java
@@ -459,6 +459,21 @@ public class ConfTreeOperations {
   }
 
   /**
+   * Get a component opt; use {@link Long#decode(String)} so as to take hex
+   * and octal values too.
+   *
+   * @param name component name
+   * @param option option name
+   * @param defVal default value
+   * @return parsed value
+   * @throws NumberFormatException if the value could not be parsed.
+   */
+  public long getComponentOptLong(String name, String option, long defVal) {
+    String val = getComponentOpt(name, option, Long.toString(defVal));
+    return Long.decode(val);
+  }
+
+  /**
    * Get a component opt as a boolean using {@link Boolean#valueOf(String)}.
    *
    * @param name component name

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java b/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
index 4f6be52..4105b67 100644
--- a/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
+++ b/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
@@ -76,8 +76,8 @@ public final class ProviderRole {
    * @param group role/component group
    * @param id ID. This becomes the YARN priority
    * @param policy placement policy
-   * @param nodeFailureThreshold threshold for node failures (within a reset interval)
-   * after which a node failure is considered an app failure
+   * @param nodeFailureThreshold threshold for node failures (within a reset
+   * interval) after which a node failure is considered an app failure
    * @param placementTimeoutSeconds for lax placement, timeout in seconds before
    * @param labelExpression label expression for requests; may be null
    */

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
index 3f47b98..c12fae8 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
@@ -138,6 +138,7 @@ import org.apache.slider.server.appmaster.actions.QueueService;
 import org.apache.slider.server.appmaster.actions.ActionStopSlider;
 import org.apache.slider.server.appmaster.actions.ActionUpgradeContainers;
 import org.apache.slider.server.appmaster.actions.AsyncAction;
+import org.apache.slider.server.appmaster.actions.MonitorHealthThreshold;
 import org.apache.slider.server.appmaster.actions.RenewingAction;
 import org.apache.slider.server.appmaster.actions.ResetFailureWindow;
 import org.apache.slider.server.appmaster.actions.ReviewAndFlexApplicationSize;
@@ -991,6 +992,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
 
     scheduleFailureWindowResets(instanceDefinition.getResources());
     scheduleEscalation(instanceDefinition.getInternal());
+    scheduleHealthThresholdMonitor(instanceDefinition.getResources());
 
     try {
       // schedule YARN Registry registration
@@ -1902,6 +1904,79 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
   }
 
   /**
+   * Schedule the health threshold monitor for all roles (except AM)
+   *
+   * @param resources
+   *          the resource tree
+   */
+  private void scheduleHealthThresholdMonitor(ConfTree resources) {
+    ConfTreeOperations ops = new ConfTreeOperations(resources);
+    for (String roleGroup : ops.getComponentNames()) {
+      if (roleGroup.equals(SliderKeys.COMPONENT_AM)) {
+        continue;
+      }
+      // determine health threshold percent
+      int healthThresholdPercent = appState
+          .getHealthThresholdPercentForRole(roleGroup);
+      // validations
+      if (healthThresholdPercent ==
+            ResourceKeys.CONTAINER_HEALTH_THRESHOLD_PERCENT_DISABLED) {
+        log.info("No health threshold monitor enabled for role {}", roleGroup);
+        continue;
+      }
+      // if threshold set to outside acceptable range then don't enable monitor
+      if (healthThresholdPercent <= 0 || healthThresholdPercent > 100) {
+        log.error(
+            "Invalid health threshold percent {}% for role {}. Monitor not "
+                + "enabled.",
+            healthThresholdPercent, roleGroup);
+        continue;
+      }
+      // determine the threshold properties
+      long window = ops.getComponentOptLong(roleGroup,
+          ResourceKeys.CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC,
+          ResourceKeys.DEFAULT_CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC);
+      long initDelay = ops.getComponentOptLong(roleGroup,
+          ResourceKeys.CONTAINER_HEALTH_THRESHOLD_INIT_DELAY_SEC,
+          ResourceKeys.DEFAULT_CONTAINER_HEALTH_THRESHOLD_INIT_DELAY_SEC);
+      long pollFrequency = ops.getComponentOptLong(roleGroup,
+          ResourceKeys.CONTAINER_HEALTH_THRESHOLD_POLL_FREQUENCY_SEC,
+          ResourceKeys.DEFAULT_CONTAINER_HEALTH_THRESHOLD_POLL_FREQUENCY_SEC);
+      // validations
+      if (window <= 0) {
+        log.error(
+            "Invalid health monitor window {} secs for role {}. Monitor not "
+                + "enabled.",
+            window, roleGroup);
+        continue;
+      }
+      if (initDelay < 0) {
+        log.error("Invalid health monitor init delay {} secs for role {}. "
+            + "Monitor not enabled.", initDelay, roleGroup);
+        continue;
+      }
+      if (pollFrequency <= 0) {
+        log.error("Invalid health monitor poll frequency {} secs for role {}. "
+            + "Monitor not enabled.", pollFrequency, roleGroup);
+        continue;
+      }
+      log.info(
+          "Scheduling the health threshold monitor for role {} with percent = "
+              + "{}%, window = {} secs, poll freq = {} secs, init-delay = {} "
+              + "secs",
+          roleGroup, healthThresholdPercent, window, pollFrequency, initDelay);
+      MonitorHealthThreshold monitor = new MonitorHealthThreshold(roleGroup,
+          healthThresholdPercent, window);
+      RenewingAction<MonitorHealthThreshold> renew = new RenewingAction<>(
+          monitor, initDelay, pollFrequency, TimeUnit.SECONDS, 0);
+      actionQueues.renewing("healthThresholdMonitor", renew);
+      // Mark that health threshold monitor is enabled for this role. Can be
+      // used to disable the failure threshold check.
+      appState.setHealthThresholdMonitorEnabled(roleGroup, true);
+    }
+  }
+
+  /**
    * Schedule the escalation action
    * @param internal
    * @throws BadConfigException
@@ -2078,7 +2153,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
   public void onError(Throwable e) {
     if (e instanceof InvalidResourceRequestException) {
       // stop the cluster
-      LOG_YARN.error("AMRMClientAsync.onError() received {}", e, e);
+      LOG_YARN.error("AMRMClientAsync.onError() received {}", e);
       ActionStopSlider stopSlider = new ActionStopSlider("stop",
           EXIT_EXCEPTION_THROWN, FinalApplicationStatus.FAILED,
           SliderUtils.extractFirstLine(e.getLocalizedMessage()));

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/MonitorHealthThreshold.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/MonitorHealthThreshold.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/MonitorHealthThreshold.java
new file mode 100644
index 0000000..cc6f13c
--- /dev/null
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/MonitorHealthThreshold.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.slider.server.appmaster.actions;
+
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
+import org.apache.slider.api.SliderExitReason;
+import org.apache.slider.core.main.LauncherExitCodes;
+import org.apache.slider.server.appmaster.SliderAppMaster;
+import org.apache.slider.server.appmaster.state.AppState;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Monitors at a regular interval if the container health for a specific role
+ * has dropped below a desired threshold.
+ */
+public class MonitorHealthThreshold extends AsyncAction {
+  protected static final Logger log = LoggerFactory
+      .getLogger(MonitorHealthThreshold.class);
+
+  private final String roleGroup;
+  private final int healthThresholdPercent;
+  private final long healthThresholdWindowSecs;
+  private final long healthThresholdWindowNanos;
+  private long firstOccurrenceTimestamp = 0;
+  // Sufficient logging happens when role health is below threshold. However,
+  // there has to be some logging when it is above threshold, otherwise app
+  // owners have no idea how the health is fluctuating. So let's log whenever
+  // there is a change in role health, thereby preventing excessive logging on
+  // every poll. 
+  private float prevRunningContainerFraction = 0;
+
+  public MonitorHealthThreshold(String roleGroup, int healthThresholdPercent,
+      long healthThresholdWindowSecs) {
+    super("MonitorHealthThreshold");
+    this.roleGroup = roleGroup;
+    this.healthThresholdPercent = healthThresholdPercent;
+    this.healthThresholdWindowSecs = healthThresholdWindowSecs;
+    this.healthThresholdWindowNanos = TimeUnit.NANOSECONDS
+        .convert(healthThresholdWindowSecs, TimeUnit.SECONDS);
+  }
+
+  @Override
+  public void execute(SliderAppMaster appMaster, QueueAccess queueService,
+      AppState appState) throws Exception {
+    log.debug("MonitorHealthThreshold execute method");
+    // Perform container health checks against desired threshold
+    synchronized (appMaster) {
+      long desiredContainerCount = appState.getDesiredContainerCount(roleGroup);
+      // if desired container count for this role is 0 then nothing to do
+      if (desiredContainerCount == 0) {
+        return;
+      }
+      long runningContainerCount = appState.getLiveContainerCount(roleGroup);
+      float thresholdFraction = (float) healthThresholdPercent / 100;
+      // no possibility of div by 0 since desiredContainerCount won't be 0 here
+      float runningContainerFraction = (float) runningContainerCount
+          / desiredContainerCount;
+      boolean healthChanged = false;
+      if (runningContainerFraction != prevRunningContainerFraction) {
+        prevRunningContainerFraction = runningContainerFraction;
+        healthChanged = true;
+      }
+      String runningContainerPercentStr = String.format("%.2f",
+          runningContainerFraction * 100);
+      // Check if the current running container percent is less than the
+      // threshold percent
+      if (runningContainerFraction < thresholdFraction) {
+        // Check if it is the first occurrence and if yes set the timestamp
+        long currentTimestamp = now();
+        if (firstOccurrenceTimestamp == 0) {
+          firstOccurrenceTimestamp = currentTimestamp;
+          log.info("Role {} is going below health threshold for the first time "
+              + "at ts = {}", roleGroup, firstOccurrenceTimestamp);
+        }
+        long elapsedTime = currentTimestamp - firstOccurrenceTimestamp;
+        long elapsedTimeSecs = TimeUnit.SECONDS.convert(elapsedTime,
+            TimeUnit.NANOSECONDS);
+        log.warn(
+            "Role = {}, Current health = {}%, is below Health threshold of {}% "
+                + "for {} secs (window = {} secs)",
+            roleGroup, runningContainerPercentStr, healthThresholdPercent,
+            elapsedTimeSecs, healthThresholdWindowSecs);
+        if (elapsedTime > healthThresholdWindowNanos) {
+          log.error(
+              "Role = {}, Current health = {}%, has been below health "
+                  + "threshold of {}% for {} secs (threshold window = {} secs)",
+              roleGroup, runningContainerPercentStr, healthThresholdPercent,
+              elapsedTimeSecs, healthThresholdWindowSecs);
+          // Trigger an app stop
+          ActionStopSlider stopSlider = new ActionStopSlider("stop",
+              LauncherExitCodes.EXIT_EXCEPTION_THROWN,
+              FinalApplicationStatus.FAILED,
+              String.format(
+                  "Application was killed because container health for role %s "
+                      + "was %s%% (threshold = %d%%) for %d secs (threshold "
+                      + "window = %d secs)",
+                  roleGroup, runningContainerPercentStr, healthThresholdPercent,
+                  elapsedTimeSecs, healthThresholdWindowSecs));
+          stopSlider.setExitReason(SliderExitReason.APP_ERROR);
+          appMaster.queue(stopSlider);
+        }
+      } else {
+        String logMsg = "Role = {}, Health threshold = {}%, Current health = "
+            + "{}% (Current Running count = {}, Desired count = {})";
+        if (healthChanged) {
+          log.info(logMsg, roleGroup, healthThresholdPercent,
+              runningContainerPercentStr, runningContainerCount,
+              desiredContainerCount);
+        } else {
+          log.debug(logMsg, roleGroup, healthThresholdPercent,
+              runningContainerPercentStr, runningContainerCount,
+              desiredContainerCount);
+        }
+        // The container health might have recovered above threshold after being
+        // below for less than the threshold window amount of time. So we need
+        // to reset firstOccurrenceTimestamp to 0.
+        if (firstOccurrenceTimestamp != 0) {
+          log.info(
+              "Role = {}, resetting first occurence to 0, since it recovered "
+                  + "above health threshold of {}%",
+              roleGroup, healthThresholdPercent);
+          firstOccurrenceTimestamp = 0;
+        }
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
index 9e56870..d33e92e 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
@@ -72,6 +72,7 @@ import org.apache.slider.core.persist.AggregateConfSerDeser;
 import org.apache.slider.core.persist.ConfTreeSerDeser;
 import org.apache.slider.providers.PlacementPolicy;
 import org.apache.slider.providers.ProviderRole;
+import org.apache.slider.server.appmaster.actions.MonitorHealthThreshold;
 import org.apache.slider.server.appmaster.management.LongGauge;
 import org.apache.slider.server.appmaster.management.MetricsAndMonitoring;
 import org.apache.slider.server.appmaster.management.MetricsConstants;
@@ -274,6 +275,12 @@ public class AppState {
   private final AtomicInteger completionOfUnknownContainerEvent =
     new AtomicInteger();
 
+  /**
+   * This simply keeps track of the current set of live container ids for all
+   * roles and is primarily used by the {@link MonitorHealthThreshold} class.
+   */
+  private final Map<String, Set<ContainerId>> currentLiveContainers =
+      new ConcurrentHashMap<>();
 
   /**
    * limits of container core numbers in this queue
@@ -293,6 +300,9 @@ public class AppState {
 
   private int failureThreshold = 10;
   private int nodeFailureThreshold = 3;
+  // health threshold is disabled by default
+  private int healthThresholdPercent =
+      CONTAINER_HEALTH_THRESHOLD_PERCENT_DISABLED;
 
   private static String logServerURL = "";
 
@@ -337,6 +347,69 @@ public class AppState {
   }
 
   /**
+   * Get the no of containers running for a specific role at the time of this
+   * API call. It includes owned containers, meaning containers which have been
+   * allocated even if the app is not completely deployed and/or started in the
+   * container.
+   */
+  public int getLiveContainerCount(String roleGroup) {
+    if (roleGroup == null) {
+      return 0;
+    }
+    Set<ContainerId> containers = currentLiveContainers.get(roleGroup);
+    log.debug("Current live containers = {} for role {}", containers, roleGroup);
+    return containers == null ? 0 : containers.size();
+  }
+
+  public long getDesiredContainerCount(String roleGroup)
+      throws BadConfigException {
+    return getDesiredInstanceCount(getResourcesSnapshot(), roleGroup);
+  }
+
+  public void setHealthThresholdMonitorEnabled(String roleGroup,
+      boolean enabled) {
+    for (RoleStatus rs : getRoleStatusMap().values()) {
+      if (rs.getGroup().equals(roleGroup)) {
+        rs.setHealthThresholdMonitorEnabled(enabled);
+      }
+    }
+  }
+
+  /**
+   * Add a new (or existing in which case it has no effect) container to the
+   * live container set.
+   */
+  public void addLiveContainer(String roleGroup, ContainerId cId) {
+    log.info("Adding live container {} to role {}", cId, roleGroup);
+    if (roleGroup == null) {
+      return;
+    }
+    if (!currentLiveContainers.containsKey(roleGroup)) {
+      // new role entry
+      currentLiveContainers.put(roleGroup, Collections
+          .newSetFromMap(new ConcurrentHashMap<ContainerId, Boolean>()));
+    }
+    currentLiveContainers.get(roleGroup).add(cId);
+  }
+
+  /**
+   * Remove an existing container from the live container set.
+   */
+  public void removeLiveContainer(String roleGroup, ContainerId cId) {
+    if (roleGroup == null) {
+      return;
+    }
+    if (currentLiveContainers.containsKey(roleGroup)) {
+      log.info("Removing live container {} from role {}", cId, roleGroup);
+      currentLiveContainers.get(roleGroup).remove(cId);
+    } else {
+      log.warn(
+          "Nothing to remove as role {} does not exist in currentLiveContainers",
+          roleGroup);
+    }
+  }
+
+  /**
    * Increment the count
    */
   public void incFailedCountainerCount() {
@@ -586,6 +659,9 @@ public class AppState {
     nodeFailureThreshold = globalResOpts.getOptionInt(
         NODE_FAILURE_THRESHOLD,
         DEFAULT_NODE_FAILURE_THRESHOLD);
+    healthThresholdPercent = globalResOpts.getOptionInt(
+        CONTAINER_HEALTH_THRESHOLD_PERCENT,
+        DEFAULT_CONTAINER_HEALTH_THRESHOLD_PERCENT);
     initClusterStatus();
 
 
@@ -1091,7 +1167,14 @@ public class AppState {
    * @return the instance removed
    */
   private RoleInstance removeOwnedContainer(ContainerId id) {
-    return ownedContainers.remove(id);
+    RoleInstance ri = ownedContainers.remove(id);
+    if (ri == null) {
+      log.warn("RoleInstance is null for container {}", id);
+    } else {
+      log.debug("RoleInstance = {}", ri);
+      removeLiveContainer(ri.group, id);
+    }
+    return ri;
   }
 
   /**
@@ -1102,6 +1185,7 @@ public class AppState {
    */
   private RoleInstance putOwnedContainer(ContainerId id,
       RoleInstance instance) {
+    addLiveContainer(instance.group, id);
     return ownedContainers.put(id, instance);
   }
 
@@ -2076,6 +2160,18 @@ public class AppState {
   }
 
   /**
+   * Get the health threshold percent for a specific role, falling back to
+   * the global one if not.
+   * @param roleGroup role group
+   * @return the threshold percent for health
+   */
+  public int getHealthThresholdPercentForRole(String roleGroup) {
+    ConfTreeOperations resources = instanceDefinition.getResourceOperations();
+    return resources.getComponentOptInt(roleGroup,
+        CONTAINER_HEALTH_THRESHOLD_PERCENT, healthThresholdPercent);
+  }
+
+  /**
    * Get the node failure threshold for a specific role, falling back to
    * the global one if not
    * @param roleGroup role group
@@ -2154,7 +2250,10 @@ public class AppState {
 
     log.info("Reviewing {} : ", role);
     log.debug("Expected {}, Delta: {}", expected, delta);
-    checkFailureThreshold(role);
+    // If health threshold monitor is disabled then check for failure threshold
+    if (!role.isHealthThresholdMonitorEnabled()) {
+      checkFailureThreshold(role);
+    }
 
     if (expected < 0 ) {
       // negative value: fail

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
index 694f5cf..f46ed74 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
@@ -82,6 +82,8 @@ public final class RoleStatus implements Cloneable, MetricSet {
   private String failureMessage = "";
   private final Set<ContainerId> failedContainers = new HashSet<>();
 
+  private boolean healthThresholdMonitorEnabled = false;
+
   public RoleStatus(ProviderRole providerRole) {
     this.providerRole = providerRole;
     this.name = providerRole.name;
@@ -458,6 +460,8 @@ public final class RoleStatus implements Cloneable, MetricSet {
     sb.append(", failureMessage='").append(failureMessage).append('\'');
     sb.append(", providerRole=").append(providerRole);
     sb.append(", failedContainers=").append(failedContainers);
+    sb.append(", healthThresholdMonitorEnabled=")
+        .append(healthThresholdMonitorEnabled);
     sb.append('}');
     return sb.toString();
   }
@@ -577,4 +581,12 @@ public final class RoleStatus implements Cloneable, MetricSet {
     return stats;
   }
 
+  public boolean isHealthThresholdMonitorEnabled() {
+    return healthThresholdMonitorEnabled;
+  }
+
+  public void setHealthThresholdMonitorEnabled(
+      boolean healthThresholdMonitorEnabled) {
+    this.healthThresholdMonitorEnabled = healthThresholdMonitorEnabled;
+  }
 }

Reply via email to