Repository: incubator-slider Updated Branches: refs/heads/develop 5696c7de3 -> 0f436c865
SLIDER-1246 Application health should not be affected by faulty nodes (health monitor based on percent threshold) Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/0f436c86 Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/0f436c86 Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/0f436c86 Branch: refs/heads/develop Commit: 0f436c865a90aba5b427d1c0571183c6fcbded1e Parents: 5696c7d Author: Gour Saha <gourks...@apache.org> Authored: Sun Oct 1 22:15:07 2017 -0700 Committer: Gour Saha <gourks...@apache.org> Committed: Sun Oct 1 22:15:07 2017 -0700 ---------------------------------------------------------------------- .../org/apache/slider/api/ResourceKeys.java | 46 ++++++ .../slider/core/conf/ConfTreeOperations.java | 15 ++ .../apache/slider/providers/ProviderRole.java | 4 +- .../server/appmaster/SliderAppMaster.java | 77 +++++++++- .../actions/MonitorHealthThreshold.java | 146 +++++++++++++++++++ .../slider/server/appmaster/state/AppState.java | 103 ++++++++++++- .../server/appmaster/state/RoleStatus.java | 12 ++ 7 files changed, 398 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java b/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java index 295f7cd..29ef8ea 100644 --- a/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java +++ b/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java @@ -159,6 +159,52 @@ public interface ResourceKeys { int DEFAULT_CONTAINER_FAILURE_THRESHOLD = 5; /** + * The container health threshold when explicitly set for a specific role or + * globally for 
all roles, will schedule a health check monitor to + * periodically check for the percentage of healthy containers. It runs the + * check at a specified/default poll frequency. It allows a role to be below + * the health threshold for a specified/default window after which it + * considers the application to be unhealthy and triggers an app stop. + */ + String CONTAINER_HEALTH_THRESHOLD_PERCENT = + "yarn.container.health.threshold.percent"; + /** + * Health check monitor poll frequency. It is an advanced setting and does not + * need to be set unless the app owner understands the implication and does + * not want the default. + */ + String CONTAINER_HEALTH_THRESHOLD_POLL_FREQUENCY_SEC = + "yarn.container.health.threshold.poll.frequency.secs"; + /** + * The amount of time the health check monitor allows a specific role to be + * below the health threshold after which it considers the app to be + * unhealthy. + */ + String CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC = + "yarn.container.health.threshold.window.secs"; + /** + * The amount of initial time the health check monitor waits before the first + * check kicks in. It gives a lead time for the app containers to come up for + * the first time. + */ + String CONTAINER_HEALTH_THRESHOLD_INIT_DELAY_SEC = + "yarn.container.health.threshold.init.delay.secs"; + /** + * By default the health threshold percent does not come into play until it is + * explicitly set in resource config for a specific role or globally for all + * roles. -1 signifies disabled. 
+ */ + int CONTAINER_HEALTH_THRESHOLD_PERCENT_DISABLED = -1; + + int DEFAULT_CONTAINER_HEALTH_THRESHOLD_PERCENT = + CONTAINER_HEALTH_THRESHOLD_PERCENT_DISABLED; + long DEFAULT_CONTAINER_HEALTH_THRESHOLD_POLL_FREQUENCY_SEC = 10; + long DEFAULT_CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC = 600; + // the default for init delay is same as default health window + long DEFAULT_CONTAINER_HEALTH_THRESHOLD_INIT_DELAY_SEC = + DEFAULT_CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC; + + /** * Default node failure threshold for a component instance: {@value} * Should to be lower than default component failure threshold to allow * the component to start elsewhere http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/core/conf/ConfTreeOperations.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/core/conf/ConfTreeOperations.java b/slider-core/src/main/java/org/apache/slider/core/conf/ConfTreeOperations.java index 526e17d..c8a7720 100644 --- a/slider-core/src/main/java/org/apache/slider/core/conf/ConfTreeOperations.java +++ b/slider-core/src/main/java/org/apache/slider/core/conf/ConfTreeOperations.java @@ -459,6 +459,21 @@ public class ConfTreeOperations { } /** + * Get a component opt; use {@link Long#decode(String)} so as to take hex + * and octal values too. + * + * @param name component name + * @param option option name + * @param defVal default value + * @return parsed value + * @throws NumberFormatException if the value could not be parsed. + */ + public long getComponentOptLong(String name, String option, long defVal) { + String val = getComponentOpt(name, option, Long.toString(defVal)); + return Long.decode(val); + } + + /** * Get a component opt as a boolean using {@link Boolean#valueOf(String)}. 
* * @param name component name http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java b/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java index 4f6be52..4105b67 100644 --- a/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java +++ b/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java @@ -76,8 +76,8 @@ public final class ProviderRole { * @param group role/component group * @param id ID. This becomes the YARN priority * @param policy placement policy - * @param nodeFailureThreshold threshold for node failures (within a reset interval) - * after which a node failure is considered an app failure + * @param nodeFailureThreshold threshold for node failures (within a reset + * interval) after which a node failure is considered an app failure * @param placementTimeoutSeconds for lax placement, timeout in seconds before * @param labelExpression label expression for requests; may be null */ http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java index 3f47b98..c12fae8 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java @@ -138,6 +138,7 @@ import org.apache.slider.server.appmaster.actions.QueueService; import org.apache.slider.server.appmaster.actions.ActionStopSlider; import 
org.apache.slider.server.appmaster.actions.ActionUpgradeContainers; import org.apache.slider.server.appmaster.actions.AsyncAction; +import org.apache.slider.server.appmaster.actions.MonitorHealthThreshold; import org.apache.slider.server.appmaster.actions.RenewingAction; import org.apache.slider.server.appmaster.actions.ResetFailureWindow; import org.apache.slider.server.appmaster.actions.ReviewAndFlexApplicationSize; @@ -991,6 +992,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService scheduleFailureWindowResets(instanceDefinition.getResources()); scheduleEscalation(instanceDefinition.getInternal()); + scheduleHealthThresholdMonitor(instanceDefinition.getResources()); try { // schedule YARN Registry registration @@ -1902,6 +1904,79 @@ public class SliderAppMaster extends AbstractSliderLaunchedService } /** + * Schedule the health threshold monitor for all roles (except AM) + * + * @param resources + * the resource tree + */ + private void scheduleHealthThresholdMonitor(ConfTree resources) { + ConfTreeOperations ops = new ConfTreeOperations(resources); + for (String roleGroup : ops.getComponentNames()) { + if (roleGroup.equals(SliderKeys.COMPONENT_AM)) { + continue; + } + // determine health threshold percent + int healthThresholdPercent = appState + .getHealthThresholdPercentForRole(roleGroup); + // validations + if (healthThresholdPercent == + ResourceKeys.CONTAINER_HEALTH_THRESHOLD_PERCENT_DISABLED) { + log.info("No health threshold monitor enabled for role {}", roleGroup); + continue; + } + // if threshold set to outside acceptable range then don't enable monitor + if (healthThresholdPercent <= 0 || healthThresholdPercent > 100) { + log.error( + "Invalid health threshold percent {}% for role {}. 
Monitor not " + + "enabled.", + healthThresholdPercent, roleGroup); + continue; + } + // determine the threshold properties + long window = ops.getComponentOptLong(roleGroup, + ResourceKeys.CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC, + ResourceKeys.DEFAULT_CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC); + long initDelay = ops.getComponentOptLong(roleGroup, + ResourceKeys.CONTAINER_HEALTH_THRESHOLD_INIT_DELAY_SEC, + ResourceKeys.DEFAULT_CONTAINER_HEALTH_THRESHOLD_INIT_DELAY_SEC); + long pollFrequency = ops.getComponentOptLong(roleGroup, + ResourceKeys.CONTAINER_HEALTH_THRESHOLD_POLL_FREQUENCY_SEC, + ResourceKeys.DEFAULT_CONTAINER_HEALTH_THRESHOLD_POLL_FREQUENCY_SEC); + // validations + if (window <= 0) { + log.error( + "Invalid health monitor window {} secs for role {}. Monitor not " + + "enabled.", + window, roleGroup); + continue; + } + if (initDelay < 0) { + log.error("Invalid health monitor init delay {} secs for role {}. " + + "Monitor not enabled.", initDelay, roleGroup); + continue; + } + if (pollFrequency <= 0) { + log.error("Invalid health monitor poll frequency {} secs for role {}. " + + "Monitor not enabled.", pollFrequency, roleGroup); + continue; + } + log.info( + "Scheduling the health threshold monitor for role {} with percent = " + + "{}%, window = {} secs, poll freq = {} secs, init-delay = {} " + + "secs", + roleGroup, healthThresholdPercent, window, pollFrequency, initDelay); + MonitorHealthThreshold monitor = new MonitorHealthThreshold(roleGroup, + healthThresholdPercent, window); + RenewingAction<MonitorHealthThreshold> renew = new RenewingAction<>( + monitor, initDelay, pollFrequency, TimeUnit.SECONDS, 0); + actionQueues.renewing("healthThresholdMonitor", renew); + // Mark that health threshold monitor is enabled for this role. Can be + // used to disable the failure threshold check. 
+ appState.setHealthThresholdMonitorEnabled(roleGroup, true); + } + } + + /** * Schedule the escalation action * @param internal * @throws BadConfigException @@ -2078,7 +2153,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService public void onError(Throwable e) { if (e instanceof InvalidResourceRequestException) { // stop the cluster - LOG_YARN.error("AMRMClientAsync.onError() received {}", e, e); + LOG_YARN.error("AMRMClientAsync.onError() received {}", e); ActionStopSlider stopSlider = new ActionStopSlider("stop", EXIT_EXCEPTION_THROWN, FinalApplicationStatus.FAILED, SliderUtils.extractFirstLine(e.getLocalizedMessage())); http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/MonitorHealthThreshold.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/MonitorHealthThreshold.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/MonitorHealthThreshold.java new file mode 100644 index 0000000..cc6f13c --- /dev/null +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/MonitorHealthThreshold.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.slider.server.appmaster.actions; + +import java.util.concurrent.TimeUnit; + +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; +import org.apache.slider.api.SliderExitReason; +import org.apache.slider.core.main.LauncherExitCodes; +import org.apache.slider.server.appmaster.SliderAppMaster; +import org.apache.slider.server.appmaster.state.AppState; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Monitors at a regular interval if the container health for a specific role + * has dropped below a desired threshold. + */ +public class MonitorHealthThreshold extends AsyncAction { + protected static final Logger log = LoggerFactory + .getLogger(MonitorHealthThreshold.class); + + private final String roleGroup; + private final int healthThresholdPercent; + private final long healthThresholdWindowSecs; + private final long healthThresholdWindowNanos; + private long firstOccurrenceTimestamp = 0; + // Sufficient logging happens when role health is below threshold. However, + // there has to be some logging when it is above threshold, otherwise app + // owners have no idea how the health is fluctuating. So let's log whenever + // there is a change in role health, thereby preventing excessive logging on + // every poll. 
+ private float prevRunningContainerFraction = 0; + + public MonitorHealthThreshold(String roleGroup, int healthThresholdPercent, + long healthThresholdWindowSecs) { + super("MonitorHealthThreshold"); + this.roleGroup = roleGroup; + this.healthThresholdPercent = healthThresholdPercent; + this.healthThresholdWindowSecs = healthThresholdWindowSecs; + this.healthThresholdWindowNanos = TimeUnit.NANOSECONDS + .convert(healthThresholdWindowSecs, TimeUnit.SECONDS); + } + + @Override + public void execute(SliderAppMaster appMaster, QueueAccess queueService, + AppState appState) throws Exception { + log.debug("MonitorHealthThreshold execute method"); + // Perform container health checks against desired threshold + synchronized (appMaster) { + long desiredContainerCount = appState.getDesiredContainerCount(roleGroup); + // if desired container count for this role is 0 then nothing to do + if (desiredContainerCount == 0) { + return; + } + long runningContainerCount = appState.getLiveContainerCount(roleGroup); + float thresholdFraction = (float) healthThresholdPercent / 100; + // no possibility of div by 0 since desiredContainerCount won't be 0 here + float runningContainerFraction = (float) runningContainerCount + / desiredContainerCount; + boolean healthChanged = false; + if (runningContainerFraction != prevRunningContainerFraction) { + prevRunningContainerFraction = runningContainerFraction; + healthChanged = true; + } + String runningContainerPercentStr = String.format("%.2f", + runningContainerFraction * 100); + // Check if the current running container percent is less than the + // threshold percent + if (runningContainerFraction < thresholdFraction) { + // Check if it is the first occurrence and if yes set the timestamp + long currentTimestamp = now(); + if (firstOccurrenceTimestamp == 0) { + firstOccurrenceTimestamp = currentTimestamp; + log.info("Role {} is going below health threshold for the first time " + + "at ts = {}", roleGroup, firstOccurrenceTimestamp); + } + 
long elapsedTime = currentTimestamp - firstOccurrenceTimestamp; + long elapsedTimeSecs = TimeUnit.SECONDS.convert(elapsedTime, + TimeUnit.NANOSECONDS); + log.warn( + "Role = {}, Current health = {}%, is below Health threshold of {}% " + + "for {} secs (window = {} secs)", + roleGroup, runningContainerPercentStr, healthThresholdPercent, + elapsedTimeSecs, healthThresholdWindowSecs); + if (elapsedTime > healthThresholdWindowNanos) { + log.error( + "Role = {}, Current health = {}%, has been below health " + + "threshold of {}% for {} secs (threshold window = {} secs)", + roleGroup, runningContainerPercentStr, healthThresholdPercent, + elapsedTimeSecs, healthThresholdWindowSecs); + // Trigger an app stop + ActionStopSlider stopSlider = new ActionStopSlider("stop", + LauncherExitCodes.EXIT_EXCEPTION_THROWN, + FinalApplicationStatus.FAILED, + String.format( + "Application was killed because container health for role %s " + + "was %s%% (threshold = %d%%) for %d secs (threshold " + + "window = %d secs)", + roleGroup, runningContainerPercentStr, healthThresholdPercent, + elapsedTimeSecs, healthThresholdWindowSecs)); + stopSlider.setExitReason(SliderExitReason.APP_ERROR); + appMaster.queue(stopSlider); + } + } else { + String logMsg = "Role = {}, Health threshold = {}%, Current health = " + + "{}% (Current Running count = {}, Desired count = {})"; + if (healthChanged) { + log.info(logMsg, roleGroup, healthThresholdPercent, + runningContainerPercentStr, runningContainerCount, + desiredContainerCount); + } else { + log.debug(logMsg, roleGroup, healthThresholdPercent, + runningContainerPercentStr, runningContainerCount, + desiredContainerCount); + } + // The container health might have recovered above threshold after being + // below for less than the threshold window amount of time. So we need + // to reset firstOccurrenceTimestamp to 0. 
+ if (firstOccurrenceTimestamp != 0) { + log.info( + "Role = {}, resetting first occurence to 0, since it recovered " + + "above health threshold of {}%", + roleGroup, healthThresholdPercent); + firstOccurrenceTimestamp = 0; + } + } + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java index 9e56870..d33e92e 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java @@ -72,6 +72,7 @@ import org.apache.slider.core.persist.AggregateConfSerDeser; import org.apache.slider.core.persist.ConfTreeSerDeser; import org.apache.slider.providers.PlacementPolicy; import org.apache.slider.providers.ProviderRole; +import org.apache.slider.server.appmaster.actions.MonitorHealthThreshold; import org.apache.slider.server.appmaster.management.LongGauge; import org.apache.slider.server.appmaster.management.MetricsAndMonitoring; import org.apache.slider.server.appmaster.management.MetricsConstants; @@ -274,6 +275,12 @@ public class AppState { private final AtomicInteger completionOfUnknownContainerEvent = new AtomicInteger(); + /** + * This simply keeps track of the current set of live container ids for all + * roles and is primarily used by the {@link MonitorHealthThreshold} class. 
+ */ + private final Map<String, Set<ContainerId>> currentLiveContainers = + new ConcurrentHashMap<>(); /** * limits of container core numbers in this queue @@ -293,6 +300,9 @@ public class AppState { private int failureThreshold = 10; private int nodeFailureThreshold = 3; + // health threshold is disabled by default + private int healthThresholdPercent = + CONTAINER_HEALTH_THRESHOLD_PERCENT_DISABLED; private static String logServerURL = ""; @@ -337,6 +347,69 @@ public class AppState { } /** + * Get the number of containers running for a specific role at the time of this + * API call. It includes owned containers, meaning containers which have been + * allocated even if the app is not completely deployed and/or started in the + * container. + */ + public int getLiveContainerCount(String roleGroup) { + if (roleGroup == null) { + return 0; + } + Set<ContainerId> containers = currentLiveContainers.get(roleGroup); + log.debug("Current live containers = {} for role {}", containers, roleGroup); + return containers == null ? 0 : containers.size(); + } + + public long getDesiredContainerCount(String roleGroup) + throws BadConfigException { + return getDesiredInstanceCount(getResourcesSnapshot(), roleGroup); + } + + public void setHealthThresholdMonitorEnabled(String roleGroup, + boolean enabled) { + for (RoleStatus rs : getRoleStatusMap().values()) { + if (rs.getGroup().equals(roleGroup)) { + rs.setHealthThresholdMonitorEnabled(enabled); + } + } + } + + /** + * Add a new (or existing, in which case it has no effect) container to the + * live container set. 
+ */ + public void addLiveContainer(String roleGroup, ContainerId cId) { + log.info("Adding live container {} to role {}", cId, roleGroup); + if (roleGroup == null) { + return; + } + if (!currentLiveContainers.containsKey(roleGroup)) { + // new role entry + currentLiveContainers.put(roleGroup, Collections + .newSetFromMap(new ConcurrentHashMap<ContainerId, Boolean>())); + } + currentLiveContainers.get(roleGroup).add(cId); + } + + /** + * Remove an existing container from the live container set. + */ + public void removeLiveContainer(String roleGroup, ContainerId cId) { + if (roleGroup == null) { + return; + } + if (currentLiveContainers.containsKey(roleGroup)) { + log.info("Removing live container {} from role {}", cId, roleGroup); + currentLiveContainers.get(roleGroup).remove(cId); + } else { + log.warn( + "Nothing to remove as role {} does not exist in currentLiveContainers", + roleGroup); + } + } + + /** * Increment the count */ public void incFailedCountainerCount() { @@ -586,6 +659,9 @@ public class AppState { nodeFailureThreshold = globalResOpts.getOptionInt( NODE_FAILURE_THRESHOLD, DEFAULT_NODE_FAILURE_THRESHOLD); + healthThresholdPercent = globalResOpts.getOptionInt( + CONTAINER_HEALTH_THRESHOLD_PERCENT, + DEFAULT_CONTAINER_HEALTH_THRESHOLD_PERCENT); initClusterStatus(); @@ -1091,7 +1167,14 @@ public class AppState { * @return the instance removed */ private RoleInstance removeOwnedContainer(ContainerId id) { - return ownedContainers.remove(id); + RoleInstance ri = ownedContainers.remove(id); + if (ri == null) { + log.warn("RoleInstance is null for container {}", id); + } else { + log.debug("RoleInstance = {}", ri); + removeLiveContainer(ri.group, id); + } + return ri; } /** @@ -1102,6 +1185,7 @@ public class AppState { */ private RoleInstance putOwnedContainer(ContainerId id, RoleInstance instance) { + addLiveContainer(instance.group, id); return ownedContainers.put(id, instance); } @@ -2076,6 +2160,18 @@ public class AppState { } /** + * Get the health 
threshold percent for a specific role, falling back to + * the global one if not. + * @param roleGroup role group + * @return the threshold percent for health + */ + public int getHealthThresholdPercentForRole(String roleGroup) { + ConfTreeOperations resources = instanceDefinition.getResourceOperations(); + return resources.getComponentOptInt(roleGroup, + CONTAINER_HEALTH_THRESHOLD_PERCENT, healthThresholdPercent); + } + + /** * Get the node failure threshold for a specific role, falling back to * the global one if not * @param roleGroup role group @@ -2154,7 +2250,10 @@ public class AppState { log.info("Reviewing {} : ", role); log.debug("Expected {}, Delta: {}", expected, delta); - checkFailureThreshold(role); + // If health threshold monitor is disabled then check for failure threshold + if (!role.isHealthThresholdMonitorEnabled()) { + checkFailureThreshold(role); + } if (expected < 0 ) { // negative value: fail http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java index 694f5cf..f46ed74 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java @@ -82,6 +82,8 @@ public final class RoleStatus implements Cloneable, MetricSet { private String failureMessage = ""; private final Set<ContainerId> failedContainers = new HashSet<>(); + private boolean healthThresholdMonitorEnabled = false; + public RoleStatus(ProviderRole providerRole) { this.providerRole = providerRole; this.name = providerRole.name; @@ -458,6 +460,8 @@ public final class RoleStatus implements Cloneable, MetricSet { sb.append(", 
failureMessage='").append(failureMessage).append('\''); sb.append(", providerRole=").append(providerRole); sb.append(", failedContainers=").append(failedContainers); + sb.append(", healthThresholdMonitorEnabled=") + .append(healthThresholdMonitorEnabled); sb.append('}'); return sb.toString(); } @@ -577,4 +581,12 @@ public final class RoleStatus implements Cloneable, MetricSet { return stats; } + public boolean isHealthThresholdMonitorEnabled() { + return healthThresholdMonitorEnabled; + } + + public void setHealthThresholdMonitorEnabled( + boolean healthThresholdMonitorEnabled) { + this.healthThresholdMonitorEnabled = healthThresholdMonitorEnabled; + } }