HIVE-16927: LLAP: Slider takes down all daemons when some daemons fail repeatedly (Prasanth Jayachandran reviewed by Siddharth Seth)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/67610b12 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/67610b12 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/67610b12 Branch: refs/heads/hive-14535 Commit: 67610b123e33c816cdf401a7c63032ee46240ea5 Parents: f3bbc3c Author: Prasanth Jayachandran <prasan...@apache.org> Authored: Fri Sep 29 11:38:44 2017 -0700 Committer: Prasanth Jayachandran <prasan...@apache.org> Committed: Fri Sep 29 11:38:44 2017 -0700 ---------------------------------------------------------------------- .../hadoop/hive/llap/cli/LlapOptionsProcessor.java | 16 ++++++++++++++++ llap-server/src/main/resources/package.py | 8 +++++++- llap-server/src/main/resources/templates.py | 5 ++++- 3 files changed, 27 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/67610b12/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapOptionsProcessor.java ---------------------------------------------------------------------- diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapOptionsProcessor.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapOptionsProcessor.java index aa09083..d01598c 100644 --- a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapOptionsProcessor.java +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapOptionsProcessor.java @@ -69,6 +69,9 @@ public class LlapOptionsProcessor { public static final String OPTION_SLIDER_DEFAULT_KEYTAB = "slider-default-keytab"; public static final String OPTION_OUTPUT_DIR = "output"; public static final String OPTION_START = "startImmediately"; + public static final String OPTION_HEALTH_PERCENT = "health-percent"; + public static final String OPTION_HEALTH_TIME_WINDOW_SECS = "health-time-window-secs"; + public static final String OPTION_HEALTH_INIT_DELAY_SECS = "health-init-delay-secs"; public static class LlapOptions { private final int instances; @@ -233,6 +236,19 @@ public class LlapOptionsProcessor { .withDescription("Slider placement policy; see slider documentation at https://slider.incubator.apache.org/docs/placement.html." + " 4 means anti-affinity (the default; unnecessary if LLAP is going to take more than half of the YARN capacity of a node), 0 is normal.").create()); + options.addOption(OptionBuilder.hasArg().withArgName(OPTION_HEALTH_PERCENT).withLongOpt(OPTION_HEALTH_PERCENT) + .withDescription("Percentage of running containers after which LLAP application is considered healthy" + + " (Default: 80)").create()); + + options.addOption(OptionBuilder.hasArg().withArgName(OPTION_HEALTH_INIT_DELAY_SECS) + .withLongOpt(OPTION_HEALTH_INIT_DELAY_SECS) + .withDescription("Delay in seconds after which health percentage is monitored (Default: 400)").create()); + + options.addOption(OptionBuilder.hasArg().withArgName(OPTION_HEALTH_TIME_WINDOW_SECS) + .withLongOpt(OPTION_HEALTH_TIME_WINDOW_SECS) + .withDescription("Time window in seconds (after initial delay) for which LLAP application is allowed to be in " + + "unhealthy state before being killed (Default: 300)").create()); + options.addOption(OptionBuilder.hasArg().withArgName(OPTION_EXECUTORS).withLongOpt(OPTION_EXECUTORS) .withDescription("executor per instance").create('e')); http://git-wip-us.apache.org/repos/asf/hive/blob/67610b12/llap-server/src/main/resources/package.py ---------------------------------------------------------------------- diff --git a/llap-server/src/main/resources/package.py b/llap-server/src/main/resources/package.py index e83d3b0..21c34e9 100644 --- a/llap-server/src/main/resources/package.py +++ b/llap-server/src/main/resources/package.py @@ -93,6 +93,9 @@ def main(args): parser.add_argument("--slider-principal", default="") parser.add_argument("--slider-default-keytab", dest='slider_default_keytab', action='store_true') parser.add_argument("--slider-placement", type=int, default=4) + parser.add_argument("--health-percent", type=int, default=80) + parser.add_argument("--health-time-window-secs", type=int, default=300) + parser.add_argument("--health-init-delay-secs", type=int, default=400) parser.set_defaults(slider_default_keytab=False) parser.add_argument("--startImmediately", dest='start_immediately', action='store_true') parser.add_argument("--javaChild", dest='java_child', action='store_true') @@ -162,7 +165,10 @@ def main(args): "slider_keytab_dir" : slider_keytab_dir, "slider_keytab" : slider_keytab, "slider_principal" : slider_principal, - "placement" : args.slider_placement + "placement" : args.slider_placement, + "health_percent": args.health_percent, + "health_time_window": args.health_time_window_secs, + "health_init_delay": args.health_init_delay_secs } if not exists(output): http://git-wip-us.apache.org/repos/asf/hive/blob/67610b12/llap-server/src/main/resources/templates.py ---------------------------------------------------------------------- diff --git a/llap-server/src/main/resources/templates.py b/llap-server/src/main/resources/templates.py index aedebcf..3d747a2 100644 --- a/llap-server/src/main/resources/templates.py +++ b/llap-server/src/main/resources/templates.py @@ -116,7 +116,10 @@ resources = """ "yarn.component.instances": "%(instances)d", "yarn.resource.normalization.enabled": "false", "yarn.memory": "%(container.mb)d", - "yarn.component.placement.policy" : "%(placement)d" + "yarn.component.placement.policy" : "%(placement)d", + "yarn.container.health.threshold.percent": "%(health_percent)d", + "yarn.container.health.threshold.window.secs": "%(health_time_window)d", + "yarn.container.health.threshold.init.delay.secs": "%(health_init_delay)d" } } }