YARN-3809. Failed to launch new attempts because ApplicationMasterLauncher's threads all hang. Contributed by Jun Gong
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/382d9128 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/382d9128 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/382d9128 Branch: refs/heads/YARN-2928 Commit: 382d9128e54e63d08a9e12397325c84f89692d30 Parents: c31cf73 Author: Jason Lowe <[email protected]> Authored: Wed Jun 24 16:23:48 2015 +0000 Committer: Zhijie Shen <[email protected]> Committed: Mon Jun 29 10:28:23 2015 -0700 ---------------------------------------------------------------------- hadoop-yarn-project/CHANGES.txt | 3 ++ .../hadoop/yarn/conf/YarnConfiguration.java | 10 +++++++ .../src/main/resources/yarn-default.xml | 12 ++++++++ .../amlauncher/ApplicationMasterLauncher.java | 30 ++++++++++++++++++-- 4 files changed, 52 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/382d9128/hadoop-yarn-project/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index de7900d..6ebb183 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -802,6 +802,9 @@ Release 2.7.1 - UNRELEASED YARN-3842. NMProxy should retry on NMNotYetReadyException. (Robert Kanter via kasha) + YARN-3809. Failed to launch new attempts because + ApplicationMasterLauncher's threads all hang (Jun Gong via jlowe) + Release 2.7.0 - 2015-04-20 INCOMPATIBLE CHANGES http://git-wip-us.apache.org/repos/asf/hadoop/blob/382d9128/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 23e6b11..17ad2a4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -144,6 +144,16 @@ public class YarnConfiguration extends Configuration { RM_PREFIX + "client.thread-count"; public static final int DEFAULT_RM_CLIENT_THREAD_COUNT = 50; + /** Number of threads used to launch/cleanup AM.*/ + public static final String RM_AMLAUNCHER_THREAD_COUNT = + RM_PREFIX + "amlauncher.thread-count"; + public static final int DEFAULT_RM_AMLAUNCHER_THREAD_COUNT = 50; + + /** Retry times to connect with NM.*/ + public static final String RM_NODEMANAGER_CONNECT_RETIRES = + RM_PREFIX + "nodemanager-connect-retries"; + public static final int DEFAULT_RM_NODEMANAGER_CONNECT_RETIRES = 10; + /** The Kerberos principal for the resource manager.*/ public static final String RM_PRINCIPAL = RM_PREFIX + "principal"; http://git-wip-us.apache.org/repos/asf/hadoop/blob/382d9128/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index c7145b6..d34cf82 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -77,6 +77,18 @@ </property> <property> + <description>Number of threads used to launch/cleanup AM.</description> + <name>yarn.resourcemanager.amlauncher.thread-count</name> + <value>50</value> + </property> + + <property> + <description>Retry times to connect with NM.</description> + <name>yarn.resourcemanager.nodemanager-connect-retries</name> + <value>10</value> + </property> + + <property> <description>The expiry interval for application master reporting.</description> <name>yarn.am.liveness-monitor.expiry-interval-ms</name> <value>600000</value> http://git-wip-us.apache.org/repos/asf/hadoop/blob/382d9128/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/ApplicationMasterLauncher.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/ApplicationMasterLauncher.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/ApplicationMasterLauncher.java index 5fc39fd..f606e45 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/ApplicationMasterLauncher.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/ApplicationMasterLauncher.java @@ -19,12 +19,17 @@ package org.apache.hadoop.yarn.server.resourcemanager.amlauncher; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.ThreadFactory; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; +import com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.service.AbstractService; +import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt; @@ -34,7 +39,7 @@ public class ApplicationMasterLauncher extends AbstractService implements EventHandler<AMLauncherEvent> { private static final Log LOG = LogFactory.getLog( ApplicationMasterLauncher.class); - private final ThreadPoolExecutor launcherPool; + private ThreadPoolExecutor launcherPool; private LauncherThread launcherHandlingThread; private final BlockingQueue<Runnable> masterEvents @@ -45,12 +50,31 @@ public class ApplicationMasterLauncher extends AbstractService implements public ApplicationMasterLauncher(RMContext context) { super(ApplicationMasterLauncher.class.getName()); this.context = context; - this.launcherPool = new ThreadPoolExecutor(10, 10, 1, - TimeUnit.HOURS, new LinkedBlockingQueue<Runnable>()); this.launcherHandlingThread = new LauncherThread(); } @Override + protected void serviceInit(Configuration conf) throws Exception { + int threadCount = conf.getInt( + YarnConfiguration.RM_AMLAUNCHER_THREAD_COUNT, + YarnConfiguration.DEFAULT_RM_AMLAUNCHER_THREAD_COUNT); + ThreadFactory tf = new ThreadFactoryBuilder() + .setNameFormat("ApplicationMasterLauncher #%d") + .build(); + launcherPool = new ThreadPoolExecutor(threadCount, threadCount, 1, + TimeUnit.HOURS, new LinkedBlockingQueue<Runnable>()); + launcherPool.setThreadFactory(tf); + + Configuration newConf = new YarnConfiguration(conf); + newConf.setInt(CommonConfigurationKeysPublic. + IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY, + conf.getInt(YarnConfiguration.RM_NODEMANAGER_CONNECT_RETIRES, + YarnConfiguration.DEFAULT_RM_NODEMANAGER_CONNECT_RETIRES)); + setConfig(newConf); + super.serviceInit(newConf); + } + + @Override protected void serviceStart() throws Exception { launcherHandlingThread.start(); super.serviceStart();
