YARN-3136. Fixed a synchronization problem of AbstractYarnScheduler#getTransferredContainers. Contributed by Sunil G
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/1bddfcd6 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/1bddfcd6 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/1bddfcd6 Branch: refs/heads/YARN-2928 Commit: 1bddfcd6019feac788624f2c97d452ddb43d693a Parents: b8dfc8a Author: Jian He <[email protected]> Authored: Sat Apr 18 12:45:38 2015 -0700 Committer: Zhijie Shen <[email protected]> Committed: Tue Apr 21 16:16:53 2015 -0700 ---------------------------------------------------------------------- hadoop-yarn-project/CHANGES.txt | 3 ++ .../dev-support/findbugs-exclude.xml | 8 ++++ .../ApplicationMasterService.java | 47 +++++++++++--------- .../scheduler/AbstractYarnScheduler.java | 17 ++++++- 4 files changed, 51 insertions(+), 24 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/1bddfcd6/hadoop-yarn-project/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index e7622f8..0f9eef6 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -286,6 +286,9 @@ Release 2.8.0 - UNRELEASED YARN-3493. RM fails to come up with error "Failed to load/recover state" when mem settings are changed. (Jian He via wangda) + YARN-3136. Fixed a synchronization problem of + AbstractYarnScheduler#getTransferredContainers. (Sunil G via jianhe) + Release 2.7.1 - UNRELEASED INCOMPATIBLE CHANGES http://git-wip-us.apache.org/repos/asf/hadoop/blob/1bddfcd6/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml b/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml index 8aae152..ece8548 100644 --- a/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml +++ b/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml @@ -469,6 +469,14 @@ <Method name="recoverContainersOnNode" /> <Bug pattern="RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE" /> </Match> + <Match> + <Class name="org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler" /> + <Or> + <Field name="rmContext" /> + <Field name="applications" /> + </Or> + <Bug pattern="IS2_INCONSISTENT_SYNC" /> + </Match> <!-- Following fields are used in ErrorsAndWarningsBlock, which is not a part of analysis of findbugs --> <Match> http://git-wip-us.apache.org/repos/asf/hadoop/blob/1bddfcd6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java index a31127d..ba7b1ad 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java @@ -298,32 +298,35 @@ public class ApplicationMasterService extends AbstractService implements // For work-preserving AM restart, retrieve previous attempts' containers // and corresponding NM tokens. - List<Container> transferredContainers = - ((AbstractYarnScheduler) rScheduler) + if (app.getApplicationSubmissionContext() + .getKeepContainersAcrossApplicationAttempts()) { + List<Container> transferredContainers = ((AbstractYarnScheduler) rScheduler) .getTransferredContainers(applicationAttemptId); - if (!transferredContainers.isEmpty()) { - response.setContainersFromPreviousAttempts(transferredContainers); - List<NMToken> nmTokens = new ArrayList<NMToken>(); - for (Container container : transferredContainers) { - try { - NMToken token = rmContext.getNMTokenSecretManager() - .createAndGetNMToken(app.getUser(), applicationAttemptId, - container); - if (null != token) { - nmTokens.add(token); - } - } catch (IllegalArgumentException e) { - // if it's a DNS issue, throw UnknowHostException directly and that - // will be automatically retried by RMProxy in RPC layer. - if (e.getCause() instanceof UnknownHostException) { - throw (UnknownHostException) e.getCause(); + if (!transferredContainers.isEmpty()) { + response.setContainersFromPreviousAttempts(transferredContainers); + List<NMToken> nmTokens = new ArrayList<NMToken>(); + for (Container container : transferredContainers) { + try { + NMToken token = rmContext.getNMTokenSecretManager() + .createAndGetNMToken(app.getUser(), applicationAttemptId, + container); + if (null != token) { + nmTokens.add(token); + } + } catch (IllegalArgumentException e) { + // if it's a DNS issue, throw UnknowHostException directly and + // that + // will be automatically retried by RMProxy in RPC layer. + if (e.getCause() instanceof UnknownHostException) { + throw (UnknownHostException) e.getCause(); + } } } + response.setNMTokensFromPreviousAttempts(nmTokens); + LOG.info("Application " + appID + " retrieved " + + transferredContainers.size() + " containers from previous" + + " attempts and " + nmTokens.size() + " NM tokens."); } - response.setNMTokensFromPreviousAttempts(nmTokens); - LOG.info("Application " + appID + " retrieved " - + transferredContainers.size() + " containers from previous" - + " attempts and " + nmTokens.size() + " NM tokens."); } response.setSchedulerResourceTypes(rScheduler http://git-wip-us.apache.org/repos/asf/hadoop/blob/1bddfcd6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java index 1a8c653..ae927f1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java @@ -21,12 +21,15 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler; import java.io.IOException; import java.util.*; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock; import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.classification.InterfaceAudience.Private; +import org.apache.hadoop.classification.InterfaceStability.Unstable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.service.AbstractService; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; @@ -67,6 +70,8 @@ import com.google.common.util.concurrent.SettableFuture; @SuppressWarnings("unchecked") +@Private +@Unstable public abstract class AbstractYarnScheduler <T extends SchedulerApplicationAttempt, N extends SchedulerNode> extends AbstractService implements ResourceScheduler { @@ -91,7 +96,12 @@ public abstract class AbstractYarnScheduler private long configuredMaximumAllocationWaitTime; protected RMContext rmContext; - protected Map<ApplicationId, SchedulerApplication<T>> applications; + + /* + * All schedulers which are inheriting AbstractYarnScheduler should use + * concurrent version of 'applications' map. + */ + protected ConcurrentMap<ApplicationId, SchedulerApplication<T>> applications; protected int nmExpireInterval; protected final static List<Container> EMPTY_CONTAINER_LIST = @@ -123,7 +133,7 @@ public abstract class AbstractYarnScheduler super.serviceInit(conf); } - public synchronized List<Container> getTransferredContainers( + public List<Container> getTransferredContainers( ApplicationAttemptId currentAttempt) { ApplicationId appId = currentAttempt.getApplicationId(); SchedulerApplication<T> app = applications.get(appId); @@ -132,6 +142,9 @@ public abstract class AbstractYarnScheduler if (appImpl.getApplicationSubmissionContext().getUnmanagedAM()) { return containerList; } + if (app == null) { + return containerList; + } Collection<RMContainer> liveContainers = app.getCurrentAppAttempt().getLiveContainers(); ContainerId amContainerId =
