Repository: hadoop
Updated Branches:
  refs/heads/trunk 1b790f4dd -> 2bba329c1


YARN-8994. Fix race condition between move app and queue cleanup in Fair 
Scheduler. (Contributed by Wilfred Spiegelenburg)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/2bba329c
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/2bba329c
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/2bba329c

Branch: refs/heads/trunk
Commit: 2bba329c141dea17a1a9f7112f7155e11264ccfd
Parents: 1b790f4
Author: Haibo Chen <haiboc...@apache.org>
Authored: Wed Dec 5 14:38:15 2018 -0800
Committer: Haibo Chen <haiboc...@apache.org>
Committed: Wed Dec 5 14:38:15 2018 -0800

----------------------------------------------------------------------
 .../scheduler/fair/FairScheduler.java           | 52 ++++++++++++--------
 1 file changed, 31 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/2bba329c/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java
----------------------------------------------------------------------
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java
index e5d2a06..282367e 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java
@@ -1735,13 +1735,15 @@ public class FairScheduler extends
       String queueName) throws YarnException {
     writeLock.lock();
     try {
+      // app could have finished between pre check and now
       SchedulerApplication<FSAppAttempt> app = applications.get(appId);
       if (app == null) {
         throw new YarnException("App to be moved " + appId + " not found.");
       }
-      FSAppAttempt attempt = (FSAppAttempt) app.getCurrentAppAttempt();
-      // To serialize with FairScheduler#allocate, synchronize on app attempt
+      FSLeafQueue targetQueue = null;
 
+      // To serialize with FairScheduler#allocate, synchronize on app attempt
+      FSAppAttempt attempt = app.getCurrentAppAttempt();
       attempt.getWriteLock().lock();
       try {
         FSLeafQueue oldQueue = (FSLeafQueue) app.getQueue();
@@ -1753,7 +1755,9 @@ public class FairScheduler extends
               + " is stopped and can't be moved!");
         }
         String destQueueName = handleMoveToPlanQueue(queueName);
-        FSLeafQueue targetQueue = queueMgr.getLeafQueue(destQueueName, false);
+        // Prevent removal of the queue while the move is in progress by
+        // registering the app as submitted to the queue.
+        targetQueue = queueMgr.getLeafQueue(destQueueName, false, appId);
         if (targetQueue == null) {
           throw new YarnException("Target queue " + queueName
               + " not found or is not a leaf queue.");
@@ -1766,9 +1770,14 @@ public class FairScheduler extends
           verifyMoveDoesNotViolateConstraints(attempt, oldQueue, targetQueue);
         }
 
+        // The move itself will clean up the app submit registration.
         executeMove(app, attempt, oldQueue, targetQueue);
         return targetQueue.getQueueName();
       } finally {
+        // Cleanup the submit registration in case of move failure.
+        if (targetQueue != null) {
+          targetQueue.removeAssignedApp(appId);
+        }
         attempt.getWriteLock().unlock();
       }
     } finally {
@@ -1776,6 +1785,17 @@ public class FairScheduler extends
     }
   }
 
+  /**
+   * Perform pre-checks while moving the application. This should not check any
+   * application values that can change since the check is not part of an
+   * atomic action. During a move the scheduler can still assign containers and
+   * the app can still be updated until the real move is performed under
+   * proper locking.
+   *
+   * @param appId The ID of the app to be moved
+   * @param newQueue The name of the queue the app should move to
+   * @throws YarnException if the validate fails
+   */
   @Override
   public void preValidateMoveApplication(ApplicationId appId, String newQueue)
       throws YarnException {
@@ -1786,24 +1806,14 @@ public class FairScheduler extends
         throw new YarnException("App to be moved " + appId + " not found.");
       }
 
-      FSAppAttempt attempt = app.getCurrentAppAttempt();
-      // To serialize with FairScheduler#allocate, synchronize on app attempt
-
-      attempt.getWriteLock().lock();
-      try {
-        FSLeafQueue oldQueue = (FSLeafQueue) app.getQueue();
-        String destQueueName = handleMoveToPlanQueue(newQueue);
-        FSLeafQueue targetQueue = queueMgr.getLeafQueue(destQueueName, false);
-        if (targetQueue == null) {
-          throw new YarnException("Target queue " + newQueue
-              + " not found or is not a leaf queue.");
-        }
-
-        if (oldQueue.isRunnableApp(attempt)) {
-          verifyMoveDoesNotViolateConstraints(attempt, oldQueue, targetQueue);
-        }
-      } finally {
-        attempt.getWriteLock().unlock();
+      // Do not register the app on the new queue: lots of things can still
+      // change between this check and the real move and unregistering the move
+      // becomes a problem.
+      String destQueueName = handleMoveToPlanQueue(newQueue);
+      FSLeafQueue targetQueue = queueMgr.getLeafQueue(destQueueName, false);
+      if (targetQueue == null) {
+        throw new YarnException("Target queue " + newQueue
+            + " not found or is not a leaf queue.");
       }
     } finally {
       writeLock.unlock();


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-commits-h...@hadoop.apache.org

Reply via email to