YARN-2456. Possible livelock in CapacityScheduler when RM is recovering
apps. Contributed by Jian He


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/e65ae575
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/e65ae575
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/e65ae575

Branch: refs/heads/HDFS-6584
Commit: e65ae575a059a426c4c38fdabe22a31eabbb349e
Parents: 40364dc
Author: XuanGong <xg...@apache.org>
Authored: Fri Sep 12 15:21:46 2014 -0700
Committer: XuanGong <xg...@apache.org>
Committed: Fri Sep 12 15:21:46 2014 -0700

----------------------------------------------------------------------
 hadoop-yarn-project/CHANGES.txt                 |  3 ++
 .../resourcemanager/recovery/RMStateStore.java  |  3 +-
 .../server/resourcemanager/TestRMRestart.java   | 43 ++++++++++++++++++++
 3 files changed, 48 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/e65ae575/hadoop-yarn-project/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index 9002e6a..efc3e09 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -345,6 +345,9 @@ Release 2.6.0 - UNRELEASED
     YARN-2484. FileSystemRMStateStore#readFile/writeFile should close
     FSData(In|Out)putStream in final block (Tsuyoshi OZAWA via jlowe)
 
+    YARN-2456. Possible livelock in CapacityScheduler when RM is recovering 
apps.
+    (Jian He via xgong)
+
 Release 2.5.1 - 2014-09-05
 
   INCOMPATIBLE CHANGES

http://git-wip-us.apache.org/repos/asf/hadoop/blob/e65ae575/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
----------------------------------------------------------------------
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
index ac51a17..df4f3a9 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
@@ -22,6 +22,7 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
+import java.util.TreeMap;
 
 import javax.crypto.SecretKey;
 
@@ -421,7 +422,7 @@ public abstract class RMStateStore extends AbstractService {
    */
   public static class RMState {
     Map<ApplicationId, ApplicationState> appState =
-        new HashMap<ApplicationId, ApplicationState>();
+        new TreeMap<ApplicationId, ApplicationState>();
 
     RMDTSecretManagerState rmSecretManagerState = new RMDTSecretManagerState();
 

http://git-wip-us.apache.org/repos/asf/hadoop/blob/e65ae575/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java
----------------------------------------------------------------------
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java
index 7d511db..caa5647 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java
@@ -19,9 +19,11 @@
 package org.apache.hadoop.yarn.server.resourcemanager;
 
 import static org.mockito.Matchers.isA;
+import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.spy;
 import static org.mockito.Mockito.times;
 import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
 
 import java.io.File;
 import java.io.FileOutputStream;
@@ -1656,6 +1658,47 @@ public class TestRMRestart {
     rm2.waitForState(app1.getApplicationId(), RMAppState.FAILED);
   }
 
+  @Test (timeout = 20000)
+  public void testAppRecoveredInOrderOnRMRestart() throws Exception {
+    MemoryRMStateStore memStore = new MemoryRMStateStore();
+    memStore.init(conf);
+
+    for (int i = 10; i > 0; i--) {
+      ApplicationState appState = mock(ApplicationState.class);
+      when(appState.getAppId()).thenReturn(ApplicationId.newInstance(1234, i));
+      memStore.getState().getApplicationState()
+        .put(appState.getAppId(), appState);
+    }
+
+    MockRM rm1 = new MockRM(conf, memStore) {
+      @Override
+      protected RMAppManager createRMAppManager() {
+        return new TestRMAppManager(this.rmContext, this.scheduler,
+          this.masterService, this.applicationACLsManager, conf);
+      }
+
+      class TestRMAppManager extends RMAppManager {
+        ApplicationId prevId = ApplicationId.newInstance(1234, 0);
+
+        public TestRMAppManager(RMContext context, YarnScheduler scheduler,
+            ApplicationMasterService masterService,
+            ApplicationACLsManager applicationACLsManager, Configuration conf) 
{
+          super(context, scheduler, masterService, applicationACLsManager, 
conf);
+        }
+
+        @Override
+        protected void recoverApplication(ApplicationState appState,
+            RMState rmState) throws Exception {
+          // check application is recovered in order.
+          Assert.assertTrue(rmState.getApplicationState().size() > 0);
+          Assert.assertTrue(appState.getAppId().compareTo(prevId) > 0);
+          prevId = appState.getAppId();
+        }
+      }
+    };
+    rm1.start();
+  }
+
   @SuppressWarnings("resource")
   @Test (timeout = 60000)
   public void testQueueMetricsOnRMRestart() throws Exception {

Reply via email to