Repository: hadoop Updated Branches: refs/heads/branch-2.6.1 81ba30211 -> f83d89894
YARN-2816. NM fail to start with NPE during container recovery. Contributed by Zhihai Xu (cherry picked from commit 49c38898b0be64fc686d039ed2fb2dea1378df02) (cherry picked from commit ad140d1fc831735fb9335e27b38d2fc040847af1) (cherry picked from commit 85b23c323c80c5303bd0b7bdb066258792ca67d8) Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/f83d8989 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/f83d8989 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/f83d8989 Branch: refs/heads/branch-2.6.1 Commit: f83d89894425b601ccb65d72bfce3dab12a9d898 Parents: 81ba302 Author: Jason Lowe <[email protected]> Authored: Fri Nov 14 21:25:59 2014 +0000 Committer: Vinod Kumar Vavilapalli <[email protected]> Committed: Thu Aug 27 18:32:59 2015 -0700 ---------------------------------------------------------------------- hadoop-yarn-project/CHANGES.txt | 3 +++ .../recovery/NMLeveldbStateStoreService.java | 24 +++++++++++++++++++- .../TestNMLeveldbStateStoreService.java | 7 ++++++ 3 files changed, 33 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/f83d8989/hadoop-yarn-project/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 6691f6e..5e8e4f9 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -15,6 +15,9 @@ Release 2.6.1 - UNRELEASED YARN-2856. Fixed RMAppImpl to handle ATTEMPT_KILLED event at ACCEPTED state on app recovery. (Rohith Sharmaks via jianhe) + YARN-2816. NM fail to start with NPE during container recovery (Zhihai Xu + via jlowe) + Release 2.6.0 - 2014-11-18 INCOMPATIBLE CHANGES http://git-wip-us.apache.org/repos/asf/hadoop/blob/f83d8989/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java index 7cf4921..9d54688 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java @@ -146,6 +146,8 @@ public class NMLeveldbStateStoreService extends NMStateStoreService { throws IOException { ArrayList<RecoveredContainerState> containers = new ArrayList<RecoveredContainerState>(); + ArrayList<ContainerId> containersToRemove = + new ArrayList<ContainerId>(); LeveldbIterator iter = null; try { iter = new LeveldbIterator(db); @@ -165,7 +167,14 @@ public class NMLeveldbStateStoreService extends NMStateStoreService { ContainerId containerId = ConverterUtils.toContainerId( key.substring(CONTAINERS_KEY_PREFIX.length(), idEndPos)); String keyPrefix = key.substring(0, idEndPos+1); - containers.add(loadContainerState(containerId, iter, keyPrefix)); + RecoveredContainerState rcs = loadContainerState(containerId, + iter, keyPrefix); + // Don't load container without StartContainerRequest + if (rcs.startRequest != null) { + containers.add(rcs); + } else { + containersToRemove.add(containerId); + } } } catch (DBException e) { throw new IOException(e); @@ -175,6 +184,19 @@ public class NMLeveldbStateStoreService extends NMStateStoreService { } } + // remove container without StartContainerRequest + for (ContainerId containerId : containersToRemove) { + LOG.warn("Remove container " + containerId + + " with incomplete records"); + try { + removeContainer(containerId); + // TODO: kill and cleanup the leaked container + } catch (IOException e) { + LOG.error("Unable to remove container " + containerId + + " in store", e); + } + } + return containers; } http://git-wip-us.apache.org/repos/asf/hadoop/blob/f83d8989/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java index 438cec3..f7f43cc 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java @@ -274,6 +274,13 @@ public class TestNMLeveldbStateStoreService { assertEquals(containerReq, rcs.getStartRequest()); assertTrue(rcs.getDiagnostics().isEmpty()); + // store a new container record without StartContainerRequest + ContainerId containerId1 = ContainerId.newContainerId(appAttemptId, 6); + stateStore.storeContainerLaunched(containerId1); + recoveredContainers = stateStore.loadContainersState(); + // check whether the new container record is discarded + assertEquals(1, recoveredContainers.size()); + // launch the container, add some diagnostics, and verify recovered StringBuilder diags = new StringBuilder(); stateStore.storeContainerLaunched(containerId);
