Repository: hadoop
Updated Branches:
refs/heads/branch-2.8 7dd879893 -> 9b5c5bd42
YARN-4924. NM recovery race can lead to container not cleaned up. Contributed
by sandflee
(cherry picked from commit 3150ae8108a1fc40a67926be6254824c1e37cb38)
Conflicts:
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/9b5c5bd4
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/9b5c5bd4
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/9b5c5bd4
Branch: refs/heads/branch-2.8
Commit: 9b5c5bd42f0cb240d0fe7754967765a99dd5be46
Parents: 7dd8798
Author: Jason Lowe <[email protected]>
Authored: Thu Apr 14 19:39:32 2016 +0000
Committer: Jason Lowe <[email protected]>
Committed: Thu Apr 14 19:40:10 2016 +0000
----------------------------------------------------------------------
.../containermanager/ContainerManagerImpl.java | 11 ---
.../recovery/NMLeveldbStateStoreService.java | 76 +++++++++++++-------
.../recovery/NMNullStateStoreService.java | 4 --
.../recovery/NMStateStoreService.java | 12 ----
.../TestContainerManagerRecovery.java | 4 ++
.../recovery/NMMemoryStateStoreService.java | 10 ---
.../TestNMLeveldbStateStoreService.java | 10 +--
7 files changed, 54 insertions(+), 73 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hadoop/blob/9b5c5bd4/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
----------------------------------------------------------------------
diff --git
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
index a156dfc..fb24e71 100644
---
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
+++
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
@@ -292,12 +292,6 @@ public class ContainerManagerImpl extends CompositeService
implements
for (RecoveredContainerState rcs : stateStore.loadContainersState()) {
recoverContainer(rcs);
}
-
- String diagnostic = "Application marked finished during recovery";
- for (ApplicationId appId : appsState.getFinishedApplications()) {
- dispatcher.getEventHandler().handle(
- new ApplicationFinishEvent(appId, diagnostic));
- }
}
}
@@ -1318,11 +1312,6 @@ public class ContainerManagerImpl extends
CompositeService implements
} else if (appsFinishedEvent.getReason() ==
CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER) {
diagnostic = "Application killed by ResourceManager";
}
- try {
- this.context.getNMStateStore().storeFinishedApplication(appID);
- } catch (IOException e) {
- LOG.error("Unable to update application state in store", e);
- }
this.dispatcher.getEventHandler().handle(
new ApplicationFinishEvent(appID,
diagnostic));
http://git-wip-us.apache.org/repos/asf/hadoop/blob/9b5c5bd4/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
----------------------------------------------------------------------
diff --git
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
index 89c71bb..6f64cc7 100644
---
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
+++
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
@@ -84,6 +84,7 @@ public class NMLeveldbStateStoreService extends
NMStateStoreService {
private static final String APPLICATIONS_KEY_PREFIX =
"ContainerManager/applications/";
+ @Deprecated
private static final String FINISHED_APPS_KEY_PREFIX =
"ContainerManager/finishedApps/";
@@ -361,20 +362,6 @@ public class NMLeveldbStateStoreService extends
NMStateStoreService {
state.applications.add(
ContainerManagerApplicationProto.parseFrom(entry.getValue()));
}
-
- state.finishedApplications = new ArrayList<ApplicationId>();
- keyPrefix = FINISHED_APPS_KEY_PREFIX;
- iter.seek(bytes(keyPrefix));
- while (iter.hasNext()) {
- Entry<byte[], byte[]> entry = iter.next();
- String key = asString(entry.getKey());
- if (!key.startsWith(keyPrefix)) {
- break;
- }
- ApplicationId appId =
- ConverterUtils.toApplicationId(key.substring(keyPrefix.length()));
- state.finishedApplications.add(appId);
- }
} catch (DBException e) {
throw new IOException(e);
} finally {
@@ -383,6 +370,8 @@ public class NMLeveldbStateStoreService extends
NMStateStoreService {
}
}
+ cleanupDeprecatedFinishedApps();
+
return state;
}
@@ -398,17 +387,6 @@ public class NMLeveldbStateStoreService extends
NMStateStoreService {
}
@Override
- public void storeFinishedApplication(ApplicationId appId)
- throws IOException {
- String key = FINISHED_APPS_KEY_PREFIX + appId;
- try {
- db.put(bytes(key), new byte[0]);
- } catch (DBException e) {
- throw new IOException(e);
- }
- }
-
- @Override
public void removeApplication(ApplicationId appId)
throws IOException {
try {
@@ -416,8 +394,6 @@ public class NMLeveldbStateStoreService extends
NMStateStoreService {
try {
String key = APPLICATIONS_KEY_PREFIX + appId;
batch.delete(bytes(key));
- key = FINISHED_APPS_KEY_PREFIX + appId;
- batch.delete(bytes(key));
db.write(batch);
} finally {
batch.close();
@@ -935,6 +911,52 @@ public class NMLeveldbStateStoreService extends
NMStateStoreService {
}
}
+ @SuppressWarnings("deprecation")
+ private void cleanupDeprecatedFinishedApps() {
+ try {
+ cleanupKeysWithPrefix(FINISHED_APPS_KEY_PREFIX);
+ } catch (Exception e) {
+ LOG.warn("cleanup keys with prefix " + FINISHED_APPS_KEY_PREFIX +
+ " from leveldb failed", e);
+ }
+ }
+
+ private void cleanupKeysWithPrefix(String prefix) throws IOException {
+ WriteBatch batch = null;
+ LeveldbIterator iter = null;
+ try {
+ iter = new LeveldbIterator(db);
+ try {
+ batch = db.createWriteBatch();
+ iter.seek(bytes(prefix));
+ while (iter.hasNext()) {
+ byte[] key = iter.next().getKey();
+ String keyStr = asString(key);
+ if (!keyStr.startsWith(prefix)) {
+ break;
+ }
+ batch.delete(key);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("cleanup " + keyStr + " from leveldb");
+ }
+ }
+ db.write(batch);
+ } catch (DBException e) {
+ throw new IOException(e);
+ } finally {
+ if (batch != null) {
+ batch.close();
+ }
+ }
+ } catch (DBException e) {
+ throw new IOException(e);
+ } finally {
+ if (iter != null) {
+ iter.close();
+ }
+ }
+ }
+
private String getLogDeleterKey(ApplicationId appId) {
return LOG_DELETER_KEY_PREFIX + appId;
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/9b5c5bd4/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java
----------------------------------------------------------------------
diff --git
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java
index d5dce9b..a887e71 100644
---
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java
+++
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java
@@ -59,10 +59,6 @@ public class NMNullStateStoreService extends
NMStateStoreService {
}
@Override
- public void storeFinishedApplication(ApplicationId appId) {
- }
-
- @Override
public void removeApplication(ApplicationId appId) throws IOException {
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/9b5c5bd4/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java
----------------------------------------------------------------------
diff --git
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java
index e8ccf54..a9e8a8a 100644
---
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java
+++
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java
@@ -52,15 +52,11 @@ public abstract class NMStateStoreService extends
AbstractService {
public static class RecoveredApplicationsState {
List<ContainerManagerApplicationProto> applications;
- List<ApplicationId> finishedApplications;
public List<ContainerManagerApplicationProto> getApplications() {
return applications;
}
- public List<ApplicationId> getFinishedApplications() {
- return finishedApplications;
- }
}
public enum RecoveredContainerStatus {
@@ -248,14 +244,6 @@ public abstract class NMStateStoreService extends
AbstractService {
ContainerManagerApplicationProto p) throws IOException;
/**
- * Record that an application has finished
- * @param appId the application ID
- * @throws IOException
- */
- public abstract void storeFinishedApplication(ApplicationId appId)
- throws IOException;
-
- /**
* Remove records corresponding to an application
* @param appId the application ID
* @throws IOException
http://git-wip-us.apache.org/repos/asf/hadoop/blob/9b5c5bd4/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
----------------------------------------------------------------------
diff --git
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
index 2e014de..9fa3fcc 100644
---
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
+++
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
@@ -259,6 +259,10 @@ public class TestContainerManagerRecovery extends
BaseContainerManagerTest {
assertEquals(1, context.getApplications().size());
app = context.getApplications().get(appId);
assertNotNull(app);
+ // no longer saving FINISH_APP event in NM stateStore,
+ // simulate by resending FINISH_APP event
+ cm.handle(new CMgrCompletedAppsEvent(finishedApps,
+ CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER));
waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
assertTrue(context.getApplicationACLsManager().checkAccess(
UserGroupInformation.createRemoteUser(modUser),
http://git-wip-us.apache.org/repos/asf/hadoop/blob/9b5c5bd4/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
----------------------------------------------------------------------
diff --git
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
index a1c95ab..1279896 100644
---
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
+++
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
@@ -44,7 +44,6 @@ import
org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl;
public class NMMemoryStateStoreService extends NMStateStoreService {
private Map<ApplicationId, ContainerManagerApplicationProto> apps;
- private Set<ApplicationId> finishedApps;
private Map<ContainerId, RecoveredContainerState> containerStates;
private Map<TrackerKey, TrackerState> trackerStates;
private Map<Integer, DeletionServiceDeleteTaskProto> deleteTasks;
@@ -59,7 +58,6 @@ public class NMMemoryStateStoreService extends
NMStateStoreService {
@Override
protected void initStorage(Configuration conf) {
apps = new HashMap<ApplicationId, ContainerManagerApplicationProto>();
- finishedApps = new HashSet<ApplicationId>();
containerStates = new HashMap<ContainerId, RecoveredContainerState>();
nmTokenState = new RecoveredNMTokensState();
nmTokenState.applicationMasterKeys =
@@ -86,7 +84,6 @@ public class NMMemoryStateStoreService extends
NMStateStoreService {
RecoveredApplicationsState state = new RecoveredApplicationsState();
state.applications = new ArrayList<ContainerManagerApplicationProto>(
apps.values());
- state.finishedApplications = new ArrayList<ApplicationId>(finishedApps);
return state;
}
@@ -99,15 +96,9 @@ public class NMMemoryStateStoreService extends
NMStateStoreService {
}
@Override
- public synchronized void storeFinishedApplication(ApplicationId appId) {
- finishedApps.add(appId);
- }
-
- @Override
public synchronized void removeApplication(ApplicationId appId)
throws IOException {
apps.remove(appId);
- finishedApps.remove(appId);
}
@Override
@@ -393,7 +384,6 @@ public class NMMemoryStateStoreService extends
NMStateStoreService {
logDeleterState.remove(appId);
}
-
private static class TrackerState {
Map<Path, LocalResourceProto> inProgressMap =
new HashMap<Path, LocalResourceProto>();
http://git-wip-us.apache.org/repos/asf/hadoop/blob/9b5c5bd4/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
----------------------------------------------------------------------
diff --git
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
index 08b49e7..47468d6 100644
---
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
+++
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
@@ -174,7 +174,6 @@ public class TestNMLeveldbStateStoreService {
// test empty when no state
RecoveredApplicationsState state = stateStore.loadApplicationsState();
assertTrue(state.getApplications().isEmpty());
- assertTrue(state.getFinishedApplications().isEmpty());
// store an application and verify recovered
final ApplicationId appId1 = ApplicationId.newInstance(1234, 1);
@@ -188,10 +187,8 @@ public class TestNMLeveldbStateStoreService {
state = stateStore.loadApplicationsState();
assertEquals(1, state.getApplications().size());
assertEquals(appProto1, state.getApplications().get(0));
- assertTrue(state.getFinishedApplications().isEmpty());
- // finish an application and add a new one
- stateStore.storeFinishedApplication(appId1);
+ // add a new app
final ApplicationId appId2 = ApplicationId.newInstance(1234, 2);
builder = ContainerManagerApplicationProto.newBuilder();
builder.setId(((ApplicationIdPBImpl) appId2).getProto());
@@ -203,18 +200,13 @@ public class TestNMLeveldbStateStoreService {
assertEquals(2, state.getApplications().size());
assertTrue(state.getApplications().contains(appProto1));
assertTrue(state.getApplications().contains(appProto2));
- assertEquals(1, state.getFinishedApplications().size());
- assertEquals(appId1, state.getFinishedApplications().get(0));
// test removing an application
- stateStore.storeFinishedApplication(appId2);
stateStore.removeApplication(appId2);
restartStateStore();
state = stateStore.loadApplicationsState();
assertEquals(1, state.getApplications().size());
assertEquals(appProto1, state.getApplications().get(0));
- assertEquals(1, state.getFinishedApplications().size());
- assertEquals(appId1, state.getFinishedApplications().get(0));
}
@Test