Author: vinodkv
Date: Tue Oct 1 00:18:09 2013
New Revision: 1527827
URL: http://svn.apache.org/r1527827
Log:
YARN-1070. Fixed race conditions in NodeManager during container-kill.
Contributed by Zhijie Shen.
Modified:
hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
Modified: hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt
URL:
http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt?rev=1527827&r1=1527826&r2=1527827&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt (original)
+++ hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt Tue Oct 1 00:18:09 2013
@@ -101,7 +101,11 @@ Release 2.1.2 - UNRELEASED
YARN-1221. With Fair Scheduler, reserved MB reported in RM web UI increases
indefinitely (Siqi Li via Sandy Ryza)
- YARN-1247. test-container-executor has gotten out of sync with the changes
to container-executor. (rvs via tucu)
+ YARN-1247. test-container-executor has gotten out of sync with the changes to
+ container-executor. (rvs via tucu)
+
+ YARN-1070. Fixed race conditions in NodeManager during container-kill.
+ (Zhijie Shen via vinodkv)
Release 2.1.1-beta - 2013-09-23
Modified:
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
URL:
http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java?rev=1527827&r1=1527826&r2=1527827&view=diff
==============================================================================
---
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
(original)
+++
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
Tue Oct 1 00:18:09 2013
@@ -68,6 +68,7 @@ import org.apache.hadoop.yarn.server.nod
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService;
import org.apache.hadoop.yarn.server.nodemanager.util.ProcessIdFileReader;
@@ -133,10 +134,22 @@ public class ContainerLaunch implements
final List<String> command = launchContext.getCommands();
int ret = -1;
+ // CONTAINER_KILLED_ON_REQUEST should not be missed if the container
+ // is already at KILLING
+ if (container.getContainerState() == ContainerState.KILLING) {
+ dispatcher.getEventHandler().handle(
+ new ContainerExitEvent(containerID,
+ ContainerEventType.CONTAINER_KILLED_ON_REQUEST,
+ Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() :
+ ExitCode.TERMINATED.getExitCode(),
+ "Container terminated before launch."));
+ return 0;
+ }
+
try {
localResources = container.getLocalizedResources();
if (localResources == null) {
- RPCUtil.getRemoteException(
+ throw RPCUtil.getRemoteException(
"Unable to get local resources when Container " + containerID +
" is at " + container.getContainerState());
}
Modified:
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java
URL:
http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java?rev=1527827&r1=1527826&r2=1527827&view=diff
==============================================================================
---
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java
(original)
+++
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java
Tue Oct 1 00:18:09 2013
@@ -75,20 +75,9 @@ public class ContainersLauncher extends
new ThreadFactoryBuilder()
.setNameFormat("ContainersLauncher #%d")
.build());
- private final Map<ContainerId,RunningContainer> running =
- Collections.synchronizedMap(new HashMap<ContainerId,RunningContainer>());
-
- private static final class RunningContainer {
- public RunningContainer(Future<Integer> submit,
- ContainerLaunch launcher) {
- this.runningcontainer = submit;
- this.launcher = launcher;
- }
-
- Future<Integer> runningcontainer;
- ContainerLaunch launcher;
- }
-
+ @VisibleForTesting
+ public final Map<ContainerId, ContainerLaunch> running =
+ Collections.synchronizedMap(new HashMap<ContainerId, ContainerLaunch>());
public ContainersLauncher(Context context, Dispatcher dispatcher,
ContainerExecutor exec, LocalDirsHandlerService dirsHandler,
@@ -133,38 +122,20 @@ public class ContainersLauncher extends
ContainerLaunch launch =
new ContainerLaunch(context, getConfig(), dispatcher, exec, app,
event.getContainer(), dirsHandler, containerManager);
- running.put(containerId,
- new RunningContainer(containerLauncher.submit(launch),
- launch));
+ containerLauncher.submit(launch);
+ running.put(containerId, launch);
break;
case CLEANUP_CONTAINER:
- RunningContainer rContainerDatum = running.remove(containerId);
- if (rContainerDatum == null) {
+ ContainerLaunch launcher = running.remove(containerId);
+ if (launcher == null) {
// Container not launched. So nothing needs to be done.
return;
}
- Future<Integer> rContainer = rContainerDatum.runningcontainer;
- if (rContainer != null
- && !rContainer.isDone()) {
- // Cancel the future so that it won't be launched if it isn't already.
- // If it is going to be canceled, make sure CONTAINER_KILLED_ON_REQUEST
- // will not be missed if the container is already at KILLING
- if (rContainer.cancel(false)) {
- if (container.getContainerState() == ContainerState.KILLING) {
- dispatcher.getEventHandler().handle(
- new ContainerExitEvent(containerId,
- ContainerEventType.CONTAINER_KILLED_ON_REQUEST,
- Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() :
- ExitCode.TERMINATED.getExitCode(),
- "Container terminated before launch."));
- }
- }
- }
// Cleanup a container whether it is running/killed/completed, so that
// no sub-processes are alive.
try {
- rContainerDatum.launcher.cleanupContainer();
+ launcher.cleanupContainer();
} catch (IOException e) {
LOG.warn("Got exception while cleaning container " + containerId
+ ". Ignoring.");
Modified:
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java
URL:
http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java?rev=1527827&r1=1527826&r2=1527827&view=diff
==============================================================================
---
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java
(original)
+++
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java
Tue Oct 1 00:18:09 2013
@@ -65,6 +65,7 @@ import org.apache.hadoop.yarn.conf.YarnC
import org.apache.hadoop.yarn.event.DrainDispatcher;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
+import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServicesEvent;
@@ -72,6 +73,7 @@ import org.apache.hadoop.yarn.server.nod
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEventType;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncher;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncherEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncherEventType;
@@ -296,8 +298,7 @@ public class TestContainer {
wc.launchContainer();
reset(wc.localizerBus);
wc.killContainer();
- assertEquals(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL,
- wc.c.getContainerState());
+ assertEquals(ContainerState.KILLING, wc.c.getContainerState());
assertNull(wc.c.getLocalizedResources());
wc.containerKilledOnRequest();
@@ -330,14 +331,18 @@ public class TestContainer {
}
@Test
- public void testKillOnLocalized() throws Exception {
+ public void testKillOnLocalizedWhenContainerNotLaunched() throws Exception {
WrappedContainer wc = null;
try {
wc = new WrappedContainer(17, 314159265358979L, 4344, "yak");
wc.initContainer();
wc.localizeResources();
assertEquals(ContainerState.LOCALIZED, wc.c.getContainerState());
+ ContainerLaunch launcher = wc.launcher.running.get(wc.c.getContainerId());
wc.killContainer();
+ assertEquals(ContainerState.KILLING, wc.c.getContainerState());
+ launcher.call();
+ wc.drainDispatcherEvents();
assertEquals(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL,
wc.c.getContainerState());
assertNull(wc.c.getLocalizedResources());
@@ -348,6 +353,31 @@ public class TestContainer {
}
}
}
+
+ @Test
+ public void testKillOnLocalizedWhenContainerLaunched() throws Exception {
+ WrappedContainer wc = null;
+ try {
+ wc = new WrappedContainer(17, 314159265358979L, 4344, "yak");
+ wc.initContainer();
+ wc.localizeResources();
+ assertEquals(ContainerState.LOCALIZED, wc.c.getContainerState());
+ ContainerLaunch launcher = wc.launcher.running.get(wc.c.getContainerId());
+ launcher.call();
+ wc.drainDispatcherEvents();
+ assertEquals(ContainerState.EXITED_WITH_FAILURE,
+ wc.c.getContainerState());
+ wc.killContainer();
+ assertEquals(ContainerState.EXITED_WITH_FAILURE,
+ wc.c.getContainerState());
+ assertNull(wc.c.getLocalizedResources());
+ verifyCleanupCall(wc);
+ } finally {
+ if (wc != null) {
+ wc.finished();
+ }
+ }
+ }
@Test
public void testResourceLocalizedOnLocalizationFailed() throws Exception {
@@ -472,12 +502,10 @@ public class TestContainer {
wc.initContainer();
wc.localizeResources();
wc.killContainer();
- assertEquals(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL,
- wc.c.getContainerState());
+ assertEquals(ContainerState.KILLING, wc.c.getContainerState());
assertNull(wc.c.getLocalizedResources());
wc.launchContainer();
- assertEquals(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL,
- wc.c.getContainerState());
+ assertEquals(ContainerState.KILLING, wc.c.getContainerState());
assertNull(wc.c.getLocalizedResources());
wc.containerKilledOnRequest();
verifyCleanupCall(wc);
@@ -650,7 +678,9 @@ public class TestContainer {
Context context = mock(Context.class);
when(context.getApplications()).thenReturn(
new ConcurrentHashMap<ApplicationId, Application>());
- launcher = new ContainersLauncher(context, dispatcher, null, null, null);
+ ContainerExecutor executor = mock(ContainerExecutor.class);
+ launcher =
+ new ContainersLauncher(context, dispatcher, executor, null, null);
// create a mock ExecutorService, which will not really launch
// ContainerLaunch at all.
launcher.containerLauncher = mock(ExecutorService.class);
Modified:
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
URL:
http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java?rev=1527827&r1=1527826&r2=1527827&view=diff
==============================================================================
---
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
(original)
+++
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
Tue Oct 1 00:18:09 2013
@@ -682,9 +682,8 @@ public class TestContainerLaunch extends
ContainerStatus containerStatus =
containerManager.getContainerStatuses(gcsRequest)
.getContainerStatuses().get(0);
- int expectedExitCode = Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() :
- ExitCode.TERMINATED.getExitCode();
- Assert.assertEquals(expectedExitCode, containerStatus.getExitStatus());
+ Assert.assertEquals(ExitCode.FORCE_KILLED.getExitCode(),
+ containerStatus.getExitStatus());
// Now verify the contents of the file. Script generates a message when it
// receives a sigterm so we look for that. We cannot perform this check on