Author: tucu
Date: Wed Oct 9 17:08:53 2013
New Revision: 1530714
URL: http://svn.apache.org/r1530714
Log:
YARN-1284. LCE: Race condition leaves dangling cgroups entries for killed
containers. (Alejandro Abdelnur via Sandy Ryza)
Added:
hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java
- copied unchanged from r1530493,
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java
Modified:
hadoop/common/branches/branch-2.2/hadoop-yarn-project/CHANGES.txt
hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java
Modified: hadoop/common/branches/branch-2.2/hadoop-yarn-project/CHANGES.txt
URL:
http://svn.apache.org/viewvc/hadoop/common/branches/branch-2.2/hadoop-yarn-project/CHANGES.txt?rev=1530714&r1=1530713&r2=1530714&view=diff
==============================================================================
--- hadoop/common/branches/branch-2.2/hadoop-yarn-project/CHANGES.txt (original)
+++ hadoop/common/branches/branch-2.2/hadoop-yarn-project/CHANGES.txt Wed Oct
9 17:08:53 2013
@@ -12,6 +12,9 @@ Release 2.2.1 - UNRELEASED
BUG FIXES
+ YARN-1284. LCE: Race condition leaves dangling cgroups entries for killed
+ containers. (Alejandro Abdelnur via Sandy Ryza)
+
Release 2.2.0 - 2013-10-13
INCOMPATIBLE CHANGES
Modified:
hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
URL:
http://svn.apache.org/viewvc/hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java?rev=1530714&r1=1530713&r2=1530714&view=diff
==============================================================================
---
hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
(original)
+++
hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
Wed Oct 9 17:08:53 2013
@@ -610,7 +610,19 @@ public class YarnConfiguration extends C
/** Where the linux container executor should mount cgroups if not found */
public static final String NM_LINUX_CONTAINER_CGROUPS_MOUNT_PATH =
NM_PREFIX + "linux-container-executor.cgroups.mount-path";
-
+
+
+ /**
+ * Maximum time, in milliseconds, the linux container executor should keep
+ * trying to delete a cgroups entry when cleaning up a container. This is
+ * needed because of an apparent race: SIGTERM/SIGKILL is asynchronous.
+ */
+ public static final String NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT =
+ NM_PREFIX + "linux-container-executor.cgroups.delete-timeout-ms";
+
+ public static final long DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT =
+ 1000;
+
/** T-file compression types used to compress aggregated logs.*/
public static final String NM_LOG_AGG_COMPRESSION_TYPE =
NM_PREFIX + "log-aggregation.compression-type";
Modified:
hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java
URL:
http://svn.apache.org/viewvc/hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java?rev=1530714&r1=1530713&r2=1530714&view=diff
==============================================================================
---
hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java
(original)
+++
hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java
Wed Oct 9 17:08:53 2013
@@ -32,6 +32,7 @@ import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
@@ -40,6 +41,8 @@ import org.apache.hadoop.yarn.api.record
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor;
+import org.apache.hadoop.yarn.util.Clock;
+import org.apache.hadoop.yarn.util.SystemClock;
public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
@@ -59,8 +62,13 @@ public class CgroupsLCEResourcesHandler
private final int CPU_DEFAULT_WEIGHT = 1024; // set by kernel
private final Map<String, String> controllerPaths; // Controller -> path
+ private long deleteCgroupTimeout;
+ // package private for testing purposes
+ Clock clock;
+
public CgroupsLCEResourcesHandler() {
this.controllerPaths = new HashMap<String, String>();
+ clock = new SystemClock();
}
@Override
@@ -73,7 +81,8 @@ public class CgroupsLCEResourcesHandler
return conf;
}
- public synchronized void init(LinuxContainerExecutor lce) throws IOException {
+ @VisibleForTesting
+ void initConfig() throws IOException {
this.cgroupPrefix = conf.get(YarnConfiguration.
NM_LINUX_CONTAINER_CGROUPS_HIERARCHY, "/hadoop-yarn");
@@ -82,6 +91,9 @@ public class CgroupsLCEResourcesHandler
this.cgroupMountPath = conf.get(YarnConfiguration.
NM_LINUX_CONTAINER_CGROUPS_MOUNT_PATH, null);
+ this.deleteCgroupTimeout = conf.getLong(
+ YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT,
+ YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT);
// remove extra /'s at end or start of cgroupPrefix
if (cgroupPrefix.charAt(0) == '/') {
cgroupPrefix = cgroupPrefix.substring(1);
@@ -91,7 +103,11 @@ public class CgroupsLCEResourcesHandler
if (cgroupPrefix.charAt(len - 1) == '/') {
cgroupPrefix = cgroupPrefix.substring(0, len - 1);
}
+ }
+ public void init(LinuxContainerExecutor lce) throws IOException {
+ initConfig();
+
// mount cgroups if requested
if (cgroupMount && cgroupMountPath != null) {
ArrayList<String> cgroupKVs = new ArrayList<String>();
@@ -158,14 +174,32 @@ public class CgroupsLCEResourcesHandler
}
}
- private void deleteCgroup(String controller, String groupName) {
- String path = pathForCgroup(controller, groupName);
+ @VisibleForTesting
+ boolean deleteCgroup(String cgroupPath) {
+ boolean deleted;
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("deleteCgroup: " + cgroupPath);
+ }
- LOG.debug("deleteCgroup: " + path);
+ long start = clock.getTime();
+ do {
+ deleted = new File(cgroupPath).delete();
+ if (!deleted) {
+ try {
+ Thread.sleep(20);
+ } catch (InterruptedException ex) {
+ // NOP
+ }
+ }
+ } while (!deleted && (clock.getTime() - start) < deleteCgroupTimeout);
- if (! new File(path).delete()) {
- LOG.warn("Unable to delete cgroup at: " + path);
+ if (!deleted) {
+ LOG.warn("Unable to delete cgroup at: " + cgroupPath +
+ ", tried to delete for " + deleteCgroupTimeout + "ms");
}
+
+ return deleted;
}
/*
@@ -185,21 +219,8 @@ public class CgroupsLCEResourcesHandler
}
private void clearLimits(ContainerId containerId) {
- String containerName = containerId.toString();
-
- // Based on testing, ApplicationMaster executables don't terminate until
- // a little after the container appears to have finished. Therefore, we
- // wait a short bit for the cgroup to become empty before deleting it.
- if (containerId.getId() == 1) {
- try {
- Thread.sleep(500);
- } catch (InterruptedException e) {
- // not a problem, continue anyway
- }
- }
-
if (isCpuWeightEnabled()) {
- deleteCgroup(CONTROLLER_CPU, containerName);
+ deleteCgroup(pathForCgroup(CONTROLLER_CPU, containerId.toString()));
}
}