Author: tucu
Date: Wed Oct  9 17:08:53 2013
New Revision: 1530714

URL: http://svn.apache.org/r1530714
Log:
YARN-1284. LCE: Race condition leaves dangling cgroups entries for killed 
containers. (Alejandro Abdelnur via Sandy Ryza)

Added:
    
hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java
      - copied unchanged from r1530493, 
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java
Modified:
    hadoop/common/branches/branch-2.2/hadoop-yarn-project/CHANGES.txt
    
hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
    
hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java

Modified: hadoop/common/branches/branch-2.2/hadoop-yarn-project/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/hadoop/common/branches/branch-2.2/hadoop-yarn-project/CHANGES.txt?rev=1530714&r1=1530713&r2=1530714&view=diff
==============================================================================
--- hadoop/common/branches/branch-2.2/hadoop-yarn-project/CHANGES.txt (original)
+++ hadoop/common/branches/branch-2.2/hadoop-yarn-project/CHANGES.txt Wed Oct  
9 17:08:53 2013
@@ -12,6 +12,9 @@ Release 2.2.1 - UNRELEASED
 
   BUG FIXES
 
+    YARN-1284. LCE: Race condition leaves dangling cgroups entries for killed
+    containers. (Alejandro Abdelnur via Sandy Ryza)
+
 Release 2.2.0 - 2013-10-13
 
   INCOMPATIBLE CHANGES

Modified: 
hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
URL: 
http://svn.apache.org/viewvc/hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java?rev=1530714&r1=1530713&r2=1530714&view=diff
==============================================================================
--- 
hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
 (original)
+++ 
hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
 Wed Oct  9 17:08:53 2013
@@ -610,7 +610,19 @@ public class YarnConfiguration extends C
   /** Where the linux container executor should mount cgroups if not found */
   public static final String NM_LINUX_CONTAINER_CGROUPS_MOUNT_PATH =
     NM_PREFIX + "linux-container-executor.cgroups.mount-path";
-  
+
+
+  /**
+   * Interval of time the linux container executor should try cleaning up
+   * cgroups entry when cleaning up a container. This is required due to what 
+   * it seems a race condition because the SIGTERM/SIGKILL is asynch.
+   */
+  public static final String NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT =
+   NM_PREFIX + "linux-container-executor.cgroups.delete-timeout-ms";
+
+  public static final long DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT =
+      1000;
+
   /** T-file compression types used to compress aggregated logs.*/
   public static final String NM_LOG_AGG_COMPRESSION_TYPE = 
     NM_PREFIX + "log-aggregation.compression-type";

Modified: 
hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java
URL: 
http://svn.apache.org/viewvc/hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java?rev=1530714&r1=1530713&r2=1530714&view=diff
==============================================================================
--- 
hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java
 (original)
+++ 
hadoop/common/branches/branch-2.2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java
 Wed Oct  9 17:08:53 2013
@@ -32,6 +32,7 @@ import java.util.Map.Entry;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import com.google.common.annotations.VisibleForTesting;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -40,6 +41,8 @@ import org.apache.hadoop.yarn.api.record
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor;
+import org.apache.hadoop.yarn.util.Clock;
+import org.apache.hadoop.yarn.util.SystemClock;
 
 public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
 
@@ -59,8 +62,13 @@ public class CgroupsLCEResourcesHandler 
   private final int CPU_DEFAULT_WEIGHT = 1024; // set by kernel
   private final Map<String, String> controllerPaths; // Controller -> path
 
+  private long deleteCgroupTimeout;
+  // package private for testing purposes
+  Clock clock;
+  
   public CgroupsLCEResourcesHandler() {
     this.controllerPaths = new HashMap<String, String>();
+    clock = new SystemClock();
   }
 
   @Override
@@ -73,7 +81,8 @@ public class CgroupsLCEResourcesHandler 
     return conf;
   }
 
-  public synchronized void init(LinuxContainerExecutor lce) throws IOException 
{
+  @VisibleForTesting
+  void initConfig() throws IOException {
 
     this.cgroupPrefix = conf.get(YarnConfiguration.
             NM_LINUX_CONTAINER_CGROUPS_HIERARCHY, "/hadoop-yarn");
@@ -82,6 +91,9 @@ public class CgroupsLCEResourcesHandler 
     this.cgroupMountPath = conf.get(YarnConfiguration.
             NM_LINUX_CONTAINER_CGROUPS_MOUNT_PATH, null);
 
+    this.deleteCgroupTimeout = conf.getLong(
+        YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT,
+        YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT);
     // remove extra /'s at end or start of cgroupPrefix
     if (cgroupPrefix.charAt(0) == '/') {
       cgroupPrefix = cgroupPrefix.substring(1);
@@ -91,7 +103,11 @@ public class CgroupsLCEResourcesHandler 
     if (cgroupPrefix.charAt(len - 1) == '/') {
       cgroupPrefix = cgroupPrefix.substring(0, len - 1);
     }
+  }
   
+  public void init(LinuxContainerExecutor lce) throws IOException {
+    initConfig();
+    
     // mount cgroups if requested
     if (cgroupMount && cgroupMountPath != null) {
       ArrayList<String> cgroupKVs = new ArrayList<String>();
@@ -158,14 +174,32 @@ public class CgroupsLCEResourcesHandler 
     }
   }
 
-  private void deleteCgroup(String controller, String groupName) {
-    String path = pathForCgroup(controller, groupName);
+  @VisibleForTesting
+  boolean deleteCgroup(String cgroupPath) {
+    boolean deleted;
+    
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("deleteCgroup: " + cgroupPath);
+    }
 
-    LOG.debug("deleteCgroup: " + path);
+    long start = clock.getTime();
+    do {
+      deleted = new File(cgroupPath).delete();
+      if (!deleted) {
+        try {
+          Thread.sleep(20);
+        } catch (InterruptedException ex) {
+          // NOP        
+        }
+      }
+    } while (!deleted && (clock.getTime() - start) < deleteCgroupTimeout);
 
-    if (! new File(path).delete()) {
-      LOG.warn("Unable to delete cgroup at: " + path);
+    if (!deleted) {
+      LOG.warn("Unable to delete cgroup at: " + cgroupPath +
+          ", tried to delete for " + deleteCgroupTimeout + "ms");
     }
+
+    return deleted;
   }
 
   /*
@@ -185,21 +219,8 @@ public class CgroupsLCEResourcesHandler 
   }
 
   private void clearLimits(ContainerId containerId) {
-    String containerName = containerId.toString();
-
-    // Based on testing, ApplicationMaster executables don't terminate until
-    // a little after the container appears to have finished. Therefore, we
-    // wait a short bit for the cgroup to become empty before deleting it.
-    if (containerId.getId() == 1) {
-      try {
-        Thread.sleep(500);
-      } catch (InterruptedException e) {
-        // not a problem, continue anyway
-      }
-    }
-
     if (isCpuWeightEnabled()) {
-      deleteCgroup(CONTROLLER_CPU, containerName);
+      deleteCgroup(pathForCgroup(CONTROLLER_CPU, containerId.toString()));
     }
   }
 


Reply via email to