This is an automated email from the ASF dual-hosted git repository.

snemeth pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/trunk by this push:
     new e8fa192  YARN-9217. Nodemanager will fail to start if GPU is 
misconfigured on the node or GPU drivers missing. Contributed by Peter Bacsko
e8fa192 is described below

commit e8fa192f07b6f2e7a0b03813edca03c505a8ac1b
Author: Szilard Nemeth <snem...@apache.org>
AuthorDate: Wed Aug 21 16:44:22 2019 +0200

    YARN-9217. Nodemanager will fail to start if GPU is misconfigured on the 
node or GPU drivers missing. Contributed by Peter Bacsko
---
 .../apache/hadoop/yarn/conf/YarnConfiguration.java | 14 ++++
 .../src/main/resources/yarn-default.xml            | 11 +++
 .../linux/resources/ResourcesExceptionUtil.java    | 42 +++++++++++
 .../resources/gpu/GpuResourceHandlerImpl.java      |  5 +-
 .../resourceplugin/ResourcePluginManager.java      |  6 +-
 .../resourceplugin/gpu/GpuDiscoverer.java          | 83 ++++++++++++----------
 .../gpu/GpuNodeResourceUpdateHandler.java          | 13 +++-
 .../resourceplugin/gpu/GpuResourcePlugin.java      | 35 +++++++--
 .../resourceplugin/gpu/NvidiaBinaryHelper.java     | 63 ++++++++++++++++
 .../resources/gpu/TestGpuResourceHandlerImpl.java  | 16 +++--
 .../resourceplugin/gpu/TestGpuDiscoverer.java      | 45 ++++++------
 11 files changed, 256 insertions(+), 77 deletions(-)

diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 134b698..1e55fe3 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -1647,6 +1647,20 @@ public class YarnConfiguration extends Configuration {
   public static final String NM_RESOURCE_PLUGINS =
       NM_PREFIX + "resource-plugins";
 
+
+  /**
+   * Specifies whether the Node Manager should fail to start if a certain
+   * device (GPU, FPGA, etc) is missing or an error occurs during its
+   * discovery. If set to "true" (the default), an exception is thrown in
+   * these cases; if "false", the problem is only logged and startup goes on.
+   */
+  @Private
+  public static final String NM_RESOURCE_PLUGINS_FAIL_FAST =
+      NM_RESOURCE_PLUGINS + ".fail-fast";
+
+  @Private
+  public static final boolean DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST = true;
+
   /**
    * This setting controls if pluggable device plugin framework is enabled.
    * */
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 4b93d1e..7a672de 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -3920,6 +3920,17 @@
 
   <property>
     <description>
+      Specifies whether the Node Manager should fail to start if a certain
+      device (GPU, FPGA, etc) is missing or an error occurs during its
+      discovery. If set to "true", an exception is thrown in these cases;
+      if set to "false", the problem is only logged and startup continues.
+    </description>
+    <name>yarn.nodemanager.resource-plugins.fail-fast</name>
+    <value></value>
+  </property>
+
+  <property>
+    <description>
       Specify GPU devices which can be managed by YARN NodeManager, split by 
comma
       Number of GPU devices will be reported to RM to make scheduling 
decisions.
       Set to auto (default) let YARN automatically discover GPU resource from
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourcesExceptionUtil.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourcesExceptionUtil.java
new file mode 100644
index 0000000..f270f42
--- /dev/null
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourcesExceptionUtil.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package 
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;
+
+import static 
org.apache.hadoop.yarn.conf.YarnConfiguration.DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST;
+import static 
org.apache.hadoop.yarn.conf.YarnConfiguration.NM_RESOURCE_PLUGINS_FAIL_FAST;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+
+/**
+ * Small utility class which only re-throws YarnException if
+ * NM_RESOURCE_PLUGINS_FAIL_FAST property is true.
+ *
+ */
+public final class ResourcesExceptionUtil {
+  private ResourcesExceptionUtil() {}
+
+  public static void throwIfNecessary(YarnException e, Configuration conf)
+      throws YarnException {
+    if (conf.getBoolean(NM_RESOURCE_PLUGINS_FAIL_FAST,
+        DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST)) {
+      throw e;
+    }
+  }
+}
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
index 71b041b..c3545fc 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
@@ -18,6 +18,8 @@
 
 package 
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
 
+import static 
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -75,7 +77,8 @@ public class GpuResourceHandlerImpl implements 
ResourceHandler {
         String message = "GPU is enabled on the NodeManager, but couldn't find 
"
             + "any usable GPU devices, please double check configuration!";
         LOG.error(message);
-        throw new ResourceHandlerException(message);
+        throwIfNecessary(new ResourceHandlerException(message),
+            configuration);
       }
     } catch (YarnException e) {
       LOG.error("Exception when trying to get usable GPU device", e);
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java
index 84cdd7a..0dfa33f 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java
@@ -72,7 +72,7 @@ public class ResourcePluginManager {
 
     Map<String, ResourcePlugin> pluginMap = Maps.newHashMap();
     if (plugins != null) {
-      pluginMap = initializePlugins(context, plugins);
+      pluginMap = initializePlugins(conf, context, plugins);
     }
 
     // Try to load pluggable device plugins
@@ -101,7 +101,7 @@ public class ResourcePluginManager {
     return plugins;
   }
 
-  private Map<String, ResourcePlugin> initializePlugins(
+  private Map<String, ResourcePlugin> initializePlugins(Configuration conf,
       Context context, String[] plugins) throws YarnException {
     Map<String, ResourcePlugin> pluginMap = Maps.newHashMap();
 
@@ -114,7 +114,7 @@ public class ResourcePluginManager {
         if (resourceName.equals(GPU_URI)) {
           final GpuDiscoverer gpuDiscoverer = new GpuDiscoverer();
           final GpuNodeResourceUpdateHandler updateHandler =
-              new GpuNodeResourceUpdateHandler(gpuDiscoverer);
+              new GpuNodeResourceUpdateHandler(gpuDiscoverer, conf);
           plugin = new GpuResourcePlugin(updateHandler, gpuDiscoverer);
         } else if (resourceName.equals(FPGA_URI)) {
           plugin = new FpgaResourcePlugin();
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
index 939093f..3f2b657 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
@@ -18,6 +18,8 @@
 
 package 
org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
 
+import static 
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
+
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Lists;
@@ -26,7 +28,6 @@ import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.util.Shell;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import 
org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
@@ -58,10 +59,9 @@ public class GpuDiscoverer extends Configured {
   private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = 
ImmutableSet.of(
       "/usr/bin", "/bin", "/usr/local/nvidia/bin");
 
-  // command should not run more than 10 sec.
-  private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
   private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
 
+  private NvidiaBinaryHelper nvidiaBinaryHelper;
   private String pathOfGpuBinary = null;
   private Map<String, String> environment = new HashMap<>();
 
@@ -110,24 +110,17 @@ public class GpuDiscoverer extends Configured {
    * @return GpuDeviceInformation
    * @throws YarnException when any error happens
    */
-  synchronized GpuDeviceInformation getGpuDeviceInformation()
+  public synchronized GpuDeviceInformation getGpuDeviceInformation()
       throws YarnException {
-    validateConfOrThrowException();
-
     if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
       String msg = getErrorMessageOfScriptExecutionThresholdReached();
       LOG.error(msg);
       throw new YarnException(msg);
     }
 
-    String output;
     try {
-      output = Shell.execCommand(environment,
-          new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS);
-      GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
-      lastDiscoveredGpuInformation = parser.parseXml(output);
-      numOfErrorExecutionSinceLastSucceed = 0;
-      return lastDiscoveredGpuInformation;
+      lastDiscoveredGpuInformation =
+          nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary);
     } catch (IOException e) {
       numOfErrorExecutionSinceLastSucceed++;
       String msg = getErrorMessageOfScriptExecution(e.getMessage());
@@ -136,14 +129,14 @@ public class GpuDiscoverer extends Configured {
     } catch (YarnException e) {
       numOfErrorExecutionSinceLastSucceed++;
       String msg = getFailedToParseErrorMessage(e.getMessage());
-      if (LOG.isDebugEnabled()) {
-        LOG.warn(msg, e);
-      }
+      LOG.debug(msg, e);
       throw e;
     }
+    numOfErrorExecutionSinceLastSucceed = 0; // only reached on success
+    return lastDiscoveredGpuInformation;
   }
 
-  private boolean IsAutoDiscoveryEnabled() {
+  private boolean isAutoDiscoveryEnabled() {
     String allowedDevicesStr = getConf().get(
         YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
         YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
@@ -155,13 +148,12 @@ public class GpuDiscoverer extends Configured {
    * Get list of GPU devices usable by YARN.
    *
    * @return List of GPU devices
-   * @throws YarnException when any issue happens
    */
   public synchronized List<GpuDevice> getGpusUsableByYarn()
       throws YarnException {
     validateConfOrThrowException();
 
-    if (IsAutoDiscoveryEnabled()) {
+    if (isAutoDiscoveryEnabled()) {
       return parseGpuDevicesFromAutoDiscoveredGpuInfo();
     } else {
       if (gpuDevicesFromUser == null) {
@@ -217,16 +209,27 @@ public class GpuDiscoverer extends Configured {
       if (device.trim().length() > 0) {
         String[] splitByColon = device.trim().split(":");
         if (splitByColon.length != 2) {
-          throw GpuDeviceSpecificationException.
-              createWithWrongValueSpecified(device, devices);
+          throwIfNecessary(GpuDeviceSpecificationException
+              .createWithWrongValueSpecified(device, devices), getConf());
+          LOG.warn("Wrong GPU specification string {}, ignored", device);
+          continue; // malformed entry: skip it, do not parse it below
+        }
+        GpuDevice gpuDevice;
+        try {
+          gpuDevice = parseGpuDevice(splitByColon);
+        } catch (NumberFormatException e) {
+          throwIfNecessary(GpuDeviceSpecificationException
+              .createWithWrongValueSpecified(device, devices, e), getConf());
+          LOG.warn("Cannot parse GPU device numbers: {}", device);
+          continue;
         }
 
-        GpuDevice gpuDevice = parseGpuDevice(device, splitByColon, devices);
         if (!gpuDevices.contains(gpuDevice)) {
           gpuDevices.add(gpuDevice);
         } else {
-          throw GpuDeviceSpecificationException
-              .createWithDuplicateValueSpecified(device, devices);
+          throwIfNecessary(GpuDeviceSpecificationException
+              .createWithDuplicateValueSpecified(device, devices), getConf());
+          LOG.warn("GPU device is duplicated: {}", device);
         }
       }
     }
@@ -235,22 +238,17 @@ public class GpuDiscoverer extends Configured {
     return gpuDevices;
   }
 
-  private GpuDevice parseGpuDevice(String device, String[] splitByColon,
-      String allowedDevicesStr) throws YarnException {
-    try {
-      int index = Integer.parseInt(splitByColon[0]);
-      int minorNumber = Integer.parseInt(splitByColon[1]);
-      return new GpuDevice(index, minorNumber);
-    } catch (NumberFormatException e) {
-      throw GpuDeviceSpecificationException.
-          createWithWrongValueSpecified(device, allowedDevicesStr, e);
-    }
+  private GpuDevice parseGpuDevice(String[] splitByColon) {
+    int index = Integer.parseInt(splitByColon[0]);
+    int minorNumber = Integer.parseInt(splitByColon[1]);
+    return new GpuDevice(index, minorNumber);
   }
 
-  public synchronized void initialize(Configuration config)
-      throws YarnException {
+  public synchronized void initialize(Configuration config,
+      NvidiaBinaryHelper nvidiaHelper) throws YarnException {
     setConf(config);
-    if (IsAutoDiscoveryEnabled()) {
+    this.nvidiaBinaryHelper = nvidiaHelper;
+    if (isAutoDiscoveryEnabled()) {
       numOfErrorExecutionSinceLastSucceed = 0;
       lookUpAutoDiscoveryBinary(config);
 
@@ -284,7 +282,18 @@ public class GpuDiscoverer extends Configured {
       binaryPath = handleConfiguredBinaryPathIsDirectory(configuredBinaryFile);
     } else {
       binaryPath = configuredBinaryFile;
+      // Complain if the configured file is not named like the expected binary
+      String fileName = binaryPath.getName();
+      if (!DEFAULT_BINARY_NAME.equals(fileName)) {
+        String msg = String.format("Please check the configuration value of"
+             +" %s. It should point to an %s binary.",
+             YarnConfiguration.NM_GPU_PATH_TO_EXEC,
+             DEFAULT_BINARY_NAME);
+        throwIfNecessary(new YarnException(msg), config);
+        LOG.warn(msg);
+      }
     }
+
     pathOfGpuBinary = binaryPath.getAbsolutePath();
   }
 
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
index 4b2258d..afb0d7e 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
@@ -18,6 +18,9 @@
 
 package 
org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
 
+import static 
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
+
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.api.records.ResourceInformation;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
@@ -36,9 +39,12 @@ public class GpuNodeResourceUpdateHandler extends 
NodeResourceUpdaterPlugin {
   private static final Logger LOG =
       LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class);
   private final GpuDiscoverer gpuDiscoverer;
+  private Configuration conf;
 
-  public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer) {
+  public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer,
+      Configuration conf) {
     this.gpuDiscoverer = gpuDiscoverer;
+    this.conf = conf;
   }
 
   @Override
@@ -51,7 +57,8 @@ public class GpuNodeResourceUpdateHandler extends 
NodeResourceUpdaterPlugin {
           "but could not find any usable GPUs on the NodeManager!";
       LOG.error(message);
       // No gpu can be used by YARN.
-      throw new YarnException(message);
+      throwIfNecessary(new YarnException(message), conf);
+      return;
     }
 
     long nUsableGpus = usableGpus.size();
@@ -59,7 +66,7 @@ public class GpuNodeResourceUpdateHandler extends 
NodeResourceUpdaterPlugin {
     Map<String, ResourceInformation> configuredResourceTypes =
         ResourceUtils.getResourceTypes();
     if (!configuredResourceTypes.containsKey(GPU_URI)) {
-      throw new YarnException("Found " + nUsableGpus + " usable GPUs, however "
+      LOG.warn("Found " + nUsableGpus + " usable GPUs, however "
           + GPU_URI
           + " resource-type is not configured inside"
           + " resource-types.xml, please configure it to enable GPU feature or"
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
index 2b06f31..d44160e 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
@@ -18,6 +18,8 @@
 
 package 
org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
 
+import java.util.List;
+
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.server.nodemanager.Context;
@@ -32,8 +34,6 @@ import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin
 import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo;
 import 
org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
 import 
org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
-
-import java.util.List;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -44,6 +44,10 @@ public class GpuResourcePlugin implements ResourcePlugin {
 
   private final GpuNodeResourceUpdateHandler resourceDiscoverHandler;
   private final GpuDiscoverer gpuDiscoverer;
+  public static final int MAX_REPEATED_ERROR_ALLOWED = 10;
+
+  private int numOfErrorExecutionSinceLastSucceed = 0;
+
   private GpuResourceHandlerImpl gpuResourceHandler = null;
   private DockerCommandPlugin dockerCommandPlugin = null;
 
@@ -55,7 +59,8 @@ public class GpuResourcePlugin implements ResourcePlugin {
 
   @Override
   public void initialize(Context context) throws YarnException {
-    this.gpuDiscoverer.initialize(context.getConf());
+    this.gpuDiscoverer.initialize(context.getConf(),
+        new NvidiaBinaryHelper());
     this.dockerCommandPlugin =
         GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin(
             context.getConf());
@@ -89,12 +94,21 @@ public class GpuResourcePlugin implements ResourcePlugin {
 
   @Override
   public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
-    GpuDeviceInformation gpuDeviceInformation =
-        gpuDiscoverer.getGpuDeviceInformation();
+    GpuDeviceInformation gpuDeviceInformation;
 
     //At this point the gpu plugin is already enabled
     checkGpuResourceHandler();
 
+    checkErrorCount();
+    try{
+      gpuDeviceInformation = gpuDiscoverer.getGpuDeviceInformation();
+      numOfErrorExecutionSinceLastSucceed = 0;
+    } catch (YarnException e) {
+      LOG.error(e.getMessage(), e);
+      numOfErrorExecutionSinceLastSucceed++;
+      throw e;
+    }
+
     GpuResourceAllocator gpuResourceAllocator =
         gpuResourceHandler.getGpuAllocator();
     List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpus();
@@ -116,6 +130,17 @@ public class GpuResourcePlugin implements ResourcePlugin {
     }
   }
 
+  private void checkErrorCount() throws YarnException {
+    if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
+      String msg =
+          "Failed to execute GPU device information detection script for "
+              + MAX_REPEATED_ERROR_ALLOWED
+              + " times, skip following executions.";
+      LOG.error(msg);
+      throw new YarnException(msg);
+    }
+  }
+
   @Override
   public String toString() {
     return GpuResourcePlugin.class.getName();
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java
new file mode 100644
index 0000000..8efc32a
--- /dev/null
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package 
org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import java.io.IOException;
+import java.util.HashMap;
+
+import org.apache.hadoop.util.Shell;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import 
org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
+import 
org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
+
+/**
+ * Executes the "nvidia-smi" command and returns an object
+ * based on its output.
+ *
+ */
+public class NvidiaBinaryHelper {
+  /**
+   * command should not run more than 10 sec.
+   */
+  private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
+
+  /**
+   * @param pathOfGpuBinary The path of the binary
+   * @return the GpuDeviceInformation parsed from the nvidia-smi output
+   * @throws IOException if the binary output is not readable
+   * @throws YarnException if the pathOfGpuBinary is null,
+   * or the output parse failed
+   */
+  synchronized GpuDeviceInformation getGpuDeviceInformation(
+      String pathOfGpuBinary) throws IOException, YarnException {
+    GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
+
+    if (pathOfGpuBinary == null) {
+      throw new YarnException(
+          "Failed to find GPU discovery executable, please double check "
+              + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting.");
+    }
+
+    String output = Shell.execCommand(new HashMap<>(),
+        new String[]{pathOfGpuBinary, "-x", "-q"}, MAX_EXEC_TIMEOUT_MS);
+    return parser.parseXml(output);
+  }
+}
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandlerImpl.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandlerImpl.java
index 777a85b..7871179 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandlerImpl.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandlerImpl.java
@@ -41,6 +41,7 @@ import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resource
 import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
 import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
 import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
+import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.NvidiaBinaryHelper;
 import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants;
 import 
org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
@@ -116,10 +117,12 @@ public class TestGpuResourceHandlerImpl {
   @Rule
   public ExpectedException expected = ExpectedException.none();
 
+  private NvidiaBinaryHelper nvidiaBinaryHelper;
+
   @Before
   public void setup() throws IOException {
     createTestDataDirectory();
-
+    nvidiaBinaryHelper = new NvidiaBinaryHelper();
     CustomResourceTypesConfigurationProvider.
         initResourceTypes(ResourceInformation.GPU_URI);
 
@@ -147,13 +150,14 @@ public class TestGpuResourceHandlerImpl {
   @After
   public void cleanupTestFiles() throws IOException {
     FileUtils.deleteDirectory(testDataDirectory);
+    nvidiaBinaryHelper = new NvidiaBinaryHelper();
   }
 
   @Test
   public void testBootstrapWithRealGpuDiscoverer() throws Exception {
     Configuration conf = createDefaultConfig();
     conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
-    gpuDiscoverer.initialize(conf);
+    gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
 
     gpuResourceHandler.bootstrap(conf);
 
@@ -171,7 +175,7 @@ public class TestGpuResourceHandlerImpl {
   public void testBootstrapWithMockGpuDiscoverer() throws Exception {
     GpuDiscoverer mockDiscoverer = mock(GpuDiscoverer.class);
     Configuration conf = new YarnConfiguration();
-    mockDiscoverer.initialize(conf);
+    mockDiscoverer.initialize(conf, nvidiaBinaryHelper);
 
     expected.expect(ResourceHandlerException.class);
     gpuResourceHandler.bootstrap(conf);
@@ -271,7 +275,7 @@ public class TestGpuResourceHandlerImpl {
     conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
 
     gpuDiscoverer = new GpuDiscoverer();
-    gpuDiscoverer.initialize(conf);
+    gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
     Context nmContext = createMockNmContext(conf);
     gpuResourceHandler = new GpuResourceHandlerImpl(nmContext,
         mockCGroupsHandler, mockPrivilegedExecutor, gpuDiscoverer);
@@ -380,7 +384,7 @@ public class TestGpuResourceHandlerImpl {
   public void testAllocationWithoutAllowedGpus() throws Exception {
     Configuration conf = createDefaultConfig();
     conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
-    gpuDiscoverer.initialize(conf);
+    gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
 
     try {
       gpuResourceHandler.bootstrap(conf);
@@ -461,7 +465,7 @@ public class TestGpuResourceHandlerImpl {
         new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler,
         mockPrivilegedExecutor, gpuDiscoverer);
 
-    gpuDiscoverer.initialize(conf);
+    gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
 
     gpuNULLStateResourceHandler.bootstrap(conf);
     verifyNumberOfAvailableGpus(4, gpuNULLStateResourceHandler);
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
index a70e668..6da23858 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
@@ -64,6 +64,7 @@ public class TestGpuDiscoverer {
   private static final String BASH_SHEBANG = "#!/bin/bash\n\n";
   private static final String TEST_PARENT_DIR = new File("target/temp/" +
       TestGpuDiscoverer.class.getName()).getAbsolutePath();
+  private NvidiaBinaryHelper binaryHelper = new NvidiaBinaryHelper();
 
   @Rule
   public ExpectedException exception = ExpectedException.none();
@@ -150,7 +151,7 @@ public class TestGpuDiscoverer {
       Configuration conf) throws YarnException {
     conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, TEST_PARENT_DIR);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     return discoverer;
   }
 
@@ -163,14 +164,14 @@ public class TestGpuDiscoverer {
     // test case 1, check default setting.
     Configuration conf = new Configuration(false);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     assertEquals(DEFAULT_BINARY_NAME, discoverer.getPathOfGpuBinary());
     assertNvidiaIsOnPath(discoverer);
 
     // test case 2, check mandatory set path.
     File fakeBinary = setupFakeBinary(conf);
     discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     assertEquals(fakeBinary.getAbsolutePath(),
         discoverer.getPathOfGpuBinary());
     assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
@@ -179,7 +180,7 @@ public class TestGpuDiscoverer {
     // but binary doesn't exist so default path will be used.
     fakeBinary.delete();
     discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     assertEquals(DEFAULT_BINARY_NAME,
         discoverer.getPathOfGpuBinary());
     assertNvidiaIsOnPath(discoverer);
@@ -317,7 +318,7 @@ public class TestGpuDiscoverer {
         Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest")));
     Configuration conf = new Configuration(false);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     GpuDeviceInformation info = discoverer.getGpuDeviceInformation();
 
     assertTrue(info.getGpus().size() > 0);
@@ -331,7 +332,7 @@ public class TestGpuDiscoverer {
     Configuration conf = createConfigWithAllowedDevices("1:2");
 
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     List<GpuDevice> usableGpuDevices = discoverer.getGpusUsableByYarn();
     assertEquals(1, usableGpuDevices.size());
 
@@ -346,7 +347,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -354,7 +355,7 @@ public class TestGpuDiscoverer {
   public void testGetNumberOfUsableGpusFromConfig() throws YarnException {
     Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,3:4");
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
 
     List<GpuDevice> usableGpuDevices = discoverer.getGpusUsableByYarn();
     assertEquals(4, usableGpuDevices.size());
@@ -379,7 +380,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -390,7 +391,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -401,7 +402,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -412,7 +413,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -423,7 +424,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -434,7 +435,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -445,7 +446,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -456,7 +457,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -467,7 +468,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -478,7 +479,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -488,7 +489,7 @@ public class TestGpuDiscoverer {
     conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/blabla");
     GpuDiscoverer plugin = new GpuDiscoverer();
     try {
-      plugin.initialize(conf);
+      plugin.initialize(conf, binaryHelper);
       plugin.getGpusUsableByYarn();
       fail("Illegal format, should fail.");
     } catch (YarnException e) {
@@ -501,15 +502,15 @@ public class TestGpuDiscoverer {
   }
 
   @Test
-  public void testScriptNotCalled() throws YarnException {
+  public void testScriptNotCalled() throws YarnException, IOException {
     Configuration conf = new Configuration();
     conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:1,2:3");
 
     GpuDiscoverer gpuSpy = spy(GpuDiscoverer.class);
 
-    gpuSpy.initialize(conf);
+    gpuSpy.initialize(conf, binaryHelper);
     gpuSpy.getGpusUsableByYarn();
 
     verify(gpuSpy, never()).getGpuDeviceInformation();
   }
-}
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-commits-h...@hadoop.apache.org

Reply via email to