This is an automated email from the ASF dual-hosted git repository.

cnauroth pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/branch-3.4 by this push:
     new f2b69bace5e YARN-11844: Support configuration of retry policy on GPU discovery
f2b69bace5e is described below

commit f2b69bace5e2f44abf3ec30c045bef9c3d7ebebd
Author: Chris Nauroth <cnaur...@apache.org>
AuthorDate: Mon Aug 11 19:06:22 2025 +0000

    YARN-11844: Support configuration of retry policy on GPU discovery
    
    Closes #7857
    
    Co-authored-by: Jayadeep Jayaraman <jjayad...@google.com>
    Reviewed-by: Ashutosh Gupta <ashutosh.gu...@st.niituniversity.in>
    Signed-off-by: Ayush Saxena <ayushsax...@apache.org>
    (cherry picked from commit 0f34922ff3babfea541a95cd4722cc9436f491a0)
---
 .../apache/hadoop/yarn/conf/YarnConfiguration.java | 20 +++++
 .../src/main/resources/yarn-default.xml            | 28 +++++++
 .../resourceplugin/gpu/GpuDiscoverer.java          | 24 ++++--
 .../resourceplugin/gpu/NvidiaBinaryHelper.java     |  9 +--
 .../resourceplugin/gpu/TestGpuDiscoverer.java      | 90 +++++++++++++++++++++-
 5 files changed, 158 insertions(+), 13 deletions(-)

diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 650e82d6738..79523c09df1 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -1901,6 +1901,26 @@ public static boolean isAclEnabled(Configuration conf) {
   public static final String NM_GPU_PATH_TO_EXEC =
       NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables";
 
+  /**
+   * Sets the maximum duration for executions of the discovery binary.
+   */
+  @Private
+  public static final String NM_GPU_DISCOVERY_TIMEOUT =
+      NM_GPU_RESOURCE_PREFIX + "discovery-timeout";
+
+  @Private
+  public static final String NM_GPU_DISCOVERY_TIMEOUT_DEFAULT = "10s";
+
+  /**
+   * Sets the maximum number of errors allowed from the discovery binary.
+   */
+  @Private
+  public static final String NM_GPU_DISCOVERY_MAX_ERRORS =
+      NM_GPU_RESOURCE_PREFIX + "discovery-max-errors";
+
+  @Private
+  public static final int NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT = 10;
+
   /**
    * Settings to control which implementation of docker plugin for GPU will be
    * used.
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 6b2d2cd817c..e444fd55d34 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -4593,6 +4593,34 @@
     <value></value>
   </property>
 
+  <property>
+    <description>
+      Sets the maximum duration for executions of the discovery binary defined in
+      yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If
+      the binary takes longer than this amount of time to run, then the process
+      is aborted. Discovery may be attempted again, depending on
+      yarn.nodemanager.resource-plugins.gpu.discovery-max-errors.
+    </description>
+    <name>yarn.nodemanager.resource-plugins.gpu.discovery-timeout</name>
+    <value>10s</value>
+  </property>
+
+  <property>
+    <description>
+      Sets the maximum number of errors allowed from the discovery binary
+      defined in
+      yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If
+      the number of errors exceeds this amount, then discovery is aborted, and
+      the NodeManager will never reattempt discovery again. Errors may be either
+      non-zero exit codes returned from the binary or timeouts as defined by
+      yarn.nodemanager.resource-plugins.gpu.discovery-timeout. Set this to a
+      negative value to disable enforcement of max errors and retry continually
+      until successful.
+    </description>
+    <name>yarn.nodemanager.resource-plugins.gpu.discovery-max-errors</name>
+    <value>10</value>
+  </property>
+
   <property>
     <description>
       Enable additional discovery/isolation of resources on the NodeManager,
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
index 60314c38374..93d8451d1e7 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
@@ -45,7 +45,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
-
+import java.util.concurrent.TimeUnit;
 
 @InterfaceAudience.Private
 @InterfaceStability.Unstable
@@ -61,10 +61,10 @@ public class GpuDiscoverer extends Configured {
   private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
       "/usr/bin", "/bin", "/usr/local/nvidia/bin");
 
-  private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
-
   private NvidiaBinaryHelper nvidiaBinaryHelper;
   private String pathOfGpuBinary = null;
+  private long discoveryTimeoutMs;
+  private int discoveryMaxErrors;
   private Map<String, String> environment = new HashMap<>();
 
   private int numOfErrorExecutionSinceLastSucceed = 0;
@@ -86,7 +86,7 @@ private String getErrorMessageOfScriptExecution(String msg) {
 
   private String getErrorMessageOfScriptExecutionThresholdReached() {
     return getFailedToExecuteScriptMessage() + " for " +
-        MAX_REPEATED_ERROR_ALLOWED + " times, " +
+        discoveryMaxErrors + " times, " +
         "skipping following executions!";
   }
 
@@ -114,7 +114,8 @@ private String getFailedToParseErrorMessage(String msg) {
    */
   public synchronized GpuDeviceInformation getGpuDeviceInformation()
       throws YarnException {
-    if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
+    if (discoveryMaxErrors >= 0 &&
+        numOfErrorExecutionSinceLastSucceed == discoveryMaxErrors) {
       String msg = getErrorMessageOfScriptExecutionThresholdReached();
       LOG.error(msg);
       throw new YarnException(msg);
@@ -122,7 +123,8 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation()
 
     try {
       lastDiscoveredGpuInformation =
-          nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary);
+          nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary,
+              discoveryTimeoutMs);
     } catch (IOException e) {
       numOfErrorExecutionSinceLastSucceed++;
       String msg = getErrorMessageOfScriptExecution(e.getMessage());
@@ -298,6 +300,16 @@ private void lookUpAutoDiscoveryBinary(Configuration config)
     }
 
     pathOfGpuBinary = binaryPath.getAbsolutePath();
+
+    discoveryTimeoutMs = config.getTimeDuration(
+        YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT,
+        YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT_DEFAULT,
+        TimeUnit.MILLISECONDS);
+
+    discoveryMaxErrors = config.getInt(
+        YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS,
+        YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT);
+
   }
 
   private File handleConfiguredBinaryPathIsDirectory(File configuredBinaryFile)
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java
index 8efc32a8b13..2c206feaa49 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java
@@ -34,10 +34,6 @@
  *
  */
 public class NvidiaBinaryHelper {
-  /**
-   * command should not run more than 10 sec.
-   */
-  private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
 
   /**
    * @param pathOfGpuBinary The path of the binary
@@ -47,7 +43,8 @@ public class NvidiaBinaryHelper {
    * or the output parse failed
    */
   synchronized GpuDeviceInformation getGpuDeviceInformation(
-      String pathOfGpuBinary) throws IOException, YarnException {
+      String pathOfGpuBinary, long discoveryTimeoutMs)
+      throws IOException, YarnException {
     GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
 
     if (pathOfGpuBinary == null) {
@@ -57,7 +54,7 @@ synchronized GpuDeviceInformation getGpuDeviceInformation(
     }
 
     String output = Shell.execCommand(new HashMap<>(),
-        new String[]{pathOfGpuBinary, "-x", "-q"}, MAX_EXEC_TIMEOUT_MS);
+        new String[]{pathOfGpuBinary, "-x", "-q"}, discoveryTimeoutMs);
     return parser.parseXml(output);
   }
 }
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
index 06791c82bb5..7d93b39066e 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
@@ -302,6 +302,94 @@ public void testGetGpuDeviceInformationFaultyNvidiaSmiScriptConsecutiveRun()
     assertNotNull(discoverer.getGpusUsableByYarn());
   }
 
+  @Test
+  public void testGetGpuDeviceInformationOverrideMaxErrors()
+      throws YarnException, IOException {
+    Configuration conf = new Configuration(false);
+    // The default is 10 max errors. Override to 11.
+    conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, 11);
+
+    // Initial creation will call the script once. Start out with a successful
+    // script. Otherwise, our error count assertions will be off by one later.
+    File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile(
+        this::createNvidiaSmiScript);
+
+    GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf);
+    assertEquals(fakeBinary.getAbsolutePath(),
+        discoverer.getPathOfGpuBinary());
+    assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
+
+    LOG.debug("Replacing script with faulty version!");
+    createFaultyNvidiaSmiScript(fakeBinary);
+
+    final String terminateMsg = "Failed to execute GPU device " +
+        "detection script (" + fakeBinary.getAbsolutePath() + ") for 11 times";
+    final String msg = "Failed to execute GPU device detection script";
+
+    // We expect 11 attempts (not the default of 10).
+    for (int i = 0; i < 11; i++) {
+      try {
+        LOG.debug("Executing faulty nvidia-smi script...");
+        discoverer.getGpuDeviceInformation();
+        fail("Query of GPU device info via nvidia-smi should fail as " +
+            "script should be faulty: " + fakeBinary);
+      } catch (YarnException e) {
+        assertThat(e.getMessage(), containsString(msg));
+        assertThat(e.getMessage(), not(containsString(terminateMsg)));
+      }
+    }
+
+    // On a 12th attempt, we've exceed the configured max of 11, so we expect
+    // the termination message.
+    try {
+      LOG.debug("Executing faulty nvidia-smi script again..." +
+          "We should reach the error threshold now!");
+      discoverer.getGpuDeviceInformation();
+      fail("Query of GPU device info via nvidia-smi should fail as " +
+          "script should be faulty: " + fakeBinary);
+    } catch (YarnException e) {
+      assertThat(e.getMessage(), containsString(terminateMsg));
+    }
+
+    LOG.debug("Verifying if GPUs are still hold the value of " +
+        "first successful query");
+    assertNotNull(discoverer.getGpusUsableByYarn());
+  }
+
+  @Test
+  public void testGetGpuDeviceInformationDisableMaxErrors()
+      throws YarnException, IOException {
+    Configuration conf = new Configuration(false);
+    // A negative value should disable max errors enforcement.
+    conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, -1);
+
+    File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile(
+        this::createFaultyNvidiaSmiScript);
+
+    GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf);
+    assertEquals(fakeBinary.getAbsolutePath(),
+        discoverer.getPathOfGpuBinary());
+    assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
+
+    final String terminateMsg = "Failed to execute GPU device " +
+        "detection script (" + fakeBinary.getAbsolutePath() + ") for 10 times";
+    final String msg = "Failed to execute GPU device detection script";
+
+    // The default max errors is 10. Verify that it keeps going for more, and we
+    // never see the termination message.
+    for (int i = 0; i < 20; ++i) {
+      try {
+        LOG.debug("Executing faulty nvidia-smi script...");
+        discoverer.getGpuDeviceInformation();
+        fail("Query of GPU device info via nvidia-smi should fail as " +
+            "script should be faulty: " + fakeBinary);
+      } catch (YarnException e) {
+        assertThat(e.getMessage(), containsString(msg));
+        assertThat(e.getMessage(), not(containsString(terminateMsg)));
+      }
+    }
+  }
+
   @Test
   public void testGetGpuDeviceInformationNvidiaSmiScriptWithInvalidXml()
       throws YarnException, IOException {
@@ -538,4 +626,4 @@ public void testBinaryIsNotNvidiaSmi() throws YarnException {
     GpuDiscoverer plugin = new GpuDiscoverer();
     plugin.initialize(conf, binaryHelper);
   }
-}
\ No newline at end of file
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-commits-h...@hadoop.apache.org

Reply via email to