This is an automated email from the ASF dual-hosted git repository. cnauroth pushed a commit to branch branch-3.4 in repository https://gitbox.apache.org/repos/asf/hadoop.git
The following commit(s) were added to refs/heads/branch-3.4 by this push: new f2b69bace5e YARN-11844: Support configuration of retry policy on GPU discovery f2b69bace5e is described below commit f2b69bace5e2f44abf3ec30c045bef9c3d7ebebd Author: Chris Nauroth <cnaur...@apache.org> AuthorDate: Mon Aug 11 19:06:22 2025 +0000 YARN-11844: Support configuration of retry policy on GPU discovery Closes #7857 Co-authored-by: Jayadeep Jayaraman <jjayad...@google.com> Reviewed-by: Ashutosh Gupta <ashutosh.gu...@st.niituniversity.in> Signed-off-by: Ayush Saxena <ayushsax...@apache.org> (cherry picked from commit 0f34922ff3babfea541a95cd4722cc9436f491a0) --- .../apache/hadoop/yarn/conf/YarnConfiguration.java | 20 +++++ .../src/main/resources/yarn-default.xml | 28 +++++++ .../resourceplugin/gpu/GpuDiscoverer.java | 24 ++++-- .../resourceplugin/gpu/NvidiaBinaryHelper.java | 9 +-- .../resourceplugin/gpu/TestGpuDiscoverer.java | 90 +++++++++++++++++++++- 5 files changed, 158 insertions(+), 13 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 650e82d6738..79523c09df1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1901,6 +1901,26 @@ public static boolean isAclEnabled(Configuration conf) { public static final String NM_GPU_PATH_TO_EXEC = NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables"; + /** + * Sets the maximum duration for executions of the discovery binary. 
+ */ + @Private + public static final String NM_GPU_DISCOVERY_TIMEOUT = + NM_GPU_RESOURCE_PREFIX + "discovery-timeout"; + + @Private + public static final String NM_GPU_DISCOVERY_TIMEOUT_DEFAULT = "10s"; + + /** + * Sets the maximum number of errors allowed from the discovery binary. + */ + @Private + public static final String NM_GPU_DISCOVERY_MAX_ERRORS = + NM_GPU_RESOURCE_PREFIX + "discovery-max-errors"; + + @Private + public static final int NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT = 10; + /** * Settings to control which implementation of docker plugin for GPU will be * used. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 6b2d2cd817c..e444fd55d34 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -4593,6 +4593,34 @@ <value></value> </property> + <property> + <description> + Sets the maximum duration for executions of the discovery binary defined in + yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If + the binary takes longer than this amount of time to run, then the process + is aborted. Discovery may be attempted again, depending on + yarn.nodemanager.resource-plugins.gpu.discovery-max-errors. + </description> + <name>yarn.nodemanager.resource-plugins.gpu.discovery-timeout</name> + <value>10s</value> + </property> + + <property> + <description> + Sets the maximum number of errors allowed from the discovery binary + defined in + yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If + the number of errors exceeds this amount, then discovery is aborted, and + the NodeManager will never reattempt discovery again. 
Errors may be either + non-zero exit codes returned from the binary or timeouts as defined by + yarn.nodemanager.resource-plugins.gpu.discovery-timeout. Set this to a + negative value to disable enforcement of max errors and retry continually + until successful. + </description> + <name>yarn.nodemanager.resource-plugins.gpu.discovery-max-errors</name> + <value>10</value> + </property> + <property> <description> Enable additional discovery/isolation of resources on the NodeManager, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java index 60314c38374..93d8451d1e7 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java @@ -45,7 +45,7 @@ import java.util.List; import java.util.Map; import java.util.Set; - +import java.util.concurrent.TimeUnit; @InterfaceAudience.Private @InterfaceStability.Unstable @@ -61,10 +61,10 @@ public class GpuDiscoverer extends Configured { private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of( "/usr/bin", "/bin", "/usr/local/nvidia/bin"); - private static final int MAX_REPEATED_ERROR_ALLOWED = 10; - private NvidiaBinaryHelper nvidiaBinaryHelper; private String pathOfGpuBinary = null; + private long discoveryTimeoutMs; + private int discoveryMaxErrors; private Map<String, String> environment = new HashMap<>(); private int 
numOfErrorExecutionSinceLastSucceed = 0; @@ -86,7 +86,7 @@ private String getErrorMessageOfScriptExecution(String msg) { private String getErrorMessageOfScriptExecutionThresholdReached() { return getFailedToExecuteScriptMessage() + " for " + - MAX_REPEATED_ERROR_ALLOWED + " times, " + + discoveryMaxErrors + " times, " + "skipping following executions!"; } @@ -114,7 +114,8 @@ private String getFailedToParseErrorMessage(String msg) { */ public synchronized GpuDeviceInformation getGpuDeviceInformation() throws YarnException { - if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) { + if (discoveryMaxErrors >= 0 && + numOfErrorExecutionSinceLastSucceed == discoveryMaxErrors) { String msg = getErrorMessageOfScriptExecutionThresholdReached(); LOG.error(msg); throw new YarnException(msg); @@ -122,7 +123,8 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation() try { lastDiscoveredGpuInformation = - nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary); + nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary, + discoveryTimeoutMs); } catch (IOException e) { numOfErrorExecutionSinceLastSucceed++; String msg = getErrorMessageOfScriptExecution(e.getMessage()); @@ -298,6 +300,16 @@ private void lookUpAutoDiscoveryBinary(Configuration config) } pathOfGpuBinary = binaryPath.getAbsolutePath(); + + discoveryTimeoutMs = config.getTimeDuration( + YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT, + YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT_DEFAULT, + TimeUnit.MILLISECONDS); + + discoveryMaxErrors = config.getInt( + YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, + YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT); + } private File handleConfiguredBinaryPathIsDirectory(File configuredBinaryFile) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java index 8efc32a8b13..2c206feaa49 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java @@ -34,10 +34,6 @@ * */ public class NvidiaBinaryHelper { - /** - * command should not run more than 10 sec. - */ - private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000; /** * @param pathOfGpuBinary The path of the binary @@ -47,7 +43,8 @@ public class NvidiaBinaryHelper { * or the output parse failed */ synchronized GpuDeviceInformation getGpuDeviceInformation( - String pathOfGpuBinary) throws IOException, YarnException { + String pathOfGpuBinary, long discoveryTimeoutMs) + throws IOException, YarnException { GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); if (pathOfGpuBinary == null) { @@ -57,7 +54,7 @@ synchronized GpuDeviceInformation getGpuDeviceInformation( } String output = Shell.execCommand(new HashMap<>(), - new String[]{pathOfGpuBinary, "-x", "-q"}, MAX_EXEC_TIMEOUT_MS); + new String[]{pathOfGpuBinary, "-x", "-q"}, discoveryTimeoutMs); return parser.parseXml(output); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java index 
06791c82bb5..7d93b39066e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -302,6 +302,94 @@ public void testGetGpuDeviceInformationFaultyNvidiaSmiScriptConsecutiveRun() assertNotNull(discoverer.getGpusUsableByYarn()); } + @Test + public void testGetGpuDeviceInformationOverrideMaxErrors() + throws YarnException, IOException { + Configuration conf = new Configuration(false); + // The default is 10 max errors. Override to 11. + conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, 11); + + // Initial creation will call the script once. Start out with a successful + // script. Otherwise, our error count assertions will be off by one later. + File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile( + this::createNvidiaSmiScript); + + GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf); + assertEquals(fakeBinary.getAbsolutePath(), + discoverer.getPathOfGpuBinary()); + assertNull(discoverer.getEnvironmentToRunCommand().get(PATH)); + + LOG.debug("Replacing script with faulty version!"); + createFaultyNvidiaSmiScript(fakeBinary); + + final String terminateMsg = "Failed to execute GPU device " + + "detection script (" + fakeBinary.getAbsolutePath() + ") for 11 times"; + final String msg = "Failed to execute GPU device detection script"; + + // We expect 11 attempts (not the default of 10). 
+ for (int i = 0; i < 11; i++) { + try { + LOG.debug("Executing faulty nvidia-smi script..."); + discoverer.getGpuDeviceInformation(); + fail("Query of GPU device info via nvidia-smi should fail as " + + "script should be faulty: " + fakeBinary); + } catch (YarnException e) { + assertThat(e.getMessage(), containsString(msg)); + assertThat(e.getMessage(), not(containsString(terminateMsg))); + } + } + + // On a 12th attempt, we've exceeded the configured max of 11, so we expect + // the termination message. + try { + LOG.debug("Executing faulty nvidia-smi script again..." + + "We should reach the error threshold now!"); + discoverer.getGpuDeviceInformation(); + fail("Query of GPU device info via nvidia-smi should fail as " + + "script should be faulty: " + fakeBinary); + } catch (YarnException e) { + assertThat(e.getMessage(), containsString(terminateMsg)); + } + + LOG.debug("Verifying if GPUs are still hold the value of " + + "first successful query"); + assertNotNull(discoverer.getGpusUsableByYarn()); + } + + @Test + public void testGetGpuDeviceInformationDisableMaxErrors() + throws YarnException, IOException { + Configuration conf = new Configuration(false); + // A negative value should disable max errors enforcement. + conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, -1); + + File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile( + this::createFaultyNvidiaSmiScript); + + GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf); + assertEquals(fakeBinary.getAbsolutePath(), + discoverer.getPathOfGpuBinary()); + assertNull(discoverer.getEnvironmentToRunCommand().get(PATH)); + + final String terminateMsg = "Failed to execute GPU device " + + "detection script (" + fakeBinary.getAbsolutePath() + ") for 10 times"; + final String msg = "Failed to execute GPU device detection script"; + + // The default max errors is 10. Verify that it keeps going for more, and we + // never see the termination message. 
+ for (int i = 0; i < 20; ++i) { + try { + LOG.debug("Executing faulty nvidia-smi script..."); + discoverer.getGpuDeviceInformation(); + fail("Query of GPU device info via nvidia-smi should fail as " + + "script should be faulty: " + fakeBinary); + } catch (YarnException e) { + assertThat(e.getMessage(), containsString(msg)); + assertThat(e.getMessage(), not(containsString(terminateMsg))); + } + } + } + @Test public void testGetGpuDeviceInformationNvidiaSmiScriptWithInvalidXml() throws YarnException, IOException { @@ -538,4 +626,4 @@ public void testBinaryIsNotNvidiaSmi() throws YarnException { GpuDiscoverer plugin = new GpuDiscoverer(); plugin.initialize(conf, binaryHelper); } -} \ No newline at end of file +} --------------------------------------------------------------------- To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org For additional commands, e-mail: common-commits-h...@hadoop.apache.org