This is an automated email from the ASF dual-hosted git repository. tgraves pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new dcada3d [SPARK-36624][YARN] In yarn client mode, when ApplicationMaster failed with KILLED/FAILED, driver should exit with code not 0 dcada3d is described below commit dcada3d48c51f4855c600dc254883bd9eb3a0a1c Author: Angerszhuuuu <angers....@gmail.com> AuthorDate: Wed Sep 29 11:12:01 2021 -0500 [SPARK-36624][YARN] In yarn client mode, when ApplicationMaster failed with KILLED/FAILED, driver should exit with code not 0 ### What changes were proposed in this pull request? In the current code for yarn client mode, even when users use `yarn application -kill` to kill the application, the driver side still exits with code 0. This behavior makes it impossible for the job scheduler to know that the job was not successful, and the user doesn't know either. In this case we should exit the program with a non-zero code. ### Why are the changes needed? Make the scheduler/user more clear about the application's status ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Closes #33873 from AngersZhuuuu/SPDI-36624. Authored-by: Angerszhuuuu <angers....@gmail.com> Signed-off-by: Thomas Graves <tgra...@apache.org> --- docs/running-on-yarn.md | 10 ++++++++++ .../src/main/scala/org/apache/spark/deploy/yarn/config.scala | 11 +++++++++++ .../spark/scheduler/cluster/YarnClientSchedulerBackend.scala | 10 +++++++++- 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 9930f3e..37ff479 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -442,6 +442,16 @@ To use a custom metrics.properties for the application master and executors, upd <td>1.6.0</td> </tr> <tr> + <td><code>spark.yarn.am.clientModeExitOnError</code></td> + <td>false</td> + <td> + In yarn-client mode, when this is true, if driver got application report with final status of KILLED or FAILED, + driver will stop corresponding SparkContext and exit program with code 1.
+ Note, if this is true and called from another application, it will terminate the parent application as well. + </td> + <td>3.3.0</td> +</tr> +<tr> <td><code>spark.yarn.executor.failuresValidityInterval</code></td> <td>(none)</td> <td> diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala index 89a4af2..ab2063c 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala @@ -52,6 +52,17 @@ package object config extends Logging { .timeConf(TimeUnit.MILLISECONDS) .createOptional + private[spark] val AM_CLIENT_MODE_EXIT_ON_ERROR = + ConfigBuilder("spark.yarn.am.clientModeExitOnError") + .doc("In yarn-client mode, when this is true, if driver got " + + "application report with final status of KILLED or FAILED, " + + "driver will stop corresponding SparkContext and exit program with code 1.
" + + "Note, if this is true and called from another application, it will terminate " + + "the parent application as well.") + .version("3.3.0") + .booleanConf + .createWithDefault(false) + private[spark] val EXECUTOR_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS = ConfigBuilder("spark.yarn.executor.failuresValidityInterval") .doc("Interval after which Executor failures will be considered independent and not " + diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala index 8a55e61..28c8652 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala @@ -21,7 +21,7 @@ import java.io.InterruptedIOException import scala.collection.mutable.ArrayBuffer -import org.apache.hadoop.yarn.api.records.YarnApplicationState +import org.apache.hadoop.yarn.api.records.{FinalApplicationStatus, YarnApplicationState} import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.deploy.yarn.{Client, ClientArguments, YarnAppReport} @@ -122,6 +122,14 @@ private[spark] class YarnClientSchedulerBackend( } allowInterrupt = false sc.stop() + state match { + case FinalApplicationStatus.FAILED | FinalApplicationStatus.KILLED + if conf.get(AM_CLIENT_MODE_EXIT_ON_ERROR) => + logWarning(s"ApplicationMaster finished with status ${state}, " + + s"SparkContext should exit with code 1.") + System.exit(1) + case _ => + } } catch { case _: InterruptedException | _: InterruptedIOException => logInfo("Interrupting monitor thread") --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org