Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/22001#discussion_r209273451 --- Diff: core/src/main/scala/org/apache/spark/internal/config/package.scala --- @@ -577,4 +577,17 @@ package object config { .timeConf(TimeUnit.SECONDS) .checkValue(v => v > 0, "The value should be a positive time value.") .createWithDefaultString("365d") + + private[spark] val BARRIER_MAX_CONCURRENT_TASKS_CHECK_INTERVAL = + ConfigBuilder("spark.scheduler.barrier.maxConcurrentTasksCheck.interval") + .doc("Time in seconds to wait between a max concurrent tasks check failure and the next " + + "check. A max concurrent tasks check ensures the cluster can launch more concurrent " + + "tasks than required by a barrier stage on job submitted. The check can fail in case " + + "a cluster has just started and not enough executors have registered, so we wait for a " + + "little while and try to perform the check again. If the check fails consecutively for " + + "three times for a job then fail current job submission. Note this config only applies " + + "to jobs that contain one or more barrier stages, we won't perform the check on " + + "non-barrier jobs.") + .timeConf(TimeUnit.SECONDS) + .createWithDefaultString("10s") --- End diff -- Would you make the default higher, like `30s`? This is to cover the case when an application starts immediately with a barrier stage while the master is adding new executors. Let me know if this won't happen.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org