HyukjinKwon closed pull request #23402: [SPARK-26500][CORE] Add conf to support ignoring data locality URL: https://github.com/apache/spark/pull/23402
This is a PR from a forked repository. As GitHub hides the original diff once the pull request is closed, it is reproduced below for the sake of provenance:

diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index 6f4c326442e1e..08aa2137472f8 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -188,6 +188,8 @@ private[spark] class DAGScheduler(
   /** If enabled, FetchFailed will not cause stage retry, in order to surface the problem. */
   private val disallowStageRetryForTest = sc.getConf.getBoolean("spark.test.noStageRetry", false)
 
+  private val localityIgnore = sc.conf.getBoolean("spark.locality.ignore", false)
+
   /**
    * Whether to unregister all the outputs on the host in condition that we receive a FetchFailure,
    * this is set default to false, which means, we only unregister the outputs related to the exact
@@ -1105,24 +1107,29 @@ private[spark] class DAGScheduler(
         outputCommitCoordinator.stageStart(
           stage = s.id, maxPartitionId = s.rdd.partitions.length - 1)
     }
-    val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
-      stage match {
-        case s: ShuffleMapStage =>
-          partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
-        case s: ResultStage =>
-          partitionsToCompute.map { id =>
-            val p = s.partitions(id)
-            (id, getPreferredLocs(stage.rdd, p))
-          }.toMap
+    val taskIdToLocations: Map[Int, Seq[TaskLocation]] =
+      if (localityIgnore) {
+        Map.empty
+      } else {
+        try {
+          stage match {
+            case s: ShuffleMapStage =>
+              partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id)) }.toMap
+            case s: ResultStage =>
+              partitionsToCompute.map { id =>
+                val p = s.partitions(id)
+                (id, getPreferredLocs(stage.rdd, p))
+              }.toMap
+          }
+        } catch {
+          case NonFatal(e) =>
+            stage.makeNewStageAttempt(partitionsToCompute.size)
+            listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
+            abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
+            runningStages -= stage
+            return
+        }
       }
-    } catch {
-      case NonFatal(e) =>
-        stage.makeNewStageAttempt(partitionsToCompute.size)
-        listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
-        abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
-        runningStages -= stage
-        return
-    }
 
     stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
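For context, a minimal usage sketch of the flag this patch introduces, assuming a Spark build with the patch applied. Note that "spark.locality.ignore" exists only with this change; it is not a standard Spark configuration, and stock Spark would simply ignore the key. The job body is an arbitrary illustration.

// Minimal sketch, assuming a Spark build that includes this patch.
// "spark.locality.ignore" is introduced by the patch and does not exist
// in stock Spark, where an unknown conf key has no effect.
import org.apache.spark.{SparkConf, SparkContext}

object LocalityIgnoreDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("locality-ignore-demo")
      // With the patch, DAGScheduler skips computing preferred task
      // locations, so tasks are submitted with no locality preference.
      .set("spark.locality.ignore", "true")
    val sc = new SparkContext(conf)

    // Any shuffle exercises the changed path: both the ShuffleMapStage and
    // the ResultStage would normally have preferred locations computed.
    val counts = sc.parallelize(1 to 1000000, numSlices = 100)
      .map(i => (i % 10, 1L))
      .reduceByKey(_ + _)
      .collect()

    counts.sortBy(_._1).foreach { case (k, v) => println(s"$k -> $v") }
    sc.stop()
  }
}

Since the value is read from the SparkConf inside DAGScheduler, the flag could equally be supplied at launch time, e.g. spark-submit --conf spark.locality.ignore=true.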