HyukjinKwon closed pull request #23402: [SPARK-26500][CORE] Add conf to support 
ignoring data locality
URL: https://github.com/apache/spark/pull/23402
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala 
b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index 6f4c326442e1e..08aa2137472f8 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -188,6 +188,8 @@ private[spark] class DAGScheduler(
   /** If enabled, FetchFailed will not cause stage retry, in order to surface 
the problem. */
   private val disallowStageRetryForTest = 
sc.getConf.getBoolean("spark.test.noStageRetry", false)
 
+  private val localityIgnore = sc.conf.getBoolean("spark.locality.ignore", 
false)
+
   /**
    * Whether to unregister all the outputs on the host in condition that we 
receive a FetchFailure,
    * this is set default to false, which means, we only unregister the outputs 
related to the exact
@@ -1105,24 +1107,29 @@ private[spark] class DAGScheduler(
         outputCommitCoordinator.stageStart(
           stage = s.id, maxPartitionId = s.rdd.partitions.length - 1)
     }
-    val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
-      stage match {
-        case s: ShuffleMapStage =>
-          partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, 
id))}.toMap
-        case s: ResultStage =>
-          partitionsToCompute.map { id =>
-            val p = s.partitions(id)
-            (id, getPreferredLocs(stage.rdd, p))
-          }.toMap
+    val taskIdToLocations: Map[Int, Seq[TaskLocation]] =
+      if (localityIgnore) {
+        Map.empty
+      } else {
+        try {
+          stage match {
+            case s: ShuffleMapStage =>
+              partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, 
id)) }.toMap
+            case s: ResultStage =>
+              partitionsToCompute.map { id =>
+                val p = s.partitions(id)
+                (id, getPreferredLocs(stage.rdd, p))
+              }.toMap
+          }
+        } catch {
+          case NonFatal(e) =>
+            stage.makeNewStageAttempt(partitionsToCompute.size)
+            listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, 
properties))
+            abortStage(stage, s"Task creation failed: 
$e\n${Utils.exceptionString(e)}", Some(e))
+            runningStages -= stage
+            return
+        }
       }
-    } catch {
-      case NonFatal(e) =>
-        stage.makeNewStageAttempt(partitionsToCompute.size)
-        listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, 
properties))
-        abortStage(stage, s"Task creation failed: 
$e\n${Utils.exceptionString(e)}", Some(e))
-        runningStages -= stage
-        return
-    }
 
     stage.makeNewStageAttempt(partitionsToCompute.size, 
taskIdToLocations.values.toSeq)
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to