[GitHub] [spark] dongjoon-hyun commented on a change in pull request #26206: [SPARK-29551][CORE] Fix a bug about fetch failed when an executor is lost

GitBox Wed, 23 Oct 2019 11:39:07 -0700

dongjoon-hyun commented on a change in pull request #26206: [SPARK-29551][CORE] 
Fix a bug about fetch failed when an executor is lost
URL: https://github.com/apache/spark/pull/26206#discussion_r338214733


 ##########
 File path: 
core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
 ##########
 @@ -537,6 +536,169 @@ class DAGSchedulerSuite extends SparkFunSuite with 
LocalSparkContext with TimeLi
     assert(mapStatus2(2).location.host === "hostB")
   }
 
+  test("All shuffle files on the executor should be cleaned up when executor 
lost " +
+    "and then causes 'fetch failed'") {
+    // whether to kill Executor or not before FetchFailed
+    Seq(true, false).foreach { killExecutor => {
+      afterEach()
+      val conf = new SparkConf()
+      conf.set(config.SHUFFLE_SERVICE_ENABLED.key, "true")
+      conf.set("spark.files.fetchFailure.unRegisterOutputOnHost", "false")
+      init(conf)
+      runEvent(ExecutorAdded("exec-hostA1", "hostA"))
+      runEvent(ExecutorAdded("exec-hostA2", "hostA"))
+      runEvent(ExecutorAdded("exec-hostB", "hostB"))
+      val firstRDD = new MyRDD(sc, 3, Nil)
+      val firstShuffleDep = new ShuffleDependency(firstRDD, new 
HashPartitioner(3))
+      val firstShuffleId = firstShuffleDep.shuffleId
+      val shuffleMapRdd = new MyRDD(sc, 3, List(firstShuffleDep))
+      val shuffleDep = new ShuffleDependency(shuffleMapRdd, new 
HashPartitioner(3))
+      val secondShuffleId = shuffleDep.shuffleId
+      val reduceRdd = new MyRDD(sc, 1, List(shuffleDep))
+
+      submit(reduceRdd, Array(0))
+      // map stage1 completes successfully, with one task on each executor
+      complete(taskSets(0), Seq(
+        (Success,
+          MapStatus(BlockManagerId("exec-hostA1", "hostA", 12345),
+            Array.fill[Long](1)(2), mapTaskId = 5)),
+        (Success,
+          MapStatus(BlockManagerId("exec-hostA2", "hostA", 12345),
+            Array.fill[Long](1)(2), mapTaskId = 6)),
+        (Success, makeMapStatus("hostB", 1, mapTaskId = 7))
+      ))
+      // map stage2 completes successfully, with one task on each executor
+      complete(taskSets(1), Seq(
+        (Success,
+          MapStatus(BlockManagerId("exec-hostA1", "hostA", 12345),
+            Array.fill[Long](1)(2), mapTaskId = 8)),
+        (Success,
+          MapStatus(BlockManagerId("exec-hostA2", "hostA", 12345),
+            Array.fill[Long](1)(2), mapTaskId = 9)),
+        (Success, makeMapStatus("hostB", 1))
+      ))
+      // make sure our test setup is correct
+      val initialMapStatus1 = 
mapOutputTracker.shuffleStatuses(firstShuffleId).mapStatuses
+      //  val initialMapStatus1 = mapOutputTracker.mapStatuses.get(0).get
+      assert(initialMapStatus1.count(_ != null) === 3)
+      assert(initialMapStatus1.map {
+        _.location.executorId
+      }.toSet ===
+        Set("exec-hostA1", "exec-hostA2", "exec-hostB"))
+      val initialMapStatus2 = 
mapOutputTracker.shuffleStatuses(secondShuffleId).mapStatuses
+      assert(initialMapStatus2.count(_ != null) === 3)
+      assert(initialMapStatus2.map {
+        _.location.executorId
+      }.toSet ===
+        Set("exec-hostA1", "exec-hostA2", "exec-hostB"))
+      // kill exec-hostA2
+      if (killExecutor) {
+        runEvent(ExecutorLost("exec-hostA2", ExecutorKilled))
+      }
+      // reduce stage fails with a fetch failure from one host
+      complete(taskSets(2), Seq(
+        (FetchFailed(BlockManagerId("exec-hostA2", "hostA", 12345),
+          secondShuffleId, 0L, 0, 0, "ignored"), null)
+      ))
+      // Here is the main assertion -- make sure that we de-register
+      // the map outputs for exec-hostA2
+      val mapStatus1 = 
mapOutputTracker.shuffleStatuses(firstShuffleId).mapStatuses
+      assert(mapStatus1.count(_ != null) === 2)
+      assert(mapStatus1(0).location.executorId === "exec-hostA1")
+      assert(mapStatus1(0).location.host === "hostA")
+      assert(mapStatus1(2).location.executorId === "exec-hostB")
+      assert(mapStatus1(2).location.host === "hostB")
+
+      val mapStatus2 = 
mapOutputTracker.shuffleStatuses(secondShuffleId).mapStatuses
+      assert(mapStatus2.count(_ != null) === 2)
+      assert(mapStatus2(0).location.executorId === "exec-hostA1")
+      assert(mapStatus2(0).location.host === "hostA")
+      assert(mapStatus2(2).location.executorId === "exec-hostB")
+      assert(mapStatus2(2).location.host === "hostB")
+    }
+    }
 
 Review comment:
   Indentation seems to be corrupted at some point before this line.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] dongjoon-hyun commented on a change in pull request #26206: [SPARK-29551][CORE] Fix a bug about fetch failed when an executor is lost

Reply via email to