pgandhi999 commented on a change in pull request #23677:
[SPARK-26755][SCHEDULER] : Optimize Spark Scheduler to dequeue speculative
tasks…
URL: https://github.com/apache/spark/pull/23677#discussion_r301182676
##########
File path:
core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
##########
@@ -1655,4 +1657,81 @@ class TaskSetManagerSuite extends SparkFunSuite with
LocalSparkContext with Logg
// get removed inside TaskSchedulerImpl later.
assert(availableResources(GPU) sameElements Array("0", "1", "2", "3"))
}
+
+ test("SPARK-26755 Ensure that a speculative task is submitted only once for
execution") {
+ sc = new SparkContext("local", "test")
+ sched = new FakeTaskScheduler(sc, ("exec1", "host1"), ("exec2", "host2"))
+ val taskSet = FakeTask.createTaskSet(4)
+ // Set the speculation multiplier to be 0 so speculative tasks are
launched immediately
+ sc.conf.set(config.SPECULATION_MULTIPLIER, 0.0)
+ sc.conf.set(config.SPECULATION_ENABLED, true)
+ sc.conf.set(config.SPECULATION_QUANTILE, 0.5)
+ val clock = new ManualClock()
+ val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock
= clock)
+ val accumUpdatesByTask: Array[Seq[AccumulatorV2[_, _]]] =
taskSet.tasks.map { task =>
+ task.metrics.internalAccums
+ }
+ // Offer resources for 4 tasks to start
+ for ((k, v) <- List(
+ "exec1" -> "host1",
+ "exec1" -> "host1",
+ "exec2" -> "host2",
+ "exec2" -> "host2")) {
+ val taskOption = manager.resourceOffer(k, v, NO_PREF)
+ assert(taskOption.isDefined)
+ val task = taskOption.get
+ assert(task.executorId === k)
+ }
+ assert(sched.startedTasks.toSet === Set(0, 1, 2, 3))
+ clock.advance(1)
+ // Complete the first 2 tasks and leave the other 2 tasks in running
+ for (id <- Set(0, 1)) {
+ manager.handleSuccessfulTask(id, createTaskResult(id,
accumUpdatesByTask(id)))
+ assert(sched.endedTasks(id) === Success)
+ }
+ // checkSpeculatableTasks checks that the task runtime is greater than the
threshold for
+ // speculating. Since we use a threshold of 0 for speculation, tasks need
to be running for
+ // > 0ms, so advance the clock by 1ms here.
+ clock.advance(1)
+ assert(manager.checkSpeculatableTasks(0))
+ assert(sched.speculativeTasks.toSet === Set(2, 3))
+ assert(manager.pendingSpeculatableTasks.forExecutor.size === 0)
+ assert(manager.pendingSpeculatableTasks.forHost.size === 0)
+ assert(manager.pendingSpeculatableTasks.forRack.size === 0)
+ assert(manager.pendingSpeculatableTasks.anyPrefs.size === 2)
+ assert(manager.pendingSpeculatableTasks.noPrefs.size === 2)
+
+ // Offer resource to start the speculative attempt for the running task
+ val taskOption5 = manager.resourceOffer("exec1", "host1", NO_PREF)
+ val taskOption6 = manager.resourceOffer("exec1", "host1", NO_PREF)
+ assert(taskOption5.isDefined)
+ val task5 = taskOption5.get
+ assert(task5.index === 2)
+ assert(task5.taskId === 4)
+ assert(task5.executorId === "exec1")
+ assert(task5.attemptNumber === 1)
+ assert(taskOption6.isDefined)
+ val task6 = taskOption6.get
+ assert(task6.index === 3)
+ assert(task6.taskId === 5)
+ assert(task6.executorId === "exec1")
+ assert(task6.attemptNumber === 1)
+ sched.initialize(new FakeSchedulerBackend() {
+ override def killTask(
+ taskId: Long,
+ executorId: String,
+ interruptThread: Boolean,
+ reason: String): Unit = {}
+ })
+ clock.advance(1)
+ // Running checkSpeculatableTasks again should return false
+ assert(!manager.checkSpeculatableTasks(0))
+ assert(manager.pendingSpeculatableTasks.forExecutor.size === 0)
+ assert(manager.pendingSpeculatableTasks.forHost.size === 0)
+ assert(manager.pendingSpeculatableTasks.forRack.size === 0)
+ // allPendingSpeculativeTasks will still have two pending tasks but
+ // pendingSpeculatableTasksWithNoPrefs should have none
+ assert(manager.pendingSpeculatableTasks.anyPrefs.size === 2)
+ assert(manager.pendingSpeculatableTasks.noPrefs.size === 0)
Review comment:
@squito Your comment makes sense, and I can implement it in the test. The
only part I am stuck on is figuring out how to verify that we are not
resubmitting speculative tasks without checking the HashMap or its size.
I would appreciate your guidance on this matter. Thank you.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]