agrawaldevesh commented on a change in pull request #29452: URL: https://github.com/apache/spark/pull/29452#discussion_r472756046
########## File path: core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala ########## @@ -1051,15 +1049,19 @@ private[spark] class TaskSetManager( logDebug("Task length threshold for speculation: " + threshold) for (tid <- runningTasksSet) { var speculated = checkAndSubmitSpeculatableTask(tid, time, threshold) - if (!speculated && tidToExecutorKillTimeMapping.contains(tid)) { - // Check whether this task will finish before the exectorKillTime assuming - // it will take medianDuration overall. If this task cannot finish within - // executorKillInterval, then this task is a candidate for speculation - val taskEndTimeBasedOnMedianDuration = taskInfos(tid).launchTime + medianDuration - val canExceedDeadline = tidToExecutorKillTimeMapping(tid) < - taskEndTimeBasedOnMedianDuration - if (canExceedDeadline) { - speculated = checkAndSubmitSpeculatableTask(tid, time, 0) + if (!speculated && executorDecommissionKillInterval.nonEmpty) { + val taskInfo = taskInfos(tid) + val decomState = sched.getExecutorDecommissionState(taskInfo.executorId) + if (decomState.nonEmpty) { + // Check whether this task will finish before the exectorKillTime assuming Review comment: Good point. reworked the comment. ########## File path: core/src/main/scala/org/apache/spark/scheduler/ExecutorDecommissionInfo.scala ########## @@ -18,11 +18,22 @@ package org.apache.spark.scheduler /** - * Provides more detail when an executor is being decommissioned. + * Message providing more detail when an executor is being decommissioned. * @param message Human readable reason for why the decommissioning is happening. * @param isHostDecommissioned Whether the host (aka the `node` or `worker` in other places) is * being decommissioned too. Used to infer if the shuffle data might * be lost even if the external shuffle service is enabled. 
*/ private[spark] case class ExecutorDecommissionInfo(message: String, isHostDecommissioned: Boolean) + +/** + * State related to decommissioning that is kept by the TaskSchedulerImpl. This state is derived + * from the info message above but it is kept distinct to allow the state to evolve independently + * from the message. + */ +case class ExecutorDecommissionState( + message: String, + // Timestamp in milliseconds when decommissioning was triggered + tsMillis: Long, Review comment: Sure, good idea. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org