Github user felixcheung commented on a diff in the pull request: https://github.com/apache/spark/pull/21241#discussion_r186252598 --- Diff: resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala --- @@ -320,50 +322,83 @@ private[spark] class KubernetesClusterSchedulerBackend( override def eventReceived(action: Action, pod: Pod): Unit = { val podName = pod.getMetadata.getName val podIP = pod.getStatus.getPodIP - + val podPhase = pod.getStatus.getPhase action match { - case Action.MODIFIED if (pod.getStatus.getPhase == "Running" + case Action.MODIFIED if (podPhase == "Running" && pod.getMetadata.getDeletionTimestamp == null) => val clusterNodeName = pod.getSpec.getNodeName logInfo(s"Executor pod $podName ready, launched at $clusterNodeName as IP $podIP.") executorPodsByIPs.put(podIP, pod) - case Action.DELETED | Action.ERROR => + case Action.MODIFIED if (podPhase == "Init:Error" || podPhase == "Init:CrashLoopBackoff") + && pod.getMetadata.getDeletionTimestamp == null => val executorId = getExecutorId(pod) - logDebug(s"Executor pod $podName at IP $podIP was at $action.") - if (podIP != null) { - executorPodsByIPs.remove(podIP) + failedInitExecutors.add(executorId) + if (failedInitExecutors.size >= executorMaxInitErrors) { + val errorMessage = s"Aborting Spark application because $executorMaxInitErrors" + + s" executors failed to start. The maximum number of allowed startup failures is" + --- End diff -- nit: no need for s"
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org