Github user vanzin commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21068#discussion_r185109185

--- Diff: resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/FailureTracker.scala ---
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.deploy.yarn
+
+import scala.collection.mutable
+
+import org.apache.spark.SparkConf
+import org.apache.spark.internal.{config, Logging}
+import org.apache.spark.util.{Clock, SystemClock}
+
+/**
+ * FailureTracker is responsible for tracking executor failures both for each host separately
+ * and for all hosts altogether.
+ */
+private[spark] class FailureTracker(
+    sparkConf: SparkConf,
+    var clock: Clock = new SystemClock) extends Logging {
+
+  private val executorFailuresValidityInterval =
+    sparkConf.get(config.EXECUTOR_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS).getOrElse(-1L)
+
+  // Queue to store the timestamp of failed executors for each host
+  private val failedExecutorsTimeStampsPerHost = mutable.Map[String, mutable.Queue[Long]]()
+
+  private val failedExecutorsTimeStamps = new mutable.Queue[Long]()
+
+  private def recentFailureCount(failedExecutorsTimeStampsForHost: mutable.Queue[Long]): Int = {
+    val endTime = clock.getTimeMillis()
+    while (executorFailuresValidityInterval > 0 &&
+        failedExecutorsTimeStampsForHost.nonEmpty &&
+        failedExecutorsTimeStampsForHost.head < endTime - executorFailuresValidityInterval) {
+      failedExecutorsTimeStampsForHost.dequeue()
+    }
+    failedExecutorsTimeStampsForHost.size
+  }
+
+  /**
+   * Use a different clock. This is mainly used for testing.
+   */
+  def setClock(newClock: Clock): Unit = {
+    clock = newClock
+  }
+
+  def numExecutorsFailed: Int = synchronized {
--- End diff --

`numFailedExecutors`
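For reviewers skimming the pruning logic in `recentFailureCount` above: it implements a sliding-window counter over a queue of timestamps, evicting entries older than the validity interval before counting. The following is a minimal standalone sketch of that technique, not code from this PR; `SlidingWindowCounter`, `record`, and `count` are illustrative names introduced here for the example.

```scala
import scala.collection.mutable

// Minimal sliding-window failure counter, mirroring the pruning in
// recentFailureCount: record a timestamp per failure, and before counting,
// drop entries at the queue head that fall outside the validity window.
class SlidingWindowCounter(validityIntervalMs: Long) {
  private val timestamps = mutable.Queue[Long]()

  def record(nowMs: Long): Unit = timestamps.enqueue(nowMs)

  def count(nowMs: Long): Int = {
    // A non-positive interval disables expiry, matching the -1L default
    // used when the config is unset in the diff above.
    while (validityIntervalMs > 0 &&
        timestamps.nonEmpty &&
        timestamps.head < nowMs - validityIntervalMs) {
      timestamps.dequeue()
    }
    timestamps.size
  }
}

object SlidingWindowCounterDemo extends App {
  val counter = new SlidingWindowCounter(validityIntervalMs = 10000L)
  counter.record(nowMs = 1000L)
  counter.record(nowMs = 8000L)
  println(counter.count(nowMs = 9000L))  // 2: both failures within the 10s window
  println(counter.count(nowMs = 16000L)) // 1: the failure at t=1000 has expired
}
```

Because timestamps are enqueued in monotonically non-decreasing order, only the head of the queue ever needs checking, which is what makes the eviction loop in the PR cheap.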