Github user vanzin commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21068#discussion_r184215254

    --- Diff: resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/FailureWithinTimeIntervalTracker.scala ---
    @@ -0,0 +1,79 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements. See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License. You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.spark.deploy.yarn
    +
    +import scala.collection.mutable
    +
    +import org.apache.spark.SparkConf
    +import org.apache.spark.internal.Logging
    +import org.apache.spark.util.{Clock, SystemClock}
    +
    +private[spark] class FailureWithinTimeIntervalTracker(sparkConf: SparkConf) extends Logging {
    +
    +  private var clock: Clock = new SystemClock
    +
    +  private val executorFailuresValidityInterval =
    +    sparkConf.get(config.EXECUTOR_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS).getOrElse(-1L)
    +
    +  // Queue to store the timestamp of failed executors for each host
    +  private val failedExecutorsTimeStampsPerHost = mutable.Map[String, mutable.Queue[Long]]()
    +
    +  private val failedExecutorsTimeStamps = new mutable.Queue[Long]()
    +
    +  private def getRecentFailureCount(failedExecutorsTimeStampsForHost: mutable.Queue[Long]): Int = {
    +    val endTime = clock.getTimeMillis()
    +    while (executorFailuresValidityInterval > 0
    +        && failedExecutorsTimeStampsForHost.nonEmpty
    +        && failedExecutorsTimeStampsForHost.head < endTime - executorFailuresValidityInterval) {
    +      failedExecutorsTimeStampsForHost.dequeue()
    +    }
    +    failedExecutorsTimeStampsForHost.size
    +  }
    +
    +  /**
    +   * Use a different clock. This is mainly used for testing.
    +   */
    +  def setClock(newClock: Clock): Unit = {
    +    clock = newClock
    +  }
    +
    +  def getNumExecutorsFailed: Int = synchronized {
    +    getRecentFailureCount(failedExecutorsTimeStamps)
    +  }
    +
    +  def registerFailureOnHost(hostname: String): Unit = synchronized {
    +    val timeMillis = clock.getTimeMillis()
    +    failedExecutorsTimeStamps.enqueue(timeMillis)
    +    val failedExecutorsOnHost =
    +      failedExecutorsTimeStampsPerHost.getOrElse(hostname, {
    +        val failureOnHost = mutable.Queue[Long]()
    +        failedExecutorsTimeStampsPerHost.put(hostname, failureOnHost)
    +        failureOnHost
    +      })
    +    failedExecutorsOnHost.enqueue(timeMillis)
    +  }
    +
    +  def registerExecutorFailure(): Unit = synchronized {
    +    val timeMillis = clock.getTimeMillis()
    +    failedExecutorsTimeStamps.enqueue(timeMillis)
    +  }
    +
    +  def getNumExecutorFailuresOnHost(hostname: String): Int =
    --- End diff --

    nit: add braces in multi-line methods. Also, I know the YARN module is generally this way, but this seems like a good chance to start to use the more usual Spark convention of not naming methods with "get" (and also take the chance to make the name shorter). e.g., `numFailuresOnHost` or some variation of that.
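
For illustration, a minimal sketch of how the flagged method might look with both suggestions applied: braces on the multi-line body, and the "get" prefix dropped in favor of the reviewer's example name `numFailuresOnHost`. Note the diff above cuts off before the method body, so the lookup logic below is an assumption, modeled on the other members of this class:

    def numFailuresOnHost(hostname: String): Int = synchronized {
      // Hypothetical body (the real one is cut off in the diff above):
      // look up this host's failure queue and count only the entries that
      // still fall within the configured validity interval.
      failedExecutorsTimeStampsPerHost.get(hostname)
        .map(getRecentFailureCount)
        .getOrElse(0)
    }

Since `getRecentFailureCount` dequeues expired timestamps as it counts, calling it here also keeps each per-host queue pruned to the validity interval without a separate cleanup pass.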