Github user sryza commented on a diff in the pull request:
https://github.com/apache/spark/pull/4850#discussion_r25838009
--- Diff: core/src/main/scala/org/apache/spark/executor/Executor.scala ---
@@ -380,61 +387,70 @@ private[spark] class Executor(
}
}
- def startDriverHeartbeater() {
- val interval = conf.getInt("spark.executor.heartbeatInterval", 10000)
- val timeout = AkkaUtils.lookupTimeout(conf)
- val retryAttempts = AkkaUtils.numRetries(conf)
- val retryIntervalMs = AkkaUtils.retryWaitMs(conf)
- val heartbeatReceiverRef =
AkkaUtils.makeDriverRef("HeartbeatReceiver", conf, env.actorSystem)
+ private val timeout = AkkaUtils.lookupTimeout(conf)
+ private val retryAttempts = AkkaUtils.numRetries(conf)
+ private val retryIntervalMs = AkkaUtils.retryWaitMs(conf)
+ private val heartbeatReceiverRef =
+ AkkaUtils.makeDriverRef("HeartbeatReceiver", conf, env.actorSystem)
+
+ /** Reports heartbeat and metrics for active tasks to the driver. */
+ private def reportHeartBeat(): Unit = {
+ // list of (task id, metrics) to send back to the driver
+ val tasksMetrics = new ArrayBuffer[(Long, TaskMetrics)]()
+ val curGCTime = computeTotalGcTime()
+
+ for (taskRunner <- runningTasks.values()) {
+ if (taskRunner.task != null) {
+ taskRunner.task.metrics.foreach { metrics =>
+ metrics.updateShuffleReadMetrics()
+ metrics.updateInputMetrics()
+ metrics.setJvmGCTime(curGCTime - taskRunner.startGCTime)
+
+ if (isLocal) {
+ // JobProgressListener will hold an reference of it during
+ // onExecutorMetricsUpdate(), then JobProgressListener can not
see
+ // the changes of metrics any more, so make a deep copy of it
+ val copiedMetrics =
Utils.deserialize[TaskMetrics](Utils.serialize(metrics))
+ tasksMetrics += ((taskRunner.taskId, copiedMetrics))
+ } else {
+ // It will be copied by serialization
+ tasksMetrics += ((taskRunner.taskId, metrics))
+ }
+ }
+ }
+ }
- val t = new Thread() {
+ val message = Heartbeat(executorId, tasksMetrics.toArray,
env.blockManager.blockManagerId)
+ try {
+ val response = AkkaUtils.askWithReply[HeartbeatResponse](message,
heartbeatReceiverRef,
+ retryAttempts, retryIntervalMs, timeout)
+ if (response.reregisterBlockManager) {
+ logWarning("Told to re-register on heartbeat")
+ env.blockManager.reregister()
+ }
+ } catch {
+ case NonFatal(e) => logWarning("Issue communicating with driver in
heartbeater", e)
+ }
+ }
+
+ /**
+ * Starts a thread to report heartbeat and partial metrics for active
tasks to driver.
+ * This thread stops running when the executor is stopped.
+ */
+ private def startDriverHeartbeater() {
+ val interval = conf.getInt("spark.executor.heartbeatInterval", 10000)
+ val thread = new Thread() {
override def run() {
// Sleep a random interval so the heartbeats don't end up in sync
Thread.sleep(interval + (math.random * interval).asInstanceOf[Int])
-
while (!isStopped) {
- val tasksMetrics = new ArrayBuffer[(Long, TaskMetrics)]()
- val curGCTime = gcTime
-
- for (taskRunner <- runningTasks.values()) {
- if (taskRunner.attemptedTask.nonEmpty) {
- Option(taskRunner.task).flatMap(_.metrics).foreach { metrics
=>
- metrics.updateShuffleReadMetrics()
- metrics.updateInputMetrics()
- metrics.setJvmGCTime(curGCTime - taskRunner.startGCTime)
-
- if (isLocal) {
- // JobProgressListener will hold an reference of it
during
- // onExecutorMetricsUpdate(), then JobProgressListener
can not see
- // the changes of metrics any more, so make a deep copy
of it
- val copiedMetrics =
Utils.deserialize[TaskMetrics](Utils.serialize(metrics))
- tasksMetrics += ((taskRunner.taskId, copiedMetrics))
- } else {
- // It will be copied by serialization
- tasksMetrics += ((taskRunner.taskId, metrics))
- }
- }
- }
- }
-
- val message = Heartbeat(executorId, tasksMetrics.toArray,
env.blockManager.blockManagerId)
- try {
- val response =
AkkaUtils.askWithReply[HeartbeatResponse](message, heartbeatReceiverRef,
- retryAttempts, retryIntervalMs, timeout)
- if (response.reregisterBlockManager) {
- logWarning("Told to re-register on heartbeat")
- env.blockManager.reregister()
- }
- } catch {
- case NonFatal(t) => logWarning("Issue communicating with
driver in heartbeater", t)
- }
-
+ reportHeartBeat()
Thread.sleep(interval)
}
}
}
- t.setDaemon(true)
- t.setName("Driver Heartbeater")
- t.start()
+ thread.setDaemon(true)
+ thread.setName("driver-heartbeat")
--- End diff --
Feel free to disregard this, but naming the thread "driver-heartbeat" as opposed to
"driver-heartbeater" makes me think that this thread is there for a single heartbeat.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]