Repository: spark Updated Branches: refs/heads/master 328c71161 -> 34c0638ee
[SPARK-14180][CORE] Fix a deadlock in CoarseGrainedExecutorBackend Shutdown ## What changes were proposed in this pull request? Call `executor.stop` in a new thread to eliminate deadlock. ## How was this patch tested? Existing unit tests Author: Shixiong Zhu <[email protected]> Closes #12012 from zsxwing/SPARK-14180. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/34c0638e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/34c0638e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/34c0638e Branch: refs/heads/master Commit: 34c0638ee6f05aef81d90594dd9b8e06006c2c7f Parents: 328c711 Author: Shixiong Zhu <[email protected]> Authored: Mon Mar 28 16:23:29 2016 -0700 Committer: Andrew Or <[email protected]> Committed: Mon Mar 28 16:23:29 2016 -0700 ---------------------------------------------------------------------- .../spark/executor/CoarseGrainedExecutorBackend.scala | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/34c0638e/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala index 320a200..81e41e6 100644 --- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala @@ -113,9 +113,15 @@ private[spark] class CoarseGrainedExecutorBackend( case Shutdown => stopping.set(true) - executor.stop() - stop() - rpcEnv.shutdown() + new Thread("CoarseGrainedExecutorBackend-stop-executor") { + override def run(): Unit = { + // executor.stop() will call `SparkEnv.stop()` which waits 
until RpcEnv stops completely. + // However, if `executor.stop()` runs in some thread of RpcEnv, RpcEnv won't be able to + // stop until `executor.stop()` returns, which results in a deadlock (see SPARK-14180). + // Therefore, we put this line in a new thread. + executor.stop() + } + }.start() } override def onDisconnected(remoteAddress: RpcAddress): Unit = { --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
