This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 847b65eac370 [SPARK-46456][CORE] Add `spark.ui.jettyStopTimeout` to set Jetty server stop timeout to unblock SparkContext shutdown 847b65eac370 is described below commit 847b65eac370fc8ef98c617a2934b2fa0fcee250 Author: Kent Yao <y...@apache.org> AuthorDate: Wed Dec 20 15:04:45 2023 -0800 [SPARK-46456][CORE] Add `spark.ui.jettyStopTimeout` to set Jetty server stop timeout to unblock SparkContext shutdown ### What changes were proposed in this pull request? The `_stopTimeout` sets a graceful stop time for each ContainerLifeCycle. This pull request aims to address the issue of shutdown hooks being interrupted during the shutdown process. By setting the `_stopTimeout` to 5 seconds, we can reduce the risk that modules such as MapOutputTracker and BlockManager in the SparkContext are not properly stopped, which leaves resources uncleaned. - https://github.com/jetty/jetty.project/blob/1f34ece62b918a006231258474f5fa370c49df29/jetty-util/src/main/java/org/eclipse/jetty/util/component/AbstractLifeCycle.java#L53 ``` private long _stopTimeout = 30000; ``` This pull request reduces the value to 5 seconds, taking into account the value from the [QueuedThreadPool](https://git.eclipse.org/c/jetty/org.eclipse.jetty.project.git/tree/jetty-util/src/main/java/org/eclipse/jetty/util/thread/QueuedThreadPool.java#n96) [Note: the diff below defines `spark.ui.jettyStopTimeout` with a default of `30s`, so a 5-second timeout has to be configured explicitly.] ### Why are the changes needed? In Jetty, the ContainerLifeCycle implementation manages a collection of contained beans. For managed beans, it stops them one by one and waits up to a specified time (30s) for each to stop. A single bean that is slow to stop can therefore cause the shutdown hook to time out. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? This can be reproduced easily in local-cluster mode with a proxied SparkUI. 
#### Before ``` 23/12/19 17:07:40 DEBUG QueuedThreadPool: Waiting for Thread[MasterUI-81,5,main] for 14999 23/12/19 17:07:55 DEBUG QueuedThreadPool: Waiting for Thread[MasterUI-81,5,main] for 14999 ``` ``` 23/12/19 17:08:09 WARN ShutdownHookManager: ShutdownHook '' timeout, java.util.concurrent.TimeoutException java.util.concurrent.TimeoutException at java.base/java.util.concurrent.FutureTask.get(FutureTask.java:204) at org.apache.hadoop.util.ShutdownHookManager.executeShutdown(ShutdownHookManager.java:124) at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:95) 23/12/19 17:08:09 ERROR Utils: Uncaught exception in thread shutdown-hook-0 java.lang.InterruptedException at java.base/java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:1679) at java.base/java.util.concurrent.ThreadPoolExecutor.awaitTermination(ThreadPoolExecutor.java:1464) at org.apache.spark.rpc.netty.MessageLoop.stop(MessageLoop.scala:60) at org.apache.spark.rpc.netty.Dispatcher.stop(Dispatcher.scala:205) at org.apache.spark.rpc.netty.NettyRpcEnv.cleanup(NettyRpcEnv.scala:333) at org.apache.spark.rpc.netty.NettyRpcEnv.shutdown(NettyRpcEnv.scala:311) at org.apache.spark.deploy.LocalSparkCluster.$anonfun$stop$4(LocalSparkCluster.scala:97) at org.apache.spark.deploy.LocalSparkCluster.$anonfun$stop$4$adapted(LocalSparkCluster.scala:97) at scala.collection.IterableOnceOps.foreach(IterableOnce.scala:576) at scala.collection.IterableOnceOps.foreach$(IterableOnce.scala:574) at scala.collection.AbstractIterable.foreach(Iterable.scala:933) at org.apache.spark.deploy.LocalSparkCluster.stop(LocalSparkCluster.scala:97) at org.apache.spark.SparkContext$.$anonfun$createTaskScheduler$2(SparkContext.scala:3233) at org.apache.spark.SparkContext$.$anonfun$createTaskScheduler$2$adapted(SparkContext.scala:3232) at 
org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.org$apache$spark$scheduler$cluster$StandaloneSchedulerBackend$$stop(StandaloneSchedulerBackend.scala:280) at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.stop(StandaloneSchedulerBackend.scala:143) at org.apache.spark.scheduler.SchedulerBackend.stop(SchedulerBackend.scala:34) at org.apache.spark.scheduler.SchedulerBackend.stop$(SchedulerBackend.scala:34) at org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend.stop(CoarseGrainedSchedulerBackend.scala:55) at org.apache.spark.scheduler.TaskSchedulerImpl.$anonfun$stop$2(TaskSchedulerImpl.scala:992) at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1288) at org.apache.spark.scheduler.TaskSchedulerImpl.stop(TaskSchedulerImpl.scala:992) at org.apache.spark.scheduler.DAGScheduler.$anonfun$stop$4(DAGScheduler.scala:3005) at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1288) at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:3005) at org.apache.spark.SparkContext.$anonfun$stop$12(SparkContext.scala:2293) at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1288) at org.apache.spark.SparkContext.stop(SparkContext.scala:2293) at org.apache.spark.sql.hive.thriftserver.SparkSQLEnv$.stop(SparkSQLEnv.scala:88) at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver$.$anonfun$main$2(SparkSQLCLIDriver.scala:151) at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:214) at org.apache.spark.util.SparkShutdownHookManager.$anonfun$runAll$2(ShutdownHookManager.scala:188) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18) at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1842) at org.apache.spark.util.SparkShutdownHookManager.$anonfun$runAll$1(ShutdownHookManager.scala:188) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18) at scala.util.Try$.apply(Try.scala:210) at 
org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:188) at org.apache.spark.util.SparkShutdownHookManager$$anon$2.run(ShutdownHookManager.scala:178) at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:539) at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635) at java.base/java.lang.Thread.run(Thread.java:840) ``` #### After ``` 23/12/19 17:38:04 DEBUG QueuedThreadPool: Waiting for Thread[MasterUI-78,5,main] for 2499 23/12/19 17:38:06 DEBUG QueuedThreadPool: Waiting for Thread[MasterUI-78,5,main] for 2499 ``` ``` 23/12/19 17:38:09 DEBUG QueuedThreadPool: Waiting for Thread[MasterUI-81,5,main] for -3 23/12/19 17:38:09 WARN QueuedThreadPool: Couldn't stop Thread[MasterUI-78,5,main] at java.base17.0.9/sun.nio.ch.KQueue.poll(Native Method) at java.base17.0.9/sun.nio.ch.KQueueSelectorImpl.doSelect(KQueueSelectorImpl.java:122) at java.base17.0.9/sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:129) at java.base17.0.9/sun.nio.ch.SelectorImpl.select(SelectorImpl.java:146) at app//org.sparkproject.jetty.io.ManagedSelector.nioSelect(ManagedSelector.java:183) at app//org.sparkproject.jetty.io.ManagedSelector.select(ManagedSelector.java:190) at app//org.sparkproject.jetty.io.ManagedSelector$SelectorProducer.select(ManagedSelector.java:606) at app//org.sparkproject.jetty.io.ManagedSelector$SelectorProducer.produce(ManagedSelector.java:543) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.produceTask(EatWhatYouKill.java:362) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:186) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173) at 
app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.produce(EatWhatYouKill.java:137) at app//org.sparkproject.jetty.io.ManagedSelector$$Lambda$775/0x000000c801527460.run(Unknown Source) at app//org.sparkproject.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:883) at app//org.sparkproject.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1034) at java.base17.0.9/java.lang.Thread.run(Thread.java:840) 23/12/19 17:38:09 WARN QueuedThreadPool: Couldn't stop Thread[MasterUI-79,5,main] at java.base17.0.9/sun.nio.ch.KQueue.poll(Native Method) at java.base17.0.9/sun.nio.ch.KQueueSelectorImpl.doSelect(KQueueSelectorImpl.java:122) at java.base17.0.9/sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:129) at java.base17.0.9/sun.nio.ch.SelectorImpl.select(SelectorImpl.java:146) at app//org.sparkproject.jetty.io.ManagedSelector.nioSelect(ManagedSelector.java:183) at app//org.sparkproject.jetty.io.ManagedSelector.select(ManagedSelector.java:190) at app//org.sparkproject.jetty.io.ManagedSelector$SelectorProducer.select(ManagedSelector.java:606) at app//org.sparkproject.jetty.io.ManagedSelector$SelectorProducer.produce(ManagedSelector.java:543) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.produceTask(EatWhatYouKill.java:362) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:186) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.produce(EatWhatYouKill.java:137) at app//org.sparkproject.jetty.io.ManagedSelector$$Lambda$775/0x000000c801527460.run(Unknown Source) at app//org.sparkproject.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:883) at app//org.sparkproject.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1034) at java.base17.0.9/java.lang.Thread.run(Thread.java:840) 23/12/19 17:38:09 WARN QueuedThreadPool: 
Couldn't stop Thread[MasterUI-77,5,main] at java.base17.0.9/sun.nio.ch.KQueue.poll(Native Method) at java.base17.0.9/sun.nio.ch.KQueueSelectorImpl.doSelect(KQueueSelectorImpl.java:122) at java.base17.0.9/sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:129) at java.base17.0.9/sun.nio.ch.SelectorImpl.select(SelectorImpl.java:146) at app//org.sparkproject.jetty.io.ManagedSelector.nioSelect(ManagedSelector.java:183) at app//org.sparkproject.jetty.io.ManagedSelector.select(ManagedSelector.java:190) at app//org.sparkproject.jetty.io.ManagedSelector$SelectorProducer.select(ManagedSelector.java:606) at app//org.sparkproject.jetty.io.ManagedSelector$SelectorProducer.produce(ManagedSelector.java:543) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.produceTask(EatWhatYouKill.java:362) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:186) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.produce(EatWhatYouKill.java:137) at app//org.sparkproject.jetty.io.ManagedSelector$$Lambda$775/0x000000c801527460.run(Unknown Source) at app//org.sparkproject.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:883) at app//org.sparkproject.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1034) at java.base17.0.9/java.lang.Thread.run(Thread.java:840) 23/12/19 17:38:09 WARN QueuedThreadPool: Couldn't stop Thread[MasterUI-82,5,main] at java.base17.0.9/sun.nio.ch.KQueue.poll(Native Method) at java.base17.0.9/sun.nio.ch.KQueueSelectorImpl.doSelect(KQueueSelectorImpl.java:122) at java.base17.0.9/sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:129) at java.base17.0.9/sun.nio.ch.SelectorImpl.select(SelectorImpl.java:146) at app//org.sparkproject.jetty.io.ManagedSelector.nioSelect(ManagedSelector.java:183) at 
app//org.sparkproject.jetty.io.ManagedSelector.select(ManagedSelector.java:190) at app//org.sparkproject.jetty.io.ManagedSelector$SelectorProducer.select(ManagedSelector.java:606) at app//org.sparkproject.jetty.io.ManagedSelector$SelectorProducer.produce(ManagedSelector.java:543) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.produceTask(EatWhatYouKill.java:362) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:186) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.produce(EatWhatYouKill.java:137) at app//org.sparkproject.jetty.io.ManagedSelector$$Lambda$775/0x000000c801527460.run(Unknown Source) at app//org.sparkproject.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:883) at app//org.sparkproject.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1034) at java.base17.0.9/java.lang.Thread.run(Thread.java:840) 23/12/19 17:38:09 WARN QueuedThreadPool: Couldn't stop Thread[MasterUI-80,5,main] at java.base17.0.9/sun.nio.ch.KQueue.poll(Native Method) at java.base17.0.9/sun.nio.ch.KQueueSelectorImpl.doSelect(KQueueSelectorImpl.java:122) at java.base17.0.9/sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:129) at java.base17.0.9/sun.nio.ch.SelectorImpl.select(SelectorImpl.java:146) at app//org.sparkproject.jetty.io.ManagedSelector.nioSelect(ManagedSelector.java:183) at app//org.sparkproject.jetty.io.ManagedSelector.select(ManagedSelector.java:190) at app//org.sparkproject.jetty.io.ManagedSelector$SelectorProducer.select(ManagedSelector.java:606) at app//org.sparkproject.jetty.io.ManagedSelector$SelectorProducer.produce(ManagedSelector.java:543) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.produceTask(EatWhatYouKill.java:362) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:186) 
at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.produce(EatWhatYouKill.java:137) at app//org.sparkproject.jetty.io.ManagedSelector$$Lambda$775/0x000000c801527460.run(Unknown Source) at app//org.sparkproject.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:883) at app//org.sparkproject.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1034) at java.base17.0.9/java.lang.Thread.run(Thread.java:840) 23/12/19 17:38:09 WARN QueuedThreadPool: Couldn't stop Thread[MasterUI-81,5,main] at java.base17.0.9/sun.nio.ch.KQueue.poll(Native Method) at java.base17.0.9/sun.nio.ch.KQueueSelectorImpl.doSelect(KQueueSelectorImpl.java:122) at java.base17.0.9/sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:129) at java.base17.0.9/sun.nio.ch.SelectorImpl.select(SelectorImpl.java:146) at app//org.sparkproject.jetty.io.ManagedSelector.nioSelect(ManagedSelector.java:183) at app//org.sparkproject.jetty.io.ManagedSelector.select(ManagedSelector.java:190) at app//org.sparkproject.jetty.io.ManagedSelector$SelectorProducer.select(ManagedSelector.java:606) at app//org.sparkproject.jetty.io.ManagedSelector$SelectorProducer.produce(ManagedSelector.java:543) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.produceTask(EatWhatYouKill.java:362) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:186) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173) at app//org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.produce(EatWhatYouKill.java:137) at app//org.sparkproject.jetty.io.ManagedSelector$$Lambda$775/0x000000c801527460.run(Unknown Source) at app//org.sparkproject.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:883) at 
app//org.sparkproject.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1034) at java.base17.0.9/java.lang.Thread.run(Thread.java:840) 23/12/19 17:38:09 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped! 23/12/19 17:38:09 INFO MemoryStore: MemoryStore cleared 23/12/19 17:38:09 INFO BlockManager: BlockManager stopped 23/12/19 17:38:09 INFO BlockManagerMaster: BlockManagerMaster stopped 23/12/19 17:38:09 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped! 23/12/19 17:38:09 INFO SparkContext: Successfully stopped SparkContext 23/12/19 17:38:09 INFO ShutdownHookManager: Shutdown hook called 23/12/19 17:38:09 INFO ShutdownHookManager: Deleting directory /private/var/folders/84/dgr9ykwn6yndcmq1kjxqvk200000gn/T/spark-8eabc592-87f7-4a3c-8884-594076b25df1 23/12/19 17:38:09 INFO ShutdownHookManager: Deleting directory /private/var/folders/84/dgr9ykwn6yndcmq1kjxqvk200000gn/T/spark-04ca9e0a-819f-41bb-b67a-80356c4dcdd7 ``` ### Was this patch authored or co-authored using generative AI tooling? no Closes #44413 from yaooqinn/SPARK-46456. 
Authored-by: Kent Yao <y...@apache.org> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- core/src/main/scala/org/apache/spark/internal/config/UI.scala | 7 +++++++ core/src/main/scala/org/apache/spark/ui/JettyUtils.scala | 3 +++ 2 files changed, 10 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/internal/config/UI.scala b/core/src/main/scala/org/apache/spark/internal/config/UI.scala index f983308667e3..320808d5018c 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/UI.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/UI.scala @@ -247,4 +247,11 @@ private[spark] object UI { .version("3.4.0") .booleanConf .createWithDefault(true) + + val UI_JETTY_STOP_TIMEOUT = ConfigBuilder("spark.ui.jettyStopTimeout") + .internal() + .doc("Timeout for Jetty servers started in UIs, such as SparkUI, HistoryUI, etc, to stop.") + .version("4.0.0") + .timeConf(TimeUnit.MILLISECONDS) + .createWithDefaultString("30s") } diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala index 50251975d733..849ee14c0afb 100644 --- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala @@ -246,6 +246,7 @@ private[spark] object JettyUtils extends Logging { serverName: String = "", poolSize: Int = 200): ServerInfo = { + val stopTimeout = conf.get(UI_JETTY_STOP_TIMEOUT) logInfo(s"Start Jetty $hostName:$port for $serverName") // Start the server first, with no connectors. 
val pool = new QueuedThreadPool(poolSize) @@ -276,6 +277,7 @@ private[spark] object JettyUtils extends Logging { val serverExecutor = new ScheduledExecutorScheduler(s"$serverName-JettyScheduler", true) try { + server.setStopTimeout(stopTimeout) server.start() // As each acceptor and each selector will use one thread, the number of threads should at @@ -298,6 +300,7 @@ private[spark] object JettyUtils extends Logging { connector.setReuseAddress(!Utils.isWindows) // spark-45248: set the idle timeout to prevent slow DoS connector.setIdleTimeout(8000) + connector.setStopTimeout(stopTimeout) // Currently we only use "SelectChannelConnector" // Limit the max acceptor number to 8 so that we don't waste a lot of threads --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org