This is an automated email from the ASF dual-hosted git repository.
nicholasjiang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/celeborn.git
The following commit(s) were added to refs/heads/main by this push:
new 1d3558bd1 [CELEBORN-1385] HttpServer support idle timeout
configuration of Jetty
1d3558bd1 is described below
commit 1d3558bd143b4f690c8f615e3bae6a4c539f607d
Author: SteNicholas <[email protected]>
AuthorDate: Sun Apr 14 12:40:57 2024 +0800
[CELEBORN-1385] HttpServer support idle timeout configuration of Jetty
### What changes were proposed in this pull request?
Introduce `celeborn.master.http.idleTimeout` and
`celeborn.worker.http.idleTimeout` to support idle timeout configuration of
Jetty for `HttpServer`.
### Why are the changes needed?
`ServerConnector` supports HTTP idle timeout configuration via
`jetty.http.idleTimeout`, whose default value is 30000ms (i.e.
`jetty.http.idleTimeout=30000`). `HttpServer` should also support configuring
the Jetty idle timeout. The idle timeout currently shows up in the logs as follows:
```
2024-04-12 16:04:00,926 [DEBUG] [master-JettyScheduler-1] -
org.eclipse.jetty.io.IdleTimeout -IdleTimeout.java(161)
-SocketChannelEndPoint567d3f82{l=/127.0.0.1:9097,r=/127.0.0.1:35276,OPEN,fill=FI,flush=-,to=29999/30000}{io=1/1,kio=1,kro=1}->HttpConnection2f88da0c[p=HttpParser{s=START,0
of
-1},g=HttpGenerator796c3666{s=START}]=>HttpChannelOverHttp63815646{s=HttpChannelState5c192497{s=IDLE
rs=BLOCKING os=OPEN is=IDLE awp=false se=false i=true
al=0},r=5,c=false/false,a=IDLE,uri=null,age [...]
2024-04-12 16:04:00,927 [DEBUG] [master-JettyScheduler-1] -
org.eclipse.jetty.io.IdleTimeout -IdleTimeout.java(161)
-SocketChannelEndPoint567d3f82{l=/127.0.0.1:9097,r=/127.0.0.1:35276,OPEN,fill=FI,flush=-,to=30001/30000}{io=1/1,kio=1,kro=1}->HttpConnection2f88da0c[p=HttpParser{s=START,0
of
-1},g=HttpGenerator796c3666{s=START}]=>HttpChannelOverHttp63815646{s=HttpChannelState5c192497{s=IDLE
rs=BLOCKING os=OPEN is=IDLE awp=false se=false i=true
al=0},r=5,c=false/false,a=IDLE,uri=null,age [...]
2024-04-12 16:04:00,927 [DEBUG] [master-JettyScheduler-1] -
org.eclipse.jetty.io.IdleTimeout -IdleTimeout.java(168)
-SocketChannelEndPoint567d3f82{l=/127.0.0.1:9097,r=/127.0.0.1:35276,OPEN,fill=FI,flush=-,to=30001/30000}{io=1/1,kio=1,kro=1}->HttpConnection2f88da0c[p=HttpParser{s=START,0
of
-1},g=HttpGenerator796c3666{s=START}]=>HttpChannelOverHttp63815646{s=HttpChannelState5c192497{s=IDLE
rs=BLOCKING os=OPEN is=IDLE awp=false se=false i=true
al=0},r=5,c=false/false,a=IDLE,uri=null,age [...]
2024-04-12 16:04:00,927 [DEBUG] [master-JettyScheduler-1] -
org.eclipse.jetty.io.FillInterest -FillInterest.java(136) -onFail
FillInterest6cc48840{AC.ReadCB2f88da0c{HttpConnection2f88da0c::SocketChannelEndPoint567d3f82{l=/127.0.0.1:9097,r=/127.0.0.1:35276,OPEN,fill=FI,flush=-,to=30001/30000}{io=1/1,kio=1,kro=1}->HttpConnection2f88da0c[p=HttpParser{s=START,0
of
-1},g=HttpGenerator796c3666{s=START}]=>HttpChannelOverHttp63815646{s=HttpChannelState5c192497{s=IDLE
rs=BLOCKING os=OPEN is=ID [...]
java.util.concurrent.TimeoutException: Idle timeout expired: 30001/30000 ms
at
org.eclipse.jetty.io.IdleTimeout.checkIdleTimeout(IdleTimeout.java:171)
~[jetty-io-9.4.52.v20230823.jar:9.4.52.v20230823]
at org.eclipse.jetty.io.IdleTimeout.idleCheck(IdleTimeout.java:113)
~[jetty-io-9.4.52.v20230823.jar:9.4.52.v20230823]
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
~[?:1.8.0_162]
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
~[?:1.8.0_162]
at
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
~[?:1.8.0_162]
at
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
~[?:1.8.0_162]
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
~[?:1.8.0_162]
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
~[?:1.8.0_162]
at java.lang.Thread.run(Thread.java:748) ~[?:1.8.0_162]
2024-04-12 16:04:00,927 [DEBUG] [master-JettyScheduler-1] -
org.eclipse.jetty.http.HttpParser -HttpParser.java(1883) -close
HttpParser{s=START,0 of -1}
2024-04-12 16:04:00,927 [DEBUG] [master-JettyScheduler-1] -
org.eclipse.jetty.http.HttpParser -HttpParser.java(1912) -START --> CLOSE
```
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
No.
Closes #2455 from SteNicholas/CELEBORN-1385.
Authored-by: SteNicholas <[email protected]>
Signed-off-by: SteNicholas <[email protected]>
---
.../org/apache/celeborn/common/CelebornConf.scala | 19 +++++++++++++++++++
docs/configuration/master.md | 1 +
docs/configuration/worker.md | 1 +
.../apache/celeborn/server/common/HttpService.scala | 12 +++++++++++-
.../celeborn/server/common/http/HttpServer.scala | 9 ++++++++-
5 files changed, 40 insertions(+), 2 deletions(-)
diff --git
a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala
b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala
index 6ab67fb89..d6acd09a4 100644
--- a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala
+++ b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala
@@ -590,6 +590,8 @@ class CelebornConf(loadDefaults: Boolean) extends Cloneable
with Logging with Se
def masterHttpStopTimeout: Long = get(MASTER_HTTP_STOP_TIMEOUT)
+ def masterHttpIdleTimeout: Long = get(MASTER_HTTP_IDLE_TIMEOUT)
+
def haEnabled: Boolean = get(HA_ENABLED)
def haMasterNodeId: Option[String] = get(HA_MASTER_NODE_ID)
@@ -682,6 +684,7 @@ class CelebornConf(loadDefaults: Boolean) extends Cloneable
with Logging with Se
def workerHttpPort: Int = get(WORKER_HTTP_PORT)
def workerHttpMaxWorkerThreads: Int = get(WORKER_HTTP_MAX_WORKER_THREADS)
def workerHttpStopTimeout: Long = get(WORKER_HTTP_STOP_TIMEOUT)
+ def workerHttpIdleTimeout: Long = get(WORKER_HTTP_IDLE_TIMEOUT)
def workerRpcPort: Int = get(WORKER_RPC_PORT)
def workerPushPort: Int = get(WORKER_PUSH_PORT)
def workerFetchPort: Int = get(WORKER_FETCH_PORT)
@@ -2001,6 +2004,14 @@ object CelebornConf extends Logging {
.timeConf(TimeUnit.MILLISECONDS)
.createWithDefaultString("5s")
+ val MASTER_HTTP_IDLE_TIMEOUT: ConfigEntry[Long] =
+ buildConf("celeborn.master.http.idleTimeout")
+ .categories("master")
+ .version("0.5.0")
+ .doc("Master http server idle timeout.")
+ .timeConf(TimeUnit.MILLISECONDS)
+ .createWithDefaultString("30s")
+
val HA_ENABLED: ConfigEntry[Boolean] =
buildConf("celeborn.master.ha.enabled")
.withAlternative("celeborn.ha.enabled")
@@ -2576,6 +2587,14 @@ object CelebornConf extends Logging {
.timeConf(TimeUnit.MILLISECONDS)
.createWithDefaultString("5s")
+ val WORKER_HTTP_IDLE_TIMEOUT: ConfigEntry[Long] =
+ buildConf("celeborn.worker.http.idleTimeout")
+ .categories("worker")
+ .version("0.5.0")
+ .doc("Worker http server idle timeout.")
+ .timeConf(TimeUnit.MILLISECONDS)
+ .createWithDefaultString("30s")
+
val WORKER_RPC_PORT: ConfigEntry[Int] =
buildConf("celeborn.worker.rpc.port")
.categories("worker")
diff --git a/docs/configuration/master.md b/docs/configuration/master.md
index 582f3f8b3..97aae4333 100644
--- a/docs/configuration/master.md
+++ b/docs/configuration/master.md
@@ -44,6 +44,7 @@ license: |
| celeborn.master.heartbeat.worker.timeout | 120s | false | Worker heartbeat
timeout. | 0.3.0 | celeborn.worker.heartbeat.timeout |
| celeborn.master.host | <localhost> | false | Hostname for master to
bind. | 0.2.0 | |
| celeborn.master.http.host | <localhost> | false | Master's http host.
| 0.4.0 |
celeborn.metrics.master.prometheus.host,celeborn.master.metrics.prometheus.host
|
+| celeborn.master.http.idleTimeout | 30s | false | Master http server idle
timeout. | 0.5.0 | |
| celeborn.master.http.maxWorkerThreads | 200 | false | Maximum number of
threads in the master http worker thread pool. | 0.5.0 | |
| celeborn.master.http.port | 9098 | false | Master's http port. | 0.4.0 |
celeborn.metrics.master.prometheus.port,celeborn.master.metrics.prometheus.port
|
| celeborn.master.http.stopTimeout | 5s | false | Master http server stop
timeout. | 0.5.0 | |
diff --git a/docs/configuration/worker.md b/docs/configuration/worker.md
index 63bd69b7a..93cbae431 100644
--- a/docs/configuration/worker.md
+++ b/docs/configuration/worker.md
@@ -85,6 +85,7 @@ license: |
| celeborn.worker.graceful.shutdown.saveCommittedFileInfo.sync | false | false
| Whether to call sync method to save committed file infos into Level DB to
handle OS crash. | 0.3.1 | |
| celeborn.worker.graceful.shutdown.timeout | 600s | false | The worker's
graceful shutdown timeout time. | 0.2.0 | |
| celeborn.worker.http.host | <localhost> | false | Worker's http host.
| 0.4.0 |
celeborn.metrics.worker.prometheus.host,celeborn.worker.metrics.prometheus.host
|
+| celeborn.worker.http.idleTimeout | 30s | false | Worker http server idle
timeout. | 0.5.0 | |
| celeborn.worker.http.maxWorkerThreads | 200 | false | Maximum number of
threads in the worker http worker thread pool. | 0.5.0 | |
| celeborn.worker.http.port | 9096 | false | Worker's http port. | 0.4.0 |
celeborn.metrics.worker.prometheus.port,celeborn.worker.metrics.prometheus.port
|
| celeborn.worker.http.stopTimeout | 5s | false | Worker http server stop
timeout. | 0.5.0 | |
diff --git
a/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala
b/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala
index 540b3eadd..f11d03092 100644
--- a/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala
+++ b/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala
@@ -181,7 +181,8 @@ abstract class HttpService extends Service with Logging {
httpHost(),
httpPort(),
httpMaxWorkerThreads(),
- httpStopTimeout())
+ httpStopTimeout(),
+ httpIdleTimeout())
httpServer.start()
startInternal()
// block until the HTTP server is started, otherwise, we may get
@@ -228,6 +229,15 @@ abstract class HttpService extends Service with Logging {
}
}
+ private def httpIdleTimeout(): Long = {
+ serviceName match {
+ case Service.MASTER =>
+ conf.masterHttpIdleTimeout
+ case Service.WORKER =>
+ conf.workerHttpIdleTimeout
+ }
+ }
+
def connectionUrl: String = {
httpServer.getServerUri
}
diff --git
a/service/src/main/scala/org/apache/celeborn/server/common/http/HttpServer.scala
b/service/src/main/scala/org/apache/celeborn/server/common/http/HttpServer.scala
index 8592d696c..8dc4e5136 100644
---
a/service/src/main/scala/org/apache/celeborn/server/common/http/HttpServer.scala
+++
b/service/src/main/scala/org/apache/celeborn/server/common/http/HttpServer.scala
@@ -107,7 +107,13 @@ private[celeborn] case class HttpServer(
object HttpServer {
- def apply(role: String, host: String, port: Int, poolSize: Int, stopTimeout:
Long): HttpServer = {
+ def apply(
+ role: String,
+ host: String,
+ port: Int,
+ poolSize: Int,
+ stopTimeout: Long,
+ idleTimeout: Long): HttpServer = {
val pool = new QueuedThreadPool(math.max(poolSize, 8))
pool.setName(s"$role-JettyThreadPool")
pool.setDaemon(true)
@@ -137,6 +143,7 @@ object HttpServer {
connector.setReuseAddress(!SystemUtils.IS_OS_WINDOWS)
connector.setAcceptQueueSize(math.min(connector.getAcceptors, 8))
connector.setStopTimeout(stopTimeout)
+ connector.setIdleTimeout(idleTimeout)
new HttpServer(role, server, connector, collection)
}