[ https://issues.apache.org/jira/browse/RATIS-2323?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
gaoyajun02 updated RATIS-2323: ------------------------------ Description: In the Celeborn master cluster, the Leader's clientAddress and adminAddress are cached in Followers and used as the RPC endpoint for handling client requests. When a Follower receives a client request, since only the Leader can process client requests, the Follower returns the Leader's RPC endpoint to the client, allowing the client to resend the request directly to the Leader. When expanding the master cluster, we currently use the ratis-shell's add operation. However, peers added through this operation lack clientAddress and adminAddress settings. If a newly added peer becomes the Leader, all Followers will return an empty address, causing clients to access an incorrect Leader address (127.0.0.1). {code:java} 25/08/26 15:47:45,534 INFO [main] MasterClient: connect to master zw06-data-k8s-sparktest-node007.mt:9097. 25/08/26 15:47:45,669 WARN [celeborn-netty-rpc-connection-executor-1] TransportClientFactory: Retry create client, times 1/3 with error: Failed to connect to /127.0.0.1:0 org.apache.celeborn.common.exception.CelebornIOException: Failed to connect to /127.0.0.1:0 at org.apache.celeborn.common.network.client.TransportClientFactory.internalCreateClient(TransportClientFactory.java:317) at org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:252) at org.apache.celeborn.common.network.client.TransportClientFactory.retryCreateClient(TransportClientFactory.java:159) at org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:147) at org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:259) at org.apache.celeborn.common.rpc.netty.NettyRpcEnv.createClient(NettyRpcEnv.scala:234) at org.apache.celeborn.common.rpc.netty.Outbox$$anon$1.call(Outbox.scala:194) at org.apache.celeborn.common.rpc.netty.Outbox$$anon$1.call(Outbox.scala:190) at 
java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) Caused by: io.netty.channel.AbstractChannel$AnnotatedConnectException: finishConnect(..) failed: Connection refused: /127.0.0.1:0 Caused by: java.net.ConnectException: finishConnect(..) failed: Connection refused at io.netty.channel.unix.Errors.newConnectException0(Errors.java:166) at io.netty.channel.unix.Errors.handleConnectErrno(Errors.java:131) at io.netty.channel.unix.Socket.finishConnect(Socket.java:359) at io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.doFinishConnect(AbstractEpollChannel.java:715) at io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.finishConnect(AbstractEpollChannel.java:692) at io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.epollOutReady(AbstractEpollChannel.java:567) at io.netty.channel.epoll.EpollEventLoop.processReady(EpollEventLoop.java:491) at io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:399) at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998) at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) at java.lang.Thread.run(Thread.java:748) 25/08/26 15:47:50,674 WARN [celeborn-netty-rpc-connection-executor-1] TransportClientFactory: Retry create client, times 2/3 with error: Failed to connect to /127.0.0.1:0 org.apache.celeborn.common.exception.CelebornIOException: Failed to connect to /127.0.0.1:0 at org.apache.celeborn.common.network.client.TransportClientFactory.internalCreateClient(TransportClientFactory.java:317) at org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:252) at 
org.apache.celeborn.common.network.client.TransportClientFactory.retryCreateClient(TransportClientFactory.java:159) at org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:147) at org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:259) at org.apache.celeborn.common.rpc.netty.NettyRpcEnv.createClient(NettyRpcEnv.scala:234) at org.apache.celeborn.common.rpc.netty.Outbox$$anon$1.call(Outbox.scala:194) at org.apache.celeborn.common.rpc.netty.Outbox$$anon$1.call(Outbox.scala:190) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) Caused by: io.netty.channel.AbstractChannel$AnnotatedConnectException: finishConnect(..) failed: Connection refused: /127.0.0.1:0 Caused by: java.net.ConnectException: finishConnect(..) 
failed: Connection refused at io.netty.channel.unix.Errors.newConnectException0(Errors.java:166) at io.netty.channel.unix.Errors.handleConnectErrno(Errors.java:131) at io.netty.channel.unix.Socket.finishConnect(Socket.java:359) at io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.doFinishConnect(AbstractEpollChannel.java:715) at io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.finishConnect(AbstractEpollChannel.java:692) at io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.epollOutReady(AbstractEpollChannel.java:567) at io.netty.channel.epoll.EpollEventLoop.processReady(EpollEventLoop.java:491) at io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:399) at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998) at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) at java.lang.Thread.run(Thread.java:748) {code} Therefore, we propose extending the ratis-shell add command to support setting clientAddress and adminAddress parameters when adding new peers to the cluster. was: In the Celeborn master cluster, the Leader's clientAddress and adminAddress are cached in Followers and used as the RPC endpoint for handling client requests. When a Follower receives a client request, since only the Leader can process client requests, the Follower returns the Leader's RPC endpoint to the client, allowing the client to resend the request directly to the Leader. When expanding the master cluster, we currently use the ratis-shell's add operation. However, peers added through this operation lack clientAddress and adminAddress settings. If a newly added peer becomes the Leader, all Followers will return an empty address, causing clients to access an incorrect Leader address (127.0.0.1). 
Therefore, we propose extending the ratis-shell add command to support setting clientAddress and adminAddress parameters when adding new peers to the cluster.

> Extend ratis-shell add command
> ------------------------------
>
> Key: RATIS-2323
> URL: https://issues.apache.org/jira/browse/RATIS-2323
> Project: Ratis
> Issue Type: Improvement
> Components: shell
> Reporter: gaoyajun02
> Priority: Major
> Time Spent: 10m
> Remaining Estimate: 0h
>
> In the Celeborn master cluster, the Leader's clientAddress and adminAddress
> are cached in Followers and used as the RPC endpoint for handling client
> requests. When a Follower receives a client request, since only the Leader
> can process client requests, the Follower returns the Leader's RPC endpoint
> to the client, allowing the client to resend the request directly to the
> Leader.
> When expanding the master cluster, we currently use the ratis-shell's add
> operation. However, peers added through this operation lack clientAddress and
> adminAddress settings. If a newly added peer becomes the Leader, all
> Followers will return an empty address, causing clients to access an
> incorrect Leader address (127.0.0.1).
> {code:java}
> 25/08/26 15:47:45,534 INFO [main] MasterClient: connect to master
> zw06-data-k8s-sparktest-node007.mt:9097.
> 25/08/26 15:47:45,669 WARN [celeborn-netty-rpc-connection-executor-1] > TransportClientFactory: Retry create client, times 1/3 with error: Failed to > connect to /127.0.0.1:0 > org.apache.celeborn.common.exception.CelebornIOException: Failed to connect > to /127.0.0.1:0 > at > org.apache.celeborn.common.network.client.TransportClientFactory.internalCreateClient(TransportClientFactory.java:317) > at > org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:252) > at > org.apache.celeborn.common.network.client.TransportClientFactory.retryCreateClient(TransportClientFactory.java:159) > at > org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:147) > at > org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:259) > at > org.apache.celeborn.common.rpc.netty.NettyRpcEnv.createClient(NettyRpcEnv.scala:234) > at > org.apache.celeborn.common.rpc.netty.Outbox$$anon$1.call(Outbox.scala:194) > at > org.apache.celeborn.common.rpc.netty.Outbox$$anon$1.call(Outbox.scala:190) > at java.util.concurrent.FutureTask.run(FutureTask.java:266) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748) > Caused by: io.netty.channel.AbstractChannel$AnnotatedConnectException: > finishConnect(..) failed: Connection refused: /127.0.0.1:0 > Caused by: java.net.ConnectException: finishConnect(..) 
failed: Connection > refused > at io.netty.channel.unix.Errors.newConnectException0(Errors.java:166) > at io.netty.channel.unix.Errors.handleConnectErrno(Errors.java:131) > at io.netty.channel.unix.Socket.finishConnect(Socket.java:359) > at > io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.doFinishConnect(AbstractEpollChannel.java:715) > at > io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.finishConnect(AbstractEpollChannel.java:692) > at > io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.epollOutReady(AbstractEpollChannel.java:567) > at > io.netty.channel.epoll.EpollEventLoop.processReady(EpollEventLoop.java:491) > at io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:399) > at > io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998) > at > io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) > at > io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) > at java.lang.Thread.run(Thread.java:748) > 25/08/26 15:47:50,674 WARN [celeborn-netty-rpc-connection-executor-1] > TransportClientFactory: Retry create client, times 2/3 with error: Failed to > connect to /127.0.0.1:0 > org.apache.celeborn.common.exception.CelebornIOException: Failed to connect > to /127.0.0.1:0 > at > org.apache.celeborn.common.network.client.TransportClientFactory.internalCreateClient(TransportClientFactory.java:317) > at > org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:252) > at > org.apache.celeborn.common.network.client.TransportClientFactory.retryCreateClient(TransportClientFactory.java:159) > at > org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:147) > at > org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:259) > at > 
org.apache.celeborn.common.rpc.netty.NettyRpcEnv.createClient(NettyRpcEnv.scala:234) > at > org.apache.celeborn.common.rpc.netty.Outbox$$anon$1.call(Outbox.scala:194) > at > org.apache.celeborn.common.rpc.netty.Outbox$$anon$1.call(Outbox.scala:190) > at java.util.concurrent.FutureTask.run(FutureTask.java:266) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748) > Caused by: io.netty.channel.AbstractChannel$AnnotatedConnectException: > finishConnect(..) failed: Connection refused: /127.0.0.1:0 > Caused by: java.net.ConnectException: finishConnect(..) failed: Connection > refused > at io.netty.channel.unix.Errors.newConnectException0(Errors.java:166) > at io.netty.channel.unix.Errors.handleConnectErrno(Errors.java:131) > at io.netty.channel.unix.Socket.finishConnect(Socket.java:359) > at > io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.doFinishConnect(AbstractEpollChannel.java:715) > at > io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.finishConnect(AbstractEpollChannel.java:692) > at > io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.epollOutReady(AbstractEpollChannel.java:567) > at > io.netty.channel.epoll.EpollEventLoop.processReady(EpollEventLoop.java:491) > at io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:399) > at > io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998) > at > io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) > at > io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) > at java.lang.Thread.run(Thread.java:748) > {code} > Therefore, we propose extending the ratis-shell add command to support > setting clientAddress and adminAddress parameters when adding new peers to > the cluster. 
-- This message was sent by Atlassian Jira (v8.20.10#820010)