[ 
https://issues.apache.org/jira/browse/RATIS-2323?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

gaoyajun02 updated RATIS-2323:
------------------------------
    Description: 
In the Celeborn master cluster, the Leader's clientAddress and adminAddress are 
cached in Followers and used as the RPC endpoint for handling client requests. 
When a Follower receives a client request, since only the Leader can process 
client requests, the Follower returns the Leader's RPC endpoint to the client, 
allowing the client to resend the request directly to the Leader.

When expanding the master cluster, we currently use the ratis-shell add 
operation. However, peers added through this operation lack clientAddress and 
adminAddress settings. If a newly added peer becomes the Leader, all Followers 
will return an empty Leader address, causing clients to try to connect to an 
invalid address (127.0.0.1:0), as shown in the log below.
{code:java}
25/08/26 15:47:45,534 INFO [main] MasterClient: connect to master 
zw06-data-k8s-sparktest-node007.mt:9097.
25/08/26 15:47:45,669 WARN [celeborn-netty-rpc-connection-executor-1] 
TransportClientFactory: Retry create client, times 1/3 with error: Failed to 
connect to /127.0.0.1:0
org.apache.celeborn.common.exception.CelebornIOException: Failed to connect to 
/127.0.0.1:0
        at 
org.apache.celeborn.common.network.client.TransportClientFactory.internalCreateClient(TransportClientFactory.java:317)
        at 
org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:252)
        at 
org.apache.celeborn.common.network.client.TransportClientFactory.retryCreateClient(TransportClientFactory.java:159)
        at 
org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:147)
        at 
org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:259)
        at 
org.apache.celeborn.common.rpc.netty.NettyRpcEnv.createClient(NettyRpcEnv.scala:234)
        at 
org.apache.celeborn.common.rpc.netty.Outbox$$anon$1.call(Outbox.scala:194)
        at 
org.apache.celeborn.common.rpc.netty.Outbox$$anon$1.call(Outbox.scala:190)
        at java.util.concurrent.FutureTask.run(FutureTask.java:266)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)
Caused by: io.netty.channel.AbstractChannel$AnnotatedConnectException: 
finishConnect(..) failed: Connection refused: /127.0.0.1:0
Caused by: java.net.ConnectException: finishConnect(..) failed: Connection 
refused
        at io.netty.channel.unix.Errors.newConnectException0(Errors.java:166)
        at io.netty.channel.unix.Errors.handleConnectErrno(Errors.java:131)
        at io.netty.channel.unix.Socket.finishConnect(Socket.java:359)
        at 
io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.doFinishConnect(AbstractEpollChannel.java:715)
        at 
io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.finishConnect(AbstractEpollChannel.java:692)
        at 
io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.epollOutReady(AbstractEpollChannel.java:567)
        at 
io.netty.channel.epoll.EpollEventLoop.processReady(EpollEventLoop.java:491)
        at io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:399)
        at 
io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998)
        at 
io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
        at 
io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
        at java.lang.Thread.run(Thread.java:748)
25/08/26 15:47:50,674 WARN [celeborn-netty-rpc-connection-executor-1] 
TransportClientFactory: Retry create client, times 2/3 with error: Failed to 
connect to /127.0.0.1:0
org.apache.celeborn.common.exception.CelebornIOException: Failed to connect to 
/127.0.0.1:0
        at 
org.apache.celeborn.common.network.client.TransportClientFactory.internalCreateClient(TransportClientFactory.java:317)
        at 
org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:252)
        at 
org.apache.celeborn.common.network.client.TransportClientFactory.retryCreateClient(TransportClientFactory.java:159)
        at 
org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:147)
        at 
org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:259)
        at 
org.apache.celeborn.common.rpc.netty.NettyRpcEnv.createClient(NettyRpcEnv.scala:234)
        at 
org.apache.celeborn.common.rpc.netty.Outbox$$anon$1.call(Outbox.scala:194)
        at 
org.apache.celeborn.common.rpc.netty.Outbox$$anon$1.call(Outbox.scala:190)
        at java.util.concurrent.FutureTask.run(FutureTask.java:266)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)
Caused by: io.netty.channel.AbstractChannel$AnnotatedConnectException: 
finishConnect(..) failed: Connection refused: /127.0.0.1:0
Caused by: java.net.ConnectException: finishConnect(..) failed: Connection 
refused
        at io.netty.channel.unix.Errors.newConnectException0(Errors.java:166)
        at io.netty.channel.unix.Errors.handleConnectErrno(Errors.java:131)
        at io.netty.channel.unix.Socket.finishConnect(Socket.java:359)
        at 
io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.doFinishConnect(AbstractEpollChannel.java:715)
        at 
io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.finishConnect(AbstractEpollChannel.java:692)
        at 
io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.epollOutReady(AbstractEpollChannel.java:567)
        at 
io.netty.channel.epoll.EpollEventLoop.processReady(EpollEventLoop.java:491)
        at io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:399)
        at 
io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998)
        at 
io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
        at 
io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
        at java.lang.Thread.run(Thread.java:748)

{code}
Therefore, we propose extending the ratis-shell add command to support setting 
clientAddress and adminAddress parameters when adding new peers to the cluster.

  was:
In the Celeborn master cluster, the Leader's clientAddress and adminAddress are 
cached in Followers and used as the RPC endpoint for handling client requests. 
When a Follower receives a client request, since only the Leader can process 
client requests, the Follower returns the Leader's RPC endpoint to the client, 
allowing the client to resend the request directly to the Leader.

When expanding the master cluster, we currently use the ratis-shell's add 
operation. However, peers added through this operation lack clientAddress and 
adminAddress settings. If a newly added peer becomes the Leader, all Followers 
will return an empty address, causing clients to access an incorrect Leader 
address (127.0.0.1).

Therefore, we propose extending the ratis-shell add command to support setting 
clientAddress and adminAddress parameters when adding new peers to the cluster.


> Extend ratis-shell add command
> ------------------------------
>
>                 Key: RATIS-2323
>                 URL: https://issues.apache.org/jira/browse/RATIS-2323
>             Project: Ratis
>          Issue Type: Improvement
>          Components: shell
>            Reporter: gaoyajun02
>            Priority: Major
>          Time Spent: 10m
>  Remaining Estimate: 0h
>
> In the Celeborn master cluster, the Leader's clientAddress and adminAddress 
> are cached in Followers and used as the RPC endpoint for handling client 
> requests. When a Follower receives a client request, since only the Leader 
> can process client requests, the Follower returns the Leader's RPC endpoint 
> to the client, allowing the client to resend the request directly to the 
> Leader.
> When expanding the master cluster, we currently use the ratis-shell add 
> operation. However, peers added through this operation lack clientAddress and 
> adminAddress settings. If a newly added peer becomes the Leader, all 
> Followers will return an empty Leader address, causing clients to try to 
> connect to an invalid address (127.0.0.1:0), as shown in the log below.
> {code:java}
> 25/08/26 15:47:45,534 INFO [main] MasterClient: connect to master 
> zw06-data-k8s-sparktest-node007.mt:9097.
> 25/08/26 15:47:45,669 WARN [celeborn-netty-rpc-connection-executor-1] 
> TransportClientFactory: Retry create client, times 1/3 with error: Failed to 
> connect to /127.0.0.1:0
> org.apache.celeborn.common.exception.CelebornIOException: Failed to connect 
> to /127.0.0.1:0
>         at 
> org.apache.celeborn.common.network.client.TransportClientFactory.internalCreateClient(TransportClientFactory.java:317)
>         at 
> org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:252)
>         at 
> org.apache.celeborn.common.network.client.TransportClientFactory.retryCreateClient(TransportClientFactory.java:159)
>         at 
> org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:147)
>         at 
> org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:259)
>         at 
> org.apache.celeborn.common.rpc.netty.NettyRpcEnv.createClient(NettyRpcEnv.scala:234)
>         at 
> org.apache.celeborn.common.rpc.netty.Outbox$$anon$1.call(Outbox.scala:194)
>         at 
> org.apache.celeborn.common.rpc.netty.Outbox$$anon$1.call(Outbox.scala:190)
>         at java.util.concurrent.FutureTask.run(FutureTask.java:266)
>         at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>         at java.lang.Thread.run(Thread.java:748)
> Caused by: io.netty.channel.AbstractChannel$AnnotatedConnectException: 
> finishConnect(..) failed: Connection refused: /127.0.0.1:0
> Caused by: java.net.ConnectException: finishConnect(..) failed: Connection 
> refused
>         at io.netty.channel.unix.Errors.newConnectException0(Errors.java:166)
>         at io.netty.channel.unix.Errors.handleConnectErrno(Errors.java:131)
>         at io.netty.channel.unix.Socket.finishConnect(Socket.java:359)
>         at 
> io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.doFinishConnect(AbstractEpollChannel.java:715)
>         at 
> io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.finishConnect(AbstractEpollChannel.java:692)
>         at 
> io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.epollOutReady(AbstractEpollChannel.java:567)
>         at 
> io.netty.channel.epoll.EpollEventLoop.processReady(EpollEventLoop.java:491)
>         at io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:399)
>         at 
> io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998)
>         at 
> io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
>         at 
> io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
>         at java.lang.Thread.run(Thread.java:748)
> 25/08/26 15:47:50,674 WARN [celeborn-netty-rpc-connection-executor-1] 
> TransportClientFactory: Retry create client, times 2/3 with error: Failed to 
> connect to /127.0.0.1:0
> org.apache.celeborn.common.exception.CelebornIOException: Failed to connect 
> to /127.0.0.1:0
>         at 
> org.apache.celeborn.common.network.client.TransportClientFactory.internalCreateClient(TransportClientFactory.java:317)
>         at 
> org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:252)
>         at 
> org.apache.celeborn.common.network.client.TransportClientFactory.retryCreateClient(TransportClientFactory.java:159)
>         at 
> org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:147)
>         at 
> org.apache.celeborn.common.network.client.TransportClientFactory.createClient(TransportClientFactory.java:259)
>         at 
> org.apache.celeborn.common.rpc.netty.NettyRpcEnv.createClient(NettyRpcEnv.scala:234)
>         at 
> org.apache.celeborn.common.rpc.netty.Outbox$$anon$1.call(Outbox.scala:194)
>         at 
> org.apache.celeborn.common.rpc.netty.Outbox$$anon$1.call(Outbox.scala:190)
>         at java.util.concurrent.FutureTask.run(FutureTask.java:266)
>         at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>         at java.lang.Thread.run(Thread.java:748)
> Caused by: io.netty.channel.AbstractChannel$AnnotatedConnectException: 
> finishConnect(..) failed: Connection refused: /127.0.0.1:0
> Caused by: java.net.ConnectException: finishConnect(..) failed: Connection 
> refused
>         at io.netty.channel.unix.Errors.newConnectException0(Errors.java:166)
>         at io.netty.channel.unix.Errors.handleConnectErrno(Errors.java:131)
>         at io.netty.channel.unix.Socket.finishConnect(Socket.java:359)
>         at 
> io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.doFinishConnect(AbstractEpollChannel.java:715)
>         at 
> io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.finishConnect(AbstractEpollChannel.java:692)
>         at 
> io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe.epollOutReady(AbstractEpollChannel.java:567)
>         at 
> io.netty.channel.epoll.EpollEventLoop.processReady(EpollEventLoop.java:491)
>         at io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:399)
>         at 
> io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998)
>         at 
> io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
>         at 
> io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
>         at java.lang.Thread.run(Thread.java:748)
> {code}
> Therefore, we propose extending the ratis-shell add command to support 
> setting clientAddress and adminAddress parameters when adding new peers to 
> the cluster.



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to