[ https://issues.apache.org/jira/browse/HBASE-29319?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Viraj Jasani resolved HBASE-29319. ---------------------------------- Fix Version/s: 2.7.0 3.0.0-beta-2 2.6.3 2.5.12 Hadoop Flags: Reviewed Resolution: Fixed > Apply fail-fast retry limit for ConnectException > ------------------------------------------------ > > Key: HBASE-29319 > URL: https://issues.apache.org/jira/browse/HBASE-29319 > Project: HBase > Issue Type: Sub-task > Affects Versions: 2.5.11, 2.6.2 > Reporter: Viraj Jasani > Assignee: Viraj Jasani > Priority: Major > Labels: pull-request-available > Fix For: 2.7.0, 3.0.0-beta-2, 2.6.3, 2.5.12 > > > As part of HBASE-28638 and HBASE-29180, a fail-fast retry limit has been > introduced for errors like CallQueueTooBigException, SaslException, > ConnectionClosedException, UnknownHostException. This helps limit the number of > retries that RSProcedureDispatcher has to perform while executing remote > procedures. Since the region open/close fails on the remote server, we also > trigger SCP for the target server. > We recently came across ConnectException as well: > {code:java} > 2025-04-18 15:16:34,013 DEBUG [RSProcedureDispatcher-pool-50043] > procedure.RSProcedureDispatcher - Request to xyz.abc,60020,1744167387805 > failed, try=0 > java.net.ConnectException: Call to address=xyz.abc:60020 failed on connection > exception: > org.apache.hbase.thirdparty.io.netty.channel.ConnectTimeoutException: > connection timed out after 10000 ms: xyz.abc:60020 > at org.apache.hadoop.hbase.ipc.IPCUtil.wrapException(IPCUtil.java:204) > at > org.apache.hadoop.hbase.ipc.AbstractRpcClient.onCallFinished(AbstractRpcClient.java:391) > at > org.apache.hadoop.hbase.ipc.AbstractRpcClient.access$100(AbstractRpcClient.java:92) > at > org.apache.hadoop.hbase.ipc.AbstractRpcClient$3.run(AbstractRpcClient.java:425) > at > org.apache.hadoop.hbase.ipc.AbstractRpcClient$3.run(AbstractRpcClient.java:420) > at org.apache.hadoop.hbase.ipc.Call.callComplete(Call.java:114) > at org.apache.hadoop.hbase.ipc.Call.setException(Call.java:129) > at > 
org.apache.hadoop.hbase.ipc.BufferCallBeforeInitHandler.userEventTriggered(BufferCallBeforeInitHandler.java:107) > at > org.apache.hbase.thirdparty.io.netty.channel.AbstractChannelHandlerContext.invokeUserEventTriggered(AbstractChannelHandlerContext.java:398) > at > org.apache.hbase.thirdparty.io.netty.channel.AbstractChannelHandlerContext.invokeUserEventTriggered(AbstractChannelHandlerContext.java:376) > at > org.apache.hbase.thirdparty.io.netty.channel.AbstractChannelHandlerContext.fireUserEventTriggered(AbstractChannelHandlerContext.java:368) > at > org.apache.hbase.thirdparty.io.netty.channel.DefaultChannelPipeline$HeadContext.userEventTriggered(DefaultChannelPipeline.java:1425) > at > org.apache.hbase.thirdparty.io.netty.channel.AbstractChannelHandlerContext.invokeUserEventTriggered(AbstractChannelHandlerContext.java:396) > at > org.apache.hbase.thirdparty.io.netty.channel.AbstractChannelHandlerContext.invokeUserEventTriggered(AbstractChannelHandlerContext.java:376) > at > org.apache.hbase.thirdparty.io.netty.channel.DefaultChannelPipeline.fireUserEventTriggered(DefaultChannelPipeline.java:912) > at > org.apache.hadoop.hbase.ipc.NettyRpcConnection.failInit(NettyRpcConnection.java:207) > at > org.apache.hadoop.hbase.ipc.NettyRpcConnection.access$900(NettyRpcConnection.java:82) > at > org.apache.hadoop.hbase.ipc.NettyRpcConnection$2.fail(NettyRpcConnection.java:334) > at > org.apache.hadoop.hbase.ipc.NettyRpcConnection$2.operationComplete(NettyRpcConnection.java:342) > at > org.apache.hadoop.hbase.ipc.NettyRpcConnection$2.operationComplete(NettyRpcConnection.java:316) > at > org.apache.hbase.thirdparty.io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:590) > at > org.apache.hbase.thirdparty.io.netty.util.concurrent.DefaultPromise.notifyListeners0(DefaultPromise.java:583) > at > org.apache.hbase.thirdparty.io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:559) > at > 
org.apache.hbase.thirdparty.io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:492) > at > org.apache.hbase.thirdparty.io.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:636) > at > org.apache.hbase.thirdparty.io.netty.util.concurrent.DefaultPromise.setFailure0(DefaultPromise.java:629) > at > org.apache.hbase.thirdparty.io.netty.util.concurrent.DefaultPromise.tryFailure(DefaultPromise.java:118) > at > org.apache.hbase.thirdparty.io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe$2.run(AbstractEpollChannel.java:616) > at > org.apache.hbase.thirdparty.io.netty.util.concurrent.PromiseTask.runTask(PromiseTask.java:98) > at > org.apache.hbase.thirdparty.io.netty.util.concurrent.ScheduledFutureTask.run(ScheduledFutureTask.java:153) > at > org.apache.hbase.thirdparty.io.netty.util.concurrent.AbstractEventExecutor.runTask(AbstractEventExecutor.java:173) > at > org.apache.hbase.thirdparty.io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:166) > at > org.apache.hbase.thirdparty.io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:469) > at > org.apache.hbase.thirdparty.io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:405) > at > org.apache.hbase.thirdparty.io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:994) > at > org.apache.hbase.thirdparty.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) > at > org.apache.hbase.thirdparty.io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) > at java.lang.Thread.run(Thread.java:750) > Caused by: > org.apache.hbase.thirdparty.io.netty.channel.ConnectTimeoutException: > connection timed out after 10000 ms: xyz.abc:60020 > at > org.apache.hbase.thirdparty.io.netty.channel.epoll.AbstractEpollChannel$AbstractEpollUnsafe$2.run(AbstractEpollChannel.java:615) > ... 10 more {code} > and so on... 
-- This message was sent by Atlassian Jira (v8.20.10#820010)