This is an automated email from the ASF dual-hosted git repository.

gavinchou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new a94648c7621 [fix](rpc) robust retry and exception handling for 
MetaService RPC (#53670)
a94648c7621 is described below

commit a94648c762150c0836d23f3ffabb9e2eb076aa44
Author: hui lai <[email protected]>
AuthorDate: Tue Jul 22 22:34:02 2025 +0800

    [fix](rpc) robust retry and exception handling for MetaService RPC (#53670)
    
    - Ensure correct retry behavior for gRPC StatusRuntimeException:
    - Only retry on UNAVAILABLE, UNKNOWN, or DEADLINE_EXCEEDED (the last one
    only while the attempt count is within meta_service_rpc_timeout_retry_times).
    - Throw RpcException after all retries are exhausted or on non-retryable
    errors.
    - Move resource cleanup (client.shutdown) to finally block for better
    reliability.
    - Correctly handle InterruptedException rather than ignoring it.
    - Improve log messages for clarity and fix a typo ("servive" ->
    "service").
    - Remove unreachable return null, always throw RpcException if all
    retries fail.
---
 .../apache/doris/cloud/rpc/MetaServiceProxy.java   | 46 ++++++++++++----------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/cloud/rpc/MetaServiceProxy.java 
b/fe/fe-core/src/main/java/org/apache/doris/cloud/rpc/MetaServiceProxy.java
index 95753821c35..dc6dc11d518 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/cloud/rpc/MetaServiceProxy.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/rpc/MetaServiceProxy.java
@@ -22,7 +22,6 @@ import org.apache.doris.common.Config;
 import org.apache.doris.rpc.RpcException;
 
 import com.google.common.collect.Maps;
-import io.grpc.Status;
 import io.grpc.StatusRuntimeException;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
@@ -176,46 +175,51 @@ public class MetaServiceProxy {
         }
 
         public <Response> Response executeRequest(Function<MetaServiceClient, 
Response> function) throws RpcException {
-            int tried = 0;
-            while (tried++ < Config.meta_service_rpc_retry_cnt) {
+            long maxRetries = Config.meta_service_rpc_retry_cnt;
+            for (long tried = 1; tried <= maxRetries; tried++) {
                 MetaServiceClient client = null;
                 try {
                     client = proxy.getProxy();
                     return function.apply(client);
                 } catch (StatusRuntimeException sre) {
-                    LOG.info("failed to request meta servive code {}, msg {}, 
trycnt {}", sre.getStatus().getCode(),
+                    LOG.warn("failed to request meta service code {}, msg {}, 
trycnt {}", sre.getStatus().getCode(),
                             sre.getMessage(), tried);
-                    if ((tried > Config.meta_service_rpc_retry_cnt
-                                || (sre.getStatus().getCode() != 
Status.Code.UNAVAILABLE
-                                    && sre.getStatus().getCode() != 
Status.Code.UNKNOWN))
-                            && (tried > 
Config.meta_service_rpc_timeout_retry_times
-                                || sre.getStatus().getCode() != 
Status.Code.DEADLINE_EXCEEDED)) {
+                    boolean shouldRetry = false;
+                    switch (sre.getStatus().getCode()) {
+                        case UNAVAILABLE:
+                        case UNKNOWN:
+                            shouldRetry = true;
+                            break;
+                        case DEADLINE_EXCEEDED:
+                            shouldRetry = tried <= 
Config.meta_service_rpc_timeout_retry_times;
+                            break;
+                        default:
+                            shouldRetry = false;
+                    }
+                    if (!shouldRetry || tried >= maxRetries) {
                         throw new RpcException("", sre.getMessage(), sre);
                     }
                 } catch (Exception e) {
-                    LOG.info("failed to request meta servive trycnt {}", 
tried, e);
-                    if (tried > Config.meta_service_rpc_retry_cnt) {
+                    LOG.warn("failed to request meta servive trycnt {}", 
tried, e);
+                    if (tried >= maxRetries) {
                         throw new RpcException("", e.getMessage(), e);
                     }
-                } catch (Throwable t) {
-                    LOG.info("failed to request meta servive trycnt {}", 
tried, t);
-                    if (tried > Config.meta_service_rpc_retry_cnt) {
-                        throw new RpcException("", t.getMessage());
+                } finally {
+                    if (proxy.needReconn() && client != null) {
+                        client.shutdown(true);
                     }
                 }
 
-                if (proxy.needReconn() && client != null) {
-                    client.shutdown(true);
-                }
-
                 int delay = 20 + random.nextInt(200 - 20 + 1);
                 try {
                     Thread.sleep(delay);
                 } catch (InterruptedException interruptedException) {
-                    // ignore
+                    Thread.currentThread().interrupt();
+                    throw new RpcException("", 
interruptedException.getMessage(), interruptedException);
                 }
             }
-            return null; // impossible and unreachable, just make the compiler 
happy
+            // impossible and unreachable, just make the compiler happy
+            throw new RpcException("", "All retries exhausted", null);
         }
     }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to