This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new ce4815bc76e branch-4.0: [fix](cloud) Fixed be restart queries not retried #59566 (#59618)
ce4815bc76e is described below

commit ce4815bc76e86ac1d9f2f86e0c8e6bb058a67cee
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Thu Jan 8 09:57:25 2026 +0800

    branch-4.0: [fix](cloud) Fixed be restart queries not retried #59566 (#59618)
    
    Cherry-picked from #59566
    
    Co-authored-by: deardeng <[email protected]>
---
 .../java/org/apache/doris/qe/StmtExecutor.java     |  11 +-
 .../org/apache/doris/system/SystemInfoService.java |   5 +-
 .../query_retry/test_retry_be_restart.groovy       | 114 +++++++++++++++++++++
 3 files changed, 125 insertions(+), 5 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java
index 5c587f6988e..b9c5c2d72f5 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java
@@ -543,6 +543,12 @@ public class StmtExecutor {
                 if (context.getMinidump() != null && context.getMinidump().toString(4) != null) {
                     MinidumpUtils.saveMinidumpString(context.getMinidump(), DebugUtil.printId(context.queryId()));
                 }
+                // COMPUTE_GROUPS_NO_ALIVE_BE, planner can't get alive be, need retry
+                if (Config.isCloudMode() && SystemInfoService.needRetryWithReplan(e.getMessage())) {
+                    LOG.debug("planner failed with cloud compute group error, need retry. {}",
+                            context.getQueryIdentifier(), e);
+                    throw new UserException(e.getMessage());
+                }
                 LOG.warn("Analyze failed. {}", context.getQueryIdentifier(), 
e);
                 context.getState().setError(e.getMessage());
                 return;
@@ -924,10 +930,9 @@ public class StmtExecutor {
                 LOG.warn("retry due to exception {}. retried {} times. is rpc 
error: {}, is user error: {}.",
                         e.getMessage(), i, e instanceof RpcException, e 
instanceof UserException);
 
-                boolean isNeedRetry = false;
+                boolean isNeedRetry = e instanceof RpcException;
                 if (Config.isCloudMode()) {
                     // cloud mode retry
-                    isNeedRetry = false;
                     // errCode = 2, detailMessage = No backend available as scan node,
                     // please check the status of your backends. [10003: not alive]
                     List<String> bes = Env.getCurrentSystemInfo().getAllBackendIds().stream()
@@ -957,8 +962,6 @@ public class StmtExecutor {
                             }
                         }
                     }
-                } else {
-                    isNeedRetry = e instanceof RpcException;
                 }
                 if (i != retryTime - 1 && isNeedRetry
                         && context.getConnectType().equals(ConnectType.MYSQL) && !context.getMysqlChannel().isSend()) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/SystemInfoService.java b/fe/fe-core/src/main/java/org/apache/doris/system/SystemInfoService.java
index b9c63f813b1..59264ee6bb6 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/system/SystemInfoService.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/system/SystemInfoService.java
@@ -22,6 +22,7 @@ import org.apache.doris.analysis.ModifyBackendHostNameClause;
 import org.apache.doris.catalog.DiskInfo;
 import org.apache.doris.catalog.Env;
 import org.apache.doris.catalog.ReplicaAllocation;
+import org.apache.doris.cloud.qe.ComputeGroupException;
 import org.apache.doris.common.AnalysisException;
 import org.apache.doris.common.Config;
 import org.apache.doris.common.DdlException;
@@ -84,7 +85,8 @@ public class SystemInfoService {
 
     public static final ImmutableSet<String> NEED_REPLAN_ERRORS = ImmutableSet.of(
             NO_SCAN_NODE_BACKEND_AVAILABLE_MSG,
-            ERROR_E230
+            ERROR_E230,
+            ComputeGroupException.FailedTypeEnum.COMPUTE_GROUPS_NO_ALIVE_BE.toString()
     );
 
     protected volatile ImmutableMap<Long, Backend> idToBackendRef = ImmutableMap.of();
@@ -1158,6 +1160,7 @@ public class SystemInfoService {
             return false;
         }
         for (String keyword : NEED_REPLAN_ERRORS) {
+            LOG.debug("key {}, errorMsg {}", keyword, errorMsg);
             if (errorMsg.contains(keyword)) {
                 return true;
             }
diff --git a/regression-test/suites/cloud_p0/query_retry/test_retry_be_restart.groovy b/regression-test/suites/cloud_p0/query_retry/test_retry_be_restart.groovy
new file mode 100644
index 00000000000..0113f6be877
--- /dev/null
+++ b/regression-test/suites/cloud_p0/query_retry/test_retry_be_restart.groovy
@@ -0,0 +1,114 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+import org.apache.doris.regression.suite.ClusterOptions
+import org.apache.doris.regression.util.NodeType
+import org.apache.doris.regression.suite.SuiteCluster
+
+suite("test_retry_be_restart", "p0, docker") {
+    if (!isCloudMode()) {
+        return
+    }
+    def options = new ClusterOptions()
+    options.enableDebugPoints()
+    options.setFeNum(1)
+    options.feConfigs.add('max_query_retry_time=100')
+    options.feConfigs.add('sys_log_verbose_modules=org')
+    options.setBeNum(1)
+    options.cloudMode = true
+    // 1. connect to master
+    options.connectToFollower = false
+
+    def queryTask = {
+        for (int i = 0; i < 100; i++) {
+            try {
+                log.info("query count: {}", i)
+                sql """select * from test_be_restart_table"""
+                Thread.sleep(100)
+            } catch (Exception e) {
+                logger.warn("select failed: ${e.message}")
+                assertFalse(true);
+            }
+        }
+    }
+
+    def pointSelectQueryTask = {
+        for (int i = 0; i < 100; i++) {
+            try {
+                log.info("query count: {}", i)
+                sql """select * from test_be_restart_table where account=1 and 
site_code=1"""
+                Thread.sleep(100)
+            } catch (Exception e) {
+                logger.warn("select failed: ${e.message}")
+                assertFalse(true);
+            }
+        }
+    }
+
+    docker(options) {
+        def be1 = cluster.getBeByIndex(1)
+        def beId = be1.backendId;
+
+        sql """
+            CREATE TABLE IF NOT EXISTS `test_be_restart_table`
+            (
+                `account` bigint NULL COMMENT 'user ID',
+                `site_code` int NULL COMMENT 'site code',
+                `site_code_str` varchar(64) NOT NULL DEFAULT "" COMMENT 'site code as a string, used in query results',
+                `register_time` datetime(3) NULL COMMENT 'registration time; stored as int in TiDB and needs conversion',
+                `increment_no` bigint NOT NULL AUTO_INCREMENT(1),
+                `currency` varchar(65533) NULL COMMENT 'currency'
+            )
+            ENGINE=OLAP
+            UNIQUE KEY(`account`, `site_code`)
+            PARTITION BY LIST(`site_code`)
+            (
+                PARTITION p_1 VALUES IN (1),
+                PARTITION p_2 VALUES IN (2)
+            )
+            DISTRIBUTED BY HASH(`account`) BUCKETS 8
+            PROPERTIES (
+                "binlog.enable" = "true",
+                "replication_num" = "1",
+                "enable_unique_key_merge_on_write" = "true"
+            );
+        """
+        sql """
+            INSERT INTO test_be_restart_table VALUES (1, 1, '1', '2026-01-01 00:00:00', 1, 'USD');
+        """
+        sql """
+            INSERT INTO test_be_restart_table VALUES (2, 2, '2', '2026-01-01 00:00:00', 2, 'EUR');
+        """
+        sql """
+            INSERT INTO test_be_restart_table VALUES (3, 1, '3', '2026-01-01 00:00:00', 3, 'GBP');
+        """
+
+        def result = sql """select account, site_code from test_be_restart_table order by account, site_code;"""
+        log.info("insert result : {}", result)
+        assertEquals([[1L, 1], [2L, 2], [3L, 1]], result)
+        cluster.injectDebugPoints(NodeType.FE, ['StmtExecutor.retry.longtime' : null])
+        // this should run for at least 10 seconds
+        def queryThread = Thread.start(queryTask)
+        def pointSelectQueryThread = Thread.start(pointSelectQueryTask)
+        sleep(5 * 1000)
+        cluster.restartBackends()
+        // query should have no failure
+        // wait query thread finish
+        queryThread.join(15000)
+        pointSelectQueryThread.join(15000)
+    }
+}
+


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
