This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 0aa07e6118b [enhancement](blacklist) should check the real error 
reason when sendfragment timeout (#42314)
0aa07e6118b is described below

commit 0aa07e6118bc3dd9c9acfe27b1f87d998a866fae
Author: yiguolei <[email protected]>
AuthorDate: Wed Oct 23 16:16:04 2024 +0800

    [enhancement](blacklist) should check the real error reason when 
sendfragment timeout (#42314)
    
    ## Proposed changes
    
    1. Send fragment in BE may run for a long time.
    2. A timeout is not related to the BE being down.
    3. The cancel logic should not handle the blacklist, because if a BE is
    down, send fragment will find it.
    
    Co-authored-by: yiguolei <[email protected]>
---
 .../src/main/java/org/apache/doris/qe/Coordinator.java | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java
index 7d9c8243c04..fee7d0442bf 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java
@@ -1146,6 +1146,14 @@ public class Coordinator implements CoordInterface {
             } catch (ExecutionException e) {
                 exception = e;
                 code = TStatusCode.THRIFT_RPC_ERROR;
+                // If the error reason is call send fragment timeout, then should not use thrift rpc error as error
+                // code, because FE will add it to blacklist and other RPC may fail.
+                if (e.getCause() instanceof io.grpc.StatusRuntimeException) {
+                    io.grpc.StatusRuntimeException realException = 
(io.grpc.StatusRuntimeException) e.getCause();
+                    if (realException.getStatus().getCode() == 
io.grpc.Status.DEADLINE_EXCEEDED.getCode()) {
+                        code = TStatusCode.TIMEOUT;
+                    }
+                }
                 triple.getMiddle().removeProxy(triple.getLeft().brpcAddr);
             } catch (InterruptedException e) {
                 exception = e;
@@ -1239,6 +1247,14 @@ public class Coordinator implements CoordInterface {
             } catch (ExecutionException e) {
                 exception = e;
                 code = TStatusCode.THRIFT_RPC_ERROR;
+                // If the error reason is call send fragment timeout, then should not use thrift rpc error as error
+                // code, because FE will add it to blacklist and other RPC may fail.
+                if (e.getCause() instanceof io.grpc.StatusRuntimeException) {
+                    io.grpc.StatusRuntimeException realException = 
(io.grpc.StatusRuntimeException) e.getCause();
+                    if (realException.getStatus().getCode() == 
io.grpc.Status.DEADLINE_EXCEEDED.getCode()) {
+                        code = TStatusCode.TIMEOUT;
+                    }
+                }
                 triple.getMiddle().removeProxy(triple.getLeft().brpcAddr);
             } catch (InterruptedException e) {
                 exception = e;
@@ -3321,7 +3337,6 @@ public class Coordinator implements CoordInterface {
                 } catch (RpcException e) {
                     LOG.warn("cancel plan fragment get a exception, 
address={}:{}", brpcAddress.getHostname(),
                             brpcAddress.getPort());
-                    
SimpleScheduler.addToBlacklist(addressToBackendID.get(brpcAddress), 
e.getMessage());
                 }
 
             } catch (Exception e) {
@@ -3506,7 +3521,6 @@ public class Coordinator implements CoordInterface {
                 } catch (RpcException e) {
                     LOG.warn("cancel plan fragment get a exception, 
address={}:{}", brpcAddress.getHostname(),
                             brpcAddress.getPort());
-                    
SimpleScheduler.addToBlacklist(addressToBackendID.get(brpcAddress), 
e.getMessage());
                 }
             } catch (Exception e) {
                 LOG.warn("catch a exception", e);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to