This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new 0aa07e6118b [enhancement](blacklist) should check the real error
reason when sendfragment timeout (#42314)
0aa07e6118b is described below
commit 0aa07e6118bc3dd9c9acfe27b1f87d998a866fae
Author: yiguolei <[email protected]>
AuthorDate: Wed Oct 23 16:16:04 2024 +0800
[enhancement](blacklist) should check the real error reason when
sendfragment timeout (#42314)
## Proposed changes
1. The send-fragment RPC may run for a long time on the BE.
2. A timeout is not related to the BE being down.
3. The cancel logic should not add a BE to the blacklist, because if a BE is
down, send fragment will detect it.
Co-authored-by: yiguolei <[email protected]>
---
.../src/main/java/org/apache/doris/qe/Coordinator.java | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java
b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java
index 7d9c8243c04..fee7d0442bf 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java
@@ -1146,6 +1146,14 @@ public class Coordinator implements CoordInterface {
} catch (ExecutionException e) {
exception = e;
code = TStatusCode.THRIFT_RPC_ERROR;
+ // If the error reason is call send fragment timeout, then
should not use thrift rpc error as error
+ // code, because FE will add it to blacklist and other RPC may
fail.
+ if (e.getCause() instanceof io.grpc.StatusRuntimeException) {
+ io.grpc.StatusRuntimeException realException =
(io.grpc.StatusRuntimeException) e.getCause();
+ if (realException.getStatus().getCode() ==
io.grpc.Status.DEADLINE_EXCEEDED.getCode()) {
+ code = TStatusCode.TIMEOUT;
+ }
+ }
triple.getMiddle().removeProxy(triple.getLeft().brpcAddr);
} catch (InterruptedException e) {
exception = e;
@@ -1239,6 +1247,14 @@ public class Coordinator implements CoordInterface {
} catch (ExecutionException e) {
exception = e;
code = TStatusCode.THRIFT_RPC_ERROR;
+ // If the error reason is call send fragment timeout, then
should not use thrift rpc error as error
+ // code, because FE will add it to blacklist and other RPC may
fail.
+ if (e.getCause() instanceof io.grpc.StatusRuntimeException) {
+ io.grpc.StatusRuntimeException realException =
(io.grpc.StatusRuntimeException) e.getCause();
+ if (realException.getStatus().getCode() ==
io.grpc.Status.DEADLINE_EXCEEDED.getCode()) {
+ code = TStatusCode.TIMEOUT;
+ }
+ }
triple.getMiddle().removeProxy(triple.getLeft().brpcAddr);
} catch (InterruptedException e) {
exception = e;
@@ -3321,7 +3337,6 @@ public class Coordinator implements CoordInterface {
} catch (RpcException e) {
LOG.warn("cancel plan fragment get a exception,
address={}:{}", brpcAddress.getHostname(),
brpcAddress.getPort());
-
SimpleScheduler.addToBlacklist(addressToBackendID.get(brpcAddress),
e.getMessage());
}
} catch (Exception e) {
@@ -3506,7 +3521,6 @@ public class Coordinator implements CoordInterface {
} catch (RpcException e) {
LOG.warn("cancel plan fragment get a exception,
address={}:{}", brpcAddress.getHostname(),
brpcAddress.getPort());
-
SimpleScheduler.addToBlacklist(addressToBackendID.get(brpcAddress),
e.getMessage());
}
} catch (Exception e) {
LOG.warn("catch a exception", e);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]