This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 7abf3c5d2a8 branch-3.0: [fix](heartbeat) fix heartbeat editlog no
persist hbTime (#42986)
7abf3c5d2a8 is described below
commit 7abf3c5d2a834cd6bf2710dc20470dbb885eb61d
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Thu Nov 7 14:10:29 2024 +0800
branch-3.0: [fix](heartbeat) fix heartbeat editlog no persist hbTime
(#42986)
PR Body: Backend persists lastUpdateMs, which is modified by the heartbeat
editlog. But the heartbeat editlog does not persist hbTime, so hbTime always
equals 0, which makes the backend's lastUpdateMs = 0 in the bdb image.
fix details:
1. the heartbeat response now persists hbTime;
2. only a BE state change would write an editlog. But we make a change:
even if a backend is healthy, still write a healthy response editlog every
5 min, in order to make the backend's lastUpdateMs periodically updated in
the bdb image. Note that this change does not increase the real number of
editlogs, because the heartbeat mgr packs all FE/BE heartbeats into one
editlog. Even with no FE/BE state change, it still writes an editlog that
does not contain any node's response.
3. for a dead heartbeat response, set hbTime to the last successful hbTime,
so that the replayer can set the correct lastUpdateMs;
Cherry-picked from #42653
Co-authored-by: yujun <[email protected]>
---
.../src/main/java/org/apache/doris/common/Config.java | 6 ++++++
.../src/main/java/org/apache/doris/system/Backend.java | 14 ++++++++++++++
.../java/org/apache/doris/system/BackendHbResponse.java | 10 ++--------
.../main/java/org/apache/doris/system/HeartbeatMgr.java | 4 ++--
.../java/org/apache/doris/system/HeartbeatResponse.java | 4 +++-
5 files changed, 27 insertions(+), 11 deletions(-)
diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
index af8ce5d7c2d..59f4b33aff4 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
@@ -2021,6 +2021,12 @@ public class Config extends ConfigBase {
@ConfField(mutable = true, masterOnly = true)
public static long max_backend_heartbeat_failure_tolerance_count = 1;
+ /**
+ * Even if a backend is healthy, still write a heartbeat editlog to update
backend's lastUpdateMs of bdb image.
+ */
+ @ConfField(mutable = true, masterOnly = true)
+ public static int editlog_healthy_heartbeat_seconds = 300;
+
/**
* Abort transaction time after lost heartbeat.
* The default value is 300s, which means transactions of be will be
aborted after lost heartbeat 300s.
diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java
b/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java
index da55ecee0de..974c0e0cae1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java
@@ -47,6 +47,7 @@ import org.apache.logging.log4j.Logger;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
+import java.security.SecureRandom;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
@@ -153,6 +154,8 @@ public class Backend implements Writable {
// send some queries to this BE, it is not an important problem.
private AtomicBoolean isShutDown = new AtomicBoolean(false);
+ private long nextForceEditlogHeartbeatTime = System.currentTimeMillis() +
(new SecureRandom()).nextInt(60 * 1000);
+
public Backend() {
this.host = "";
this.version = "";
@@ -876,7 +879,18 @@ public class Backend implements Writable {
heartbeatErrMsg = "";
this.heartbeatFailureCounter = 0;
+
+ // even if no change, write an editlog to make lastUpdateMs in
image update
+ if (System.currentTimeMillis() >=
this.nextForceEditlogHeartbeatTime) {
+ isChanged = true;
+ int delaySecond = Config.editlog_healthy_heartbeat_seconds +
(new SecureRandom()).nextInt(60);
+ this.nextForceEditlogHeartbeatTime =
System.currentTimeMillis() + delaySecond * 1000L;
+ }
} else {
+ // for a bad BackendHbResponse, its hbTime is last succ hbTime,
not this hbTime
+ if (hbResponse.getHbTime() > 0) {
+ this.lastUpdateMs = hbResponse.getHbTime();
+ }
// Only set backend to dead if the heartbeat failure counter
exceed threshold.
// And if it is a replay process, must set backend to dead.
if (isReplay || ++this.heartbeatFailureCounter >=
Config.max_backend_heartbeat_failure_tolerance_count) {
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/system/BackendHbResponse.java
b/fe/fe-core/src/main/java/org/apache/doris/system/BackendHbResponse.java
index a0311a9b737..479966d2ff3 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/system/BackendHbResponse.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/system/BackendHbResponse.java
@@ -98,18 +98,12 @@ public class BackendHbResponse extends HeartbeatResponse
implements Writable {
this.beMemory = beMemory;
}
- public BackendHbResponse(long beId, String errMsg) {
- super(HeartbeatResponse.Type.BACKEND);
- this.status = HbStatus.BAD;
- this.beId = beId;
- this.msg = errMsg;
- }
-
- public BackendHbResponse(long beId, String host, String errMsg) {
+ public BackendHbResponse(long beId, String host, long lastHbTime, String
errMsg) {
super(HeartbeatResponse.Type.BACKEND);
this.status = HbStatus.BAD;
this.beId = beId;
this.host = host;
+ this.hbTime = lastHbTime;
this.msg = errMsg;
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java
b/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java
index 7fe5bf0d442..fb6853e83c3 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java
@@ -316,13 +316,13 @@ public class HeartbeatMgr extends MasterDaemon {
System.currentTimeMillis(), beStartTime, version,
nodeRole,
fragmentNum, lastFragmentUpdateTime, isShutDown,
arrowFlightSqlPort, beMemory);
} else {
- return new BackendHbResponse(backendId, backend.getHost(),
+ return new BackendHbResponse(backendId, backend.getHost(),
backend.getLastUpdateMs(),
result.getStatus().getErrorMsgs().isEmpty()
? "Unknown error" :
result.getStatus().getErrorMsgs().get(0));
}
} catch (Exception e) {
LOG.warn("backend heartbeat got exception", e);
- return new BackendHbResponse(backendId, backend.getHost(),
+ return new BackendHbResponse(backendId, backend.getHost(),
backend.getLastUpdateMs(),
Strings.isNullOrEmpty(e.getMessage()) ? "got
exception" : e.getMessage());
} finally {
if (client != null) {
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatResponse.java
b/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatResponse.java
index 447ffad8189..3fffd121450 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatResponse.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatResponse.java
@@ -51,10 +51,12 @@ public class HeartbeatResponse implements Writable {
protected boolean isTypeRead = false;
/**
- * msg and hbTime are no need to be synchronized to other Frontends,
+ * msg no need to be synchronized to other Frontends,
* and only Master Frontend has these info
*/
protected String msg;
+
+ @SerializedName(value = "hbTime")
protected long hbTime;
public HeartbeatResponse(Type type) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]