This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new f1b0dc3347c [improvement](tablet clone) replicas needing further repair
should be checked even if their versions have caught up (#25551) (#26047)
f1b0dc3347c is described below
commit f1b0dc3347c7365e8fa09d16267cc2ee5ffa2ca5
Author: yujun <[email protected]>
AuthorDate: Sat Oct 28 16:57:43 2023 +0800
[improvement](tablet clone) replicas needing further repair should be checked
even if their versions have caught up (#25551) (#26047)
---
.../apache/doris/clone/LoadStatisticForTag.java | 55 +++++++++++++----
.../org/apache/doris/clone/TabletSchedCtx.java | 70 +++++++++++++++-------
.../org/apache/doris/clone/TabletScheduler.java | 17 ++++++
3 files changed, 109 insertions(+), 33 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/clone/LoadStatisticForTag.java
b/fe/fe-core/src/main/java/org/apache/doris/clone/LoadStatisticForTag.java
index 413a3b129f1..158f2cde4af 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/LoadStatisticForTag.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/LoadStatisticForTag.java
@@ -21,6 +21,7 @@ import org.apache.doris.catalog.TabletInvertedIndex;
import org.apache.doris.clone.BackendLoadStatistic.Classification;
import org.apache.doris.clone.BackendLoadStatistic.LoadScore;
import org.apache.doris.common.Config;
+import org.apache.doris.common.util.DebugPointUtil;
import org.apache.doris.common.util.DebugUtil;
import org.apache.doris.resource.Tag;
import org.apache.doris.system.Backend;
@@ -186,35 +187,58 @@ public class LoadStatisticForTag {
int lowCounter = 0;
int midCounter = 0;
int highCounter = 0;
+
+ long debugHighBeId =
DebugPointUtil.getDebugParamOrDefault("FE.HIGH_LOAD_BE_ID", -1L);
+ if (debugHighBeId > 0) {
+ final long targetBeId = debugHighBeId; // debugHighBeId can not
put in lambda cause it's updated later
+ if (!beLoadStatistics.stream().anyMatch(it -> it.getBeId() ==
targetBeId)) {
+ debugHighBeId = -1L;
+ }
+ }
+
for (BackendLoadStatistic beStat : beLoadStatistics) {
if (!beStat.hasMedium(medium)) {
continue;
}
-
- if (Config.be_rebalancer_fuzzy_test) {
+ Classification clazz = Classification.MID;
+ if (debugHighBeId > 0) {
+ if (beStat.getBeId() == debugHighBeId) {
+ clazz = Classification.HIGH;
+ } else {
+ clazz = Classification.LOW;
+ }
+ } else if (Config.be_rebalancer_fuzzy_test) {
if (beStat.getLoadScore(medium) > avgLoadScore) {
- beStat.setClazz(medium, Classification.HIGH);
- highCounter++;
+ clazz = Classification.HIGH;
} else if (beStat.getLoadScore(medium) < avgLoadScore) {
- beStat.setClazz(medium, Classification.LOW);
- lowCounter++;
+ clazz = Classification.LOW;
}
} else {
if (Math.abs(beStat.getLoadScore(medium) - avgLoadScore) /
avgLoadScore
> Config.balance_load_score_threshold) {
if (beStat.getLoadScore(medium) > avgLoadScore) {
- beStat.setClazz(medium, Classification.HIGH);
- highCounter++;
+ clazz = Classification.HIGH;
} else if (beStat.getLoadScore(medium) < avgLoadScore) {
- beStat.setClazz(medium, Classification.LOW);
- lowCounter++;
+ clazz = Classification.LOW;
}
- } else {
- beStat.setClazz(medium, Classification.MID);
- midCounter++;
}
}
+
+ beStat.setClazz(medium, clazz);
+ switch (clazz) {
+ case HIGH:
+ highCounter++;
+ break;
+ case LOW:
+ lowCounter++;
+ break;
+ case MID:
+ midCounter++;
+ break;
+ default:
+ break;
+ }
}
LOG.debug("classify backend by load. medium: {} avg load score: {}.
low/mid/high: {}/{}/{}",
@@ -265,6 +289,11 @@ public class LoadStatisticForTag {
return false;
}
+ long debugHighBeId =
DebugPointUtil.getDebugParamOrDefault("FE.HIGH_LOAD_BE_ID", -1L);
+ if (srcBeStat.getBeId() == debugHighBeId) {
+ return true;
+ }
+
currentSrcBeScore = srcBeStat.getLoadScore(medium);
currentDestBeScore = destBeStat.getLoadScore(medium);
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java
b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java
index b4667f80696..482dfddf300 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java
@@ -666,6 +666,7 @@ public class TabletSchedCtx implements
Comparable<TabletSchedCtx> {
List<Replica> decommissionCand = Lists.newArrayList();
List<Replica> colocateCand = Lists.newArrayList();
List<Replica> notColocateCand = Lists.newArrayList();
+ List<Replica> furtherRepairs = Lists.newArrayList();
for (Replica replica : tablet.getReplicas()) {
if (replica.isBad()) {
LOG.debug("replica {} is bad, skip. tablet: {}",
@@ -688,6 +689,11 @@ public class TabletSchedCtx implements
Comparable<TabletSchedCtx> {
if (replica.getLastFailedVersion() <= 0
&& replica.getVersion() >= visibleVersion) {
+
+ if (tabletStatus == TabletStatus.NEED_FURTHER_REPAIR &&
replica.needFurtherRepair()) {
+ furtherRepairs.add(replica);
+ }
+
// skip healthy replica
LOG.debug("replica {} version {} is healthy, visible version
{}, replica state {}, skip. tablet: {}",
replica.getId(), replica.getVersion(), visibleVersion,
replica.getState(), tabletId);
@@ -709,8 +715,24 @@ public class TabletSchedCtx implements
Comparable<TabletSchedCtx> {
} else {
candidates = decommissionCand;
}
+
if (candidates.isEmpty()) {
- throw new SchedException(Status.UNRECOVERABLE, "unable to choose
dest replica");
+ if (furtherRepairs.isEmpty()) {
+ throw new SchedException(Status.UNRECOVERABLE, "unable to
choose dest replica");
+ }
+
+ boolean allCatchup = true;
+ for (Replica replica : furtherRepairs) {
+ if (checkFurthurRepairFinish(replica, visibleVersion)) {
+ replica.setNeedFurtherRepair(false);
+ replica.setFurtherRepairWatermarkTxnTd(-1);
+ } else {
+ allCatchup = false;
+ }
+ }
+
+ throw new SchedException(Status.FINISHED,
+ allCatchup ? "further repair all catchup" : "further
repair waiting catchup");
}
Replica chosenReplica = null;
@@ -782,6 +804,32 @@ public class TabletSchedCtx implements
Comparable<TabletSchedCtx> {
setDest(chosenReplica.getBackendId(), chosenReplica.getPathHash());
}
+ private boolean checkFurthurRepairFinish(Replica replica, long version) {
+ if (replica.getVersion() < version || replica.getLastFailedVersion() >
0) {
+ return false;
+ }
+
+ long furtherRepairWatermarkTxnTd =
replica.getFurtherRepairWatermarkTxnTd();
+ if (furtherRepairWatermarkTxnTd < 0) {
+ return true;
+ }
+
+ try {
+ if
(Env.getCurrentGlobalTransactionMgr().isPreviousTransactionsFinished(
+ furtherRepairWatermarkTxnTd, dbId, tblId,
partitionId)) {
+ LOG.info("replica {} of tablet {} has catchup with further
repair watermark id {}",
+ replica, tabletId, furtherRepairWatermarkTxnTd);
+ return true;
+ } else {
+ return false;
+ }
+ } catch (Exception e) {
+ LOG.warn("replica {} of tablet {} check catchup with further
repair watermark id {} failed",
+ replica, tabletId, furtherRepairWatermarkTxnTd, e);
+ return true;
+ }
+ }
+
public void releaseResource(TabletScheduler tabletScheduler) {
releaseResource(tabletScheduler, false);
}
@@ -1097,25 +1145,7 @@ public class TabletSchedCtx implements
Comparable<TabletSchedCtx> {
// change from prepare to committed or visible, this replica will
be fall behind and be removed
// in REDUNDANT detection.
//
- boolean isCatchup = false;
- if (replica.getVersion() >= partition.getVisibleVersion() &&
replica.getLastFailedVersion() < 0) {
- long furtherRepairWatermarkTxnTd =
replica.getFurtherRepairWatermarkTxnTd();
- if (furtherRepairWatermarkTxnTd < 0) {
- isCatchup = true;
- } else {
- try {
- if
(Env.getCurrentGlobalTransactionMgr().isPreviousTransactionsFinished(
- furtherRepairWatermarkTxnTd, dbId, tblId,
partitionId)) {
- isCatchup = true;
- LOG.info("new replica {} of tablet {} has catchup
with further repair watermark id {}",
- replica, tabletId,
furtherRepairWatermarkTxnTd);
- }
- } catch (Exception e) {
- isCatchup = true;
- }
- }
- }
-
+ boolean isCatchup = checkFurthurRepairFinish(replica,
partition.getVisibleVersion());
replica.incrFurtherRepairCount();
if (isCatchup || replica.getLeftFurtherRepairCount() <= 0) {
replica.setNeedFurtherRepair(false);
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
index 24cb84c970e..90d136715ce 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
@@ -45,6 +45,7 @@ import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.Config;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Pair;
+import org.apache.doris.common.util.DebugPointUtil;
import org.apache.doris.common.util.MasterDaemon;
import org.apache.doris.persist.ReplicaPersistInfo;
import org.apache.doris.resource.Tag;
@@ -976,6 +977,7 @@ public class TabletScheduler extends MasterDaemon {
boolean force, LoadStatisticForTag statistic) throws
SchedException {
Replica chosenReplica = null;
double maxScore = 0;
+ long debugHighBeId =
DebugPointUtil.getDebugParamOrDefault("FE.HIGH_LOAD_BE_ID", -1L);
for (Replica replica : replicas) {
BackendLoadStatistic beStatistic =
statistic.getBackendLoadStatistic(replica.getBackendId());
if (beStatistic == null) {
@@ -1000,6 +1002,11 @@ public class TabletScheduler extends MasterDaemon {
maxScore = loadScore;
chosenReplica = replica;
}
+
+ if (debugHighBeId > 0 && replica.getBackendId() == debugHighBeId) {
+ chosenReplica = replica;
+ break;
+ }
}
if (chosenReplica != null) {
@@ -1523,6 +1530,16 @@ public class TabletScheduler extends MasterDaemon {
statusPair.second = tabletCtx.getPriority();
}
}
+
+ if (statusPair.first == TabletStatus.NEED_FURTHER_REPAIR) {
+ // replica is just waiting for finishing txns before
furtherRepairWatermarkTxnTd,
+ // no need to add it immediately
+ Replica replica =
tablet.getReplicaByBackendId(tabletCtx.getDestBackendId());
+ if (replica != null && replica.getVersion() >=
partition.getVisibleVersion()
+ && replica.getLastFailedVersion() < 0) {
+ return;
+ }
+ }
} finally {
tbl.readUnlock();
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]