This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new ee7f9a4f268 [fix](oom) avoid oom when a lot of tablets fail on load
(#36944)
ee7f9a4f268 is described below
commit ee7f9a4f2688c1800a8ed9b7e510cc5530b39274
Author: Yongqiang YANG <[email protected]>
AuthorDate: Thu Jun 27 22:12:42 2024 +0800
[fix](oom) avoid oom when a lot of tablets fail on load (#36944)
pick #36873
---
.../doris/transaction/DatabaseTransactionMgr.java | 62 +++++++++++++++++-----
1 file changed, 49 insertions(+), 13 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java
b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java
index 3996664708a..368415c3abd 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java
@@ -1159,6 +1159,45 @@ public class DatabaseTransactionMgr {
return true;
}
+ /**
+  * Bounded collector for the per-tablet messages produced while checking
+  * publish results.
+  *
+  * Messages are kept in three categories (quorum succ, timeout succ, failed).
+  * Each category retains at most {@link #MAX_LOGS_PER_CATEGORY} messages so
+  * that a load with a very large number of problematic tablets cannot build
+  * an unbounded list of strings. Messages beyond the cap are not stored;
+  * only their count is remembered so that {@link #log()} can report how
+  * many were omitted.
+  */
+ private class TabletsPublishResultLogs {
+     // Upper bound on the number of messages retained per category.
+     private static final int MAX_LOGS_PER_CATEGORY = 16;
+
+     private final List<String> quorumSuccLogs = Lists.newArrayList();
+     private final List<String> timeoutSuccLogs = Lists.newArrayList();
+     private final List<String> failedLogs = Lists.newArrayList();
+
+     // Number of messages dropped per category because the cap was reached.
+     private int quorumSuccOmitted = 0;
+     private int timeoutSuccOmitted = 0;
+     private int failedOmitted = 0;
+
+     /**
+      * Records a "quorum succ" message, or counts it as omitted when the
+      * category is already full.
+      *
+      * @param log the fully formatted message
+      */
+     public void addQuorumSuccLog(String log) {
+         if (!addCapped(quorumSuccLogs, log)) {
+             quorumSuccOmitted++;
+         }
+     }
+
+     /**
+      * Records a "timeout succ" message, or counts it as omitted when the
+      * category is already full.
+      *
+      * @param log the fully formatted message
+      */
+     public void addTimeoutSuccLog(String log) {
+         if (!addCapped(timeoutSuccLogs, log)) {
+             timeoutSuccOmitted++;
+         }
+     }
+
+     /**
+      * Records a "failed" message, or counts it as omitted when the
+      * category is already full.
+      *
+      * NOTE(review): in the patch that introduces this class the
+      * "publish version failed ..." message is recorded through
+      * addQuorumSuccLog, so no visible caller uses this method. Confirm
+      * whether that call site should use addFailedLog instead.
+      *
+      * @param log the fully formatted message
+      */
+     public void addFailedLog(String log) {
+         if (!addCapped(failedLogs, log)) {
+             failedOmitted++;
+         }
+     }
+
+     /**
+      * Writes every retained message at INFO level: failed first, then
+      * timeout succ, then quorum succ. After each category a single summary
+      * line is written if any messages of that category were dropped.
+      */
+     public void log() {
+         logCategory("failed", failedLogs, failedOmitted);
+         logCategory("timeout succ", timeoutSuccLogs, timeoutSuccOmitted);
+         logCategory("quorum succ", quorumSuccLogs, quorumSuccOmitted);
+     }
+
+     // Appends log to target unless target already holds the maximum number
+     // of entries; returns whether the message was stored.
+     private boolean addCapped(List<String> target, String log) {
+         if (target.size() >= MAX_LOGS_PER_CATEGORY) {
+             return false;
+         }
+         target.add(log);
+         return true;
+     }
+
+     // Emits the retained messages of one category plus, when needed, a
+     // note about how many further messages were dropped.
+     private void logCategory(String category, List<String> retained, int omitted) {
+         for (String log : retained) {
+             LOG.info(log);
+         }
+         if (omitted > 0) {
+             LOG.info("{} more {} publish logs omitted, only the first {} are kept",
+                     omitted, category, MAX_LOGS_PER_CATEGORY);
+         }
+     }
+ }
+
+
private PublishResult finishCheckQuorumReplicas(TransactionState
transactionState,
List<Pair<OlapTable, Partition>> relatedTblPartitions,
Set<Long> errorReplicaIds) {
@@ -1173,7 +1212,7 @@ public class DatabaseTransactionMgr {
List<Replica> tabletSuccReplicas = Lists.newArrayList();
List<Replica> tabletWriteFailedReplicas = Lists.newArrayList();
List<Replica> tabletVersionFailedReplicas = Lists.newArrayList();
- List<String> logs = Lists.newArrayList();
+ TabletsPublishResultLogs logs = new TabletsPublishResultLogs();
Map<Long, PublishVersionTask> publishTasks =
transactionState.getPublishVersionTasks();
PublishResult publishResult = PublishResult.QUORUM_SUCC;
@@ -1224,9 +1263,9 @@ public class DatabaseTransactionMgr {
if (hasFailedReplica) {
String writeDetail =
getTabletWriteDetail(tabletSuccReplicas,
tabletWriteFailedReplicas,
tabletVersionFailedReplicas);
- logs.add(String.format("publish version quorum
succ for transaction %s on tablet %s"
- + " with version %s, and has failed
replicas, load require replica num %s. "
- + "table %s, partition: [ id=%s, commit
version=%s ], tablet detail: %s",
+ logs.addQuorumSuccLog(String.format("publish
version quorum succ for transaction %s "
+ + "on tablet %s with version %s, and has
failed replicas, load require replica "
+ + "num %s. table %s, partition: [ id=%s,
commit version=%s ], tablet detail: %s",
transactionState, tablet.getId(),
newVersion, loadRequiredReplicaNum, tableId,
partitionId,
partition.getCommittedVersion(), writeDetail));
}
@@ -1248,9 +1287,9 @@ public class DatabaseTransactionMgr {
// that are being published exists on a few replicas we
should go
// ahead, otherwise data may be lost and the
// publish task hangs forever.
- logs.add(String.format("publish version timeout succ
for transaction %s on tablet %s "
- + "with version %s, and has failed replicas,
load require replica num %s. "
- + "table %s, partition %s, tablet detail: %s",
+ logs.addTimeoutSuccLog(String.format("publish version
timeout succ for transaction %s "
+ + "on tablet %s with version %s, and has
failed replicas, load require replica num %s."
+ + " table %s, partition %s, tablet detail: %s",
transactionState, tablet.getId(), newVersion,
loadRequiredReplicaNum, tableId, partitionId,
writeDetail));
} else {
@@ -1261,8 +1300,8 @@ public class DatabaseTransactionMgr {
tablet.getId(), healthReplicaNum,
loadRequiredReplicaNum, tableId,
partitionId, newVersion);
transactionState.setErrorMsg(errMsg);
- logs.add(String.format("publish version failed for
transaction %s on tablet %s with version"
- + " %s, and has failed replicas, load required
replica num %s. table %s, "
+ logs.addQuorumSuccLog(String.format("publish version
failed for transaction %s on tablet %s "
+ + " with version %s, and has failed replicas,
load required replica num %s. table %s, "
+ "partition %s, tablet detail: %s",
transactionState, tablet.getId(), newVersion,
loadRequiredReplicaNum, tableId, partitionId,
writeDetail));
@@ -1275,10 +1314,7 @@ public class DatabaseTransactionMgr {
|| now - transactionState.getLastPublishLogTime() >
Config.publish_fail_log_interval_second * 1000L;
if (needLog) {
transactionState.setLastPublishLogTime(now);
- for (String log : logs) {
- LOG.info("{}. publish times {}, whole txn publish result {}",
- log, transactionState.getPublishCount(),
publishResult.name());
- }
+ logs.log();
}
return publishResult;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]