This is an automated email from the ASF dual-hosted git repository.

liaoxin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new efd1cd0c692 [improve](routine load) delay schedule EOF tasks to avoid 
too many small transactions (#39975)
efd1cd0c692 is described below

commit efd1cd0c692b2f67c0b423acd41784c5919e799a
Author: hui lai <[email protected]>
AuthorDate: Wed Aug 28 17:05:16 2024 +0800

    [improve](routine load) delay schedule EOF tasks to avoid too many small 
transactions (#39975)
    
    We encountered a scenario where a large number of small transactions
    were generated, which hurt query performance:
    Kafka delivers very small batches of data at very short intervals,
    which leads to tasks being scheduled frequently and finishing very quickly,
    resulting in a large number of small transactions.
    
    To solve this problem, we delay the scheduling of tasks that have
    reached EOF. This does not delay data consumption, because reaching
    EOF indicates that the consumption speed is greater than the
    production speed.
---
 .../doris/load/routineload/KafkaTaskInfo.java      |  1 +
 .../doris/load/routineload/RoutineLoadJob.java     |  2 +-
 .../load/routineload/RoutineLoadTaskInfo.java      | 22 +++++++++++++++++++++-
 .../load/routineload/RoutineLoadTaskScheduler.java | 11 +++++++++++
 4 files changed, 34 insertions(+), 2 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaTaskInfo.java 
b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaTaskInfo.java
index 52a1ad8559f..7aa9ebda09f 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaTaskInfo.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaTaskInfo.java
@@ -60,6 +60,7 @@ public class KafkaTaskInfo extends RoutineLoadTaskInfo {
                 kafkaTaskInfo.getTimeoutMs(), 
kafkaTaskInfo.getTimeoutBackOffCount(),
                 kafkaTaskInfo.getBeId(), isMultiTable);
         this.partitionIdToOffset = partitionIdToOffset;
+        this.isEof = kafkaTaskInfo.getIsEof();
     }
 
     public List<Integer> getPartitions() {
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadJob.java
 
b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadJob.java
index b983d6beed4..8e3ed8c4682 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadJob.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadJob.java
@@ -1327,7 +1327,7 @@ public abstract class RoutineLoadJob
         } else if (checkCommitInfo(rlTaskTxnCommitAttachment, txnState, 
txnStatusChangeReason)) {
             // step2: update job progress
             updateProgress(rlTaskTxnCommitAttachment);
-            routineLoadTaskInfo.selfAdaptTimeout(rlTaskTxnCommitAttachment);
+            
routineLoadTaskInfo.handleTaskByTxnCommitAttachment(rlTaskTxnCommitAttachment);
         }
 
         if (rlTaskTxnCommitAttachment != null && 
!Strings.isNullOrEmpty(rlTaskTxnCommitAttachment.getErrorLogUrl())) {
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java
 
b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java
index d101d98cf85..301efe4d9c9 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java
@@ -76,6 +76,8 @@ public abstract class RoutineLoadTaskInfo {
     protected static final int MAX_TIMEOUT_BACK_OFF_COUNT = 3;
     protected int timeoutBackOffCount = 0;
 
+    protected boolean isEof = false;
+
     // this status will be set when corresponding transaction's status is 
changed.
     // so that user or other logic can know the status of the corresponding 
txn.
     protected TransactionStatus txnStatus = TransactionStatus.UNKNOWN;
@@ -160,6 +162,10 @@ public abstract class RoutineLoadTaskInfo {
         return timeoutBackOffCount;
     }
 
+    public boolean getIsEof() {
+        return isEof;
+    }
+
     public boolean isTimeout() {
         if (txnStatus == TransactionStatus.COMMITTED || txnStatus == 
TransactionStatus.VISIBLE) {
             // the corresponding txn is already finished, this task can not be 
treated as timeout.
@@ -174,7 +180,12 @@ public abstract class RoutineLoadTaskInfo {
         return false;
     }
 
-    public void selfAdaptTimeout(RLTaskTxnCommitAttachment 
rlTaskTxnCommitAttachment) {
+    public void handleTaskByTxnCommitAttachment(RLTaskTxnCommitAttachment 
rlTaskTxnCommitAttachment) {
+        selfAdaptTimeout(rlTaskTxnCommitAttachment);
+        judgeEof(rlTaskTxnCommitAttachment);
+    }
+
+    private void selfAdaptTimeout(RLTaskTxnCommitAttachment 
rlTaskTxnCommitAttachment) {
         long taskExecutionTime = 
rlTaskTxnCommitAttachment.getTaskExecutionTimeMs();
         long timeoutMs = this.timeoutMs;
 
@@ -189,6 +200,15 @@ public abstract class RoutineLoadTaskInfo {
         this.timeoutMs = timeoutMs;
     }
 
+    private void judgeEof(RLTaskTxnCommitAttachment rlTaskTxnCommitAttachment) 
{
+        RoutineLoadJob routineLoadJob = routineLoadManager.getJob(jobId);
+        if (rlTaskTxnCommitAttachment.getTotalRows() < 
routineLoadJob.getMaxBatchRows()
+                && rlTaskTxnCommitAttachment.getReceivedBytes() < 
routineLoadJob.getMaxBatchSizeBytes()
+                && rlTaskTxnCommitAttachment.getTaskExecutionTimeMs() < 
this.timeoutMs) {
+            this.isEof = true;
+        }
+    }
+
     abstract TRoutineLoadTask createRoutineLoadTask() throws UserException;
 
     // begin the txn of this task
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskScheduler.java
 
b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskScheduler.java
index d4d5d5512d3..8afc35411b5 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskScheduler.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskScheduler.java
@@ -101,6 +101,16 @@ public class RoutineLoadTaskScheduler extends MasterDaemon 
{
         try {
             // This step will be blocked when queue is empty
             RoutineLoadTaskInfo routineLoadTaskInfo = 
needScheduleTasksQueue.take();
+            // try to delay scheduling tasks that are perceived as Eof to 
MaxBatchInterval
+            // to avoid to much small transaction
+            if (routineLoadTaskInfo.getIsEof()) {
+                RoutineLoadJob routineLoadJob = 
routineLoadManager.getJob(routineLoadTaskInfo.getJobId());
+                if (System.currentTimeMillis() - 
routineLoadTaskInfo.getLastScheduledTime()
+                        < routineLoadJob.getMaxBatchIntervalS()) {
+                    needScheduleTasksQueue.addLast(routineLoadTaskInfo);
+                    return;
+                }
+            }
             scheduleOneTask(routineLoadTaskInfo);
         } catch (Exception e) {
             LOG.warn("Taking routine load task from queue has been 
interrupted", e);
@@ -108,6 +118,7 @@ public class RoutineLoadTaskScheduler extends MasterDaemon {
     }
 
     private void scheduleOneTask(RoutineLoadTaskInfo routineLoadTaskInfo) 
throws Exception {
+        routineLoadTaskInfo.setLastScheduledTime(System.currentTimeMillis());
         if (LOG.isDebugEnabled()) {
             LOG.debug("schedule routine load task info {} for job {}",
                     routineLoadTaskInfo.id, routineLoadTaskInfo.getJobId());


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to