This is an automated email from the ASF dual-hosted git repository.

englefly pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 1bc6d78ab43 [feat](nereids) adjust stats derive by delta row (#39222)
1bc6d78ab43 is described below

commit 1bc6d78ab43a34939a16efe608f77f401a3e89fd
Author: minghong <[email protected]>
AuthorDate: Mon Sep 2 10:39:23 2024 +0800

    [feat](nereids) adjust stats derive by delta row (#39222)
    
    ## Proposed changes
    After a table has been analyzed, users may insert new rows.
    analyzed rows: the rows that were covered by the analyze job
    delta rows: the rows inserted after the analyze job
    
    If a filter is estimated to remove all analyzed rows, we estimate the
    filter result again using the delta rows with unknown column stats.
    Issue Number: close #xxx
    
    <!--Describe your changes.-->
---
 .../doris/nereids/stats/FilterEstimation.java      | 19 +++++---
 .../doris/nereids/stats/StatsCalculator.java       | 51 ++++----------------
 .../nereids/types/coercion/CharacterType.java      |  4 ++
 .../doris/statistics/ColumnStatisticBuilder.java   | 18 +++++++
 .../org/apache/doris/statistics/Statistics.java    | 39 +++++++++++++--
 .../apache/doris/statistics/StatisticsBuilder.java | 10 +++-
 .../suites/nereids_p0/delta_row/delta_row.groovy   | 55 ++++++++++++++++++++++
 7 files changed, 143 insertions(+), 53 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
index 1628c3b7d72..3bc2a880da7 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
@@ -92,12 +92,19 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
     /**
      * This method will update the stats according to the selectivity.
      */
-    public Statistics estimate(Expression expression, Statistics statistics) {
-        // For a comparison predicate, only when it's left side is a slot and 
right side is a literal, we would
-        // consider is a valid predicate.
-        Statistics stats = expression.accept(this, new 
EstimationContext(statistics));
-        stats.enforceValid();
-        return stats;
+    public Statistics estimate(Expression expression, Statistics inputStats) {
+        Statistics outputStats = expression.accept(this, new 
EstimationContext(inputStats));
+        if (outputStats.getRowCount() == 0 && inputStats.getDeltaRowCount() > 
0) {
+            StatisticsBuilder deltaStats = new StatisticsBuilder();
+            deltaStats.setDeltaRowCount(0);
+            deltaStats.setRowCount(inputStats.getDeltaRowCount());
+            for (Expression expr : inputStats.columnStatistics().keySet()) {
+                deltaStats.putColumnStatistics(expr, ColumnStatistic.UNKNOWN);
+            }
+            outputStats = expression.accept(this, new 
EstimationContext(deltaStats.build()));
+        }
+        outputStats.enforceValid();
+        return outputStats;
     }
 
     @Override
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
index e6f07c65870..fec744b86b1 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
@@ -130,7 +130,6 @@ import org.apache.doris.nereids.types.DataType;
 import org.apache.doris.nereids.util.PlanUtils;
 import org.apache.doris.qe.ConnectContext;
 import org.apache.doris.statistics.AnalysisManager;
-import org.apache.doris.statistics.ColStatsMeta;
 import org.apache.doris.statistics.ColumnStatistic;
 import org.apache.doris.statistics.ColumnStatisticBuilder;
 import org.apache.doris.statistics.Histogram;
@@ -306,48 +305,16 @@ public class StatsCalculator extends 
DefaultPlanVisitor<Statistics, Void> {
     /**
      * returns the sum of deltaRowCount for all selected partitions or for the 
table.
      */
-    private long computeDeltaRowCount(OlapScan olapScan, SlotReference slot) {
+    private long computeDeltaRowCount(OlapScan olapScan) {
         AnalysisManager analysisManager = 
Env.getCurrentEnv().getAnalysisManager();
         TableStatsMeta tableMeta = 
analysisManager.findTableStatsStatus(olapScan.getTable().getId());
         long deltaRowCount = 0;
         if (tableMeta != null) {
-            ColStatsMeta colMeta = tableMeta.findColumnStatsMeta(
-                    
olapScan.getTable().getIndexNameById(olapScan.getSelectedIndexId()), 
slot.getName());
-            if (colMeta != null && colMeta.partitionUpdateRows != null) {
-                // when fe upgraded from old version, colMeta object may be 
deserialized from json,
-                // and colMeta.partitionUpdateRows could be null
-                if (olapScan.getSelectedPartitionIds().isEmpty()) {
-                    deltaRowCount = tableMeta.updatedRows.get() - 
colMeta.updatedRows;
-                } else {
-                    // sum partition delta row
-                    for (long partitionId : 
olapScan.getSelectedPartitionIds()) {
-                        deltaRowCount += 
tableMeta.partitionUpdateRows.getOrDefault(partitionId, 0L)
-                                - 
colMeta.partitionUpdateRows.getOrDefault(partitionId, 0L);
-                    }
-                }
-            }
+            deltaRowCount = 
tableMeta.getBaseIndexDeltaRowCount(olapScan.getTable());
         }
         return deltaRowCount;
     }
 
-    private void adjustColStats(OlapScan olapScan, SlotReference slot,
-            ColumnStatisticBuilder builder) {
-        if (builder.getAvgSizeByte() <= 0) {
-            
builder.setAvgSizeByte(slot.getDataType().toCatalogDataType().getSlotSize());
-        }
-        long delta = computeDeltaRowCount(olapScan, slot);
-        if (delta > 0) {
-            builder.setCount(builder.getCount() + delta);
-            // clear min-max to avoid error estimation
-            // for example, after yesterday data loaded, user send query about 
yesterday immediately.
-            // since yesterday data are not analyzed, the max date is before 
yesterday, and hence optimizer
-            // estimates the filter result is zero
-            builder.setMinExpr(null).setMinValue(Double.NEGATIVE_INFINITY)
-                    .setMaxExpr(null).setMaxValue(Double.POSITIVE_INFINITY);
-        }
-
-    }
-
     private ColumnStatistic getColumnStatsFromTableCache(CatalogRelation 
catalogRelation, SlotReference slot) {
         long idxId = -1;
         if (catalogRelation instanceof OlapScan) {
@@ -462,6 +429,8 @@ public class StatsCalculator extends 
DefaultPlanVisitor<Statistics, Void> {
         }
 
         // build Stats for olapScan
+        double deltaRowCount = computeDeltaRowCount(olapScan);
+        builder.setDeltaRowCount(deltaRowCount);
         // if slot is invisible, use UNKNOWN
         List<SlotReference> visibleOutputSlots = new ArrayList<>();
         for (Slot slot : ((Plan) olapScan).getOutput()) {
@@ -484,22 +453,22 @@ public class StatsCalculator extends 
DefaultPlanVisitor<Statistics, Void> {
                     ColumnStatistic cache = 
getColumnStatsFromPartitionCache(olapScan, slot, selectedPartitionNames);
                     ColumnStatisticBuilder colStatsBuilder = new 
ColumnStatisticBuilder(cache);
                     colStatsBuilder.setCount(selectedPartitionsRowCount);
-                    adjustColStats(olapScan, slot, colStatsBuilder);
+                    colStatsBuilder.normalizeAvgSizeByte(slot);
                     builder.putColumnStatistics(slot, colStatsBuilder.build());
                 }
                 checkIfUnknownStatsUsedAsKey(builder);
-                builder.setRowCount(selectedPartitionsRowCount);
+                builder.setRowCount(selectedPartitionsRowCount + 
deltaRowCount);
             } else {
                 // if partition row count is invalid (-1), fallback to table 
stats
                 for (SlotReference slot : visibleOutputSlots) {
                     ColumnStatistic cache = 
getColumnStatsFromTableCache((CatalogRelation) olapScan, slot);
                     ColumnStatisticBuilder colStatsBuilder = new 
ColumnStatisticBuilder(cache);
                     colStatsBuilder.setCount(tableRowCount);
-                    adjustColStats(olapScan, slot, colStatsBuilder);
+                    colStatsBuilder.normalizeAvgSizeByte(slot);
                     builder.putColumnStatistics(slot, colStatsBuilder.build());
                 }
                 checkIfUnknownStatsUsedAsKey(builder);
-                builder.setRowCount(tableRowCount);
+                builder.setRowCount(tableRowCount + deltaRowCount);
             }
         } else {
             // get table level stats
@@ -507,11 +476,11 @@ public class StatsCalculator extends 
DefaultPlanVisitor<Statistics, Void> {
                 ColumnStatistic cache = 
getColumnStatsFromTableCache((CatalogRelation) olapScan, slot);
                 ColumnStatisticBuilder colStatsBuilder = new 
ColumnStatisticBuilder(cache);
                 colStatsBuilder.setCount(tableRowCount);
-                adjustColStats(olapScan, slot, colStatsBuilder);
+                colStatsBuilder.normalizeAvgSizeByte(slot);
                 builder.putColumnStatistics(slot, colStatsBuilder.build());
             }
             checkIfUnknownStatsUsedAsKey(builder);
-            builder.setRowCount(tableRowCount);
+            builder.setRowCount(tableRowCount + deltaRowCount);
         }
         return builder.build();
     }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java
index c02ea39e39a..446ccc7fd00 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java
@@ -26,10 +26,14 @@ import org.apache.doris.nereids.types.StringType;
  */
 public abstract class CharacterType extends PrimitiveType {
 
+    public static final int DEFAULT_SLOT_SIZE = 20;
     private static final int WIDTH = 16;
 
     protected final int len;
 
+    // When defining SQL schemas, users often tend to set the length of string
+    // fields much longer than actually needed for storage.
+
     public CharacterType(int len) {
         this.len = len;
     }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
index a512fbadbda..4c8df0bf677 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
@@ -18,6 +18,8 @@
 package org.apache.doris.statistics;
 
 import org.apache.doris.analysis.LiteralExpr;
+import org.apache.doris.nereids.trees.expressions.SlotReference;
+import org.apache.doris.nereids.types.coercion.CharacterType;
 
 public class ColumnStatisticBuilder {
     private double count;
@@ -170,4 +172,20 @@ public class ColumnStatisticBuilder {
                 isUnknown, updatedTime);
         return colStats;
     }
+
+    public void normalizeAvgSizeByte(SlotReference slot) {
+        if (isUnknown) {
+            return;
+        }
+        if (avgSizeByte > 0) {
+            return;
+        }
+        avgSizeByte = slot.getDataType().toCatalogDataType().getSlotSize();
+        // When defining SQL schemas, users often tend to declare string
+        // fields much longer than actually needed for storage.
+        if (slot.getDataType() instanceof CharacterType) {
+            avgSizeByte = Math.min(avgSizeByte,
+                    CharacterType.DEFAULT_SLOT_SIZE);
+        }
+    }
 }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java
index 7101fba5afe..162dab5d136 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java
@@ -22,6 +22,7 @@ import org.apache.doris.nereids.stats.StatsMathUtil;
 import org.apache.doris.nereids.trees.expressions.Expression;
 import org.apache.doris.nereids.trees.expressions.Slot;
 import org.apache.doris.nereids.trees.expressions.SlotReference;
+import org.apache.doris.nereids.types.coercion.CharacterType;
 
 import java.text.DecimalFormat;
 import java.util.HashMap;
@@ -43,15 +44,31 @@ public class Statistics {
     // the byte size of one tuple
     private double tupleSize;
 
-    public Statistics(double rowCount, Map<Expression, ColumnStatistic> 
expressionToColumnStats) {
-        this(rowCount, 1, expressionToColumnStats);
+    private double deltaRowCount = 0.0;
+
+    public Statistics(Statistics another) {
+        this.rowCount = another.rowCount;
+        this.widthInJoinCluster = another.widthInJoinCluster;
+        this.expressionToColumnStats = new 
HashMap<>(another.expressionToColumnStats);
+        this.tupleSize = another.tupleSize;
+        this.deltaRowCount = another.getDeltaRowCount();
     }
 
     public Statistics(double rowCount, int widthInJoinCluster,
-                      Map<Expression, ColumnStatistic> 
expressionToColumnStats) {
+            Map<Expression, ColumnStatistic> expressionToColumnStats, double 
deltaRowCount) {
         this.rowCount = rowCount;
         this.widthInJoinCluster = widthInJoinCluster;
         this.expressionToColumnStats = expressionToColumnStats;
+        this.deltaRowCount = deltaRowCount;
+    }
+
+    public Statistics(double rowCount, Map<Expression, ColumnStatistic> 
expressionToColumnStats) {
+        this(rowCount, 1, expressionToColumnStats, 0);
+    }
+
+    public Statistics(double rowCount, int widthInJoinCluster,
+            Map<Expression, ColumnStatistic> expressionToColumnStats) {
+        this(rowCount, widthInJoinCluster, expressionToColumnStats, 0);
     }
 
     public ColumnStatistic findColumnStatistics(Expression expression) {
@@ -133,7 +150,7 @@ public class Statistics {
             for (Slot slot : slots) {
                 ColumnStatistic s = expressionToColumnStats.get(slot);
                 if (s != null) {
-                    tempSize += Math.max(1, Math.min(20, s.avgSizeByte));
+                    tempSize += Math.max(1, 
Math.min(CharacterType.DEFAULT_SLOT_SIZE, s.avgSizeByte));
                 }
             }
             tupleSize = Math.max(1, tempSize);
@@ -186,7 +203,11 @@ public class Statistics {
             return "-Infinite";
         }
         DecimalFormat format = new DecimalFormat("#,###.##");
-        return format.format(rowCount);
+        String rows = format.format(rowCount);
+        if (deltaRowCount > 0) {
+            rows = rows + "(" + format.format(deltaRowCount) + ")";
+        }
+        return rows;
     }
 
     public String printColumnStats() {
@@ -263,4 +284,12 @@ public class Statistics {
         }
         return builder.build();
     }
+
+    public double getDeltaRowCount() {
+        return deltaRowCount;
+    }
+
+    public void setDeltaRowCount(double deltaRowCount) {
+        this.deltaRowCount = deltaRowCount;
+    }
 }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java
index 29f04f2926e..947995f7902 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java
@@ -29,6 +29,8 @@ public class StatisticsBuilder {
     private int widthInJoinCluster = 1;
     private final Map<Expression, ColumnStatistic> expressionToColumnStats;
 
+    private double deltaRowCount = 0.0;
+
     public StatisticsBuilder() {
         expressionToColumnStats = new HashMap<>();
     }
@@ -36,6 +38,7 @@ public class StatisticsBuilder {
     public StatisticsBuilder(Statistics statistics) {
         this.rowCount = statistics.getRowCount();
         this.widthInJoinCluster = statistics.getWidthInJoinCluster();
+        this.deltaRowCount = statistics.getDeltaRowCount();
         expressionToColumnStats = new HashMap<>();
         expressionToColumnStats.putAll(statistics.columnStatistics());
     }
@@ -50,6 +53,11 @@ public class StatisticsBuilder {
         return this;
     }
 
+    public StatisticsBuilder setDeltaRowCount(double deltaRowCount) {
+        this.deltaRowCount = deltaRowCount;
+        return this;
+    }
+
     public StatisticsBuilder putColumnStatistics(
             Map<Expression, ColumnStatistic> expressionToColumnStats) {
         this.expressionToColumnStats.putAll(expressionToColumnStats);
@@ -66,6 +74,6 @@ public class StatisticsBuilder {
     }
 
     public Statistics build() {
-        return new Statistics(rowCount, widthInJoinCluster, 
expressionToColumnStats);
+        return new Statistics(rowCount, widthInJoinCluster, 
expressionToColumnStats, deltaRowCount);
     }
 }
diff --git a/regression-test/suites/nereids_p0/delta_row/delta_row.groovy 
b/regression-test/suites/nereids_p0/delta_row/delta_row.groovy
new file mode 100644
index 00000000000..c6f40f5363f
--- /dev/null
+++ b/regression-test/suites/nereids_p0/delta_row/delta_row.groovy
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("delta_row") {
+    String database = context.config.getDbNameByFile(context.file)
+    sql """
+        drop database if exists ${database};
+        create database ${database};
+        use ${database};
+        CREATE TABLE IF NOT EXISTS t (
+            k int(11) null comment "",
+            v string replace null comment "",
+        ) engine=olap
+        DISTRIBUTED BY HASH(k) BUCKETS 5 properties("replication_num" = "1");
+    
+        insert into t values (1, "a"),(2, "b"),(3, 'c'),(4,'d');
+        analyze table t with sync;
+    """
+    explain {
+        sql "physical plan select * from t where k > 6"
+        contains("stats=0,")
+        contains("stats=4 ")
+        // PhysicalResultSink[75] ( outputExprs=[k#0, v#1] )
+        //     +--PhysicalFilter[72]@1 ( stats=0, predicates=(k#0 > 6) )
+        //         +--PhysicalOlapScan[t]@0 ( stats=4 )
+    }
+
+    sql "set global enable_auto_analyze=false;"
+
+    sql "insert into t values (10, 'c');"
+    explain {
+        sql "physical plan select * from t where k > 6"
+        contains("stats=0.5,")
+        contains("stats=5(1)")
+        notContains("stats=0,")
+        notContains("stats=4 ")
+// PhysicalResultSink[75] ( outputExprs=[k#0, v#1] )
+// +--PhysicalFilter[72]@1 ( stats=0.5, predicates=(k#0 > 6) )
+//    +--PhysicalOlapScan[t]@0 ( stats=5(1) )
+    }
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to