This is an automated email from the ASF dual-hosted git repository.

englefly pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 4c6f12fb2ba [opt](nereids) optimize stats derive when using delta rows 
(#52788)
4c6f12fb2ba is described below

commit 4c6f12fb2bafb9f5135526a142d771a69de831ed
Author: minghong <[email protected]>
AuthorDate: Mon Jul 7 10:43:41 2025 +0800

    [opt](nereids) optimize stats derive when using delta rows (#52788)
    
    ### What problem does this PR solve?
    In the previous version, when the estimated row count of a filter was
    zero, we set all column stats to unknown and used the delta row count to
    estimate again. As a result, columns not involved in the filter lost
    their column stats. In this PR, we only set column stats to unknown for
    those columns that appear in a comparison whose selectivity is 0.
    
    This change improves the estimation for plan nodes that follow the
    filter node.
---
 .../doris/nereids/stats/FilterEstimation.java      | 20 ++++++++++-
 .../doris/nereids/stats/FilterEstimationTest.java  | 41 +++++++++++++++++++++-
 2 files changed, 59 insertions(+), 2 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
index 8c06105ae20..cb4859978a1 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
@@ -45,6 +45,7 @@ import 
org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
 import org.apache.doris.nereids.types.DataType;
 import org.apache.doris.nereids.types.DateTimeType;
 import org.apache.doris.nereids.types.coercion.RangeScalable;
+import org.apache.doris.nereids.util.ExpressionUtils;
 import org.apache.doris.statistics.ColumnStatistic;
 import org.apache.doris.statistics.ColumnStatisticBuilder;
 import org.apache.doris.statistics.StatisticRange;
@@ -100,7 +101,24 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
             for (Expression expr : inputStats.columnStatistics().keySet()) {
                 deltaStats.putColumnStatistics(expr, ColumnStatistic.UNKNOWN);
             }
-            outputStats = expression.accept(this, new 
EstimationContext(deltaStats.build()));
+            Statistics deltaOutputStats = expression.accept(this, new 
EstimationContext(deltaStats.build()));
+            StatisticsBuilder builder = new 
StatisticsBuilder(inputStats).setDeltaRowCount(0)
+                    .setRowCount(deltaOutputStats.getRowCount());
+            if (expression instanceof And) {
+                List<Expression> conjuncts = 
ExpressionUtils.extractConjunction(expression);
+                for (Expression conjunct : conjuncts) {
+                    if (conjunct instanceof ComparisonPredicate) {
+                        Statistics partial = conjunct.accept(this, new 
EstimationContext(inputStats));
+                        if (partial.getRowCount() == 0) {
+                            for (Slot slot : conjunct.getInputSlots()) {
+                                builder.putColumnStatistics(slot, 
ColumnStatistic.UNKNOWN);
+                            }
+                        }
+                    }
+
+                }
+            }
+            outputStats = builder.build();
         }
         outputStats.normalizeColumnStatistics();
         return outputStats;
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
index b2adb5846f9..3790931736f 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
@@ -1548,6 +1548,45 @@ class FilterEstimationTest {
         );
 
         Statistics result = new FilterEstimation().estimate(expr, 
statsBuilder.build());
-        System.out.println(result);
+        Assertions.assertEquals(9, result.getRowCount());
+    }
+
+    /**
+     * estimate base on deltaRows, keep column statistics
+     * if the column is not used in expression which leads to zero row count
+     * for example:
+     * B = 10 and A > '2020-01-03'
+     * because analyze job runs on 2020-01-01, so the column stats of A.max is 
2020-01-01, and hence
+     * estimated output rows is zero.
+     * But after 2020-01-01, some rows are inserted, called delta rows.
+     * we will estimate output based on deltaRows, and assume all column stats 
are unknown.
+     * after estimation, we will put col stats back except A, and run 
Statistics.normalizeColumnStatistics().
+     */
+    @Test
+    void testDeltaRow() {
+        double row = 1000.0;
+        SlotReference a = new SlotReference("a", DateType.INSTANCE);
+        ColumnStatisticBuilder columnStatisticBuilderA = new 
ColumnStatisticBuilder(row)
+                .setNdv(10)
+                .setAvgSizeByte(4)
+                .setNumNulls(0)
+                .setMaxExpr(new org.apache.doris.analysis.DateLiteral(2020, 1, 
1))
+                .setMaxValue(new DateLiteral(2020, 1, 1).getDouble());
+        SlotReference b = new SlotReference("b", IntegerType.INSTANCE);
+        ColumnStatisticBuilder columnStatisticBuilderB = new 
ColumnStatisticBuilder(row)
+                .setNdv(10)
+                .setAvgSizeByte(4)
+                .setNumNulls(0);
+        Expression expr = new And(
+                new EqualTo(b, new IntegerLiteral(1)),
+                new GreaterThan(a, new DateLiteral(2020, 1, 2)));
+        StatisticsBuilder statsBuilder = new StatisticsBuilder();
+        statsBuilder.setRowCount(row)
+                .setDeltaRowCount(100)
+                .putColumnStatistics(a, columnStatisticBuilderA.build())
+                .putColumnStatistics(b, columnStatisticBuilderB.build());
+        Statistics stats = new FilterEstimation().estimate(expr, 
statsBuilder.build());
+        Assertions.assertTrue(stats.findColumnStatistics(a).isUnKnown());
+        Assertions.assertFalse(stats.findColumnStatistics(b).isUnKnown());
     }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to