This is an automated email from the ASF dual-hosted git repository.
englefly pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 1bc6d78ab43 [feat](nereids) adjust stats derive by delta row (#39222)
1bc6d78ab43 is described below
commit 1bc6d78ab43a34939a16efe608f77f401a3e89fd
Author: minghong <[email protected]>
AuthorDate: Mon Sep 2 10:39:23 2024 +0800
[feat](nereids) adjust stats derive by delta row (#39222)
## Proposed changes
After an analyze job completes, users may insert new rows.
analyzed rows: the rows that were covered by the analyze job
delta rows: rows inserted after the analyze job
If all analyzed rows are filtered out (the estimated row count is 0) and there
are delta rows, we re-estimate the filter result on the delta rows, using
unknown column stats.
Issue Number: close #xxx
<!--Describe your changes.-->
---
.../doris/nereids/stats/FilterEstimation.java | 19 +++++---
.../doris/nereids/stats/StatsCalculator.java | 51 ++++----------------
.../nereids/types/coercion/CharacterType.java | 4 ++
.../doris/statistics/ColumnStatisticBuilder.java | 18 +++++++
.../org/apache/doris/statistics/Statistics.java | 39 +++++++++++++--
.../apache/doris/statistics/StatisticsBuilder.java | 10 +++-
.../suites/nereids_p0/delta_row/delta_row.groovy | 55 ++++++++++++++++++++++
7 files changed, 143 insertions(+), 53 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
index 1628c3b7d72..3bc2a880da7 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
@@ -92,12 +92,19 @@ public class FilterEstimation extends
ExpressionVisitor<Statistics, EstimationCo
/**
* This method will update the stats according to the selectivity.
*/
- public Statistics estimate(Expression expression, Statistics statistics) {
- // For a comparison predicate, only when it's left side is a slot and
right side is a literal, we would
- // consider is a valid predicate.
- Statistics stats = expression.accept(this, new
EstimationContext(statistics));
- stats.enforceValid();
- return stats;
+ public Statistics estimate(Expression expression, Statistics inputStats) {
+ Statistics outputStats = expression.accept(this, new
EstimationContext(inputStats));
+ if (outputStats.getRowCount() == 0 && inputStats.getDeltaRowCount() >
0) {
+ StatisticsBuilder deltaStats = new StatisticsBuilder();
+ deltaStats.setDeltaRowCount(0);
+ deltaStats.setRowCount(inputStats.getDeltaRowCount());
+ for (Expression expr : inputStats.columnStatistics().keySet()) {
+ deltaStats.putColumnStatistics(expr, ColumnStatistic.UNKNOWN);
+ }
+ outputStats = expression.accept(this, new
EstimationContext(deltaStats.build()));
+ }
+ outputStats.enforceValid();
+ return outputStats;
}
@Override
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
index e6f07c65870..fec744b86b1 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
@@ -130,7 +130,6 @@ import org.apache.doris.nereids.types.DataType;
import org.apache.doris.nereids.util.PlanUtils;
import org.apache.doris.qe.ConnectContext;
import org.apache.doris.statistics.AnalysisManager;
-import org.apache.doris.statistics.ColStatsMeta;
import org.apache.doris.statistics.ColumnStatistic;
import org.apache.doris.statistics.ColumnStatisticBuilder;
import org.apache.doris.statistics.Histogram;
@@ -306,48 +305,16 @@ public class StatsCalculator extends
DefaultPlanVisitor<Statistics, Void> {
/**
* returns the sum of deltaRowCount for all selected partitions or for the
table.
*/
- private long computeDeltaRowCount(OlapScan olapScan, SlotReference slot) {
+ private long computeDeltaRowCount(OlapScan olapScan) {
AnalysisManager analysisManager =
Env.getCurrentEnv().getAnalysisManager();
TableStatsMeta tableMeta =
analysisManager.findTableStatsStatus(olapScan.getTable().getId());
long deltaRowCount = 0;
if (tableMeta != null) {
- ColStatsMeta colMeta = tableMeta.findColumnStatsMeta(
-
olapScan.getTable().getIndexNameById(olapScan.getSelectedIndexId()),
slot.getName());
- if (colMeta != null && colMeta.partitionUpdateRows != null) {
- // when fe upgraded from old version, colMeta object may be
deserialized from json,
- // and colMeta.partitionUpdateRows could be null
- if (olapScan.getSelectedPartitionIds().isEmpty()) {
- deltaRowCount = tableMeta.updatedRows.get() -
colMeta.updatedRows;
- } else {
- // sum partition delta row
- for (long partitionId :
olapScan.getSelectedPartitionIds()) {
- deltaRowCount +=
tableMeta.partitionUpdateRows.getOrDefault(partitionId, 0L)
- -
colMeta.partitionUpdateRows.getOrDefault(partitionId, 0L);
- }
- }
- }
+ deltaRowCount =
tableMeta.getBaseIndexDeltaRowCount(olapScan.getTable());
}
return deltaRowCount;
}
- private void adjustColStats(OlapScan olapScan, SlotReference slot,
- ColumnStatisticBuilder builder) {
- if (builder.getAvgSizeByte() <= 0) {
-
builder.setAvgSizeByte(slot.getDataType().toCatalogDataType().getSlotSize());
- }
- long delta = computeDeltaRowCount(olapScan, slot);
- if (delta > 0) {
- builder.setCount(builder.getCount() + delta);
- // clear min-max to avoid error estimation
- // for example, after yesterday data loaded, user send query about
yesterday immediately.
- // since yesterday data are not analyzed, the max date is before
yesterday, and hence optimizer
- // estimates the filter result is zero
- builder.setMinExpr(null).setMinValue(Double.NEGATIVE_INFINITY)
- .setMaxExpr(null).setMaxValue(Double.POSITIVE_INFINITY);
- }
-
- }
-
private ColumnStatistic getColumnStatsFromTableCache(CatalogRelation
catalogRelation, SlotReference slot) {
long idxId = -1;
if (catalogRelation instanceof OlapScan) {
@@ -462,6 +429,8 @@ public class StatsCalculator extends
DefaultPlanVisitor<Statistics, Void> {
}
// build Stats for olapScan
+ double deltaRowCount = computeDeltaRowCount(olapScan);
+ builder.setDeltaRowCount(deltaRowCount);
// if slot is invisible, use UNKNOWN
List<SlotReference> visibleOutputSlots = new ArrayList<>();
for (Slot slot : ((Plan) olapScan).getOutput()) {
@@ -484,22 +453,22 @@ public class StatsCalculator extends
DefaultPlanVisitor<Statistics, Void> {
ColumnStatistic cache =
getColumnStatsFromPartitionCache(olapScan, slot, selectedPartitionNames);
ColumnStatisticBuilder colStatsBuilder = new
ColumnStatisticBuilder(cache);
colStatsBuilder.setCount(selectedPartitionsRowCount);
- adjustColStats(olapScan, slot, colStatsBuilder);
+ colStatsBuilder.normalizeAvgSizeByte(slot);
builder.putColumnStatistics(slot, colStatsBuilder.build());
}
checkIfUnknownStatsUsedAsKey(builder);
- builder.setRowCount(selectedPartitionsRowCount);
+ builder.setRowCount(selectedPartitionsRowCount +
deltaRowCount);
} else {
// if partition row count is invalid (-1), fallback to table
stats
for (SlotReference slot : visibleOutputSlots) {
ColumnStatistic cache =
getColumnStatsFromTableCache((CatalogRelation) olapScan, slot);
ColumnStatisticBuilder colStatsBuilder = new
ColumnStatisticBuilder(cache);
colStatsBuilder.setCount(tableRowCount);
- adjustColStats(olapScan, slot, colStatsBuilder);
+ colStatsBuilder.normalizeAvgSizeByte(slot);
builder.putColumnStatistics(slot, colStatsBuilder.build());
}
checkIfUnknownStatsUsedAsKey(builder);
- builder.setRowCount(tableRowCount);
+ builder.setRowCount(tableRowCount + deltaRowCount);
}
} else {
// get table level stats
@@ -507,11 +476,11 @@ public class StatsCalculator extends
DefaultPlanVisitor<Statistics, Void> {
ColumnStatistic cache =
getColumnStatsFromTableCache((CatalogRelation) olapScan, slot);
ColumnStatisticBuilder colStatsBuilder = new
ColumnStatisticBuilder(cache);
colStatsBuilder.setCount(tableRowCount);
- adjustColStats(olapScan, slot, colStatsBuilder);
+ colStatsBuilder.normalizeAvgSizeByte(slot);
builder.putColumnStatistics(slot, colStatsBuilder.build());
}
checkIfUnknownStatsUsedAsKey(builder);
- builder.setRowCount(tableRowCount);
+ builder.setRowCount(tableRowCount + deltaRowCount);
}
return builder.build();
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java
index c02ea39e39a..446ccc7fd00 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java
@@ -26,10 +26,14 @@ import org.apache.doris.nereids.types.StringType;
*/
public abstract class CharacterType extends PrimitiveType {
+ public static final int DEFAULT_SLOT_SIZE = 20;
private static final int WIDTH = 16;
protected final int len;
+ // When defining SQL schemas, users often tend to set the length of string
+ // fields much longer than actually needed for storage.
+
public CharacterType(int len) {
this.len = len;
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
index a512fbadbda..4c8df0bf677 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
@@ -18,6 +18,8 @@
package org.apache.doris.statistics;
import org.apache.doris.analysis.LiteralExpr;
+import org.apache.doris.nereids.trees.expressions.SlotReference;
+import org.apache.doris.nereids.types.coercion.CharacterType;
public class ColumnStatisticBuilder {
private double count;
@@ -170,4 +172,20 @@ public class ColumnStatisticBuilder {
isUnknown, updatedTime);
return colStats;
}
+
+ public void normalizeAvgSizeByte(SlotReference slot) {
+ if (isUnknown) {
+ return;
+ }
+ if (avgSizeByte > 0) {
+ return;
+ }
+ avgSizeByte = slot.getDataType().toCatalogDataType().getSlotSize();
+ // When defining SQL schemas, users often tend to set the length of
string \
+ // fields much longer than actually needed for storage.
+ if (slot.getDataType() instanceof CharacterType) {
+ avgSizeByte = Math.min(avgSizeByte,
+ CharacterType.DEFAULT_SLOT_SIZE);
+ }
+ }
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java
index 7101fba5afe..162dab5d136 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java
@@ -22,6 +22,7 @@ import org.apache.doris.nereids.stats.StatsMathUtil;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.Slot;
import org.apache.doris.nereids.trees.expressions.SlotReference;
+import org.apache.doris.nereids.types.coercion.CharacterType;
import java.text.DecimalFormat;
import java.util.HashMap;
@@ -43,15 +44,31 @@ public class Statistics {
// the byte size of one tuple
private double tupleSize;
- public Statistics(double rowCount, Map<Expression, ColumnStatistic>
expressionToColumnStats) {
- this(rowCount, 1, expressionToColumnStats);
+ private double deltaRowCount = 0.0;
+
+ public Statistics(Statistics another) {
+ this.rowCount = another.rowCount;
+ this.widthInJoinCluster = another.widthInJoinCluster;
+ this.expressionToColumnStats = new
HashMap<>(another.expressionToColumnStats);
+ this.tupleSize = another.tupleSize;
+ this.deltaRowCount = another.getDeltaRowCount();
}
public Statistics(double rowCount, int widthInJoinCluster,
- Map<Expression, ColumnStatistic>
expressionToColumnStats) {
+ Map<Expression, ColumnStatistic> expressionToColumnStats, double
deltaRowCount) {
this.rowCount = rowCount;
this.widthInJoinCluster = widthInJoinCluster;
this.expressionToColumnStats = expressionToColumnStats;
+ this.deltaRowCount = deltaRowCount;
+ }
+
+ public Statistics(double rowCount, Map<Expression, ColumnStatistic>
expressionToColumnStats) {
+ this(rowCount, 1, expressionToColumnStats, 0);
+ }
+
+ public Statistics(double rowCount, int widthInJoinCluster,
+ Map<Expression, ColumnStatistic> expressionToColumnStats) {
+ this(rowCount, widthInJoinCluster, expressionToColumnStats, 0);
}
public ColumnStatistic findColumnStatistics(Expression expression) {
@@ -133,7 +150,7 @@ public class Statistics {
for (Slot slot : slots) {
ColumnStatistic s = expressionToColumnStats.get(slot);
if (s != null) {
- tempSize += Math.max(1, Math.min(20, s.avgSizeByte));
+ tempSize += Math.max(1,
Math.min(CharacterType.DEFAULT_SLOT_SIZE, s.avgSizeByte));
}
}
tupleSize = Math.max(1, tempSize);
@@ -186,7 +203,11 @@ public class Statistics {
return "-Infinite";
}
DecimalFormat format = new DecimalFormat("#,###.##");
- return format.format(rowCount);
+ String rows = format.format(rowCount);
+ if (deltaRowCount > 0) {
+ rows = rows + "(" + format.format(deltaRowCount) + ")";
+ }
+ return rows;
}
public String printColumnStats() {
@@ -263,4 +284,12 @@ public class Statistics {
}
return builder.build();
}
+
+ public double getDeltaRowCount() {
+ return deltaRowCount;
+ }
+
+ public void setDeltaRowCount(double deltaRowCount) {
+ this.deltaRowCount = deltaRowCount;
+ }
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java
index 29f04f2926e..947995f7902 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java
@@ -29,6 +29,8 @@ public class StatisticsBuilder {
private int widthInJoinCluster = 1;
private final Map<Expression, ColumnStatistic> expressionToColumnStats;
+ private double deltaRowCount = 0.0;
+
public StatisticsBuilder() {
expressionToColumnStats = new HashMap<>();
}
@@ -36,6 +38,7 @@ public class StatisticsBuilder {
public StatisticsBuilder(Statistics statistics) {
this.rowCount = statistics.getRowCount();
this.widthInJoinCluster = statistics.getWidthInJoinCluster();
+ this.deltaRowCount = statistics.getDeltaRowCount();
expressionToColumnStats = new HashMap<>();
expressionToColumnStats.putAll(statistics.columnStatistics());
}
@@ -50,6 +53,11 @@ public class StatisticsBuilder {
return this;
}
+ public StatisticsBuilder setDeltaRowCount(double deltaRowCount) {
+ this.deltaRowCount = deltaRowCount;
+ return this;
+ }
+
public StatisticsBuilder putColumnStatistics(
Map<Expression, ColumnStatistic> expressionToColumnStats) {
this.expressionToColumnStats.putAll(expressionToColumnStats);
@@ -66,6 +74,6 @@ public class StatisticsBuilder {
}
public Statistics build() {
- return new Statistics(rowCount, widthInJoinCluster,
expressionToColumnStats);
+ return new Statistics(rowCount, widthInJoinCluster,
expressionToColumnStats, deltaRowCount);
}
}
diff --git a/regression-test/suites/nereids_p0/delta_row/delta_row.groovy
b/regression-test/suites/nereids_p0/delta_row/delta_row.groovy
new file mode 100644
index 00000000000..c6f40f5363f
--- /dev/null
+++ b/regression-test/suites/nereids_p0/delta_row/delta_row.groovy
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("delta_row") {
+ String database = context.config.getDbNameByFile(context.file)
+ sql """
+ drop database if exists ${database};
+ create database ${database};
+ use ${database};
+ CREATE TABLE IF NOT EXISTS t (
+ k int(11) null comment "",
+ v string replace null comment "",
+ ) engine=olap
+ DISTRIBUTED BY HASH(k) BUCKETS 5 properties("replication_num" = "1");
+
+ insert into t values (1, "a"),(2, "b"),(3, 'c'),(4,'d');
+ analyze table t with sync;
+ """
+ explain {
+ sql "physical plan select * from t where k > 6"
+ contains("stats=0,")
+ contains("stats=4 ")
+ // PhysicalResultSink[75] ( outputExprs=[k#0, v#1] )
+ // +--PhysicalFilter[72]@1 ( stats=0, predicates=(k#0 > 6) )
+ // +--PhysicalOlapScan[t]@0 ( stats=4 )
+ }
+
+ sql "set global enable_auto_analyze=false;"
+
+ sql "insert into t values (10, 'c');"
+ explain {
+ sql "physical plan select * from t where k > 6"
+ contains("stats=0.5,")
+ contains("stats=5(1)")
+ notContains("stats=0,")
+ notContains("stats=4 ")
+// PhysicalResultSink[75] ( outputExprs=[k#0, v#1] )
+// +--PhysicalFilter[72]@1 ( stats=0.5, predicates=(k#0 > 6) )
+// +--PhysicalOlapScan[t]@0 ( stats=5(1) )
+ }
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]