This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 7a7bc6a7f73 [opt](nereids) refine expression estimation (#40698)
7a7bc6a7f73 is described below
commit 7a7bc6a7f73d81847f1416dca3485ed18589c274
Author: xzj7019 <[email protected]>
AuthorDate: Tue Sep 24 14:15:22 2024 +0800
[opt](nereids) refine expression estimation (#40698)
Stats deriving refinement step 2: refine expression estimation (part I)
a. refine casewhen/if/literal's avgDataSize/numNull/ndv info.
b. search the column statistics cache first when visiting an expression.
c. fix StringType's width() returning -1.
---
.../doris/nereids/stats/ExpressionEstimation.java | 27 ++++++----
.../doris/nereids/stats/FilterEstimation.java | 1 +
.../org/apache/doris/nereids/types/StringType.java | 5 --
.../nereids/types/coercion/CharacterType.java | 2 +-
.../doris/statistics/ColumnStatisticBuilder.java | 3 +-
.../org/apache/doris/statistics/Statistics.java | 2 +-
.../nereids/stats/ExpressionEstimationTest.java | 63 ++++++++++++++++++++++
7 files changed, 85 insertions(+), 18 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java
index 126e9041721..b26b0315047 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java
@@ -128,6 +128,10 @@ public class ExpressionEstimation extends
ExpressionVisitor<ColumnStatistic, Sta
@Override
public ColumnStatistic visit(Expression expr, Statistics context) {
+ ColumnStatistic stats = context.findColumnStatistics(expr);
+ if (stats != null) {
+ return stats;
+ }
List<Expression> childrenExpr = expr.children();
if (CollectionUtils.isEmpty(childrenExpr)) {
return ColumnStatistic.UNKNOWN;
@@ -135,26 +139,28 @@ public class ExpressionEstimation extends
ExpressionVisitor<ColumnStatistic, Sta
return expr.child(0).accept(this, context);
}
- //TODO: case-when need to re-implemented
@Override
public ColumnStatistic visitCaseWhen(CaseWhen caseWhen, Statistics
context) {
double ndv = caseWhen.getWhenClauses().size();
+ double width = 1;
if (caseWhen.getDefaultValue().isPresent()) {
ndv += 1;
}
for (WhenClause clause : caseWhen.getWhenClauses()) {
ColumnStatistic colStats =
ExpressionEstimation.estimate(clause.getResult(), context);
ndv = Math.max(ndv, colStats.ndv);
+ width = Math.max(width, clause.getResult().getDataType().width());
}
if (caseWhen.getDefaultValue().isPresent()) {
ColumnStatistic colStats =
ExpressionEstimation.estimate(caseWhen.getDefaultValue().get(), context);
ndv = Math.max(ndv, colStats.ndv);
+ width = Math.max(width,
caseWhen.getDefaultValue().get().getDataType().width());
}
return new ColumnStatisticBuilder()
.setNdv(ndv)
.setMinValue(Double.NEGATIVE_INFINITY)
.setMaxValue(Double.POSITIVE_INFINITY)
- .setAvgSizeByte(8)
+ .setAvgSizeByte(width)
.setNumNulls(0)
.build();
}
@@ -162,15 +168,20 @@ public class ExpressionEstimation extends
ExpressionVisitor<ColumnStatistic, Sta
@Override
public ColumnStatistic visitIf(If ifClause, Statistics context) {
double ndv = 2;
+ double width = 1;
ColumnStatistic colStatsThen =
ExpressionEstimation.estimate(ifClause.child(1), context);
ndv = Math.max(ndv, colStatsThen.ndv);
+ width = Math.max(width, ifClause.child(1).getDataType().width());
+
ColumnStatistic colStatsElse =
ExpressionEstimation.estimate(ifClause.child(2), context);
ndv = Math.max(ndv, colStatsElse.ndv);
+ width = Math.max(width, ifClause.child(2).getDataType().width());
+
return new ColumnStatisticBuilder()
.setNdv(ndv)
.setMinValue(Double.NEGATIVE_INFINITY)
.setMaxValue(Double.POSITIVE_INFINITY)
- .setAvgSizeByte(8)
+ .setAvgSizeByte(width)
.setNumNulls(0)
.build();
}
@@ -242,9 +253,9 @@ public class ExpressionEstimation extends
ExpressionVisitor<ColumnStatistic, Sta
return new ColumnStatisticBuilder()
.setMaxValue(literalVal)
.setMinValue(literalVal)
- .setNdv(1)
+ .setNdv(literal.isNullLiteral() ? 0 : 1)
.setNumNulls(literal.isNullLiteral() ? 1 : 0)
- .setAvgSizeByte(1)
+ .setAvgSizeByte(literal.getDataType().width())
.setMinExpr(literal.toLegacyLiteral())
.setMaxExpr(literal.toLegacyLiteral())
.build();
@@ -343,8 +354,7 @@ public class ExpressionEstimation extends
ExpressionVisitor<ColumnStatistic, Sta
return ColumnStatistic.UNKNOWN;
}
// if this is scalar agg, we will update count and ndv to 1 when
visiting group clause
- return new ColumnStatisticBuilder(columnStat)
- .build();
+ return new ColumnStatisticBuilder(columnStat).build();
}
@Override
@@ -355,8 +365,7 @@ public class ExpressionEstimation extends
ExpressionVisitor<ColumnStatistic, Sta
return ColumnStatistic.UNKNOWN;
}
// if this is scalar agg, we will update count and ndv to 1 when
visiting group clause
- return new ColumnStatisticBuilder(columnStat)
- .build();
+ return new ColumnStatisticBuilder(columnStat).build();
}
@Override
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
index b3576a0e58e..e7a62dcd484 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
@@ -334,6 +334,7 @@ public class FilterEstimation extends
ExpressionVisitor<Statistics, EstimationCo
} else {
double val = statsForRight.maxValue;
if (val > statsForLeft.maxValue || val <
statsForLeft.minValue) {
+ // TODO: will fix this in the next pr by adding
RangeScalable protection
selectivity = 0.0;
} else if (ndv >= 1.0) {
selectivity = StatsMathUtil.minNonNaN(1.0, 1.0 / ndv);
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/StringType.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/StringType.java
index 935716e42bf..8e92f83274e 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/StringType.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/StringType.java
@@ -31,11 +31,6 @@ public class StringType extends CharacterType {
super(-1);
}
- @Override
- public int width() {
- return len;
- }
-
@Override
public Type toCatalogDataType() {
return Type.STRING;
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java
index 446ccc7fd00..781b1257028 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java
@@ -26,8 +26,8 @@ import org.apache.doris.nereids.types.StringType;
*/
public abstract class CharacterType extends PrimitiveType {
- public static final int DEFAULT_SLOT_SIZE = 20;
private static final int WIDTH = 16;
+ public static final int DEFAULT_WIDTH = WIDTH;
protected final int len;
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
index 47002355de9..4e190ce388e 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
@@ -199,8 +199,7 @@ public class ColumnStatisticBuilder {
// When defining SQL schemas, users often tend to set the length of
string \
// fields much longer than actually needed for storage.
if (slot.getDataType() instanceof CharacterType) {
- avgSizeByte = Math.min(avgSizeByte,
- CharacterType.DEFAULT_SLOT_SIZE);
+ avgSizeByte = Math.min(avgSizeByte, CharacterType.DEFAULT_WIDTH);
}
}
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java
index e18dc097920..7e539ef68f2 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java
@@ -151,7 +151,7 @@ public class Statistics {
for (Slot slot : slots) {
ColumnStatistic s = expressionToColumnStats.get(slot);
if (s != null) {
- tempSize += Math.max(1,
Math.min(CharacterType.DEFAULT_SLOT_SIZE, s.avgSizeByte));
+ tempSize += Math.max(1,
Math.min(CharacterType.DEFAULT_WIDTH, s.avgSizeByte));
}
}
tupleSize = Math.max(1, tempSize);
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/ExpressionEstimationTest.java
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/ExpressionEstimationTest.java
index 5f91e2b70d8..91da5192b48 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/ExpressionEstimationTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/ExpressionEstimationTest.java
@@ -31,7 +31,14 @@ import org.apache.doris.nereids.trees.expressions.WhenClause;
import org.apache.doris.nereids.trees.expressions.functions.agg.Max;
import org.apache.doris.nereids.trees.expressions.functions.agg.Min;
import org.apache.doris.nereids.trees.expressions.functions.scalar.If;
+import org.apache.doris.nereids.trees.expressions.literal.BigIntLiteral;
import org.apache.doris.nereids.trees.expressions.literal.BooleanLiteral;
+import org.apache.doris.nereids.trees.expressions.literal.DateTimeLiteral;
+import org.apache.doris.nereids.trees.expressions.literal.DateV2Literal;
+import org.apache.doris.nereids.trees.expressions.literal.DecimalLiteral;
+import org.apache.doris.nereids.trees.expressions.literal.DoubleLiteral;
+import org.apache.doris.nereids.trees.expressions.literal.NullLiteral;
+import org.apache.doris.nereids.trees.expressions.literal.VarcharLiteral;
import org.apache.doris.nereids.types.DateType;
import org.apache.doris.nereids.types.DoubleType;
import org.apache.doris.nereids.types.IntegerType;
@@ -44,6 +51,7 @@ import org.apache.commons.math3.util.Precision;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
+import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -357,6 +365,7 @@ class ExpressionEstimationTest {
CaseWhen caseWhen = new CaseWhen(whens);
ColumnStatistic est = ExpressionEstimation.estimate(caseWhen, stats);
Assertions.assertEquals(est.ndv, 100);
+ Assertions.assertEquals(est.avgSizeByte, 16);
}
@Test
@@ -383,5 +392,59 @@ class ExpressionEstimationTest {
If ifClause = new If(BooleanLiteral.TRUE, a, b);
ColumnStatistic est = ExpressionEstimation.estimate(ifClause, stats);
Assertions.assertEquals(est.ndv, 100);
+ Assertions.assertEquals(est.avgSizeByte, 16);
+ }
+
+ @Test
+ public void testLiteral() {
+ Statistics stats = new Statistics(1000, new HashMap<>());
+
+ BigIntLiteral l1 = new BigIntLiteral(1000000);
+ ColumnStatistic est = ExpressionEstimation.estimate(l1, stats);
+ Assertions.assertEquals(est.ndv, 1);
+ Assertions.assertEquals(est.avgSizeByte, 8);
+ Assertions.assertEquals(est.numNulls, 0);
+
+ VarcharLiteral l2 = new VarcharLiteral("abcdefghij");
+ est = ExpressionEstimation.estimate(l2, stats);
+ Assertions.assertEquals(est.ndv, 1);
+ Assertions.assertEquals(est.avgSizeByte, 10);
+ Assertions.assertEquals(est.numNulls, 0);
+
+ DoubleLiteral l3 = new DoubleLiteral(0.01);
+ est = ExpressionEstimation.estimate(l3, stats);
+ Assertions.assertEquals(est.ndv, 1);
+ Assertions.assertEquals(est.avgSizeByte, 8);
+ Assertions.assertEquals(est.numNulls, 0);
+
+ DateV2Literal l4 = new DateV2Literal("2024-09-10");
+ est = ExpressionEstimation.estimate(l4, stats);
+ Assertions.assertEquals(est.ndv, 1);
+ Assertions.assertEquals(est.avgSizeByte, 4);
+ Assertions.assertEquals(est.numNulls, 0);
+
+ DateTimeLiteral l5 = new DateTimeLiteral("2024-09-10 00:00:00");
+ est = ExpressionEstimation.estimate(l5, stats);
+ Assertions.assertEquals(est.ndv, 1);
+ Assertions.assertEquals(est.avgSizeByte, 16);
+ Assertions.assertEquals(est.numNulls, 0);
+
+ BooleanLiteral l6 = BooleanLiteral.TRUE;
+ est = ExpressionEstimation.estimate(l6, stats);
+ Assertions.assertEquals(est.ndv, 1);
+ Assertions.assertEquals(est.avgSizeByte, 1);
+ Assertions.assertEquals(est.numNulls, 0);
+
+ DecimalLiteral l7 = new DecimalLiteral(BigDecimal.valueOf(2024.0928));
+ est = ExpressionEstimation.estimate(l7, stats);
+ Assertions.assertEquals(est.ndv, 1);
+ Assertions.assertEquals(est.avgSizeByte, 16);
+ Assertions.assertEquals(est.numNulls, 0);
+
+ NullLiteral l8 = new NullLiteral();
+ est = ExpressionEstimation.estimate(l8, stats);
+ Assertions.assertEquals(est.ndv, 0);
+ Assertions.assertEquals(est.avgSizeByte, 1);
+ Assertions.assertEquals(est.numNulls, 1);
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]