This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new ce67829356c [nereids] consider numNulls in filter estimation #29184
(#29496)
ce67829356c is described below
commit ce67829356cfacb2393d3c1e758ab7a73ac4a516
Author: xzj7019 <[email protected]>
AuthorDate: Fri Jan 5 11:31:50 2024 +0800
[nereids] consider numNulls in filter estimation #29184 (#29496)
---
.../doris/nereids/stats/FilterEstimation.java | 59 ++++--
.../doris/nereids/stats/FilterEstimationTest.java | 203 ++++++++++++++++++++-
2 files changed, 243 insertions(+), 19 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
index b716b350f24..c086aaef5c8 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
@@ -117,6 +117,8 @@ public class FilterEstimation extends
ExpressionVisitor<Statistics, EstimationCo
colBuilder.setMinValue(union.getLow()).setMinExpr(union.getLowExpr())
.setMaxValue(union.getHigh()).setMaxExpr(union.getHighExpr())
.setNdv(union.getDistinctValues());
+ double maxNumNulls = Math.max(leftColStats.numNulls,
rightColStats.numNulls);
+ colBuilder.setNumNulls(Math.min(colBuilder.getCount(),
maxNumNulls));
orStats.addColumnStats(slot, colBuilder.build());
}
}
@@ -175,7 +177,7 @@ public class FilterEstimation extends
ExpressionVisitor<Statistics, EstimationCo
}
private Statistics updateLessThanLiteral(Expression leftExpr,
ColumnStatistic statsForLeft,
- ColumnStatistic statsForRight, EstimationContext context, boolean
contains) {
+ ColumnStatistic statsForRight, EstimationContext context) {
StatisticRange rightRange = new StatisticRange(statsForLeft.minValue,
statsForLeft.minExpr,
statsForRight.maxValue, statsForRight.maxExpr,
statsForLeft.ndv, leftExpr.getDataType());
@@ -185,7 +187,7 @@ public class FilterEstimation extends
ExpressionVisitor<Statistics, EstimationCo
}
private Statistics updateGreaterThanLiteral(Expression leftExpr,
ColumnStatistic statsForLeft,
- ColumnStatistic statsForRight, EstimationContext context, boolean
contains) {
+ ColumnStatistic statsForRight, EstimationContext context) {
StatisticRange rightRange = new StatisticRange(statsForRight.minValue,
statsForRight.minExpr,
statsForLeft.maxValue, statsForLeft.maxExpr,
statsForLeft.ndv, leftExpr.getDataType());
@@ -202,12 +204,9 @@ public class FilterEstimation extends
ExpressionVisitor<Statistics, EstimationCo
return estimateEqualTo(cp, statsForLeft, statsForRight, context);
} else {
if (cp instanceof LessThan || cp instanceof LessThanEqual) {
- return updateLessThanLiteral(cp.left(), statsForLeft,
statsForRight,
- context, cp instanceof LessThanEqual);
+ return updateLessThanLiteral(cp.left(), statsForLeft,
statsForRight, context);
} else if (cp instanceof GreaterThan || cp instanceof
GreaterThanEqual) {
-
- return updateGreaterThanLiteral(cp.left(), statsForLeft,
statsForRight, context,
- cp instanceof GreaterThanEqual);
+ return updateGreaterThanLiteral(cp.left(), statsForLeft,
statsForRight, context);
} else {
throw new RuntimeException(String.format("Unexpected
expression : %s", cp.toSql()));
}
@@ -225,6 +224,7 @@ public class FilterEstimation extends
ExpressionVisitor<Statistics, EstimationCo
} else {
selectivity = StatsMathUtil.minNonNaN(1.0, 1.0 / ndv);
}
+ selectivity = getNotNullSelectivity(statsForLeft, selectivity);
Statistics equalStats = context.statistics.withSel(selectivity);
Expression left = cp.left();
equalStats.addColumnStats(left, statsForRight);
@@ -331,10 +331,12 @@ public class FilterEstimation extends
ExpressionVisitor<Statistics, EstimationCo
selectivity = 1.0;
}
}
+ compareExprStatsBuilder.setNumNulls(0);
Statistics estimated = new Statistics(context.statistics);
+ ColumnStatistic stats = compareExprStatsBuilder.build();
+ selectivity = getNotNullSelectivity(stats, selectivity);
estimated = estimated.withSel(selectivity);
- estimated.addColumnStats(compareExpr,
- compareExprStatsBuilder.build());
+ estimated.addColumnStats(compareExpr, stats);
context.addKeyIfSlot(compareExpr);
return estimated;
}
@@ -394,6 +396,11 @@ public class FilterEstimation extends
ExpressionVisitor<Statistics, EstimationCo
.setMaxValue(originColStats.maxValue)
.setMaxExpr(originColStats.maxExpr);
}
+ if (not.child().getInputSlots().size() == 1 && !(child
instanceof IsNull)) {
+ // only consider the single column numNull, otherwise,
ignore
+ rowCount = Math.max(rowCount - originColStats.numNulls, 1);
+ statisticsBuilder.setRowCount(rowCount);
+ }
statisticsBuilder.putColumnStatistics(slot,
colBuilder.build());
}
}
@@ -460,15 +467,18 @@ public class FilterEstimation extends
ExpressionVisitor<Statistics, EstimationCo
.setMaxValue(Double.POSITIVE_INFINITY)
.setMaxExpr(null)
.setNdv(0)
- .setCount(0);
+ .setCount(0)
+ .setNumNulls(0);
} else {
leftColumnStatisticBuilder = new ColumnStatisticBuilder(leftStats)
.setMinValue(intersectRange.getLow())
.setMinExpr(intersectRange.getLowExpr())
.setMaxValue(intersectRange.getHigh())
.setMaxExpr(intersectRange.getHighExpr())
- .setNdv(intersectRange.getDistinctValues());
+ .setNdv(intersectRange.getDistinctValues())
+ .setNumNulls(0);
double sel = leftRange.overlapPercentWith(rightRange);
+ sel = getNotNullSelectivity(leftStats, sel);
updatedStatistics = context.statistics.withSel(sel);
leftColumnStatisticBuilder.setCount(updatedStatistics.getRowCount());
}
@@ -488,6 +498,7 @@ public class FilterEstimation extends
ExpressionVisitor<Statistics, EstimationCo
intersectBuilder.setNdv(intersect.getDistinctValues());
intersectBuilder.setMinValue(intersect.getLow());
intersectBuilder.setMaxValue(intersect.getHigh());
+ intersectBuilder.setNumNulls(0);
double sel = 1 / StatsMathUtil.nonZeroDivisor(Math.max(leftStats.ndv,
rightStats.ndv));
Statistics updatedStatistics = context.statistics.withSel(sel);
updatedStatistics.addColumnStats(leftExpr, intersectBuilder.build());
@@ -568,10 +579,34 @@ public class FilterEstimation extends
ExpressionVisitor<Statistics, EstimationCo
"col stats not found. slot=%s in %s",
like.left().toSql(), like.toSql());
ColumnStatisticBuilder colBuilder = new
ColumnStatisticBuilder(origin);
- colBuilder.setNdv(origin.ndv *
DEFAULT_LIKE_COMPARISON_SELECTIVITY).setNumNulls(0);
+ double selectivity =
StatsMathUtil.divide(DEFAULT_LIKE_COMPARISON_SELECTIVITY, origin.ndv);
+ double notNullSel = getNotNullSelectivity(origin, selectivity);
+ colBuilder.setNdv(origin.ndv * DEFAULT_LIKE_COMPARISON_SELECTIVITY)
+ .setCount(notNullSel *
context.statistics.getRowCount()).setNumNulls(0);
statsBuilder.putColumnStatistics(like.left(), colBuilder.build());
context.addKeyIfSlot(like.left());
}
return statsBuilder.build();
}
+
+ private double getNotNullSelectivity(ColumnStatistic stats, double
origSel) {
+ double rowCount = stats.count;
+ double numNulls = stats.numNulls;
+
+ // comment following check since current rowCount and ndv may be
inconsistant
+ // e.g, rowCount has been reduced by one filter but another filter
column's
+ // ndv and numNull remains originally, which will unexpectedly go into
the following
+ // normalization.
+
+ //if (numNulls > rowCount - ndv) {
+ // numNulls = rowCount - ndv > 0 ? rowCount - ndv : 0;
+ //}
+ double notNullSel = rowCount <= 1.0 ? 1.0 : 1 -
getValidSelectivity(numNulls / rowCount);
+ double validSel = origSel * notNullSel;
+ return getValidSelectivity(validSel);
+ }
+
+ private static double getValidSelectivity(double nullSel) {
+ return nullSel < 0 ? 0 : (nullSel > 1 ? 1 : nullSel);
+ }
}
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
index 888ec139df1..177fac64f16 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
@@ -64,10 +64,10 @@ class FilterEstimationTest {
Or or = new Or(greaterThan1, lessThan);
Map<Expression, ColumnStatistic> columnStat = new HashMap<>();
ColumnStatistic aStats = new
ColumnStatisticBuilder().setCount(500).setNdv(500).setAvgSizeByte(4)
- .setNumNulls(500).setDataSize(0)
+ .setNumNulls(0).setDataSize(0)
.setMinValue(0).setMaxValue(1000).setMinExpr(null).build();
ColumnStatistic bStats = new
ColumnStatisticBuilder().setCount(500).setNdv(500).setAvgSizeByte(4)
- .setNumNulls(500).setDataSize(0)
+ .setNumNulls(0).setDataSize(0)
.setMinValue(0).setMaxValue(1000).setMinExpr(null).setIsUnknown(true).build();
columnStat.put(a, aStats);
columnStat.put(b, bStats);
@@ -93,10 +93,10 @@ class FilterEstimationTest {
And and = new And(greaterThan1, lessThan);
Map<Expression, ColumnStatistic> columnStat = new HashMap<>();
ColumnStatistic aStats = new
ColumnStatisticBuilder().setCount(500).setNdv(500)
- .setAvgSizeByte(4).setNumNulls(500).setDataSize(0)
+ .setAvgSizeByte(4).setNumNulls(0).setDataSize(0)
.setMinValue(0).setMaxValue(1000).setMinExpr(null).build();
ColumnStatistic bStats = new
ColumnStatisticBuilder().setCount(500).setNdv(500)
- .setAvgSizeByte(4).setNumNulls(500).setDataSize(0)
+ .setAvgSizeByte(4).setNumNulls(0).setDataSize(0)
.setMinValue(0).setMaxValue(1000).setMinExpr(null).setIsUnknown(true).build();
columnStat.put(a, aStats);
columnStat.put(b, bStats);
@@ -185,13 +185,13 @@ class FilterEstimationTest {
Or or = new Or(and, equalTo);
Map<Expression, ColumnStatistic> slotToColumnStat = new HashMap<>();
ColumnStatistic aStats = new
ColumnStatisticBuilder().setCount(500).setNdv(500)
- .setAvgSizeByte(4).setNumNulls(500).setDataSize(0)
+ .setAvgSizeByte(4).setNumNulls(0).setDataSize(0)
.setMinValue(0).setMaxValue(1000).setMinExpr(null).build();
ColumnStatistic bStats = new
ColumnStatisticBuilder().setCount(500).setNdv(500)
- .setAvgSizeByte(4).setNumNulls(500).setDataSize(0)
+ .setAvgSizeByte(4).setNumNulls(0).setDataSize(0)
.setMinValue(0).setMaxValue(1000).setMinExpr(null).build();
ColumnStatistic cStats = new
ColumnStatisticBuilder().setCount(500).setNdv(500)
- .setAvgSizeByte(4).setNumNulls(500).setDataSize(0)
+ .setAvgSizeByte(4).setNumNulls(0).setDataSize(0)
.setMinValue(0).setMaxValue(1000).setMinExpr(null).build();
slotToColumnStat.put(a, aStats);
slotToColumnStat.put(b, bStats);
@@ -910,4 +910,193 @@ class FilterEstimationTest {
Statistics result = filterEstimation.estimate(not, stats);
Assertions.assertEquals(result.getRowCount(), 90);
}
+
+ /**
+ * a = 1
+ */
+ @Test
+ public void testNumNullsEqualTo() {
+ SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
+ ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
+ .setNdv(2)
+ .setAvgSizeByte(4)
+ .setNumNulls(8)
+ .setMaxValue(2)
+ .setMinValue(1)
+ .setCount(10);
+ IntegerLiteral int1 = new IntegerLiteral(1);
+ EqualTo equalTo = new EqualTo(a, int1);
+ Statistics stats = new Statistics(10, new HashMap<>());
+ stats.addColumnStats(a, builder.build());
+ FilterEstimation filterEstimation = new FilterEstimation();
+ Statistics result = filterEstimation.estimate(equalTo, stats);
+ Assertions.assertEquals(result.getRowCount(), 1.0, 0.01);
+ }
+
+ /**
+ * a > 1
+ */
+ @Test
+ public void testNumNullsComparable() {
+ SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
+ ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
+ .setNdv(2)
+ .setAvgSizeByte(4)
+ .setNumNulls(8)
+ .setMaxValue(2)
+ .setMinValue(1)
+ .setCount(10);
+ IntegerLiteral int1 = new IntegerLiteral(1);
+ GreaterThan greaterThan = new GreaterThan(a, int1);
+ Statistics stats = new Statistics(10, new HashMap<>());
+ stats.addColumnStats(a, builder.build());
+ FilterEstimation filterEstimation = new FilterEstimation();
+ Statistics result = filterEstimation.estimate(greaterThan, stats);
+ Assertions.assertEquals(result.getRowCount(), 2.0, 0.01);
+ }
+
+ /**
+ * a in (1, 2)
+ */
+ @Test
+ public void testNumNullsIn() {
+ SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
+ ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
+ .setNdv(2)
+ .setAvgSizeByte(4)
+ .setNumNulls(8)
+ .setMaxValue(2)
+ .setMinValue(1)
+ .setCount(10);
+ IntegerLiteral int1 = new IntegerLiteral(1);
+ IntegerLiteral int2 = new IntegerLiteral(2);
+ InPredicate in = new InPredicate(a, Lists.newArrayList(int1, int2));
+ Statistics stats = new Statistics(10, new HashMap<>());
+ stats.addColumnStats(a, builder.build());
+ FilterEstimation filterEstimation = new FilterEstimation();
+ Statistics result = filterEstimation.estimate(in, stats);
+ Assertions.assertEquals(result.getRowCount(), 10.0, 0.01);
+ }
+
+ /**
+ * not a = 1
+ */
+ @Test
+ public void testNumNullsNotEqualTo() {
+ SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
+ ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
+ .setNdv(2)
+ .setAvgSizeByte(4)
+ .setNumNulls(8)
+ .setMaxValue(2)
+ .setMinValue(1)
+ .setCount(10);
+ IntegerLiteral int1 = new IntegerLiteral(1);
+ EqualTo equalTo = new EqualTo(a, int1);
+ Not not = new Not(equalTo);
+ Statistics stats = new Statistics(10, new HashMap<>());
+ stats.addColumnStats(a, builder.build());
+ FilterEstimation filterEstimation = new FilterEstimation();
+ Statistics result = filterEstimation.estimate(not, stats);
+ Assertions.assertEquals(result.getRowCount(), 1.0, 0.01);
+ }
+
+ /**
+ * a not in (1, 2)
+ */
+ @Test
+ public void testNumNullsNotIn() {
+ SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
+ ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
+ .setNdv(2)
+ .setAvgSizeByte(4)
+ .setNumNulls(8)
+ .setMaxValue(2)
+ .setMinValue(1)
+ .setCount(10);
+ IntegerLiteral int1 = new IntegerLiteral(1);
+ IntegerLiteral int2 = new IntegerLiteral(2);
+ InPredicate in = new InPredicate(a, Lists.newArrayList(int1, int2));
+ Not not = new Not(in);
+ Statistics stats = new Statistics(10, new HashMap<>());
+ stats.addColumnStats(a, builder.build());
+ FilterEstimation filterEstimation = new FilterEstimation();
+ Statistics result = filterEstimation.estimate(not, stats);
+ Assertions.assertEquals(result.getRowCount(), 1.0, 0.01);
+ }
+
+ /**
+ * a >= 1 and a <= 2
+ */
+ @Test
+ public void testNumNullsAnd() {
+ SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
+ ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
+ .setNdv(2)
+ .setAvgSizeByte(4)
+ .setNumNulls(8)
+ .setMaxValue(2)
+ .setMinValue(1)
+ .setCount(10);
+ IntegerLiteral int1 = new IntegerLiteral(1);
+ IntegerLiteral int2 = new IntegerLiteral(2);
+ GreaterThanEqual greaterThanEqual = new GreaterThanEqual(a, int1);
+ LessThanEqual lessThanEqual = new LessThanEqual(a, int2);
+ And and = new And(greaterThanEqual, lessThanEqual);
+ Statistics stats = new Statistics(10, new HashMap<>());
+ stats.addColumnStats(a, builder.build());
+ FilterEstimation filterEstimation = new FilterEstimation();
+ Statistics result = filterEstimation.estimate(and, stats);
+ Assertions.assertEquals(result.getRowCount(), 2.0, 0.01);
+ }
+
+ /**
+ * a >= 1 or a <= 2
+ */
+ @Test
+ public void testNumNullsOr() {
+ SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
+ ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
+ .setNdv(2)
+ .setAvgSizeByte(4)
+ .setNumNulls(8)
+ .setMaxValue(2)
+ .setMinValue(1)
+ .setCount(10);
+ IntegerLiteral int1 = new IntegerLiteral(1);
+ IntegerLiteral int2 = new IntegerLiteral(2);
+ GreaterThanEqual greaterThanEqual = new GreaterThanEqual(a, int2);
+ LessThanEqual lessThanEqual = new LessThanEqual(a, int1);
+ Or or = new Or(greaterThanEqual, lessThanEqual);
+ Statistics stats = new Statistics(10, new HashMap<>());
+ stats.addColumnStats(a, builder.build());
+ FilterEstimation filterEstimation = new FilterEstimation();
+ Statistics result = filterEstimation.estimate(or, stats);
+ Assertions.assertEquals(result.getRowCount(), 2.0, 0.01);
+ }
+
+ /**
+ * a >= 1 or a is null
+ */
+ @Test
+ public void testNumNullsOrIsNull() {
+ SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
+ ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
+ .setNdv(2)
+ .setAvgSizeByte(4)
+ .setNumNulls(8)
+ .setMaxValue(2)
+ .setMinValue(1)
+ .setCount(10);
+ IntegerLiteral int1 = new IntegerLiteral(1);
+ GreaterThanEqual greaterThanEqual = new GreaterThanEqual(a, int1);
+ IsNull isNull = new IsNull(a);
+ Or or = new Or(greaterThanEqual, isNull);
+ Statistics stats = new Statistics(10, new HashMap<>());
+ stats.addColumnStats(a, builder.build());
+ FilterEstimation filterEstimation = new FilterEstimation();
+ Statistics result = filterEstimation.estimate(or, stats);
+ Assertions.assertEquals(result.getRowCount(), 10.0, 0.01);
+ }
+
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]