This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 8f264a7206aba5ec78ae4afba937c195b1847b7d Author: minghong <[email protected]> AuthorDate: Thu May 30 14:32:14 2024 +0800 [opt](nereids) compare str literal as date literal to compute selectivity (#35610) This PR improves #34542 for the case where the real data type is date-like. Some users define date (datetime) columns as Varchar type. When estimating the selectivity of a predicate like A>'2020-01-01', if Nereids treats A and '2020-01-01' as date type, the estimated selectivity is more accurate than when they are compared as string type. --- .../org/apache/doris/analysis/DateLiteral.java | 4 + .../doris/nereids/stats/FilterEstimation.java | 127 +++++++++++++++++++-- .../expressions/literal/StringLikeLiteral.java | 11 +- .../doris/nereids/stats/FilterEstimationTest.java | 31 ++++- 4 files changed, 159 insertions(+), 14 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java index a8148237fb7..1ff103097ef 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java @@ -786,6 +786,10 @@ public class DateLiteral extends LiteralExpr { return getLongValue(); } + public double getDoubleValueAsDateTime() { + return (year * 10000 + month * 100 + day) * 1000000L + hour * 10000 + minute * 100 + second; + } + @Override protected void toThrift(TExprNode msg) { if (type.isDatetimeV2()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java index 2286daaa448..17b1eb39387 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java @@ -17,7 +17,9 @@ package org.apache.doris.nereids.stats; +import org.apache.doris.analysis.DateLiteral; import org.apache.doris.analysis.LiteralExpr; 
+import org.apache.doris.analysis.StringLiteral; import org.apache.doris.nereids.stats.FilterEstimation.EstimationContext; import org.apache.doris.nereids.trees.TreeNode; import org.apache.doris.nereids.trees.expressions.And; @@ -39,7 +41,10 @@ import org.apache.doris.nereids.trees.expressions.Slot; import org.apache.doris.nereids.trees.expressions.SlotReference; import org.apache.doris.nereids.trees.expressions.functions.Function; import org.apache.doris.nereids.trees.expressions.literal.Literal; +import org.apache.doris.nereids.trees.expressions.literal.StringLikeLiteral; import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.DataType; +import org.apache.doris.nereids.types.DateTimeType; import org.apache.doris.nereids.types.coercion.RangeScalable; import org.apache.doris.statistics.ColumnStatistic; import org.apache.doris.statistics.ColumnStatisticBuilder; @@ -50,7 +55,10 @@ import org.apache.doris.statistics.StatisticsBuilder; import com.google.common.base.Preconditions; import com.google.common.collect.Sets; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.function.Predicate; @@ -183,22 +191,22 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo } } - private Statistics updateLessThanLiteral(Expression leftExpr, ColumnStatistic statsForLeft, + private Statistics updateLessThanLiteral(Expression leftExpr, DataType dataType, ColumnStatistic statsForLeft, ColumnStatistic statsForRight, EstimationContext context) { StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, statsForLeft.minExpr, statsForRight.maxValue, statsForRight.maxExpr, - statsForLeft.ndv, leftExpr.getDataType()); - return estimateBinaryComparisonFilter(leftExpr, + statsForLeft.ndv, dataType); + return estimateBinaryComparisonFilter(leftExpr, dataType, statsForLeft, rightRange, context); } - private 
Statistics updateGreaterThanLiteral(Expression leftExpr, ColumnStatistic statsForLeft, + private Statistics updateGreaterThanLiteral(Expression leftExpr, DataType dataType, ColumnStatistic statsForLeft, ColumnStatistic statsForRight, EstimationContext context) { StatisticRange rightRange = new StatisticRange(statsForRight.minValue, statsForRight.minExpr, statsForLeft.maxValue, statsForLeft.maxExpr, - statsForLeft.ndv, leftExpr.getDataType()); - return estimateBinaryComparisonFilter(leftExpr, statsForLeft, rightRange, context); + statsForLeft.ndv, dataType); + return estimateBinaryComparisonFilter(leftExpr, dataType, statsForLeft, rightRange, context); } private Statistics calculateWhenLiteralRight(ComparisonPredicate cp, @@ -210,14 +218,111 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo if (cp instanceof EqualPredicate) { return estimateEqualTo(cp, statsForLeft, statsForRight, context); } else { + // literal Map used to covert dateLiteral back to stringLiteral + Map<DateLiteral, StringLiteral> literalMap = new HashMap<>(); + DataType compareType = cp.left().getDataType(); + Optional<ColumnStatistic> statsForLeftMayConvertedOpt = + tryConvertStringColStatsToDateColStats(statsForLeft, literalMap); + Optional<ColumnStatistic> statsForRightMayConvertedOpt = (statsForLeftMayConvertedOpt.isPresent()) + ? 
tryConvertStringColStatsToDateColStats(statsForRight, literalMap) + : Optional.empty(); + + boolean converted = false; + ColumnStatistic statsForLeftMayConverted = statsForLeft; + ColumnStatistic statsForRightMayConverted = statsForRight; + if (statsForLeftMayConvertedOpt.isPresent() && statsForRightMayConvertedOpt.isPresent() + && statsForRightMayConvertedOpt.get().minExpr.getType() + == statsForLeftMayConvertedOpt.get().minExpr.getType()) { + // string type is converted to date type + converted = true; + compareType = DateTimeType.INSTANCE; + statsForLeftMayConverted = statsForLeftMayConvertedOpt.get(); + statsForRightMayConverted = statsForRightMayConvertedOpt.get(); + } + Statistics result = null; if (cp instanceof LessThan || cp instanceof LessThanEqual) { - return updateLessThanLiteral(cp.left(), statsForLeft, statsForRight, context); + result = updateLessThanLiteral(cp.left(), compareType, statsForLeftMayConverted, + statsForRightMayConverted, context); } else if (cp instanceof GreaterThan || cp instanceof GreaterThanEqual) { - return updateGreaterThanLiteral(cp.left(), statsForLeft, statsForRight, context); + result = updateGreaterThanLiteral(cp.left(), compareType, statsForLeftMayConverted, + statsForRightMayConverted, context); } else { throw new RuntimeException(String.format("Unexpected expression : %s", cp.toSql())); } + if (converted) { + // convert min/max of left.colStats back to string type + ColumnStatistic newLeftStats = result.findColumnStatistics(cp.left()); + result.addColumnStats(cp.left(), convertDateColStatsToStringColStats(newLeftStats, literalMap)); + } + return result; + } + } + + private ColumnStatistic convertDateColStatsToStringColStats(ColumnStatistic colStats, + Map<DateLiteral, StringLiteral> literalMap) { + if (colStats.minExpr == null && colStats.maxExpr == null) { + // when sel=0, minExpr and maxExpr are both null + return colStats; + } + Preconditions.checkArgument(colStats.minExpr instanceof DateLiteral + && colStats.maxExpr 
instanceof DateLiteral, + "cannot convert colStats back to stringType %s", colStats.toString()); + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(colStats); + StringLiteral newMinLiteral = new StringLiteral(colStats.maxExpr.toString()); + return builder.setMaxExpr(newMinLiteral) + .setMaxExpr(literalMap.get(colStats.maxExpr)) + .setMaxValue(StringLikeLiteral.getDouble(colStats.maxExpr.toString())) + .setMinExpr(literalMap.get(colStats.minExpr)) + .setMinValue(StringLikeLiteral.getDouble(colStats.minExpr.getStringValue())) + .build(); + } + + private Optional<ColumnStatistic> tryConvertStringColStatsToDateColStats(ColumnStatistic colStats, + Map<DateLiteral, StringLiteral> literalMap) { + if (colStats.minExpr == null || colStats.maxExpr == null) { + return Optional.empty(); + } + if (!(colStats.minExpr instanceof StringLiteral) || !(colStats.maxExpr instanceof StringLiteral)) { + return Optional.empty(); + } + Optional<DateLiteral> newMinExpr = tryConvertStrLiteralToDateLiteral(colStats.minExpr); + if (newMinExpr.isEmpty()) { + return Optional.empty(); + } + Optional<DateLiteral> newMaxExpr = tryConvertStrLiteralToDateLiteral(colStats.maxExpr); + if (newMaxExpr.isEmpty()) { + return Optional.empty(); + } + if (newMaxExpr.get().getType() != newMinExpr.get().getType()) { + return Optional.empty(); + } + literalMap.put(newMinExpr.get(), (StringLiteral) colStats.minExpr); + literalMap.put(newMaxExpr.get(), (StringLiteral) colStats.maxExpr); + + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(colStats); + return Optional.of(builder.setMinValue(newMinExpr.get().getDoubleValueAsDateTime()) + .setMinExpr(newMinExpr.get()) + .setMaxValue(newMaxExpr.get().getDoubleValueAsDateTime()) + .setMaxExpr(newMaxExpr.get()) + .build()); + } + + private Optional<DateLiteral> tryConvertStrLiteralToDateLiteral(LiteralExpr literal) { + if (literal == null) { + return Optional.empty(); + } + if (!(literal instanceof StringLiteral)) { + return Optional.empty(); + } + 
+ DateLiteral dt = null; + try { + dt = new DateLiteral(literal.getStringValue()); + dt.checkValueValid(); + } catch (Exception e) { + // ignore } + return dt == null ? Optional.empty() : Optional.of(dt); } private Statistics estimateEqualTo(ComparisonPredicate cp, ColumnStatistic statsForLeft, @@ -467,11 +572,11 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo } } - private Statistics estimateBinaryComparisonFilter(Expression leftExpr, ColumnStatistic leftStats, + private Statistics estimateBinaryComparisonFilter(Expression leftExpr, DataType dataType, ColumnStatistic leftStats, StatisticRange rightRange, EstimationContext context) { StatisticRange leftRange = new StatisticRange(leftStats.minValue, leftStats.minExpr, leftStats.maxValue, leftStats.maxExpr, - leftStats.ndv, leftExpr.getDataType()); + leftStats.ndv, dataType); StatisticRange intersectRange = leftRange.cover(rightRange); ColumnStatisticBuilder leftColumnStatisticBuilder; @@ -495,7 +600,7 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo .setNdv(intersectRange.getDistinctValues()) .setNumNulls(0); double sel = leftRange.overlapPercentWith(rightRange); - if (!(leftExpr.getDataType() instanceof RangeScalable) && (sel != 0.0 && sel != 1.0)) { + if (!(dataType instanceof RangeScalable) && (sel != 0.0 && sel != 1.0)) { sel = DEFAULT_INEQUALITY_COEFFICIENT; } sel = getNotNullSelectivity(leftStats, sel); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/StringLikeLiteral.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/StringLikeLiteral.java index e0e28d9399c..815e5742d24 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/StringLikeLiteral.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/StringLikeLiteral.java @@ -39,11 +39,18 @@ public abstract class StringLikeLiteral extends Literal { 
@Override public double getDouble() { + return getDouble(value); + } + + /** + * get double value + */ + public static double getDouble(String str) { long v = 0; int pos = 0; - int len = Math.min(value.length(), 7); + int len = Math.min(str.length(), 7); while (pos < len) { - v += Byte.toUnsignedLong(value.getBytes()[pos]) << ((6 - pos) * 8); + v += Byte.toUnsignedLong(str.getBytes()[pos]) << ((6 - pos) * 8); pos++; } return (double) v; diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java index 687a4d7a54a..08aced49e14 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java @@ -1148,6 +1148,35 @@ class FilterEstimationTest { */ @Test public void testStringRangeColToLiteral() { + SlotReference a = new SlotReference("a", new VarcharType(25)); + ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder() + .setNdv(100) + .setAvgSizeByte(25) + .setNumNulls(0) + .setMaxExpr(new StringLiteral("200")) + .setMaxValue(new VarcharLiteral("200").getDouble()) + .setMinExpr(new StringLiteral("100")) + .setMinValue(new VarcharLiteral("100").getDouble()) + .setCount(100); + StatisticsBuilder statsBuilder = new StatisticsBuilder(); + statsBuilder.setRowCount(100); + statsBuilder.putColumnStatistics(a, columnStatisticBuilder.build()); + Statistics baseStats = statsBuilder.build(); + VarcharLiteral i500 = new VarcharLiteral("500"); + Statistics filter500 = new FilterEstimation().estimate(new LessThan(a, i500), baseStats); + Assertions.assertEquals(100, filter500.getRowCount()); + + VarcharLiteral i10 = new VarcharLiteral("10"); + Statistics filter10 = new FilterEstimation().estimate(new LessThan(i10, a), baseStats); + Assertions.assertEquals(100, filter10.getRowCount()); + + VarcharLiteral i199 = new 
VarcharLiteral("199"); + Statistics filter199 = new FilterEstimation().estimate(new GreaterThan(a, i199), baseStats); + Assertions.assertEquals(50, filter199.getRowCount(), 0.01); + } + + @Test + public void testStringRangeColToDateLiteral() { SlotReference a = new SlotReference("a", new VarcharType(25)); ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder() .setNdv(100) @@ -1172,7 +1201,7 @@ class FilterEstimationTest { VarcharLiteral year2021 = new VarcharLiteral("2021-12-01"); Statistics filter2021 = new FilterEstimation().estimate(new GreaterThan(a, year2021), baseStats); - Assertions.assertEquals(50, filter2021.getRowCount()); + Assertions.assertEquals(4.24, filter2021.getRowCount(), 0.01); } @Test --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
