This is an automated email from the ASF dual-hosted git repository. amansinha pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push: new 849f896 DRILL-7119: Compute range predicate selectivity using histograms. 849f896 is described below commit 849f896c491118571303a1313f75bece37e80b7e Author: Aman Sinha <asi...@maprtech.com> AuthorDate: Mon Mar 25 06:55:57 2019 -0700 DRILL-7119: Compute range predicate selectivity using histograms. Address code review comments. Add unit test for histogram usage. close apache/drill#1733 --- .../drill/exec/planner/common/DrillStatsTable.java | 2 +- .../drill/exec/planner/common/Histogram.java | 2 +- .../planner/common/NumericEquiDepthHistogram.java | 188 ++++++++++++++++++++- .../exec/planner/cost/DrillRelMdSelectivity.java | 28 ++- .../test/java/org/apache/drill/PlanTestBase.java | 4 +- .../org/apache/drill/exec/sql/TestAnalyze.java | 15 ++ 6 files changed, 225 insertions(+), 14 deletions(-) diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java index 65b0b64..d34a5e2 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java @@ -215,7 +215,7 @@ public class DrillStatsTable { // get the histogram for this column Histogram hist = cs.getHistogram(); - histogram.put(cs.getName(), hist); + histogram.put(cs.getName().toUpperCase(), hist); } } } diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/Histogram.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/Histogram.java index 1980ee5..c6444a3 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/Histogram.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/Histogram.java @@ -37,5 +37,5 @@ public interface Histogram { * @param filter * @return estimated selectivity or NULL if it could not be estimated for any reason */ - Double estimatedSelectivity(RexNode filter); + Double estimatedSelectivity(final RexNode filter); } diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/NumericEquiDepthHistogram.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/NumericEquiDepthHistogram.java index 386141e..9d5bf6f 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/NumericEquiDepthHistogram.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/NumericEquiDepthHistogram.java @@ -21,8 +21,15 @@ package org.apache.drill.exec.planner.common; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonTypeName; +import java.util.List; + +import org.apache.calcite.rex.RexCall; import org.apache.calcite.rex.RexNode; +import org.apache.calcite.rex.RexLiteral; import com.clearspring.analytics.stream.quantile.TDigest; +import org.apache.calcite.sql.SqlKind; +import org.apache.calcite.sql.SqlOperator; +import org.apache.drill.shaded.guava.com.google.common.base.Preconditions; /** * A column specific equi-depth histogram which is meant for numeric data types @@ -30,6 +37,19 @@ import com.clearspring.analytics.stream.quantile.TDigest; @JsonTypeName("numeric-equi-depth") public class NumericEquiDepthHistogram implements Histogram { + /** + * Use a small non-zero selectivity rather than 0 to account for the fact that + * histogram boundaries are approximate and even if some values lie outside the + * range, we cannot be absolutely sure + */ + static final double SMALL_SELECTIVITY = 0.0001; + + /** + * Use a large selectivity of 1.0 whenever we are reasonably confident that all rows + * qualify. Even if this is off by a small fraction, it is acceptable. + */ + static final double LARGE_SELECTIVITY = 1.0; + // For equi-depth, all buckets will have same (approx) number of rows @JsonProperty("numRowsPerBucket") private long numRowsPerBucket; @@ -69,27 +89,177 @@ public class NumericEquiDepthHistogram implements Histogram { } @Override - public Double estimatedSelectivity(RexNode filter) { + public Double estimatedSelectivity(final RexNode filter) { if (numRowsPerBucket >= 0) { - return 1.0; - } else { - return null; + // at a minimum, the histogram should have a start and end point of 1 bucket, so at least 2 entries + Preconditions.checkArgument(buckets.length >= 2, "Histogram has invalid number of entries"); + final int first = 0; + final int last = buckets.length - 1; + + // number of buckets is 1 less than the total # entries in the buckets array since last + // entry is the end point of the last bucket + final int numBuckets = buckets.length - 1; + final long totalRows = numBuckets * numRowsPerBucket; + if (filter instanceof RexCall) { + // get the operator + SqlOperator op = ((RexCall) filter).getOperator(); + if (op.getKind() == SqlKind.GREATER_THAN || + op.getKind() == SqlKind.GREATER_THAN_OR_EQUAL) { + Double value = getLiteralValue(filter); + if (value != null) { + + // *** Handle the boundary conditions first *** + + // if value is less than or equal to the first bucket's start point then all rows qualify + int result = value.compareTo(buckets[first]); + if (result <= 0) { + return LARGE_SELECTIVITY; + } + // if value is greater than the end point of the last bucket, then none of the rows qualify + result = value.compareTo(buckets[last]); + if (result > 0) { + return SMALL_SELECTIVITY; + } else if (result == 0) { + if (op.getKind() == SqlKind.GREATER_THAN_OR_EQUAL) { + // value is exactly equal to the last bucket's end point so we take the ratio 1/bucket_width + long totalFilterRows = (long) (1 / (buckets[last] - buckets[last - 1]) * numRowsPerBucket); + double selectivity = (double) totalFilterRows / totalRows; + return selectivity; + } else { + // predicate is 'column > value' and value is equal to last bucket's endpoint, so none of + // the rows qualify + return SMALL_SELECTIVITY; + } + } + + // *** End of boundary conditions **** + + int n = getContainingBucket(value, numBuckets); + if (n >= 0) { + // all buckets to the right of containing bucket will be fully covered + int coveredBuckets = (last) - (n + 1); + long coveredRows = numRowsPerBucket * coveredBuckets; + // num matching rows in the current bucket is a function of (end_point_of_bucket - value) + long partialRows = (long) ((buckets[n + 1] - value) / (buckets[n + 1] - buckets[n]) * numRowsPerBucket); + long totalFilterRows = partialRows + coveredRows; + double selectivity = (double)totalFilterRows/totalRows; + return selectivity; + } else { + // value does not exist in any of the buckets + return SMALL_SELECTIVITY; + } + } + } else if (op.getKind() == SqlKind.LESS_THAN || + op.getKind() == SqlKind.LESS_THAN_OR_EQUAL) { + Double value = getLiteralValue(filter); + if (value != null) { + + // *** Handle the boundary conditions first *** + + // if value is greater than the last bucket's end point then all rows qualify + int result = value.compareTo(buckets[last]); + if (result >= 0) { + return LARGE_SELECTIVITY; + } + // if value is less than the first bucket's start point then none of the rows qualify + result = value.compareTo(buckets[first]); + if (result < 0) { + return SMALL_SELECTIVITY; + } else if (result == 0) { + if (op.getKind() == SqlKind.LESS_THAN_OR_EQUAL) { + // value is exactly equal to the first bucket's start point so we take the ratio 1/bucket_width + long totalFilterRows = (long) (1 / (buckets[first + 1] - buckets[first]) * numRowsPerBucket); + double selectivity = (double) totalFilterRows / totalRows; + return selectivity; + } else { + // predicate is 'column < value' and value is equal to first bucket's start point, so none of + // the rows qualify + return SMALL_SELECTIVITY; + } + } + + // *** End of boundary conditions **** + + int n = getContainingBucket(value, numBuckets); + if (n >= 0) { + // all buckets to the left will be fully covered + int coveredBuckets = n; + long coveredRows = numRowsPerBucket * coveredBuckets; + // num matching rows in the current bucket is a function of (value - start_point_of_bucket) + long partialRows = (long) ((value - buckets[n]) / (buckets[n + 1] - buckets[n]) * numRowsPerBucket); + long totalFilterRows = partialRows + coveredRows; + double selectivity = (double)totalFilterRows / totalRows; + return selectivity; + } else { + // value does not exist in any of the buckets + return SMALL_SELECTIVITY; + } + } + } + } + } + return null; + } + + private int getContainingBucket(final Double value, final int numBuckets) { + int i = 0; + int containing_bucket = -1; + // check which bucket this value falls in + for (; i <= numBuckets; i++) { + int result = buckets[i].compareTo(value); + if (result > 0) { + containing_bucket = i - 1; + break; + } else if (result == 0) { + containing_bucket = i; + break; + } + } + return containing_bucket; + } + + private Double getLiteralValue(final RexNode filter) { + Double value = null; + List<RexNode> operands = ((RexCall) filter).getOperands(); + if (operands.size() == 2 && operands.get(1) instanceof RexLiteral) { + RexLiteral l = ((RexLiteral) operands.get(1)); + + switch (l.getTypeName()) { + case DATE: + case TIMESTAMP: + case TIME: + value = (double) ((java.util.Calendar) l.getValue()).getTimeInMillis(); + break; + case INTEGER: + case BIGINT: + case FLOAT: + case DOUBLE: + case DECIMAL: + case BOOLEAN: + value = l.getValueAs(Double.class); + break; + default: + break; + } } + return value; } /** - * Utility method to build a Numeric Equi-Depth Histogram from a t-digest byte array + * Build a Numeric Equi-Depth Histogram from a t-digest byte array * @param tdigest_array + * @param numBuckets + * @param nonNullCount * @return An instance of NumericEquiDepthHistogram */ - public static NumericEquiDepthHistogram buildFromTDigest(byte[] tdigest_array, - int numBuckets, - long nonNullCount) { + public static NumericEquiDepthHistogram buildFromTDigest(final byte[] tdigest_array, + final int numBuckets, + final long nonNullCount) { TDigest tdigest = TDigest.fromBytes(java.nio.ByteBuffer.wrap(tdigest_array)); NumericEquiDepthHistogram histogram = new NumericEquiDepthHistogram(numBuckets); - double q = 1.0/numBuckets; + final double q = 1.0/numBuckets; int i = 0; for (; i < numBuckets; i++) { // get the starting point of the i-th quantile diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java index aae8b1d..4a6646e 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java @@ -19,6 +19,8 @@ package org.apache.drill.exec.planner.cost; import java.util.ArrayList; import java.util.List; +import java.util.EnumSet; +import java.util.Set; import org.apache.calcite.plan.RelOptUtil; import org.apache.calcite.plan.volcano.RelSubset; import org.apache.calcite.rel.RelNode; @@ -45,6 +47,7 @@ import org.apache.drill.exec.physical.base.GroupScan; import org.apache.drill.exec.planner.common.DrillJoinRelBase; import org.apache.drill.exec.planner.common.DrillRelOptUtil; import org.apache.drill.exec.planner.common.DrillScanRelBase; +import org.apache.drill.exec.planner.common.Histogram; import org.apache.drill.exec.planner.logical.DrillScanRel; import org.apache.drill.exec.planner.logical.DrillTable; import org.apache.drill.exec.planner.physical.PlannerSettings; @@ -64,6 +67,11 @@ public class DrillRelMdSelectivity extends RelMdSelectivity { */ private static final double LIKE_PREDICATE_SELECTIVITY = 0.05; + public static final Set<SqlKind> RANGE_PREDICATE = + EnumSet.of( + SqlKind.LESS_THAN, SqlKind.GREATER_THAN, + SqlKind.LESS_THAN_OR_EQUAL, SqlKind.GREATER_THAN_OR_EQUAL); + @Override public Double getSelectivity(RelNode rel, RelMetadataQuery mq, RexNode predicate) { if (rel instanceof RelSubset && !DrillRelOptUtil.guessRows(rel)) { @@ -145,6 +153,8 @@ public class DrillRelMdSelectivity extends RelMdSelectivity { orSel += RelMdUtil.guessSelectivity(orPred); //CALCITE guess } else if (orPred.isA(SqlKind.EQUALS)) { orSel += computeEqualsSelectivity(table, orPred, fieldNames); + } else if (orPred.isA(RANGE_PREDICATE)) { + orSel += computeRangeSelectivity(table, orPred, fieldNames); } else if (orPred.isA(SqlKind.NOT_EQUALS)) { orSel += 1.0 - computeEqualsSelectivity(table, orPred, fieldNames); } else if (orPred.isA(SqlKind.LIKE)) { @@ -167,7 +177,7 @@ public class DrillRelMdSelectivity extends RelMdSelectivity { } else if (orPred.isA(SqlKind.IS_NOT_NULL)) { orSel += computeIsNotNullSelectivity(table, orPred, fieldNames); } else { - //Use the CALCITE guess. TODO: Use histograms for COMPARISON operator + // Use the CALCITE guess. orSel += guessSelectivity(orPred); } } @@ -188,6 +198,22 @@ public class DrillRelMdSelectivity extends RelMdSelectivity { return guessSelectivity(orPred); } + // Use histogram if available for the range predicate selectivity + private double computeRangeSelectivity(DrillTable table, RexNode orPred, List<String> fieldNames) { + String col = getColumn(orPred, fieldNames); + if (col != null) { + if (table.getStatsTable() != null + && table.getStatsTable().getHistogram(col) != null) { + Histogram histogram = table.getStatsTable().getHistogram(col); + Double sel = ((Histogram) histogram).estimatedSelectivity(orPred); + if (sel != null) { + return sel; + } + } + } + return guessSelectivity(orPred); + } + private double computeIsNotNullSelectivity(DrillTable table, RexNode orPred, List<String> fieldNames) { String col = getColumn(orPred, fieldNames); if (col != null) { diff --git a/exec/java-exec/src/test/java/org/apache/drill/PlanTestBase.java b/exec/java-exec/src/test/java/org/apache/drill/PlanTestBase.java index da257c0..34126cf 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/PlanTestBase.java +++ b/exec/java-exec/src/test/java/org/apache/drill/PlanTestBase.java @@ -157,7 +157,7 @@ public class PlanTestBase extends BaseTestQuery { for (final String s : expectedPatterns) { final Pattern p = Pattern.compile(s); final Matcher m = p.matcher(plan); - assertTrue(EXPECTED_NOT_FOUND + s, m.find()); + assertTrue(EXPECTED_NOT_FOUND + s + "\n" + plan, m.find()); } } @@ -166,7 +166,7 @@ public class PlanTestBase extends BaseTestQuery { for (final String s : excludedPatterns) { final Pattern p = Pattern.compile(s); final Matcher m = p.matcher(plan); - assertFalse(UNEXPECTED_FOUND + s, m.find()); + assertFalse(UNEXPECTED_FOUND + s + "\n" + plan, m.find()); } } } diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestAnalyze.java b/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestAnalyze.java index 30d23b3..8270352 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestAnalyze.java +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestAnalyze.java @@ -400,6 +400,21 @@ public class TestAnalyze extends BaseTestQuery { .baselineValues("`hire_date_and_time`", 7) .baselineValues("`salary`", 11) .go(); + + // test the use of the just created histogram + test("alter session set `planner.statistics.use` = true"); + + // check boundary conditions: last bucket + String query = "select 1 from dfs.tmp.employee1 where store_id > 21"; + String[] expectedPlan1 = {"Filter\\(condition.*\\).*rowcount = 112.*,.*", + "Scan.*columns=\\[`store_id`\\].*rowcount = 1128.0.*"}; + PlanTestBase.testPlanWithAttributesMatchingPatterns(query, expectedPlan1, new String[]{}); + + query = "select 1 from dfs.tmp.employee1 where store_id < 15"; + String[] expectedPlan2 = {"Filter\\(condition.*\\).*rowcount = 676.*,.*", + "Scan.*columns=\\[`store_id`\\].*rowcount = 1128.0.*"}; + PlanTestBase.testPlanWithAttributesMatchingPatterns(query, expectedPlan2, new String[]{}); + } finally { test("ALTER SESSION SET `planner.slice_target` = " + ExecConstants.SLICE_TARGET_DEFAULT); }