This is an automated email from the ASF dual-hosted git repository. gparai pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push: new 7c2661d DRILL-7108: Improve selectivity estimates for (NOT)LIKE, NOT_EQUALS, IS NOT NULL predicates 7c2661d is described below commit 7c2661d5dba80d2bebb100174ecefd5a3b194310 Author: Gautam Parai <gpa...@maprtech.com> AuthorDate: Tue Mar 12 18:29:16 2019 -0700 DRILL-7108: Improve selectivity estimates for (NOT)LIKE, NOT_EQUALS, IS NOT NULL predicates closes #1699 --- .../drill/exec/planner/common/DrillStatsTable.java | 30 ++++++- .../exec/planner/cost/DrillRelMdSelectivity.java | 98 +++++++++++++++++----- 2 files changed, 106 insertions(+), 22 deletions(-) diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java index 668f1d2..e934dfc 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java @@ -66,8 +66,9 @@ public class DrillStatsTable { private final Path tablePath; private final String schemaName; private final String tableName; - private final Map<String, Long> ndv = Maps.newHashMap(); private double rowCount = -1; + private final Map<String, Long> ndv = Maps.newHashMap(); + private final Map<String, Long> nnRowCount = Maps.newHashMap(); private boolean materialized = false; private TableStatistics statistics = null; @@ -135,6 +136,32 @@ public class DrillStatsTable { } /** + * Get non-null rowcount for the column If stats are not present for the given column, a null is returned. + * + * Note: returned data may not be accurate. Accuracy depends on whether the table data has changed + * after the stats are computed. + * + * @param col - column for which non-null rowcount is desired + * @return non-null rowcount of the column, if available. NULL otherwise. + */ + public Double getNNRowCount(String col) { + // Stats might not have materialized because of errors. + if (!materialized) { + return null; + } + final String upperCol = col.toUpperCase(); + Long nnRowCntCol = nnRowCount.get(upperCol); + if (nnRowCntCol == null) { + nnRowCntCol = nnRowCount.get(SchemaPath.getSimplePath(upperCol).toString()); + } + // Cap it at row count (just in case) + if (nnRowCntCol != null) { + return Math.min(nnRowCntCol, rowCount); + } + return null; + } + + /** * Read the stats from storage and keep them in memory. * @param table - Drill table for which we require stats * @param context - Query context @@ -154,6 +181,7 @@ public class DrillStatsTable { for (DirectoryStatistics_v1 ds : ((Statistics_v1) statistics).getDirectoryStatistics()) { for (ColumnStatistics_v1 cs : ds.getColumnStatistics()) { ndv.put(cs.getName().toUpperCase(), cs.getNdv()); + nnRowCount.put(cs.getName().toUpperCase(), (long)cs.getNonNullCount()); rowCount = Math.max(rowCount, cs.getCount()); } } diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java index e074be4..e87fae2 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java @@ -56,6 +56,13 @@ public class DrillRelMdSelectivity extends RelMdSelectivity { private static final DrillRelMdSelectivity INSTANCE = new DrillRelMdSelectivity(); static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(DrillRelMdSelectivity.class); public static final RelMetadataProvider SOURCE = ReflectiveRelMetadataProvider.reflectiveSource(BuiltInMethod.SELECTIVITY.method, INSTANCE); + /* + * For now, we are treating all LIKE predicates to have the same selectivity irrespective of the number or position + * of wildcard characters (%). This is no different than the present Drill/Calcite behaviour w.r.t to LIKE predicates. + * The difference being Calcite keeps the selectivity 25% whereas we keep it at 5% + * TODO: Differentiate leading/trailing wildcard characters(%) or explore different estimation techniques e.g. LSH-based + */ + private static final double LIKE_PREDICATE_SELECTIVITY = 0.05; @Override public Double getSelectivity(RelNode rel, RelMetadataQuery mq, RexNode predicate) { @@ -137,34 +144,32 @@ public class DrillRelMdSelectivity extends RelMdSelectivity { for (RexNode pred : RelOptUtil.conjunctions(predicate)) { double orSel = 0; for (RexNode orPred : RelOptUtil.disjunctions(pred)) { - //CALCITE guess - Double guess = RelMdUtil.guessSelectivity(pred); if (orPred.isA(SqlKind.EQUALS)) { + orSel += computeEqualsSelectivity(table, orPred, fieldNames); + } else if (orPred.isA(SqlKind.NOT_EQUALS)) { + orSel += 1.0 - computeEqualsSelectivity(table, orPred, fieldNames); + } else if (orPred.isA(SqlKind.LIKE)) { + // LIKE selectivity is 5% more than a similar equality predicate, capped at CALCITE guess + orSel += Math.min(computeEqualsSelectivity(table, orPred, fieldNames) + LIKE_PREDICATE_SELECTIVITY, + guessSelectivity(orPred)); + } else if (orPred.isA(SqlKind.NOT)) { if (orPred instanceof RexCall) { - int colIdx = -1; - RexInputRef op = findRexInputRef(orPred); - if (op != null) { - colIdx = op.hashCode(); - } - if (colIdx != -1 && colIdx < fieldNames.size()) { - String col = fieldNames.get(colIdx); - if (table.getStatsTable() != null - && table.getStatsTable().getNdv(col) != null) { - orSel += 1.00 / table.getStatsTable().getNdv(col); - } else { - orSel += guess; - } + // LIKE selectivity is 5% more than a similar equality predicate, capped at CALCITE guess + RexNode childOp = ((RexCall) orPred).getOperands().get(0); + if (childOp.isA(SqlKind.LIKE)) { + orSel += 1.0 - Math.min(computeEqualsSelectivity(table, childOp, fieldNames) + LIKE_PREDICATE_SELECTIVITY, + guessSelectivity(childOp)); } else { - orSel += guess; - if (logger.isDebugEnabled()) { - logger.warn(String.format("No input reference $[%s] found for predicate [%s]", - Integer.toString(colIdx), orPred.toString())); - } + orSel += 1.0 - guessSelectivity(orPred); } } + } else if (orPred.isA(SqlKind.IS_NULL)) { + orSel += 1.0 - computeIsNotNullSelectivity(table, orPred, fieldNames); + } else if (orPred.isA(SqlKind.IS_NOT_NULL)) { + orSel += computeIsNotNullSelectivity(table, orPred, fieldNames); } else { //Use the CALCITE guess. TODO: Use histograms for COMPARISON operator - orSel += guess; + orSel += guessSelectivity(orPred); } } sel *= orSel; @@ -173,6 +178,57 @@ public class DrillRelMdSelectivity extends RelMdSelectivity { return (sel > 1.0) ? 1.0 : sel; } + private double computeEqualsSelectivity(DrillTable table, RexNode orPred, List<String> fieldNames) { + String col = getColumn(orPred, fieldNames); + if (col != null) { + if (table.getStatsTable() != null + && table.getStatsTable().getNdv(col) != null) { + return 1.00 / table.getStatsTable().getNdv(col); + } + } + return guessSelectivity(orPred); + } + + private double computeIsNotNullSelectivity(DrillTable table, RexNode orPred, List<String> fieldNames) { + String col = getColumn(orPred, fieldNames); + if (col != null) { + if (table.getStatsTable() != null + && table.getStatsTable().getNNRowCount(col) != null) { + // Cap selectivity below Calcite Guess + return Math.min((table.getStatsTable().getNNRowCount(col) / table.getStatsTable().getRowCount()), + RelMdUtil.guessSelectivity(orPred)); + } + } + return guessSelectivity(orPred); + } + + private String getColumn(RexNode orPred, List<String> fieldNames) { + if (orPred instanceof RexCall) { + int colIdx = -1; + RexInputRef op = findRexInputRef(orPred); + if (op != null) { + colIdx = op.hashCode(); + } + if (colIdx != -1 && colIdx < fieldNames.size()) { + return fieldNames.get(colIdx); + } else { + if (logger.isDebugEnabled()) { + logger.warn(String.format("No input reference $[%s] found for predicate [%s]", + Integer.toString(colIdx), orPred.toString())); + } + } + } + return null; + } + + private double guessSelectivity(RexNode orPred) { + if (logger.isDebugEnabled()) { + logger.warn(String.format("Using guess for predicate [%s]", orPred.toString())); + } + //CALCITE guess + return RelMdUtil.guessSelectivity(orPred); + } + private Double getJoinSelectivity(DrillJoinRelBase rel, RelMetadataQuery mq, RexNode predicate) { double sel = 1.0; // determine which filters apply to the left vs right