IMPALA-3148. Fix selectivity computation for pushed Kudu predicates This follows up on a TODO from the Kudu merge and also fixes a bug: IMPALA-976 changed the computation of selectivities for a combined list of conjuncts to better handle expressions with no selectivity estimate. The Kudu implementation was forked from before this change and thus did not have an equivalent change.
This refactors the algorithm to a new static method and calls it from both PlanNode and KuduScanNode so that the selectivity estimate behavior is the same regardless of whether Kudu can evaluate the predicate server-side. Todd tested this on TPCH 3TB and verified that the plans are reasonable now where they used to be nonsense. Change-Id: Id507077b577ed5804fc80517f33ea185f2bff41a Reviewed-on: http://gerrit.cloudera.org:8080/2628 Reviewed-by: Casey Ching <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/4bdd0b97 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/4bdd0b97 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/4bdd0b97 Branch: refs/heads/master Commit: 4bdd0b976dc80f64ebd1f2c37ad03b4ec661cf02 Parents: 86fd262 Author: Todd Lipcon <[email protected]> Authored: Thu Mar 24 22:53:33 2016 -0700 Committer: Tim Armstrong <[email protected]> Committed: Tue Apr 12 14:03:44 2016 -0700 ---------------------------------------------------------------------- .../cloudera/impala/planner/KuduScanNode.java | 18 ++--- .../com/cloudera/impala/planner/PlanNode.java | 10 ++- .../queries/PlannerTest/kudu-selectivity.test | 71 +++++++++++++++++++- 3 files changed, 80 insertions(+), 19 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4bdd0b97/fe/src/main/java/com/cloudera/impala/planner/KuduScanNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/com/cloudera/impala/planner/KuduScanNode.java b/fe/src/main/java/com/cloudera/impala/planner/KuduScanNode.java index 558c282..ee5c318 100644 --- a/fe/src/main/java/com/cloudera/impala/planner/KuduScanNode.java +++ b/fe/src/main/java/com/cloudera/impala/planner/KuduScanNode.java @@ -29,6 +29,7 @@ import com.cloudera.impala.common.ImpalaRuntimeException; import com.google.common.base.Charsets; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterables; import org.kududb.client.KuduClient; import org.kududb.client.LocatedTablet; import org.slf4j.Logger; @@ -151,20 +152,11 @@ public class KuduScanNode extends ScanNode { } } - /** - * TODO(kudu-merge): IMPALA-3148 - Update selectivity computation. - */ @Override protected double computeSelectivity() { - double baseSelectivity = super.computeSelectivity(); - // The kuduConjuncts_ are not part of the selectivity calculation of the PlanNode - // superclass. Adjust the selectivity to account for predicates that are - // pushed down to Kudu. - for (Expr e : kuduConjuncts_) { - if (baseSelectivity < 0) continue; - baseSelectivity *= e.getSelectivity(); - } - return baseSelectivity; + List<Expr> allConjuncts = Lists.newArrayList( + Iterables.concat(conjuncts_, kuduConjuncts_)); + return computeCombinedSelectivity(allConjuncts); } @Override @@ -201,7 +193,7 @@ public class KuduScanNode extends ScanNode { } if (!kuduConjuncts_.isEmpty()) { result.append(detailPrefix + "kudu predicates: " + getExplainString( - kuduConjuncts_)); + kuduConjuncts_) + "\n"); } } } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4bdd0b97/fe/src/main/java/com/cloudera/impala/planner/PlanNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/com/cloudera/impala/planner/PlanNode.java b/fe/src/main/java/com/cloudera/impala/planner/PlanNode.java index 1169190..643ca01 100644 --- a/fe/src/main/java/com/cloudera/impala/planner/PlanNode.java +++ b/fe/src/main/java/com/cloudera/impala/planner/PlanNode.java @@ -491,13 +491,13 @@ abstract public class PlanNode extends TreeNode<PlanNode> { * The second issue is addressed by an exponential backoff when multiplying each * additional selectivity into the final result. */ - protected double computeSelectivity() { + static protected double computeCombinedSelectivity(List<Expr> conjuncts) { // Collect all estimated selectivities. List<Double> selectivities = Lists.newArrayList(); - for (Expr e: conjuncts_) { + for (Expr e: conjuncts) { if (e.hasSelectivity()) selectivities.add(e.getSelectivity()); } - if (selectivities.size() != conjuncts_.size()) { + if (selectivities.size() != conjuncts.size()) { // Some conjuncts have no estimated selectivity. Use a single default // representative selectivity for all those conjuncts. selectivities.add(Expr.DEFAULT_SELECTIVITY); @@ -515,6 +515,10 @@ abstract public class PlanNode extends TreeNode<PlanNode> { return Math.max(0.0, Math.min(1.0, result)); } + protected double computeSelectivity() { + return computeCombinedSelectivity(conjuncts_); + } + // Convert this plan node into msg (excluding children), which requires setting // the node type and the node-specific field. protected abstract void toThrift(TPlanNode msg); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4bdd0b97/testdata/workloads/functional-planner/queries/PlannerTest/kudu-selectivity.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/kudu-selectivity.test b/testdata/workloads/functional-planner/queries/PlannerTest/kudu-selectivity.test index 0062fd5..fe1cb29 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/kudu-selectivity.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/kudu-selectivity.test @@ -2,7 +2,30 @@ select * from functional_kudu.zipcode_incomes where id = '8600000US00601' ---- PLAN F00:PLAN FRAGMENT [UNPARTITIONED] 00:SCAN KUDU [functional_kudu.zipcode_incomes] - kudu predicates: id = '8600000US00601' hosts=3 per-host-mem=unavailable + kudu predicates: id = '8600000US00601' + hosts=3 per-host-mem=unavailable + tuple-ids=0 row-size=124B cardinality=1 +---- DISTRIBUTEDPLAN +F01:PLAN FRAGMENT [UNPARTITIONED] + 01:EXCHANGE [UNPARTITIONED] + hosts=3 per-host-mem=unavailable + tuple-ids=0 row-size=124B cardinality=1 + +F00:PLAN FRAGMENT [RANDOM] + DATASTREAM SINK [FRAGMENT=F01, EXCHANGE=01, UNPARTITIONED] + 00:SCAN KUDU [functional_kudu.zipcode_incomes] + kudu predicates: id = '8600000US00601' + hosts=3 per-host-mem=0B + tuple-ids=0 row-size=124B cardinality=1 +==== +# The cardinality from "zip = '2'" should dominate. +select * from functional_kudu.zipcode_incomes where id != '1' and zip = '2' +---- PLAN +F00:PLAN FRAGMENT [UNPARTITIONED] + 00:SCAN KUDU [functional_kudu.zipcode_incomes] + predicates: id != '1' + kudu predicates: zip = '2' + hosts=3 per-host-mem=unavailable tuple-ids=0 row-size=124B cardinality=1 ---- DISTRIBUTEDPLAN F01:PLAN FRAGMENT [UNPARTITIONED] @@ -13,6 +36,48 @@ F01:PLAN FRAGMENT [UNPARTITIONED] F00:PLAN FRAGMENT [RANDOM] DATASTREAM SINK [FRAGMENT=F01, EXCHANGE=01, UNPARTITIONED] 00:SCAN KUDU [functional_kudu.zipcode_incomes] - kudu predicates: id = '8600000US00601' hosts=3 per-host-mem=0B + predicates: id != '1' + kudu predicates: zip = '2' + hosts=3 per-host-mem=0B tuple-ids=0 row-size=124B cardinality=1 -==== \ No newline at end of file +==== +select * from functional_kudu.zipcode_incomes where id > '1' and zip > '2' +---- PLAN +F00:PLAN FRAGMENT [UNPARTITIONED] + 00:SCAN KUDU [functional_kudu.zipcode_incomes] + predicates: id > '1', zip > '2' + hosts=3 per-host-mem=unavailable + tuple-ids=0 row-size=124B cardinality=3317 +---- DISTRIBUTEDPLAN +F01:PLAN FRAGMENT [UNPARTITIONED] + 01:EXCHANGE [UNPARTITIONED] + hosts=3 per-host-mem=unavailable + tuple-ids=0 row-size=124B cardinality=3317 + +F00:PLAN FRAGMENT [RANDOM] + DATASTREAM SINK [FRAGMENT=F01, EXCHANGE=01, UNPARTITIONED] + 00:SCAN KUDU [functional_kudu.zipcode_incomes] + predicates: id > '1', zip > '2' + hosts=3 per-host-mem=0B + tuple-ids=0 row-size=124B cardinality=3317 +==== +select * from functional_kudu.zipcode_incomes where id = '1' or id = '2' +---- PLAN +F00:PLAN FRAGMENT [UNPARTITIONED] + 00:SCAN KUDU [functional_kudu.zipcode_incomes] + predicates: id = '1' OR id = '2' + hosts=3 per-host-mem=unavailable + tuple-ids=0 row-size=124B cardinality=2 +---- DISTRIBUTEDPLAN +F01:PLAN FRAGMENT [UNPARTITIONED] + 01:EXCHANGE [UNPARTITIONED] + hosts=3 per-host-mem=unavailable + tuple-ids=0 row-size=124B cardinality=2 + +F00:PLAN FRAGMENT [RANDOM] + DATASTREAM SINK [FRAGMENT=F01, EXCHANGE=01, UNPARTITIONED] + 00:SCAN KUDU [functional_kudu.zipcode_incomes] + predicates: id = '1' OR id = '2' + hosts=3 per-host-mem=0B + tuple-ids=0 row-size=124B cardinality=2 +====
