Repository: incubator-impala Updated Branches: refs/heads/master f413e236a -> d70ffa455
IMPALA-3450: LIMITs on plan nodes are reflected in cardinality estimates PlanNode includes a 'capAtLimit()' method that can be used in 'computeStats()' on PlanNodes to ensure they do not estimate their cardinality to be more than a pushed-down LIMIT clause. This patch ensures that 'capAtLimit()' is used in all of the relevant classes descending from PlanNode. Change-Id: Ic06dcb93bbb2510c0d40151302bd817ef340b825 Reviewed-on: http://gerrit.cloudera.org:8080/3127 Reviewed-by: Jim Apple <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/d70ffa45 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/d70ffa45 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/d70ffa45 Branch: refs/heads/master Commit: d70ffa455d48bb5ffb26c9eb065a40374c369d8c Parents: f413e23 Author: Jim Apple <[email protected]> Authored: Wed May 18 13:20:40 2016 -0700 Committer: Tim Armstrong <[email protected]> Committed: Tue May 24 14:40:52 2016 -0700 ---------------------------------------------------------------------- .../impala/planner/AggregationNode.java | 1 + .../impala/planner/AnalyticEvalNode.java | 1 + .../com/cloudera/impala/planner/JoinNode.java | 2 +- .../com/cloudera/impala/planner/SelectNode.java | 1 + .../cloudera/impala/planner/SubplanNode.java | 1 + .../com/cloudera/impala/planner/UnionNode.java | 2 +- .../com/cloudera/impala/planner/UnnestNode.java | 1 + .../impala/planner/PlannerTestBase.java | 33 ++++++ .../queries/PlannerTest/aggregation.test | 2 +- .../queries/PlannerTest/inline-view-limit.test | 13 +++ .../queries/PlannerTest/join-order.test | 32 +++--- .../queries/PlannerTest/joins.test | 111 +++++++++++++++++++ .../queries/PlannerTest/nested-collections.test | 2 +- .../queries/PlannerTest/union.test | 28 +++++ 14 files changed, 210 insertions(+), 20 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/fe/src/main/java/com/cloudera/impala/planner/AggregationNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/com/cloudera/impala/planner/AggregationNode.java b/fe/src/main/java/com/cloudera/impala/planner/AggregationNode.java index 5600bf7..ae6a967 100644 --- a/fe/src/main/java/com/cloudera/impala/planner/AggregationNode.java +++ b/fe/src/main/java/com/cloudera/impala/planner/AggregationNode.java @@ -196,6 +196,7 @@ public class AggregationNode extends PlanNode { cardinality_ = Math.min(getChild(0).getCardinality(), cardinality_); } } + cardinality_ = capAtLimit(cardinality_); LOG.trace("stats Agg: cardinality=" + Long.toString(cardinality_)); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/fe/src/main/java/com/cloudera/impala/planner/AnalyticEvalNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/com/cloudera/impala/planner/AnalyticEvalNode.java b/fe/src/main/java/com/cloudera/impala/planner/AnalyticEvalNode.java index 3ee6be6..653e0a1 100644 --- a/fe/src/main/java/com/cloudera/impala/planner/AnalyticEvalNode.java +++ b/fe/src/main/java/com/cloudera/impala/planner/AnalyticEvalNode.java @@ -126,6 +126,7 @@ public class AnalyticEvalNode extends PlanNode { protected void computeStats(Analyzer analyzer) { super.computeStats(analyzer); cardinality_ = getChild(0).cardinality_; + cardinality_ = capAtLimit(cardinality_); } @Override http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/fe/src/main/java/com/cloudera/impala/planner/JoinNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/com/cloudera/impala/planner/JoinNode.java b/fe/src/main/java/com/cloudera/impala/planner/JoinNode.java index 34de765..6d60e43 100644 --- a/fe/src/main/java/com/cloudera/impala/planner/JoinNode.java +++ b/fe/src/main/java/com/cloudera/impala/planner/JoinNode.java @@ -445,7 +445,7 @@ public abstract class JoinNode extends PlanNode { break; } } - + cardinality_ = capAtLimit(cardinality_); Preconditions.checkState(hasValidStats()); LOG.debug("stats Join: cardinality=" + Long.toString(cardinality_)); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/fe/src/main/java/com/cloudera/impala/planner/SelectNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/com/cloudera/impala/planner/SelectNode.java b/fe/src/main/java/com/cloudera/impala/planner/SelectNode.java index e6d7bf1..ae622c3 100644 --- a/fe/src/main/java/com/cloudera/impala/planner/SelectNode.java +++ b/fe/src/main/java/com/cloudera/impala/planner/SelectNode.java @@ -63,6 +63,7 @@ public class SelectNode extends PlanNode { Math.round(((double) getChild(0).cardinality_) * computeSelectivity()); Preconditions.checkState(cardinality_ >= 0); } + cardinality_ = capAtLimit(cardinality_); LOG.debug("stats Select: cardinality=" + Long.toString(cardinality_)); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/fe/src/main/java/com/cloudera/impala/planner/SubplanNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/com/cloudera/impala/planner/SubplanNode.java b/fe/src/main/java/com/cloudera/impala/planner/SubplanNode.java index 969ff27..3b9070e 100644 --- a/fe/src/main/java/com/cloudera/impala/planner/SubplanNode.java +++ b/fe/src/main/java/com/cloudera/impala/planner/SubplanNode.java @@ -74,6 +74,7 @@ public class SubplanNode extends PlanNode { } else { cardinality_ = -1; } + cardinality_ = capAtLimit(cardinality_); } @Override http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/fe/src/main/java/com/cloudera/impala/planner/UnionNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/com/cloudera/impala/planner/UnionNode.java b/fe/src/main/java/com/cloudera/impala/planner/UnionNode.java index 171451f..9c2ebf2 100644 --- a/fe/src/main/java/com/cloudera/impala/planner/UnionNode.java +++ b/fe/src/main/java/com/cloudera/impala/planner/UnionNode.java @@ -98,7 +98,7 @@ public class UnionNode extends PlanNode { // are inline views (e.g. select 1 FROM (VALUES(1 x, 1 y)) a FULL OUTER JOIN // (VALUES(1 x, 1 y)) b ON (a.x = b.y)). We need to set the correct value. if (numNodes_ == -1) numNodes_ = 1; - + cardinality_ = capAtLimit(cardinality_); LOG.debug("stats Union: cardinality=" + Long.toString(cardinality_)); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/fe/src/main/java/com/cloudera/impala/planner/UnnestNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/com/cloudera/impala/planner/UnnestNode.java b/fe/src/main/java/com/cloudera/impala/planner/UnnestNode.java index a817b03..0a0ee1b 100644 --- a/fe/src/main/java/com/cloudera/impala/planner/UnnestNode.java +++ b/fe/src/main/java/com/cloudera/impala/planner/UnnestNode.java @@ -67,6 +67,7 @@ public class UnnestNode extends PlanNode { // The containing SubplanNode has not yet been initialized, so get the number // of nodes from the SubplanNode's input. numNodes_ = containingSubplanNode_.getChild(0).getNumNodes(); + cardinality_ = capAtLimit(cardinality_); } @Override http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/fe/src/test/java/com/cloudera/impala/planner/PlannerTestBase.java ---------------------------------------------------------------------- diff --git a/fe/src/test/java/com/cloudera/impala/planner/PlannerTestBase.java b/fe/src/test/java/com/cloudera/impala/planner/PlannerTestBase.java index ce8980e..561032d 100644 --- a/fe/src/test/java/com/cloudera/impala/planner/PlannerTestBase.java +++ b/fe/src/test/java/com/cloudera/impala/planner/PlannerTestBase.java @@ -398,6 +398,7 @@ public class PlannerTestBase { testPlan(testCase, Section.PLAN, queryCtx, errorLog, actualOutput); checkScanRangeLocations(testCase, singleNodeExecRequest, errorLog, actualOutput); checkColumnLineage(testCase, singleNodeExecRequest, errorLog, actualOutput); + checkLimitCardinality(query, singleNodeExecRequest, errorLog); // Test distributed plan. testPlan(testCase, Section.DISTRIBUTEDPLAN, queryCtx, errorLog, actualOutput); // test parallel plans @@ -550,6 +551,38 @@ public class PlannerTestBase { } } + /** Checks that limits are accounted for in the cardinality of plan nodes. + */ + private void checkLimitCardinality( + String query, TExecRequest execRequest, StringBuilder errorLog) { + if (execRequest == null) return; + if (!execRequest.isSetQuery_exec_request() + || execRequest.query_exec_request == null) { + return; + } + for (TPlanFragment planFragment : execRequest.query_exec_request.fragments) { + if (!planFragment.isSetPlan() || planFragment.plan == null) continue; + for (TPlanNode node : planFragment.plan.nodes) { + if (!node.isSetLimit() || -1 == node.limit) continue; + if (!node.isSetEstimated_stats() || node.estimated_stats == null) continue; + if (node.limit < node.estimated_stats.cardinality) { + StringBuilder limitCardinalityError = new StringBuilder(); + limitCardinalityError.append("Query: " + query + "\n"); + limitCardinalityError.append( + "Expected cardinality estimate less than or equal to LIMIT: " + + node.limit + "\n"); + limitCardinalityError.append( + "Actual cardinality estimate: " + + node.estimated_stats.cardinality + "\n"); + limitCardinalityError.append( + "In node id " + + node.node_id + "\n"); + errorLog.append(limitCardinalityError.toString()); + } + } + } + } + private void checkColumnLineage(TestCase testCase, TExecRequest execRequest, StringBuilder errorLog, StringBuilder actualOutput) { String query = testCase.getQuery(); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/testdata/workloads/functional-planner/queries/PlannerTest/aggregation.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/aggregation.test b/testdata/workloads/functional-planner/queries/PlannerTest/aggregation.test index 250f85b..47bfb23 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/aggregation.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/aggregation.test @@ -1062,4 +1062,4 @@ limit 10 00:SCAN HDFS [tpch_parquet.lineitem] partitions=1/1 files=3 size=195.85MB runtime filters: RF002 -> l_orderkey, RF003 -> l_returnflag -==== +==== \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/testdata/workloads/functional-planner/queries/PlannerTest/inline-view-limit.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/inline-view-limit.test b/testdata/workloads/functional-planner/queries/PlannerTest/inline-view-limit.test index 98bc3a1..8d3a816 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/inline-view-limit.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/inline-view-limit.test @@ -613,3 +613,16 @@ where a.id > 10 and b.id > 20 predicates: id != 1, functional.alltypes.id != 2, functional.alltypes.id > 10, functional.alltypes.id > 20 runtime filters: RF000 -> id ==== +# IMPALA-3450: limits on select nodes are reflected in cardinality estimates. The test for +# this is embedded in PlannerTestBase.java and is not visible in these plans, as they only +# have explain_level=1 +select * from (select * from functional.alltypes limit 100) v where id < 10 limit 1 +---- PLAN +01:SELECT +| predicates: functional.alltypes.id < 10 +| limit: 1 +| +00:SCAN HDFS [functional.alltypes] + partitions=24/24 files=24 size=478.45KB + limit: 100 +==== \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test b/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test index ab2b23c..441afca 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test @@ -1275,12 +1275,6 @@ WHERE `$a$2`.`$c$1` > t4.id 08:NESTED LOOP JOIN [INNER JOIN] | predicates: sum(t1.int_col) > t4.id | -|--00:SCAN HDFS [functional.alltypestiny t4] -| partitions=4/4 files=4 size=460B -| runtime filters: RF000 -> t4.bigint_col -| -07:NESTED LOOP JOIN [CROSS JOIN] -| |--05:AGGREGATE [FINALIZE] | | output: sum(t1.int_col) | | limit: 1 @@ -1288,17 +1282,23 @@ WHERE `$a$2`.`$c$1` > t4.id | 04:SCAN HDFS [functional.alltypesagg t1] | partitions=11/11 files=11 size=814.73KB | -03:HASH JOIN [INNER JOIN] -| hash predicates: t1.bigint_col = t2.smallint_col -| runtime filters: RF001 <- t2.smallint_col -| limit: 1 +07:NESTED LOOP JOIN [CROSS JOIN] | -|--02:SCAN HDFS [functional.alltypestiny t2] -| partitions=4/4 files=4 size=460B +|--03:HASH JOIN [INNER JOIN] +| | hash predicates: t1.bigint_col = t2.smallint_col +| | runtime filters: RF001 <- t2.smallint_col +| | limit: 1 +| | +| |--02:SCAN HDFS [functional.alltypestiny t2] +| | partitions=4/4 files=4 size=460B +| | +| 01:SCAN HDFS [functional.alltypes t1] +| partitions=24/24 files=24 size=478.45KB +| runtime filters: RF001 -> t1.bigint_col | -01:SCAN HDFS [functional.alltypes t1] - partitions=24/24 files=24 size=478.45KB - runtime filters: RF001 -> t1.bigint_col +00:SCAN HDFS [functional.alltypestiny t4] + partitions=4/4 files=4 size=460B + runtime filters: RF000 -> t4.bigint_col ==== # Tests assignment of conjuncts to inverted outer joins (IMPALA-1342). select 1 @@ -1498,4 +1498,4 @@ and a.timestamp_col <=> now() partitions=24/24 files=24 size=478.45KB predicates: date_string_col IS NOT DISTINCT FROM '' runtime filters: RF000 -> functional.alltypes.id -==== +==== \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/testdata/workloads/functional-planner/queries/PlannerTest/joins.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/joins.test b/testdata/workloads/functional-planner/queries/PlannerTest/joins.test index 6280bf5..5d0e892 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/joins.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/joins.test @@ -2199,3 +2199,114 @@ and t3.a != t2.g 00:SCAN HDFS [functional.nulltable t1] partitions=1/1 files=1 size=18B ==== +# IMPALA-3450: limits on join nodes are reflected in cardinality estimates. The test for +# this is embedded in PlannerTestBase.java and is not visible in these plans, as they only +# have explain_level=1 +select a.c_custkey as c_custkey from tpch.customer a, tpch.customer b limit 1 +---- PLAN +02:NESTED LOOP JOIN [CROSS JOIN] +| limit: 1 +| +|--01:SCAN HDFS [tpch.customer b] +| partitions=1/1 files=1 size=23.08MB +| +00:SCAN HDFS [tpch.customer a] + partitions=1/1 files=1 size=23.08MB +==== +select a.c_custkey as c_custkey from tpch.customer a left semi join tpch.customer b +using (c_custkey) limit 1 +---- PLAN +02:HASH JOIN [LEFT SEMI JOIN] +| hash predicates: a.c_custkey = b.c_custkey +| runtime filters: RF000 <- b.c_custkey +| limit: 1 +| +|--01:SCAN HDFS [tpch.customer b] +| partitions=1/1 files=1 size=23.08MB +| +00:SCAN HDFS [tpch.customer a] + partitions=1/1 files=1 size=23.08MB + runtime filters: RF000 -> a.c_custkey +==== +select b.c_custkey as c_custkey from tpch.customer a right semi join tpch.customer b +using (c_custkey) limit 1 +---- PLAN +02:HASH JOIN [RIGHT SEMI JOIN] +| hash predicates: a.c_custkey = b.c_custkey +| runtime filters: RF000 <- b.c_custkey +| limit: 1 +| +|--01:SCAN HDFS [tpch.customer b] +| partitions=1/1 files=1 size=23.08MB +| +00:SCAN HDFS [tpch.customer a] + partitions=1/1 files=1 size=23.08MB + runtime filters: RF000 -> a.c_custkey +==== +select a.c_custkey as c_custkey from tpch.customer a left outer join tpch.customer b +using (c_custkey) limit 1 +---- PLAN +02:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: a.c_custkey = b.c_custkey +| limit: 1 +| +|--01:SCAN HDFS [tpch.customer b] +| partitions=1/1 files=1 size=23.08MB +| +00:SCAN HDFS [tpch.customer a] + partitions=1/1 files=1 size=23.08MB +==== +select b.c_custkey as c_custkey from tpch.customer a right outer join tpch.customer b +using (c_custkey) limit 1 +---- PLAN +02:HASH JOIN [RIGHT OUTER JOIN] +| hash predicates: a.c_custkey = b.c_custkey +| runtime filters: RF000 <- b.c_custkey +| limit: 1 +| +|--01:SCAN HDFS [tpch.customer b] +| partitions=1/1 files=1 size=23.08MB +| +00:SCAN HDFS [tpch.customer a] + partitions=1/1 files=1 size=23.08MB + runtime filters: RF000 -> a.c_custkey +==== +select a.c_custkey as c_custkey from tpch.customer a full outer join tpch.customer b +using (c_custkey) limit 1 +---- PLAN +02:HASH JOIN [FULL OUTER JOIN] +| hash predicates: a.c_custkey = b.c_custkey +| limit: 1 +| +|--01:SCAN HDFS [tpch.customer b] +| partitions=1/1 files=1 size=23.08MB +| +00:SCAN HDFS [tpch.customer a] + partitions=1/1 files=1 size=23.08MB +==== +select a.c_custkey as c_custkey from tpch.customer a left anti join tpch.customer b +using (c_custkey) limit 1 +---- PLAN +02:HASH JOIN [LEFT ANTI JOIN] +| hash predicates: a.c_custkey = b.c_custkey +| limit: 1 +| +|--01:SCAN HDFS [tpch.customer b] +| partitions=1/1 files=1 size=23.08MB +| +00:SCAN HDFS [tpch.customer a] + partitions=1/1 files=1 size=23.08MB +==== +select b.c_custkey as c_custkey from tpch.customer a right anti join tpch.customer b +using (c_custkey) limit 1 +---- PLAN +02:HASH JOIN [RIGHT ANTI JOIN] +| hash predicates: a.c_custkey = b.c_custkey +| limit: 1 +| +|--01:SCAN HDFS [tpch.customer b] +| partitions=1/1 files=1 size=23.08MB +| +00:SCAN HDFS [tpch.customer a] + partitions=1/1 files=1 size=23.08MB +==== http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test b/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test index 4ab84b3..ee0bce4 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test @@ -1772,4 +1772,4 @@ left semi join c2.c_orders o2 | 00:SCAN HDFS [tpch_nested_parquet.customer c1] partitions=1/1 files=4 size=577.87MB -==== +==== \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/testdata/workloads/functional-planner/queries/PlannerTest/union.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/union.test b/testdata/workloads/functional-planner/queries/PlannerTest/union.test index d87c351..86c59c7 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/union.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/union.test @@ -2661,3 +2661,31 @@ select 1000, 2000 06:SCAN HDFS [functional.alltypes] partitions=1/24 files=1 size=18.12KB ==== +# IMPALA-3450: limits on union nodes are reflected in cardinality estimates. The test for +# this is embedded in PlannerTestBase.java and is not visible in these plans, as they only +# have explain_level=1 +select * from tpch.lineitem UNION ALL (select * from tpch.lineitem) LIMIT 1 +---- PLAN +00:UNION +| limit: 1 +| +|--02:SCAN HDFS [tpch.lineitem] +| partitions=1/1 files=1 size=718.94MB +| +01:SCAN HDFS [tpch.lineitem] + partitions=1/1 files=1 size=718.94MB +==== +select l_orderkey from tpch.lineitem UNION DISTINCT (select l_orderkey from tpch.lineitem) LIMIT 1 +---- PLAN +03:AGGREGATE [FINALIZE] +| group by: l_orderkey +| limit: 1 +| +00:UNION +| +|--02:SCAN HDFS [tpch.lineitem] +| partitions=1/1 files=1 size=718.94MB +| +01:SCAN HDFS [tpch.lineitem] + partitions=1/1 files=1 size=718.94MB +==== \ No newline at end of file
