IMPALA-6388: Fix the Union node number of hosts estimation Before this patch, we would estimate the number of hosts for the union node by looking only at the first union operand. This is obviously incorrect and led us to underestimate the value.
We fix the problem by setting the estimate to be the maximum of its children. Testing: - Added a planner test that reproduces the issue Change-Id: I51e1ecca8dbc84b2b5a72708667b2799d00279f0 Reviewed-on: http://gerrit.cloudera.org:8080/9017 Reviewed-by: Tim Armstrong <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/f8b40622 Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/f8b40622 Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/f8b40622 Branch: refs/heads/master Commit: f8b406222de8f41765ef1d130e2debbd8ab06369 Parents: 4c43cac Author: Taras Bobrovytsky <[email protected]> Authored: Thu Jan 11 17:01:07 2018 -0800 Committer: Impala Public Jenkins <[email protected]> Committed: Wed Jan 17 01:41:16 2018 +0000 ---------------------------------------------------------------------- .../org/apache/impala/planner/UnionNode.java | 1 + .../queries/PlannerTest/union.test | 88 ++++++++++++++++++++ .../workloads/tpch/queries/insert_parquet.test | 2 + .../tpch/queries/tpch-aggregations.test | 2 + 4 files changed, 93 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/f8b40622/fe/src/main/java/org/apache/impala/planner/UnionNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/planner/UnionNode.java b/fe/src/main/java/org/apache/impala/planner/UnionNode.java index 302f62d..52ce508 100644 --- a/fe/src/main/java/org/apache/impala/planner/UnionNode.java +++ b/fe/src/main/java/org/apache/impala/planner/UnionNode.java @@ -117,6 +117,7 @@ public class UnionNode extends PlanNode { if (child.cardinality_ > 0) { cardinality_ = checkedAdd(cardinality_, child.cardinality_); } + numNodes_ = Math.max(child.getNumNodes(), numNodes_); } // The number of nodes of a union node is -1 (invalid) if all the 
referenced tables // are inline views (e.g. select 1 FROM (VALUES(1 x, 1 y)) a FULL OUTER JOIN http://git-wip-us.apache.org/repos/asf/impala/blob/f8b40622/testdata/workloads/functional-planner/queries/PlannerTest/union.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/union.test b/testdata/workloads/functional-planner/queries/PlannerTest/union.test index 4ebfb6b..a0d0c26 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/union.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/union.test @@ -3215,3 +3215,91 @@ PLAN-ROOT SINK 00:SCAN HDFS [tpch_nested_parquet.customer c] partitions=1/1 files=4 size=292.36MB ==== +# IMPALA-6388: Verify that the order of the union operands does not impact the +# number of hosts computation +select f2 from functional.emptytable +union all +select int_col from functional.alltypes +---- QUERYOPTIONS +explain_level=2 +---- DISTRIBUTEDPLAN +F03:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1 +| Per-Host Resources: mem-estimate=0B mem-reservation=0B +PLAN-ROOT SINK +| mem-estimate=0B mem-reservation=0B +| +03:EXCHANGE [UNPARTITIONED] +| mem-estimate=0B mem-reservation=0B +| tuple-ids=2 row-size=4B cardinality=7300 +| +F02:PLAN FRAGMENT [RANDOM] hosts=3 instances=3 +Per-Host Resources: mem-estimate=128.00MB mem-reservation=0B +00:UNION +| pass-through-operands: all +| mem-estimate=0B mem-reservation=0B +| tuple-ids=2 row-size=4B cardinality=7300 +| +|--02:SCAN HDFS [functional.alltypes, RANDOM] +| partitions=24/24 files=24 size=478.45KB +| stored statistics: +| table: rows=7300 size=478.45KB +| partitions: 24/24 rows=7300 +| columns: all +| extrapolated-rows=disabled +| mem-estimate=128.00MB mem-reservation=0B +| tuple-ids=1 row-size=4B cardinality=7300 +| +01:SCAN HDFS [functional.emptytable, RANDOM] + partitions=0/0 files=0 size=0B + stored statistics: + table: rows=unavailable size=unavailable + 
partitions: 0/0 rows=unavailable + columns: all + extrapolated-rows=disabled + mem-estimate=0B mem-reservation=0B + tuple-ids=0 row-size=4B cardinality=0 +==== +# IMPALA-6388: Verify that the order of the union operands does not impact the +# number of hosts computation +select int_col from functional.alltypes +union all +select f2 from functional.emptytable +---- QUERYOPTIONS +explain_level=2 +---- DISTRIBUTEDPLAN +F03:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1 +| Per-Host Resources: mem-estimate=0B mem-reservation=0B +PLAN-ROOT SINK +| mem-estimate=0B mem-reservation=0B +| +03:EXCHANGE [UNPARTITIONED] +| mem-estimate=0B mem-reservation=0B +| tuple-ids=2 row-size=4B cardinality=7300 +| +F02:PLAN FRAGMENT [RANDOM] hosts=3 instances=3 +Per-Host Resources: mem-estimate=128.00MB mem-reservation=0B +00:UNION +| pass-through-operands: all +| mem-estimate=0B mem-reservation=0B +| tuple-ids=2 row-size=4B cardinality=7300 +| +|--02:SCAN HDFS [functional.emptytable, RANDOM] +| partitions=0/0 files=0 size=0B +| stored statistics: +| table: rows=unavailable size=unavailable +| partitions: 0/0 rows=unavailable +| columns: all +| extrapolated-rows=disabled +| mem-estimate=0B mem-reservation=0B +| tuple-ids=1 row-size=4B cardinality=0 +| +01:SCAN HDFS [functional.alltypes, RANDOM] + partitions=24/24 files=24 size=478.45KB + stored statistics: + table: rows=7300 size=478.45KB + partitions: 24/24 rows=7300 + columns: all + extrapolated-rows=disabled + mem-estimate=128.00MB mem-reservation=0B + tuple-ids=0 row-size=4B cardinality=7300 +==== http://git-wip-us.apache.org/repos/asf/impala/blob/f8b40622/testdata/workloads/tpch/queries/insert_parquet.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/tpch/queries/insert_parquet.test b/testdata/workloads/tpch/queries/insert_parquet.test index 862548e..23c945d 100644 --- a/testdata/workloads/tpch/queries/insert_parquet.test +++ 
b/testdata/workloads/tpch/queries/insert_parquet.test @@ -57,6 +57,8 @@ bigint # Test to verify that huge (larger than 64k) values can be written, see IMPALA-1705 create table if not exists test_insert_huge_vals (s string) stored as parquet location '$FILESYSTEM_PREFIX/test-warehouse/$DATABASE.db/test_insert_huge_vals'; +# Large rows are possible in this query, so we bump up the max row size. +set max_row_size=1048576; insert overwrite table test_insert_huge_vals select cast(l_orderkey as string) from tpch.lineitem union select group_concat(concat(s_name, s_address, s_phone)) from tpch.supplier http://git-wip-us.apache.org/repos/asf/impala/blob/f8b40622/testdata/workloads/tpch/queries/tpch-aggregations.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/tpch/queries/tpch-aggregations.test b/testdata/workloads/tpch/queries/tpch-aggregations.test index 462c3b8..58575ad 100644 --- a/testdata/workloads/tpch/queries/tpch-aggregations.test +++ b/testdata/workloads/tpch/queries/tpch-aggregations.test @@ -2,6 +2,8 @@ ---- QUERY # Regression test for IMPALA-2352. Small buffers of streams get full and we # can switch to IO-buffers without spilling. +# Large rows are possible in this query, so we bump up the max row size. +set max_row_size=1048576; SELECT count(*) FROM (SELECT cast(l_orderkey as string) s FROM tpch.lineitem UNION SELECT group_concat(concat(s_name, s_address, s_phone)) FROM tpch.supplier) v
