This is an automated email from the ASF dual-hosted git repository. lv pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit c659b78198a767b91c293cbaf77f5c8b269fba39 Author: Tim Armstrong <tarmstr...@cloudera.com> AuthorDate: Mon Feb 18 22:53:53 2019 -0800 IMPALA-8214: Fix bad plan in load_nested.py The previous plan had the larger input on the build side of the join and did a broadcast join, which is very suboptimal. This speeds up data loading on my minicluster - 18s vs 31s and has a more significant impact on a real cluster, where queries execute much faster, the memory requirement is significantly reduced and the data loading can potentially be broken up into fewer chunks. I also considered computing stats on the table to let Impala generate the same plan, but this achieves the same goal more efficiently. Testing: Run core tests. Resource estimates in planner tests changed slightly because of the different distribution of data. Change-Id: I55e0ca09590a90ba530efe4e8f8bf587dde3eeeb Reviewed-on: http://gerrit.cloudera.org:8080/12519 Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> --- testdata/bin/load_nested.py | 6 +++--- .../queries/PlannerTest/mt-dop-validation.test | 16 ++++++++-------- .../queries/PlannerTest/tpch-nested.test | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/testdata/bin/load_nested.py b/testdata/bin/load_nested.py index d391fdb..93d8339 100755 --- a/testdata/bin/load_nested.py +++ b/testdata/bin/load_nested.py @@ -104,7 +104,7 @@ def load(): for chunk_idx in xrange(chunks): sql_params["chunk_idx"] = chunk_idx tmp_customer_sql = r""" - SELECT + SELECT STRAIGHT_JOIN c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_mktsegment, c_comment, GROUP_CONCAT( @@ -120,8 +120,8 @@ def load(): CAST(lineitems_string AS STRING) ), '\002' ) orders_string - FROM {source_db}.customer - LEFT JOIN tmp_orders_string ON c_custkey = o_custkey + FROM tmp_orders_string + RIGHT JOIN [SHUFFLE] {source_db}.customer ON c_custkey = o_custkey WHERE c_custkey % {chunks} = {chunk_idx} GROUP BY 1, 2, 3, 4, 5, 6, 7, 8""".format(**sql_params) LOG.info("Creating temp customers (chunk {chunk} of {chunks})".format( diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test b/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test index 5d340f3..f749a4e 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test @@ -219,7 +219,7 @@ from tpch_nested_parquet.customer c, c.c_orders o, o.o_lineitems where c_custkey < 10 and o_orderkey < 5 and l_linenumber < 3 ---- PLAN F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1 -| Per-Host Resources: mem-estimate=88.00MB mem-reservation=88.00MB thread-reservation=1 +| Per-Host Resources: mem-estimate=104.00MB mem-reservation=104.00MB thread-reservation=1 PLAN-ROOT SINK | mem-estimate=0B mem-reservation=0B thread-reservation=0 | @@ -275,14 +275,14 @@ PLAN-ROOT SINK stored statistics: table: rows=150000 size=288.99MB columns missing stats: c_orders - extrapolated-rows=disabled max-scan-range-rows=44225 + extrapolated-rows=disabled max-scan-range-rows=50116 parquet statistics predicates: c_custkey < CAST(10 AS BIGINT) parquet statistics predicates on o: o_orderkey < CAST(5 AS BIGINT) parquet statistics predicates on o_lineitems: l_linenumber < CAST(3 AS INT) parquet dictionary predicates: c_custkey < CAST(10 AS BIGINT) parquet dictionary predicates on o: o_orderkey < CAST(5 AS BIGINT) parquet dictionary predicates on o_lineitems: l_linenumber < CAST(3 AS INT) - mem-estimate=88.00MB mem-reservation=88.00MB thread-reservation=0 + mem-estimate=104.00MB mem-reservation=104.00MB thread-reservation=0 tuple-ids=0 row-size=230B cardinality=15.00K in pipelines: 00(GETNEXT) ---- PARALLELPLANS @@ -297,7 +297,7 @@ PLAN-ROOT SINK | in pipelines: 00(GETNEXT) | F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=9 -Per-Host Resources: mem-estimate=264.00MB mem-reservation=264.00MB thread-reservation=3 +Per-Host Resources: mem-estimate=312.00MB mem-reservation=312.00MB thread-reservation=3 01:SUBPLAN | mem-estimate=0B mem-reservation=0B thread-reservation=0 | tuple-ids=2,1,0 row-size=482B cardinality=1.50M @@ -350,14 +350,14 @@ Per-Host Resources: mem-estimate=264.00MB mem-reservation=264.00MB thread-reserv stored statistics: table: rows=150000 size=288.99MB columns missing stats: c_orders - extrapolated-rows=disabled max-scan-range-rows=44225 + extrapolated-rows=disabled max-scan-range-rows=50116 parquet statistics predicates: c_custkey < CAST(10 AS BIGINT) parquet statistics predicates on o: o_orderkey < CAST(5 AS BIGINT) parquet statistics predicates on o_lineitems: l_linenumber < CAST(3 AS INT) parquet dictionary predicates: c_custkey < CAST(10 AS BIGINT) parquet dictionary predicates on o: o_orderkey < CAST(5 AS BIGINT) parquet dictionary predicates on o_lineitems: l_linenumber < CAST(3 AS INT) - mem-estimate=88.00MB mem-reservation=88.00MB thread-reservation=0 + mem-estimate=104.00MB mem-reservation=104.00MB thread-reservation=0 tuple-ids=0 row-size=230B cardinality=15.00K in pipelines: 00(GETNEXT) ==== @@ -413,7 +413,7 @@ PLAN-ROOT SINK stored statistics: table: rows=150000 size=288.99MB columns missing stats: c_orders, c_orders - extrapolated-rows=disabled max-scan-range-rows=44225 + extrapolated-rows=disabled max-scan-range-rows=50116 parquet statistics predicates on o1: o1.o_orderkey < CAST(5 AS BIGINT) parquet dictionary predicates on o1: o1.o_orderkey < CAST(5 AS BIGINT) mem-estimate=88.00MB mem-reservation=16.00MB thread-reservation=0 @@ -474,7 +474,7 @@ Per-Host Resources: mem-estimate=269.81MB mem-reservation=53.81MB thread-reserva stored statistics: table: rows=150000 size=288.99MB columns missing stats: c_orders, c_orders - extrapolated-rows=disabled max-scan-range-rows=44225 + extrapolated-rows=disabled max-scan-range-rows=50116 parquet statistics predicates on o1: o1.o_orderkey < CAST(5 AS BIGINT) parquet dictionary predicates on o1: o1.o_orderkey < CAST(5 AS BIGINT) mem-estimate=88.00MB mem-reservation=16.00MB thread-reservation=0 diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-nested.test b/testdata/workloads/functional-planner/queries/PlannerTest/tpch-nested.test index bd2c6d1..fd6363a 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-nested.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/tpch-nested.test @@ -3192,7 +3192,7 @@ PLAN-ROOT SINK predicates: substr(c_phone, 1, 2) IN ('13', '31', '23', '29', '30', '18', '17') row-size=47B cardinality=15.00K ---- DISTRIBUTEDPLAN -Max Per-Host Resource Reservation: Memory=25.94MB Threads=8 +Max Per-Host Resource Reservation: Memory=27.94MB Threads=8 Per-Host Resource Estimates: Memory=688MB PLAN-ROOT SINK |