This is an automated email from the ASF dual-hosted git repository.

lv pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit c659b78198a767b91c293cbaf77f5c8b269fba39
Author: Tim Armstrong <tarmstr...@cloudera.com>
AuthorDate: Mon Feb 18 22:53:53 2019 -0800

    IMPALA-8214: Fix bad plan in load_nested.py
    
    The previous plan placed the larger input on the build side of the join
    and used a broadcast join, which is very suboptimal.
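
    As a rough sketch of how the two hints used in the fix behave (the table
    names below are hypothetical, not part of this change): STRAIGHT_JOIN
    keeps the tables in the order they appear in the FROM clause, so the
    right-hand input of each join becomes the hash-join build side, and
    [SHUFFLE] requests a partitioned join instead of a broadcast join.

        -- Sketch only; wide_orders and small_customers are made-up tables.
        SELECT STRAIGHT_JOIN s.c_custkey, count(*)
        FROM wide_orders w                        -- larger input stays on the probe side
        RIGHT JOIN [SHUFFLE] small_customers s    -- smaller input becomes the build side
          ON s.c_custkey = w.o_custkey
        GROUP BY s.c_custkey;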
    
    This speeds up data loading on my minicluster (18s vs 31s) and has a
    more significant impact on a real cluster, where the queries execute
    much faster, the memory requirement is significantly reduced, and the
    data loading can potentially be broken up into fewer chunks.
    
    I also considered computing stats on the table to let Impala generate
    the same plan, but this achieves the same goal more efficiently.
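
    For reference, that alternative would have looked roughly like the
    statement below (sketch only; tmp_orders_string is the intermediate
    table from the diff), at the cost of an extra pass over the data:

        -- Alternative that was considered but not applied.
        COMPUTE STATS tmp_orders_string;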
    
    Testing:
    Run core tests. Resource estimates in planner tests changed slightly
    because of the different distribution of data.
    
    Change-Id: I55e0ca09590a90ba530efe4e8f8bf587dde3eeeb
    Reviewed-on: http://gerrit.cloudera.org:8080/12519
    Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
---
 testdata/bin/load_nested.py                              |  6 +++---
 .../queries/PlannerTest/mt-dop-validation.test           | 16 ++++++++--------
 .../queries/PlannerTest/tpch-nested.test                 |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/testdata/bin/load_nested.py b/testdata/bin/load_nested.py
index d391fdb..93d8339 100755
--- a/testdata/bin/load_nested.py
+++ b/testdata/bin/load_nested.py
@@ -104,7 +104,7 @@ def load():
     for chunk_idx in xrange(chunks):
       sql_params["chunk_idx"] = chunk_idx
       tmp_customer_sql = r"""
-          SELECT
+          SELECT STRAIGHT_JOIN
             c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_mktsegment,
             c_comment,
             GROUP_CONCAT(
@@ -120,8 +120,8 @@ def load():
                 CAST(lineitems_string AS STRING)
               ), '\002'
             ) orders_string
-          FROM {source_db}.customer
-          LEFT JOIN tmp_orders_string ON c_custkey = o_custkey
+          FROM tmp_orders_string
+          RIGHT JOIN [SHUFFLE] {source_db}.customer ON c_custkey = o_custkey
           WHERE c_custkey % {chunks} = {chunk_idx}
           GROUP BY 1, 2, 3, 4, 5, 6, 7, 8""".format(**sql_params)
       LOG.info("Creating temp customers (chunk {chunk} of {chunks})".format(
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test b/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test
index 5d340f3..f749a4e 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test
@@ -219,7 +219,7 @@ from tpch_nested_parquet.customer c, c.c_orders o, o.o_lineitems
 where c_custkey < 10 and o_orderkey < 5 and l_linenumber < 3
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=88.00MB mem-reservation=88.00MB thread-reservation=1
+|  Per-Host Resources: mem-estimate=104.00MB mem-reservation=104.00MB thread-reservation=1
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B thread-reservation=0
 |
@@ -275,14 +275,14 @@ PLAN-ROOT SINK
    stored statistics:
      table: rows=150000 size=288.99MB
      columns missing stats: c_orders
-   extrapolated-rows=disabled max-scan-range-rows=44225
+   extrapolated-rows=disabled max-scan-range-rows=50116
    parquet statistics predicates: c_custkey < CAST(10 AS BIGINT)
    parquet statistics predicates on o: o_orderkey < CAST(5 AS BIGINT)
    parquet statistics predicates on o_lineitems: l_linenumber < CAST(3 AS INT)
    parquet dictionary predicates: c_custkey < CAST(10 AS BIGINT)
    parquet dictionary predicates on o: o_orderkey < CAST(5 AS BIGINT)
    parquet dictionary predicates on o_lineitems: l_linenumber < CAST(3 AS INT)
-   mem-estimate=88.00MB mem-reservation=88.00MB thread-reservation=0
+   mem-estimate=104.00MB mem-reservation=104.00MB thread-reservation=0
    tuple-ids=0 row-size=230B cardinality=15.00K
    in pipelines: 00(GETNEXT)
 ---- PARALLELPLANS
@@ -297,7 +297,7 @@ PLAN-ROOT SINK
 |  in pipelines: 00(GETNEXT)
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=9
-Per-Host Resources: mem-estimate=264.00MB mem-reservation=264.00MB thread-reservation=3
+Per-Host Resources: mem-estimate=312.00MB mem-reservation=312.00MB thread-reservation=3
 01:SUBPLAN
 |  mem-estimate=0B mem-reservation=0B thread-reservation=0
 |  tuple-ids=2,1,0 row-size=482B cardinality=1.50M
@@ -350,14 +350,14 @@ Per-Host Resources: mem-estimate=264.00MB mem-reservation=264.00MB thread-reserv
    stored statistics:
      table: rows=150000 size=288.99MB
      columns missing stats: c_orders
-   extrapolated-rows=disabled max-scan-range-rows=44225
+   extrapolated-rows=disabled max-scan-range-rows=50116
    parquet statistics predicates: c_custkey < CAST(10 AS BIGINT)
    parquet statistics predicates on o: o_orderkey < CAST(5 AS BIGINT)
    parquet statistics predicates on o_lineitems: l_linenumber < CAST(3 AS INT)
    parquet dictionary predicates: c_custkey < CAST(10 AS BIGINT)
    parquet dictionary predicates on o: o_orderkey < CAST(5 AS BIGINT)
    parquet dictionary predicates on o_lineitems: l_linenumber < CAST(3 AS INT)
-   mem-estimate=88.00MB mem-reservation=88.00MB thread-reservation=0
+   mem-estimate=104.00MB mem-reservation=104.00MB thread-reservation=0
    tuple-ids=0 row-size=230B cardinality=15.00K
    in pipelines: 00(GETNEXT)
 ====
@@ -413,7 +413,7 @@ PLAN-ROOT SINK
    stored statistics:
      table: rows=150000 size=288.99MB
      columns missing stats: c_orders, c_orders
-   extrapolated-rows=disabled max-scan-range-rows=44225
+   extrapolated-rows=disabled max-scan-range-rows=50116
    parquet statistics predicates on o1: o1.o_orderkey < CAST(5 AS BIGINT)
    parquet dictionary predicates on o1: o1.o_orderkey < CAST(5 AS BIGINT)
    mem-estimate=88.00MB mem-reservation=16.00MB thread-reservation=0
@@ -474,7 +474,7 @@ Per-Host Resources: mem-estimate=269.81MB mem-reservation=53.81MB thread-reserva
    stored statistics:
      table: rows=150000 size=288.99MB
      columns missing stats: c_orders, c_orders
-   extrapolated-rows=disabled max-scan-range-rows=44225
+   extrapolated-rows=disabled max-scan-range-rows=50116
    parquet statistics predicates on o1: o1.o_orderkey < CAST(5 AS BIGINT)
    parquet dictionary predicates on o1: o1.o_orderkey < CAST(5 AS BIGINT)
    mem-estimate=88.00MB mem-reservation=16.00MB thread-reservation=0
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-nested.test b/testdata/workloads/functional-planner/queries/PlannerTest/tpch-nested.test
index bd2c6d1..fd6363a 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-nested.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/tpch-nested.test
@@ -3192,7 +3192,7 @@ PLAN-ROOT SINK
    predicates: substr(c_phone, 1, 2) IN ('13', '31', '23', '29', '30', '18', '17')
    row-size=47B cardinality=15.00K
 ---- DISTRIBUTEDPLAN
-Max Per-Host Resource Reservation: Memory=25.94MB Threads=8
+Max Per-Host Resource Reservation: Memory=27.94MB Threads=8
 Per-Host Resource Estimates: Memory=688MB
 PLAN-ROOT SINK
 |
