This is an automated email from the ASF dual-hosted git repository.

yjhjstz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git

commit c66c184aca24d375b9297d835c72853c3aaaee84
Author: 1mmortal <[email protected]>
AuthorDate: Tue Oct 11 15:01:21 2022 +0800

    Fix incorrect sortOp and eqOp generated by IsCorrelatedEqualityOpExpr. 
(#14108)
    
    In GPDB, expression subquery is promoted to join, e.g.
    select * from T where a > (select 10*avg(x) from R where T.b=R.y);
    in postgres, the plan is:
    
                              QUERY PLAN
    -----------------------------------------------------------------
     Seq Scan on t  (cost=0.00..80364.30 rows=753 width=8)
       Filter: ((a)::numeric > (SubPlan 1))
       SubPlan 1
         ->  Aggregate  (cost=35.53..35.54 rows=1 width=32)
               ->  Seq Scan on r  (cost=0.00..35.50 rows=10 width=4)
                     Filter: (t.b = y)
    
    while in GPDB, the subquery is promoted to join:
    
                                             QUERY PLAN
    
---------------------------------------------------------------------------------------------
     Gather Motion 3:1  (slice1; segments: 3)
       ->  Hash Join  (cost=477.00..969.11 rows=9567 width=8)
             Hash Cond: (t.b = "Expr_SUBQUERY".csq_c0)
             Join Filter: ((t.a)::numeric > "Expr_SUBQUERY".csq_c1)
             ->  Seq Scan on t  (cost=0.00..321.00 rows=28700 width=8)
             ->  Hash  (cost=472.83..472.83 rows=333 width=36)
                   ->  Subquery Scan on "Expr_SUBQUERY"
                         ->  HashAggregate
                               Group Key: r.y
                               ->  Seq Scan on r
     Optimizer: Postgres query optimizer
    
    But if the T.b and R.y are different type, it will report error like:
    ERROR:  operator 37 is not a valid ordering operator (pathkeys.c:579)
    
    The root cause is that IsCorrelatedEqualityOpExpr may generate incorrect 
sort
    operator and equal operator for SortGroupClause when the types of left and 
right
    operands of OpExpr are different.
    
    Since get_ordering_op_properties() requires the type of the left and right
    operands being consistent and SortGroupClause->tleSortGroupRef is 
referencing
    the target entry created from innerExpr, we teach 
IsCorrelatedEqualityOpExpr()
    construct appropriate eqOp and sortOp according to the type of innerExpr.
---
 src/backend/cdb/cdbsubselect.c                     |  29 ++--
 src/test/regress/expected/subselect_gp.out         | 186 +++++++++++++++++++++
 .../regress/expected/subselect_gp_optimizer.out    | 186 +++++++++++++++++++++
 src/test/regress/sql/subselect_gp.sql              |  83 +++++++++
 4 files changed, 470 insertions(+), 14 deletions(-)

diff --git a/src/backend/cdb/cdbsubselect.c b/src/backend/cdb/cdbsubselect.c
index 07fd8d0f6a..6180441c3e 100644
--- a/src/backend/cdb/cdbsubselect.c
+++ b/src/backend/cdb/cdbsubselect.c
@@ -148,13 +148,13 @@ IsCorrelatedOpExpr(OpExpr *opexp, Expr **innerExpr)
  *     returns true if correlated equality condition
  *     *innerExpr - points to the inner expr i.e. bar(innervar) in the 
condition
  *     *eqOp and *sortOp - equality and < operators, to implement the 
condition as a mergejoin.
+ *  The *eqOp and *sortOp should be determined according to innervar's type.
  */
 static bool
 IsCorrelatedEqualityOpExpr(OpExpr *opexp, Expr **innerExpr, Oid *eqOp, Oid 
*sortOp, bool *hashable)
 {
        Oid                     opfamily;
-       Oid                     ltype;
-       Oid                     rtype;
+       Oid                     innerExprType;
        List       *l;
 
        Assert(opexp);
@@ -163,11 +163,17 @@ IsCorrelatedEqualityOpExpr(OpExpr *opexp, Expr 
**innerExpr, Oid *eqOp, Oid *sort
        Assert(eqOp);
        Assert(sortOp);
 
+       if (!IsCorrelatedOpExpr(opexp, innerExpr))
+               return false;
+
+       Assert(*innerExpr);
+       innerExprType = exprType((Node *)*innerExpr);
+
        /*
         * If this is an expression of the form a = b, then we want to know 
about
         * the vars involved.
         */
-       if (!op_mergejoinable(opexp->opno, exprType(linitial(opexp->args))))
+       if (!op_mergejoinable(opexp->opno, innerExprType))
                return false;
 
        /*
@@ -182,24 +188,19 @@ IsCorrelatedEqualityOpExpr(OpExpr *opexp, Expr 
**innerExpr, Oid *eqOp, Oid *sort
        list_free(l);
 
        /*
-        * Look up the correct sort operator from the chosen opfamily.
+        * Look up the correct equility/sort operators from the chosen opfamily.
         */
-       ltype = exprType(linitial(opexp->args));
-       rtype = exprType(lsecond(opexp->args));
-       *eqOp = get_opfamily_member(opfamily, ltype, rtype, 
BTEqualStrategyNumber);
+       *eqOp = get_opfamily_member(opfamily, innerExprType, innerExprType, 
BTEqualStrategyNumber);
        if (!OidIsValid(*eqOp)) /* should not happen */
                elog(ERROR, "could not find member %d(%u,%u) of opfamily %u",
-                        BTEqualStrategyNumber, ltype, rtype, opfamily);
+                        BTEqualStrategyNumber, innerExprType, innerExprType, 
opfamily);
 
-       *sortOp = get_opfamily_member(opfamily, ltype, rtype, 
BTLessStrategyNumber);
+       *sortOp = get_opfamily_member(opfamily, innerExprType, innerExprType, 
BTLessStrategyNumber);
        if (!OidIsValid(*sortOp))       /* should not happen */
                elog(ERROR, "could not find member %d(%u,%u) of opfamily %u",
-                        BTLessStrategyNumber, ltype, rtype, opfamily);
+                        BTLessStrategyNumber, innerExprType, innerExprType, 
opfamily);
 
-       *hashable = op_hashjoinable(*eqOp, ltype);
-
-       if (!IsCorrelatedOpExpr(opexp, innerExpr))
-               return false;
+       *hashable = op_hashjoinable(*eqOp, innerExprType);
 
        return true;
 }
diff --git a/src/test/regress/expected/subselect_gp.out 
b/src/test/regress/expected/subselect_gp.out
index 966335ab41..2995f52a6d 100644
--- a/src/test/regress/expected/subselect_gp.out
+++ b/src/test/regress/expected/subselect_gp.out
@@ -3036,3 +3036,189 @@ select * from param_t a where a.i in
 (1 row)
 
 drop table if exists param_t;
+-- A guard test case for gpexpand's populate SQL
+-- Some simple notes and background is: we want to compute
+-- table size efficiently, it is better to avoid invoke
+-- pg_relation_size() in serial on QD, since this function
+-- will dispatch for each tuple. The bad pattern SQL is like
+--   select pg_relation_size(oid) from pg_class where xxx
+-- The idea is force pg_relations_size is evaluated on each
+-- segment and the sum the result together to get the final
+-- result. To make sure correctness, we have to evaluate
+-- pg_relation_size before any motion. The skill here is
+-- to wrap this in a subquery, due to volatile of pg_relation_size,
+-- this subquery won't be pulled up. Plus the skill of
+-- gp_dist_random('pg_class') we can achieve this goal.
+-- the below test is to verify the plan, we should see pg_relation_size
+-- is evaludated on each segment and then motion then sum together. The
+-- SQL pattern is a catalog join a table size "dict".
+set gp_enable_multiphase_agg = on;
+-- force nestloop join to make test stable since we
+-- are testing plan and do not care about where we
+-- put hash table.
+set enable_hashjoin = off;
+set enable_nestloop = on;
+set enable_indexscan = off;
+set enable_bitmapscan = off;
+explain (verbose on, costs off)
+with cte(table_oid, size) as
+(
+   select
+     table_oid,
+     sum(size) size
+   from (
+     select oid,
+          pg_relation_size(oid)
+     from gp_dist_random('pg_class')
+   ) x(table_oid, size)
+  group by table_oid
+)
+select pc.relname, ts.size
+from pg_class pc, cte ts
+where pc.oid = ts.table_oid;
+                                                   QUERY PLAN                  
                                 
+----------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   Output: pc.relname, (sum((pg_relation_size((pg_class.oid)::regclass, 
'main'::text))))
+   ->  Nested Loop
+         Output: pc.relname, (sum((pg_relation_size((pg_class.oid)::regclass, 
'main'::text))))
+         Join Filter: (pc.oid = pg_class.oid)
+         ->  Redistribute Motion 1:3  (slice2)
+               Output: pc.relname, pc.oid
+               Hash Key: pc.oid
+               ->  Seq Scan on pg_catalog.pg_class pc
+                     Output: pc.relname, pc.oid
+         ->  Materialize
+               Output: pg_class.oid, 
(sum((pg_relation_size((pg_class.oid)::regclass, 'main'::text))))
+               ->  HashAggregate
+                     Output: pg_class.oid, 
sum((pg_relation_size((pg_class.oid)::regclass, 'main'::text)))
+                     Group Key: pg_class.oid
+                     ->  Redistribute Motion 3:3  (slice3; segments: 3)
+                           Output: pg_class.oid, 
(pg_relation_size((pg_class.oid)::regclass, 'main'::text))
+                           Hash Key: pg_class.oid
+                           ->  Seq Scan on pg_catalog.pg_class
+                                 Output: pg_class.oid, 
pg_relation_size((pg_class.oid)::regclass, 'main'::text)
+ Optimizer: Postgres query optimizer
+(21 rows)
+
+set gp_enable_multiphase_agg = off;
+explain (verbose on, costs off)
+with cte(table_oid, size) as
+(
+   select
+     table_oid,
+     sum(size) size
+   from (
+     select oid,
+          pg_relation_size(oid)
+     from gp_dist_random('pg_class')
+   ) x(table_oid, size)
+  group by table_oid
+)
+select pc.relname, ts.size
+from pg_class pc, cte ts
+where pc.oid = ts.table_oid;
+                                                   QUERY PLAN                  
                                 
+----------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   Output: pc.relname, (sum((pg_relation_size((pg_class.oid)::regclass, 
'main'::text))))
+   ->  Nested Loop
+         Output: pc.relname, (sum((pg_relation_size((pg_class.oid)::regclass, 
'main'::text))))
+         Join Filter: (pc.oid = pg_class.oid)
+         ->  Redistribute Motion 1:3  (slice2)
+               Output: pc.relname, pc.oid
+               Hash Key: pc.oid
+               ->  Seq Scan on pg_catalog.pg_class pc
+                     Output: pc.relname, pc.oid
+         ->  Materialize
+               Output: pg_class.oid, 
(sum((pg_relation_size((pg_class.oid)::regclass, 'main'::text))))
+               ->  HashAggregate
+                     Output: pg_class.oid, 
sum((pg_relation_size((pg_class.oid)::regclass, 'main'::text)))
+                     Group Key: pg_class.oid
+                     ->  Redistribute Motion 3:3  (slice3; segments: 3)
+                           Output: pg_class.oid, 
(pg_relation_size((pg_class.oid)::regclass, 'main'::text))
+                           Hash Key: pg_class.oid
+                           ->  Seq Scan on pg_catalog.pg_class
+                                 Output: pg_class.oid, 
pg_relation_size((pg_class.oid)::regclass, 'main'::text)
+ Optimizer: Postgres query optimizer
+(21 rows)
+
+reset gp_enable_multiphase_agg;
+reset enable_hashjoin;
+reset enable_nestloop;
+reset enable_indexscan;
+reset enable_bitmapscan;
+create table sublink_outer_table(a int, b int) distributed by(b);
+create table sublink_inner_table(x int, y bigint) distributed by(y);
+set optimizer to off;
+explain select t.* from sublink_outer_table t join (select y ,10*avg(x) s from 
sublink_inner_table group by y) RR on RR.y = t.b and t.a > rr.s;
+                                            QUERY PLAN                         
                   
+--------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=436.00..1310.77 rows=28700 
width=8)
+   ->  Hash Join  (cost=436.00..928.11 rows=9567 width=8)
+         Hash Cond: (t.b = sublink_inner_table.y)
+         Join Filter: ((t.a)::numeric > (('10'::numeric * 
avg(sublink_inner_table.x))))
+         ->  Seq Scan on sublink_outer_table t  (cost=0.00..321.00 rows=28700 
width=8)
+         ->  Hash  (cost=431.83..431.83 rows=333 width=40)
+               ->  HashAggregate  (cost=423.50..428.50 rows=333 width=40)
+                     Group Key: sublink_inner_table.y
+                     ->  Seq Scan on sublink_inner_table  (cost=0.00..293.67 
rows=25967 width=12)
+ Optimizer: Postgres query optimizer
+(10 rows)
+
+explain select * from sublink_outer_table T where a > (select 10*avg(x) from 
sublink_inner_table R where T.b=R.y);
+                                                QUERY PLAN                     
                           
+----------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=436.00..1310.77 rows=28700 
width=8)
+   ->  Hash Join  (cost=436.00..928.11 rows=9567 width=8)
+         Hash Cond: (t.b = "Expr_SUBQUERY".csq_c0)
+         Join Filter: ((t.a)::numeric > "Expr_SUBQUERY".csq_c1)
+         ->  Seq Scan on sublink_outer_table t  (cost=0.00..321.00 rows=28700 
width=8)
+         ->  Hash  (cost=431.83..431.83 rows=333 width=40)
+               ->  Subquery Scan on "Expr_SUBQUERY"  (cost=423.50..431.83 
rows=333 width=40)
+                     ->  HashAggregate  (cost=423.50..428.50 rows=333 width=40)
+                           Group Key: r.y
+                           ->  Seq Scan on sublink_inner_table r  
(cost=0.00..293.67 rows=25967 width=12)
+ Optimizer: Postgres query optimizer
+(11 rows)
+
+set enable_hashagg to off;
+explain select t.* from sublink_outer_table t join (select y ,10*avg(x) s from 
sublink_inner_table group by y) RR on RR.y = t.b and t.a > rr.s;
+                                               QUERY PLAN                      
                         
+--------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=2404.84..3279.62 rows=28700 
width=8)
+   ->  Hash Join  (cost=2404.84..2896.95 rows=9567 width=8)
+         Hash Cond: (t.b = sublink_inner_table.y)
+         Join Filter: ((t.a)::numeric > (('10'::numeric * 
avg(sublink_inner_table.x))))
+         ->  Seq Scan on sublink_outer_table t  (cost=0.00..321.00 rows=28700 
width=8)
+         ->  Hash  (cost=2400.67..2400.67 rows=333 width=40)
+               ->  GroupAggregate  (cost=2197.59..2397.34 rows=333 width=40)
+                     Group Key: sublink_inner_table.y
+                     ->  Sort  (cost=2197.59..2262.51 rows=25967 width=12)
+                           Sort Key: sublink_inner_table.y
+                           ->  Seq Scan on sublink_inner_table  
(cost=0.00..293.67 rows=25967 width=12)
+ Optimizer: Postgres query optimizer
+(12 rows)
+
+explain select * from sublink_outer_table T where a > (select 10*avg(x) from 
sublink_inner_table R where T.b=R.y);
+                                                   QUERY PLAN                  
                                 
+----------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=2404.84..3279.62 rows=28700 
width=8)
+   ->  Hash Join  (cost=2404.84..2896.95 rows=9567 width=8)
+         Hash Cond: (t.b = "Expr_SUBQUERY".csq_c0)
+         Join Filter: ((t.a)::numeric > "Expr_SUBQUERY".csq_c1)
+         ->  Seq Scan on sublink_outer_table t  (cost=0.00..321.00 rows=28700 
width=8)
+         ->  Hash  (cost=2400.67..2400.67 rows=333 width=40)
+               ->  Subquery Scan on "Expr_SUBQUERY"  (cost=2197.59..2400.67 
rows=333 width=40)
+                     ->  GroupAggregate  (cost=2197.59..2397.34 rows=333 
width=40)
+                           Group Key: r.y
+                           ->  Sort  (cost=2197.59..2262.51 rows=25967 
width=12)
+                                 Sort Key: r.y
+                                 ->  Seq Scan on sublink_inner_table r  
(cost=0.00..293.67 rows=25967 width=12)
+ Optimizer: Postgres query optimizer
+(13 rows)
+
+drop table sublink_outer_table;
+drop table sublink_inner_table;
+reset optimizer;
+reset enable_hashagg;
diff --git a/src/test/regress/expected/subselect_gp_optimizer.out 
b/src/test/regress/expected/subselect_gp_optimizer.out
index 533abf961d..8aba40fb65 100644
--- a/src/test/regress/expected/subselect_gp_optimizer.out
+++ b/src/test/regress/expected/subselect_gp_optimizer.out
@@ -3125,3 +3125,189 @@ select * from param_t a where a.i in
 (1 row)
 
 drop table if exists param_t;
+-- A guard test case for gpexpand's populate SQL
+-- Some simple notes and background is: we want to compute
+-- table size efficiently, it is better to avoid invoke
+-- pg_relation_size() in serial on QD, since this function
+-- will dispatch for each tuple. The bad pattern SQL is like
+--   select pg_relation_size(oid) from pg_class where xxx
+-- The idea is force pg_relations_size is evaluated on each
+-- segment and the sum the result together to get the final
+-- result. To make sure correctness, we have to evaluate
+-- pg_relation_size before any motion. The skill here is
+-- to wrap this in a subquery, due to volatile of pg_relation_size,
+-- this subquery won't be pulled up. Plus the skill of
+-- gp_dist_random('pg_class') we can achieve this goal.
+-- the below test is to verify the plan, we should see pg_relation_size
+-- is evaludated on each segment and then motion then sum together. The
+-- SQL pattern is a catalog join a table size "dict".
+set gp_enable_multiphase_agg = on;
+-- force nestloop join to make test stable since we
+-- are testing plan and do not care about where we
+-- put hash table.
+set enable_hashjoin = off;
+set enable_nestloop = on;
+set enable_indexscan = off;
+set enable_bitmapscan = off;
+explain (verbose on, costs off)
+with cte(table_oid, size) as
+(
+   select
+     table_oid,
+     sum(size) size
+   from (
+     select oid,
+          pg_relation_size(oid)
+     from gp_dist_random('pg_class')
+   ) x(table_oid, size)
+  group by table_oid
+)
+select pc.relname, ts.size
+from pg_class pc, cte ts
+where pc.oid = ts.table_oid;
+                                                   QUERY PLAN                  
                                 
+----------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   Output: pc.relname, (sum((pg_relation_size((pg_class.oid)::regclass, 
'main'::text))))
+   ->  Nested Loop
+         Output: pc.relname, (sum((pg_relation_size((pg_class.oid)::regclass, 
'main'::text))))
+         Join Filter: (pc.oid = pg_class.oid)
+         ->  Redistribute Motion 1:3  (slice2)
+               Output: pc.relname, pc.oid
+               Hash Key: pc.oid
+               ->  Seq Scan on pg_catalog.pg_class pc
+                     Output: pc.relname, pc.oid
+         ->  Materialize
+               Output: pg_class.oid, 
(sum((pg_relation_size((pg_class.oid)::regclass, 'main'::text))))
+               ->  HashAggregate
+                     Output: pg_class.oid, 
sum((pg_relation_size((pg_class.oid)::regclass, 'main'::text)))
+                     Group Key: pg_class.oid
+                     ->  Redistribute Motion 3:3  (slice3; segments: 3)
+                           Output: pg_class.oid, 
(pg_relation_size((pg_class.oid)::regclass, 'main'::text))
+                           Hash Key: pg_class.oid
+                           ->  Seq Scan on pg_catalog.pg_class
+                                 Output: pg_class.oid, 
pg_relation_size((pg_class.oid)::regclass, 'main'::text)
+ Optimizer: Postgres query optimizer
+(21 rows)
+
+set gp_enable_multiphase_agg = off;
+explain (verbose on, costs off)
+with cte(table_oid, size) as
+(
+   select
+     table_oid,
+     sum(size) size
+   from (
+     select oid,
+          pg_relation_size(oid)
+     from gp_dist_random('pg_class')
+   ) x(table_oid, size)
+  group by table_oid
+)
+select pc.relname, ts.size
+from pg_class pc, cte ts
+where pc.oid = ts.table_oid;
+                                                   QUERY PLAN                  
                                 
+----------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   Output: pc.relname, (sum((pg_relation_size((pg_class.oid)::regclass, 
'main'::text))))
+   ->  Nested Loop
+         Output: pc.relname, (sum((pg_relation_size((pg_class.oid)::regclass, 
'main'::text))))
+         Join Filter: (pc.oid = pg_class.oid)
+         ->  Redistribute Motion 1:3  (slice2)
+               Output: pc.relname, pc.oid
+               Hash Key: pc.oid
+               ->  Seq Scan on pg_catalog.pg_class pc
+                     Output: pc.relname, pc.oid
+         ->  Materialize
+               Output: pg_class.oid, 
(sum((pg_relation_size((pg_class.oid)::regclass, 'main'::text))))
+               ->  HashAggregate
+                     Output: pg_class.oid, 
sum((pg_relation_size((pg_class.oid)::regclass, 'main'::text)))
+                     Group Key: pg_class.oid
+                     ->  Redistribute Motion 3:3  (slice3; segments: 3)
+                           Output: pg_class.oid, 
(pg_relation_size((pg_class.oid)::regclass, 'main'::text))
+                           Hash Key: pg_class.oid
+                           ->  Seq Scan on pg_catalog.pg_class
+                                 Output: pg_class.oid, 
pg_relation_size((pg_class.oid)::regclass, 'main'::text)
+ Optimizer: Postgres query optimizer
+(21 rows)
+
+reset gp_enable_multiphase_agg;
+reset enable_hashjoin;
+reset enable_nestloop;
+reset enable_indexscan;
+reset enable_bitmapscan;
+create table sublink_outer_table(a int, b int) distributed by(b);
+create table sublink_inner_table(x int, y bigint) distributed by(y);
+set optimizer to off;
+explain select t.* from sublink_outer_table t join (select y ,10*avg(x) s from 
sublink_inner_table group by y) RR on RR.y = t.b and t.a > rr.s;
+                                            QUERY PLAN                         
                   
+--------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=436.00..1310.77 rows=28700 
width=8)
+   ->  Hash Join  (cost=436.00..928.11 rows=9567 width=8)
+         Hash Cond: (t.b = sublink_inner_table.y)
+         Join Filter: ((t.a)::numeric > (('10'::numeric * 
avg(sublink_inner_table.x))))
+         ->  Seq Scan on sublink_outer_table t  (cost=0.00..321.00 rows=28700 
width=8)
+         ->  Hash  (cost=431.83..431.83 rows=333 width=40)
+               ->  HashAggregate  (cost=423.50..428.50 rows=333 width=40)
+                     Group Key: sublink_inner_table.y
+                     ->  Seq Scan on sublink_inner_table  (cost=0.00..293.67 
rows=25967 width=12)
+ Optimizer: Postgres query optimizer
+(10 rows)
+
+explain select * from sublink_outer_table T where a > (select 10*avg(x) from 
sublink_inner_table R where T.b=R.y);
+                                                QUERY PLAN                     
                           
+----------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=436.00..1310.77 rows=28700 
width=8)
+   ->  Hash Join  (cost=436.00..928.11 rows=9567 width=8)
+         Hash Cond: (t.b = "Expr_SUBQUERY".csq_c0)
+         Join Filter: ((t.a)::numeric > "Expr_SUBQUERY".csq_c1)
+         ->  Seq Scan on sublink_outer_table t  (cost=0.00..321.00 rows=28700 
width=8)
+         ->  Hash  (cost=431.83..431.83 rows=333 width=40)
+               ->  Subquery Scan on "Expr_SUBQUERY"  (cost=423.50..431.83 
rows=333 width=40)
+                     ->  HashAggregate  (cost=423.50..428.50 rows=333 width=40)
+                           Group Key: r.y
+                           ->  Seq Scan on sublink_inner_table r  
(cost=0.00..293.67 rows=25967 width=12)
+ Optimizer: Postgres query optimizer
+(11 rows)
+
+set enable_hashagg to off;
+explain select t.* from sublink_outer_table t join (select y ,10*avg(x) s from 
sublink_inner_table group by y) RR on RR.y = t.b and t.a > rr.s;
+                                               QUERY PLAN                      
                         
+--------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=2404.84..3279.62 rows=28700 
width=8)
+   ->  Hash Join  (cost=2404.84..2896.95 rows=9567 width=8)
+         Hash Cond: (t.b = sublink_inner_table.y)
+         Join Filter: ((t.a)::numeric > (('10'::numeric * 
avg(sublink_inner_table.x))))
+         ->  Seq Scan on sublink_outer_table t  (cost=0.00..321.00 rows=28700 
width=8)
+         ->  Hash  (cost=2400.67..2400.67 rows=333 width=40)
+               ->  GroupAggregate  (cost=2197.59..2397.34 rows=333 width=40)
+                     Group Key: sublink_inner_table.y
+                     ->  Sort  (cost=2197.59..2262.51 rows=25967 width=12)
+                           Sort Key: sublink_inner_table.y
+                           ->  Seq Scan on sublink_inner_table  
(cost=0.00..293.67 rows=25967 width=12)
+ Optimizer: Postgres query optimizer
+(12 rows)
+
+explain select * from sublink_outer_table T where a > (select 10*avg(x) from 
sublink_inner_table R where T.b=R.y);
+                                                   QUERY PLAN                  
                                 
+----------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=2404.84..3279.62 rows=28700 
width=8)
+   ->  Hash Join  (cost=2404.84..2896.95 rows=9567 width=8)
+         Hash Cond: (t.b = "Expr_SUBQUERY".csq_c0)
+         Join Filter: ((t.a)::numeric > "Expr_SUBQUERY".csq_c1)
+         ->  Seq Scan on sublink_outer_table t  (cost=0.00..321.00 rows=28700 
width=8)
+         ->  Hash  (cost=2400.67..2400.67 rows=333 width=40)
+               ->  Subquery Scan on "Expr_SUBQUERY"  (cost=2197.59..2400.67 
rows=333 width=40)
+                     ->  GroupAggregate  (cost=2197.59..2397.34 rows=333 
width=40)
+                           Group Key: r.y
+                           ->  Sort  (cost=2197.59..2262.51 rows=25967 
width=12)
+                                 Sort Key: r.y
+                                 ->  Seq Scan on sublink_inner_table r  
(cost=0.00..293.67 rows=25967 width=12)
+ Optimizer: Postgres query optimizer
+(13 rows)
+
+drop table sublink_outer_table;
+drop table sublink_inner_table;
+reset optimizer;
+reset enable_hashagg;
diff --git a/src/test/regress/sql/subselect_gp.sql 
b/src/test/regress/sql/subselect_gp.sql
index 5391e4747c..de95b3d355 100644
--- a/src/test/regress/sql/subselect_gp.sql
+++ b/src/test/regress/sql/subselect_gp.sql
@@ -1214,3 +1214,86 @@ select * from param_t a where a.i in
 
 
 drop table if exists param_t;
+
+-- A guard test case for gpexpand's populate SQL
+-- Some simple notes and background is: we want to compute
+-- table size efficiently, it is better to avoid invoke
+-- pg_relation_size() in serial on QD, since this function
+-- will dispatch for each tuple. The bad pattern SQL is like
+--   select pg_relation_size(oid) from pg_class where xxx
+-- The idea is force pg_relations_size is evaluated on each
+-- segment and the sum the result together to get the final
+-- result. To make sure correctness, we have to evaluate
+-- pg_relation_size before any motion. The skill here is
+-- to wrap this in a subquery, due to volatile of pg_relation_size,
+-- this subquery won't be pulled up. Plus the skill of
+-- gp_dist_random('pg_class') we can achieve this goal.
+
+-- the below test is to verify the plan, we should see pg_relation_size
+-- is evaludated on each segment and then motion then sum together. The
+-- SQL pattern is a catalog join a table size "dict".
+
+set gp_enable_multiphase_agg = on;
+-- force nestloop join to make test stable since we
+-- are testing plan and do not care about where we
+-- put hash table.
+set enable_hashjoin = off;
+set enable_nestloop = on;
+set enable_indexscan = off;
+set enable_bitmapscan = off;
+
+explain (verbose on, costs off)
+with cte(table_oid, size) as
+(
+   select
+     table_oid,
+     sum(size) size
+   from (
+     select oid,
+          pg_relation_size(oid)
+     from gp_dist_random('pg_class')
+   ) x(table_oid, size)
+  group by table_oid
+)
+select pc.relname, ts.size
+from pg_class pc, cte ts
+where pc.oid = ts.table_oid;
+
+set gp_enable_multiphase_agg = off;
+
+explain (verbose on, costs off)
+with cte(table_oid, size) as
+(
+   select
+     table_oid,
+     sum(size) size
+   from (
+     select oid,
+          pg_relation_size(oid)
+     from gp_dist_random('pg_class')
+   ) x(table_oid, size)
+  group by table_oid
+)
+select pc.relname, ts.size
+from pg_class pc, cte ts
+where pc.oid = ts.table_oid;
+
+reset gp_enable_multiphase_agg;
+reset enable_hashjoin;
+reset enable_nestloop;
+reset enable_indexscan;
+reset enable_bitmapscan;
+create table sublink_outer_table(a int, b int) distributed by(b);
+create table sublink_inner_table(x int, y bigint) distributed by(y);
+
+set optimizer to off;
+explain select t.* from sublink_outer_table t join (select y ,10*avg(x) s from 
sublink_inner_table group by y) RR on RR.y = t.b and t.a > rr.s;
+explain select * from sublink_outer_table T where a > (select 10*avg(x) from 
sublink_inner_table R where T.b=R.y);
+
+set enable_hashagg to off;
+explain select t.* from sublink_outer_table t join (select y ,10*avg(x) s from 
sublink_inner_table group by y) RR on RR.y = t.b and t.a > rr.s;
+explain select * from sublink_outer_table T where a > (select 10*avg(x) from 
sublink_inner_table R where T.b=R.y);
+drop table sublink_outer_table;
+drop table sublink_inner_table;
+reset optimizer;
+reset enable_hashagg;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to