(cloudberry) 19/41: Bring back cdbsubselect_drop_distinct. (#14322)

yjhjstz Wed, 01 Jan 2025 19:12:27 -0800

This is an automated email from the ASF dual-hosted git repository.

yjhjstz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git


commit ede16fd9b9ae429847be8f3942b0a6bf9a1b574d
Author: Aegeaner <[email protected]>
AuthorDate: Thu Oct 27 17:08:47 2022 +0800

    Bring back cdbsubselect_drop_distinct. (#14322)
    
    Previous commit ebb691a03 is to fix ANY sublink contains distinct-on
    syntax on partial exprs in the target list of subselect. That fix
    method simply not to drop distinct or order-by clause and might lead
    to some sublink cannot be pulled up and bad performance, below is an
    example of such regression:
    
      select a from t where a in (select b from t1 group by b)
    
    For the above SQL, if we do not drop the group-by clause, then later
    planner will take the subselect as not simple and fail to pull it up
    to a join.
    
    This commit fixes the regression by:
    
    1. revert the logic of ebb691a03
    2. fix the issue #12656 using a correct way: if
       the distinct on clause's length is not the
       same as the length of subselect's targetlist,
       we do not drop the distinct clause.
    
    Authored-by: Jian Guo <[email protected]>
    
    Co-authored-by: Zhenghua Lyu <[email protected]>
---
 src/backend/cdb/cdbsubselect.c                     |  22 +++-
 src/backend/optimizer/plan/subselect.c             |  28 ++---
 src/test/regress/expected/notin.out                |  34 +++---
 src/test/regress/expected/notin_optimizer.out      |  34 +++---
 src/test/regress/expected/rpt.out                  | 114 +++++++++------------
 src/test/regress/expected/rpt_optimizer.out        |  21 ++--
 src/test/regress/expected/subselect.out            |  10 +-
 src/test/regress/expected/subselect_gp.out         |  35 +++++--
 .../regress/expected/subselect_gp_optimizer.out    |  64 ++++++++----
 src/test/regress/expected/subselect_optimizer.out  |   8 +-
 src/test/regress/sql/subselect_gp.sql              |   7 ++
 src/test/singlenode_regress/expected/subselect.out |   7 +-
 .../singlenode_regress/expected/subselect_gp.out   |  16 +--
 13 files changed, 213 insertions(+), 187 deletions(-)

diff --git a/src/backend/cdb/cdbsubselect.c b/src/backend/cdb/cdbsubselect.c
index ae305e1e4b..9622aa2d74 100644
--- a/src/backend/cdb/cdbsubselect.c
+++ b/src/backend/cdb/cdbsubselect.c
@@ -466,7 +466,9 @@ cdbsubselect_drop_distinct(Query *subselect)
                subselect->limitOffset == NULL)
        {
                /* Delete DISTINCT. */
-               subselect->distinctClause = NIL;
+               if (!subselect->hasDistinctOn ||
+                       list_length(subselect->distinctClause) == 
list_length(subselect->targetList))
+                       subselect->distinctClause = NIL;
 
                /* Delete GROUP BY if subquery has no aggregates and no HAVING. 
*/
                if (!subselect->hasAggs &&
@@ -485,7 +487,9 @@ cdbsubselect_drop_orderby(Query *subselect)
                subselect->limitOffset == NULL)
        {
                /* Delete ORDER BY. */
-               subselect->sortClause = NIL;
+               if (!subselect->hasDistinctOn ||
+                       list_length(subselect->distinctClause) == 
list_length(subselect->targetList))
+                       subselect->sortClause = NIL;
        }
 }      /* cdbsubselect_drop_orderby */
 
@@ -1500,6 +1504,20 @@ convert_IN_to_antijoin(PlannerInfo *root, SubLink 
*sublink,
 
        if (safe_to_convert_NOTIN(root, sublink, available_rels))
        {
+               /* Delete ORDER BY and DISTINCT.
+                *
+                * There is no need to do the group-by or order-by inside the
+                * subquery, if we have decided to pull up the sublink. For the
+                * group-by case, after the sublink pull-up, there will be a 
semi-join
+                * plan node generated in top level, which will weed out 
duplicate
+                * tuples naturally. For the order-by case, after the sublink 
pull-up,
+                * the subquery will become a jointree, inside which the 
tuples' order
+                * doesn't matter. In a summary, it's safe to elimate the 
group-by or
+                * order-by causes here.
+                */
+               cdbsubselect_drop_orderby(subselect);
+               cdbsubselect_drop_distinct(subselect);
+
                int                     subq_indx      = 
add_notin_subquery_rte(parse, subselect);
                List       *inner_exprs    = NIL;
                List       *outer_exprs    = NIL;
diff --git a/src/backend/optimizer/plan/subselect.c 
b/src/backend/optimizer/plan/subselect.c
index 4975d2cc6b..538e3da6e8 100644
--- a/src/backend/optimizer/plan/subselect.c
+++ b/src/backend/optimizer/plan/subselect.c
@@ -1538,6 +1538,20 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink 
*sublink,
        Assert(sublink->subLinkType == ANY_SUBLINK);
        Assert(IsA(subselect, Query));
 
+       /* Delete ORDER BY and DISTINCT.
+        *
+        * There is no need to do the group-by or order-by inside the
+        * subquery, if we have decided to pull up the sublink. For the
+        * group-by case, after the sublink pull-up, there will be a semi-join
+        * plan node generated in top level, which will weed out duplicate
+        * tuples naturally. For the order-by case, after the sublink pull-up,
+        * the subquery will become a jointree, inside which the tuples' order
+        * doesn't matter. In a summary, it's safe to elimate the group-by or
+        * order-by causes here.
+       */
+       cdbsubselect_drop_orderby(subselect);
+       cdbsubselect_drop_distinct(subselect);
+
        /*
         * If uncorrelated, and no Var nodes on lhs, the subquery will be 
executed
         * only once.  It should become an InitPlan, but make_subplan() doesn't
@@ -1548,20 +1562,6 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink 
*sublink,
 
        if (correlated)
        {
-               /* Delete ORDER BY and DISTINCT. 
-                *
-                * There is no need to do the group-by or order-by inside the
-                * subquery, if we have decided to pull up the sublink (when the
-                * subquery is correlated). For the group-by case, after the 
sublink
-                * pull-up, there will be a semi-join plan node generated in top
-                * level, which will weed out duplicate tuples naturally. For 
the
-                * order-by case, after the sublink pull-up, the subquery will 
become
-                * a jointree, inside which the tuples' order doesn't matter. 
In a
-                * summary, it's safe to elimate the group-by or order-by 
causes here.
-               */
-               cdbsubselect_drop_orderby(subselect);
-               cdbsubselect_drop_distinct(subselect);
-
                /*
                 * Under certain conditions, we cannot pull up the subquery as 
a join.
                 */
diff --git a/src/test/regress/expected/notin.out 
b/src/test/regress/expected/notin.out
index 97ce4dce01..7baac477d8 100644
--- a/src/test/regress/expected/notin.out
+++ b/src/test/regress/expected/notin.out
@@ -400,30 +400,26 @@ select a,b from g1 where (a,b) not in
 --
 explain select x,y from l1 where (x,y) not in
        (select distinct y, sum(x) from l1 group by y having y < 4 order by y) 
order by 1,2;
-                                                         QUERY PLAN            
                                             
-----------------------------------------------------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)  
(cost=10000000002.35..10000000002.40 rows=3 width=8)
+                                                         QUERY PLAN            
                                              
+-----------------------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice3; segments: 3)  
(cost=10000000008.02..10000000008.03 rows=4 width=8)
    Merge Key: l1.x, l1.y
    ->  Sort  (cost=10000000002.35..10000000002.36 rows=1 width=8)
          Sort Key: l1.x, l1.y
-         ->  Nested Loop Left Anti Semi (Not-In) Join  
(cost=10000000001.07..10000000002.34 rows=1 width=8)
+         ->  Nested Loop Left Anti Semi (Not-In) Join  
(cost=10000000003.25..10000000007.99 rows=2 width=8)
                Join Filter: ((l1.x = "NotIn_SUBQUERY".y) AND (l1.y = 
"NotIn_SUBQUERY".sum))
-               ->  Seq Scan on l1  (cost=0.00..1.03 rows=3 width=8)
-               ->  Materialize  (cost=1.09..1.16 rows=3 width=12)
-                     ->  Broadcast Motion 3:3  (slice2; segments: 3)  
(cost=1.09..1.15 rows=3 width=12)
-                           ->  Subquery Scan on "NotIn_SUBQUERY"  
(cost=1.09..1.11 rows=1 width=12)
-                                 ->  Unique  (cost=1.09..1.10 rows=1 width=16)
-                                       Group Key: l1_1.y, (sum(l1_1.x))
-                                       ->  Sort  (cost=1.09..1.09 rows=1 
width=16)
-                                             Sort Key: l1_1.y, (sum(l1_1.x))
-                                             ->  HashAggregate  
(cost=1.07..1.08 rows=1 width=16)
-                                                   Group Key: l1_1.y
-                                                   ->  Redistribute Motion 3:3 
 (slice3; segments: 3)  (cost=0.00..1.06 rows=1 width=8)
-                                                         Hash Key: l1_1.y
-                                                         ->  Seq Scan on l1 
l1_1  (cost=0.00..1.04 rows=1 width=8)
-                                                               Filter: (y < 4)
+               ->  Seq Scan on l1  (cost=0.00..3.10 rows=4 width=8)
+               ->  Materialize  (cost=3.25..3.47 rows=3 width=12)
+                     ->  Broadcast Motion 3:3  (slice2; segments: 3)  
(cost=3.25..3.43 rows=3 width=12)
+                           ->  Subquery Scan on "NotIn_SUBQUERY"  
(cost=3.25..3.31 rows=1 width=12)
+                                 ->  HashAggregate  (cost=3.25..3.28 rows=1 
width=16)
+                                       Group Key: l1_1.y
+                                       ->  Redistribute Motion 3:3  (slice1; 
segments: 3)  (cost=3.14..3.20 rows=1 width=12)
+                                             Hash Key: l1_1.y
+                                                   ->  Seq Scan on l1 l1_1  
(cost=0.00..3.12 rows=2 width=8)
+                                                         Filter: (y < 4)
  Optimizer: Postgres query optimizer
-(21 rows)
+(17 rows)
 
 select x,y from l1 where (x,y) not in
        (select distinct y, sum(x) from l1 group by y having y < 4 order by y) 
order by 1,2;
diff --git a/src/test/regress/expected/notin_optimizer.out 
b/src/test/regress/expected/notin_optimizer.out
index 02a114d083..46052b7e4f 100644
--- a/src/test/regress/expected/notin_optimizer.out
+++ b/src/test/regress/expected/notin_optimizer.out
@@ -407,30 +407,26 @@ select a,b from g1 where (a,b) not in
 --
 explain select x,y from l1 where (x,y) not in
        (select distinct y, sum(x) from l1 group by y having y < 4 order by y) 
order by 1,2;
-                                                         QUERY PLAN            
                                             
-----------------------------------------------------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)  
(cost=10000000002.35..10000000002.40 rows=3 width=8)
+                                                         QUERY PLAN            
                                              
+-----------------------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice3; segments: 3)  
(cost=10000000008.02..10000000008.03 rows=4 width=8)
    Merge Key: l1.x, l1.y
    ->  Sort  (cost=10000000002.35..10000000002.36 rows=1 width=8)
          Sort Key: l1.x, l1.y
-         ->  Nested Loop Left Anti Semi (Not-In) Join  
(cost=10000000001.07..10000000002.34 rows=1 width=8)
+         ->  Nested Loop Left Anti Semi (Not-In) Join  
(cost=10000000003.25..10000000007.99 rows=2 width=8)
                Join Filter: ((l1.x = "NotIn_SUBQUERY".y) AND (l1.y = 
"NotIn_SUBQUERY".sum))
-               ->  Seq Scan on l1  (cost=0.00..1.03 rows=3 width=8)
-               ->  Materialize  (cost=1.09..1.16 rows=3 width=12)
-                     ->  Broadcast Motion 3:3  (slice2; segments: 3)  
(cost=1.09..1.15 rows=3 width=12)
-                           ->  Subquery Scan on "NotIn_SUBQUERY"  
(cost=1.09..1.11 rows=1 width=12)
-                                 ->  Unique  (cost=1.09..1.10 rows=1 width=16)
-                                       Group Key: l1_1.y, (sum(l1_1.x))
-                                       ->  Sort  (cost=1.09..1.09 rows=1 
width=16)
-                                             Sort Key: l1_1.y, (sum(l1_1.x))
-                                             ->  HashAggregate  
(cost=1.07..1.08 rows=1 width=16)
-                                                   Group Key: l1_1.y
-                                                   ->  Redistribute Motion 3:3 
 (slice3; segments: 3)  (cost=0.00..1.06 rows=1 width=8)
-                                                         Hash Key: l1_1.y
-                                                         ->  Seq Scan on l1 
l1_1  (cost=0.00..1.04 rows=1 width=8)
-                                                               Filter: (y < 4)
+               ->  Seq Scan on l1  (cost=0.00..3.10 rows=4 width=8)
+               ->  Materialize  (cost=3.25..3.47 rows=3 width=12)
+                     ->  Broadcast Motion 3:3  (slice2; segments: 3)  
(cost=3.25..3.43 rows=3 width=12)
+                           ->  Subquery Scan on "NotIn_SUBQUERY"  
(cost=3.25..3.31 rows=1 width=12)
+                                 ->  HashAggregate  (cost=3.25..3.28 rows=1 
width=16)
+                                       Group Key: l1_1.y
+                                       ->  Redistribute Motion 3:3  (slice1; 
segments: 3)  (cost=3.14..3.20 rows=1 width=12)
+                                             Hash Key: l1_1.y
+                                                   ->  Seq Scan on l1 l1_1  
(cost=0.00..3.12 rows=2 width=8)
+                                                         Filter: (y < 4)
  Optimizer: Postgres query optimizer
-(21 rows)
+(19 rows)
 
 select x,y from l1 where (x,y) not in
        (select distinct y, sum(x) from l1 group by y having y < 4 order by y) 
order by 1,2;
diff --git a/src/test/regress/expected/rpt.out 
b/src/test/regress/expected/rpt.out
index 367ebd5b40..389a5f63c0 100644
--- a/src/test/regress/expected/rpt.out
+++ b/src/test/regress/expected/rpt.out
@@ -1104,15 +1104,13 @@ explain select b from dist_tab where b in (select 
distinct c from rep_tab);
 
---------------------------------------------------------------------------------------------------
  Gather Motion 3:1  (slice1; segments: 3)  
(cost=10000000001.16..10000000021.44 rows=4 width=4)
    ->  Nested Loop  (cost=10000000001.16..10000000021.39 rows=1 width=4)
-         ->  Unique  (cost=10000000001.03..10000000001.04 rows=2 width=4)
+         ->  HashAggregate  (cost=10000000001.02..10000000001.03 rows=2 
width=4)
                Group Key: rep_tab.c
-               ->  Sort  (cost=10000000001.03..10000000001.03 rows=2 width=4)
-                     Sort Key: rep_tab.c
-                     ->  Seq Scan on rep_tab  
(cost=10000000000.00..10000000001.02 rows=2 width=4)
+               ->  Seq Scan on rep_tab  (cost=10000000000.00..10000000001.02 
rows=2 width=4)
          ->  Index Only Scan using idx on dist_tab  (cost=0.13..10.16 rows=1 
width=4)
                Index Cond: (b = rep_tab.c)
  Optimizer: Postgres query optimizer
-(10 rows)
+(8 rows)
 
 select b from dist_tab where b in (select distinct c from rep_tab);
  b 
@@ -1135,20 +1133,16 @@ analyze rand_tab;
 --
 -- join derives EdtHashed
 explain select c from rep_tab where c in (select distinct c from rep_tab);
-                                                    QUERY PLAN                 
                                    
--------------------------------------------------------------------------------------------------------------------
- Gather Motion 1:1  (slice1; segments: 1)  
(cost=20000000002.14..20000000002.14 rows=3 width=4)
-   ->  Hash Join  (cost=20000000001.08..20000000002.14 rows=3 width=4)
+                                              QUERY PLAN                       
                        
+-------------------------------------------------------------------------------------------------------
+ Gather Motion 1:1  (slice1; segments: 1)  
(cost=20000000001.05..20000000002.11 rows=4 width=4)
+   ->  Hash Semi Join  (cost=20000000001.05..20000000002.11 rows=4 width=4)
          Hash Cond: (rep_tab.c = rep_tab_1.c)
          ->  Seq Scan on rep_tab  (cost=10000000000.00..10000000001.02 rows=2 
width=4)
-         ->  Hash  (cost=10000000001.06..10000000001.06 rows=2 width=4)
-               ->  Unique  (cost=10000000001.03..10000000001.04 rows=2 width=4)
-                     Group Key: rep_tab_1.c
-                     ->  Sort  (cost=10000000001.03..10000000001.03 rows=2 
width=4)
-                           Sort Key: rep_tab_1.c
-                           ->  Seq Scan on rep_tab rep_tab_1  
(cost=10000000000.00..10000000001.02 rows=2 width=4)
+         ->  Hash  (cost=10000000001.02..10000000001.02 rows=1 width=4)
+               ->  Seq Scan on rep_tab rep_tab_1  
(cost=10000000000.00..10000000001.02 rows=2 width=4)
  Optimizer: Postgres query optimizer
-(11 rows)
+(7 rows)
 
 select c from rep_tab where c in (select distinct c from rep_tab);
  c 
@@ -1163,20 +1157,16 @@ select c from rep_tab where c in (select distinct c 
from rep_tab);
 --
 -- join derives EdtHashed
 explain select a from dist_tab where a in (select distinct c from rep_tab);
-                                               QUERY PLAN                      
                          
----------------------------------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)  
(cost=20000000001.08..20000000002.17 rows=4 width=4)
-   ->  Hash Join  (cost=20000000001.08..20000000002.12 rows=1 width=4)
+                                           QUERY PLAN                          
                 
+------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  
(cost=20000000001.05..20000000003.14 rows=4 width=4)
+   ->  Hash Semi Join  (cost=20000000001.05..20000000003.14 rows=2 width=4)
          Hash Cond: (dist_tab.a = rep_tab.c)
-         ->  Seq Scan on dist_tab  (cost=10000000000.00..10000000001.01 rows=1 
width=4)
-         ->  Hash  (cost=10000000001.06..10000000001.06 rows=2 width=4)
-               ->  Unique  (cost=10000000001.03..10000000001.04 rows=2 width=4)
-                     Group Key: rep_tab.c
-                     ->  Sort  (cost=10000000001.03..10000000001.03 rows=2 
width=4)
-                           Sort Key: rep_tab.c
-                           ->  Seq Scan on rep_tab  
(cost=10000000000.00..10000000001.02 rows=2 width=4)
+         ->  Seq Scan on dist_tab  (cost=10000000000.00..10000000002.04 rows=2 
width=4)
+         ->  Hash  (cost=10000000001.02..10000000001.02 rows=1 width=4)
+               ->  Seq Scan on rep_tab  (cost=10000000000.00..10000000001.02 
rows=2 width=4)
  Optimizer: Postgres query optimizer
-(11 rows)
+(7 rows)
 
 select a from dist_tab where a in (select distinct c from rep_tab);
  a 
@@ -1193,20 +1183,16 @@ select a from dist_tab where a in (select distinct c 
from rep_tab);
 --
 -- join derives EdtRandom
 explain select d from rand_tab where d in (select distinct c from rep_tab);
-                                               QUERY PLAN                      
                          
----------------------------------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)  
(cost=20000000001.08..20000000002.15 rows=3 width=4)
-   ->  Hash Join  (cost=20000000001.08..20000000002.11 rows=1 width=4)
+                                           QUERY PLAN                          
                 
+------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  
(cost=20000000001.05..20000000002.11 rows=4 width=4)
+   ->  Hash Semi Join  (cost=20000000001.05..20000000002.11 rows=2 width=4)
          Hash Cond: (rand_tab.d = rep_tab.c)
-         ->  Seq Scan on rand_tab  (cost=10000000000.00..10000000001.01 rows=1 
width=4)
-         ->  Hash  (cost=10000000001.06..10000000001.06 rows=2 width=4)
-               ->  Unique  (cost=10000000001.03..10000000001.04 rows=2 width=4)
-                     Group Key: rep_tab.c
-                     ->  Sort  (cost=10000000001.03..10000000001.03 rows=2 
width=4)
-                           Sort Key: rep_tab.c
-                           ->  Seq Scan on rep_tab  
(cost=10000000000.00..10000000001.02 rows=2 width=4)
+         ->  Seq Scan on rand_tab  (cost=10000000000.00..10000000001.02 rows=1 
width=4)
+         ->  Hash  (cost=10000000001.02..10000000001.02 rows=1 width=4)
+               ->  Seq Scan on rep_tab  (cost=10000000000.00..10000000001.02 
rows=2 width=4)
  Optimizer: Postgres query optimizer
-(11 rows)
+(7 rows)
 
 select d from rand_tab where d in (select distinct c from rep_tab);
  d 
@@ -1221,20 +1207,18 @@ select d from rand_tab where d in (select distinct c 
from rep_tab);
 --
 -- join derives EdtHashed
 explain select c from rep_tab where c in (select distinct a from dist_tab);
-                                                QUERY PLAN                     
                           
-----------------------------------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)  
(cost=20000000001.05..20000000002.14 rows=3 width=4)
-   ->  Hash Join  (cost=20000000001.05..20000000002.09 rows=1 width=4)
+                                                 QUERY PLAN                    
                             
+------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  
(cost=20000000001.03..20000000002.14 rows=3 width=4)
+   ->  Hash Semi Join  (cost=20000000001.03..20000000002.09 rows=1 width=4)
          Hash Cond: (rep_tab.c = dist_tab.a)
-         ->  Seq Scan on rep_tab  (cost=10000000000.00..10000000001.02 rows=2 
width=4)
-         ->  Hash  (cost=10000000001.04..10000000001.04 rows=1 width=4)
-               ->  Unique  (cost=10000000001.02..10000000001.03 rows=1 width=4)
-                     Group Key: dist_tab.a
-                     ->  Sort  (cost=10000000001.02..10000000001.03 rows=1 
width=4)
-                           Sort Key: dist_tab.a
-                           ->  Seq Scan on dist_tab  
(cost=10000000000.00..10000000001.01 rows=1 width=4)
+         ->  Redistribute Motion 1:3  (slice2; segments: 1)  
(cost=10000000000.00..10000000001.05 rows=1 width=4)
+               Hash Key: rep_tab.c
+               ->  Seq Scan on rep_tab  (cost=10000000000.00..10000000001.02 
rows=2 width=4)
+         ->  Hash  (cost=10000000001.01..10000000001.01 rows=1 width=4)
+               ->  Seq Scan on dist_tab  (cost=10000000000.00..10000000001.01 
rows=1 width=4)
  Optimizer: Postgres query optimizer
-(11 rows)
+(9 rows)
 
 select c from rep_tab where c in (select distinct a from dist_tab);
  c 
@@ -1249,22 +1233,20 @@ select c from rep_tab where c in (select distinct a 
from dist_tab);
 --
 -- join derives EdtHashed
 explain select c from rep_tab where c in (select distinct d from rand_tab);
-                                                             QUERY PLAN        
                                                     
-------------------------------------------------------------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)  
(cost=20000000001.07..20000000002.15 rows=3 width=4)
-   ->  Hash Join  (cost=20000000001.07..20000000002.11 rows=1 width=4)
+                                                 QUERY PLAN                    
                             
+------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  
(cost=20000000001.04..20000000002.15 rows=3 width=4)
+   ->  Hash Semi Join  (cost=20000000001.04..20000000002.11 rows=1 width=4)
          Hash Cond: (rep_tab.c = rand_tab.d)
-         ->  Seq Scan on rep_tab  (cost=10000000000.00..10000000001.02 rows=2 
width=4)
-         ->  Hash  (cost=10000000001.06..10000000001.06 rows=1 width=4)
-               ->  Unique  (cost=10000000001.04..10000000001.05 rows=1 width=4)
-                     Group Key: rand_tab.d
-                     ->  Sort  (cost=10000000001.04..10000000001.05 rows=1 
width=4)
-                           Sort Key: rand_tab.d
-                           ->  Redistribute Motion 3:3  (slice2; segments: 3)  
(cost=10000000000.00..10000000001.03 rows=1 width=4)
-                                 Hash Key: rand_tab.d
-                                 ->  Seq Scan on rand_tab  
(cost=10000000000.00..10000000001.01 rows=1 width=4)
+         ->  Redistribute Motion 1:3  (slice2; segments: 1)  
(cost=10000000000.00..10000000001.05 rows=1 width=4)
+               Hash Key: rep_tab.c
+               ->  Seq Scan on rep_tab  (cost=10000000000.00..10000000001.02 
rows=2 width=4)
+         ->  Hash  (cost=10000000001.03..10000000001.03 rows=1 width=4)
+               ->  Redistribute Motion 3:3  (slice3; segments: 3)  
(cost=10000000000.00..10000000001.03 rows=1 width=4)
+                     Hash Key: rand_tab.d
+                     ->  Seq Scan on rand_tab  
(cost=10000000000.00..10000000001.01 rows=1 width=4)
  Optimizer: Postgres query optimizer
-(13 rows)
+(11 rows)
 
 select c from rep_tab where c in (select distinct d from rand_tab);
  c 
diff --git a/src/test/regress/expected/rpt_optimizer.out 
b/src/test/regress/expected/rpt_optimizer.out
index 62a9fe489a..5585b29b56 100644
--- a/src/test/regress/expected/rpt_optimizer.out
+++ b/src/test/regress/expected/rpt_optimizer.out
@@ -1092,20 +1092,17 @@ set optimizer_enable_hashjoin=off;
 set enable_hashjoin=off;
 set enable_nestloop=on;
 explain select b from dist_tab where b in (select distinct c from rep_tab);
-                                   QUERY PLAN                                  
  
----------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..443.00 rows=4 width=4)
-   ->  Nested Loop  (cost=0.00..443.00 rows=2 width=4)
-         Join Filter: true
-         ->  GroupAggregate  (cost=0.00..431.00 rows=2 width=4)
+                                           QUERY PLAN                          
                 
+------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  
(cost=10000000001.15..10000000021.42 rows=4 width=4)
+   ->  Nested Loop  (cost=10000000001.15..10000000021.36 rows=1 width=4)
+         ->  HashAggregate  (cost=10000000001.02..10000000001.03 rows=2 
width=4)
                Group Key: rep_tab.c
-               ->  Sort  (cost=0.00..431.00 rows=2 width=4)
-                     Sort Key: rep_tab.c
-                     ->  Seq Scan on rep_tab  (cost=0.00..431.00 rows=2 
width=4)
-         ->  Index Scan using idx on dist_tab  (cost=0.00..12.00 rows=1 
width=4)
+               ->  Seq Scan on rep_tab  (cost=10000000000.00..10000000001.02 
rows=2 width=4)
+         ->  Index Only Scan using idx on dist_tab  (cost=0.13..10.16 rows=1 
width=4)
                Index Cond: (b = rep_tab.c)
- Optimizer: Pivotal Optimizer (GPORCA)
-(11 rows)
+ Optimizer: Postgres query optimizer
+(8 rows)
 
 select b from dist_tab where b in (select distinct c from rep_tab);
  b 
diff --git a/src/test/regress/expected/subselect.out 
b/src/test/regress/expected/subselect.out
index 427fdf060d..f4517c9b58 100644
--- a/src/test/regress/expected/subselect.out
+++ b/src/test/regress/expected/subselect.out
@@ -1358,16 +1358,10 @@ select * from int4_tbl o where (f1, f1) in
                            Output: i.f1, ((generate_series(1, 50)) / 10)
                            ->  ProjectSet
                                  Output: generate_series(1, 50), i.f1
-                                 ->  GroupAggregate
+                                 ->  Seq Scan on public.int4_tbl i
                                        Output: i.f1
-                                       Group Key: i.f1
-                                       ->  Sort
-                                             Output: i.f1
-                                             Sort Key: i.f1
-                                             ->  Seq Scan on public.int4_tbl i
-                                                   Output: i.f1
  Optimizer: Postgres query optimizer
-(25 rows)
+(20 rows)
 
 select * from int4_tbl o where (f1, f1) in
   (select f1, generate_series(1,50) / 10 g from int4_tbl i group by f1);
diff --git a/src/test/regress/expected/subselect_gp.out 
b/src/test/regress/expected/subselect_gp.out
index 3dc5dcdfa7..9aff68f249 100644
--- a/src/test/regress/expected/subselect_gp.out
+++ b/src/test/regress/expected/subselect_gp.out
@@ -1763,11 +1763,9 @@ EXPLAIN select count(*) from
                                  Group Key: b.hundred
                                  ->  Redistribute Motion 3:3  (slice1; 
segments: 3)  (cost=0.00..389.00 rows=3334 width=4)
                                        Hash Key: b.hundred
-                                       ->  HashAggregate  (cost=70.67..71.67 
rows=100 width=4)
-                                             Group Key: b.hundred
                                              ->  Seq Scan on tenk1 b  
(cost=0.00..62.33 rows=3333 width=4)
  Optimizer: Postgres query optimizer
-(15 rows)
+(13 rows)
 
 EXPLAIN select count(distinct ss.ten) from
   (select ten from tenk1 a
@@ -1791,9 +1789,7 @@ EXPLAIN select count(distinct ss.ten) from
                                                    Group Key: b.hundred
                                                    ->  Redistribute Motion 3:3 
 (slice1; segments: 3)  (cost=0.00..389.00 rows=3334 width=4)
                                                          Hash Key: b.hundred
-                                                         ->  HashAggregate  
(cost=70.67..71.67 rows=100 width=4)
-                                                               Group Key: 
b.hundred
-                                                               ->  Seq Scan on 
tenk1 b  (cost=0.00..62.33 rows=3333 width=4)
+                                                         ->  Seq Scan on tenk1 
b  (cost=0.00..62.33 rows=3333 width=4)
  Optimizer: Postgres query optimizer
 (19 rows)
 
@@ -3053,6 +3049,33 @@ select * from issue_12656 where (i, j) in (select 
distinct on (i) i, j from issu
  1 | 10002
 (1 row)
 
+-- case 3, check correlated DISTINCT ON
+explain select * from issue_12656 a where (i, j) in
+(select distinct on (i) i, j from issue_12656 b where a.i=b.i order by i, j 
asc);
+                                                         QUERY PLAN            
                                             
+----------------------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..895.00 rows=43050 
width=8)
+   ->  Seq Scan on issue_12656 a  (cost=0.00..321.00 rows=14350 width=8)
+         Filter: (SubPlan 1)
+         SubPlan 1
+           ->  Unique  (cost=9818.00..10033.25 rows=1000 width=8)
+                 ->  Sort  (cost=9818.00..10033.25 rows=86100 width=8)
+                       Sort Key: b.j
+                       ->  Result  (cost=0.00..2760.50 rows=86100 width=8)
+                             Filter: (a.i = b.i)
+                             ->  Materialize  (cost=0.00..1899.50 rows=86100 
width=8)
+                                   ->  Broadcast Motion 3:3  (slice2; 
segments: 3)  (cost=0.00..1469.00 rows=86100 width=8)
+                                         ->  Seq Scan on issue_12656 b  
(cost=0.00..321.00 rows=28700 width=8)
+ Optimizer: Postgres query optimizer
+(13 rows)
+
+select * from issue_12656 a where (i, j) in
+(select distinct on (i) i, j from issue_12656 b where a.i=b.i order by i, j 
asc);
+ i |   j   
+---+-------
+ 1 | 10001
+(1 row)
+
 ---
 --- Test param info is preserved when bringing a path to OuterQuery locus
 ---
diff --git a/src/test/regress/expected/subselect_gp_optimizer.out 
b/src/test/regress/expected/subselect_gp_optimizer.out
index e9a83655cd..a8bca6365f 100644
--- a/src/test/regress/expected/subselect_gp_optimizer.out
+++ b/src/test/regress/expected/subselect_gp_optimizer.out
@@ -1775,27 +1775,22 @@ EXPLAIN select count(distinct ss.ten) from
 EXPLAIN select count(*) from
   (select 1 from tenk1 a
    where unique1 IN (select distinct hundred from tenk1 b)) ss;
-                                                          QUERY PLAN           
                                                
--------------------------------------------------------------------------------------------------------------------------------
- Finalize Aggregate  (cost=0.00..431.98 rows=1 width=8)
-   ->  Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..431.98 rows=1 
width=8)
-         ->  Partial Aggregate  (cost=0.00..431.98 rows=1 width=8)
-               ->  Nested Loop  (cost=0.00..431.98 rows=34 width=1)
-                     Join Filter: true
-                     ->  Broadcast Motion 3:3  (slice2; segments: 3)  
(cost=0.00..431.94 rows=100 width=4)
-                           ->  GroupAggregate  (cost=0.00..431.94 rows=34 
width=4)
-                                 Group Key: tenk1.hundred
-                                 ->  Sort  (cost=0.00..431.94 rows=34 width=4)
-                                       Sort Key: tenk1.hundred
-                                       ->  Redistribute Motion 3:3  (slice3; 
segments: 3)  (cost=0.00..431.94 rows=34 width=4)
-                                             Hash Key: tenk1.hundred
-                                             ->  Streaming HashAggregate  
(cost=0.00..431.94 rows=34 width=4)
-                                                   Group Key: tenk1.hundred
-                                                   ->  Seq Scan on tenk1  
(cost=0.00..431.51 rows=3334 width=4)
-                     ->  Index Only Scan using tenk1_unique1 on tenk1 tenk1_1  
(cost=0.00..0.04 rows=1 width=1)
-                           Index Cond: (unique1 = tenk1.hundred)
- Optimizer: Pivotal Optimizer (GPORCA) version 2.75.0
-(17 rows)
+                                                        QUERY PLAN             
                                            
+---------------------------------------------------------------------------------------------------------------------------
+ Finalize Aggregate  (cost=210.51..210.52 rows=1 width=8)
+   ->  Gather Motion 3:1  (slice1; segments: 3)  (cost=210.45..210.50 rows=3 
width=8)
+         ->  Partial Aggregate  (cost=210.45..210.46 rows=1 width=8)
+               ->  Hash Join  (cost=138.92..210.37 rows=33 width=0)
+                     Hash Cond: (a.unique1 = b.hundred)
+                     ->  Seq Scan on tenk1 a  (cost=0.00..62.33 rows=3333 
width=4)
+                     ->  Hash  (cost=137.67..137.67 rows=100 width=4)
+                           ->  HashAggregate  (cost=137.33..137.67 rows=100 
width=4)
+                                 Group Key: b.hundred
+                                 ->  Redistribute Motion 3:3  (slice2; 
segments: 3)  (cost=0.00..129.00 rows=3333 width=4)
+                                       Hash Key: b.hundred
+                                       ->  Seq Scan on tenk1 b  
(cost=0.00..62.33 rows=3333 width=4)
+ Optimizer: Postgres query optimizer
+(13 rows)
 
 EXPLAIN select count(distinct ss.ten) from
   (select ten from tenk1 a
@@ -3146,6 +3141,33 @@ select * from issue_12656 where (i, j) in (select 
distinct on (i) i, j from issu
  1 | 10002
 (1 row)
 
+-- case 3, check correlated DISTINCT ON
+explain select * from issue_12656 a where (i, j) in
+(select distinct on (i) i, j from issue_12656 b where a.i=b.i order by i, j 
asc);
+                                                         QUERY PLAN            
                                             
+----------------------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..895.00 rows=43050 
width=8)
+   ->  Seq Scan on issue_12656 a  (cost=0.00..321.00 rows=14350 width=8)
+         Filter: (SubPlan 1)
+         SubPlan 1
+           ->  Unique  (cost=9818.00..10033.25 rows=1000 width=8)
+                 ->  Sort  (cost=9818.00..10033.25 rows=86100 width=8)
+                       Sort Key: b.j
+                       ->  Result  (cost=0.00..2760.50 rows=86100 width=8)
+                             Filter: (a.i = b.i)
+                             ->  Materialize  (cost=0.00..1899.50 rows=86100 
width=8)
+                                   ->  Broadcast Motion 3:3  (slice2; 
segments: 3)  (cost=0.00..1469.00 rows=86100 width=8)
+                                         ->  Seq Scan on issue_12656 b  
(cost=0.00..321.00 rows=28700 width=8)
+ Optimizer: Postgres query optimizer
+(13 rows)
+
+select * from issue_12656 a where (i, j) in
+(select distinct on (i) i, j from issue_12656 b where a.i=b.i order by i, j 
asc);
+ i |   j   
+---+-------
+ 1 | 10001
+(1 row)
+
 ---
 --- Test param info is preserved when bringing a path to OuterQuery locus
 ---
diff --git a/src/test/regress/expected/subselect_optimizer.out 
b/src/test/regress/expected/subselect_optimizer.out
index 3855edf1b9..c56ad50a3b 100644
--- a/src/test/regress/expected/subselect_optimizer.out
+++ b/src/test/regress/expected/subselect_optimizer.out
@@ -1410,14 +1410,8 @@ select * from int4_tbl o where (f1, f1) in
                            Output: i.f1, ((generate_series(1, 50)) / 10)
                            ->  ProjectSet
                                  Output: generate_series(1, 50), i.f1
-                                 ->  GroupAggregate
+                                 ->  Seq Scan on public.int4_tbl i
                                        Output: i.f1
-                                       Group Key: i.f1
-                                       ->  Sort
-                                             Output: i.f1
-                                             Sort Key: i.f1
-                                             ->  Seq Scan on public.int4_tbl i
-                                                   Output: i.f1
  Optimizer: Postgres query optimizer
 (19 rows)
 
diff --git a/src/test/regress/sql/subselect_gp.sql 
b/src/test/regress/sql/subselect_gp.sql
index f83ddf6456..bdc4346274 100644
--- a/src/test/regress/sql/subselect_gp.sql
+++ b/src/test/regress/sql/subselect_gp.sql
@@ -1209,6 +1209,13 @@ select * from issue_12656 where (i, j) in (select 
distinct on (i) i, j from issu
 
 select * from issue_12656 where (i, j) in (select distinct on (i) i, j from 
issue_12656 order by i, j desc);
 
+-- case 3, check correlated DISTINCT ON
+explain select * from issue_12656 a where (i, j) in
+(select distinct on (i) i, j from issue_12656 b where a.i=b.i order by i, j 
asc);
+
+select * from issue_12656 a where (i, j) in
+(select distinct on (i) i, j from issue_12656 b where a.i=b.i order by i, j 
asc);
+
 ---
 --- Test param info is preserved when bringing a path to OuterQuery locus
 ---
diff --git a/src/test/singlenode_regress/expected/subselect.out 
b/src/test/singlenode_regress/expected/subselect.out
index 54a2ba10ac..d2b4222e2d 100644
--- a/src/test/singlenode_regress/expected/subselect.out
+++ b/src/test/singlenode_regress/expected/subselect.out
@@ -1356,13 +1356,10 @@ select * from int4_tbl o where (f1, f1) in
                      Output: i.f1, ((generate_series(1, 50)) / 10)
                      ->  ProjectSet
                            Output: generate_series(1, 50), i.f1
-                           ->  HashAggregate
+                           ->  Seq Scan on public.int4_tbl i
                                  Output: i.f1
-                                 Group Key: i.f1
-                                 ->  Seq Scan on public.int4_tbl i
-                                       Output: i.f1
  Optimizer: Postgres query optimizer
-(20 rows)
+(17 rows)
 
 select * from int4_tbl o where (f1, f1) in
   (select f1, generate_series(1,50) / 10 g from int4_tbl i group by f1);
diff --git a/src/test/singlenode_regress/expected/subselect_gp.out 
b/src/test/singlenode_regress/expected/subselect_gp.out
index 83cbeb2f5e..7538ea4b4d 100644
--- a/src/test/singlenode_regress/expected/subselect_gp.out
+++ b/src/test/singlenode_regress/expected/subselect_gp.out
@@ -1512,12 +1512,12 @@ EXPLAIN select count(*) from
    where unique1 IN (select distinct hundred from tenk1 b)) ss;
                                                    QUERY PLAN                  
                                 
 
----------------------------------------------------------------------------------------------------------------
- Aggregate  (cost=410.03..410.04 rows=1 width=8)
-   ->  Hash Join  (cost=197.53..409.78 rows=100 width=0)
+ Aggregate  (cost=410.15..410.16 rows=1 width=8)
+   ->  Hash Join  (cost=197.53..409.90 rows=100 width=0)
          Hash Cond: (a.unique1 = b.hundred)
          ->  Seq Scan on tenk1 a  (cost=0.00..185.00 rows=10000 width=4)
          ->  Hash  (cost=196.28..196.28 rows=100 width=4)
-               ->  Unique  (cost=0.29..195.28 rows=100 width=4)
+               ->  HashAggregate  (cost=195.28..196.28 rows=100 width=4)
                      Group Key: b.hundred
                      ->  Index Only Scan using tenk1_hundred on tenk1 b  
(cost=0.29..170.28 rows=10000 width=4)
  Optimizer: Postgres query optimizer
@@ -1528,12 +1528,12 @@ EXPLAIN select count(distinct ss.ten) from
    where unique1 IN (select distinct hundred from tenk1 b)) ss;
                                                    QUERY PLAN                  
                                 
 
----------------------------------------------------------------------------------------------------------------
- Aggregate  (cost=410.03..410.04 rows=1 width=8)
-   ->  Hash Join  (cost=197.53..409.78 rows=100 width=4)
+ Aggregate  (cost=410.15..410.16 rows=1 width=8)
+   ->  Hash Join  (cost=197.53..409.90 rows=100 width=4)
          Hash Cond: (a.unique1 = b.hundred)
          ->  Seq Scan on tenk1 a  (cost=0.00..185.00 rows=10000 width=8)
          ->  Hash  (cost=196.28..196.28 rows=100 width=4)
-               ->  Unique  (cost=0.29..195.28 rows=100 width=4)
+               ->  HashAggregate  (cost=195.28..196.28 rows=100 width=4)
                      Group Key: b.hundred
                      ->  Index Only Scan using tenk1_hundred on tenk1 b  
(cost=0.29..170.28 rows=10000 width=4)
  Optimizer: Postgres query optimizer
@@ -2187,7 +2187,7 @@ select * from foo left outer join baz on (select bar.i 
from bar where bar.i = fo
                        Output: bar.i
                        Filter: (bar.i = foo.i)
  Optimizer: Postgres query optimizer
-(15 rows)
+(14 rows)
 
 select * from foo left outer join baz on (select bar.i from bar where bar.i = 
foo.i) + 1  = baz.j;
  i  | j  | i | j 
@@ -2230,7 +2230,7 @@ select * from foo where
          ->  Seq Scan on subselect_gp.baz b
                Output: b.i
  Optimizer: Postgres query optimizer
-(14 rows)
+(13 rows)
 
 select * from foo where
   (case when foo.i in (select a.i from baz a) then foo.i else null end) in


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(cloudberry) 19/41: Bring back cdbsubselect_drop_distinct. (#14322)

Reply via email to