This is an automated email from the ASF dual-hosted git repository.

yjhjstz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git


The following commit(s) were added to refs/heads/main by this push:
     new f020573d35 Fix wrong join_rel size estimates for anti join. (#934)
f020573d35 is described below

commit f020573d3598b440526f2fdbeac46b974ba8bd17
Author: Tender Wang <[email protected]>
AuthorDate: Wed Mar 19 03:18:08 2025 +0800

    Fix wrong join_rel size estimates for anti join. (#934)
    
    * Fix wrong join_rel size estimates for anti join.
    
    In Postgres planner, we set join_rel selectivity in 
calc_joinrel_size_estimate()
    for anti join type. Maybe it's codes historical reasons, we have another 
place
    to set anti join selectivity in clauselist_selectivity_ext().
    
    This can lead to wrong join_rel size estimates for anti join type. We can 
remove
    the selectivity computation in clauselist_selectivity_ext(). So we can make 
codes
    be consistent with upstream.
    
    * Fix notin plan diff.
    
    * Fix join_gp_optimizer.out plan diff.
---
 src/backend/optimizer/path/clausesel.c          |  9 +-------
 src/test/regress/expected/join_gp.out           | 28 +++++++++++++++++++++++++
 src/test/regress/expected/join_gp_optimizer.out | 25 ++++++++++++++++++++++
 src/test/regress/expected/notin.out             | 26 +++++++++++------------
 src/test/regress/sql/join_gp.sql                | 12 +++++++++++
 5 files changed, 79 insertions(+), 21 deletions(-)

diff --git a/src/backend/optimizer/path/clausesel.c 
b/src/backend/optimizer/path/clausesel.c
index 34b37d6b51..f4a6aad7ed 100644
--- a/src/backend/optimizer/path/clausesel.c
+++ b/src/backend/optimizer/path/clausesel.c
@@ -406,14 +406,7 @@ clauselist_selectivity_ext(PlannerInfo *root,
        }
 
        pfree(rgsel);
-       /* 
-        * For Anti Semi Join, selectivity is determined by the fraction of 
-        * tuples that do no match 
-        */
-       if (JOIN_ANTI == jointype || JOIN_LASJ_NOTIN == jointype)
-       {
-               s1 = (1 - s1);
-       }
+
        return s1;
 }
 
diff --git a/src/test/regress/expected/join_gp.out 
b/src/test/regress/expected/join_gp.out
index 1dbbd4eaa4..6e2a3af395 100644
--- a/src/test/regress/expected/join_gp.out
+++ b/src/test/regress/expected/join_gp.out
@@ -47,6 +47,34 @@ select * from l l1 join l l2 on l1.a = l2.a left join l l3 
on l1.a = l3.a and l1
  2 | 2 | 2
 (5 rows)
 
+--
+-- test anti_join/left_anti_semi_join selectivities
+--
+create table aj_t1(a int, b int, c int) distributed by (a);
+create table aj_t2(a int, b int, c int) distributed by (a);
+insert into aj_t1 values(1,1,1);
+insert into aj_t2 values(1,1,1),(2,2,2);
+explain(costs off) select t1.a from aj_t1 t1 where not exists (select 1 from 
aj_t2 t2 where t1.b = t2.b and t1.c = t2.c);
+                            QUERY PLAN                            
+------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   ->  Hash Anti Join
+         Hash Cond: ((t1.b = t2.b) AND (t1.c = t2.c))
+         ->  Redistribute Motion 3:3  (slice2; segments: 3)
+               Hash Key: t1.b, t1.c
+               ->  Seq Scan on aj_t1 t1
+         ->  Hash
+               ->  Redistribute Motion 3:3  (slice3; segments: 3)
+                     Hash Key: t2.b, t2.c
+                     ->  Seq Scan on aj_t2 t2
+ Optimizer: Postgres query optimizer
+(11 rows)
+
+select t1.a from aj_t1 t1 where not exists (select 1 from aj_t2 t2 where t1.b 
= t2.b and t1.c = t2.c);
+ a 
+---
+(0 rows)
+
 --
 -- test hash join
 --
diff --git a/src/test/regress/expected/join_gp_optimizer.out 
b/src/test/regress/expected/join_gp_optimizer.out
index c37b860b7b..4138130b0a 100644
--- a/src/test/regress/expected/join_gp_optimizer.out
+++ b/src/test/regress/expected/join_gp_optimizer.out
@@ -47,6 +47,31 @@ select * from l l1 join l l2 on l1.a = l2.a left join l l3 
on l1.a = l3.a and l1
  2 | 2 | 2
 (5 rows)
 
+--
+-- test anti_join/left_anti_semi_join selectivities
+--
+create table aj_t1(a int, b int, c int) distributed by (a);
+create table aj_t2(a int, b int, c int) distributed by (a);
+insert into aj_t1 values(1,1,1);
+insert into aj_t2 values(1,1,1),(2,2,2);
+explain(costs off) select t1.a from aj_t1 t1 where not exists (select 1 from 
aj_t2 t2 where t1.b = t2.b and t1.c = t2.c);
+                          QUERY PLAN                           
+---------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   ->  Hash Anti Join
+         Hash Cond: ((t1.b = t2.b) AND (t1.c = t2.c))
+         ->  Seq Scan on aj_t1 t1
+         ->  Hash
+               ->  Broadcast Motion 3:3  (slice2; segments: 3)
+                     ->  Seq Scan on aj_t2 t2
+ Optimizer: GPORCA
+(8 rows)
+
+select t1.a from aj_t1 t1 where not exists (select 1 from aj_t2 t2 where t1.b 
= t2.b and t1.c = t2.c);
+ a 
+---
+(0 rows)
+
 --
 -- test hash join
 --
diff --git a/src/test/regress/expected/notin.out 
b/src/test/regress/expected/notin.out
index f0b404e735..5b1688fe00 100644
--- a/src/test/regress/expected/notin.out
+++ b/src/test/regress/expected/notin.out
@@ -317,19 +317,19 @@ select c1 from t1 where c1 > 6 and c1 not in
 --
 explain select c1 from t1,t2 where c1 not in 
        (select c3 from t3) and c1 = c2;
-                                              QUERY PLAN                       
                        
--------------------------------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)  (cost=5.38..8.62 rows=4 width=4)
-   ->  Hash Join  (cost=5.38..8.62 rows=2 width=4)
-         Hash Cond: (t1.c1 = t2.c2)
-         ->  Hash Left Anti Semi (Not-In) Join  (cost=2.26..5.46 rows=2 
width=4)
-               Hash Cond: (t1.c1 = t3.c3)
-               ->  Seq Scan on t1  (cost=0.00..3.10 rows=4 width=4)
-               ->  Hash  (cost=2.15..2.15 rows=3 width=4)
-                     ->  Broadcast Motion 3:3  (slice2; segments: 3)  
(cost=0.00..2.15 rows=3 width=4)
-                           ->  Seq Scan on t3  (cost=0.00..2.03 rows=1 width=4)
-         ->  Hash  (cost=3.05..3.05 rows=2 width=4)
-               ->  Seq Scan on t2  (cost=0.00..3.05 rows=2 width=4)
+                                           QUERY PLAN                          
                  
+-------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=2.12..3.25 rows=4 width=4)
+   ->  Hash Left Anti Semi (Not-In) Join  (cost=2.12..3.20 rows=1 width=4)
+         Hash Cond: (t1.c1 = t3.c3)
+         ->  Hash Join  (cost=1.04..2.10 rows=2 width=4)
+               Hash Cond: (t1.c1 = t2.c2)
+               ->  Seq Scan on t1  (cost=0.00..1.03 rows=3 width=4)
+               ->  Hash  (cost=1.02..1.02 rows=2 width=4)
+                     ->  Seq Scan on t2  (cost=0.00..1.02 rows=2 width=4)
+         ->  Hash  (cost=1.05..1.05 rows=3 width=4)
+               ->  Broadcast Motion 3:3  (slice2; segments: 3)  
(cost=0.00..1.05 rows=3 width=4)
+                     ->  Seq Scan on t3  (cost=0.00..1.01 rows=1 width=4)
  Optimizer: Postgres query optimizer
 (12 rows)
 
diff --git a/src/test/regress/sql/join_gp.sql b/src/test/regress/sql/join_gp.sql
index 3061391c99..fae1fdc7c5 100644
--- a/src/test/regress/sql/join_gp.sql
+++ b/src/test/regress/sql/join_gp.sql
@@ -24,6 +24,18 @@ create temp table l(a int);
 insert into l values (1), (1), (2);
 select * from l l1 join l l2 on l1.a = l2.a left join l l3 on l1.a = l3.a and 
l1.a = 2 order by 1,2,3;
 
+--
+-- test anti_join/left_anti_semi_join selectivities
+--
+create table aj_t1(a int, b int, c int) distributed by (a);
+create table aj_t2(a int, b int, c int) distributed by (a);
+insert into aj_t1 values(1,1,1);
+insert into aj_t2 values(1,1,1),(2,2,2);
+
+explain(costs off) select t1.a from aj_t1 t1 where not exists (select 1 from 
aj_t2 t2 where t1.b = t2.b and t1.c = t2.c);
+
+select t1.a from aj_t1 t1 where not exists (select 1 from aj_t2 t2 where t1.b 
= t2.b and t1.c = t2.c);
+
 --
 -- test hash join
 --


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to