(cloudberry) 02/39: fix results wrong while using union for recursive_cte (#16782)

yjhjstz Mon, 27 Jan 2025 04:32:53 -0800

This is an automated email from the ASF dual-hosted git repository.

yjhjstz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git


commit af17f94c8d20ea5a82a0dca7ed3755eaa8b6e10c
Author: xuejing zhao <[email protected]>
AuthorDate: Thu Nov 30 15:14:07 2023 +0800

    fix results wrong while using union for recursive_cte (#16782)
    
    In a recursive CTE, We can use union or union all to connect non-recursive 
part and recursive part.
    If we use union, we should deduplicate the tuples.
    However, from the following plan, Recursive union is executed on 3 segments,
    this only deduplicates the tuples on its segment. there are duplicated 
tuples between different segments.
    ```
    postgres=# explain with recursive x(a) as ( select a from tmp union select 
a+1 from x where a<10) select * from x;
                                          QUERY PLAN
    
    
--------------------------------------------------------------------------------------
    -
     Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..126409.67 
rows=2956100 width=4)
       ->  Recursive Union  (cost=0.00..86995.00 rows=985367 width=4)
             ->  Seq Scan on tmp  (cost=0.00..321.00 rows=28700 width=4)
             ->  WorkTable Scan on x  (cost=0.00..6696.67 rows=95667 width=4)
                   Filter: (a < 10)
     Optimizer: Postgres-based planner
    (6 rows)
    
    Add unique path above Recursive Union if we use union for recursive cte.
    
    postgres=#  explain with recursive x(a) as ( select a from tmp union select 
a+1 from x where a<10) select * from x;
                                                   QUERY PLAN
    
    
--------------------------------------------------------------------------------------
    ------------------
     Gather Motion 3:1  (slice1; segments: 3)  (cost=113014.84..163245.44 
rows=2956100 width=4)
       ->  HashAggregate  (cost=113014.84..123830.78 rows=985367 width=4)
             Group Key: tmp.a
             Planned Partitions: 4
             ->  Redistribute Motion 3:3  (slice2; segments: 3)  
(cost=0.00..106702.33 rows=985367 width=4)
                   Hash Key: tmp.a
                   ->  Recursive Union  (cost=0.00..86995.00 rows=985367 
width=4)
                         ->  Seq Scan on tmp  (cost=0.00..321.00 rows=28700 
width=4)
                         ->  WorkTable Scan on x  (cost=0.00..6696.67 
rows=95667 width=4)
                               Filter: (a < 10)
     Optimizer: Postgres-based planner
    (11 rows)
    ```
    pr: #16772
---
 src/backend/optimizer/prep/prepunion.c         | 15 ++++++++
 src/test/regress/expected/gp_recursive_cte.out | 49 ++++++++++++++++++++++++++
 src/test/regress/sql/gp_recursive_cte.sql      | 21 +++++++++++
 3 files changed, 85 insertions(+)

diff --git a/src/backend/optimizer/prep/prepunion.c 
b/src/backend/optimizer/prep/prepunion.c
index fcd459f782..74725edc0b 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -562,6 +562,21 @@ generate_recursion_path(SetOperationStmt *setOp, 
PlannerInfo *root,
                                                                                
           dNumGroups);
        path->locus = rpath->locus;
 
+       /*
+        * GPDB:
+        * https://github.com/greenplum-db/gpdb/issues/16772
+        * If we use union rather than union all we should deduplicate the 
tuples.
+        * When the locus of recursive union path is Partitioned,
+        * It recursive union node only deduplicates the tuples on its segment.
+        * There are duplicated tuples between different segments.
+        * So we redistribute tuples and add a unique path above recursive 
union path.
+        */
+       if (!setOp->all && CdbPathLocus_IsPartitioned(path->locus))
+       {
+               path = make_motion_hash_all_targets(root, path, tlist);
+               path = make_union_unique(setOp, path, tlist, root);
+       }
+
        add_path(result_rel, path, root);
        postprocess_setop_rel(root, result_rel);
        return result_rel;
diff --git a/src/test/regress/expected/gp_recursive_cte.out 
b/src/test/regress/expected/gp_recursive_cte.out
index 389f888e6b..39a3f5d178 100644
--- a/src/test/regress/expected/gp_recursive_cte.out
+++ b/src/test/regress/expected/gp_recursive_cte.out
@@ -878,3 +878,52 @@ select count(*) from rcte;
 RESET enable_nestloop;
 RESET enable_hashjoin;
 RESET enable_mergejoin;
+-- using union rather than union all for recursive union
+CREATE TABLE tmp(a int, b int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+INSERT INTO tmp SELECT generate_series(1,5);
+INSERT INTO tmp SELECT * FROM tmp;
+EXPLAIN (costs off)
+WITH RECURSIVE x(a) as
+(
+    select a from tmp
+    union
+    select a+1 from x where a<10
+)
+select * from x ;
+                         QUERY PLAN                         
+------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   ->  HashAggregate
+         Group Key: tmp.a
+         ->  Redistribute Motion 3:3  (slice2; segments: 3)
+               Hash Key: tmp.a
+               ->  Recursive Union
+                     ->  Seq Scan on tmp
+                     ->  WorkTable Scan on x
+                           Filter: (a < 10)
+ Optimizer: Postgres query optimizer
+(10 rows)
+
+WITH RECURSIVE x(a) as
+(
+    select a from tmp
+    union
+    select a+1 from x where a<10
+)
+select * from x ;
+ a  
+----
+  8
+  2
+  4
+  3
+  7
+  1
+  6
+  5
+  9
+ 10
+(10 rows)
+
diff --git a/src/test/regress/sql/gp_recursive_cte.sql 
b/src/test/regress/sql/gp_recursive_cte.sql
index 549a29fc86..28b8ac0028 100644
--- a/src/test/regress/sql/gp_recursive_cte.sql
+++ b/src/test/regress/sql/gp_recursive_cte.sql
@@ -563,3 +563,24 @@ select count(*) from rcte;
 RESET enable_nestloop;
 RESET enable_hashjoin;
 RESET enable_mergejoin;
+
+-- using union rather than union all for recursive union
+CREATE TABLE tmp(a int, b int);
+INSERT INTO tmp SELECT generate_series(1,5);
+INSERT INTO tmp SELECT * FROM tmp;
+EXPLAIN (costs off)
+WITH RECURSIVE x(a) as
+(
+    select a from tmp
+    union
+    select a+1 from x where a<10
+)
+select * from x ;
+
+WITH RECURSIVE x(a) as
+(
+    select a from tmp
+    union
+    select a+1 from x where a<10
+)
+select * from x ;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(cloudberry) 02/39: fix results wrong while using union for recursive_cte (#16782)

Reply via email to