This is an automated email from the ASF dual-hosted git repository. yjhjstz pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/cloudberry.git
commit af17f94c8d20ea5a82a0dca7ed3755eaa8b6e10c Author: xuejing zhao <[email protected]> AuthorDate: Thu Nov 30 15:14:07 2023 +0800 fix results wrong while using union for recursive_cte (#16782) In a recursive CTE, We can use union or union all to connect non-recursive part and recursive part. If we use union, we should deduplicate the tuples. However, from the following plan, Recursive union is executed on 3 segments, this only deduplicates the tuples on its segment. there are duplicated tuples between different segments. ``` postgres=# explain with recursive x(a) as ( select a from tmp union select a+1 from x where a<10) select * from x; QUERY PLAN -------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..126409.67 rows=2956100 width=4) -> Recursive Union (cost=0.00..86995.00 rows=985367 width=4) -> Seq Scan on tmp (cost=0.00..321.00 rows=28700 width=4) -> WorkTable Scan on x (cost=0.00..6696.67 rows=95667 width=4) Filter: (a < 10) Optimizer: Postgres-based planner (6 rows) Add unique path above Recursive Union if we use union for recursive cte. postgres=# explain with recursive x(a) as ( select a from tmp union select a+1 from x where a<10) select * from x; QUERY PLAN -------------------------------------------------------------------------------------- ------------------ Gather Motion 3:1 (slice1; segments: 3) (cost=113014.84..163245.44 rows=2956100 width=4) -> HashAggregate (cost=113014.84..123830.78 rows=985367 width=4) Group Key: tmp.a Planned Partitions: 4 -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..106702.33 rows=985367 width=4) Hash Key: tmp.a -> Recursive Union (cost=0.00..86995.00 rows=985367 width=4) -> Seq Scan on tmp (cost=0.00..321.00 rows=28700 width=4) -> WorkTable Scan on x (cost=0.00..6696.67 rows=95667 width=4) Filter: (a < 10) Optimizer: Postgres-based planner (11 rows) ``` pr: #16772 --- src/backend/optimizer/prep/prepunion.c | 15 ++++++++ src/test/regress/expected/gp_recursive_cte.out | 49 ++++++++++++++++++++++++++ src/test/regress/sql/gp_recursive_cte.sql | 21 +++++++++++ 3 files changed, 85 insertions(+) diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index fcd459f782..74725edc0b 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -562,6 +562,21 @@ generate_recursion_path(SetOperationStmt *setOp, PlannerInfo *root, dNumGroups); path->locus = rpath->locus; + /* + * GPDB: + * https://github.com/greenplum-db/gpdb/issues/16772 + * If we use union rather than union all we should deduplicate the tuples. + * When the locus of recursive union path is Partitioned, + * It recursive union node only deduplicates the tuples on its segment. + * There are duplicated tuples between different segments. + * So we redistribute tuples and add a unique path above recursive union path. + */ + if (!setOp->all && CdbPathLocus_IsPartitioned(path->locus)) + { + path = make_motion_hash_all_targets(root, path, tlist); + path = make_union_unique(setOp, path, tlist, root); + } + add_path(result_rel, path, root); postprocess_setop_rel(root, result_rel); return result_rel; diff --git a/src/test/regress/expected/gp_recursive_cte.out b/src/test/regress/expected/gp_recursive_cte.out index 389f888e6b..39a3f5d178 100644 --- a/src/test/regress/expected/gp_recursive_cte.out +++ b/src/test/regress/expected/gp_recursive_cte.out @@ -878,3 +878,52 @@ select count(*) from rcte; RESET enable_nestloop; RESET enable_hashjoin; RESET enable_mergejoin; +-- using union rather than union all for recursive union +CREATE TABLE tmp(a int, b int); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +INSERT INTO tmp SELECT generate_series(1,5); +INSERT INTO tmp SELECT * FROM tmp; +EXPLAIN (costs off) +WITH RECURSIVE x(a) as +( + select a from tmp + union + select a+1 from x where a<10 +) +select * from x ; + QUERY PLAN +------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) + -> HashAggregate + Group Key: tmp.a + -> Redistribute Motion 3:3 (slice2; segments: 3) + Hash Key: tmp.a + -> Recursive Union + -> Seq Scan on tmp + -> WorkTable Scan on x + Filter: (a < 10) + Optimizer: Postgres query optimizer +(10 rows) + +WITH RECURSIVE x(a) as +( + select a from tmp + union + select a+1 from x where a<10 +) +select * from x ; + a +---- + 8 + 2 + 4 + 3 + 7 + 1 + 6 + 5 + 9 + 10 +(10 rows) + diff --git a/src/test/regress/sql/gp_recursive_cte.sql b/src/test/regress/sql/gp_recursive_cte.sql index 549a29fc86..28b8ac0028 100644 --- a/src/test/regress/sql/gp_recursive_cte.sql +++ b/src/test/regress/sql/gp_recursive_cte.sql @@ -563,3 +563,24 @@ select count(*) from rcte; RESET enable_nestloop; RESET enable_hashjoin; RESET enable_mergejoin; + +-- using union rather than union all for recursive union +CREATE TABLE tmp(a int, b int); +INSERT INTO tmp SELECT generate_series(1,5); +INSERT INTO tmp SELECT * FROM tmp; +EXPLAIN (costs off) +WITH RECURSIVE x(a) as +( + select a from tmp + union + select a+1 from x where a<10 +) +select * from x ; + +WITH RECURSIVE x(a) as +( + select a from tmp + union + select a+1 from x where a<10 +) +select * from x ; --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
