This is an automated email from the ASF dual-hosted git repository.

avamingli pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git


The following commit(s) were added to refs/heads/main by this push:
     new 5cbac92d378 Fix duplicate distribution keys from subqueries.
5cbac92d378 is described below

commit 5cbac92d37881aabdd244edfb5c1ee9fbec60756
Author: Zhang Mingli <[email protected]>
AuthorDate: Mon Oct 20 10:52:06 2025 +0800

    Fix duplicate distribution keys from subqueries.
    
    The parser normally prevents duplicate distribution keys in main query
    grammar, but they can still originate from subqueries - particularly in
    window function PARTITION BY clauses with equivalent expressions.
    
    When duplicate distribution keys are detected, we now properly fall back
    to alternative processing. The caller already handles this fallback
    scenario correctly; this fix ensures the fallback is triggered when
    needed.
---
 .../test/regress/expected/create_table_distpol.out    |  2 --
 src/backend/cdb/cdbllize.c                            | 19 +++++++++++++++----
 src/test/regress/expected/create_table_distpol.out    |  2 --
 src/test/regress/expected/gp_create_table.out         |  5 +++++
 src/test/regress/sql/gp_create_table.sql              |  4 ++++
 5 files changed, 24 insertions(+), 8 deletions(-)

diff --git 
a/contrib/pax_storage/src/test/regress/expected/create_table_distpol.out 
b/contrib/pax_storage/src/test/regress/expected/create_table_distpol.out
index 545d14625f4..46479ca2126 100644
--- a/contrib/pax_storage/src/test/regress/expected/create_table_distpol.out
+++ b/contrib/pax_storage/src/test/regress/expected/create_table_distpol.out
@@ -128,9 +128,7 @@ select distkey from gp_distribution_policy where localoid = 
'distpol_person_copy
 RESET gp_create_table_random_default_distribution;
 -- Test duplicate distribute keys
 CREATE TABLE ctas_dup_dk as SELECT distinct age as c1, age as c2 from 
distpol_person; 
-ERROR:  duplicate DISTRIBUTED BY column 'c1'
 SELECT distinct age c1, age c2 into ctas_dup_dk_1 from distpol_person;
-ERROR:  duplicate DISTRIBUTED BY column 'c1'
 --
 -- Test deriving distribution key from the query's distribution in
 -- CREATE TABLE AS
diff --git a/src/backend/cdb/cdbllize.c b/src/backend/cdb/cdbllize.c
index a4037c36158..da702806be5 100644
--- a/src/backend/cdb/cdbllize.c
+++ b/src/backend/cdb/cdbllize.c
@@ -161,6 +161,7 @@ get_partitioned_policy_from_path(PlannerInfo *root, Path 
*path)
        ListCell   *dk_cell;
        ListCell   *ec_cell;
        ListCell   *em_cell;
+       bool duplicate_keys_detected = false;
 
        /*
         * Is it a Hashed distribution?
@@ -184,6 +185,9 @@ get_partitioned_policy_from_path(PlannerInfo *root, Path 
*path)
                DistributionKey *dk = lfirst(dk_cell);
                bool            found_expr = false;
 
+               if (duplicate_keys_detected)
+                       return NULL;
+
                foreach(ec_cell, dk->dk_eclasses)
                {
                        EquivalenceClass *ec = lfirst(ec_cell);
@@ -230,10 +234,17 @@ get_partitioned_policy_from_path(PlannerInfo *root, Path 
*path)
                                                Assert(list_length(policykeys) 
< MaxPolicyAttributeNumber);
 
                                                if (list_member_int(policykeys, 
attno))
-                                                       ereport(ERROR,
-                                                                       
(errcode(ERRCODE_DUPLICATE_COLUMN),
-                                                                        
errmsg("duplicate DISTRIBUTED BY column '%s'",
-                                                                               
        target->resname ? target->resname : "???")));
+                                               {
+                                                       /*
+                                                        * Although the parser 
prevents duplicate distribution keys in regular grammar,
+                                                        * they can still occur 
in subqueries(e.g., window function PARTITION BY columns
+                                                        * with equivalent 
expressions).
+                                                        * We fall back to an 
alternative approach when duplicate distribution keys are detected;
+                                                        * the caller handles 
this scenario appropriately.
+                                                        */
+                                                       duplicate_keys_detected 
= true;
+                                                       break;
+                                               }
 
                                                /*
                                                 * We know the btree operator 
family corresponding to
diff --git a/src/test/regress/expected/create_table_distpol.out 
b/src/test/regress/expected/create_table_distpol.out
index 545d14625f4..46479ca2126 100644
--- a/src/test/regress/expected/create_table_distpol.out
+++ b/src/test/regress/expected/create_table_distpol.out
@@ -128,9 +128,7 @@ select distkey from gp_distribution_policy where localoid = 
'distpol_person_copy
 RESET gp_create_table_random_default_distribution;
 -- Test duplicate distribute keys
 CREATE TABLE ctas_dup_dk as SELECT distinct age as c1, age as c2 from 
distpol_person; 
-ERROR:  duplicate DISTRIBUTED BY column 'c1'
 SELECT distinct age c1, age c2 into ctas_dup_dk_1 from distpol_person;
-ERROR:  duplicate DISTRIBUTED BY column 'c1'
 --
 -- Test deriving distribution key from the query's distribution in
 -- CREATE TABLE AS
diff --git a/src/test/regress/expected/gp_create_table.out 
b/src/test/regress/expected/gp_create_table.out
index 88b126e4e65..8a26a9a7e33 100644
--- a/src/test/regress/expected/gp_create_table.out
+++ b/src/test/regress/expected/gp_create_table.out
@@ -202,3 +202,8 @@ c1 int,c2 int,c3 int,c4 int,c5 int,c6 int,c7 int,c8 int,c9 
int,c10 int,c11 int,c
 
c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43,c44,c45,c46,c47,c48,c49,c50,c51,c52,c53,c54,c55,c56,c57,c58,c59,c60,c61,c62,c63,c64,c65,c66,c67,c68,c69,c70,c71,c72,c73,c74,c75,c76,c77,c78,c79,c80,c81,c82,c83,c84,c85,c86,c87,c88,c89,c90,c91,c92,c93,c94,c95,c96,c97,c98,c99,c100,c101,c102,c103,c104,c105,c106,c107,c108,c109,c110,c111,c112,c113,c114,c115,c116,c117,c118,c119,c120,
 [...]
 );
 ERROR:  tables can have at most 1600 columns
+create table dup_key_t(a int, b int, c int) distributed randomly;
+create temp table ctas_dump_key_t as select rn, a, b from
+ (select a, b, row_number() over (partition by a, b order by c) rn from 
dup_key_t where dup_key_t.a = dup_key_t.b) x;
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 
'rn' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
diff --git a/src/test/regress/sql/gp_create_table.sql 
b/src/test/regress/sql/gp_create_table.sql
index 503ccfa43e1..5b39b216d34 100644
--- a/src/test/regress/sql/gp_create_table.sql
+++ b/src/test/regress/sql/gp_create_table.sql
@@ -129,3 +129,7 @@ c1 int,c2 int,c3 int,c4 int,c5 int,c6 int,c7 int,c8 int,c9 
int,c10 int,c11 int,c
 (
 
c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43,c44,c45,c46,c47,c48,c49,c50,c51,c52,c53,c54,c55,c56,c57,c58,c59,c60,c61,c62,c63,c64,c65,c66,c67,c68,c69,c70,c71,c72,c73,c74,c75,c76,c77,c78,c79,c80,c81,c82,c83,c84,c85,c86,c87,c88,c89,c90,c91,c92,c93,c94,c95,c96,c97,c98,c99,c100,c101,c102,c103,c104,c105,c106,c107,c108,c109,c110,c111,c112,c113,c114,c115,c116,c117,c118,c119,c120,
 [...]
 );
+
+create table dup_key_t(a int, b int, c int) distributed randomly;
+create temp table ctas_dump_key_t as select rn, a, b from
+ (select a, b, row_number() over (partition by a, b order by c) rn from 
dup_key_t where dup_key_t.a = dup_key_t.b) x;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to