(cloudberry) 12/20: Resolve wrong result when direct dispatch, If opno of clause does not belong to opfamily of distributed key(#14977)

yjhjstz Mon, 06 Jan 2025 00:39:38 -0800

This is an automated email from the ASF dual-hosted git repository.

yjhjstz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git


commit 6262846cac441e9f8a1f7888d240ebc4e93313dc
Author: xuejing zhao <[email protected]>
AuthorDate: Mon Feb 27 10:58:43 2023 +0800

    Resolve wrong result when direct dispatch, If opno of clause does not 
belong to opfamily of distributed key(#14977)
    
    If opno of clause does not belong to opfamily of distributed key, and use 
direct dispatch we will get wrong results:
    create table bar(a varchar);
    insert into bar values('a   ');
    explain select * from bar where a = 'a'::bpchar;
    ```
                                      QUERY PLAN
    
------------------------------------------------------------------------------
     Gather Motion 1:1  (slice1; segments: 1)  (cost=0.00..431.00 rows=1 
width=8)
       ->  Seq Scan on bar  (cost=0.00..431.00 rows=1 width=8)
             Filter: ((a)::bpchar = 'a'::bpchar)
     Optimizer: Pivotal Optimizer (GPORCA)
    (4 rows)
    
    select * from bar where a = 'a'::bpchar;
     a
    ---
    (0 rows)
    ```
    
    if opno of the clause does not belong to opfamily of distributed key,
    do not use direct dispatch to avoid wrong results.
    
    There are some cases that we could use direct dispatch, but we don't after 
this commit.
    create table t(a citext);
    explain select * from t where a = 'a'::text;
    texteq does not belong to the opfamily of the table's distributed key 
citext type.
    But from the implementation can deduce: texteq ==> citext_eq, and we can do 
the direct dispatch.
    But we do not have the kind of implication rule in Postgres: texteq ==> 
citext_eq.
    
    more details please refer to issue: 
https://github.com/greenplum-db/gpdb/issues/14887
    
    fix direct_dispatch
---
 src/backend/cdb/cdbtargeteddispatch.c              | 10 ++-
 src/backend/optimizer/util/predtest.c              | 16 ++---
 src/backend/optimizer/util/predtest_valueset.c     | 18 +++--
 src/include/optimizer/predtest_valueset.h          |  4 +-
 src/test/isolation2/expected/setup.out             |  2 +-
 src/test/regress/expected/alter_table_gp.out       |  8 +--
 src/test/regress/expected/btree_index.out          | 35 ++++++++--
 src/test/regress/expected/direct_dispatch.out      | 76 ++++++++++++++++++++++
 .../regress/expected/direct_dispatch_optimizer.out | 76 ++++++++++++++++++++++
 src/test/regress/sql/btree_index.sql               |  6 +-
 src/test/regress/sql/direct_dispatch.sql           | 31 +++++++++
 11 files changed, 252 insertions(+), 30 deletions(-)

diff --git a/src/backend/cdb/cdbtargeteddispatch.c 
b/src/backend/cdb/cdbtargeteddispatch.c
index 845211ac45..22ab3ded4a 100644
--- a/src/backend/cdb/cdbtargeteddispatch.c
+++ b/src/backend/cdb/cdbtargeteddispatch.c
@@ -39,6 +39,7 @@
 #include "cdb/cdbutil.h"
 
 #include "executor/executor.h"
+#include "commands/defrem.h"
 
 #define PRINT_DISPATCH_DECISIONS_STRING ("print_dispatch_decisions")
 
@@ -171,8 +172,10 @@ GetContentIdsFromPlanForSingleRelation(PlannerInfo *root, 
Plan *plan, int rangeT
                seg_id_var = makeVar(rangeTableIndex,
                                                         
GpSegmentIdAttributeNumber,
                                                         vartypeid, type_mod, 
type_coll, 0);
+               Oid opclass = GetDefaultOpClass(vartypeid, HASH_AM_OID);
+               Oid opfamily = get_opclass_family(opclass);
                pvs_segids = DeterminePossibleValueSet((Node *) qualification,
-                                                                               
           (Node *) seg_id_var);
+                                                                               
           (Node *) seg_id_var, opfamily);
                if (!pvs_segids.isAnyValuePossible)
                {
                        seg_ids = GetPossibleValuesAsArray(&pvs_segids, &len);
@@ -218,6 +221,8 @@ GetContentIdsFromPlanForSingleRelation(PlannerInfo *root, 
Plan *plan, int rangeT
                {
                        Var                *var;
                        PossibleValueSet pvs;
+                       Oid policy_opclass = policy->opclasses[i];
+                       Oid policy_opfamily = 
get_opclass_family(policy_opclass);
 
                        var = makeVar(rangeTableIndex,
                                                  policy->attrs[i],
@@ -225,13 +230,12 @@ GetContentIdsFromPlanForSingleRelation(PlannerInfo *root, 
Plan *plan, int rangeT
                                                  parts[i].attr->atttypmod,
                                                  parts[i].attr->attcollation,
                                                  0);
-
                        /**
                         * Note that right now we only examine the given qual.  
This is okay because if there are other
                         *   quals on the plan then those would be ANDed with 
the qual, which can only narrow our choice
                         *   of segment and not expand it.
                         */
-                       pvs = DeterminePossibleValueSet((Node *) qualification, 
(Node *) var);
+                       pvs = DeterminePossibleValueSet((Node *) qualification, 
(Node *) var, policy_opfamily);
 
                        if (pvs.isAnyValuePossible)
                        {
diff --git a/src/backend/optimizer/util/predtest.c 
b/src/backend/optimizer/util/predtest.c
index 16e213faba..a5d905dd24 100644
--- a/src/backend/optimizer/util/predtest.c
+++ b/src/backend/optimizer/util/predtest.c
@@ -2233,7 +2233,7 @@ InvalidateOprProofCacheCallBack(Datum arg, int cacheid, 
uint32 hashvalue)
  * Process an AND clause -- this can do a INTERSECTION between sets learned 
from child clauses
  */
 static PossibleValueSet
-ProcessAndClauseForPossibleValues( PredIterInfoData *clauseInfo, Node *clause, 
Node *variable)
+ProcessAndClauseForPossibleValues( PredIterInfoData *clauseInfo, Node *clause, 
Node *variable, Oid opfamily)
 {
        PossibleValueSet result;
 
@@ -2241,7 +2241,7 @@ ProcessAndClauseForPossibleValues( PredIterInfoData 
*clauseInfo, Node *clause, N
 
        iterate_begin(child, clause, *clauseInfo)
        {
-               PossibleValueSet childPossible = DeterminePossibleValueSet( 
child, variable );
+               PossibleValueSet childPossible = DeterminePossibleValueSet( 
child, variable, opfamily );
                if ( childPossible.isAnyValuePossible)
                {
                        /* any value possible, this AND member does not add any 
information */
@@ -2272,14 +2272,14 @@ ProcessAndClauseForPossibleValues( PredIterInfoData 
*clauseInfo, Node *clause, N
  * Process an OR clause -- this can do a UNION between sets learned from child 
clauses
  */
 static PossibleValueSet
-ProcessOrClauseForPossibleValues( PredIterInfoData *clauseInfo, Node *clause, 
Node *variable)
+ProcessOrClauseForPossibleValues( PredIterInfoData *clauseInfo, Node *clause, 
Node *variable, Oid opfamily)
 {
        PossibleValueSet result;
        InitPossibleValueSetData(&result);
 
        iterate_begin(child, clause, *clauseInfo)
        {
-               PossibleValueSet childPossible = DeterminePossibleValueSet( 
child, variable );
+               PossibleValueSet childPossible = DeterminePossibleValueSet( 
child, variable, opfamily);
                if ( childPossible.isAnyValuePossible)
                {
                        /* any value is possible for the entire AND */
@@ -2321,7 +2321,7 @@ ProcessOrClauseForPossibleValues( PredIterInfoData 
*clauseInfo, Node *clause, No
  *    possible values is within the cross-product of the two variables' sets
  */
 PossibleValueSet
-DeterminePossibleValueSet(Node *clause, Node *variable)
+DeterminePossibleValueSet(Node *clause, Node *variable, Oid opfamily)
 {
        PredIterInfoData clauseInfo;
        PossibleValueSet result;
@@ -2335,11 +2335,11 @@ DeterminePossibleValueSet(Node *clause, Node *variable)
        switch (predicate_classify(clause, &clauseInfo))
        {
                case CLASS_AND:
-                       return ProcessAndClauseForPossibleValues(&clauseInfo, 
clause, variable);
+                       return ProcessAndClauseForPossibleValues(&clauseInfo, 
clause, variable, opfamily);
                case CLASS_OR:
-                       return ProcessOrClauseForPossibleValues(&clauseInfo, 
clause, variable);
+                       return ProcessOrClauseForPossibleValues(&clauseInfo, 
clause, variable, opfamily);
                case CLASS_ATOM:
-                       if (TryProcessExprForPossibleValues(clause, variable, 
&result))
+                       if (TryProcessExprForPossibleValues(clause, variable, 
opfamily, &result))
                        {
                                return result;
                        }
diff --git a/src/backend/optimizer/util/predtest_valueset.c 
b/src/backend/optimizer/util/predtest_valueset.c
index 0fec76ed7a..381b60e952 100644
--- a/src/backend/optimizer/util/predtest_valueset.c
+++ b/src/backend/optimizer/util/predtest_valueset.c
@@ -33,7 +33,7 @@ static void AddValue(PossibleValueSet *pvs, Const 
*valueToCopy);
 static void RemoveValue(PossibleValueSet *pvs, Const *value);
 static bool ContainsValue(PossibleValueSet *pvs, Const *value);
 
-static bool TryProcessOpExprForPossibleValues(OpExpr *expr, Node *variable, 
PossibleValueSet *resultOut);
+static bool TryProcessOpExprForPossibleValues(OpExpr *expr, Node *variable, 
Oid opfamily, PossibleValueSet *resultOut);
 static bool TryProcessNullTestForPossibleValues(NullTest *expr, Node 
*variable, PossibleValueSet *resultOut);
 
 typedef struct ConstHashValue
@@ -308,11 +308,11 @@ RemoveUnmatchingValues(PossibleValueSet *pvs, 
PossibleValueSet *toCheck)
  * @param resultOut will be updated with the modified values
  */
 bool
-TryProcessExprForPossibleValues(Node *expr, Node *variable, PossibleValueSet 
*resultOut)
+TryProcessExprForPossibleValues(Node *expr, Node *variable, Oid opfamily, 
PossibleValueSet *resultOut)
 {
        if (IsA(expr, OpExpr))
        {
-               return TryProcessOpExprForPossibleValues((OpExpr *) expr, 
variable, resultOut);
+               return TryProcessOpExprForPossibleValues((OpExpr *) expr, 
variable, opfamily, resultOut);
        }
        else if (IsA(expr, NullTest))
        {
@@ -323,7 +323,7 @@ TryProcessExprForPossibleValues(Node *expr, Node *variable, 
PossibleValueSet *re
 }
 
 static bool
-TryProcessOpExprForPossibleValues(OpExpr *expr, Node *variable, 
PossibleValueSet *resultOut)
+TryProcessOpExprForPossibleValues(OpExpr *expr, Node *variable, Oid opfamily, 
PossibleValueSet *resultOut)
 {
        Node       *leftop,
                           *rightop,
@@ -380,6 +380,16 @@ TryProcessOpExprForPossibleValues(OpExpr *expr, Node 
*variable, PossibleValueSet
                return false;
        }
 
+       /*
+        * if expr->opno does not belong to opfamily of distributed key,
+        * do not use direct dispatch to avoid wrong results.
+        * more details please refer to 
https://github.com/greenplum-db/gpdb/issues/14887
+        */
+       if (!op_in_opfamily(expr->opno, opfamily))
+       {
+               return false;
+       }
+
        clause_op_infos = get_op_btree_interpretation(expr->opno);
 
        /* check if it's equality operation */
diff --git a/src/include/optimizer/predtest_valueset.h 
b/src/include/optimizer/predtest_valueset.h
index 7f151b7fdc..6b04d2d9de 100644
--- a/src/include/optimizer/predtest_valueset.h
+++ b/src/include/optimizer/predtest_valueset.h
@@ -41,7 +41,7 @@ typedef struct
        bool isAnyValuePossible;
 } PossibleValueSet;
 
-extern PossibleValueSet DeterminePossibleValueSet(Node *clause, Node 
*variable);
+extern PossibleValueSet DeterminePossibleValueSet(Node *clause, Node 
*variable, Oid opfamily);
 
 /* returns a newly allocated list */
 extern Node **GetPossibleValuesAsArray(PossibleValueSet *pvs, int 
*numValuesOut);
@@ -52,6 +52,6 @@ extern void InitPossibleValueSetData(PossibleValueSet *pvs);
 
 extern void AddUnmatchingValues(PossibleValueSet *pvs, PossibleValueSet 
*toCheck);
 extern void RemoveUnmatchingValues(PossibleValueSet *pvs, PossibleValueSet 
*toCheck);
-extern bool TryProcessExprForPossibleValues(Node *expr, Node *variable, 
PossibleValueSet *resultOut);
+extern bool TryProcessExprForPossibleValues(Node *expr, Node *variable, Oid 
opfamily, PossibleValueSet *resultOut);
 
 #endif   /* PREDTEST_VALUESET_H */
diff --git a/src/test/isolation2/expected/setup.out 
b/src/test/isolation2/expected/setup.out
index 8df586b58e..10f69cd1eb 100644
--- a/src/test/isolation2/expected/setup.out
+++ b/src/test/isolation2/expected/setup.out
@@ -115,7 +115,7 @@ if create_slot: cmd += ' --create-slot'
 if slotname is not None: cmd += ' --slot %s' % (slotname) 
 if force_overwrite: cmd += ' --force-overwrite' 
 if xlog_method == 'stream': cmd += ' --wal-method stream' elif xlog_method == 
'fetch': cmd += ' --wal-method fetch' else: plpy.error('invalid xlog method') 
-cmd += ' --no-verify-checksums'
+cmd += ' --no-verify-checksums' 
 try: # Unset PGAPPNAME so that the pg_stat_replication.application_name is not 
affected if os.getenv('PGAPPNAME') is not None: os.environ.pop('PGAPPNAME') 
results = subprocess.check_output(cmd, stderr=subprocess.STDOUT, 
shell=True).replace(b'.', b'').decode() except subprocess.CalledProcessError as 
e: results = str(e) + "\ncommand output: " + str(e.output) 
 return results $$ language plpython3u;
 CREATE
diff --git a/src/test/regress/expected/alter_table_gp.out 
b/src/test/regress/expected/alter_table_gp.out
index 8149d5f901..dffe13af5d 100644
--- a/src/test/regress/expected/alter_table_gp.out
+++ b/src/test/regress/expected/alter_table_gp.out
@@ -371,22 +371,22 @@ CREATE TABLE issue_14279_taptest_table (
     col1 BIGINT,
     col2 TEXT NOT NULL DEFAULT 'stuff', FOREIGN KEY (col2) REFERENCES 
issue_14279_fk_reference(col2))
   PARTITION BY RANGE (col1);
-NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 
'col1' as the Greenplum Database data distribution key for this table.
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 
'col1' as the Cloudberry Database data distribution key for this table.
 HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
-WARNING:  referential integrity (FOREIGN KEY) constraints are not supported in 
Greenplum Database, will not be enforced
+WARNING:  referential integrity (FOREIGN KEY) constraints are not supported in 
Cloudberry Database, will not be enforced
 CREATE TABLE issue_14279_taptest_table_p3000000000 (
     LIKE issue_14279_taptest_table
     INCLUDING DEFAULTS INCLUDING CONSTRAINTS);
 NOTICE:  table doesn't have 'DISTRIBUTED BY' clause, defaulting to 
distribution columns from LIKE table
 ALTER TABLE issue_14279_taptest_table ATTACH PARTITION 
issue_14279_taptest_table_p3000000000 FOR VALUES FROM (3000000000) TO 
(3000000100);
-WARNING:  referential integrity (FOREIGN KEY) constraints are not supported in 
Greenplum Database, will not be enforced
+WARNING:  referential integrity (FOREIGN KEY) constraints are not supported in 
Cloudberry Database, will not be enforced
 BEGIN;
 CREATE TABLE issue_14279_taptest_table_p3000000100 (
     LIKE issue_14279_taptest_table
     INCLUDING DEFAULTS INCLUDING CONSTRAINTS);
 NOTICE:  table doesn't have 'DISTRIBUTED BY' clause, defaulting to 
distribution columns from LIKE table
 ALTER TABLE issue_14279_taptest_table ATTACH PARTITION 
issue_14279_taptest_table_p3000000100 FOR VALUES FROM (3000000100) TO 
(3000000200);
-WARNING:  referential integrity (FOREIGN KEY) constraints are not supported in 
Greenplum Database, will not be enforced
+WARNING:  referential integrity (FOREIGN KEY) constraints are not supported in 
Cloudberry Database, will not be enforced
 END;
 INSERT INTO issue_14279_taptest_table SELECT generate_series(3000000000, 
3000000001);
 INSERT INTO issue_14279_taptest_table SELECT generate_series(3000000100, 
3000000101);
diff --git a/src/test/regress/expected/btree_index.out 
b/src/test/regress/expected/btree_index.out
index 39fea6969c..2514aba7de 100644
--- a/src/test/regress/expected/btree_index.out
+++ b/src/test/regress/expected/btree_index.out
@@ -206,7 +206,7 @@ reset enable_sort;
 -- Also check LIKE optimization with binary-compatible cases
 create temp table btree_bpchar (f1 text collate "C");
 create index on btree_bpchar(f1 bpchar_ops) WITH (deduplicate_items=on);
-insert into btree_bpchar values ('foo'), ('fool'), ('bar'), ('quux');
+insert into btree_bpchar values ('foo'), ('foo  '), ('fool'), ('bar'), 
('quux');
 -- doesn't match index:
 explain (costs off)
 select * from btree_bpchar where f1 like 'foo';
@@ -233,18 +233,19 @@ select * from btree_bpchar where f1 like 'foo%';
 (3 rows)
 
 select * from btree_bpchar where f1 like 'foo%';
-  f1  
-------
+  f1   
+-------
+ foo  
  foo
  fool
-(2 rows)
+(3 rows)
 
 -- these do match the index:
 explain (costs off)
 select * from btree_bpchar where f1::bpchar like 'foo';
                         QUERY PLAN                        
 ----------------------------------------------------------
- Gather Motion 1:1  (slice1; segments: 1)
+ Gather Motion 3:1  (slice1; segments: 3)
    ->  Bitmap Heap Scan on btree_bpchar
          Filter: ((f1)::bpchar ~~ 'foo'::text)
          ->  Bitmap Index Scan on btree_bpchar_f1_idx
@@ -269,10 +270,30 @@ select * from btree_bpchar where f1::bpchar like 'foo%';
 (5 rows)
 
 select * from btree_bpchar where f1::bpchar like 'foo%';
-  f1  
-------
+  f1   
+-------
  foo
  fool
+ foo  
+(3 rows)
+
+explain (costs off)
+select * from btree_bpchar where f1::bpchar ='foo';
+                        QUERY PLAN                        
+----------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   ->  Bitmap Heap Scan on btree_bpchar
+         Recheck Cond: ((f1)::bpchar = 'foo'::bpchar)
+         ->  Bitmap Index Scan on btree_bpchar_f1_idx
+               Index Cond: ((f1)::bpchar = 'foo'::bpchar)
+ Optimizer: Postgres query optimizer
+(6 rows)
+
+select * from btree_bpchar where f1::bpchar ='foo';
+  f1   
+-------
+ foo  
+ foo
 (2 rows)
 
 -- get test coverage for "single value" deduplication strategy:
diff --git a/src/test/regress/expected/direct_dispatch.out 
b/src/test/regress/expected/direct_dispatch.out
index 0bc86776b9..672a472b7b 100644
--- a/src/test/regress/expected/direct_dispatch.out
+++ b/src/test/regress/expected/direct_dispatch.out
@@ -1118,6 +1118,80 @@ INFO:  Distributed transaction command 'Distributed 
Abort (No Prepared)' to ALL
 -- cleanup
 set test_print_direct_dispatch_info=off;
 set allow_system_table_mods=off;
+-- https://github.com/greenplum-db/gpdb/issues/14887
+-- If opno of clause does not belong to opfamily of distributed key,
+-- do not use direct dispatch to resolve wrong result
+-- FIXME: orca still has wrong results
+create table t_14887(a varchar);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into t_14887 values('a   ');
+explain select * from t_14887 where a = 'a'::bpchar;
+                                   QUERY PLAN                                  
 
+--------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..254.70 rows=53 width=32)
+   ->  Seq Scan on t_14887  (cost=0.00..254.00 rows=18 width=32)
+         Filter: ((a)::bpchar = 'a'::bpchar)
+ Optimizer: Postgres query optimizer
+(4 rows)
+
+select * from t_14887 where a = 'a'::bpchar;
+  a   
+------
+ a   
+(1 row)
+
+-- texteq does not belong to the hash opfamily of the table's citext distkey.
+-- But from the implementation can deduce: texteq ==> citext_eq, and we can
+-- do the direct dispatch.
+-- But we do not have the kind of implication rule in Postgres: texteq ==> 
citext_eq.
+-- Also partition table with citext as hash key and condition with text type
+-- does not do partition prune.
+CREATE EXTENSION if not exists citext;
+drop table t_14887;
+create table t_14887(a citext);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into t_14887 values('A'),('a');
+explain select * from t_14887 where a = 'a'::text;
+                                   QUERY PLAN                                  
 
+--------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..254.70 rows=53 width=32)
+   ->  Seq Scan on t_14887  (cost=0.00..254.00 rows=18 width=32)
+         Filter: ((a)::text = 'a'::text)
+ Optimizer: Postgres query optimizer
+(4 rows)
+
+select * from t_14887 where a = 'a'::text;
+ a 
+---
+ a
+(1 row)
+
+drop table t_14887;
+create table t_14887 (a citext) partition by hash (a);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table t0_14887 partition of t_14887 for values with (modulus 
3,remainder 0);
+NOTICE:  table has parent, setting distribution columns to match parent table
+create table t1_14887 partition of t_14887 for values with (modulus 
3,remainder 1);
+NOTICE:  table has parent, setting distribution columns to match parent table
+create table t2_14887 partition of t_14887 for values with (modulus 
3,remainder 2);
+NOTICE:  table has parent, setting distribution columns to match parent table
+explain select * from t_14887 where a = 'a'::text;
+                                    QUERY PLAN                                 
   
+----------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..764.38 rows=158 
width=32)
+   ->  Append  (cost=0.00..762.26 rows=53 width=32)
+         ->  Seq Scan on t0_14887 t_14887_1  (cost=0.00..254.00 rows=18 
width=32)
+               Filter: ((a)::text = 'a'::text)
+         ->  Seq Scan on t1_14887 t_14887_2  (cost=0.00..254.00 rows=18 
width=32)
+               Filter: ((a)::text = 'a'::text)
+         ->  Seq Scan on t2_14887 t_14887_3  (cost=0.00..254.00 rows=18 
width=32)
+               Filter: ((a)::text = 'a'::text)
+ Optimizer: Postgres query optimizer
+(9 rows)
+
 begin;
 drop table if exists direct_test;
 drop table if exists direct_test_two_column;
@@ -1131,4 +1205,6 @@ drop table if exists MPP_22019_b;
 drop table if exists t_sql_value_function1;
 drop table if exists t_sql_value_function2;
 drop table if exists t_hash_partition;
+drop table if exists t_14887;
+drop extension if exists citext cascade;
 commit;
diff --git a/src/test/regress/expected/direct_dispatch_optimizer.out 
b/src/test/regress/expected/direct_dispatch_optimizer.out
index bff3b4f19c..106c590805 100644
--- a/src/test/regress/expected/direct_dispatch_optimizer.out
+++ b/src/test/regress/expected/direct_dispatch_optimizer.out
@@ -1138,6 +1138,80 @@ INFO:  Distributed transaction command 'Distributed 
Abort (No Prepared)' to ALL
 -- cleanup
 set test_print_direct_dispatch_info=off;
 set allow_system_table_mods=off;
+-- https://github.com/greenplum-db/gpdb/issues/14887
+-- If opno of clause does not belong to opfamily of distributed key,
+-- do not use direct dispatch to resolve wrong result
+-- FIXME: orca still has wrong results
+create table t_14887(a varchar);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into t_14887 values('a   ');
+explain select * from t_14887 where a = 'a'::bpchar;
+                                  QUERY PLAN                                  
+------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..431.00 rows=1 width=8)
+   ->  Seq Scan on t_14887  (cost=0.00..431.00 rows=1 width=8)
+         Filter: ((a)::bpchar = 'a'::bpchar)
+ Optimizer: Pivotal Optimizer (GPORCA)
+(4 rows)
+
+select * from t_14887 where a = 'a'::bpchar;
+  a   
+------
+ a   
+(1 row)
+
+-- texteq does not belong to the hash opfamily of the table's citext distkey.
+-- But from the implementation can deduce: texteq ==> citext_eq, and we can
+-- do the direct dispatch.
+-- But we do not have the kind of implication rule in Postgres: texteq ==> 
citext_eq.
+-- Also partition table with citext as hash key and condition with text type
+-- does not do partition prune.
+CREATE EXTENSION if not exists citext;
+drop table t_14887;
+create table t_14887(a citext);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into t_14887 values('A'),('a');
+explain select * from t_14887 where a = 'a'::text;
+                                  QUERY PLAN                                  
+------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..431.00 rows=1 width=8)
+   ->  Seq Scan on t_14887  (cost=0.00..431.00 rows=1 width=8)
+         Filter: ((a)::text = 'a'::text)
+ Optimizer: Pivotal Optimizer (GPORCA)
+(4 rows)
+
+select * from t_14887 where a = 'a'::text;
+ a 
+---
+ a
+(1 row)
+
+drop table t_14887;
+create table t_14887 (a citext) partition by hash (a);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table t0_14887 partition of t_14887 for values with (modulus 
3,remainder 0);
+NOTICE:  table has parent, setting distribution columns to match parent table
+create table t1_14887 partition of t_14887 for values with (modulus 
3,remainder 1);
+NOTICE:  table has parent, setting distribution columns to match parent table
+create table t2_14887 partition of t_14887 for values with (modulus 
3,remainder 2);
+NOTICE:  table has parent, setting distribution columns to match parent table
+explain select * from t_14887 where a = 'a'::text;
+                                    QUERY PLAN                                 
   
+----------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..764.38 rows=158 
width=32)
+   ->  Append  (cost=0.00..762.26 rows=53 width=32)
+         ->  Seq Scan on t0_14887 t_14887_1  (cost=0.00..254.00 rows=18 
width=32)
+               Filter: ((a)::text = 'a'::text)
+         ->  Seq Scan on t1_14887 t_14887_2  (cost=0.00..254.00 rows=18 
width=32)
+               Filter: ((a)::text = 'a'::text)
+         ->  Seq Scan on t2_14887 t_14887_3  (cost=0.00..254.00 rows=18 
width=32)
+               Filter: ((a)::text = 'a'::text)
+ Optimizer: Postgres query optimizer
+(9 rows)
+
 begin;
 drop table if exists direct_test;
 drop table if exists direct_test_two_column;
@@ -1151,4 +1225,6 @@ drop table if exists MPP_22019_b;
 drop table if exists t_sql_value_function1;
 drop table if exists t_sql_value_function2;
 drop table if exists t_hash_partition;
+drop table if exists t_14887;
+drop extension if exists citext cascade;
 commit;
diff --git a/src/test/regress/sql/btree_index.sql 
b/src/test/regress/sql/btree_index.sql
index a771eb62b8..4e04300e5d 100644
--- a/src/test/regress/sql/btree_index.sql
+++ b/src/test/regress/sql/btree_index.sql
@@ -92,7 +92,7 @@ reset enable_sort;
 
 create temp table btree_bpchar (f1 text collate "C");
 create index on btree_bpchar(f1 bpchar_ops) WITH (deduplicate_items=on);
-insert into btree_bpchar values ('foo'), ('fool'), ('bar'), ('quux');
+insert into btree_bpchar values ('foo'), ('foo  '), ('fool'), ('bar'), 
('quux');
 -- doesn't match index:
 explain (costs off)
 select * from btree_bpchar where f1 like 'foo';
@@ -108,6 +108,10 @@ explain (costs off)
 select * from btree_bpchar where f1::bpchar like 'foo%';
 select * from btree_bpchar where f1::bpchar like 'foo%';
 
+explain (costs off)
+select * from btree_bpchar where f1::bpchar ='foo';
+select * from btree_bpchar where f1::bpchar ='foo';
+
 -- get test coverage for "single value" deduplication strategy:
 insert into btree_bpchar select 'foo' from generate_series(1,1500);
 
diff --git a/src/test/regress/sql/direct_dispatch.sql 
b/src/test/regress/sql/direct_dispatch.sql
index 83199f7883..ab8bafafff 100644
--- a/src/test/regress/sql/direct_dispatch.sql
+++ b/src/test/regress/sql/direct_dispatch.sql
@@ -471,6 +471,35 @@ abort;
 set test_print_direct_dispatch_info=off;
 set allow_system_table_mods=off;
 
+-- https://github.com/greenplum-db/gpdb/issues/14887
+-- If opno of clause does not belong to opfamily of distributed key,
+-- do not use direct dispatch to resolve wrong result
+-- FIXME: orca still has wrong results
+create table t_14887(a varchar);
+insert into t_14887 values('a   ');
+explain select * from t_14887 where a = 'a'::bpchar;
+select * from t_14887 where a = 'a'::bpchar;
+
+-- texteq does not belong to the hash opfamily of the table's citext distkey.
+-- But from the implementation can deduce: texteq ==> citext_eq, and we can
+-- do the direct dispatch.
+-- But we do not have the kind of implication rule in Postgres: texteq ==> 
citext_eq.
+-- Also partition table with citext as hash key and condition with text type
+-- does not do partition prune.
+CREATE EXTENSION if not exists citext;
+drop table t_14887;
+create table t_14887(a citext);
+insert into t_14887 values('A'),('a');
+explain select * from t_14887 where a = 'a'::text;
+select * from t_14887 where a = 'a'::text;
+
+drop table t_14887;
+create table t_14887 (a citext) partition by hash (a);
+create table t0_14887 partition of t_14887 for values with (modulus 
3,remainder 0);
+create table t1_14887 partition of t_14887 for values with (modulus 
3,remainder 1);
+create table t2_14887 partition of t_14887 for values with (modulus 
3,remainder 2);
+explain select * from t_14887 where a = 'a'::text;
+
 begin;
 drop table if exists direct_test;
 drop table if exists direct_test_two_column;
@@ -487,5 +516,7 @@ drop table if exists t_sql_value_function1;
 drop table if exists t_sql_value_function2;
 
 drop table if exists t_hash_partition;
+drop table if exists t_14887;
+drop extension if exists citext cascade;
 
 commit;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(cloudberry) 12/20: Resolve wrong result when direct dispatch, If opno of clause does not belong to opfamily of distributed key(#14977)

Reply via email to