This is an automated email from the ASF dual-hosted git repository.

maxyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git

commit 6b19c44262bc9f498646c30d7efb061c180744e3
Author: David Kimura <[email protected]>
AuthorDate: Wed Nov 1 10:38:38 2023 -0700

    [ORCA] Relax client/server CTYPE encoding requirement (#16619)
    
    ORCA represents strings using wide characters. That requires converting
    to wide format using `vswprintf()`. However, that API may fail if the
    string to covert is incompatible with the set LC_CTYPE character set.
    That can happen if the database defined LC_CTYPE character set doesn't
    support the client character set.
    
    For example, if the database has LC_CTYPE set to 'C', but the client has
    'ko_KR.UTF-8'. In that case, ORCA currently falls back to PLANNER
    because converting to wide format fails.
    
    A few possible solutions were considered:
      1) Remove wide characters from ORCA
      2) Use a 'good' LC_CTYPE while calling `vswprintf()`
      3) Return a generic wide character string on failure
    
    Solution 1 is very invasive. Solution 2 is tricky to pick a 'good'
    LC_CTYPE. Solution 3 seems the simplest and is the implemented approach.
    
    Solution 3 required re-thinking commit 6f0737379c9. Instead of a fall
    back, we allow ORCA use a generic string. Then translate it back to the
    actual string during DXLToPlStmt translation. That handles the case
    where the offending name is in the column project list (see added
    regress testcase).
---
 .../gpopt/translate/CDXLTranslateContext.cpp       |  5 +-
 .../gpopt/translate/CTranslatorDXLToPlStmt.cpp     | 62 ++++++++++++++-
 .../gporca/libgpos/include/gpos/error/CException.h |  3 -
 .../src/unittest/gpos/string/CWStringTest.cpp      | 19 ++---
 .../gporca/libgpos/src/common/clibwrapper.cpp      |  6 +-
 src/backend/gporca/libgpos/src/error/CMessage.cpp  | 10 ---
 src/include/gpopt/translate/CDXLTranslateContext.h | 17 +++-
 src/test/regress/expected/gp_locale.out            | 90 ++++++++++++++++++++++
 src/test/regress/greenplum_schedule                |  2 +-
 src/test/regress/sql/gp_locale.sql                 | 61 +++++++++++++++
 10 files changed, 243 insertions(+), 32 deletions(-)

diff --git a/src/backend/gpopt/translate/CDXLTranslateContext.cpp 
b/src/backend/gpopt/translate/CDXLTranslateContext.cpp
index 3c98143e15..f7f2268393 100644
--- a/src/backend/gpopt/translate/CDXLTranslateContext.cpp
+++ b/src/backend/gpopt/translate/CDXLTranslateContext.cpp
@@ -29,8 +29,9 @@ using namespace gpos;
 //
 //---------------------------------------------------------------------------
 CDXLTranslateContext::CDXLTranslateContext(CMemoryPool *mp,
-                                                                               
   BOOL is_child_agg_node)
-       : m_mp(mp), m_is_child_agg_node(is_child_agg_node)
+                                                                               
   BOOL is_child_agg_node,
+                                                                               
   const Query *query)
+       : m_mp(mp), m_is_child_agg_node(is_child_agg_node), m_query(query)
 {
        // initialize hash table
        m_colid_to_target_entry_map = GPOS_NEW(m_mp) 
ULongToTargetEntryMap(m_mp);
diff --git a/src/backend/gpopt/translate/CTranslatorDXLToPlStmt.cpp 
b/src/backend/gpopt/translate/CTranslatorDXLToPlStmt.cpp
index 5f988418af..8518cb3e40 100644
--- a/src/backend/gpopt/translate/CTranslatorDXLToPlStmt.cpp
+++ b/src/backend/gpopt/translate/CTranslatorDXLToPlStmt.cpp
@@ -168,7 +168,7 @@ CTranslatorDXLToPlStmt::GetPlannedStmtFromDXL(const 
CDXLNode *dxlnode,
 {
        GPOS_ASSERT(nullptr != dxlnode);
 
-       CDXLTranslateContext dxl_translate_ctxt(m_mp, false);
+       CDXLTranslateContext dxl_translate_ctxt(m_mp, false, orig_query);
 
        PlanSlice *topslice;
 
@@ -5441,6 +5441,51 @@ CTranslatorDXLToPlStmt::ProcessDXLTblDescr(
        return index;
 }
 
+//---------------------------------------------------------------------------
+//     @function:
+//             update_unknown_locale_walker
+//
+//     @doc:
+//             Given an expression tree and a TargetEntry pointer context, 
look for a
+//             matching target entry in the expression tree and overwrite the 
given
+//             TargetEntry context's resname with the original found in the 
expression
+//             tree.
+//
+//---------------------------------------------------------------------------
+static bool
+update_unknown_locale_walker(Node *node, void *context)
+{
+       if (node == nullptr)
+       {
+               return false;
+       }
+
+       TargetEntry *unknown_target_entry = (TargetEntry *) context;
+
+       if (IsA(node, TargetEntry))
+       {
+               TargetEntry *te = (TargetEntry *) node;
+
+               if (te->resorigtbl == unknown_target_entry->resorigtbl &&
+                       te->resno == unknown_target_entry->resno)
+               {
+                       unknown_target_entry->resname = te->resname;
+                       return false;
+               }
+       }
+       else if (IsA(node, Query))
+       {
+               Query *query = (Query *) node;
+
+               return gpdb::WalkExpressionTree(
+                       (Node *) query->targetList,
+                       (bool (*)()) update_unknown_locale_walker, (void *) 
context);
+       }
+
+       return gpdb::WalkExpressionTree(
+               node, (bool (*)()) update_unknown_locale_walker, (void *) 
context);
+}
+
 //---------------------------------------------------------------------------
 //     @function:
 //             CTranslatorDXLToPlStmt::TranslateDXLProjList
@@ -5545,6 +5590,21 @@ CTranslatorDXLToPlStmt::TranslateDXLProjList(
                                }
                                target_entry->resorigtbl = 
pteOriginal->resorigtbl;
                                target_entry->resorigcol = 
pteOriginal->resorigcol;
+
+                               // Hack! ORCA represents strings using wide 
characters. That
+                               // can require converting from multibyte 
characters using
+                               // vswprintf(). However, vswprintf() is 
dependent on the system
+                               // locale which is set at the database level. 
When that locale
+                               // cannot interpret the string correctly, it 
fails. ORCA
+                               // bypasses the failure by using a generic 
"UNKNOWN" string.
+                               // When that happens, the following code 
translates it back to
+                               // the original multibyte string.
+                               if (strcmp(target_entry->resname, "UNKNOWN") == 
0)
+                               {
+                                       update_unknown_locale_walker(
+                                               (Node *) 
output_context->GetQuery(),
+                                               (void *) target_entry);
+                               }
                        }
                }
 
diff --git a/src/backend/gporca/libgpos/include/gpos/error/CException.h 
b/src/backend/gporca/libgpos/include/gpos/error/CException.h
index ae63c31097..8d4d6bc8da 100644
--- a/src/backend/gporca/libgpos/include/gpos/error/CException.h
+++ b/src/backend/gporca/libgpos/include/gpos/error/CException.h
@@ -131,9 +131,6 @@ public:
                // unknown exception
                ExmiUnhandled,
 
-               // illegal byte sequence
-               ExmiIllegalByteSequence,
-
                ExmiSentinel
        };
 
diff --git 
a/src/backend/gporca/libgpos/server/src/unittest/gpos/string/CWStringTest.cpp 
b/src/backend/gporca/libgpos/server/src/unittest/gpos/string/CWStringTest.cpp
index 06d0f9b524..7596a8d7df 100644
--- 
a/src/backend/gporca/libgpos/server/src/unittest/gpos/string/CWStringTest.cpp
+++ 
b/src/backend/gporca/libgpos/server/src/unittest/gpos/string/CWStringTest.cpp
@@ -178,30 +178,23 @@ CWStringTest::EresUnittest_AppendFormatInvalidLocale()
        CAutoMemoryPool amp(CAutoMemoryPool::ElcExc);
        CMemoryPool *mp = amp.Pmp();
 
+       CWStringDynamic *expected =
+               GPOS_NEW(mp) CWStringDynamic(mp, GPOS_WSZ_LIT("UNKNOWN"));
+
        CHAR *oldLocale = setlocale(LC_CTYPE, nullptr);
        CWStringDynamic *pstr1 = GPOS_NEW(mp) CWStringDynamic(mp);
 
        GPOS_RESULT eres = GPOS_OK;
 
        setlocale(LC_CTYPE, "C");
-       GPOS_TRY
-       {
-               pstr1->AppendFormat(GPOS_WSZ_LIT("%s"), (CHAR *) "ÃË", 123);
-
-               eres = GPOS_FAILED;
-       }
-       GPOS_CATCH_EX(ex)
-       {
-               GPOS_ASSERT(GPOS_MATCH_EX(ex, CException::ExmaSystem,
-                                                                 
CException::ExmiIllegalByteSequence));
+       pstr1->AppendFormat(GPOS_WSZ_LIT("%s"), (CHAR *) "ÃË", 123);
 
-               GPOS_RESET_EX;
-       }
-       GPOS_CATCH_END;
+       pstr1->Equals(expected);
 
        // cleanup
        setlocale(LC_CTYPE, oldLocale);
        GPOS_DELETE(pstr1);
+       GPOS_DELETE(expected);
 
        return eres;
 }
diff --git a/src/backend/gporca/libgpos/src/common/clibwrapper.cpp 
b/src/backend/gporca/libgpos/src/common/clibwrapper.cpp
index 22b317e80b..33935edd94 100644
--- a/src/backend/gporca/libgpos/src/common/clibwrapper.cpp
+++ b/src/backend/gporca/libgpos/src/common/clibwrapper.cpp
@@ -358,7 +358,11 @@ gpos::clib::Vswprintf(WCHAR *wcstr, SIZE_T max_len, const 
WCHAR *format,
        {
                // Invalid multibyte character encountered. This can happen if 
the byte sequence does not
                // match with the server encoding.
-               GPOS_RAISE(CException::ExmaSystem, 
CException::ExmiIllegalByteSequence);
+               //
+               // Hack! Rather than fail/fall-back here, ORCA uses a generic 
"UNKNOWN"
+               // string. During DXL to PlStmt translation this will be 
translated
+               // back using the original query tree (see TranslateDXLProjList)
+               res = swprintf(wcstr, max_len, format, "UNKNOWN");
        }
 
        return res;
diff --git a/src/backend/gporca/libgpos/src/error/CMessage.cpp 
b/src/backend/gporca/libgpos/src/error/CMessage.cpp
index f075252b97..15d6de984a 100644
--- a/src/backend/gporca/libgpos/src/error/CMessage.cpp
+++ b/src/backend/gporca/libgpos/src/error/CMessage.cpp
@@ -254,16 +254,6 @@ CMessage::GetMessage(ULONG index)
                        CException(CException::ExmaUnhandled, 
CException::ExmiUnhandled),
                        CException::ExsevError, GPOS_WSZ_WSZLEN("Unhandled 
exception"), 0,
                        GPOS_WSZ_WSZLEN("Unhandled exception")),
-
-               CMessage(
-                       CException(CException::ExmaSystem,
-                                          CException::ExmiIllegalByteSequence),
-                       CException::ExsevError,
-                       GPOS_WSZ_WSZLEN(
-                               "Invalid multibyte character for locale 
encountered in metadata name"),
-                       0,
-                       GPOS_WSZ_WSZLEN(
-                               "Invalid multibyte character for locale 
encountered in metadata name")),
        };
 
        return &msg[index];
diff --git a/src/include/gpopt/translate/CDXLTranslateContext.h 
b/src/include/gpopt/translate/CDXLTranslateContext.h
index 0771fc2514..56bcfd41eb 100644
--- a/src/include/gpopt/translate/CDXLTranslateContext.h
+++ b/src/include/gpopt/translate/CDXLTranslateContext.h
@@ -18,6 +18,12 @@
 #ifndef GPDXL_CDXLTranslateContext_H
 #define GPDXL_CDXLTranslateContext_H
 
+extern "C" {
+#include "postgres.h"
+
+#include "nodes/plannodes.h"
+}
+
 #include "gpos/base.h"
 #include "gpos/common/CHashMap.h"
 #include "gpos/common/CHashMapIter.h"
@@ -80,6 +86,8 @@ private:
        // to use OUTER instead of 0 for Var::varno in Agg target lists 
(MPP-12034)
        BOOL m_is_child_agg_node;
 
+       const Query *m_query{nullptr};
+
        // copy the params hashmap
        void CopyParamHashmap(ULongToColParamMap *original);
        void CopyTargetEntryHashmap(ULongToTargetEntryMap *original);
@@ -88,7 +96,8 @@ public:
        CDXLTranslateContext(const CDXLTranslateContext &) = delete;
 
        // ctor/dtor
-       CDXLTranslateContext(CMemoryPool *mp, BOOL is_child_agg_node);
+       CDXLTranslateContext(CMemoryPool *mp, BOOL is_child_agg_node,
+                                                const Query *query);
 
        CDXLTranslateContext(CMemoryPool *mp, BOOL is_child_agg_node,
                                                 ULongToColParamMap *original);
@@ -111,6 +120,12 @@ public:
                return m_colid_to_target_entry_map;
        }
 
+       const Query *
+       GetQuery()
+       {
+               return m_query;
+       }
+
        // return the target entry corresponding to the given ColId
        const TargetEntry *GetTargetEntry(ULONG colid) const;
 
diff --git a/src/test/regress/expected/gp_locale.out 
b/src/test/regress/expected/gp_locale.out
new file mode 100644
index 0000000000..0d916a93d7
--- /dev/null
+++ b/src/test/regress/expected/gp_locale.out
@@ -0,0 +1,90 @@
+-- ORCA uses functions (e.g. vswprintf) to translation to wide character
+-- format. But those libraries may fail if the current locale cannot handle the
+-- character set. This test checks that even when those libraries fail, ORCA is
+-- still able to generate plans.
+--
+-- Create a database that sets the minimum locale
+--
+DROP DATABASE IF EXISTS test_locale;
+CREATE DATABASE test_locale WITH LC_COLLATE='C' LC_CTYPE='C' 
TEMPLATE=template0;
+\c test_locale
+--
+-- drop/add/remove columns
+--
+CREATE TABLE hi_안녕세계 (a int, 안녕세계1 text, 안녕세계2 text, 안녕세계3 text) DISTRIBUTED 
BY (a);
+ALTER TABLE hi_안녕세계 DROP COLUMN 안녕세계2;
+ALTER TABLE hi_안녕세계 ADD COLUMN 안녕세계2_ADD_COLUMN text;
+ALTER TABLE hi_안녕세계 RENAME COLUMN 안녕세계3 TO こんにちわ3;
+INSERT INTO hi_안녕세계 VALUES(1, '안녕세계1 first', '안녕세2 first', '안녕세계3 first');
+INSERT INTO hi_안녕세계 VALUES(42, '안녕세계1 second', '안녕세2 second', '안녕세계3 second');
+--
+-- Try various queries containing multibyte character set and check the column
+-- name output
+--
+SET optimizer_trace_fallback=on;
+-- DELETE
+DELETE FROM hi_안녕세계 WHERE a=42;
+-- UPDATE
+UPDATE hi_안녕세계 SET 안녕세계1='안녕세계1 first UPDATE' WHERE 안녕세계1='안녕세계1 first';
+-- SELECT
+SELECT * FROM hi_안녕세계;
+ a |       안녕세계1        |  こんにちわ3  | 안녕세계2_add_column 
+---+------------------------+---------------+----------------------
+ 1 | 안녕세계1 first UPDATE | 안녕세2 first | 안녕세계3 first
+(1 row)
+
+SELECT 안녕세계1 || こんにちわ3 FROM hi_안녕세계;
+              ?column?               
+-------------------------------------
+ 안녕세계1 first UPDATE안녕세2 first
+(1 row)
+
+-- SELECT ALIAS
+SELECT 안녕세계1 AS 안녕세계1_Alias FROM hi_안녕세계;
+    안녕세계1_alias     
+------------------------
+ 안녕세계1 first UPDATE
+(1 row)
+
+-- SUBQUERY
+SELECT * FROM (SELECT 안녕세계1 FROM hi_안녕세계) t;
+       안녕세계1        
+------------------------
+ 안녕세계1 first UPDATE
+(1 row)
+
+SELECT (SELECT こんにちわ3 FROM hi_안녕세계) FROM (SELECT 1) AS q;
+  こんにちわ3  
+---------------
+ 안녕세2 first
+(1 row)
+
+SELECT (SELECT (SELECT こんにちわ3 FROM hi_안녕세계) FROM  hi_안녕세계) FROM (SELECT 1) AS 
q;
+  こんにちわ3  
+---------------
+ 안녕세2 first
+(1 row)
+
+-- CTE
+WITH cte AS
+(SELECT 안녕세계1, こんにちわ3 FROM hi_안녕세계) SELECT * FROM cte WHERE 안녕세계1 LIKE 
'안녕세계1%';
+       안녕세계1        |  こんにちわ3  
+------------------------+---------------
+ 안녕세계1 first UPDATE | 안녕세2 first
+(1 row)
+
+WITH cte(안녕세계x, こんにちわx) AS
+(SELECT 안녕세계1, こんにちわ3 FROM hi_안녕세계) SELECT * FROM cte WHERE 안녕세계x LIKE 
'안녕세계1%';
+       안녕세계x        |  こんにちわx  
+------------------------+---------------
+ 안녕세계1 first UPDATE | 안녕세2 first
+(1 row)
+
+-- JOIN
+SELECT * FROM hi_안녕세계 hi_안녕세계1, hi_안녕세계 hi_안녕세계2 WHERE hi_안녕세계1.안녕세계1 LIKE 
'%UPDATE';
+ a |       안녕세계1        |  こんにちわ3  | 안녕세계2_add_column | a |       안녕세계1        
|  こんにちわ3  | 안녕세계2_add_column 
+---+------------------------+---------------+----------------------+---+------------------------+---------------+----------------------
+ 1 | 안녕세계1 first UPDATE | 안녕세2 first | 안녕세계3 first      | 1 | 안녕세계1 first 
UPDATE | 안녕세2 first | 안녕세계3 first
+(1 row)
+
+RESET optimizer_trace_fallback;
diff --git a/src/test/regress/greenplum_schedule 
b/src/test/regress/greenplum_schedule
index b92b9b6ce6..42ebacfd14 100755
--- a/src/test/regress/greenplum_schedule
+++ b/src/test/regress/greenplum_schedule
@@ -37,7 +37,7 @@ test: instr_in_shmem_setup
 test: instr_in_shmem
 
 test: createdb
-test: gp_aggregates gp_aggregates_costs gp_metadata variadic_parameters 
default_parameters function_extensions spi gp_xml shared_scan update_gp 
triggers_gp returning_gp resource_queue_with_rule gp_types gp_index cluster_gp 
combocid_gp gp_sort gp_prepared_xacts gp_backend_info
+test: gp_aggregates gp_aggregates_costs gp_metadata variadic_parameters 
default_parameters function_extensions spi gp_xml shared_scan update_gp 
triggers_gp returning_gp resource_queue_with_rule gp_types gp_index cluster_gp 
combocid_gp gp_sort gp_prepared_xacts gp_backend_info gp_locale
 test: foreign_key_gp
 test: spi_processed64bit
 test: gp_tablespace_with_faults
diff --git a/src/test/regress/sql/gp_locale.sql 
b/src/test/regress/sql/gp_locale.sql
new file mode 100644
index 0000000000..444352c9ed
--- /dev/null
+++ b/src/test/regress/sql/gp_locale.sql
@@ -0,0 +1,61 @@
+-- ORCA uses functions (e.g. vswprintf) to translation to wide character
+-- format. But those libraries may fail if the current locale cannot handle the
+-- character set. This test checks that even when those libraries fail, ORCA is
+-- still able to generate plans.
+
+--
+-- Create a database that sets the minimum locale
+--
+DROP DATABASE IF EXISTS test_locale;
+CREATE DATABASE test_locale WITH LC_COLLATE='C' LC_CTYPE='C' 
TEMPLATE=template0;
+\c test_locale
+
+--
+-- drop/add/remove columns
+--
+CREATE TABLE hi_안녕세계 (a int, 안녕세계1 text, 안녕세계2 text, 안녕세계3 text) DISTRIBUTED 
BY (a);
+ALTER TABLE hi_안녕세계 DROP COLUMN 안녕세계2;
+ALTER TABLE hi_안녕세계 ADD COLUMN 안녕세계2_ADD_COLUMN text;
+ALTER TABLE hi_안녕세계 RENAME COLUMN 안녕세계3 TO こんにちわ3;
+
+INSERT INTO hi_안녕세계 VALUES(1, '안녕세계1 first', '안녕세2 first', '안녕세계3 first');
+INSERT INTO hi_안녕세계 VALUES(42, '안녕세계1 second', '안녕세2 second', '안녕세계3 second');
+
+--
+-- Try various queries containing multibyte character set and check the column
+-- name output
+--
+SET optimizer_trace_fallback=on;
+
+-- DELETE
+DELETE FROM hi_안녕세계 WHERE a=42;
+
+-- UPDATE
+UPDATE hi_안녕세계 SET 안녕세계1='안녕세계1 first UPDATE' WHERE 안녕세계1='안녕세계1 first';
+
+-- SELECT
+SELECT * FROM hi_안녕세계;
+
+SELECT 안녕세계1 || こんにちわ3 FROM hi_안녕세계;
+
+-- SELECT ALIAS
+SELECT 안녕세계1 AS 안녕세계1_Alias FROM hi_안녕세계;
+
+-- SUBQUERY
+SELECT * FROM (SELECT 안녕세계1 FROM hi_안녕세계) t;
+
+SELECT (SELECT こんにちわ3 FROM hi_안녕세계) FROM (SELECT 1) AS q;
+
+SELECT (SELECT (SELECT こんにちわ3 FROM hi_안녕세계) FROM  hi_안녕세계) FROM (SELECT 1) AS 
q;
+
+-- CTE
+WITH cte AS
+(SELECT 안녕세계1, こんにちわ3 FROM hi_안녕세계) SELECT * FROM cte WHERE 안녕세계1 LIKE 
'안녕세계1%';
+
+WITH cte(안녕세계x, こんにちわx) AS
+(SELECT 안녕세계1, こんにちわ3 FROM hi_안녕세계) SELECT * FROM cte WHERE 안녕세계x LIKE 
'안녕세계1%';
+
+-- JOIN
+SELECT * FROM hi_안녕세계 hi_안녕세계1, hi_안녕세계 hi_안녕세계2 WHERE hi_안녕세계1.안녕세계1 LIKE 
'%UPDATE';
+
+RESET optimizer_trace_fallback;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to