This is an automated email from the ASF dual-hosted git repository. maxyang pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/cloudberry.git
commit 6b19c44262bc9f498646c30d7efb061c180744e3 Author: David Kimura <[email protected]> AuthorDate: Wed Nov 1 10:38:38 2023 -0700 [ORCA] Relax client/server CTYPE encoding requirement (#16619) ORCA represents strings using wide characters. That requires converting to wide format using `vswprintf()`. However, that API may fail if the string to covert is incompatible with the set LC_CTYPE character set. That can happen if the database defined LC_CTYPE character set doesn't support the client character set. For example, if the database has LC_CTYPE set to 'C', but the client has 'ko_KR.UTF-8'. In that case, ORCA currently falls back to PLANNER because converting to wide format fails. A few possible solutions were considered: 1) Remove wide characters from ORCA 2) Use a 'good' LC_CTYPE while calling `vswprintf()` 3) Return a generic wide character string on failure Solution 1 is very invasive. Solution 2 is tricky to pick a 'good' LC_CTYPE. Solution 3 seems the simplest and is the implemented approach. Solution 3 required re-thinking commit 6f0737379c9. Instead of a fall back, we allow ORCA use a generic string. Then translate it back to the actual string during DXLToPlStmt translation. That handles the case where the offending name is in the column project list (see added regress testcase). --- .../gpopt/translate/CDXLTranslateContext.cpp | 5 +- .../gpopt/translate/CTranslatorDXLToPlStmt.cpp | 62 ++++++++++++++- .../gporca/libgpos/include/gpos/error/CException.h | 3 - .../src/unittest/gpos/string/CWStringTest.cpp | 19 ++--- .../gporca/libgpos/src/common/clibwrapper.cpp | 6 +- src/backend/gporca/libgpos/src/error/CMessage.cpp | 10 --- src/include/gpopt/translate/CDXLTranslateContext.h | 17 +++- src/test/regress/expected/gp_locale.out | 90 ++++++++++++++++++++++ src/test/regress/greenplum_schedule | 2 +- src/test/regress/sql/gp_locale.sql | 61 +++++++++++++++ 10 files changed, 243 insertions(+), 32 deletions(-) diff --git a/src/backend/gpopt/translate/CDXLTranslateContext.cpp b/src/backend/gpopt/translate/CDXLTranslateContext.cpp index 3c98143e15..f7f2268393 100644 --- a/src/backend/gpopt/translate/CDXLTranslateContext.cpp +++ b/src/backend/gpopt/translate/CDXLTranslateContext.cpp @@ -29,8 +29,9 @@ using namespace gpos; // //--------------------------------------------------------------------------- CDXLTranslateContext::CDXLTranslateContext(CMemoryPool *mp, - BOOL is_child_agg_node) - : m_mp(mp), m_is_child_agg_node(is_child_agg_node) + BOOL is_child_agg_node, + const Query *query) + : m_mp(mp), m_is_child_agg_node(is_child_agg_node), m_query(query) { // initialize hash table m_colid_to_target_entry_map = GPOS_NEW(m_mp) ULongToTargetEntryMap(m_mp); diff --git a/src/backend/gpopt/translate/CTranslatorDXLToPlStmt.cpp b/src/backend/gpopt/translate/CTranslatorDXLToPlStmt.cpp index 5f988418af..8518cb3e40 100644 --- a/src/backend/gpopt/translate/CTranslatorDXLToPlStmt.cpp +++ b/src/backend/gpopt/translate/CTranslatorDXLToPlStmt.cpp @@ -168,7 +168,7 @@ CTranslatorDXLToPlStmt::GetPlannedStmtFromDXL(const CDXLNode *dxlnode, { GPOS_ASSERT(nullptr != dxlnode); - CDXLTranslateContext dxl_translate_ctxt(m_mp, false); + CDXLTranslateContext dxl_translate_ctxt(m_mp, false, orig_query); PlanSlice *topslice; @@ -5441,6 +5441,51 @@ CTranslatorDXLToPlStmt::ProcessDXLTblDescr( return index; } +//--------------------------------------------------------------------------- +// @function: +// update_unknown_locale_walker +// +// @doc: +// Given an expression tree and a TargetEntry pointer context, look for a +// matching target entry in the expression tree and overwrite the given +// TargetEntry context's resname with the original found in the expression +// tree. +// +//--------------------------------------------------------------------------- +static bool +update_unknown_locale_walker(Node *node, void *context) +{ + if (node == nullptr) + { + return false; + } + + TargetEntry *unknown_target_entry = (TargetEntry *) context; + + if (IsA(node, TargetEntry)) + { + TargetEntry *te = (TargetEntry *) node; + + if (te->resorigtbl == unknown_target_entry->resorigtbl && + te->resno == unknown_target_entry->resno) + { + unknown_target_entry->resname = te->resname; + return false; + } + } + else if (IsA(node, Query)) + { + Query *query = (Query *) node; + + return gpdb::WalkExpressionTree( + (Node *) query->targetList, + (bool (*)()) update_unknown_locale_walker, (void *) context); + } + + return gpdb::WalkExpressionTree( + node, (bool (*)()) update_unknown_locale_walker, (void *) context); +} + //--------------------------------------------------------------------------- // @function: // CTranslatorDXLToPlStmt::TranslateDXLProjList @@ -5545,6 +5590,21 @@ CTranslatorDXLToPlStmt::TranslateDXLProjList( } target_entry->resorigtbl = pteOriginal->resorigtbl; target_entry->resorigcol = pteOriginal->resorigcol; + + // Hack! ORCA represents strings using wide characters. That + // can require converting from multibyte characters using + // vswprintf(). However, vswprintf() is dependent on the system + // locale which is set at the database level. When that locale + // cannot interpret the string correctly, it fails. ORCA + // bypasses the failure by using a generic "UNKNOWN" string. + // When that happens, the following code translates it back to + // the original multibyte string. + if (strcmp(target_entry->resname, "UNKNOWN") == 0) + { + update_unknown_locale_walker( + (Node *) output_context->GetQuery(), + (void *) target_entry); + } } } diff --git a/src/backend/gporca/libgpos/include/gpos/error/CException.h b/src/backend/gporca/libgpos/include/gpos/error/CException.h index ae63c31097..8d4d6bc8da 100644 --- a/src/backend/gporca/libgpos/include/gpos/error/CException.h +++ b/src/backend/gporca/libgpos/include/gpos/error/CException.h @@ -131,9 +131,6 @@ public: // unknown exception ExmiUnhandled, - // illegal byte sequence - ExmiIllegalByteSequence, - ExmiSentinel }; diff --git a/src/backend/gporca/libgpos/server/src/unittest/gpos/string/CWStringTest.cpp b/src/backend/gporca/libgpos/server/src/unittest/gpos/string/CWStringTest.cpp index 06d0f9b524..7596a8d7df 100644 --- a/src/backend/gporca/libgpos/server/src/unittest/gpos/string/CWStringTest.cpp +++ b/src/backend/gporca/libgpos/server/src/unittest/gpos/string/CWStringTest.cpp @@ -178,30 +178,23 @@ CWStringTest::EresUnittest_AppendFormatInvalidLocale() CAutoMemoryPool amp(CAutoMemoryPool::ElcExc); CMemoryPool *mp = amp.Pmp(); + CWStringDynamic *expected = + GPOS_NEW(mp) CWStringDynamic(mp, GPOS_WSZ_LIT("UNKNOWN")); + CHAR *oldLocale = setlocale(LC_CTYPE, nullptr); CWStringDynamic *pstr1 = GPOS_NEW(mp) CWStringDynamic(mp); GPOS_RESULT eres = GPOS_OK; setlocale(LC_CTYPE, "C"); - GPOS_TRY - { - pstr1->AppendFormat(GPOS_WSZ_LIT("%s"), (CHAR *) "ÃË", 123); - - eres = GPOS_FAILED; - } - GPOS_CATCH_EX(ex) - { - GPOS_ASSERT(GPOS_MATCH_EX(ex, CException::ExmaSystem, - CException::ExmiIllegalByteSequence)); + pstr1->AppendFormat(GPOS_WSZ_LIT("%s"), (CHAR *) "ÃË", 123); - GPOS_RESET_EX; - } - GPOS_CATCH_END; + pstr1->Equals(expected); // cleanup setlocale(LC_CTYPE, oldLocale); GPOS_DELETE(pstr1); + GPOS_DELETE(expected); return eres; } diff --git a/src/backend/gporca/libgpos/src/common/clibwrapper.cpp b/src/backend/gporca/libgpos/src/common/clibwrapper.cpp index 22b317e80b..33935edd94 100644 --- a/src/backend/gporca/libgpos/src/common/clibwrapper.cpp +++ b/src/backend/gporca/libgpos/src/common/clibwrapper.cpp @@ -358,7 +358,11 @@ gpos::clib::Vswprintf(WCHAR *wcstr, SIZE_T max_len, const WCHAR *format, { // Invalid multibyte character encountered. This can happen if the byte sequence does not // match with the server encoding. - GPOS_RAISE(CException::ExmaSystem, CException::ExmiIllegalByteSequence); + // + // Hack! Rather than fail/fall-back here, ORCA uses a generic "UNKNOWN" + // string. During DXL to PlStmt translation this will be translated + // back using the original query tree (see TranslateDXLProjList) + res = swprintf(wcstr, max_len, format, "UNKNOWN"); } return res; diff --git a/src/backend/gporca/libgpos/src/error/CMessage.cpp b/src/backend/gporca/libgpos/src/error/CMessage.cpp index f075252b97..15d6de984a 100644 --- a/src/backend/gporca/libgpos/src/error/CMessage.cpp +++ b/src/backend/gporca/libgpos/src/error/CMessage.cpp @@ -254,16 +254,6 @@ CMessage::GetMessage(ULONG index) CException(CException::ExmaUnhandled, CException::ExmiUnhandled), CException::ExsevError, GPOS_WSZ_WSZLEN("Unhandled exception"), 0, GPOS_WSZ_WSZLEN("Unhandled exception")), - - CMessage( - CException(CException::ExmaSystem, - CException::ExmiIllegalByteSequence), - CException::ExsevError, - GPOS_WSZ_WSZLEN( - "Invalid multibyte character for locale encountered in metadata name"), - 0, - GPOS_WSZ_WSZLEN( - "Invalid multibyte character for locale encountered in metadata name")), }; return &msg[index]; diff --git a/src/include/gpopt/translate/CDXLTranslateContext.h b/src/include/gpopt/translate/CDXLTranslateContext.h index 0771fc2514..56bcfd41eb 100644 --- a/src/include/gpopt/translate/CDXLTranslateContext.h +++ b/src/include/gpopt/translate/CDXLTranslateContext.h @@ -18,6 +18,12 @@ #ifndef GPDXL_CDXLTranslateContext_H #define GPDXL_CDXLTranslateContext_H +extern "C" { +#include "postgres.h" + +#include "nodes/plannodes.h" +} + #include "gpos/base.h" #include "gpos/common/CHashMap.h" #include "gpos/common/CHashMapIter.h" @@ -80,6 +86,8 @@ private: // to use OUTER instead of 0 for Var::varno in Agg target lists (MPP-12034) BOOL m_is_child_agg_node; + const Query *m_query{nullptr}; + // copy the params hashmap void CopyParamHashmap(ULongToColParamMap *original); void CopyTargetEntryHashmap(ULongToTargetEntryMap *original); @@ -88,7 +96,8 @@ public: CDXLTranslateContext(const CDXLTranslateContext &) = delete; // ctor/dtor - CDXLTranslateContext(CMemoryPool *mp, BOOL is_child_agg_node); + CDXLTranslateContext(CMemoryPool *mp, BOOL is_child_agg_node, + const Query *query); CDXLTranslateContext(CMemoryPool *mp, BOOL is_child_agg_node, ULongToColParamMap *original); @@ -111,6 +120,12 @@ public: return m_colid_to_target_entry_map; } + const Query * + GetQuery() + { + return m_query; + } + // return the target entry corresponding to the given ColId const TargetEntry *GetTargetEntry(ULONG colid) const; diff --git a/src/test/regress/expected/gp_locale.out b/src/test/regress/expected/gp_locale.out new file mode 100644 index 0000000000..0d916a93d7 --- /dev/null +++ b/src/test/regress/expected/gp_locale.out @@ -0,0 +1,90 @@ +-- ORCA uses functions (e.g. vswprintf) to translation to wide character +-- format. But those libraries may fail if the current locale cannot handle the +-- character set. This test checks that even when those libraries fail, ORCA is +-- still able to generate plans. +-- +-- Create a database that sets the minimum locale +-- +DROP DATABASE IF EXISTS test_locale; +CREATE DATABASE test_locale WITH LC_COLLATE='C' LC_CTYPE='C' TEMPLATE=template0; +\c test_locale +-- +-- drop/add/remove columns +-- +CREATE TABLE hi_안녕세계 (a int, 안녕세계1 text, 안녕세계2 text, 안녕세계3 text) DISTRIBUTED BY (a); +ALTER TABLE hi_안녕세계 DROP COLUMN 안녕세계2; +ALTER TABLE hi_안녕세계 ADD COLUMN 안녕세계2_ADD_COLUMN text; +ALTER TABLE hi_안녕세계 RENAME COLUMN 안녕세계3 TO こんにちわ3; +INSERT INTO hi_안녕세계 VALUES(1, '안녕세계1 first', '안녕세2 first', '안녕세계3 first'); +INSERT INTO hi_안녕세계 VALUES(42, '안녕세계1 second', '안녕세2 second', '안녕세계3 second'); +-- +-- Try various queries containing multibyte character set and check the column +-- name output +-- +SET optimizer_trace_fallback=on; +-- DELETE +DELETE FROM hi_안녕세계 WHERE a=42; +-- UPDATE +UPDATE hi_안녕세계 SET 안녕세계1='안녕세계1 first UPDATE' WHERE 안녕세계1='안녕세계1 first'; +-- SELECT +SELECT * FROM hi_안녕세계; + a | 안녕세계1 | こんにちわ3 | 안녕세계2_add_column +---+------------------------+---------------+---------------------- + 1 | 안녕세계1 first UPDATE | 안녕세2 first | 안녕세계3 first +(1 row) + +SELECT 안녕세계1 || こんにちわ3 FROM hi_안녕세계; + ?column? +------------------------------------- + 안녕세계1 first UPDATE안녕세2 first +(1 row) + +-- SELECT ALIAS +SELECT 안녕세계1 AS 안녕세계1_Alias FROM hi_안녕세계; + 안녕세계1_alias +------------------------ + 안녕세계1 first UPDATE +(1 row) + +-- SUBQUERY +SELECT * FROM (SELECT 안녕세계1 FROM hi_안녕세계) t; + 안녕세계1 +------------------------ + 안녕세계1 first UPDATE +(1 row) + +SELECT (SELECT こんにちわ3 FROM hi_안녕세계) FROM (SELECT 1) AS q; + こんにちわ3 +--------------- + 안녕세2 first +(1 row) + +SELECT (SELECT (SELECT こんにちわ3 FROM hi_안녕세계) FROM hi_안녕세계) FROM (SELECT 1) AS q; + こんにちわ3 +--------------- + 안녕세2 first +(1 row) + +-- CTE +WITH cte AS +(SELECT 안녕세계1, こんにちわ3 FROM hi_안녕세계) SELECT * FROM cte WHERE 안녕세계1 LIKE '안녕세계1%'; + 안녕세계1 | こんにちわ3 +------------------------+--------------- + 안녕세계1 first UPDATE | 안녕세2 first +(1 row) + +WITH cte(안녕세계x, こんにちわx) AS +(SELECT 안녕세계1, こんにちわ3 FROM hi_안녕세계) SELECT * FROM cte WHERE 안녕세계x LIKE '안녕세계1%'; + 안녕세계x | こんにちわx +------------------------+--------------- + 안녕세계1 first UPDATE | 안녕세2 first +(1 row) + +-- JOIN +SELECT * FROM hi_안녕세계 hi_안녕세계1, hi_안녕세계 hi_안녕세계2 WHERE hi_안녕세계1.안녕세계1 LIKE '%UPDATE'; + a | 안녕세계1 | こんにちわ3 | 안녕세계2_add_column | a | 안녕세계1 | こんにちわ3 | 안녕세계2_add_column +---+------------------------+---------------+----------------------+---+------------------------+---------------+---------------------- + 1 | 안녕세계1 first UPDATE | 안녕세2 first | 안녕세계3 first | 1 | 안녕세계1 first UPDATE | 안녕세2 first | 안녕세계3 first +(1 row) + +RESET optimizer_trace_fallback; diff --git a/src/test/regress/greenplum_schedule b/src/test/regress/greenplum_schedule index b92b9b6ce6..42ebacfd14 100755 --- a/src/test/regress/greenplum_schedule +++ b/src/test/regress/greenplum_schedule @@ -37,7 +37,7 @@ test: instr_in_shmem_setup test: instr_in_shmem test: createdb -test: gp_aggregates gp_aggregates_costs gp_metadata variadic_parameters default_parameters function_extensions spi gp_xml shared_scan update_gp triggers_gp returning_gp resource_queue_with_rule gp_types gp_index cluster_gp combocid_gp gp_sort gp_prepared_xacts gp_backend_info +test: gp_aggregates gp_aggregates_costs gp_metadata variadic_parameters default_parameters function_extensions spi gp_xml shared_scan update_gp triggers_gp returning_gp resource_queue_with_rule gp_types gp_index cluster_gp combocid_gp gp_sort gp_prepared_xacts gp_backend_info gp_locale test: foreign_key_gp test: spi_processed64bit test: gp_tablespace_with_faults diff --git a/src/test/regress/sql/gp_locale.sql b/src/test/regress/sql/gp_locale.sql new file mode 100644 index 0000000000..444352c9ed --- /dev/null +++ b/src/test/regress/sql/gp_locale.sql @@ -0,0 +1,61 @@ +-- ORCA uses functions (e.g. vswprintf) to translation to wide character +-- format. But those libraries may fail if the current locale cannot handle the +-- character set. This test checks that even when those libraries fail, ORCA is +-- still able to generate plans. + +-- +-- Create a database that sets the minimum locale +-- +DROP DATABASE IF EXISTS test_locale; +CREATE DATABASE test_locale WITH LC_COLLATE='C' LC_CTYPE='C' TEMPLATE=template0; +\c test_locale + +-- +-- drop/add/remove columns +-- +CREATE TABLE hi_안녕세계 (a int, 안녕세계1 text, 안녕세계2 text, 안녕세계3 text) DISTRIBUTED BY (a); +ALTER TABLE hi_안녕세계 DROP COLUMN 안녕세계2; +ALTER TABLE hi_안녕세계 ADD COLUMN 안녕세계2_ADD_COLUMN text; +ALTER TABLE hi_안녕세계 RENAME COLUMN 안녕세계3 TO こんにちわ3; + +INSERT INTO hi_안녕세계 VALUES(1, '안녕세계1 first', '안녕세2 first', '안녕세계3 first'); +INSERT INTO hi_안녕세계 VALUES(42, '안녕세계1 second', '안녕세2 second', '안녕세계3 second'); + +-- +-- Try various queries containing multibyte character set and check the column +-- name output +-- +SET optimizer_trace_fallback=on; + +-- DELETE +DELETE FROM hi_안녕세계 WHERE a=42; + +-- UPDATE +UPDATE hi_안녕세계 SET 안녕세계1='안녕세계1 first UPDATE' WHERE 안녕세계1='안녕세계1 first'; + +-- SELECT +SELECT * FROM hi_안녕세계; + +SELECT 안녕세계1 || こんにちわ3 FROM hi_안녕세계; + +-- SELECT ALIAS +SELECT 안녕세계1 AS 안녕세계1_Alias FROM hi_안녕세계; + +-- SUBQUERY +SELECT * FROM (SELECT 안녕세계1 FROM hi_안녕세계) t; + +SELECT (SELECT こんにちわ3 FROM hi_안녕세계) FROM (SELECT 1) AS q; + +SELECT (SELECT (SELECT こんにちわ3 FROM hi_안녕세계) FROM hi_안녕세계) FROM (SELECT 1) AS q; + +-- CTE +WITH cte AS +(SELECT 안녕세계1, こんにちわ3 FROM hi_안녕세계) SELECT * FROM cte WHERE 안녕세계1 LIKE '안녕세계1%'; + +WITH cte(안녕세계x, こんにちわx) AS +(SELECT 안녕세계1, こんにちわ3 FROM hi_안녕세계) SELECT * FROM cte WHERE 안녕세계x LIKE '안녕세계1%'; + +-- JOIN +SELECT * FROM hi_안녕세계 hi_안녕세계1, hi_안녕세계 hi_안녕세계2 WHERE hi_안녕세계1.안녕세계1 LIKE '%UPDATE'; + +RESET optimizer_trace_fallback; --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
