This is an automated email from the ASF dual-hosted git repository.

reshke pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git


The following commit(s) were added to refs/heads/main by this push:
     new c305e7ba961 Fix std::terminate() caused by uncaught exception in ORCA 
(#1481)
c305e7ba961 is described below

commit c305e7ba961b9d8983b4f01462054bbaee828bbb
Author: NJrslv <[email protected]>
AuthorDate: Fri Dec 19 13:59:01 2025 +0300

    Fix std::terminate() caused by uncaught exception in ORCA (#1481)
    
    ### What does this commit do?
    This PR fixes a crash, `std::terminate()`, caused by an exception occurring 
in the catch block of `GPOPTOptimizedPlan()` during 
`gpopt_context.CloneErrorMsg()`.
    
    When `CloneErrorMsg` fails (typically due to OOM), it throws a `gpdb` 
error, causing `std::terminate()`.
    
    Postgres has graceful error handling for errors that occur while processing 
other errors - even during OOM, the `ErrorContext` has reserved space to 
preserve error messages.
    
    So rethrow the error to postgres handler.
    
    ### Test Plan
    Added a new test to the regression `gporca_fault` test.
    
    I added a query that errors out during optimization in orca and as the 
result falls into `catch` block where another exception in `CloneErrorMsg` is 
thrown via inject fault.
    
    Previously it resulted in the `std::terminate()` call & process crashing. 
Now catch the exception and let the postgres handle it with reserved buffer to 
preserve error message.
    
    ### Impact
    Instead of
    ```sql
    server closed the connection unexpectedly
            This probably means the server terminated abnormally
            before or while processing the request.
    ```
    postgres reports an error (in case of OOM we will see it, not 
std::terminate with exiting processes):
    ```sql
    ERROR:  ...
    ```
---
 src/backend/gpopt/CGPOptimizer.cpp                 |  5 +++--
 src/backend/gpopt/utils/COptTasks.cpp              | 25 ++++++++++++++++++++--
 src/include/gpopt/utils/COptTasks.h                |  2 +-
 src/test/regress/expected/gporca_faults.out        | 25 ++++++++++++++++++++++
 .../regress/expected/gporca_faults_optimizer.out   | 21 ++++++++++++++++++
 src/test/regress/sql/gporca_faults.sql             | 12 +++++++++++
 6 files changed, 85 insertions(+), 5 deletions(-)

diff --git a/src/backend/gpopt/CGPOptimizer.cpp 
b/src/backend/gpopt/CGPOptimizer.cpp
index 11d08f8438a..a0253e44b41 100644
--- a/src/backend/gpopt/CGPOptimizer.cpp
+++ b/src/backend/gpopt/CGPOptimizer.cpp
@@ -63,8 +63,9 @@ CGPOptimizer::GPOPTOptimizedPlan(
        GPOS_CATCH_EX(ex)
        {
                // clone the error message before context free.
+               BOOL clone_failed = false;
                CHAR *serialized_error_msg =
-                       gpopt_context.CloneErrorMsg(MessageContext);
+                       gpopt_context.CloneErrorMsg(MessageContext, 
&clone_failed);
                // clean up context
                gpopt_context.Free(gpopt_context.epinQuery, 
gpopt_context.epinPlStmt);
 
@@ -72,7 +73,7 @@ CGPOptimizer::GPOPTOptimizedPlan(
                // we want to use the correct error code for these, in case an 
application
                // tries to do something smart with them.
 
-               if (GPOS_MATCH_EX(ex, gpdxl::ExmaGPDB, gpdxl::ExmiGPDBError))
+               if (clone_failed || GPOS_MATCH_EX(ex, gpdxl::ExmaGPDB, 
gpdxl::ExmiGPDBError))
                {
                        PG_RE_THROW();
                }
diff --git a/src/backend/gpopt/utils/COptTasks.cpp 
b/src/backend/gpopt/utils/COptTasks.cpp
index dd0e61116d8..6bd93379f0f 100644
--- a/src/backend/gpopt/utils/COptTasks.cpp
+++ b/src/backend/gpopt/utils/COptTasks.cpp
@@ -162,13 +162,34 @@ SOptContext::Free(SOptContext::EPin input, 
SOptContext::EPin output) const
 //
 //---------------------------------------------------------------------------
 CHAR *
-SOptContext::CloneErrorMsg(MemoryContext context) const
+SOptContext::CloneErrorMsg(MemoryContext context, BOOL *clone_failed) const
 {
+       *clone_failed = false;
+
        if (nullptr == context || nullptr == m_error_msg)
        {
                return nullptr;
        }
-       return gpdb::MemCtxtStrdup(context, m_error_msg);
+
+       CHAR *error_msg;
+       GPOS_TRY
+       {
+#ifdef FAULT_INJECTOR
+               if (gpdb::InjectFaultInOptTasks("opt_clone_error_msg") == 
FaultInjectorTypeSkip)
+               {
+                       GpdbEreport(ERRCODE_INTERNAL_ERROR, ERROR, "Injected 
error", nullptr);
+               }
+#endif
+               error_msg = gpdb::MemCtxtStrdup(context, m_error_msg);
+       }
+       GPOS_CATCH_EX(ex)
+       {
+               error_msg = nullptr;
+               *clone_failed = true;
+       }
+       GPOS_CATCH_END;
+
+       return error_msg;
 }
 
 
diff --git a/src/include/gpopt/utils/COptTasks.h 
b/src/include/gpopt/utils/COptTasks.h
index 3fa5f91216c..bed8f36d51a 100644
--- a/src/include/gpopt/utils/COptTasks.h
+++ b/src/include/gpopt/utils/COptTasks.h
@@ -107,7 +107,7 @@ struct SOptContext
        void Free(EPin input, EPin epinOutput) const;
 
        // Clone the error message in given context.
-       CHAR *CloneErrorMsg(struct MemoryContextData *context) const;
+       CHAR *CloneErrorMsg(struct MemoryContextData *context, BOOL 
*clone_failed) const;
 
        // casting function
        static SOptContext *Cast(void *ptr);
diff --git a/src/test/regress/expected/gporca_faults.out 
b/src/test/regress/expected/gporca_faults.out
index c8e6f7ff557..e60f3b048b7 100644
--- a/src/test/regress/expected/gporca_faults.out
+++ b/src/test/regress/expected/gporca_faults.out
@@ -70,3 +70,28 @@ SELECT 
gp_inject_fault('opt_relcache_translator_catalog_access', 'reset', 1);
  Success:
 (1 row)
 
+-- Test to check that GPOPTOptimizedPlan() does not cause std::terminate() by 
throwing an uncaught exception.
+CREATE TABLE test_orca_uncaught_exc(a int, b int) DISTRIBUTED RANDOMLY;
+-- Since ORCA cannot optimize this query, an exception is generated. We then 
inject a second exception when the
+-- first is caught and verify that it is not propagated further (no 
std::terminate() is called and backend is alive).
+SELECT gp_inject_fault('opt_clone_error_msg', 'skip', 1);
+ gp_inject_fault 
+-----------------
+ Success:
+(1 row)
+
+SELECT sum(distinct a), count(distinct b) FROM test_orca_uncaught_exc;
+ sum | count 
+-----+-------
+     |     0
+(1 row)
+
+SELECT gp_inject_fault('opt_clone_error_msg', 'reset', 1);
+ gp_inject_fault 
+-----------------
+ Success:
+(1 row)
+
+-- start_ignore
+DROP TABLE test_orca_uncaught_exc;
+-- end_ignore
diff --git a/src/test/regress/expected/gporca_faults_optimizer.out 
b/src/test/regress/expected/gporca_faults_optimizer.out
index 46ca0efe7ad..32b5b317298 100644
--- a/src/test/regress/expected/gporca_faults_optimizer.out
+++ b/src/test/regress/expected/gporca_faults_optimizer.out
@@ -62,3 +62,24 @@ SELECT 
gp_inject_fault('opt_relcache_translator_catalog_access', 'reset', 1);
  Success:
 (1 row)
 
+-- Test to check that GPOPTOptimizedPlan() does not cause std::terminate() by 
throwing an uncaught exception.
+CREATE TABLE test_orca_uncaught_exc(a int, b int) DISTRIBUTED RANDOMLY;
+-- Since ORCA cannot optimize this query, an exception is generated. We then 
inject a second exception when the
+-- first is caught and verify that it is not propagated further (no 
std::terminate() is called and backend is alive).
+SELECT gp_inject_fault('opt_clone_error_msg', 'skip', 1);
+ gp_inject_fault 
+-----------------
+ Success:
+(1 row)
+
+SELECT sum(distinct a), count(distinct b) FROM test_orca_uncaught_exc;
+ERROR:  Injected error (COptTasks.cpp:180)
+SELECT gp_inject_fault('opt_clone_error_msg', 'reset', 1);
+ gp_inject_fault 
+-----------------
+ Success:
+(1 row)
+
+-- start_ignore
+DROP TABLE test_orca_uncaught_exc;
+-- end_ignore
diff --git a/src/test/regress/sql/gporca_faults.sql 
b/src/test/regress/sql/gporca_faults.sql
index c9d1634d724..a1d324ad1da 100644
--- a/src/test/regress/sql/gporca_faults.sql
+++ b/src/test/regress/sql/gporca_faults.sql
@@ -30,3 +30,15 @@ SELECT * FROM func1_nosql_vol(5), foo;
 
 -- The fault should *not* be hit above when optimizer = off, to reset it now.
 SELECT gp_inject_fault('opt_relcache_translator_catalog_access', 'reset', 1);
+
+-- Test to check that GPOPTOptimizedPlan() does not cause std::terminate() by 
throwing an uncaught exception.
+CREATE TABLE test_orca_uncaught_exc(a int, b int) DISTRIBUTED RANDOMLY;
+-- Since ORCA cannot optimize this query, an exception is generated. We then 
inject a second exception when the
+-- first is caught and verify that it is not propagated further (no 
std::terminate() is called and backend is alive).
+SELECT gp_inject_fault('opt_clone_error_msg', 'skip', 1);
+SELECT sum(distinct a), count(distinct b) FROM test_orca_uncaught_exc;
+SELECT gp_inject_fault('opt_clone_error_msg', 'reset', 1);
+
+-- start_ignore
+DROP TABLE test_orca_uncaught_exc;
+-- end_ignore


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to