(cloudberry) branch main updated: Fix segfault when recovering 2pc transaction (#1078)

yjhjstz Tue, 17 Jun 2025 09:31:43 -0700

This is an automated email from the ASF dual-hosted git repository.

yjhjstz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git



The following commit(s) were added to refs/heads/main by this push:
     new 3bae20b385b Fix segfault when recovering 2pc transaction (#1078)
3bae20b385b is described below

commit 3bae20b385b1727a741271911eba14547a1bacc0
Author: Smyatkin Maxim <smyatkinma...@gmail.com>
AuthorDate: Tue Jun 17 19:30:58 2025 +0300

    Fix segfault when recovering 2pc transaction (#1078)
    
    * Fix segfault when recovering 2pc transaction
    
    When promoting a mirror segment due to failover we have seen a stacktrace 
like this:
    
    ```
    "FATAL","58P01","requested WAL segment pg_xlog/00000023000071D50000001F has 
already been removed",,,,,,,0,,"xlogutils.c",580,"Stack
    trace:
    1    0x557bdb9f09b6 postgres errstart + 0x236
    2    0x557bdb5fc6cf postgres <symbol not found> + 0xdb5fc6cf
    3    0x557bdb5fd021 postgres read_local_xlog_page + 0x191
    4    0x557bdb5fb922 postgres <symbol not found> + 0xdb5fb922
    5    0x557bdb5fba11 postgres XLogReadRecord + 0xa1
    6    0x557bdb5e7767 postgres RecoverPreparedTransactions + 0xd7
    7    0x557bdb5f608b postgres StartupXLOG + 0x2a3b
    8    0x557bdb870a89 postgres StartupProcessMain + 0x139
    9    0x557bdb62f489 postgres AuxiliaryProcessMain + 0x549
    10   0x557bdb86d275 postgres <symbol not found> + 0xdb86d275
    11   0x557bdb8704e3 postgres PostmasterMain + 0x1213
    12   0x557bdb56a1f7 postgres main + 0x497
    13   0x7fded4a61c87 libc.so.6 __libc_start_main + 0xe7
    ```
    Note: stacktrace is from one of our production GP clusters and might be 
slightly different from what we
    will see in cloudberry, but the failure is still present here as well.
    Testcase proves it.
    
    PG13 and PG14 have a fix for this bug, but it's doesn't have any test
    case and looks like we didn't cherry-pick that far. The discussion can be 
found here:
    
https://www.postgresql.org/message-id/flat/743b9b45a2d4013bd90b6a5cba8d6faeb717ee34.camel%40cybertec.at
    
    In a few words, StartupXLOG() renames the last wal segment to .partial but 
tries to read it by the old name later in
    RecoverPreparedTransactions().
    
    The fix is mostly borrowed from PG14 postgres/postgres@f663b00 with some 
cloudberry-related exceptions.
    Also added a regression test which segfaults without this fix for any
    version of GP, PG<=12 or Cloudberry.
    
    * Add stable ordering to testcase
    
    ---------
    
    Co-authored-by: Jianghua.yjh <yjhj...@gmail.com>
---
 src/backend/access/transam/xlog.c                  | 146 ++++++++---------
 .../segwalrep/startup_rename_prepared_xlog.out     | 173 +++++++++++++++++++++
 src/test/isolation2/isolation2_schedule            |   1 +
 .../sql/segwalrep/startup_rename_prepared_xlog.sql |  89 +++++++++++
 4 files changed, 336 insertions(+), 73 deletions(-)

diff --git a/src/backend/access/transam/xlog.c 
b/src/backend/access/transam/xlog.c
index 722bea2040b..3fb9f121b93 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -8321,6 +8321,79 @@ StartupXLOG(void)
                        CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | 
CHECKPOINT_IMMEDIATE);
        }
 
+       /*
+        * Preallocate additional log files, if wanted.
+        */
+       PreallocXlogFiles(EndOfLog);
+
+       /*
+        * Okay, we're officially UP.
+        */
+       InRecovery = false;
+
+       /*
+        * Hook for plugins to do additional startup works.
+        *
+        * Allow to write any WALs in hook.
+        */
+       if (Startup_hook)
+       {
+           LocalSetXLogInsertAllowed();
+           (*Startup_hook) ();
+           LocalXLogInsertAllowed = -1;
+       }
+
+       /* start the archive_timeout timer and LSN running */
+       XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
+       XLogCtl->lastSegSwitchLSN = EndOfLog;
+
+       /* also initialize latestCompletedXid, to nextXid - 1 */
+       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+       ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
+       ShmemVariableCache->latestCompletedGxid = ShmemVariableCache->nextGxid;
+       FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid);
+       if (IsNormalProcessingMode())
+               elog(LOG, "latest completed transaction id is %u and next 
transaction id is %u",
+                        
XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid),
+                        XidFromFullTransactionId(ShmemVariableCache->nextXid));
+       LWLockRelease(ProcArrayLock);
+
+       /*
+        * Start up subtrans, if not already done for hot standby.  (commit
+        * timestamps are started below, if necessary.)
+        */
+       if (standbyState == STANDBY_DISABLED)
+       {
+               StartupCLOG();
+               StartupSUBTRANS(oldestActiveXID);
+               DistributedLog_Startup(oldestActiveXID,
+                                                          
XidFromFullTransactionId(ShmemVariableCache->nextXid));
+       }
+
+       /*
+        * Perform end of recovery actions for any SLRUs that need it.
+        */
+       TrimCLOG();
+       TrimMultiXact();
+
+       /* Reload shared-memory state for prepared transactions */
+       RecoverPreparedTransactions();
+
+       /* Shut down xlogreader */
+       if (readFile >= 0)
+       {
+               close(readFile);
+               readFile = -1;
+       }
+       XLogReaderFree(xlogreader);
+
+       /*
+        * If any of the critical GUCs have changed, log them before we allow
+        * backends to write WAL.
+        */
+       LocalSetXLogInsertAllowed();
+       XLogReportParameters();
+
        if (ArchiveRecoveryRequested)
        {
                /*
@@ -8402,28 +8475,6 @@ StartupXLOG(void)
                }
        }
 
-       /*
-        * Preallocate additional log files, if wanted.
-        */
-       PreallocXlogFiles(EndOfLog);
-
-       /*
-        * Okay, we're officially UP.
-        */
-       InRecovery = false;
-
-       /*
-        * Hook for plugins to do additional startup works.
-        *
-        * Allow to write any WALs in hook.
-        */
-       if (Startup_hook)
-       {
-           LocalSetXLogInsertAllowed();
-           (*Startup_hook) ();
-           LocalXLogInsertAllowed = -1;
-       }
-
        /*
         * If we are a standby with contentid -1 and undergoing promotion,
         * update ourselves as the new master in catalog.  This does not
@@ -8433,60 +8484,9 @@ StartupXLOG(void)
        bool needToPromoteCatalog = (IS_QUERY_DISPATCHER() &&
                                                                 
ControlFile->state == DB_IN_ARCHIVE_RECOVERY);
 
-       /* start the archive_timeout timer and LSN running */
-       XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
-       XLogCtl->lastSegSwitchLSN = EndOfLog;
-
-       /* also initialize latestCompletedXid, to nextXid - 1 */
-       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-       ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
-       ShmemVariableCache->latestCompletedGxid = ShmemVariableCache->nextGxid;
-       FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid);
-       if (IsNormalProcessingMode())
-               elog(LOG, "latest completed transaction id is %u and next 
transaction id is %u",
-                        
XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid),
-                        XidFromFullTransactionId(ShmemVariableCache->nextXid));
-       LWLockRelease(ProcArrayLock);
-
-       /*
-        * Start up subtrans, if not already done for hot standby.  (commit
-        * timestamps are started below, if necessary.)
-        */
-       if (standbyState == STANDBY_DISABLED)
-       {
-               StartupCLOG();
-               StartupSUBTRANS(oldestActiveXID);
-               DistributedLog_Startup(oldestActiveXID,
-                                                          
XidFromFullTransactionId(ShmemVariableCache->nextXid));
-       }
-
-       /*
-        * Perform end of recovery actions for any SLRUs that need it.
-        */
-       TrimCLOG();
-       TrimMultiXact();
-
-       /* Reload shared-memory state for prepared transactions */
-       RecoverPreparedTransactions();
-
        if(IsNormalProcessingMode())
                ereport(LOG, (errmsg("database system is ready")));
 
-       /* Shut down xlogreader */
-       if (readFile >= 0)
-       {
-               close(readFile);
-               readFile = -1;
-       }
-       XLogReaderFree(xlogreader);
-
-       /*
-        * If any of the critical GUCs have changed, log them before we allow
-        * backends to write WAL.
-        */
-       LocalSetXLogInsertAllowed();
-       XLogReportParameters();
-
        /*
         * Local WAL inserts enabled, so it's time to finish initialization of
         * commit timestamp.
diff --git 
a/src/test/isolation2/expected/segwalrep/startup_rename_prepared_xlog.out 
b/src/test/isolation2/expected/segwalrep/startup_rename_prepared_xlog.out
new file mode 100644
index 00000000000..d51241e6aa0
--- /dev/null
+++ b/src/test/isolation2/expected/segwalrep/startup_rename_prepared_xlog.out
@@ -0,0 +1,173 @@
+-- Test a bug where the last XLOG segment is renamed to .partial before 
prepared
+-- transactions contained in it are recovered. This would result in segfault at
+-- RecoverPreparedTransactions. The failure is well described in PG hackers:
+-- 
https://www.postgresql.org/message-id/743b9b45a2d4013bd90b6a5cba8d6faeb717ee34.camel%40cybertec.at
+-- However, in GP due to an older version of PG this error is masked by always 
caching
+-- the last opened xlog segment in XLogRead(). To actually trigger the failure 
we need to
+-- create unfinished prepared transactions in TWO different xlog segments.
+
+-- Allow extra time for mirror promotion to complete recovery to avoid
+-- gprecoverseg BEGIN failures due to gang creation failure as some primaries
+-- are not up. Setting these increase the number of retries in gang creation in
+-- case segment is in recovery. Approximately we want to wait 30 seconds.
+!\retcode gpconfig -c gp_gang_creation_retry_count -v 120 --skipvalidation 
--masteronly;
+(exited with code 0)
+!\retcode gpconfig -c gp_gang_creation_retry_timer -v 1000 --skipvalidation 
--masteronly;
+(exited with code 0)
+
+-- The last xlog segment is renamed to .partial only when archive mode is on
+!\retcode gpconfig -c archive_mode -v on;
+(exited with code 0)
+!\retcode gpconfig -c archive_command -v '/bin/true';
+(exited with code 0)
+!\retcode gpstop -rai;
+(exited with code 0)
+
+1: create extension if not exists gp_inject_fault;
+CREATE
+1: create table t_rename1 (a int);
+CREATE
+1: create table t_rename2 (a int);
+CREATE
+
+-- generate an orphaned prepare transaction in first segment
+1: select gp_inject_fault('dtm_broadcast_prepare', 'suspend', dbid) from 
gp_segment_configuration where role = 'p' and content = -1;
+ gp_inject_fault 
+-----------------
+ Success:        
+(1 row)
+-- assume (2), (1) are on different segments and one tuple is on the first 
segment.
+-- the test finally double-check that.
+2&: insert into t_rename1 values(2),(1);  <waiting ...>
+1: select gp_wait_until_triggered_fault('dtm_broadcast_prepare', 1, dbid) from 
gp_segment_configuration where role = 'p' and content = -1;
+ gp_wait_until_triggered_fault 
+-------------------------------
+ Success:                      
+(1 row)
+
+-- start another xlog segment (will be renamed to .partial on failover)
+GP_IGNORE:-- start_ignore
+GP_IGNORE:0U: select pg_switch_xlog();
+GP_IGNORE: pg_switch_xlog 
+GP_IGNORE:----------------
+GP_IGNORE: 0/C0B05E0      
+GP_IGNORE:(1 row)
+GP_IGNORE:-- end_ignore
+
+-- inject another fault and prepare transaction in the new xlog segment
+1: select gp_inject_fault('dtm_broadcast_prepare', 'reset', dbid) from 
gp_segment_configuration where role = 'p' and content = -1;
+ gp_inject_fault 
+-----------------
+ Success:        
+(1 row)
+1: select gp_inject_fault('dtm_broadcast_prepare', 'suspend', dbid) from 
gp_segment_configuration where role = 'p' and content = -1;
+ gp_inject_fault 
+-----------------
+ Success:        
+(1 row)
+-- assume (4), (3) are on different segments and one tuple is on the first 
segment.
+-- the test finally double-check that.
+3&: insert into t_rename2 values(2),(1);  <waiting ...>
+1: select gp_wait_until_triggered_fault('dtm_broadcast_prepare', 1, dbid) from 
gp_segment_configuration where role = 'p' and content = -1;
+ gp_wait_until_triggered_fault 
+-------------------------------
+ Success:                      
+(1 row)
+
+-- shutdown primary and make sure the segment is down
+-1U: select pg_ctl((SELECT datadir from gp_segment_configuration c where 
c.role='p' and c.content=0), 'stop', 'immediate');
+ pg_ctl 
+--------
+ OK     
+(1 row)
+1: select gp_request_fts_probe_scan();
+ gp_request_fts_probe_scan 
+---------------------------
+ t                         
+(1 row)
+1: select role, preferred_role from gp_segment_configuration where content = 0 
order by role;
+ role | preferred_role 
+------+----------------
+ m    | p              
+ p    | m              
+(2 rows)
+
+-- double confirm that promote succeeds.
+-- also double confirm that
+--  1. tuples (2) and (1) are located on two segments (thus we are testing 2pc 
with prepared transaction).
+--  2. there are tuples on the first segment (we have been testing on the 
first segment).
+1: insert into t_rename1 values(2),(1);
+INSERT 2
+1: select gp_segment_id, * from t_rename1 order by a;
+ gp_segment_id | a 
+---------------+---
+ 1             | 1 
+ 0             | 2 
+(2 rows)
+
+1: select gp_inject_fault('dtm_broadcast_prepare', 'reset', dbid) from 
gp_segment_configuration where role = 'p' and content = -1;
+ gp_inject_fault 
+-----------------
+ Success:        
+(1 row)
+2<:  <... completed>
+INSERT 2
+3<:  <... completed>
+INSERT 2
+
+-- confirm the "orphaned" prepared trnasaction commits finally.
+1: select * from t_rename1 order by a;
+ a 
+---
+ 1 
+ 1 
+ 2 
+ 2 
+(4 rows)
+1: select * from t_rename2 order by a;
+ a 
+---
+ 1 
+ 2 
+(2 rows)
+
+-- recovery the nodes
+!\retcode gprecoverseg -a;
+(exited with code 0)
+1: select wait_until_segment_synchronized(0);
+ wait_until_segment_synchronized 
+---------------------------------
+ OK                              
+(1 row)
+
+!\retcode gprecoverseg -ar;
+(exited with code 0)
+1: select wait_until_segment_synchronized(0);
+ wait_until_segment_synchronized 
+---------------------------------
+ OK                              
+(1 row)
+
+-- verify the first segment is recovered to the original state.
+1: select role, preferred_role from gp_segment_configuration where content = 0 
order by role;
+ role | preferred_role 
+------+----------------
+ m    | m              
+ p    | p              
+(2 rows)
+
+-- cleanup
+1: drop table t_rename1;
+DROP
+1: drop table t_rename2;
+DROP
+!\retcode gpconfig -r gp_gang_creation_retry_count --skipvalidation;
+(exited with code 0)
+!\retcode gpconfig -r gp_gang_creation_retry_timer --skipvalidation;
+(exited with code 0)
+!\retcode gpconfig -r archive_mode --skipvalidation;
+(exited with code 0)
+!\retcode gpconfig -r archive_command --skipvalidation;
+(exited with code 0)
+!\retcode gpstop -rai;
+(exited with code 0)
diff --git a/src/test/isolation2/isolation2_schedule 
b/src/test/isolation2/isolation2_schedule
index 18fd4a2c0c4..f9e6082ff45 100644
--- a/src/test/isolation2/isolation2_schedule
+++ b/src/test/isolation2/isolation2_schedule
@@ -250,6 +250,7 @@ test: segwalrep/dtm_recovery_on_standby
 test: segwalrep/commit_blocking_on_standby
 test: segwalrep/dtx_recovery_wait_lsn
 test: segwalrep/select_throttle
+test: segwalrep/startup_rename_prepared_xlog
 test: fts_manual_probe
 test: fts_session_reset
 # unstable FTS test in different arch
diff --git a/src/test/isolation2/sql/segwalrep/startup_rename_prepared_xlog.sql 
b/src/test/isolation2/sql/segwalrep/startup_rename_prepared_xlog.sql
new file mode 100644
index 00000000000..f612647f8ea
--- /dev/null
+++ b/src/test/isolation2/sql/segwalrep/startup_rename_prepared_xlog.sql
@@ -0,0 +1,89 @@
+-- Test a bug where the last XLOG segment is renamed to .partial before 
prepared
+-- transactions contained in it are recovered. This would result in segfault at
+-- RecoverPreparedTransactions. The failure is well described in PG hackers:
+-- 
https://www.postgresql.org/message-id/743b9b45a2d4013bd90b6a5cba8d6faeb717ee34.camel%40cybertec.at
+-- However, in GP due to an older version of PG this error is masked by always 
caching 
+-- the last opened xlog segment in XLogRead(). To actually trigger the failure 
we need to
+-- create unfinished prepared transactions in TWO different xlog segments.
+
+-- Allow extra time for mirror promotion to complete recovery to avoid
+-- gprecoverseg BEGIN failures due to gang creation failure as some primaries
+-- are not up. Setting these increase the number of retries in gang creation in
+-- case segment is in recovery. Approximately we want to wait 30 seconds.
+!\retcode gpconfig -c gp_gang_creation_retry_count -v 120 --skipvalidation 
--masteronly;
+!\retcode gpconfig -c gp_gang_creation_retry_timer -v 1000 --skipvalidation 
--masteronly;
+
+-- The last xlog segment is renamed to .partial only when archive mode is on
+!\retcode gpconfig -c archive_mode -v on;
+!\retcode gpconfig -c archive_command -v '/bin/true';
+!\retcode gpstop -rai;
+
+1: create extension if not exists gp_inject_fault;
+1: create table t_rename1 (a int);
+1: create table t_rename2 (a int);
+
+-- generate an orphaned prepare transaction in first segment
+1: select gp_inject_fault('dtm_broadcast_prepare', 'suspend', dbid)
+  from gp_segment_configuration where role = 'p' and content = -1;
+-- assume (2), (1) are on different segments and one tuple is on the first 
segment.
+-- the test finally double-check that.
+2&: insert into t_rename1 values(2),(1);
+1: select gp_wait_until_triggered_fault('dtm_broadcast_prepare', 1, dbid)
+  from gp_segment_configuration where role = 'p' and content = -1;
+
+-- start another xlog segment (will be renamed to .partial on failover)
+-- start_ignore
+0U: select pg_switch_xlog();
+-- end_ignore
+
+-- inject another fault and prepare transaction in the new xlog segment
+1: select gp_inject_fault('dtm_broadcast_prepare', 'reset', dbid)
+  from gp_segment_configuration where role = 'p' and content = -1;
+1: select gp_inject_fault('dtm_broadcast_prepare', 'suspend', dbid)
+  from gp_segment_configuration where role = 'p' and content = -1;
+-- assume (4), (3) are on different segments and one tuple is on the first 
segment.
+-- the test finally double-check that.
+3&: insert into t_rename2 values(2),(1);
+1: select gp_wait_until_triggered_fault('dtm_broadcast_prepare', 1, dbid)
+  from gp_segment_configuration where role = 'p' and content = -1;
+
+-- shutdown primary and make sure the segment is down
+-1U: select pg_ctl((SELECT datadir from gp_segment_configuration c
+  where c.role='p' and c.content=0), 'stop', 'immediate');
+1: select gp_request_fts_probe_scan();
+1: select role, preferred_role from gp_segment_configuration where content = 0 
order by role;
+
+-- double confirm that promote succeeds.
+-- also double confirm that
+--  1. tuples (2) and (1) are located on two segments (thus we are testing 2pc 
with prepared transaction).
+--  2. there are tuples on the first segment (we have been testing on the 
first segment).
+1: insert into t_rename1 values(2),(1);
+1: select gp_segment_id, * from t_rename1 order by a;
+
+1: select gp_inject_fault('dtm_broadcast_prepare', 'reset', dbid)
+  from gp_segment_configuration where role = 'p' and content = -1;
+2<:
+3<:
+
+-- confirm the "orphaned" prepared trnasaction commits finally.
+1: select * from t_rename1 order by a;
+1: select * from t_rename2 order by a;
+
+-- recovery the nodes
+!\retcode gprecoverseg -a;
+1: select wait_until_segment_synchronized(0);
+
+!\retcode gprecoverseg -ar;
+1: select wait_until_segment_synchronized(0);
+
+-- verify the first segment is recovered to the original state.
+1: select role, preferred_role from gp_segment_configuration where content = 0 
order by role;
+
+-- cleanup
+1: drop table t_rename1;
+1: drop table t_rename2;
+!\retcode gpconfig -r gp_gang_creation_retry_count --skipvalidation;
+!\retcode gpconfig -r gp_gang_creation_retry_timer --skipvalidation;
+!\retcode gpconfig -r archive_mode --skipvalidation;
+!\retcode gpconfig -r archive_command --skipvalidation;
+!\retcode gpstop -rai;


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@cloudberry.apache.org
For additional commands, e-mail: commits-h...@cloudberry.apache.org

(cloudberry) branch main updated: Fix segfault when recovering 2pc transaction (#1078)

Reply via email to