This is an automated email from the ASF dual-hosted git repository.

yjhjstz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git
The following commit(s) were added to refs/heads/main by this push:
     new 3bae20b385b Fix segfault when recovering 2pc transaction (#1078)
3bae20b385b is described below

commit 3bae20b385b1727a741271911eba14547a1bacc0
Author: Smyatkin Maxim <smyatkinma...@gmail.com>
AuthorDate: Tue Jun 17 19:30:58 2025 +0300

    Fix segfault when recovering 2pc transaction (#1078)

    * Fix segfault when recovering 2pc transaction

    When promoting a mirror segment due to failover, we have seen a stacktrace
    like this:
    ```
    "FATAL","58P01","requested WAL segment pg_xlog/00000023000071D50000001F has already been removed",,,,,,,0,,"xlogutils.c",580,"Stack trace:
    1    0x557bdb9f09b6 postgres errstart + 0x236
    2    0x557bdb5fc6cf postgres <symbol not found> + 0xdb5fc6cf
    3    0x557bdb5fd021 postgres read_local_xlog_page + 0x191
    4    0x557bdb5fb922 postgres <symbol not found> + 0xdb5fb922
    5    0x557bdb5fba11 postgres XLogReadRecord + 0xa1
    6    0x557bdb5e7767 postgres RecoverPreparedTransactions + 0xd7
    7    0x557bdb5f608b postgres StartupXLOG + 0x2a3b
    8    0x557bdb870a89 postgres StartupProcessMain + 0x139
    9    0x557bdb62f489 postgres AuxiliaryProcessMain + 0x549
    10   0x557bdb86d275 postgres <symbol not found> + 0xdb86d275
    11   0x557bdb8704e3 postgres PostmasterMain + 0x1213
    12   0x557bdb56a1f7 postgres main + 0x497
    13   0x7fded4a61c87 libc.so.6 __libc_start_main + 0xe7
    ```
    Note: the stacktrace is from one of our production GP clusters and might be
    slightly different from what we would see in Cloudberry, but the failure is
    still present here as well; the test case proves it.

    PG13 and PG14 have a fix for this bug, but it doesn't include any test case,
    and it looks like we didn't cherry-pick that far. The discussion can be
    found here:
    https://www.postgresql.org/message-id/flat/743b9b45a2d4013bd90b6a5cba8d6faeb717ee34.camel%40cybertec.at

    In a few words, StartupXLOG() renames the last WAL segment to .partial but
    tries to read it by the old name later in RecoverPreparedTransactions().
    The fix is mostly borrowed from PG14 postgres/postgres@f663b00 with some
    Cloudberry-related exceptions.

    Also added a regression test which segfaults without this fix on any
    version of GP, PG <= 12, or Cloudberry.

    * Add stable ordering to testcase

    ---------

    Co-authored-by: Jianghua.yjh <yjhj...@gmail.com>
---
 src/backend/access/transam/xlog.c                  | 146 ++++++++---------
 .../segwalrep/startup_rename_prepared_xlog.out     | 173 +++++++++++++++++++++
 src/test/isolation2/isolation2_schedule            |   1 +
 .../sql/segwalrep/startup_rename_prepared_xlog.sql |  89 +++++++++++
 4 files changed, 336 insertions(+), 73 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 722bea2040b..3fb9f121b93 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -8321,6 +8321,79 @@ StartupXLOG(void)
 		CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
 	}

+	/*
+	 * Preallocate additional log files, if wanted.
+	 */
+	PreallocXlogFiles(EndOfLog);
+
+	/*
+	 * Okay, we're officially UP.
+	 */
+	InRecovery = false;
+
+	/*
+	 * Hook for plugins to do additional startup works.
+	 *
+	 * Allow to write any WALs in hook.
+	 */
+	if (Startup_hook)
+	{
+		LocalSetXLogInsertAllowed();
+		(*Startup_hook) ();
+		LocalXLogInsertAllowed = -1;
+	}
+
+	/* start the archive_timeout timer and LSN running */
+	XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
+	XLogCtl->lastSegSwitchLSN = EndOfLog;
+
+	/* also initialize latestCompletedXid, to nextXid - 1 */
+	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+	ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
+	ShmemVariableCache->latestCompletedGxid = ShmemVariableCache->nextGxid;
+	FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid);
+	if (IsNormalProcessingMode())
+		elog(LOG, "latest completed transaction id is %u and next transaction id is %u",
+			 XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid),
+			 XidFromFullTransactionId(ShmemVariableCache->nextXid));
+	LWLockRelease(ProcArrayLock);
+
+	/*
+	 * Start up subtrans, if not already done for hot standby. (commit
+	 * timestamps are started below, if necessary.)
+	 */
+	if (standbyState == STANDBY_DISABLED)
+	{
+		StartupCLOG();
+		StartupSUBTRANS(oldestActiveXID);
+		DistributedLog_Startup(oldestActiveXID,
+							   XidFromFullTransactionId(ShmemVariableCache->nextXid));
+	}
+
+	/*
+	 * Perform end of recovery actions for any SLRUs that need it.
+	 */
+	TrimCLOG();
+	TrimMultiXact();
+
+	/* Reload shared-memory state for prepared transactions */
+	RecoverPreparedTransactions();
+
+	/* Shut down xlogreader */
+	if (readFile >= 0)
+	{
+		close(readFile);
+		readFile = -1;
+	}
+	XLogReaderFree(xlogreader);
+
+	/*
+	 * If any of the critical GUCs have changed, log them before we allow
+	 * backends to write WAL.
+	 */
+	LocalSetXLogInsertAllowed();
+	XLogReportParameters();
+
 	if (ArchiveRecoveryRequested)
 	{
 		/*
@@ -8402,28 +8475,6 @@
 		}
 	}

-	/*
-	 * Preallocate additional log files, if wanted.
-	 */
-	PreallocXlogFiles(EndOfLog);
-
-	/*
-	 * Okay, we're officially UP.
-	 */
-	InRecovery = false;
-
-	/*
-	 * Hook for plugins to do additional startup works.
-	 *
-	 * Allow to write any WALs in hook.
-	 */
-	if (Startup_hook)
-	{
-		LocalSetXLogInsertAllowed();
-		(*Startup_hook) ();
-		LocalXLogInsertAllowed = -1;
-	}
-
 	/*
 	 * If we are a standby with contentid -1 and undergoing promotion,
 	 * update ourselves as the new master in catalog. This does not
@@ -8433,60 +8484,9 @@
 	bool		needToPromoteCatalog = (IS_QUERY_DISPATCHER() &&
 									ControlFile->state == DB_IN_ARCHIVE_RECOVERY);

-	/* start the archive_timeout timer and LSN running */
-	XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
-	XLogCtl->lastSegSwitchLSN = EndOfLog;
-
-	/* also initialize latestCompletedXid, to nextXid - 1 */
-	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-	ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
-	ShmemVariableCache->latestCompletedGxid = ShmemVariableCache->nextGxid;
-	FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid);
-	if (IsNormalProcessingMode())
-		elog(LOG, "latest completed transaction id is %u and next transaction id is %u",
-			 XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid),
-			 XidFromFullTransactionId(ShmemVariableCache->nextXid));
-	LWLockRelease(ProcArrayLock);
-
-	/*
-	 * Start up subtrans, if not already done for hot standby. (commit
-	 * timestamps are started below, if necessary.)
-	 */
-	if (standbyState == STANDBY_DISABLED)
-	{
-		StartupCLOG();
-		StartupSUBTRANS(oldestActiveXID);
-		DistributedLog_Startup(oldestActiveXID,
-							   XidFromFullTransactionId(ShmemVariableCache->nextXid));
-	}
-
-	/*
-	 * Perform end of recovery actions for any SLRUs that need it.
-	 */
-	TrimCLOG();
-	TrimMultiXact();
-
-	/* Reload shared-memory state for prepared transactions */
-	RecoverPreparedTransactions();
-
 	if(IsNormalProcessingMode())
 		ereport(LOG, (errmsg("database system is ready")));

-	/* Shut down xlogreader */
-	if (readFile >= 0)
-	{
-		close(readFile);
-		readFile = -1;
-	}
-	XLogReaderFree(xlogreader);
-
-	/*
-	 * If any of the critical GUCs have changed, log them before we allow
-	 * backends to write WAL.
-	 */
-	LocalSetXLogInsertAllowed();
-	XLogReportParameters();
-
 	/*
 	 * Local WAL inserts enabled, so it's time to finish initialization of
 	 * commit timestamp.
diff --git a/src/test/isolation2/expected/segwalrep/startup_rename_prepared_xlog.out b/src/test/isolation2/expected/segwalrep/startup_rename_prepared_xlog.out
new file mode 100644
index 00000000000..d51241e6aa0
--- /dev/null
+++ b/src/test/isolation2/expected/segwalrep/startup_rename_prepared_xlog.out
@@ -0,0 +1,173 @@
+-- Test a bug where the last XLOG segment is renamed to .partial before prepared
+-- transactions contained in it are recovered. This would result in segfault at
+-- RecoverPreparedTransactions. The failure is well described in PG hackers:
+-- https://www.postgresql.org/message-id/743b9b45a2d4013bd90b6a5cba8d6faeb717ee34.camel%40cybertec.at
+-- However, in GP due to an older version of PG this error is masked by always caching
+-- the last opened xlog segment in XLogRead(). To actually trigger the failure we need to
+-- create unfinished prepared transactions in TWO different xlog segments.
+
+-- Allow extra time for mirror promotion to complete recovery to avoid
+-- gprecoverseg BEGIN failures due to gang creation failure as some primaries
+-- are not up. Setting these increase the number of retries in gang creation in
+-- case segment is in recovery. Approximately we want to wait 30 seconds.
+!\retcode gpconfig -c gp_gang_creation_retry_count -v 120 --skipvalidation --masteronly;
+(exited with code 0)
+!\retcode gpconfig -c gp_gang_creation_retry_timer -v 1000 --skipvalidation --masteronly;
+(exited with code 0)
+
+-- The last xlog segment is renamed to .partial only when archive mode is on
+!\retcode gpconfig -c archive_mode -v on;
+(exited with code 0)
+!\retcode gpconfig -c archive_command -v '/bin/true';
+(exited with code 0)
+!\retcode gpstop -rai;
+(exited with code 0)
+
+1: create extension if not exists gp_inject_fault;
+CREATE
+1: create table t_rename1 (a int);
+CREATE
+1: create table t_rename2 (a int);
+CREATE
+
+-- generate an orphaned prepare transaction in first segment
+1: select gp_inject_fault('dtm_broadcast_prepare', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content = -1;
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+-- assume (2), (1) are on different segments and one tuple is on the first segment.
+-- the test finally double-check that.
+2&: insert into t_rename1 values(2),(1); <waiting ...>
+1: select gp_wait_until_triggered_fault('dtm_broadcast_prepare', 1, dbid) from gp_segment_configuration where role = 'p' and content = -1;
+ gp_wait_until_triggered_fault
+-------------------------------
+ Success:
+(1 row)
+
+-- start another xlog segment (will be renamed to .partial on failover)
+GP_IGNORE:-- start_ignore
+GP_IGNORE:0U: select pg_switch_xlog();
+GP_IGNORE: pg_switch_xlog
+GP_IGNORE:----------------
+GP_IGNORE: 0/C0B05E0
+GP_IGNORE:(1 row)
+GP_IGNORE:-- end_ignore
+
+-- inject another fault and prepare transaction in the new xlog segment
+1: select gp_inject_fault('dtm_broadcast_prepare', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = -1;
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+1: select gp_inject_fault('dtm_broadcast_prepare', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content = -1;
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+-- assume (4), (3) are on different segments and one tuple is on the first segment.
+-- the test finally double-check that.
+3&: insert into t_rename2 values(2),(1); <waiting ...>
+1: select gp_wait_until_triggered_fault('dtm_broadcast_prepare', 1, dbid) from gp_segment_configuration where role = 'p' and content = -1;
+ gp_wait_until_triggered_fault
+-------------------------------
+ Success:
+(1 row)
+
+-- shutdown primary and make sure the segment is down
+-1U: select pg_ctl((SELECT datadir from gp_segment_configuration c where c.role='p' and c.content=0), 'stop', 'immediate');
+ pg_ctl
+--------
+ OK
+(1 row)
+1: select gp_request_fts_probe_scan();
+ gp_request_fts_probe_scan
+---------------------------
+ t
+(1 row)
+1: select role, preferred_role from gp_segment_configuration where content = 0 order by role;
+ role | preferred_role
+------+----------------
+ m    | p
+ p    | m
+(2 rows)
+
+-- double confirm that promote succeeds.
+-- also double confirm that
+-- 1. tuples (2) and (1) are located on two segments (thus we are testing 2pc with prepared transaction).
+-- 2. there are tuples on the first segment (we have been testing on the first segment).
+1: insert into t_rename1 values(2),(1);
+INSERT 2
+1: select gp_segment_id, * from t_rename1 order by a;
+ gp_segment_id | a
+---------------+---
+ 1             | 1
+ 0             | 2
+(2 rows)
+
+1: select gp_inject_fault('dtm_broadcast_prepare', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = -1;
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+2<: <... completed>
+INSERT 2
+3<: <... completed>
+INSERT 2
+
+-- confirm the "orphaned" prepared trnasaction commits finally.
+1: select * from t_rename1 order by a;
+ a
+---
+ 1
+ 1
+ 2
+ 2
+(4 rows)
+1: select * from t_rename2 order by a;
+ a
+---
+ 1
+ 2
+(2 rows)
+
+-- recovery the nodes
+!\retcode gprecoverseg -a;
+(exited with code 0)
+1: select wait_until_segment_synchronized(0);
+ wait_until_segment_synchronized
+---------------------------------
+ OK
+(1 row)
+
+!\retcode gprecoverseg -ar;
+(exited with code 0)
+1: select wait_until_segment_synchronized(0);
+ wait_until_segment_synchronized
+---------------------------------
+ OK
+(1 row)
+
+-- verify the first segment is recovered to the original state.
+1: select role, preferred_role from gp_segment_configuration where content = 0 order by role;
+ role | preferred_role
+------+----------------
+ m    | m
+ p    | p
+(2 rows)
+
+-- cleanup
+1: drop table t_rename1;
+DROP
+1: drop table t_rename2;
+DROP
+!\retcode gpconfig -r gp_gang_creation_retry_count --skipvalidation;
+(exited with code 0)
+!\retcode gpconfig -r gp_gang_creation_retry_timer --skipvalidation;
+(exited with code 0)
+!\retcode gpconfig -r archive_mode --skipvalidation;
+(exited with code 0)
+!\retcode gpconfig -r archive_command --skipvalidation;
+(exited with code 0)
+!\retcode gpstop -rai;
+(exited with code 0)
diff --git a/src/test/isolation2/isolation2_schedule b/src/test/isolation2/isolation2_schedule
index 18fd4a2c0c4..f9e6082ff45 100644
--- a/src/test/isolation2/isolation2_schedule
+++ b/src/test/isolation2/isolation2_schedule
@@ -250,6 +250,7 @@ test: segwalrep/dtm_recovery_on_standby
 test: segwalrep/commit_blocking_on_standby
 test: segwalrep/dtx_recovery_wait_lsn
 test: segwalrep/select_throttle
+test: segwalrep/startup_rename_prepared_xlog
 test: fts_manual_probe
 test: fts_session_reset
 # unstable FTS test in different arch
diff --git a/src/test/isolation2/sql/segwalrep/startup_rename_prepared_xlog.sql b/src/test/isolation2/sql/segwalrep/startup_rename_prepared_xlog.sql
new file mode 100644
index 00000000000..f612647f8ea
--- /dev/null
+++ b/src/test/isolation2/sql/segwalrep/startup_rename_prepared_xlog.sql
@@ -0,0 +1,89 @@
+-- Test a bug where the last XLOG segment is renamed to .partial before prepared
+-- transactions contained in it are recovered. This would result in segfault at
+-- RecoverPreparedTransactions. The failure is well described in PG hackers:
+-- https://www.postgresql.org/message-id/743b9b45a2d4013bd90b6a5cba8d6faeb717ee34.camel%40cybertec.at
+-- However, in GP due to an older version of PG this error is masked by always caching
+-- the last opened xlog segment in XLogRead(). To actually trigger the failure we need to
+-- create unfinished prepared transactions in TWO different xlog segments.
+
+-- Allow extra time for mirror promotion to complete recovery to avoid
+-- gprecoverseg BEGIN failures due to gang creation failure as some primaries
+-- are not up. Setting these increase the number of retries in gang creation in
+-- case segment is in recovery. Approximately we want to wait 30 seconds.
+!\retcode gpconfig -c gp_gang_creation_retry_count -v 120 --skipvalidation --masteronly;
+!\retcode gpconfig -c gp_gang_creation_retry_timer -v 1000 --skipvalidation --masteronly;
+
+-- The last xlog segment is renamed to .partial only when archive mode is on
+!\retcode gpconfig -c archive_mode -v on;
+!\retcode gpconfig -c archive_command -v '/bin/true';
+!\retcode gpstop -rai;
+
+1: create extension if not exists gp_inject_fault;
+1: create table t_rename1 (a int);
+1: create table t_rename2 (a int);
+
+-- generate an orphaned prepare transaction in first segment
+1: select gp_inject_fault('dtm_broadcast_prepare', 'suspend', dbid)
+   from gp_segment_configuration where role = 'p' and content = -1;
+-- assume (2), (1) are on different segments and one tuple is on the first segment.
+-- the test finally double-check that.
+2&: insert into t_rename1 values(2),(1);
+1: select gp_wait_until_triggered_fault('dtm_broadcast_prepare', 1, dbid)
+   from gp_segment_configuration where role = 'p' and content = -1;
+
+-- start another xlog segment (will be renamed to .partial on failover)
+-- start_ignore
+0U: select pg_switch_xlog();
+-- end_ignore
+
+-- inject another fault and prepare transaction in the new xlog segment
+1: select gp_inject_fault('dtm_broadcast_prepare', 'reset', dbid)
+   from gp_segment_configuration where role = 'p' and content = -1;
+1: select gp_inject_fault('dtm_broadcast_prepare', 'suspend', dbid)
+   from gp_segment_configuration where role = 'p' and content = -1;
+-- assume (4), (3) are on different segments and one tuple is on the first segment.
+-- the test finally double-check that.
+3&: insert into t_rename2 values(2),(1);
+1: select gp_wait_until_triggered_fault('dtm_broadcast_prepare', 1, dbid)
+   from gp_segment_configuration where role = 'p' and content = -1;
+
+-- shutdown primary and make sure the segment is down
+-1U: select pg_ctl((SELECT datadir from gp_segment_configuration c
+     where c.role='p' and c.content=0), 'stop', 'immediate');
+1: select gp_request_fts_probe_scan();
+1: select role, preferred_role from gp_segment_configuration where content = 0 order by role;
+
+-- double confirm that promote succeeds.
+-- also double confirm that
+-- 1. tuples (2) and (1) are located on two segments (thus we are testing 2pc with prepared transaction).
+-- 2. there are tuples on the first segment (we have been testing on the first segment).
+1: insert into t_rename1 values(2),(1);
+1: select gp_segment_id, * from t_rename1 order by a;
+
+1: select gp_inject_fault('dtm_broadcast_prepare', 'reset', dbid)
+   from gp_segment_configuration where role = 'p' and content = -1;
+2<:
+3<:
+
+-- confirm the "orphaned" prepared trnasaction commits finally.
+1: select * from t_rename1 order by a;
+1: select * from t_rename2 order by a;
+
+-- recovery the nodes
+!\retcode gprecoverseg -a;
+1: select wait_until_segment_synchronized(0);
+
+!\retcode gprecoverseg -ar;
+1: select wait_until_segment_synchronized(0);
+
+-- verify the first segment is recovered to the original state.
+1: select role, preferred_role from gp_segment_configuration where content = 0 order by role;
+
+-- cleanup
+1: drop table t_rename1;
+1: drop table t_rename2;
+!\retcode gpconfig -r gp_gang_creation_retry_count --skipvalidation;
+!\retcode gpconfig -r gp_gang_creation_retry_timer --skipvalidation;
+!\retcode gpconfig -r archive_mode --skipvalidation;
+!\retcode gpconfig -r archive_command --skipvalidation;
+!\retcode gpstop -rai;

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@cloudberry.apache.org
For additional commands, e-mail: commits-h...@cloudberry.apache.org