On 20/06/2024 16:41, Heikki Linnakangas wrote:
Attached is a patch to fix this, with a test case.

The patch did not compile, thanks to a last-minute change in a field name. Here's a fixed version.

--
Heikki Linnakangas
Neon (https://neon.tech)
From ccb0bd36619f03055382205eddd7a0e9b64bba95 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Thu, 20 Jun 2024 15:49:05 +0300
Subject: [PATCH v2 1/2] tests: Trim newline from result returned by
 BackgroundPsql->query

This went unnoticed, because only a few existing callers of
BackgroundPsql->query used the result, and the ones that did were not
bothered by an extra newline. I noticed because I was about to add a
new test that checks the result.
---
 src/test/perl/PostgreSQL/Test/BackgroundPsql.pm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm b/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm
index 4091c311b8..041b3dfa7d 100644
--- a/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm
+++ b/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm
@@ -217,13 +217,13 @@ sub query
 	my $banner = "background_psql: QUERY_SEPARATOR";
 	$self->{stdin} .= "$query\n;\n\\echo $banner\n";
 
-	pump_until($self->{run}, $self->{timeout}, \$self->{stdout}, qr/$banner/);
+	pump_until($self->{run}, $self->{timeout}, \$self->{stdout}, qr/$banner\n/);
 
 	die "psql query timed out" if $self->{timeout}->is_expired;
 	$output = $self->{stdout};
 
 	# remove banner again, our caller doesn't care
-	$output =~ s/\n$banner$//s;
+	$output =~ s/\n$banner\n$//s;
 
 	# clear out output for the next query
 	$self->{stdout} = '';
-- 
2.39.2

From f44c5ec2d3383240c495c36e05da812791e3f5d4 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Thu, 20 Jun 2024 17:09:41 +0300
Subject: [PATCH v2 2/2] Fix MVCC bug with prepared xact with subxacts on
 standby

We did not recover the subtransaction IDs of prepared transactions
when starting a hot standby from a shutdown checkpoint. As a result,
such subtransactions were considered as aborted, rather than
in-progress. That would lead to hint bits being set incorrectly, and
the subtransactions suddenly becoming visible to old snapshots when
the prepared transaction was committed.

To fix, update pg_subtrans with prepared transactions's subxids when
starting hot standby from a shutdown checkpoint. The snapshots taken
from that state need to be marked as "suboverflowed", so that we also
check the pg_subtrans.

Discussion: XXX
---
 src/backend/access/transam/twophase.c |  7 ++---
 src/backend/access/transam/xlog.c     | 14 +++++----
 src/backend/storage/ipc/procarray.c   | 18 +++++++++--
 src/backend/storage/ipc/standby.c     |  6 ++--
 src/include/storage/standby.h         | 10 +++++-
 src/test/recovery/t/009_twophase.pl   | 45 +++++++++++++++++++++++++++
 src/tools/pgindent/typedefs.list      |  1 +
 7 files changed, 84 insertions(+), 17 deletions(-)

diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index bf451d42ff..9a8257fcaf 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -2035,9 +2035,8 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
  * This is never called at the end of recovery - we use
  * RecoverPreparedTransactions() at that point.
  *
- * The lack of calls to SubTransSetParent() calls here is by design;
- * those calls are made by RecoverPreparedTransactions() at the end of recovery
- * for those xacts that need this.
+ * This updates pg_subtrans, so that any subtransactions will be correctly
+ * seen as in-progress in snapshots taken during recovery.
  */
 void
 StandbyRecoverPreparedTransactions(void)
@@ -2057,7 +2056,7 @@ StandbyRecoverPreparedTransactions(void)
 
 		buf = ProcessTwoPhaseBuffer(xid,
 									gxact->prepare_start_lsn,
-									gxact->ondisk, false, false);
+									gxact->ondisk, true, false);
 		if (buf != NULL)
 			pfree(buf);
 	}
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 330e058c5f..3d084d1c38 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -5777,6 +5777,9 @@ StartupXLOG(void)
 				RunningTransactionsData running;
 				TransactionId latestCompletedXid;
 
+				/* Update pg_subtrans entries for any prepared transactions */
+				StandbyRecoverPreparedTransactions();
+
 				/*
 				 * Construct a RunningTransactions snapshot representing a
 				 * shut down server, with only prepared transactions still
@@ -5785,7 +5788,7 @@ StartupXLOG(void)
 				 */
 				running.xcnt = nxids;
 				running.subxcnt = 0;
-				running.subxid_overflow = false;
+				running.subxid_status = SUBXIDS_IN_SUBTRANS;
 				running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
 				running.oldestRunningXid = oldestActiveXID;
 				latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
@@ -5795,8 +5798,6 @@ StartupXLOG(void)
 				running.xids = xids;
 
 				ProcArrayApplyRecoveryInfo(&running);
-
-				StandbyRecoverPreparedTransactions();
 			}
 		}
 
@@ -8235,6 +8236,9 @@ xlog_redo(XLogReaderState *record)
 
 			oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
 
+			/* Update pg_subtrans entries for any prepared transactions */
+			StandbyRecoverPreparedTransactions();
+
 			/*
 			 * Construct a RunningTransactions snapshot representing a shut
 			 * down server, with only prepared transactions still alive. We're
@@ -8243,7 +8247,7 @@ xlog_redo(XLogReaderState *record)
 			 */
 			running.xcnt = nxids;
 			running.subxcnt = 0;
-			running.subxid_overflow = false;
+			running.subxid_status = SUBXIDS_IN_SUBTRANS;
 			running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
 			running.oldestRunningXid = oldestActiveXID;
 			latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
@@ -8253,8 +8257,6 @@ xlog_redo(XLogReaderState *record)
 			running.xids = xids;
 
 			ProcArrayApplyRecoveryInfo(&running);
-
-			StandbyRecoverPreparedTransactions();
 		}
 
 		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index d5165aa0d9..387b4a405b 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -1106,7 +1106,7 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
 		 * If the snapshot isn't overflowed or if its empty we can reset our
 		 * pending state and use this snapshot instead.
 		 */
-		if (!running->subxid_overflow || running->xcnt == 0)
+		if (running->subxid_status != SUBXIDS_MISSING || running->xcnt == 0)
 		{
 			/*
 			 * If we have already collected known assigned xids, we need to
@@ -1258,7 +1258,7 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
 	 * missing, so conservatively assume the last one is latestObservedXid.
 	 * ----------
 	 */
-	if (running->subxid_overflow)
+	if (running->subxid_status == SUBXIDS_MISSING)
 	{
 		standbyState = STANDBY_SNAPSHOT_PENDING;
 
@@ -1270,6 +1270,18 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
 		standbyState = STANDBY_SNAPSHOT_READY;
 
 		standbySnapshotPendingXmin = InvalidTransactionId;
+
+		/*
+		 * If the 'xids' array didn't include all subtransactions, we have to
+		 * mark any snapshots taken as overflowed.
+		 */
+		if (running->subxid_status == SUBXIDS_IN_SUBTRANS)
+			procArray->lastOverflowedXid = latestObservedXid;
+		else
+		{
+			Assert(running->subxid_status == SUBXIDS_IN_ARRAY);
+			procArray->lastOverflowedXid = InvalidTransactionId;
+		}
 	}
 
 	/*
@@ -2833,7 +2845,7 @@ GetRunningTransactionData(void)
 
 	CurrentRunningXacts->xcnt = count - subcount;
 	CurrentRunningXacts->subxcnt = subcount;
-	CurrentRunningXacts->subxid_overflow = suboverflowed;
+	CurrentRunningXacts->subxid_status = suboverflowed ? SUBXIDS_IN_SUBTRANS : SUBXIDS_IN_ARRAY;
 	CurrentRunningXacts->nextXid = XidFromFullTransactionId(TransamVariables->nextXid);
 	CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
 	CurrentRunningXacts->oldestDatabaseRunningXid = oldestDatabaseRunningXid;
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index 87b04e51b3..872679ca44 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -1184,7 +1184,7 @@ standby_redo(XLogReaderState *record)
 
 		running.xcnt = xlrec->xcnt;
 		running.subxcnt = xlrec->subxcnt;
-		running.subxid_overflow = xlrec->subxid_overflow;
+		running.subxid_status = xlrec->subxid_overflow ? SUBXIDS_MISSING : SUBXIDS_IN_ARRAY;
 		running.nextXid = xlrec->nextXid;
 		running.latestCompletedXid = xlrec->latestCompletedXid;
 		running.oldestRunningXid = xlrec->oldestRunningXid;
@@ -1349,7 +1349,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
 
 	xlrec.xcnt = CurrRunningXacts->xcnt;
 	xlrec.subxcnt = CurrRunningXacts->subxcnt;
-	xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow;
+	xlrec.subxid_overflow = (CurrRunningXacts->subxid_status != SUBXIDS_IN_ARRAY);
 	xlrec.nextXid = CurrRunningXacts->nextXid;
 	xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
 	xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
@@ -1366,7 +1366,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
 
 	recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
 
-	if (CurrRunningXacts->subxid_overflow)
+	if (xlrec.subxid_overflow)
 		elog(DEBUG2,
 			 "snapshot of %d running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
 			 CurrRunningXacts->xcnt,
diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h
index 0fc0804e26..cce0bc521e 100644
--- a/src/include/storage/standby.h
+++ b/src/include/storage/standby.h
@@ -75,11 +75,19 @@ extern void StandbyReleaseOldLocks(TransactionId oldxid);
  * almost immediately see the data we need to begin executing queries.
  */
 
+typedef enum
+{
+	SUBXIDS_IN_ARRAY,			/* xids array includes all running subxids */
+	SUBXIDS_MISSING,			/* snapshot overflowed, subxids are missing */
+	SUBXIDS_IN_SUBTRANS,		/* subxids are not included in 'xids', but
+								 * pg_subtrans is fully up-to-date */
+} subxids_array_status;
+
 typedef struct RunningTransactionsData
 {
 	int			xcnt;			/* # of xact ids in xids[] */
 	int			subxcnt;		/* # of subxact ids in xids[] */
-	bool		subxid_overflow;	/* snapshot overflowed, subxids missing */
+	subxids_array_status subxid_status;
 	TransactionId nextXid;		/* xid from TransamVariables->nextXid */
 	TransactionId oldestRunningXid; /* *not* oldestXmin */
 	TransactionId oldestDatabaseRunningXid; /* same as above, but within the
diff --git a/src/test/recovery/t/009_twophase.pl b/src/test/recovery/t/009_twophase.pl
index 701f9cc20f..ca83d8cbc8 100644
--- a/src/test/recovery/t/009_twophase.pl
+++ b/src/test/recovery/t/009_twophase.pl
@@ -312,6 +312,51 @@ $cur_standby->start;
 
 $cur_primary->psql('postgres', "COMMIT PREPARED 'xact_009_12'");
 
+###############################################################################
+# Check visibility of prepared transactions in standby after a restart while
+# primary is down.
+###############################################################################
+
+$cur_primary->psql(
+	'postgres', "
+	CREATE TABLE t_009_tbl_standby_mvcc (id int, msg text);
+	BEGIN;
+	INSERT INTO t_009_tbl_standby_mvcc VALUES (1, 'issued to ${cur_primary_name}');
+	SAVEPOINT s1;
+	INSERT INTO t_009_tbl_standby_mvcc VALUES (2, 'issued to ${cur_primary_name}');
+	PREPARE TRANSACTION 'xact_009_standby_mvcc';
+	");
+$cur_primary->stop;
+$cur_standby->restart;
+
+# Acquire a snapshot in standby, before we commit the prepared transaction
+my $standby_session = $cur_standby->background_psql('postgres', on_error_die => 1);
+$standby_session->query_safe("BEGIN ISOLATION LEVEL REPEATABLE READ");
+$psql_out = $standby_session->query_safe(
+	"SELECT count(*) FROM t_009_tbl_standby_mvcc");
+is($psql_out, '0',
+	"Prepared transaction not visible in standby before commit");
+
+# Commit the transaction in primary
+$cur_primary->start;
+$cur_primary->psql('postgres', "
+SET synchronous_commit='remote_apply'; -- To ensure the standby is caught up
+COMMIT PREPARED 'xact_009_standby_mvcc'
+");
+
+# Still not visible to the old snapshot
+$psql_out = $standby_session->query_safe(
+	"SELECT count(*) FROM t_009_tbl_standby_mvcc");
+is($psql_out, '0',
+	"Committed prepared transaction not visible to old snapshot in standby");
+
+# Is visible to a new snapshot
+$standby_session->query_safe("COMMIT");
+$psql_out = $standby_session->query_safe(
+	"SELECT count(*) FROM t_009_tbl_standby_mvcc");
+is($psql_out, '2',
+	"Committed prepared transaction is visible to new snapshot in standby");
+
 ###############################################################################
 # Check for a lock conflict between prepared transaction with DDL inside and
 # replay of XLOG_STANDBY_LOCK wal record.
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 61ad417cde..d90982466c 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3931,6 +3931,7 @@ string
 substitute_actual_parameters_context
 substitute_actual_srf_parameters_context
 substitute_phv_relids_context
+subxids_array_status
 symbol
 tablespaceinfo
 td_entry
-- 
2.39.2

Reply via email to