From b0e55be9e5bc08912b2b71dd8002687c78dbf037 Mon Sep 17 00:00:00 2001
From: Matt Blewitt <mble@planetscale.com>
Date: Wed, 25 Feb 2026 17:29:44 +0000
Subject: [PATCH] Fix PITR pause bypass when initial XLOG_RUNNING_XACTS has
 subxid overflow

When the first XLOG_RUNNING_XACTS record seen during recovery has
subxid_overflow=true, the standby enters STANDBY_SNAPSHOT_PENDING, and
hot standby cannot activate (LocalHotStandbyActive stays false) until a
later non-overflowed XLOG_RUNNING_XACTS record is replayed — which may
never happen before the recovery target is reached.

This caused recovery_target_action = 'pause' to be silently bypassed:
recoveryPausesHere() returns immediately when hot standby is not yet
active, so the pause is skipped and the server promotes instead.

Fix: in PerformWalRecovery(), when the recovery target is reached and
the snapshot is still PENDING, force a transition to STANDBY_SNAPSHOT_READY
and call CheckRecoveryConsistency() to activate hot standby before the
target action switch is evaluated.

This is safe because subtransaction commits write to CLOG but produce no
WAL entry, so standbys always see overflowed subxids as INPROGRESS rather
than SUB_COMMITTED.  INPROGRESS subxids are invisible without any SubTrans
lookup, so the missing SubTrans entries that STANDBY_SNAPSHOT_PENDING
guards against cannot cause incorrect visibility results.

Add a TAP test (052_pitr_subxid_overflow.pl) that exercises the scenario:
the overflow transaction is kept open during the base backup's forced
checkpoint so that the very first XLOG_RUNNING_XACTS the standby replays
has subxid_overflow=true.  A named restore point is then created while
the overflow transaction is still open.  Without the fix the standby
promotes silently at the target; with the fix it pauses and accepts
hot-standby queries.

Note: subtransaction XIDs are only assigned when the subtransaction writes,
so gen_subxids() must perform an INSERT at each recursion level to force
the PGPROC subxid cache to overflow.
---
 src/backend/access/transam/xlogrecovery.c     |  29 ++++
 src/test/recovery/meson.build                 |   1 +
 .../recovery/t/052_pitr_subxid_overflow.pl    | 140 ++++++++++++++++++
 3 files changed, 170 insertions(+)
 create mode 100644 src/test/recovery/t/052_pitr_subxid_overflow.pl

diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index ecd66fd8..19057bcb 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -1874,6 +1874,35 @@ PerformWalRecovery(void)
 				ereport(FATAL,
 						(errmsg("requested recovery stop point is before consistent recovery point")));
 
+			/*
+			 * If the recovery snapshot is still in the PENDING state (due to
+			 * subxid overflow in the initial XLOG_RUNNING_XACTS record),
+			 * force it to READY now.  All WAL up to the recovery target has
+			 * been replayed, so CLOG and SubTrans are complete for all
+			 * transactions that appear in WAL before the target.
+			 *
+			 * The correctness concern that motivates STANDBY_SNAPSHOT_PENDING
+			 * is that some subxids may lack a parent-mapping in SubTrans.  On
+			 * a standby this is harmless: subtransaction commits (SAVEPOINT
+			 * RELEASE) write to CLOG but generate no WAL entry, so the
+			 * standby always sees those subxids as INPROGRESS rather than
+			 * SUB_COMMITTED.  INPROGRESS subxids are treated as invisible
+			 * without any SubTrans lookup, so missing SubTrans entries cannot
+			 * cause incorrect visibility.
+			 *
+			 * Without this transition, recovery_target_action = 'pause' would
+			 * be silently skipped: recoveryPausesHere() bails out immediately
+			 * when hot standby is not yet active, causing unintended
+			 * promotion.
+			 */
+			if (standbyState == STANDBY_SNAPSHOT_PENDING)
+			{
+				standbyState = STANDBY_SNAPSHOT_READY;
+				ereport(LOG,
+						(errmsg("recovery snapshot advanced to ready at recovery target")));
+				CheckRecoveryConsistency();
+			}
+
 			/*
 			 * This is the last point where we can restart recovery with a new
 			 * recovery target, if we shutdown and begin again. After this,
diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build
index 8d204889..cbcd9eb8 100644
--- a/src/test/recovery/meson.build
+++ b/src/test/recovery/meson.build
@@ -60,6 +60,7 @@ tests += {
       't/049_wait_for_lsn.pl',
       't/050_redo_segment_missing.pl',
       't/051_effective_wal_level.pl',
+      't/052_pitr_subxid_overflow.pl',
     ],
   },
 }
diff --git a/src/test/recovery/t/052_pitr_subxid_overflow.pl b/src/test/recovery/t/052_pitr_subxid_overflow.pl
new file mode 100644
index 00000000..c49fcffc
--- /dev/null
+++ b/src/test/recovery/t/052_pitr_subxid_overflow.pl
@@ -0,0 +1,140 @@
+# Copyright (c) 2025-2026, PostgreSQL Global Development Group
+
+# Test that PITR with recovery_target_action = 'pause' correctly activates
+# hot standby at the recovery target even when the initial XLOG_RUNNING_XACTS
+# record has subxid_overflow=true (SUBXIDS_MISSING).
+#
+# The bug: when the first XLOG_RUNNING_XACTS seen during recovery is
+# overflowed, the standby enters STANDBY_SNAPSHOT_PENDING and hot standby
+# never activates.  recoveryPausesHere() then bails out silently because
+# LocalHotStandbyActive is false, causing unintended promotion instead of
+# the requested pause.
+#
+# To trigger the bug the overflow transaction must be open at the time of
+# the base backup's forced checkpoint, so that the very first
+# XLOG_RUNNING_XACTS record the standby sees is overflowed.
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(has_archiving => 1, allows_streaming => 1);
+$node_primary->start;
+
+# A table with a committed row that must be visible at the recovery target.
+$node_primary->safe_psql('postgres',
+	"CREATE TABLE t_committed (id int); INSERT INTO t_committed VALUES (42);");
+
+# Scratch table used only to force subtransaction XID assignment inside
+# gen_subxids.  Subtransaction XIDs are only assigned when the subtransaction
+# writes; without an actual write the PGPROC subxid cache never fills and
+# subxidStatus.overflowed is never set.
+$node_primary->safe_psql('postgres', "CREATE TABLE _subxid_work (id int)");
+
+# PL/pgSQL function that forces XID assignment for each subtransaction level
+# via an INSERT, then recurses.  With n=70 there are 70 simultaneous
+# open subtransactions, exceeding PGPROC_MAX_CACHED_SUBXIDS (64) and
+# setting subxidStatus.overflowed on the backend's PGPROC entry.  The flag
+# persists until the enclosing top-level transaction ends.
+$node_primary->safe_psql('postgres', q{
+	CREATE OR REPLACE FUNCTION gen_subxids(n int) RETURNS void
+	LANGUAGE plpgsql AS $$
+	BEGIN
+		IF n <= 0 THEN RETURN; END IF;
+		INSERT INTO _subxid_work VALUES (n);
+		PERFORM gen_subxids(n - 1);
+		RETURN;
+		EXCEPTION WHEN OTHERS THEN RAISE;
+	END;
+	$$;
+});
+
+# Open a long-running top-level transaction and saturate its subxid cache
+# BEFORE taking the base backup.  The backup forces a checkpoint via
+# pg_basebackup --checkpoint=fast; because the overflow transaction is still
+# open at that point, GetCurrentRunningTransactions() sees
+# subxidStates[].overflowed = true and writes XLOG_RUNNING_XACTS with
+# subxid_overflow=true.  This will be the FIRST XLOG_RUNNING_XACTS the
+# standby replays, putting it immediately into STANDBY_SNAPSHOT_PENDING.
+my $bg = $node_primary->background_psql('postgres');
+$bg->query_safe("BEGIN");
+$bg->query_safe("SELECT gen_subxids(70)");
+
+# Take the base backup.  The background transaction (with overflowed subxid
+# cache) remains open throughout.
+$node_primary->backup('base_backup');
+
+# Create the named restore point while the overflow transaction is still
+# open.  No non-overflowed XLOG_RUNNING_XACTS can appear between the
+# backup's forced checkpoint and this point, so the standby will be in
+# STANDBY_SNAPSHOT_PENDING when it reaches this WAL record.
+my $target = 'subxid_overflow_target';
+$node_primary->safe_psql('postgres',
+	"SELECT pg_create_restore_point('$target')");
+
+# Commit the overflow transaction and insert a row that must NOT be
+# visible at the recovery target.
+$bg->query_safe("COMMIT");
+$bg->quit;
+$node_primary->safe_psql('postgres',
+	"INSERT INTO t_committed VALUES (99)");
+
+# Switch WAL to flush the archive, then wait for archiving to complete.
+my $walfile = $node_primary->safe_psql('postgres',
+	"SELECT pg_walfile_name(pg_switch_wal())");
+$node_primary->poll_query_until('postgres',
+	"SELECT '$walfile' <= last_archived_wal FROM pg_stat_archiver")
+  or die "Timed out waiting for WAL archiving to complete";
+
+# Set up a PITR standby targeting our named restore point with
+# recovery_target_action = 'pause'.  The fix ensures that even though the
+# standby starts in STANDBY_SNAPSHOT_PENDING, it transitions to
+# STANDBY_SNAPSHOT_READY (activating hot standby) before the pause is
+# attempted, so the pause is not silently bypassed.
+my $node_standby = PostgreSQL::Test::Cluster->new('standby');
+$node_standby->init_from_backup($node_primary, 'base_backup',
+	has_restoring => 1);
+$node_standby->append_conf('postgresql.conf',
+	"recovery_target_name = '$target'
+recovery_target_action = 'pause'");
+$node_standby->start;
+
+# Wait until the standby reaches the recovery target and pauses.
+# Without the fix the server promotes instead and this times out.
+$node_standby->poll_query_until('postgres',
+	"SELECT pg_get_wal_replay_pause_state() = 'paused'")
+  or die "Timed out: recovery did not pause at target (was it promoted instead?)";
+
+# Verify the standby is still in recovery (paused, not promoted).
+is( $node_standby->safe_psql('postgres', "SELECT pg_is_in_recovery()"),
+	't',
+	"standby is paused in recovery at target despite initial subxid overflow");
+
+is( $node_standby->safe_psql('postgres',
+		"SELECT pg_get_wal_replay_pause_state()"),
+	'paused',
+	"recovery_target_action=pause honoured when subxid overflow was present");
+
+# Hot standby queries must work: the committed row is visible, the row
+# committed after the restore point is not.
+is( $node_standby->safe_psql('postgres',
+		"SELECT id FROM t_committed ORDER BY id"),
+	'42',
+	"only pre-target committed row is visible during pause");
+
+# Resume to promote at the target LSN.  The row inserted AFTER the restore
+# point is not applied: recovery_target_action=pause promotes exactly at the
+# target, it does not continue WAL replay past it.
+$node_standby->safe_psql('postgres', "SELECT pg_wal_replay_resume()");
+$node_standby->poll_query_until('postgres', "SELECT NOT pg_is_in_recovery()")
+  or die "Timed out waiting for standby to promote after resume";
+
+is( $node_standby->safe_psql('postgres',
+		"SELECT count(*) FROM t_committed"),
+	'1',
+	"promoted at recovery target: only pre-target row visible (WAL replay stops at target)");
+
+done_testing();
-- 
2.52.0

