From 1b1aa652aff6681e5f43eba4f4690b174052d478 Mon Sep 17 00:00:00 2001
From: alterego655 <824662526@qq.com>
Date: Fri, 9 Jan 2026 21:32:12 +0800
Subject: [PATCH v1] Use WAIT FOR LSN in
 PostgreSQL::Test::Cluster::wait_for_catchup()

When the standby is passed as a PostgreSQL::Test::Cluster instance,
use the WAIT FOR LSN command on the standby server to implement
wait_for_catchup() for replay, write, and flush modes.  This is more
efficient than polling pg_stat_replication on the upstream, as the
WAIT FOR LSN command uses a latch-based wakeup mechanism.

The optimization applies when:
- The standby is passed as a Cluster object (not just a name string)
- The mode is 'replay', 'write', or 'flush' (not 'sent')
- The standby is in recovery

For 'sent' mode, when the standby is passed as a string (e.g., a
subscription name for logical replication), or when the standby has
been promoted, the function falls back to the original polling-based
approach using pg_stat_replication on the upstream.

Additionally, if the WAIT FOR LSN session is killed by a recovery
conflict (e.g., DROP TABLESPACE killing all backends indiscriminately),
the function catches this error and falls back to polling.  This makes
the test infrastructure robust against the timing-dependent conflicts
that can occur in tests like 031_recovery_conflict.

Discussion: https://postgr.es/m/CABPTF7UiArgW-sXj9CNwRzUhYOQrevLzkYcgBydmX5oDes1sjg%40mail.gmail.com
Author: Xuneng Zhou <xunengzhou@gmail.com>
Reviewed-by: Alexander Korotkov <aekorotkov@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Alvaro Herrera <alvherre@kurilemu.de>
---
 src/test/perl/PostgreSQL/Test/Cluster.pm | 91 +++++++++++++++++++++++-
 1 file changed, 90 insertions(+), 1 deletion(-)

diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm
index 955dfc0e7f8..87c3d2750cb 100644
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@@ -3320,6 +3320,13 @@ If you pass an explicit value of target_lsn, it should almost always be
 the primary's write LSN; so this parameter is seldom needed except when
 querying some intermediate replication node rather than the primary.
 
+When the standby is passed as a PostgreSQL::Test::Cluster instance and is
+in recovery, this function uses the WAIT FOR LSN command on the standby
+for modes replay, write, and flush.  This is more efficient than polling
+pg_stat_replication on the upstream, as WAIT FOR LSN uses a latch-based
+wakeup mechanism.  For 'sent' mode, or when the standby is passed as a
+string (e.g., a subscription name), it falls back to polling.
+
 If there is no active replication connection from this peer, waits until
 poll_query_until timeout.
 
@@ -3339,10 +3346,13 @@ sub wait_for_catchup
 	  . join(', ', keys(%valid_modes))
 	  unless exists($valid_modes{$mode});
 
-	# Allow passing of a PostgreSQL::Test::Cluster instance as shorthand
+	# Keep a reference to the standby node if passed as an object, so we can
+	# use WAIT FOR LSN on it later.
+	my $standby_node;
 	if (blessed($standby_name)
 		&& $standby_name->isa("PostgreSQL::Test::Cluster"))
 	{
+		$standby_node = $standby_name;
 		$standby_name = $standby_name->name;
 	}
 	if (!defined($target_lsn))
@@ -3367,6 +3377,85 @@ sub wait_for_catchup
 	  . $self->name . "\n";
 	# Before release 12 walreceiver just set the application name to
 	# "walreceiver"
+
+	# Use WAIT FOR LSN on the standby when:
+	# - The standby was passed as a Cluster object (so we can connect to it)
+	# - The mode is replay, write, or flush (not 'sent')
+	# - The standby is in recovery
+	# This is more efficient than polling pg_stat_replication on the upstream,
+	# as WAIT FOR LSN uses a latch-based wakeup mechanism.
+	if (defined($standby_node) && ($mode ne 'sent'))
+	{
+		my $standby_in_recovery =
+		  $standby_node->safe_psql('postgres', "SELECT pg_is_in_recovery()");
+		chomp($standby_in_recovery);
+
+		if ($standby_in_recovery eq 't')
+		{
+			# Map mode names to WAIT FOR LSN mode names
+			my %mode_map = (
+				'replay' => 'standby_replay',
+				'write'  => 'standby_write',
+				'flush'  => 'standby_flush',
+			);
+			my $wait_mode = $mode_map{$mode};
+			my $timeout = $PostgreSQL::Test::Utils::timeout_default;
+			my $wait_query =
+			  qq[WAIT FOR LSN '${target_lsn}' WITH (MODE '${wait_mode}', timeout '${timeout}s', no_throw);];
+
+			# Try WAIT FOR LSN. If it succeeds, we're done. If it returns a
+			# non-success status (timeout, not_in_recovery), fail immediately.
+			# If the session is interrupted (e.g., killed by recovery conflict),
+			# fall back to polling on the upstream which is immune to standby-
+			# side conflicts.
+			my $output;
+			local $@;
+			my $wait_succeeded = eval {
+				$output = $standby_node->safe_psql('postgres', $wait_query);
+				chomp($output);
+				1;
+			};
+
+			if ($wait_succeeded && $output eq 'success')
+			{
+				print "done\n";
+				return;
+			}
+
+			# If WAIT FOR LSN executed but returned non-success (e.g., timeout,
+			# not_in_recovery), fail immediately with diagnostic info. Falling
+			# back to polling would just waste time.
+			if ($wait_succeeded)
+			{
+				my $details = $self->safe_psql('postgres',
+					"SELECT * FROM pg_catalog.pg_stat_replication");
+				diag qq(WAIT FOR LSN returned '$output'
+pg_stat_replication on upstream:
+${details});
+				croak "WAIT FOR LSN '$wait_mode' to '$target_lsn' returned '$output'";
+			}
+
+			# WAIT FOR LSN was interrupted. Only fall back to polling if this
+			# looks like a recovery conflict - the canonical PostgreSQL error
+			# message contains "conflict with recovery". Other errors should
+			# fail immediately rather than being masked by a silent fallback.
+			if ($@ =~ /conflict with recovery/i)
+			{
+				diag qq(WAIT FOR LSN interrupted, falling back to polling:
+$@);
+			}
+			else
+			{
+				croak "WAIT FOR LSN failed: $@";
+			}
+		}
+	}
+
+	# Fall back to polling pg_stat_replication on the upstream for:
+	# - 'sent' mode (no corresponding WAIT FOR LSN mode)
+	# - When standby_name is a string (e.g., subscription name)
+	# - When the standby is no longer in recovery (was promoted)
+	# - When WAIT FOR LSN was interrupted (e.g., killed by a recovery conflict)
 	my $query = qq[SELECT '$target_lsn' <= ${mode}_lsn AND state = 'streaming'
          FROM pg_catalog.pg_stat_replication
          WHERE application_name IN ('$standby_name', 'walreceiver')];
-- 
2.51.0