From 48d916e391d5aec6b03614519988dababcc1fbf8 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Sun, 27 Jul 2025 11:37:55 +0500
Subject: [PATCH v8] Avoid corner case 2 in multixacts
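
GetMultiXactIdMembers() computes a multixact's member count as the
difference between its offset and the next multixact's offset:

    length = offset(multi + 1) - offset(multi)

Previously, when the next offset was not yet written ("corner case
2"), the reader slept on a condition variable until the creator
filled it in.  If the creator never does so -- for example, because
the server crashed between allocating the multixact and recording
its offset -- the reader could hang indefinitely.

Instead, make RecordNewMultiXact() eagerly record the next
multixact's offset as well, creating the next offsets page (and
segment file) if necessary.  A zero next offset then indicates data
corruption rather than work in progress, so report it as an error.
This lets us remove the condition variable and the retry loop.

Replace the injection-point test for the removed sleep with one that
kills the server while a multixact creator is paused before recording
its offset, then checks after crash recovery that a previously
recorded multixact is still readable.  To support that, move
poll_start() from 017_shm.pl into Cluster.pm.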

---
 src/backend/access/transam/multixact.c        | 96 +++++++++++-------
 src/test/modules/test_slru/t/001_multixact.pl | 98 +++++--------------
 src/test/modules/test_slru/test_multixact.c   |  1 -
 src/test/perl/PostgreSQL/Test/Cluster.pm      | 40 ++++++++
 src/test/recovery/t/017_shm.pl                | 38 +------
 5 files changed, 131 insertions(+), 142 deletions(-)

diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 3cb09c3d598..e0d68e9f0eb 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -275,12 +275,6 @@ typedef struct MultiXactStateData
 	/* support for members anti-wraparound measures */
 	MultiXactOffset offsetStopLimit;	/* known if oldestOffsetKnown */
 
-	/*
-	 * This is used to sleep until a multixact offset is written when we want
-	 * to create the next one.
-	 */
-	ConditionVariable nextoff_cv;
-
 	/*
 	 * Per-backend data starts here.  We have two arrays stored in the area
 	 * immediately following the MultiXactStateData struct. Each is indexed by
@@ -921,10 +915,20 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 	int			i;
 	LWLock	   *lock;
 	LWLock	   *prevlock = NULL;
+	MultiXactId next = multi + 1;
+	int			next_pageno;
 
 	pageno = MultiXactIdToOffsetPage(multi);
 	entryno = MultiXactIdToOffsetEntry(multi);
 
+	/*
+	 * We must also fill the next offset to keep the current multi readable.
+	 * Handle wraparound the same way GetMultiXactIdMembers() does.
+	 */
+	if (next < FirstMultiXactId)
+		next = FirstMultiXactId;
+	next_pageno = MultiXactIdToOffsetPage(next);
+
 	lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
 	LWLockAcquire(lock, LW_EXCLUSIVE);
 
@@ -939,19 +943,56 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 	offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
 	offptr += entryno;
 
-	*offptr = offset;
+	/*
+	 * We might have filled this offset previously, when recording the
+	 * preceding multi.  Cross-check for correctness.
+	 */
+	Assert((*offptr == 0) || (*offptr == offset));
 
+	*offptr = offset;
 	MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
 
+	if (next_pageno == pageno)
+	{
+		offptr[1] = offset + nmembers;
+	}
+	else
+	{
+		int			next_slotno;
+		MultiXactOffset *next_offptr;
+		int			next_entryno = MultiXactIdToOffsetEntry(next);
+		Assert(next_entryno == 0);	/* reached only when crossing a page */
+
+		/* Swap the lock for a lock of next page */
+		LWLockRelease(lock);
+		lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
+		LWLockAcquire(lock, LW_EXCLUSIVE);
+
+		if (SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, next_pageno))
+		{
+			/* The next page already exists; just read it */
+			next_slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next);
+		}
+		else
+		{
+			/*
+			 * We have to create a new page.  SimpleLruWritePage is
+			 * already prepared to deal with creating a new segment file.
+			 * We do not need to handle race conditions, because this
+			 * branch is only reached during redo, and we hold the
+			 * appropriate bank lock of MultiXactOffsetCtl anyway.
+			 */
+			next_slotno = SimpleLruZeroPage(MultiXactOffsetCtl, next_pageno);
+			SimpleLruWritePage(MultiXactOffsetCtl, next_slotno);
+		}
+		next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[next_slotno];
+		next_offptr[next_entryno] = offset + nmembers;
+		MultiXactOffsetCtl->shared->page_dirty[next_slotno] = true;
+	}
+
 	/* Release MultiXactOffset SLRU lock. */
 	LWLockRelease(lock);
 
-	/*
-	 * If anybody was waiting to know the offset of this multixact ID we just
-	 * wrote, they can read it now, so wake them up.
-	 */
-	ConditionVariableBroadcast(&MultiXactState->nextoff_cv);
-
 	prev_pageno = -1;
 
 	for (i = 0; i < nmembers; i++, offset++)
@@ -1310,7 +1351,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
 	MultiXactOffset nextOffset;
 	MultiXactMember *ptr;
 	LWLock	   *lock;
-	bool		slept = false;
 
 	debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
 
@@ -1389,7 +1429,10 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
 	 * 1. This multixact may be the latest one created, in which case there is
 	 * no next one to look at.  In this case the nextOffset value we just
 	 * saved is the correct endpoint.
+	 * TODO: how does this work on a standby?  MultiXactState->nextMXact does not seem to be up to date.
+	 * nextMXact and nextOffset are in sync, so nothing bad can happen, but nextMXact seems mostly arbitrary.
 	 *
+	 * NOTE: case 2 below can no longer happen; its description is kept for historical reasons.
 	 * 2. The next multixact may still be in process of being filled in: that
 	 * is, another process may have done GetNewMultiXactId but not yet written
 	 * the offset entry for that ID.  In that scenario, it is guaranteed that
@@ -1415,7 +1458,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
 	 * cases, so it seems better than holding the MultiXactGenLock for a long
 	 * time on every multixact creation.
 	 */
-retry:
 	pageno = MultiXactIdToOffsetPage(multi);
 	entryno = MultiXactIdToOffsetEntry(multi);
 
@@ -1479,16 +1521,10 @@ retry:
 
 		if (nextMXOffset == 0)
 		{
-			/* Corner case 2: next multixact is still being filled in */
-			LWLockRelease(lock);
-			CHECK_FOR_INTERRUPTS();
-
-			INJECTION_POINT("multixact-get-members-cv-sleep", NULL);
-
-			ConditionVariableSleep(&MultiXactState->nextoff_cv,
-								   WAIT_EVENT_MULTIXACT_CREATION);
-			slept = true;
-			goto retry;
+			ereport(ERROR,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("MultiXactId %u has invalid next offset",
+							multi)));
 		}
 
 		length = nextMXOffset - offset;
@@ -1497,12 +1533,6 @@ retry:
 	LWLockRelease(lock);
 	lock = NULL;
 
-	/*
-	 * If we slept above, clean up state; it's no longer needed.
-	 */
-	if (slept)
-		ConditionVariableCancelSleep();
-
 	ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
 
 	truelength = 0;
@@ -1992,7 +2022,6 @@ MultiXactShmemInit(void)
 
 		/* Make sure we zero out the per-backend state */
 		MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
-		ConditionVariableInit(&MultiXactState->nextoff_cv);
 	}
 	else
 		Assert(found);
@@ -2142,8 +2171,7 @@ TrimMultiXact(void)
 	 * TrimCLOG() for background.  Unlike CLOG, some WAL record covers every
 	 * pg_multixact SLRU mutation.  Since, also unlike CLOG, we ignore the WAL
 	 * rule "write xlog before data," nextMXact successors may carry obsolete,
-	 * nonzero offset values.  Zero those so case 2 of GetMultiXactIdMembers()
-	 * operates normally.
+	 * nonzero offset values.  Zero those.
 	 */
 	entryno = MultiXactIdToOffsetEntry(nextMXact);
 	if (entryno != 0)
diff --git a/src/test/modules/test_slru/t/001_multixact.pl b/src/test/modules/test_slru/t/001_multixact.pl
index e2b567a603d..814ab00353e 100644
--- a/src/test/modules/test_slru/t/001_multixact.pl
+++ b/src/test/modules/test_slru/t/001_multixact.pl
@@ -29,92 +29,46 @@ $node->start;
 $node->safe_psql('postgres', q(CREATE EXTENSION injection_points));
 $node->safe_psql('postgres', q(CREATE EXTENSION test_slru));
 
-# Test for Multixact generation edge case
-$node->safe_psql('postgres',
-	q{select injection_points_attach('test-multixact-read','wait')});
-$node->safe_psql('postgres',
-	q{select injection_points_attach('multixact-get-members-cv-sleep','wait')}
-);
+# Test that losing a multixact does not affect reading nearby multixacts,
+# even after a crash.
+my $bg_psql = $node->background_psql('postgres');
 
-# This session must observe sleep on the condition variable while generating a
-# multixact.  To achieve this it first will create a multixact, then pause
-# before reading it.
-my $observer = $node->background_psql('postgres');
-
-# This query will create a multixact, and hang just before reading it.
-$observer->query_until(
-	qr/start/,
-	q{
-	\echo start
-	SELECT test_read_multixact(test_create_multixact());
-});
-$node->wait_for_event('client backend', 'test-multixact-read');
-
-# This session will create the next Multixact. This is necessary to avoid
-# multixact.c's non-sleeping edge case 1.
-my $creator = $node->background_psql('postgres');
+my $multi = $bg_psql->query_safe(
+	q(SELECT test_create_multixact();));
+
+# Space for the next multi will be allocated, but the multi itself will
+# never actually be recorded.
 $node->safe_psql('postgres',
 	q{SELECT injection_points_attach('multixact-create-from-members','wait');}
 );
 
-# We expect this query to hang in the critical section after generating new
-# multixact, but before filling its offset into SLRU.
-# Running an injection point inside a critical section requires it to be
-# loaded beforehand.
-$creator->query_until(
-	qr/start/, q{
-	\echo start
+$bg_psql->query_until(
+	qr/creating lost multi/, q(
+\echo creating lost multi
 	SELECT test_create_multixact();
-});
+));
 
 $node->wait_for_event('client backend', 'multixact-create-from-members');
-
-# Ensure we have the backends waiting that we expect
-is( $node->safe_psql(
-		'postgres',
-		q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event)
-		FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'}
-	),
-	'multixact-create-from-members, test-multixact-read',
-	"matching injection point waits");
-
-# Now wake observer to get it to read the initial multixact.  A subsequent
-# multixact already exists, but that one doesn't have an offset assigned, so
-# this will hit multixact.c's edge case 2.
-$node->safe_psql('postgres',
-	q{SELECT injection_points_wakeup('test-multixact-read')});
-$node->wait_for_event('client backend', 'multixact-get-members-cv-sleep');
-
-# Ensure we have the backends waiting that we expect
-is( $node->safe_psql(
-		'postgres',
-		q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event)
-		FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'}
-	),
-	'multixact-create-from-members, multixact-get-members-cv-sleep',
-	"matching injection point waits");
-
-# Now we have two backends waiting in multixact-create-from-members and
-# multixact-get-members-cv-sleep.  Also we have 3 injections points set to wait.
-# If we wakeup multixact-get-members-cv-sleep it will happen again, so we must
-# detach it first. So let's detach all injection points, then wake up all
-# backends.
-
-$node->safe_psql('postgres',
-	q{SELECT injection_points_detach('test-multixact-read')});
 $node->safe_psql('postgres',
 	q{SELECT injection_points_detach('multixact-create-from-members')});
-$node->safe_psql('postgres',
-	q{SELECT injection_points_detach('multixact-get-members-cv-sleep')});
 
 $node->safe_psql('postgres',
-	q{SELECT injection_points_wakeup('multixact-create-from-members')});
+	q{CHECKPOINT;});
+
+# One more multixact, to effectively emit a WAL record about the next
+# multixact (avoiding corner case 1).
 $node->safe_psql('postgres',
-	q{SELECT injection_points_wakeup('multixact-get-members-cv-sleep')});
+	q{SELECT test_create_multixact();});
+
+# All set; time for a hard restart.
+$node->kill9;
+$node->poll_start;
+$bg_psql->{run}->finish;
 
-# Background psql will now be able to read the result and disconnect.
-$observer->quit;
-$creator->quit;
+# Verify that the recorded multi is readable; this call must not hang.
+# Note that all injection points disappeared with the server restart.
+$node->safe_psql('postgres',
+	qq{SELECT test_read_multixact('$multi'::xid);});
 
 $node->stop;
 
diff --git a/src/test/modules/test_slru/test_multixact.c b/src/test/modules/test_slru/test_multixact.c
index 6c9b0420717..2fd67273ee5 100644
--- a/src/test/modules/test_slru/test_multixact.c
+++ b/src/test/modules/test_slru/test_multixact.c
@@ -46,7 +46,6 @@ test_read_multixact(PG_FUNCTION_ARGS)
 	MultiXactId id = PG_GETARG_TRANSACTIONID(0);
 	MultiXactMember *members;
 
-	INJECTION_POINT("test-multixact-read", NULL);
 	/* discard caches */
 	AtEOXact_MultiXact();
 
diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm
index 61f68e0cc2e..a162baa55c6 100644
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@@ -1190,6 +1190,46 @@ sub start
 
 =pod
 
+=item $node->poll_start() => success_or_failure
+
+Repeatedly attempt to start the node; useful after kill9().
+
+We may need retries to start a new postmaster.  Causes:
+ - kernel is slow to deliver SIGKILL
+ - postmaster parent is slow to waitpid()
+ - postmaster child is slow to exit in response to SIGQUIT
+ - postmaster child is slow to exit after postmaster death
+
+=cut
+
+sub poll_start
+{
+	my ($self) = @_;
+
+	my $max_attempts = 10 * $PostgreSQL::Test::Utils::timeout_default;
+	my $attempts = 0;
+
+	while ($attempts < $max_attempts)
+	{
+		$self->start(fail_ok => 1) && return 1;
+
+		# Wait 0.1 second before retrying.
+		usleep(100_000);
+
+		# Clean up in case the start attempt just timed out or some such.
+		$self->stop('fast', fail_ok => 1);
+
+		$attempts++;
+	}
+
+	# Try one last time without fail_ok, which will BAIL_OUT unless it
+	# succeeds.
+	$self->start && return 1;
+	return 0;
+}
+
+=pod
+
 =item $node->kill9()
 
 Send SIGKILL (signal 9) to the postmaster.
diff --git a/src/test/recovery/t/017_shm.pl b/src/test/recovery/t/017_shm.pl
index c73aa3f0c2c..ac238544217 100644
--- a/src/test/recovery/t/017_shm.pl
+++ b/src/test/recovery/t/017_shm.pl
@@ -67,7 +67,7 @@ log_ipcs();
 # Upon postmaster death, postmaster children exit automatically.
 $gnat->kill9;
 log_ipcs();
-poll_start($gnat);    # gnat recycles its former shm key.
+$gnat->poll_start;    # gnat recycles its former shm key.
 log_ipcs();
 
 note "removing the conflicting shmem ...";
@@ -82,7 +82,7 @@ log_ipcs();
 # the higher-keyed segment that the previous postmaster was using.
 # That's not great, but key collisions should be rare enough to not
 # make this a big problem.
-poll_start($gnat);
+$gnat->poll_start;
 log_ipcs();
 $gnat->stop;
 log_ipcs();
@@ -174,43 +174,11 @@ $slow_client->finish;    # client has detected backend termination
 log_ipcs();
 
 # now startup should work
-poll_start($gnat);
+$gnat->poll_start;
 log_ipcs();
 
 # finish testing
 $gnat->stop;
 log_ipcs();
 
-
-# We may need retries to start a new postmaster.  Causes:
-# - kernel is slow to deliver SIGKILL
-# - postmaster parent is slow to waitpid()
-# - postmaster child is slow to exit in response to SIGQUIT
-# - postmaster child is slow to exit after postmaster death
-sub poll_start
-{
-	my ($node) = @_;
-
-	my $max_attempts = 10 * $PostgreSQL::Test::Utils::timeout_default;
-	my $attempts = 0;
-
-	while ($attempts < $max_attempts)
-	{
-		$node->start(fail_ok => 1) && return 1;
-
-		# Wait 0.1 second before retrying.
-		usleep(100_000);
-
-		# Clean up in case the start attempt just timed out or some such.
-		$node->stop('fast', fail_ok => 1);
-
-		$attempts++;
-	}
-
-	# Try one last time without fail_ok, which will BAIL_OUT unless it
-	# succeeds.
-	$node->start && return 1;
-	return 0;
-}
-
 done_testing();
-- 
2.39.5 (Apple Git-154)

