From e0ca43df50512f45c83e7a3faf3672c8036ab90c Mon Sep 17 00:00:00 2001
From: Zhijie Hou <houzj.fnst@fujitsu.com>
Date: Thu, 13 Nov 2025 10:04:30 +0800
Subject: [PATCH v2] Fix race conditions causing invalidation of newly created
 slots

This commit addresses two race conditions that could lead to the removal of WALs
reserved by a slot, potentially resulting in slot invalidation:

1) Currently, there is a race condition where ReplicationSlotReserveWal selects
   a WAL position to reserve but has not yet updated restart_lsn, while the
   checkpoint process might select a later WAL position as the minimum, causing
   premature removal of WAL needed by the slot.

This commit fixes it by taking exclusive lock on ReplicationSlotAllocationLock
when reserving WAL to serialize the minimum restart_lsn computation in checkpoint
process and WAL reservation, ensuring that:

* If the WAL reservation occurs first, the checkpoint must wait for the
restart_lsn to be updated before proceeding with WAL removal. This guarantees
that the most recent restart_lsn position is detected.

* If the checkpoint calls CheckPointReplicationSlots() first, then any
subsequent WAL reservation must take a position later than the redo pointer.

2) If a backend advancing a slot's restart_lsn has reached
   XLogSetReplicationSlotMinimumLSN but has not yet updated the minimum LSN,
   and another backend concurrently creates a new slot, the minimum LSN can be
   set to a position more recent than the WAL reserved by the new slot,
   potentially invalidating it in the next checkpoint.

The commit resolves this by acquiring an exclusive ReplicationSlotControlLock,
replacing the original shared level lock, to prevent concurrent updates to
restart_lsn.
---
 src/backend/replication/logical/slotsync.c |   9 ++
 src/backend/replication/slot.c             | 101 +++++++++++----------
 2 files changed, 62 insertions(+), 48 deletions(-)

diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c
index 8b4afd87dc9..e4d5ac066b4 100644
--- a/src/backend/replication/logical/slotsync.c
+++ b/src/backend/replication/logical/slotsync.c
@@ -495,6 +495,13 @@ reserve_wal_for_local_slot(XLogRecPtr restart_lsn)
 	Assert(slot != NULL);
 	Assert(!XLogRecPtrIsValid(slot->data.restart_lsn));
 
+	/*
+	 * Acquire an exclusive lock to prevent race conditions between WAL
+	 * reservation and minimum restart_lsn computation during checkpoints. See
+	 * comments in ReplicationSlotReserveWal() for more details.
+	 */
+	LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);
+
 	while (true)
 	{
 		SpinLockAcquire(&slot->mutex);
@@ -546,6 +553,8 @@ reserve_wal_for_local_slot(XLogRecPtr restart_lsn)
 		/* Retry using the location of the oldest wal segment */
 		XLogSegNoOffsetToRecPtr(oldest_segno, 0, wal_segment_size, restart_lsn);
 	}
+
+	LWLockRelease(ReplicationSlotAllocationLock);
 }
 
 /*
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index 1ec1e997b27..0c8ed1d57f4 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -1237,7 +1237,15 @@ ReplicationSlotsComputeRequiredLSN(void)
 
 	Assert(ReplicationSlotCtl != NULL);
 
-	LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
+	/*
+	 * Hold the ReplicationSlotControlLock exclusive until after updating the
+	 * slot minimum LSN value, so no backend can compute and update the new
+	 * value concurrently. This prevents overwriting the minimum LSN with a
+	 * position more recent than the WAL position reserved by another newly
+	 * created slot, ensuring the WALs required by the new slot are not
+	 * prematurely removed during checkpoint.
+	 */
+	LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE);
 	for (i = 0; i < max_replication_slots; i++)
 	{
 		ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
@@ -1282,9 +1290,10 @@ ReplicationSlotsComputeRequiredLSN(void)
 			 restart_lsn < min_required))
 			min_required = restart_lsn;
 	}
-	LWLockRelease(ReplicationSlotControlLock);
 
 	XLogSetReplicationSlotMinimumLSN(min_required);
+
+	LWLockRelease(ReplicationSlotControlLock);
 }
 
 /*
@@ -1571,63 +1580,59 @@ void
 ReplicationSlotReserveWal(void)
 {
 	ReplicationSlot *slot = MyReplicationSlot;
+	XLogSegNo	segno;
+	XLogRecPtr	restart_lsn;
 
 	Assert(slot != NULL);
 	Assert(!XLogRecPtrIsValid(slot->data.restart_lsn));
 	Assert(!XLogRecPtrIsValid(slot->last_saved_restart_lsn));
 
 	/*
-	 * The replication slot mechanism is used to prevent removal of required
-	 * WAL. As there is no interlock between this routine and checkpoints, WAL
-	 * segments could concurrently be removed when a now stale return value of
-	 * ReplicationSlotsComputeRequiredLSN() is used. In the unlikely case that
-	 * this happens we'll just retry.
+	 * Acquire an exclusive lock to prevent the checkpoint process from
+	 * concurrently calculating the minimum restart_lsn (refer to
+	 * CheckPointReplicationSlots). This ensures that no race conditions occur
+	 * where this function selects a WAL position but has not yet updated
+	 * restart_lsn, while the checkpoint process might select a later WAL
+	 * position as the minimum, causing premature removal of WAL needed by
+	 * this slot.
 	 */
-	while (true)
-	{
-		XLogSegNo	segno;
-		XLogRecPtr	restart_lsn;
+	LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);
 
-		/*
-		 * For logical slots log a standby snapshot and start logical decoding
-		 * at exactly that position. That allows the slot to start up more
-		 * quickly. But on a standby we cannot do WAL writes, so just use the
-		 * replay pointer; effectively, an attempt to create a logical slot on
-		 * standby will cause it to wait for an xl_running_xact record to be
-		 * logged independently on the primary, so that a snapshot can be
-		 * built using the record.
-		 *
-		 * None of this is needed (or indeed helpful) for physical slots as
-		 * they'll start replay at the last logged checkpoint anyway. Instead
-		 * return the location of the last redo LSN. While that slightly
-		 * increases the chance that we have to retry, it's where a base
-		 * backup has to start replay at.
-		 */
-		if (SlotIsPhysical(slot))
-			restart_lsn = GetRedoRecPtr();
-		else if (RecoveryInProgress())
-			restart_lsn = GetXLogReplayRecPtr(NULL);
-		else
-			restart_lsn = GetXLogInsertRecPtr();
+	/*
+	 * For logical slots log a standby snapshot and start logical decoding at
+	 * exactly that position. That allows the slot to start up more quickly.
+	 * But on a standby we cannot do WAL writes, so just use the replay
+	 * pointer; effectively, an attempt to create a logical slot on standby
+	 * will cause it to wait for an xl_running_xact record to be logged
+	 * independently on the primary, so that a snapshot can be built using the
+	 * record.
+	 *
+	 * None of this is needed (or indeed helpful) for physical slots as
+	 * they'll start replay at the last logged checkpoint anyway. Instead
+	 * return the location of the last redo LSN. While that slightly increases
+	 * the chance of hitting the concurrent-removal error below, it's where a
+	 * base backup has to start replay at.
+	 */
+	if (SlotIsPhysical(slot))
+		restart_lsn = GetRedoRecPtr();
+	else if (RecoveryInProgress())
+		restart_lsn = GetXLogReplayRecPtr(NULL);
+	else
+		restart_lsn = GetXLogInsertRecPtr();
 
-		SpinLockAcquire(&slot->mutex);
-		slot->data.restart_lsn = restart_lsn;
-		SpinLockRelease(&slot->mutex);
+	SpinLockAcquire(&slot->mutex);
+	slot->data.restart_lsn = restart_lsn;
+	SpinLockRelease(&slot->mutex);
 
-		/* prevent WAL removal as fast as possible */
-		ReplicationSlotsComputeRequiredLSN();
+	/* prevent WAL removal as fast as possible */
+	ReplicationSlotsComputeRequiredLSN();
 
-		/*
-		 * If all required WAL is still there, great, otherwise retry. The
-		 * slot should prevent further removal of WAL, unless there's a
-		 * concurrent ReplicationSlotsComputeRequiredLSN() after we've written
-		 * the new restart_lsn above, so normally we should never need to loop
-		 * more than twice.
-		 */
-		XLByteToSeg(slot->data.restart_lsn, segno, wal_segment_size);
-		if (XLogGetLastRemovedSegno() < segno)
-			break;
-	}
+	XLByteToSeg(slot->data.restart_lsn, segno, wal_segment_size);
+	if (XLogGetLastRemovedSegno() >= segno)
+		elog(ERROR, "WAL required by replication slot %s has been removed concurrently",
+			 NameStr(slot->data.name));
+
+	LWLockRelease(ReplicationSlotAllocationLock);
 
 	if (!RecoveryInProgress() && SlotIsLogical(slot))
 	{
-- 
2.31.1

