From 0282b2b315f2a3a8685eb5ce1e48ce8b57548821 Mon Sep 17 00:00:00 2001
From: Nisha Moond <nisha.moond412@gmail.com>
Date: Tue, 25 Feb 2025 17:33:01 +0530
Subject: [PATCH v1] Fix race condition in ReplicationSlotAcquire() indroduced
 by commit f41d846
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After commit f41d846, a process could acquire and use a replication slot
that had just been invalidated, leading to unexpected failures. This
occured because the invalidation check was performed before setting the
slot’s active_pid.

The fix ensures the check happens only after acquiring slot, i.e., after
setting s->active_pid = MyProcPid, preventing a race condition with
InvalidatePossiblyObsoleteSlot() and ensuring that invalid slots are not
mistakenly used.
---
 src/backend/replication/slot.c | 38 ++++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index d089085..af388ad 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -580,19 +580,6 @@ retry:
 						name)));
 	}
 
-	/* Invalid slots can't be modified or used before accessing the WAL. */
-	if (error_if_invalid && s->data.invalidated != RS_INVAL_NONE)
-	{
-		LWLockRelease(ReplicationSlotControlLock);
-
-		ereport(ERROR,
-				errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				errmsg("can no longer access replication slot \"%s\"",
-					   NameStr(s->data.name)),
-				errdetail("This replication slot has been invalidated due to \"%s\".",
-						  GetSlotInvalidationCauseName(s->data.invalidated)));
-	}
-
 	/*
 	 * This is the slot we want; check if it's active under some other
 	 * process.  In single user mode, we don't need this check.
@@ -650,13 +637,32 @@ retry:
 	else if (!nowait)
 		ConditionVariableCancelSleep(); /* no sleep needed after all */
 
-	/* Let everybody know we've modified this slot */
-	ConditionVariableBroadcast(&s->active_cv);
-
 	/* We made this slot active, so it's ours now. */
 	MyReplicationSlot = s;
 
 	/*
+	 * Is the slot invalid? Check the invalidated flag only after acquiring
+	 * the slot, i.e., after setting s->active_pid as MyProcPid, to prevent a
+	 * race condition with InvalidatePossiblyObsoleteSlot().
+	 *
+	 * If this check is performed before acquiring the slot or without holding
+	 * a spinlock, the checkpointer process may invalidate the slot right
+	 * after the check but before slot's active_pid is set. This could result
+	 * in the process mistakenly acquiring and using an invalidated slot,
+	 * leading to unexpected behavior or failures later.
+	 */
+	if (error_if_invalid && s->data.invalidated != RS_INVAL_NONE)
+		ereport(ERROR,
+				errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("can no longer access replication slot \"%s\"",
+					   NameStr(s->data.name)),
+				errdetail("This replication slot has been invalidated due to \"%s\".",
+						  GetSlotInvalidationCauseName(s->data.invalidated)));
+
+	/* Let everybody know we've modified this slot */
+	ConditionVariableBroadcast(&s->active_cv);
+
+	/*
 	 * The call to pgstat_acquire_replslot() protects against stats for a
 	 * different slot, from before a restart or such, being present during
 	 * pgstat_report_replslot().
-- 
1.8.3.1

