From b2f67581fb55bea5bbf740208394ba6a8346fd19 Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Thu, 7 Apr 2022 11:15:07 -0400
Subject: [PATCH] Rethink the delay-checkpoint-end mechanism in the
 back-branches.

The back-patch of commit bbace5697df12398e87ffd9879171c39d27f5b33 had
the unfortunate effect of changing the layout of PGPROC in the
back-branches, which could break extensions. This happened because it
changed the delayChkpt from type bool to type int. So, change it back,
and add a new bool delayChkptEnd field instead. The new field should
fall within what used to be padding space within the struct, and so
hopefully won't cause any extensions to break.

Per report from Markus Wanner and discussion with Tom Lane and others.
---
 src/backend/access/transam/multixact.c  |   6 +-
 src/backend/access/transam/twophase.c   |  13 ++-
 src/backend/access/transam/xact.c       |   6 +-
 src/backend/access/transam/xlog.c       |  10 +--
 src/backend/access/transam/xloginsert.c |   2 +-
 src/backend/catalog/storage.c           |   6 +-
 src/backend/storage/buffer/bufmgr.c     |   6 +-
 src/backend/storage/ipc/procarray.c     | 104 +++++++++++++++++++++---
 src/include/storage/proc.h              |  38 +--------
 src/include/storage/procarray.h         |   7 +-
 10 files changed, 121 insertions(+), 77 deletions(-)

diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 50d8bab9e2..b643564f16 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -3075,8 +3075,8 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
 	 * crash/basebackup, even though the state of the data directory would
 	 * require it.
 	 */
-	Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
-	MyProc->delayChkpt |= DELAY_CHKPT_START;
+	Assert(!MyProc->delayChkpt);
+	MyProc->delayChkpt = true;
 
 	/* WAL log truncation */
 	WriteMTruncateXlogRec(newOldestMultiDB,
@@ -3102,7 +3102,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
 	/* Then offsets */
 	PerformOffsetsTruncation(oldestMulti, newOldestMulti);
 
-	MyProc->delayChkpt &= ~DELAY_CHKPT_START;
+	MyProc->delayChkpt = false;
 
 	END_CRIT_SECTION();
 	LWLockRelease(MultiXactTruncationLock);
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index dea3f485f7..633a6f1747 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -474,8 +474,9 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
 	}
 	proc->xid = xid;
 	Assert(proc->xmin == InvalidTransactionId);
-	proc->delayChkpt = 0;
+	proc->delayChkpt = false;
 	proc->statusFlags = 0;
+	proc->delayChkptEnd = false;
 	proc->pid = 0;
 	proc->databaseId = databaseid;
 	proc->roleId = owner;
@@ -1165,8 +1166,7 @@ EndPrepare(GlobalTransaction gxact)
 
 	START_CRIT_SECTION();
 
-	Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
-	MyProc->delayChkpt |= DELAY_CHKPT_START;
+	MyProc->delayChkpt = true;
 
 	XLogBeginInsert();
 	for (record = records.head; record != NULL; record = record->next)
@@ -1209,7 +1209,7 @@ EndPrepare(GlobalTransaction gxact)
 	 * checkpoint starting after this will certainly see the gxact as a
 	 * candidate for fsyncing.
 	 */
-	MyProc->delayChkpt &= ~DELAY_CHKPT_START;
+	MyProc->delayChkpt = false;
 
 	/*
 	 * Remember that we have this GlobalTransaction entry locked for us.  If
@@ -2276,8 +2276,7 @@ RecordTransactionCommitPrepared(TransactionId xid,
 	START_CRIT_SECTION();
 
 	/* See notes in RecordTransactionCommit */
-	Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
-	MyProc->delayChkpt |= DELAY_CHKPT_START;
+	MyProc->delayChkpt = true;
 
 	/*
 	 * Emit the XLOG commit record. Note that we mark 2PC commits as
@@ -2325,7 +2324,7 @@ RecordTransactionCommitPrepared(TransactionId xid,
 	TransactionIdCommitTree(xid, nchildren, children);
 
 	/* Checkpoint can proceed now */
-	MyProc->delayChkpt &= ~DELAY_CHKPT_START;
+	MyProc->delayChkpt = false;
 
 	END_CRIT_SECTION();
 
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index c5e7261921..e8523693c0 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -1335,9 +1335,9 @@ RecordTransactionCommit(void)
 		 * This makes checkpoint's determination of which xacts are delayChkpt
 		 * a bit fuzzy, but it doesn't matter.
 		 */
-		Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
+		Assert(!MyProc->delayChkpt);
 		START_CRIT_SECTION();
-		MyProc->delayChkpt |= DELAY_CHKPT_START;
+		MyProc->delayChkpt = true;
 
 		SetCurrentTransactionStopTimestamp();
 
@@ -1438,7 +1438,7 @@ RecordTransactionCommit(void)
 	 */
 	if (markXidCommitted)
 	{
-		MyProc->delayChkpt &= ~DELAY_CHKPT_START;
+		MyProc->delayChkpt = false;
 		END_CRIT_SECTION();
 	}
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index d889d69387..8edf0fa646 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -9228,27 +9228,25 @@ CreateCheckPoint(int flags)
 	 * and we will correctly flush the update below.  So we cannot miss any
 	 * xacts we need to wait for.
 	 */
-	vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
+	vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
 	if (nvxids > 0)
 	{
 		do
 		{
 			pg_usleep(10000L);	/* wait for 10 msec */
-		} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
-											  DELAY_CHKPT_START));
+		} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
 	}
 	pfree(vxids);
 
 	CheckPointGuts(checkPoint.redo, flags);
 
-	vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
+	vxids = GetVirtualXIDsDelayingChkptEnd(&nvxids);
 	if (nvxids > 0)
 	{
 		do
 		{
 			pg_usleep(10000L);	/* wait for 10 msec */
-		} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
-											  DELAY_CHKPT_COMPLETE));
+		} while (HaveVirtualXIDsDelayingChkptEnd(vxids, nvxids));
 	}
 	pfree(vxids);
 
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index 1af4a90c41..b153fad594 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -925,7 +925,7 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
 	/*
 	 * Ensure no checkpoint can change our view of RedoRecPtr.
 	 */
-	Assert((MyProc->delayChkpt & DELAY_CHKPT_START) != 0);
+	Assert(MyProc->delayChkpt);
 
 	/*
 	 * Update RedoRecPtr so that we can make the right decision
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index fa5682dce8..b7fc491a68 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -338,8 +338,8 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 	 * the blocks to not exist on disk at all, but not for them to have the
 	 * wrong contents.
 	 */
-	Assert((MyProc->delayChkpt & DELAY_CHKPT_COMPLETE) == 0);
-	MyProc->delayChkpt |= DELAY_CHKPT_COMPLETE;
+	Assert(!MyProc->delayChkptEnd);
+	MyProc->delayChkptEnd = true;
 
 	/*
 	 * We WAL-log the truncation before actually truncating, which means
@@ -387,7 +387,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 	smgrtruncate(rel->rd_smgr, forks, nforks, blocks);
 
 	/* We've done all the critical work, so checkpoints are OK now. */
-	MyProc->delayChkpt &= ~DELAY_CHKPT_COMPLETE;
+	MyProc->delayChkptEnd = false;
 
 	/*
 	 * Update upper-level FSM pages to account for the truncation. This is
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index a55545a187..49561ae794 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3946,8 +3946,8 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 			 * essential that CreateCheckpoint waits for virtual transactions
 			 * rather than full transactionids.
 			 */
-			Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
-			MyProc->delayChkpt |= DELAY_CHKPT_START;
+			Assert(!MyProc->delayChkpt);
+			MyProc->delayChkpt = true;
 			delayChkpt = true;
 			lsn = XLogSaveBufferForHint(buffer, buffer_std);
 		}
@@ -3981,7 +3981,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 		UnlockBufHdr(bufHdr, buf_state);
 
 		if (delayChkpt)
-			MyProc->delayChkpt &= ~DELAY_CHKPT_START;
+			MyProc->delayChkpt = false;
 
 		if (dirtied)
 		{
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index ae71d7538b..0ea74f2a3c 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -690,8 +690,9 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
 		proc->lxid = InvalidLocalTransactionId;
 		proc->xmin = InvalidTransactionId;
 
-		/* be sure this is cleared in abort */
-		proc->delayChkpt = 0;
+		/* be sure these are cleared in abort */
+		proc->delayChkpt = false;
+		proc->delayChkptEnd = false;
 
 		proc->recoveryConflictPending = false;
 
@@ -732,8 +733,9 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
 	proc->lxid = InvalidLocalTransactionId;
 	proc->xmin = InvalidTransactionId;
 
-	/* be sure this is cleared in abort */
-	proc->delayChkpt = 0;
+	/* be sure these are cleared in abort */
+	proc->delayChkpt = false;
+	proc->delayChkptEnd = false;
 
 	proc->recoveryConflictPending = false;
 
@@ -3049,8 +3051,7 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
  * delaying checkpoint because they have critical actions in progress.
  *
  * Constructs an array of VXIDs of transactions that are currently in commit
- * critical sections, as shown by having specified delayChkpt bits set in their
- * PGPROC.
+ * critical sections, as shown by having delayChkpt set in their PGPROC.
  *
  * Returns a palloc'd array that should be freed by the caller.
  * *nvxids is the number of valid entries.
@@ -3064,14 +3065,51 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
  * for clearing of delayChkpt to propagate is unimportant for correctness.
  */
 VirtualTransactionId *
-GetVirtualXIDsDelayingChkpt(int *nvxids, int type)
+GetVirtualXIDsDelayingChkpt(int *nvxids)
 {
 	VirtualTransactionId *vxids;
 	ProcArrayStruct *arrayP = procArray;
 	int			count = 0;
 	int			index;
 
-	Assert(type != 0);
+	/* allocate what's certainly enough result space */
+	vxids = (VirtualTransactionId *)
+		palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
+
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+	for (index = 0; index < arrayP->numProcs; index++)
+	{
+		int			pgprocno = arrayP->pgprocnos[index];
+		PGPROC	   *proc = &allProcs[pgprocno];
+
+		if (proc->delayChkpt)
+		{
+			VirtualTransactionId vxid;
+
+			GET_VXID_FROM_PGPROC(vxid, *proc);
+			if (VirtualTransactionIdIsValid(vxid))
+				vxids[count++] = vxid;
+		}
+	}
+
+	LWLockRelease(ProcArrayLock);
+
+	*nvxids = count;
+	return vxids;
+}
+
+/*
+ * GetVirtualXIDsDelayingChkptEnd - like GetVirtualXIDsDelayingChkpt,
+ * but for delayChkptEnd
+ */
+VirtualTransactionId *
+GetVirtualXIDsDelayingChkptEnd(int *nvxids)
+{
+	VirtualTransactionId *vxids;
+	ProcArrayStruct *arrayP = procArray;
+	int			count = 0;
+	int			index;
 
 	/* allocate what's certainly enough result space */
 	vxids = (VirtualTransactionId *)
@@ -3084,7 +3122,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids, int type)
 		int			pgprocno = arrayP->pgprocnos[index];
 		PGPROC	   *proc = &allProcs[pgprocno];
 
-		if ((proc->delayChkpt & type) != 0)
+		if (proc->delayChkptEnd)
 		{
 			VirtualTransactionId vxid;
 
@@ -3110,13 +3148,54 @@ GetVirtualXIDsDelayingChkpt(int *nvxids, int type)
  * those numbers should be small enough for it not to be a problem.
  */
 bool
-HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type)
+HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
 {
 	bool		result = false;
 	ProcArrayStruct *arrayP = procArray;
 	int			index;
 
-	Assert(type != 0);
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+	for (index = 0; index < arrayP->numProcs; index++)
+	{
+		int			pgprocno = arrayP->pgprocnos[index];
+		PGPROC	   *proc = &allProcs[pgprocno];
+		VirtualTransactionId vxid;
+
+		GET_VXID_FROM_PGPROC(vxid, *proc);
+
+		if (proc->delayChkpt && VirtualTransactionIdIsValid(vxid))
+		{
+			int			i;
+
+			for (i = 0; i < nvxids; i++)
+			{
+				if (VirtualTransactionIdEquals(vxid, vxids[i]))
+				{
+					result = true;
+					break;
+				}
+			}
+			if (result)
+				break;
+		}
+	}
+
+	LWLockRelease(ProcArrayLock);
+
+	return result;
+}
+
+/*
+ * HaveVirtualXIDsDelayingChkptEnd -- like HaveVirtualXIDsDelayingChkpt
+ * but for delayChkptEnd
+ */
+bool
+HaveVirtualXIDsDelayingChkptEnd(VirtualTransactionId *vxids, int nvxids)
+{
+	bool		result = false;
+	ProcArrayStruct *arrayP = procArray;
+	int			index;
 
 	LWLockAcquire(ProcArrayLock, LW_SHARED);
 
@@ -3128,8 +3207,7 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type)
 
 		GET_VXID_FROM_PGPROC(vxid, *proc);
 
-		if ((proc->delayChkpt & type) != 0 &&
-			VirtualTransactionIdIsValid(vxid))
+		if (proc->delayChkptEnd && VirtualTransactionIdIsValid(vxid))
 		{
 			int			i;
 
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index b78012ec2b..0ea8f7a785 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -86,41 +86,6 @@ struct XidCache
  */
 #define INVALID_PGPROCNO		PG_INT32_MAX
 
-/*
- * Flags for PGPROC.delayChkpt
- *
- * These flags can be used to delay the start or completion of a checkpoint
- * for short periods. A flag is in effect if the corresponding bit is set in
- * the PGPROC of any backend.
- *
- * For our purposes here, a checkpoint has three phases: (1) determine the
- * location to which the redo pointer will be moved, (2) write all the
- * data durably to disk, and (3) WAL-log the checkpoint.
- *
- * Setting DELAY_CHKPT_START prevents the system from moving from phase 1
- * to phase 2. This is useful when we are performing a WAL-logged modification
- * of data that will be flushed to disk in phase 2. By setting this flag
- * before writing WAL and clearing it after we've both written WAL and
- * performed the corresponding modification, we ensure that if the WAL record
- * is inserted prior to the new redo point, the corresponding data changes will
- * also be flushed to disk before the checkpoint can complete. (In the
- * extremely common case where the data being modified is in shared buffers
- * and we acquire an exclusive content lock on the relevant buffers before
- * writing WAL, this mechanism is not needed, because phase 2 will block
- * until we release the content lock and then flush the modified data to
- * disk.)
- *
- * Setting DELAY_CHKPT_COMPLETE prevents the system from moving from phase 2
- * to phase 3. This is useful if we are performing a WAL-logged operation that
- * might invalidate buffers, such as relation truncation. In this case, we need
- * to ensure that any buffers which were invalidated and thus not flushed by
- * the checkpoint are actaully destroyed on disk. Replay can cope with a file
- * or block that doesn't exist, but not with a block that has the wrong
- * contents.
- */
-#define DELAY_CHKPT_START		(1<<0)
-#define DELAY_CHKPT_COMPLETE	(1<<1)
-
 typedef enum
 {
 	PROC_WAIT_STATUS_OK,
@@ -226,11 +191,12 @@ struct PGPROC
 	pg_atomic_uint64 waitStart; /* time at which wait for lock acquisition
 								 * started */
 
-	int			delayChkpt;		/* for DELAY_CHKPT_* flags */
+	bool		delayChkpt;		/* true if this proc delays checkpoint start */
 
 	uint8		statusFlags;	/* this backend's status flags, see PROC_*
 								 * above. mirrored in
 								 * ProcGlobal->statusFlags[pgxactoff] */
+	bool		delayChkptEnd;	/* true if this proc delays checkpoint end */
 
 	/*
 	 * Info to allow us to wait for synchronous replication, if needed.
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index 93de230a32..f5d7d41098 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -59,9 +59,12 @@ extern TransactionId GetOldestActiveTransactionId(void);
 extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly);
 extern void GetReplicationHorizons(TransactionId *slot_xmin, TransactionId *catalog_xmin);
 
-extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids, int type);
+extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids);
+extern VirtualTransactionId *GetVirtualXIDsDelayingChkptEnd(int *nvxids);
 extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids,
-										 int nvxids, int type);
+										 int nvxids);
+extern bool HaveVirtualXIDsDelayingChkptEnd(VirtualTransactionId *vxids,
+											int nvxids);
 
 extern PGPROC *BackendPidGetProc(int pid);
 extern PGPROC *BackendPidGetProcWithLock(int pid);
-- 
2.24.3 (Apple Git-128)

