From cd54e492f19fe9384850e73ab4ee3b3e4881bb58 Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Tue, 17 Oct 2023 12:27:03 -0400
Subject: [PATCH v9] During online checkpoints, insert XLOG_CHECKPOINT_REDO at
 redo point.

This allows tools that read the WAL sequentially to identify (possible)
redo points when they're reached, rather than only being able to
detect them in retrospect when XLOG_CHECKPOINT_ONLINE is found, possibly
much later in the WAL stream.

Any redo location that precedes the checkpoint location should now point
to an XLOG_CHECKPOINT_REDO record, so add a cross-check to verify this.

While adjusting the code in CreateCheckPoint() for this patch, I made it
call WALInsertLockAcquireExclusive a bit later than before, since there
appears to be no need for it to be held while checking whether the system
is idle, whether this is an end-of-recovery checkpoint, or what the current
timeline is.

Patch by me, based in part on earlier work from Dilip Kumar. Review by
Dilip Kumar, Amit Kapila and Andres Freund. Dilip's earlier patch was
also reviewed by Michael Paquier.

Discussion: http://postgr.es/m/CA+TgmoYy-Vc6G9QKcAKNksCa29cv__czr+N9X_QCxEfQVpp_8w@mail.gmail.com
---
 src/backend/access/rmgrdesc/xlogdesc.c    |   7 +
 src/backend/access/transam/xlog.c         | 193 +++++++++++++++-------
 src/backend/access/transam/xlogrecovery.c |  11 ++
 src/backend/replication/logical/decode.c  |   1 +
 src/include/catalog/pg_control.h          |   1 +
 src/tools/pgindent/typedefs.list          |   1 +
 6 files changed, 156 insertions(+), 58 deletions(-)

diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index f390c177e4..37f59bda7e 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -148,6 +148,10 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 						 LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
 						 timestamptz_to_str(xlrec.overwrite_time));
 	}
+	else if (info == XLOG_CHECKPOINT_REDO)
+	{
+		/* No details to write out */
+	}
 }
 
 const char *
@@ -196,6 +200,9 @@ xlog_identify(uint8 info)
 		case XLOG_FPI_FOR_HINT:
 			id = "FPI_FOR_HINT";
 			break;
+		case XLOG_CHECKPOINT_REDO:
+			id = "CHECKPOINT_REDO";
+			break;
 	}
 
 	return id;
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index c0e4ca5089..cea13e3d58 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -559,6 +559,16 @@ typedef struct XLogCtlData
 	slock_t		info_lck;		/* locks shared variables shown above */
 } XLogCtlData;
 
+/*
+ * Classification of XLogInsertRecord operations.
+ */
+typedef enum
+{
+	WALINSERT_NORMAL,
+	WALINSERT_SPECIAL_SWITCH,
+	WALINSERT_SPECIAL_CHECKPOINT
+} WalInsertClass;
+
 static XLogCtlData *XLogCtl = NULL;
 
 /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
@@ -739,13 +749,21 @@ XLogInsertRecord(XLogRecData *rdata,
 	bool		inserted;
 	XLogRecord *rechdr = (XLogRecord *) rdata->data;
 	uint8		info = rechdr->xl_info & ~XLR_INFO_MASK;
-	bool		isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
-							   info == XLOG_SWITCH);
+	WalInsertClass class = WALINSERT_NORMAL;
 	XLogRecPtr	StartPos;
 	XLogRecPtr	EndPos;
 	bool		prevDoPageWrites = doPageWrites;
 	TimeLineID	insertTLI;
 
+	/* Does this record type require special handling? */
+	if (unlikely(rechdr->xl_rmid == RM_XLOG_ID))
+	{
+		if (info == XLOG_SWITCH)
+			class = WALINSERT_SPECIAL_SWITCH;
+		else if (info == XLOG_CHECKPOINT_REDO)
+			class = WALINSERT_SPECIAL_CHECKPOINT;
+	}
+
 	/* we assume that all of the record header is in the first chunk */
 	Assert(rdata->len >= SizeOfXLogRecord);
 
@@ -793,7 +811,7 @@ XLogInsertRecord(XLogRecData *rdata,
 	 */
 	START_CRIT_SECTION();
 
-	if (likely(!isLogSwitch))
+	if (likely(class == WALINSERT_NORMAL))
 	{
 		WALInsertLockAcquire();
 
@@ -843,7 +861,7 @@ XLogInsertRecord(XLogRecData *rdata,
 		/* Normal records are always inserted. */
 		inserted = true;
 	}
-	else
+	else if (class == WALINSERT_SPECIAL_SWITCH)
 	{
 		/*
 		 * In order to insert an XLOG_SWITCH record, we need to hold all of
@@ -852,14 +870,32 @@ XLogInsertRecord(XLogRecData *rdata,
 		 * remains in the current WAL segment and claimed all of it.
 		 *
 		 * Nonetheless, this case is simpler than the normal cases handled
-		 * above, which must check for changes in doPageWrites and RedoRecPtr.
-		 * Those checks are only needed for records that can contain
-		 * full-pages images, and an XLOG_SWITCH record never does.
+		 * above, which must check for changes in doPageWrites and RedoRecPtr.
+		 * Those checks are only needed for records that can contain buffer
+		 * references, and an XLOG_SWITCH record never does.
 		 */
 		Assert(fpw_lsn == InvalidXLogRecPtr);
 		WALInsertLockAcquireExclusive();
 		inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
 	}
+	else
+	{
+		Assert(class == WALINSERT_SPECIAL_CHECKPOINT);
+
+		/*
+		 * We need to update both the local and shared copies of RedoRecPtr,
+		 * which means that we need to hold all the WAL insertion locks.
+		 * However, there can't be any buffer references, so as above, we need
+		 * not check RedoRecPtr before inserting the record; we just need to
+		 * update it afterwards.
+		 */
+		Assert(fpw_lsn == InvalidXLogRecPtr);
+		WALInsertLockAcquireExclusive();
+		ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
+								  &rechdr->xl_prev);
+		RedoRecPtr = Insert->RedoRecPtr = StartPos;
+		inserted = true;
+	}
 
 	if (inserted)
 	{
@@ -876,7 +912,8 @@ XLogInsertRecord(XLogRecData *rdata,
 		 * All the record data, including the header, is now ready to be
 		 * inserted. Copy the record in the space reserved.
 		 */
-		CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
+		CopyXLogRecordToWAL(rechdr->xl_tot_len,
+							class == WALINSERT_SPECIAL_SWITCH, rdata,
 							StartPos, EndPos, insertTLI);
 
 		/*
@@ -935,7 +972,7 @@ XLogInsertRecord(XLogRecData *rdata,
 	 * padding space that fills the rest of the segment, and perform
 	 * end-of-segment actions (eg, notifying archiver).
 	 */
-	if (isLogSwitch)
+	if (class == WALINSERT_SPECIAL_SWITCH)
 	{
 		TRACE_POSTGRESQL_WAL_SWITCH();
 		XLogFlush(EndPos);
@@ -1054,8 +1091,12 @@ XLogInsertRecord(XLogRecData *rdata,
  *
  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
  * where we actually copy the record to the reserved space.
+ *
+ * NB: Testing shows that XLogInsertRecord runs faster if this code is inlined;
+ * however, because there are two call sites, the compiler is reluctant to
+ * inline. We use pg_attribute_always_inline here to try to convince it.
  */
-static void
+static pg_attribute_always_inline void
 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
 						  XLogRecPtr *PrevPtr)
 {
@@ -6475,17 +6516,22 @@ update_checkpoint_display(int flags, bool restartpoint, bool reset)
  * In particular note that this routine is synchronous and does not pay
  * attention to CHECKPOINT_WAIT.
  *
- * If !shutdown then we are writing an online checkpoint. This is a very special
- * kind of operation and WAL record because the checkpoint action occurs over
- * a period of time yet logically occurs at just a single LSN. The logical
- * position of the WAL record (redo ptr) is the same or earlier than the
- * physical position. When we replay WAL we locate the checkpoint via its
- * physical position then read the redo ptr and actually start replay at the
- * earlier logical position. Note that we don't write *anything* to WAL at
- * the logical position, so that location could be any other kind of WAL record.
- * All of this mechanism allows us to continue working while we checkpoint.
- * As a result, timing of actions is critical here and be careful to note that
- * this function will likely take minutes to execute on a busy system.
+ * If !shutdown then we are writing an online checkpoint. An XLOG_CHECKPOINT_REDO
+ * record is inserted into WAL at the logical location of the checkpoint, before
+ * flushing anything to disk. When the checkpoint is eventually completed, it
+ * is from this point that WAL replay will begin in the case of a recovery from
+ * this checkpoint. Once everything is written to disk, an
+ * XLOG_CHECKPOINT_ONLINE record is written to complete the checkpoint, and
+ * points back to the earlier XLOG_CHECKPOINT_REDO record. This mechanism allows
+ * other write-ahead log records to be written while the checkpoint is in
+ * progress, but we must be very careful about order of operations. This function
+ * may take many minutes to execute on a busy system.
+ *
+ * On the other hand, when shutdown is true, concurrent insertion into the
+ * write-ahead log is impossible, so there is no need for two separate records.
+ * In this case, we only insert an XLOG_CHECKPOINT_SHUTDOWN record, and it's
+ * both the record marking the completion of the checkpoint and the location
+ * from which WAL replay would begin if needed.
  */
 void
 CreateCheckPoint(int flags)
@@ -6497,7 +6543,6 @@ CreateCheckPoint(int flags)
 	XLogCtlInsert *Insert = &XLogCtl->Insert;
 	uint32		freespace;
 	XLogRecPtr	PriorRedoPtr;
-	XLogRecPtr	curInsert;
 	XLogRecPtr	last_important_lsn;
 	VirtualTransactionId *vxids;
 	int			nvxids;
@@ -6567,13 +6612,6 @@ CreateCheckPoint(int flags)
 	 */
 	last_important_lsn = GetLastImportantRecPtr();
 
-	/*
-	 * We must block concurrent insertions while examining insert state to
-	 * determine the checkpoint REDO pointer.
-	 */
-	WALInsertLockAcquireExclusive();
-	curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
-
 	/*
 	 * If this isn't a shutdown or forced checkpoint, and if there has been no
 	 * WAL activity requiring a checkpoint, skip it.  The idea here is to
@@ -6584,7 +6622,6 @@ CreateCheckPoint(int flags)
 	{
 		if (last_important_lsn == ControlFile->checkPoint)
 		{
-			WALInsertLockRelease();
 			END_CRIT_SECTION();
 			ereport(DEBUG1,
 					(errmsg_internal("checkpoint skipped because system is idle")));
@@ -6606,38 +6643,47 @@ CreateCheckPoint(int flags)
 	else
 		checkPoint.PrevTimeLineID = checkPoint.ThisTimeLineID;
 
-	checkPoint.fullPageWrites = Insert->fullPageWrites;
-
 	/*
-	 * Compute new REDO record ptr = location of next XLOG record.
-	 *
-	 * NB: this is NOT necessarily where the checkpoint record itself will be,
-	 * since other backends may insert more XLOG records while we're off doing
-	 * the buffer flush work.  Those XLOG records are logically after the
-	 * checkpoint, even though physically before it.  Got that?
+	 * We must block concurrent insertions while examining insert state.
 	 */
-	freespace = INSERT_FREESPACE(curInsert);
-	if (freespace == 0)
+	WALInsertLockAcquireExclusive();
+
+	checkPoint.fullPageWrites = Insert->fullPageWrites;
+
+	if (shutdown)
 	{
-		if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
-			curInsert += SizeOfXLogLongPHD;
-		else
-			curInsert += SizeOfXLogShortPHD;
-	}
-	checkPoint.redo = curInsert;
+		XLogRecPtr	curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
 
-	/*
-	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
-	 * must be done while holding all the insertion locks.
-	 *
-	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
-	 * pointing past where it really needs to point.  This is okay; the only
-	 * consequence is that XLogInsert might back up whole buffers that it
-	 * didn't really need to.  We can't postpone advancing RedoRecPtr because
-	 * XLogInserts that happen while we are dumping buffers must assume that
-	 * their buffer changes are not included in the checkpoint.
-	 */
-	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
+		/*
+		 * Compute new REDO record ptr = location of next XLOG record.
+		 *
+		 * Since this is a shutdown checkpoint, there can't be any concurrent
+		 * WAL insertion.
+		 */
+		freespace = INSERT_FREESPACE(curInsert);
+		if (freespace == 0)
+		{
+			if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
+				curInsert += SizeOfXLogLongPHD;
+			else
+				curInsert += SizeOfXLogShortPHD;
+		}
+		checkPoint.redo = curInsert;
+
+		/*
+		 * Here we update the shared RedoRecPtr for future XLogInsert calls;
+		 * this must be done while holding all the insertion locks.
+		 *
+		 * Note: if we fail to complete the checkpoint, RedoRecPtr will be
+		 * left pointing past where it really needs to point.  This is okay;
+		 * the only consequence is that XLogInsert might back up whole buffers
+		 * that it didn't really need to.  We can't postpone advancing
+		 * RedoRecPtr because XLogInserts that happen while we are dumping
+		 * buffers must assume that their buffer changes are not included in
+		 * the checkpoint.
+		 */
+		RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
+	}
 
 	/*
 	 * Now we can release the WAL insertion locks, allowing other xacts to
@@ -6645,6 +6691,33 @@ CreateCheckPoint(int flags)
 	 */
 	WALInsertLockRelease();
 
+	/*
+	 * If this is an online checkpoint, we have not yet determined the redo
+	 * point. We do so now by inserting the special XLOG_CHECKPOINT_REDO
+	 * record; the LSN at which it starts becomes the new redo pointer. We
+	 * don't do this for a shutdown checkpoint, because in that case no WAL
+	 * can be written between the redo point and the insertion of the
+	 * checkpoint record itself, so the checkpoint record itself serves to
+	 * mark the redo point.
+	 */
+	if (!shutdown)
+	{
+		int			dummy = 0;
+
+		/* Record must have payload to avoid assertion failure. */
+		XLogBeginInsert();
+		XLogRegisterData((char *) &dummy, sizeof(dummy));
+		(void) XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_REDO);
+
+		/*
+		 * XLogInsertRecord will have updated XLogCtl->Insert.RedoRecPtr in
+		 * shared memory and RedoRecPtr in backend-local memory, but we need
+		 * to copy that into the record that will be inserted when the
+		 * checkpoint is complete.
+		 */
+		checkPoint.redo = RedoRecPtr;
+	}
+
 	/* Update the info_lck-protected copy of RedoRecPtr as well */
 	SpinLockAcquire(&XLogCtl->info_lck);
 	XLogCtl->RedoRecPtr = checkPoint.redo;
@@ -8105,6 +8178,10 @@ xlog_redo(XLogReaderState *record)
 		/* Keep track of full_page_writes */
 		lastFullPageWrites = fpw;
 	}
+	else if (info == XLOG_CHECKPOINT_REDO)
+	{
+		/* nothing to do here, just for informational purposes */
+	}
 }
 
 /*
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index becc2bda62..d6f2bb8286 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -1638,6 +1638,17 @@ PerformWalRecovery(void)
 		replayTLI = RedoStartTLI;
 		XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN);
 		record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
+
+		/*
+		 * If a checkpoint record's redo pointer points back to an earlier
+		 * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
+		 * record.
+		 */
+		if (record->xl_rmid != RM_XLOG_ID ||
+			(record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
+			ereport(FATAL,
+					(errmsg("unexpected record type found at redo point %X/%X",
+							LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
 	}
 	else
 	{
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
index 730061c9da..24b712aa66 100644
--- a/src/backend/replication/logical/decode.c
+++ b/src/backend/replication/logical/decode.c
@@ -190,6 +190,7 @@ xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 		case XLOG_FPI_FOR_HINT:
 		case XLOG_FPI:
 		case XLOG_OVERWRITE_CONTRECORD:
+		case XLOG_CHECKPOINT_REDO:
 			break;
 		default:
 			elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info);
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index dc953977c5..1136613259 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -78,6 +78,7 @@ typedef struct CheckPoint
 #define XLOG_FPI						0xB0
 /* 0xC0 is used in Postgres 9.5-11 */
 #define XLOG_OVERWRITE_CONTRECORD		0xD0
+#define XLOG_CHECKPOINT_REDO			0xE0
 
 
 /*
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index e69bb671bf..06b25617bc 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2971,6 +2971,7 @@ VolatileFunctionStatus
 Vsrt
 WAIT_ORDER
 WALAvailability
+WalInsertClass
 WALInsertLock
 WALInsertLockPadded
 WALOpenSegment
-- 
2.37.1 (Apple Git-137.1)

