From 9424765162c6f698658fac7c58cc0be960a7a37c Mon Sep 17 00:00:00 2001
From: Dilip kumar <dilipkumar@dkmac.local>
Date: Tue, 15 Aug 2023 14:10:45 +0530
Subject: [PATCH v2] New WAL record for checkpoint redo location

Currently, the checkpoint-redo LSN cannot be accurately detected
while processing the WAL.  Although we have a checkpoint WAL record
containing the exact redo LSN, other WAL records may be inserted
between the checkpoint-redo LSN and the actual checkpoint record.
If we want to stop processing wal exactly at the checkpoint-redo
location then we cannot do that because we would have already
processed some extra records that got added after the redo LSN.

The patch inserts a special wal record named CHECKPOINT_REDO
WAL, and the checkpoint-redo location is set at LSN of this record.

Also after this patch now we do not need to compute the checkpoint
redo lsn based on the current WAL insert location as we are setting
this to the CHECKPOINT_REDO record.
---
 src/backend/access/rmgrdesc/xlogdesc.c   |  7 +++
 src/backend/access/transam/xlog.c        | 72 ++++++++++--------------
 src/backend/replication/logical/decode.c |  1 +
 src/include/catalog/pg_control.h         |  1 +
 4 files changed, 38 insertions(+), 43 deletions(-)

diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index f390c177e4..7868ec7633 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -148,6 +148,10 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 						 LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
 						 timestamptz_to_str(xlrec.overwrite_time));
 	}
+	else if(info == XLOG_CHECKPOINT_REDO)
+	{
+		/* No details to write out */
+	}
 }
 
 const char *
@@ -196,6 +200,9 @@ xlog_identify(uint8 info)
 		case XLOG_FPI_FOR_HINT:
 			id = "FPI_FOR_HINT";
 			break;
+		case XLOG_CHECKPOINT_REDO:
+			id = "CHECKPOINT_REDO";
+			break;
 	}
 
 	return id;
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 60c0b7ec3a..d25b7ca812 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -6454,11 +6454,9 @@ update_checkpoint_display(int flags, bool restartpoint, bool reset)
  * position of the WAL record (redo ptr) is the same or earlier than the
  * physical position. When we replay WAL we locate the checkpoint via its
  * physical position then read the redo ptr and actually start replay at the
- * earlier logical position. Note that we don't write *anything* to WAL at
- * the logical position, so that location could be any other kind of WAL record.
- * All of this mechanism allows us to continue working while we checkpoint.
- * As a result, timing of actions is critical here and be careful to note that
- * this function will likely take minutes to execute on a busy system.
+ * earlier logical position. Note that we do write a dummy wal record
+ * called XLOG_CHECKPOINT_REDO record and the checkpoint.redo LSN is always
+ * set to the LSN of this record.
  */
 void
 CreateCheckPoint(int flags)
@@ -6468,11 +6466,9 @@ CreateCheckPoint(int flags)
 	XLogRecPtr	recptr;
 	XLogSegNo	_logSegNo;
 	XLogCtlInsert *Insert = &XLogCtl->Insert;
-	uint32		freespace;
 	XLogRecPtr	PriorRedoPtr;
-	XLogRecPtr	curInsert;
-	XLogRecPtr	last_important_lsn;
 	VirtualTransactionId *vxids;
+	int			dummy = 0;
 	int			nvxids;
 	int			oldXLogAllowed = 0;
 
@@ -6534,19 +6530,6 @@ CreateCheckPoint(int flags)
 	else
 		checkPoint.oldestActiveXid = InvalidTransactionId;
 
-	/*
-	 * Get location of last important record before acquiring insert locks (as
-	 * GetLastImportantRecPtr() also locks WAL locks).
-	 */
-	last_important_lsn = GetLastImportantRecPtr();
-
-	/*
-	 * We must block concurrent insertions while examining insert state to
-	 * determine the checkpoint REDO pointer.
-	 */
-	WALInsertLockAcquireExclusive();
-	curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
-
 	/*
 	 * If this isn't a shutdown or forced checkpoint, and if there has been no
 	 * WAL activity requiring a checkpoint, skip it.  The idea here is to
@@ -6555,9 +6538,8 @@ CreateCheckPoint(int flags)
 	if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
 				  CHECKPOINT_FORCE)) == 0)
 	{
-		if (last_important_lsn == ControlFile->checkPoint)
+		if (GetLastImportantRecPtr() == ControlFile->checkPoint)
 		{
-			WALInsertLockRelease();
 			END_CRIT_SECTION();
 			ereport(DEBUG1,
 					(errmsg_internal("checkpoint skipped because system is idle")));
@@ -6573,6 +6555,24 @@ CreateCheckPoint(int flags)
 	if (flags & CHECKPOINT_END_OF_RECOVERY)
 		oldXLogAllowed = LocalSetXLogInsertAllowed();
 
+	/*
+	 * Insert a dummy CHECKPOINT_REDO record and set LSN of this record as
+	 * checkpoint.redo.  Although we have the checkpoint record that also
+	 * contains the exact redo lsn, there might have been some other records
+	 * those got inserted between the redo lsn and the actual checkpoint
+	 * record.  So when processing the wal, we cannot rely on the checkpoint
+	 * record if we want to stop at the checkpoint-redo LSN.
+	 */
+	XLogBeginInsert();
+	XLogRegisterData((char *) &dummy, sizeof(dummy));
+	recptr = XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_REDO);
+
+	/*
+	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
+	 * must be done while holding all the insertion locks.
+	 */
+	WALInsertLockAcquireExclusive();
+
 	checkPoint.ThisTimeLineID = XLogCtl->InsertTimeLineID;
 	if (flags & CHECKPOINT_END_OF_RECOVERY)
 		checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
@@ -6581,28 +6581,9 @@ CreateCheckPoint(int flags)
 
 	checkPoint.fullPageWrites = Insert->fullPageWrites;
 
-	/*
-	 * Compute new REDO record ptr = location of next XLOG record.
-	 *
-	 * NB: this is NOT necessarily where the checkpoint record itself will be,
-	 * since other backends may insert more XLOG records while we're off doing
-	 * the buffer flush work.  Those XLOG records are logically after the
-	 * checkpoint, even though physically before it.  Got that?
-	 */
-	freespace = INSERT_FREESPACE(curInsert);
-	if (freespace == 0)
-	{
-		if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
-			curInsert += SizeOfXLogLongPHD;
-		else
-			curInsert += SizeOfXLogShortPHD;
-	}
-	checkPoint.redo = curInsert;
+	checkPoint.redo = recptr;
 
 	/*
-	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
-	 * must be done while holding all the insertion locks.
-	 *
 	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
 	 * pointing past where it really needs to point.  This is okay; the only
 	 * consequence is that XLogInsert might back up whole buffers that it
@@ -8074,6 +8055,11 @@ xlog_redo(XLogReaderState *record)
 		/* Keep track of full_page_writes */
 		lastFullPageWrites = fpw;
 	}
+	else if (info == XLOG_CHECKPOINT_REDO)
+	{
+		/* nothing to do here */
+	}
+
 }
 
 /*
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
index 7039d425e2..89978c2fd8 100644
--- a/src/backend/replication/logical/decode.c
+++ b/src/backend/replication/logical/decode.c
@@ -190,6 +190,7 @@ xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 		case XLOG_FPI_FOR_HINT:
 		case XLOG_FPI:
 		case XLOG_OVERWRITE_CONTRECORD:
+		case XLOG_CHECKPOINT_REDO:
 			break;
 		default:
 			elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info);
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index dc953977c5..1136613259 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -78,6 +78,7 @@ typedef struct CheckPoint
 #define XLOG_FPI						0xB0
 /* 0xC0 is used in Postgres 9.5-11 */
 #define XLOG_OVERWRITE_CONTRECORD		0xD0
+#define XLOG_CHECKPOINT_REDO			0xE0
 
 
 /*
-- 
2.39.2 (Apple Git-143)

