From 2fa9f78c95bbe043ba30642d3db6afb959dca418 Mon Sep 17 00:00:00 2001
From: Dilip Kumar <dilip.kumar@enterprisedb.com>
Date: Fri, 25 Aug 2023 05:16:54 +0000
Subject: [PATCH v3] New WAL record for checkpoint redo location

Currently, the checkpoint-redo LSN cannot be accurately detected
while processing the WAL.  Although we have a checkpoint WAL record
containing the exact redo LSN, other WAL records may be inserted
between the checkpoint-redo LSN and the actual checkpoint record.
If we want to stop processing wal exactly at the checkpoint-redo
location then we cannot do that because we would have already
processed some extra records that got added after the redo LSN.

The patch inserts a special WAL record named CHECKPOINT_REDO
WAL, and the checkpoint-redo location is set at the start LSN of
this record.

However for shutdown checkpoint we don't need this special record
because during shutdown there could not be any concurrent WAL
insertion so the shutdown checkpoint record should be at the
checkpoint-redo location.
---
 src/backend/access/rmgrdesc/xlogdesc.c   |  7 +++
 src/backend/access/transam/xlog.c        | 63 ++++++++++++++++++------
 src/backend/replication/logical/decode.c |  1 +
 src/include/catalog/pg_control.h         |  1 +
 4 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index f390c177e4..7868ec7633 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -148,6 +148,10 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 						 LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
 						 timestamptz_to_str(xlrec.overwrite_time));
 	}
+	else if(info == XLOG_CHECKPOINT_REDO)
+	{
+		/* No details to write out */
+	}
 }
 
 const char *
@@ -196,6 +200,9 @@ xlog_identify(uint8 info)
 		case XLOG_FPI_FOR_HINT:
 			id = "FPI_FOR_HINT";
 			break;
+		case XLOG_CHECKPOINT_REDO:
+			id = "CHECKPOINT_REDO";
+			break;
 	}
 
 	return id;
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 60c0b7ec3a..856dc55baf 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -6534,6 +6534,31 @@ CreateCheckPoint(int flags)
 	else
 		checkPoint.oldestActiveXid = InvalidTransactionId;
 
+	/*
+	 * Insert a dummy CHECKPOINT_REDO record and set start LSN of this record
+	 * as checkpoint.redo.  Although we have the checkpoint record that also
+	 * contains the exact redo lsn, there might have been some other records
+	 * those got inserted between the redo lsn and the actual checkpoint
+	 * record.  So when processing the wal, we cannot rely on the checkpoint
+	 * record if we want to stop at the checkpoint-redo LSN.
+	 *
+	 * This special record, however, is not required when we doing a shutdown
+	 * checkpoint, as there will be no concurrent wal insertions during that
+	 * time.  So, the shutdown checkpoint LSN will be the same as
+	 * checkpoint-redo LSN.
+	 */
+	if (!shutdown)
+	{
+		int		dummy = 0;
+
+		XLogBeginInsert();
+		XLogRegisterData((char *) &dummy, sizeof(dummy));
+		XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_REDO);
+
+		/* store CHECKPOINT_REDO record's start location as checkpoint.redo */
+		checkPoint.redo = ProcLastRecPtr;
+	}
+
 	/*
 	 * Get location of last important record before acquiring insert locks (as
 	 * GetLastImportantRecPtr() also locks WAL locks).
@@ -6545,7 +6570,14 @@ CreateCheckPoint(int flags)
 	 * determine the checkpoint REDO pointer.
 	 */
 	WALInsertLockAcquireExclusive();
-	curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
+
+	/*
+	 * For regular checkpoint we have already computed the checkpoint.redo
+	 * location so for shutdown checkpoint set it to the next insert position
+	 * in the wal.
+	 */
+	if (shutdown)
+		curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
 
 	/*
 	 * If this isn't a shutdown or forced checkpoint, and if there has been no
@@ -6582,22 +6614,21 @@ CreateCheckPoint(int flags)
 	checkPoint.fullPageWrites = Insert->fullPageWrites;
 
 	/*
-	 * Compute new REDO record ptr = location of next XLOG record.
-	 *
-	 * NB: this is NOT necessarily where the checkpoint record itself will be,
-	 * since other backends may insert more XLOG records while we're off doing
-	 * the buffer flush work.  Those XLOG records are logically after the
-	 * checkpoint, even though physically before it.  Got that?
+	 * In case of shutdown checkpoint, compute new REDO record
+	 * ptr = location of next XLOG record.
 	 */
-	freespace = INSERT_FREESPACE(curInsert);
-	if (freespace == 0)
+	if (shutdown)
 	{
-		if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
-			curInsert += SizeOfXLogLongPHD;
-		else
-			curInsert += SizeOfXLogShortPHD;
+		freespace = INSERT_FREESPACE(curInsert);
+		if (freespace == 0)
+		{
+			if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
+				curInsert += SizeOfXLogLongPHD;
+			else
+				curInsert += SizeOfXLogShortPHD;
+		}
+		checkPoint.redo = curInsert;
 	}
-	checkPoint.redo = curInsert;
 
 	/*
 	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
@@ -8074,6 +8105,10 @@ xlog_redo(XLogReaderState *record)
 		/* Keep track of full_page_writes */
 		lastFullPageWrites = fpw;
 	}
+	else if (info == XLOG_CHECKPOINT_REDO)
+	{
+		/* nothing to do here */
+	}
 }
 
 /*
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
index 7039d425e2..89978c2fd8 100644
--- a/src/backend/replication/logical/decode.c
+++ b/src/backend/replication/logical/decode.c
@@ -190,6 +190,7 @@ xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 		case XLOG_FPI_FOR_HINT:
 		case XLOG_FPI:
 		case XLOG_OVERWRITE_CONTRECORD:
+		case XLOG_CHECKPOINT_REDO:
 			break;
 		default:
 			elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info);
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index dc953977c5..1136613259 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -78,6 +78,7 @@ typedef struct CheckPoint
 #define XLOG_FPI						0xB0
 /* 0xC0 is used in Postgres 9.5-11 */
 #define XLOG_OVERWRITE_CONTRECORD		0xD0
+#define XLOG_CHECKPOINT_REDO			0xE0
 
 
 /*
-- 
2.34.1

