From b8a946291b130e3d1fe5f92c4d79a20443ca82c2 Mon Sep 17 00:00:00 2001
From: Dilip Kumar <dilip.kumar@enterprisedb.com>
Date: Fri, 25 Aug 2023 05:16:54 +0000
Subject: [PATCH v5] New WAL record for checkpoint redo location

Currently, for non shutdown checkpoints the checkpoint.redo
LSN cannot be accurately detected while processing the WAL.
Although we have a checkpoint WAL record containing the exact
redo LSN, other WAL records may be inserted between the redo
LSN and the actual checkpoint record.

So while processing the WAL, If we want to stop exactly at the
that LSN then we can't do that because we would have already
processed some extra records that got added after the redo LSN.

The patch inserts a special WAL record named CHECKPOINT_REDO
WAL, and the redo location is set at the start LSN of this record.

However for shutdown checkpoint we don't need this special record
because during shutdown there could not be any concurrent WAL
insertion so the shutdown checkpoint record should always be at the
redo location.
---
 .../pg_walinspect/expected/pg_walinspect.out  | 11 ++++
 contrib/pg_walinspect/sql/pg_walinspect.sql   |  7 +++
 src/backend/access/rmgrdesc/xlogdesc.c        |  7 +++
 src/backend/access/transam/xlog.c             | 60 ++++++++++++++-----
 src/backend/replication/logical/decode.c      |  1 +
 src/include/catalog/pg_control.h              |  1 +
 6 files changed, 71 insertions(+), 16 deletions(-)

diff --git a/contrib/pg_walinspect/expected/pg_walinspect.out b/contrib/pg_walinspect/expected/pg_walinspect.out
index a8f4c91060..f49c6b4844 100644
--- a/contrib/pg_walinspect/expected/pg_walinspect.out
+++ b/contrib/pg_walinspect/expected/pg_walinspect.out
@@ -140,6 +140,17 @@ SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn5', :'wal_lsn6')
  t
 (1 row)
 
+-- ===================================================================
+-- Test presence of new REDO record at the checkpoint redo location
+-- ===================================================================
+SELECT redo_lsn FROM pg_control_checkpoint() \gset
+SELECT start_lsn = :'redo_lsn'::pg_lsn AS same_lsn, record_type
+  FROM pg_get_wal_record_info(:'redo_lsn');
+ same_lsn |   record_type   
+----------+-----------------
+ t        | CHECKPOINT_REDO
+(1 row)
+
 -- ===================================================================
 -- Tests for permissions
 -- ===================================================================
diff --git a/contrib/pg_walinspect/sql/pg_walinspect.sql b/contrib/pg_walinspect/sql/pg_walinspect.sql
index f987ca31c4..443702d26e 100644
--- a/contrib/pg_walinspect/sql/pg_walinspect.sql
+++ b/contrib/pg_walinspect/sql/pg_walinspect.sql
@@ -89,6 +89,13 @@ SELECT pg_current_wal_lsn() AS wal_lsn6 \gset
 SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn5', :'wal_lsn6')
   WHERE relfilenode = :'sample_tbl_oid' AND block_fpi_data IS NOT NULL;
 
+-- ===================================================================
+-- Test presence of new REDO record at the checkpoint redo location
+-- ===================================================================
+SELECT redo_lsn FROM pg_control_checkpoint() \gset
+SELECT start_lsn = :'redo_lsn'::pg_lsn AS same_lsn, record_type
+  FROM pg_get_wal_record_info(:'redo_lsn');
+
 -- ===================================================================
 -- Tests for permissions
 -- ===================================================================
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index f390c177e4..7868ec7633 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -148,6 +148,10 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 						 LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
 						 timestamptz_to_str(xlrec.overwrite_time));
 	}
+	else if(info == XLOG_CHECKPOINT_REDO)
+	{
+		/* No details to write out */
+	}
 }
 
 const char *
@@ -196,6 +200,9 @@ xlog_identify(uint8 info)
 		case XLOG_FPI_FOR_HINT:
 			id = "FPI_FOR_HINT";
 			break;
+		case XLOG_CHECKPOINT_REDO:
+			id = "CHECKPOINT_REDO";
+			break;
 	}
 
 	return id;
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f6f8adc72a..77945ed855 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -6481,9 +6481,7 @@ CreateCheckPoint(int flags)
 	XLogRecPtr	recptr;
 	XLogSegNo	_logSegNo;
 	XLogCtlInsert *Insert = &XLogCtl->Insert;
-	uint32		freespace;
 	XLogRecPtr	PriorRedoPtr;
-	XLogRecPtr	curInsert;
 	XLogRecPtr	last_important_lsn;
 	VirtualTransactionId *vxids;
 	int			nvxids;
@@ -6547,6 +6545,31 @@ CreateCheckPoint(int flags)
 	else
 		checkPoint.oldestActiveXid = InvalidTransactionId;
 
+	/*
+	 * Insert a dummy CHECKPOINT_REDO record and set start LSN of this record
+	 * as checkpoint.redo for a non-shutdown checkpoint.  Although we have the
+	 * checkpoint record that also contains the exact redo LSN, there might
+	 * have been some other records those got inserted between the redo LSN and
+	 * the actual checkpoint record.  So when processing the WAL, we cannot
+	 * rely on the checkpoint record if we want to stop at the same position as
+	 * the redo LSN.
+	 *
+	 * This special record is not required when doing a shutdown checkpoint;
+	 * the redo LSN is the same LSN as the checkpoint record as there cannot
+	 * be any WAL activity in a shutdown sequence.
+	 */
+	if (!shutdown)
+	{
+		int		dummy = 0;
+
+		XLogBeginInsert();
+		XLogRegisterData((char *) &dummy, sizeof(dummy));
+		XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_REDO);
+
+		/* store CHECKPOINT_REDO record's start location as checkpoint.redo */
+		checkPoint.redo = ProcLastRecPtr;
+	}
+
 	/*
 	 * Get location of last important record before acquiring insert locks (as
 	 * GetLastImportantRecPtr() also locks WAL locks).
@@ -6558,7 +6581,6 @@ CreateCheckPoint(int flags)
 	 * determine the checkpoint REDO pointer.
 	 */
 	WALInsertLockAcquireExclusive();
-	curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
 
 	/*
 	 * If this isn't a shutdown or forced checkpoint, and if there has been no
@@ -6595,22 +6617,24 @@ CreateCheckPoint(int flags)
 	checkPoint.fullPageWrites = Insert->fullPageWrites;
 
 	/*
-	 * Compute new REDO record ptr = location of next XLOG record.
-	 *
-	 * NB: this is NOT necessarily where the checkpoint record itself will be,
-	 * since other backends may insert more XLOG records while we're off doing
-	 * the buffer flush work.  Those XLOG records are logically after the
-	 * checkpoint, even though physically before it.  Got that?
+	 * In case of shutdown checkpoint, compute new REDO record
+	 * ptr = location of next XLOG record.  For regular checkpoint we have
+	 * already set it to the CHECKPOINT_REDO so nothing to be done for that.
 	 */
-	freespace = INSERT_FREESPACE(curInsert);
-	if (freespace == 0)
+	if (shutdown)
 	{
-		if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
-			curInsert += SizeOfXLogLongPHD;
-		else
-			curInsert += SizeOfXLogShortPHD;
+		XLogRecPtr	curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
+		uint32		freespace = INSERT_FREESPACE(curInsert);
+
+		if (freespace == 0)
+		{
+			if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
+				curInsert += SizeOfXLogLongPHD;
+			else
+				curInsert += SizeOfXLogShortPHD;
+		}
+		checkPoint.redo = curInsert;
 	}
-	checkPoint.redo = curInsert;
 
 	/*
 	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
@@ -8087,6 +8111,10 @@ xlog_redo(XLogReaderState *record)
 		/* Keep track of full_page_writes */
 		lastFullPageWrites = fpw;
 	}
+	else if (info == XLOG_CHECKPOINT_REDO)
+	{
+		/* nothing to do here */
+	}
 }
 
 /*
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
index 7039d425e2..89978c2fd8 100644
--- a/src/backend/replication/logical/decode.c
+++ b/src/backend/replication/logical/decode.c
@@ -190,6 +190,7 @@ xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 		case XLOG_FPI_FOR_HINT:
 		case XLOG_FPI:
 		case XLOG_OVERWRITE_CONTRECORD:
+		case XLOG_CHECKPOINT_REDO:
 			break;
 		default:
 			elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info);
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index dc953977c5..1136613259 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -78,6 +78,7 @@ typedef struct CheckPoint
 #define XLOG_FPI						0xB0
 /* 0xC0 is used in Postgres 9.5-11 */
 #define XLOG_OVERWRITE_CONTRECORD		0xD0
+#define XLOG_CHECKPOINT_REDO			0xE0
 
 
 /*
-- 
2.39.2 (Apple Git-143)

