From 9264635eb097f0f2e85f733d358b67e6d038edad Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Fri, 1 Dec 2023 04:53:30 +0000
Subject: [PATCH v17] Use 64-bit atomics for xlblocks array elements

In AdvanceXLInsertBuffer(), the xlblocks value of a WAL buffer page
is updated only at the end, after the page has been initialized with
all zeros. A problem with this approach is that anyone reading
xlblocks and the WAL buffer page without holding WALBufMappingLock
will see wrong page contents if the read happens before
AdvanceXLInsertBuffer() marks xlblocks with the new entry at the
end.

To fix this issue, xlblocks is changed to use 64-bit atomics instead
of XLogRecPtr, and the xlblocks value is set to InvalidXLogRecPtr
just before the page initialization begins. Only once the page
initialization finishes is the actual end pointer of the newly
initialized page stored in xlblocks. A write barrier is placed
between the InvalidXLogRecPtr update and the page initialization so
that the stores cannot be reordered.

With this fix, one can read xlblocks and WAL buffer page without
WALBufMappingLock in the following manner:

  endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);

  /* Requested WAL isn't available in WAL buffers. */
  if (expectedEndPtr != endptr)
      break;

  page = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
  data = page + ptr % XLOG_BLCKSZ;
  ...
  pg_read_barrier();
  ...
  memcpy(buf, data, bytes_to_read);
  ...
  pg_read_barrier();

  /* Recheck if the page still exists in WAL buffers. */
  endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);

  /* Return if the page got initialized while we were reading it */
  if (expectedEndPtr != endptr)
      break;
---
 src/backend/access/transam/xlog.c | 55 ++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 20 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 6526bd4f43..b3c08f3980 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -501,7 +501,7 @@ typedef struct XLogCtlData
 	 * WALBufMappingLock.
 	 */
 	char	   *pages;			/* buffers for unwritten XLOG pages */
-	XLogRecPtr *xlblocks;		/* 1st byte ptr-s + XLOG_BLCKSZ */
+	pg_atomic_uint64 *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */
 	int			XLogCacheBlck;	/* highest allocated xlog buffer index */
 
 	/*
@@ -1634,20 +1634,19 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
 	 * out to disk and evicted, and the caller is responsible for making sure
 	 * that doesn't happen.
 	 *
-	 * However, we don't hold a lock while we read the value. If someone has
-	 * just initialized the page, it's possible that we get a "torn read" of
-	 * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
-	 * that case we will see a bogus value. That's ok, we'll grab the mapping
-	 * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
-	 * the page we're looking for. But it means that when we do this unlocked
-	 * read, we might see a value that appears to be ahead of the page we're
-	 * looking for. Don't PANIC on that, until we've verified the value while
-	 * holding the lock.
+	 * However, we don't hold a lock while we read the value. If someone is
+	 * just about to initialize or has just initialized the page, it's
+	 * possible that we get InvalidXLogRecPtr. That's ok, we'll grab the
+	 * mapping lock (in AdvanceXLInsertBuffer) and retry if we see anything
+	 * else than the page we're looking for. But it means that when we do this
+	 * unlocked read, we might see a value that appears to be ahead of the
+	 * page we're looking for. Don't PANIC on that, until we've verified the
+	 * value while holding the lock.
 	 */
 	expectedEndPtr = ptr;
 	expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
 
-	endptr = XLogCtl->xlblocks[idx];
+	endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
 	if (expectedEndPtr != endptr)
 	{
 		XLogRecPtr	initializedUpto;
@@ -1678,7 +1677,7 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
 		WALInsertLockUpdateInsertingAt(initializedUpto);
 
 		AdvanceXLInsertBuffer(ptr, tli, false);
-		endptr = XLogCtl->xlblocks[idx];
+		endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
 
 		if (expectedEndPtr != endptr)
 			elog(PANIC, "could not find WAL buffer for %X/%X",
@@ -1865,7 +1864,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
 		 * be zero if the buffer hasn't been used yet).  Fall through if it's
 		 * already written out.
 		 */
-		OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
+		OldPageRqstPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]);
 		if (LogwrtResult.Write < OldPageRqstPtr)
 		{
 			/*
@@ -1934,6 +1933,20 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
 
 		NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
 
+		/*
+		 * Make sure to mark the xlblocks with InvalidXLogRecPtr before the
+		 * initialization of the page begins so that others reading xlblocks
+		 * without holding a lock will know that the page initialization has
+		 * just begun.
+		 */
+		pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], InvalidXLogRecPtr);
+
+		/*
+		 * A write barrier here prevents reordering of the above xlblocks
+		 * atomic write with the page initialization below.
+		 */
+		pg_write_barrier();
+
 		/*
 		 * Be sure to re-zero the buffer so that bytes beyond what we've
 		 * written will look like zeroes and not valid XLOG records...
@@ -1987,8 +2000,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
 		 */
 		pg_write_barrier();
 
-		*((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
-
+		pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], NewPageEndPtr);
 		XLogCtl->InitializedUpTo = NewPageEndPtr;
 
 		npages++;
@@ -2206,7 +2218,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
 		 * if we're passed a bogus WriteRqst.Write that is past the end of the
 		 * last page that's been initialized by AdvanceXLInsertBuffer.
 		 */
-		XLogRecPtr	EndPtr = XLogCtl->xlblocks[curridx];
+		XLogRecPtr	EndPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[curridx]);
 
 		if (LogwrtResult.Write >= EndPtr)
 			elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
@@ -4708,10 +4720,13 @@ XLOGShmemInit(void)
 	 * needed here.
 	 */
 	allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
-	XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
-	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
-	allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
+	XLogCtl->xlblocks = (pg_atomic_uint64 *) allocptr;
+	allocptr += sizeof(pg_atomic_uint64) * XLOGbuffers;
 
+	for (i = 0; i < XLOGbuffers; i++)
+	{
+		pg_atomic_init_u64(&XLogCtl->xlblocks[i], InvalidXLogRecPtr);
+	}
 
 	/* WAL insertion locks. Ensure they're aligned to the full padded size */
 	allocptr += sizeof(WALInsertLockPadded) -
@@ -5748,7 +5763,7 @@ StartupXLOG(void)
 		memcpy(page, endOfRecoveryInfo->lastPage, len);
 		memset(page + len, 0, XLOG_BLCKSZ - len);
 
-		XLogCtl->xlblocks[firstIdx] = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
+		pg_atomic_write_u64(&XLogCtl->xlblocks[firstIdx], endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
 		XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
 	}
 	else
-- 
2.34.1

