From e3dd35828be4dd665cbfbb6ca153fba0011aa0a8 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Sat, 4 Nov 2023 13:48:51 +0000
Subject: [PATCH v15] Use 64-bit atomics for xlblocks array elements

In AdvanceXLInsertBuffer(), the xlblocks value of a WAL buffer
page is updated only at the end, after the page has been
initialized with zeros. A problem with this approach is that
anyone reading xlblocks and the WAL buffer page without holding
WALBufMappingLock can see wrong page contents if the read
happens before the xlblocks entry is updated at the end of
AdvanceXLInsertBuffer().

To fix this issue, xlblocks is changed to an array of 64-bit
atomics instead of plain XLogRecPtr, and the xlblocks value is
set to InvalidXLogRecPtr just before the page initialization
begins. Only once the page initialization finishes is the end
pointer of the newly initialized page stored in xlblocks. A
write barrier is placed between the InvalidXLogRecPtr update of
xlblocks and the page initialization to avoid memory ordering
problems.
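
For illustration, the writer side in AdvanceXLInsertBuffer()
then follows this sequence (an abridged sketch of the code in
the diff below):

  /* Signal that the page initialization has begun. */
  pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], InvalidXLogRecPtr);
  pg_write_barrier();

  /* ... zero the buffer and set up the page header ... */

  pg_write_barrier();

  /* Publish the page only after it is fully initialized. */
  pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], NewPageEndPtr);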

With this fix, one can read xlblocks and a WAL buffer page
without holding WALBufMappingLock in the following manner:

  endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);

  /* Requested WAL isn't available in WAL buffers. */
  if (expectedEndPtr != endptr)
      break;

  page = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
  data = page + ptr % XLOG_BLCKSZ;
  ...
  pg_read_barrier();
  ...
  memcpy(buf, data, bytes_to_read);
  ...
  pg_read_barrier();

  /* Recheck if the page still exists in WAL buffers. */
  endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);

  /* Bail out if the page got reinitialized while we were reading it */
  if (expectedEndPtr != endptr)
      break;
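
XLogWrite() can now find InvalidXLogRecPtr in xlblocks for a
page whose initialization is in progress. In that case it
re-reads the value under WALBufMappingLock (an abridged sketch
of the code in the diff below):

  EndPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[curridx]);

  /* Page initialization is in progress; re-read under the lock. */
  if (unlikely(XLogRecPtrIsInvalid(EndPtr)))
  {
      LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
      EndPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[curridx]);
      LWLockRelease(WALBufMappingLock);
  }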
---
 src/backend/access/transam/xlog.c | 65 +++++++++++++++++++++----------
 1 file changed, 45 insertions(+), 20 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index b541be8eec..1a2ad1a475 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -501,7 +501,7 @@ typedef struct XLogCtlData
 	 * WALBufMappingLock.
 	 */
 	char	   *pages;			/* buffers for unwritten XLOG pages */
-	XLogRecPtr *xlblocks;		/* 1st byte ptr-s + XLOG_BLCKSZ */
+	pg_atomic_uint64 *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */
 	int			XLogCacheBlck;	/* highest allocated xlog buffer index */
 
 	/*
@@ -1634,20 +1634,19 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
 	 * out to disk and evicted, and the caller is responsible for making sure
 	 * that doesn't happen.
 	 *
-	 * However, we don't hold a lock while we read the value. If someone has
-	 * just initialized the page, it's possible that we get a "torn read" of
-	 * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
-	 * that case we will see a bogus value. That's ok, we'll grab the mapping
-	 * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
-	 * the page we're looking for. But it means that when we do this unlocked
-	 * read, we might see a value that appears to be ahead of the page we're
-	 * looking for. Don't PANIC on that, until we've verified the value while
-	 * holding the lock.
+	 * However, we don't hold a lock while we read the value. If someone is
+	 * just about to initialize or has just initialized the page, it's
+	 * possible that we get InvalidXLogRecPtr. That's ok, we'll grab the
+	 * mapping lock (in AdvanceXLInsertBuffer) and retry if we see anything
+	 * other than the page we're looking for. But it means that when we do
+	 * this unlocked read, we might see a value that appears to be ahead of
+	 * the page we're looking for. Don't PANIC on that until we've verified
+	 * the value while holding the lock.
 	 */
 	expectedEndPtr = ptr;
 	expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
 
-	endptr = XLogCtl->xlblocks[idx];
+	endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
 	if (expectedEndPtr != endptr)
 	{
 		XLogRecPtr	initializedUpto;
@@ -1678,7 +1677,7 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
 		WALInsertLockUpdateInsertingAt(initializedUpto);
 
 		AdvanceXLInsertBuffer(ptr, tli, false);
-		endptr = XLogCtl->xlblocks[idx];
+		endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
 
 		if (expectedEndPtr != endptr)
 			elog(PANIC, "could not find WAL buffer for %X/%X",
@@ -1865,7 +1864,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
 		 * be zero if the buffer hasn't been used yet).  Fall through if it's
 		 * already written out.
 		 */
-		OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
+		OldPageRqstPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]);
 		if (LogwrtResult.Write < OldPageRqstPtr)
 		{
 			/*
@@ -1934,6 +1933,15 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
 
 		NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
 
+		/*
+		 * Mark the xlblocks entry with InvalidXLogRecPtr before the page
+		 * initialization begins, so that others reading xlblocks without
+		 * holding a lock will know that the page initialization is in
+		 * progress.
+		 */
+		pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], InvalidXLogRecPtr);
+		pg_write_barrier();
+
 		/*
 		 * Be sure to re-zero the buffer so that bytes beyond what we've
 		 * written will look like zeroes and not valid XLOG records...
@@ -1987,8 +1995,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
 		 */
 		pg_write_barrier();
 
-		*((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
-
+		pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], NewPageEndPtr);
 		XLogCtl->InitializedUpTo = NewPageEndPtr;
 
 		npages++;
@@ -2187,7 +2194,22 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
 		 * if we're passed a bogus WriteRqst.Write that is past the end of the
 		 * last page that's been initialized by AdvanceXLInsertBuffer.
 		 */
-		XLogRecPtr	EndPtr = XLogCtl->xlblocks[curridx];
+		XLogRecPtr	EndPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[curridx]);
+
+		/*
+		 * The xlblocks value can be InvalidXLogRecPtr while the new WAL
+		 * buffer page is being initialized in AdvanceXLInsertBuffer. In that
+		 * case, re-read the xlblocks value under WALBufMappingLock to ensure
+		 * the correct value is read.
+		 */
+		if (unlikely(XLogRecPtrIsInvalid(EndPtr)))
+		{
+			LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
+			EndPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[curridx]);
+			LWLockRelease(WALBufMappingLock);
+		}
+
+		Assert(!XLogRecPtrIsInvalid(EndPtr));
 
 		if (LogwrtResult.Write >= EndPtr)
 			elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
@@ -4675,10 +4697,13 @@ XLOGShmemInit(void)
 	 * needed here.
 	 */
 	allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
-	XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
-	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
-	allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
+	XLogCtl->xlblocks = (pg_atomic_uint64 *) allocptr;
+	allocptr += sizeof(pg_atomic_uint64) * XLOGbuffers;
 
+	for (i = 0; i < XLOGbuffers; i++)
+	{
+		pg_atomic_init_u64(&XLogCtl->xlblocks[i], InvalidXLogRecPtr);
+	}
 
 	/* WAL insertion locks. Ensure they're aligned to the full padded size */
 	allocptr += sizeof(WALInsertLockPadded) -
@@ -5715,7 +5740,7 @@ StartupXLOG(void)
 		memcpy(page, endOfRecoveryInfo->lastPage, len);
 		memset(page + len, 0, XLOG_BLCKSZ - len);
 
-		XLogCtl->xlblocks[firstIdx] = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
+		pg_atomic_write_u64(&XLogCtl->xlblocks[firstIdx], endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
 		XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
 	}
 	else
-- 
2.34.1

