From 35589fbefd488b9070c891f5dcfc7caab1b0a980 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Sat, 4 Nov 2023 14:21:43 +0000
Subject: [PATCH v15] Allow WAL reading from WAL buffers

This commit adds WALRead() the capability to read WAL from WAL
buffers when possible. When requested WAL isn't available in WAL
buffers, the WAL is read from the WAL file as usual.

This commit benefits the callers of WALRead(), that are walsenders
and pg_walinspect. They can now avoid reading WAL from the WAL
file (possibly avoiding disk IO). Tests show that the WAL buffers
hit ratio stood at 95% for 1 primary, 1 sync standby, 1 async
standby, with pgbench --scale=300 --client=32 --time=900. In other
words, the walsenders avoided 95% of the time reading from the
file/avoided pread system calls:
https://www.postgresql.org/message-id/CALj2ACXKKK%3DwbiG5_t6dGao5GoecMwRkhr7GjVBM_jg54%2BNa%3DQ%40mail.gmail.com

This commit also benefits when direct IO is enabled for WAL.
Reading WAL from WAL buffers puts back the performance close to
that of without direct IO for WAL:
https://www.postgresql.org/message-id/CALj2ACV6rS%2B7iZx5%2BoAvyXJaN4AG-djAQeM1mrM%3DYSDkVrUs7g%40mail.gmail.com

This commit paves the way for the following features in future:
- Improves synchronous replication performance by replicating
directly from WAL buffers.
- A opt-in way for the walreceivers to receive unflushed WAL.
More details here:
https://www.postgresql.org/message-id/20231011224353.cl7c2s222dw3de4j%40awork3.anarazel.de

Author: Bharath Rupireddy
Reviewed-by: Dilip Kumar, Andres Freund
Reviewed-by: Nathan Bossart, Kuntal Ghosh
Discussion: https://www.postgresql.org/message-id/CALj2ACXKKK%3DwbiG5_t6dGao5GoecMwRkhr7GjVBM_jg54%2BNa%3DQ%40mail.gmail.com
---
 src/backend/access/transam/xlog.c       | 170 ++++++++++++++++++++++++
 src/backend/access/transam/xlogreader.c |  41 +++++-
 src/include/access/xlog.h               |   6 +
 3 files changed, 215 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 1a2ad1a475..1df74d8f48 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -1705,6 +1705,176 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
 	return cachedPos + ptr % XLOG_BLCKSZ;
 }
 
+/*
+ * Read WAL from WAL buffers.
+ *
+ * Read 'count' bytes of WAL from WAL buffers into 'buf', starting at location
+ * 'startptr', on timeline 'tli' and return total read bytes.
+ *
+ * This function returns quickly in the following cases:
+ * - When passed-in timeline is different than server's current insertion
+ * timeline as WAL is always inserted into WAL buffers on insertion timeline.
+ *
+ * - When server is in recovery as WAL buffers aren't currently used in
+ * recovery.
+ *
+ * Note that this function reads as much as it can from WAL buffers, meaning,
+ * it may not read all the requested 'count' bytes. Caller must be aware of
+ * this and deal with it.
+ *
+ * Note that function reads WAL from WAL buffers without holding any lock.
+ * First it reads xlblocks atomically for checking page existence, then it
+ * reads the page contents, validates. Finally, it rechecks the page existence
+ * by rereading xlblocks, if the read page is replaced, it discards read page
+ * and returns.
+ *
+ * Note that this function is not available for frontend code as WAL buffers is
+ * an internal mechanism to the server.
+ */
+Size
+XLogReadFromBuffers(XLogReaderState *state,
+					XLogRecPtr startptr,
+					TimeLineID tli,
+					Size count,
+					char *buf)
+{
+	XLogRecPtr	ptr;
+	XLogRecPtr	cur_lsn;
+	Size		nbytes;
+	Size		ntotal;
+	char	   *dst;
+
+	if (RecoveryInProgress())
+		return 0;
+
+	if (tli != GetWALInsertionTimeLine())
+		return 0;
+
+	Assert(!XLogRecPtrIsInvalid(startptr));
+
+	cur_lsn = GetFlushRecPtr(NULL);
+	if (unlikely(startptr > cur_lsn))
+		elog(ERROR, "WAL start LSN %X/%X specified for reading from WAL buffers must be less than current database system WAL LSN %X/%X",
+			 LSN_FORMAT_ARGS(startptr), LSN_FORMAT_ARGS(cur_lsn));
+
+	ptr = startptr;
+	nbytes = count;				/* Total bytes requested to be read by caller. */
+	ntotal = 0;					/* Total bytes read. */
+	dst = buf;
+
+	while (nbytes > 0)
+	{
+		XLogRecPtr	expectedEndPtr;
+		XLogRecPtr	endptr;
+		int			idx;
+		char	   *page;
+		char	   *data;
+		XLogPageHeader phdr;
+		Size		nread;
+
+		idx = XLogRecPtrToBufIdx(ptr);
+		expectedEndPtr = ptr;
+		expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
+		endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
+
+		/* Requested WAL isn't available in WAL buffers. */
+		if (expectedEndPtr != endptr)
+			break;
+
+		/*
+		 * We found WAL buffer page containing given XLogRecPtr. Get starting
+		 * address of the page and a pointer to the right location of given
+		 * XLogRecPtr in that page.
+		 */
+		page = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
+		data = page + ptr % XLOG_BLCKSZ;
+
+		/*
+		 * Make sure to not read a page that just got initialized. Return, if
+		 * WAL buffer page doesn't look valid.
+		 */
+		phdr = (XLogPageHeader) page;
+		if (!(phdr->xlp_magic == XLOG_PAGE_MAGIC &&
+			  phdr->xlp_pageaddr == (ptr - (ptr % XLOG_BLCKSZ)) &&
+			  phdr->xlp_tli == tli))
+			break;
+
+		/*
+		 * Note that we don't perform all page header checks here to avoid
+		 * extra work in production builds; callers will anyway do those
+		 * checks extensively. However, in an assert-enabled build, we perform
+		 * all the checks here and raise an error if failed.
+		 */
+#ifdef USE_ASSERT_CHECKING
+		if (unlikely(state != NULL &&
+					 !XLogReaderValidatePageHeader(state, (endptr - XLOG_BLCKSZ),
+												   (char *) phdr)))
+		{
+			if (state->errormsg_buf[0])
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg_internal("%s", state->errormsg_buf)));
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg_internal("could not read WAL from WAL buffers")));
+		}
+#endif
+
+		/* Make sure we don't read the page contents before xlblocks. */
+		pg_read_barrier();
+
+		nread = 0;
+
+		/* Read what is wanted, not the whole page. */
+		if ((data + nbytes) <= (page + XLOG_BLCKSZ))
+		{
+			/* All the bytes are in one page. */
+			nread = nbytes;
+		}
+		else
+		{
+			/*
+			 * All the bytes are not in one page. Read available bytes on the
+			 * current page, copy them over to output buffer and continue to
+			 * read remaining bytes.
+			 */
+			nread = XLOG_BLCKSZ - (data - page);
+			Assert(nread > 0 && nread <= nbytes);
+		}
+
+		Assert(nread > 0);
+		memcpy(dst, data, nread);
+
+		/* Make sure we don't read xlblocks before the page contents. */
+		pg_read_barrier();
+
+		/* Recheck if the read page still exists in WAL buffers. */
+		endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
+
+		/* Return if the page got initalized while we were reading it. */
+		if (expectedEndPtr != endptr)
+			break;
+
+		dst += nread;
+		ptr += nread;
+		ntotal += nread;
+		nbytes -= nread;
+	}
+
+	/* We never read more than what the caller has asked for. */
+	Assert(ntotal <= count);
+
+#ifdef WAL_DEBUG
+	if (XLOG_DEBUG)
+		ereport(DEBUG1,
+				(errmsg_internal("read %zu bytes out of %zu bytes from WAL buffers for given start LSN %X/%X, timeline ID %u",
+								 ntotal, count, LSN_FORMAT_ARGS(startptr), tli)));
+#endif
+
+	return ntotal;
+}
+
 /*
  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
  * is the position starting from the beginning of WAL, excluding all WAL
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index e0baa86bd3..5820c5eedc 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -1473,8 +1473,10 @@ err:
  * Returns true if succeeded, false if an error occurs, in which case
  * 'errinfo' receives error details.
  *
- * XXX probably this should be improved to suck data directly from the
- * WAL buffers when possible.
+ * When possible, this function reads data directly from WAL buffers. When
+ * requested WAL isn't available in WAL buffers, the WAL is read from the WAL
+ * file as usual. The callers may avoid reading WAL from the WAL file thus
+ * reducing read system calls or even disk IOs.
  */
 bool
 WALRead(XLogReaderState *state,
@@ -1484,6 +1486,41 @@ WALRead(XLogReaderState *state,
 	char	   *p;
 	XLogRecPtr	recptr;
 	Size		nbytes;
+#ifndef FRONTEND
+	Size		nread;
+#endif
+
+#ifndef FRONTEND
+
+	/*
+	 * Try reading WAL from WAL buffers. Frontend code has no idea of WAL
+	 * buffers.
+	 */
+	nread = XLogReadFromBuffers(state, startptr, tli, count, buf);
+
+	Assert(nread >= 0);
+
+	/*
+	 * Check if we have read fully (hit), partially (partial hit) or nothing
+	 * (miss) from WAL buffers. If we have read either partially or nothing,
+	 * then continue to read the remaining bytes the usual way, that is, read
+	 * from WAL file.
+	 *
+	 * XXX: It might be worth to expose WAL buffer read stats.
+	 */
+	if (count == nread)
+		return true;			/* Buffer hit, so return. */
+	else if (count > nread)
+	{
+		/*
+		 * Buffer partial hit, so reset the state to count the read bytes and
+		 * continue.
+		 */
+		buf += nread;
+		startptr += nread;
+		count -= nread;
+	}
+#endif
 
 	p = buf;
 	recptr = startptr;
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index a14126d164..18167c36b4 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -251,6 +251,12 @@ extern XLogRecPtr GetLastImportantRecPtr(void);
 
 extern void SetWalWriterSleeping(bool sleeping);
 
+extern Size XLogReadFromBuffers(struct XLogReaderState *state,
+								XLogRecPtr startptr,
+								TimeLineID tli,
+								Size count,
+								char *buf);
+
 /*
  * Routines used by xlogrecovery.c to call back into xlog.c during recovery.
  */
-- 
2.34.1