From e5232b8a8e90f60f45aadc813c5f024c9ecd8dab Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Thu, 27 Jul 2023 15:36:01 +0200
Subject: [PATCH v1] Cache btree scan end page across rescans in the same node

If an index is scanned repeatedly for values (e.g. on the inner side of a
nested loop) and the values being looked up are highly correlated, we can
often reuse the previous index scan's last page as the starting point for the
new scan, instead of going through a relatively expensive index descent from
the root.
---
 src/backend/access/nbtree/nbtpage.c   |  18 +++++
 src/backend/access/nbtree/nbtree.c    |   8 +++
 src/backend/access/nbtree/nbtsearch.c | 100 ++++++++++++++++++++++++--
 src/include/access/nbtree.h           |  21 ++++++
 4 files changed, 141 insertions(+), 6 deletions(-)

diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index d78971bfe8..897b6772fc 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -856,6 +856,24 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 	return buf;
 }
 
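+/*
+ *	_bt_getrecentbuf() -- Get an existing block of the relation, trying a
+ *		recently used buffer first.
+ *
+ *		Like _bt_getbuf(), but first tries to re-pin the caller's recent
+ *		buffer with ReadRecentBuffer() to avoid a buffer mapping lookup.
+ *		blkno must be a valid block number, and buf must be the buffer that
+ *		recently held that block; if it has been evicted or reused since,
+ *		we fall back to ReadBuffer().
+ */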
+Buffer
+_bt_getrecentbuf(Relation rel, BlockNumber blkno, Buffer buf, int access)
+{
+	Assert(BlockNumberIsValid(blkno));
+	Assert(BufferIsValid(buf));
+
+	if (!ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno, buf))
+	{
+		/* Read an existing block of the relation */
+		buf = ReadBuffer(rel, blkno);
+	}
+
+	_bt_lockbuf(rel, buf, access);
+	_bt_checkpage(rel, buf);
+
+	return buf;
+}
+
 /*
  *	_bt_allocbuf() -- Allocate a new block/page.
  *
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 4553aaee53..e4ca2f8ecb 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -376,6 +376,11 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 	 */
 	so->currTuples = so->markTuples = NULL;
 
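+	/* initialize state for the rescan end-page cache (see _bt_endscanonpage) */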
+	so->rescans = so->rescanSamePage = 0;
+	so->recentEndPage = InvalidBlockNumber;
+	so->recentBuffer = InvalidBuffer;
+	so->pageCacheValid = false;
+	so->pageCache = (char *) palloc(BLCKSZ);
+
 	scan->xs_itupdesc = RelationGetDescr(rel);
 
 	scan->opaque = so;
@@ -402,6 +407,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 		BTScanPosInvalidate(so->currPos);
 	}
 
+	so->rescans++;
 	so->markItemIndex = -1;
 	so->arrayKeyCount = 0;
 	BTScanPosUnpinIfPinned(so->markPos);
@@ -467,6 +473,8 @@ btendscan(IndexScanDesc scan)
 	/* Release storage */
 	if (so->keyData != NULL)
 		pfree(so->keyData);
+	if (so->pageCache != NULL)
+		pfree(so->pageCache);
 	/* so->arrayKeyData and so->arrayKeys are in arrayContext */
 	if (so->arrayContext != NULL)
 		MemoryContextDelete(so->arrayContext);
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 3230b3b894..67e597b56c 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -47,6 +47,45 @@ static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
 static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
 
+static void _bt_endscanonpage(BTScanOpaque btso, Buffer buf,
+							  BlockNumber endPage, Page page);
+
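+/*
+ * Have at least half of the rescans so far started on the page where the
+ * previous scan ended?  If so, caching the scan's end page is worthwhile.
+ */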
+#define RescanMayHitSamePage(so) \
+	((so)->rescans > 0 && \
+	 ((float) ((so)->rescanSamePage) / (float) ((so)->rescans)) >= 0.5)
+
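+/*
+ * _bt_endscanonpage() -- remember the page a scan ended on.
+ *
+ * Called by _bt_readpage() when it finds that the scan ends on the given
+ * page.  If rescans frequently start on the page where the previous scan
+ * ended, keep a local copy of that page so that a later _bt_first() may be
+ * able to skip the descent from the root.
+ */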
+static void
+_bt_endscanonpage(BTScanOpaque btso, Buffer buf, BlockNumber endPage, Page page)
+{
+	BlockNumber prevEndPage = btso->recentEndPage;
+
+	/*
+	 * If rescans have often (>50% of the time) started on the page the
+	 * previous scan ended on, cache this scan's last page for future use.
+	 */
+	if (RescanMayHitSamePage(btso))
+	{
+		/*
+		 * If the cache already holds a valid copy of this page there is
+		 * nothing to do; otherwise (re)populate it with the current page.
+		 */
+		if (prevEndPage != endPage || !btso->pageCacheValid)
+		{
+			memcpy(btso->pageCache, page, BLCKSZ);
+			btso->pageCacheValid = true;
+			btso->recentBuffer = buf;
+			btso->recentEndPage = endPage;
+		}
+	}
+	else if (prevEndPage != endPage)
+	{
+		btso->recentEndPage = endPage;
+		btso->pageCacheValid = false;
+	}
+}
+
 
 /*
  *	_bt_drop_lock_and_maybe_pin()
@@ -872,7 +911,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 {
 	Relation	rel = scan->indexRelation;
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
-	Buffer		buf;
+	Buffer		buf = InvalidBuffer;
 	BTStack		stack;
 	OffsetNumber offnum;
 	StrategyNumber strat;
@@ -1371,13 +1410,59 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	inskey.keysz = keysCount;
 
 	/*
-	 * Use the manufactured insertion scan key to descend the tree and
-	 * position ourselves on the target leaf page.
+	 * If we've restarted the scan through amrescan, it is quite possible
+	 * that the previous scan ended on the very page we're now trying to
+	 * find.  If this has happened repeatedly, we cache the scan's last page
+	 * so that we may be able to avoid the penalty of traversing the tree
+	 * from the top.
+	 *
+	 * XXX: This may not be concurrency-safe; that needs more thought.
 	 */
-	stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ, scan->xs_snapshot);
+	if (RescanMayHitSamePage(so) && so->pageCacheValid)
+	{
+		Page	page = so->pageCache;
+		BTPageOpaque opaque = BTPageGetOpaque(page);
+
+		/* cached page is a leaf page, and is not empty */
+		Assert(P_ISLEAF(opaque) && PageGetMaxOffsetNumber(page) != 0);
+
+		/*
+		 * If the search key doesn't fall strictly between this page's first
+		 * data item and its high key, a descent from the root might not land
+		 * here, so continue with a normal index descent instead.
+		 */
+		if (_bt_compare(rel, &inskey, page, P_HIKEY) >= 0)
+			goto nocache;
+		if (_bt_compare(rel, &inskey, page, P_FIRSTDATAKEY(opaque)) <= 0)
+			goto nocache;
+
+		buf = _bt_getrecentbuf(rel, so->recentEndPage, so->recentBuffer,
+							   BT_READ);
 
-	/* don't need to keep the stack around... */
-	_bt_freestack(stack);
+		/*
+		 * It is possible the page has split in the meantime, so we may have
+		 * to move right.
+		 */
+		buf = _bt_moveright(rel, NULL, &inskey, buf, false, NULL, BT_READ,
+							scan->xs_snapshot);
+		so->rescanSamePage += 1;
+	}
+
+nocache:
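+	/*
+	 * If we get here without a valid buffer, the cached end page could not
+	 * be used, so fall back to a descent from the root.
+	 */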
+	if (!BufferIsValid(buf))
+	{
+		/*
+		 * Use the manufactured insertion scan key to descend the tree and
+		 * position ourselves on the target leaf page.
+		 */
+		stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ, scan->xs_snapshot);
+
+		/* don't need to keep the stack around... */
+		_bt_freestack(stack);
+
+		if (BufferIsValid(buf) &&
+			BufferGetBlockNumber(buf) == so->recentEndPage)
+			so->rescanSamePage += 1;
+	}
 
 	if (!BufferIsValid(buf))
 	{
@@ -1792,6 +1877,9 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 		so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
 	}
 
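+	/*
+	 * If the scan ends on this page, remember it so that a future rescan
+	 * may be able to start here without descending from the root.
+	 */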
+	if (!continuescan)
+		_bt_endscanonpage(so, so->currPos.buf, so->currPos.currPage, page);
+
 	return (so->currPos.firstItem <= so->currPos.lastItem);
 }
 
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 8891fa7973..b76d433725 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1062,6 +1062,26 @@ typedef struct BTScanOpaqueData
 	char	   *currTuples;		/* tuple storage for currPos */
 	char	   *markTuples;		/* tuple storage for markPos */
 
+	/*
+	 * If we're the inner side of a nested loop join, the parameter values
+	 * coming from the outer side may be correlated, i.e. repeated nearby
+	 * values that fit on nearby leaf pages.  By tracking on which leaf page
+	 * our scans start and end, we can detect this case and (in some cases)
+	 * reduce buffer accesses by a large margin.
+	 *
+	 * TODO: Caching this page locally should be OK, because the cache is
+	 * bound to the lifetime and snapshot of a single query.  Whether it is
+	 * safe to keep using the copy once the underlying page has been updated
+	 * still needs checking.
+	 */
+	int			rescans;		/* number of rescans so far */
+	int			rescanSamePage;	/* rescans that started on the previous
+								 * scan's end page */
+
+	BlockNumber	recentEndPage;	/* end page of the previous scan */
+	Buffer		recentBuffer;	/* buffer that recently held recentEndPage
+								 * (may be stale) */
+	bool		pageCacheValid;	/* is the cached page copy valid? */
+	char	   *pageCache;		/* BLCKSZ copy of the previous scan's last
+								 * page, if pageCacheValid */
+
 	/*
 	 * If the marked position is on the same page as current position, we
 	 * don't use markPos, but just keep the marked itemIndex in markItemIndex
@@ -1207,6 +1227,7 @@ extern void _bt_metaversion(Relation rel, bool *heapkeyspace,
 							bool *allequalimage);
 extern void _bt_checkpage(Relation rel, Buffer buf);
 extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
+extern Buffer _bt_getrecentbuf(Relation rel, BlockNumber blkno, Buffer buffer, int access);
 extern Buffer _bt_allocbuf(Relation rel, Relation heaprel);
 extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
 							   BlockNumber blkno, int access);
-- 
2.40.1

