From 4f8076e5dd749d98eb74d0dfa6cfb127ded1dba0 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Sun, 7 Feb 2021 19:24:03 -0800
Subject: [PATCH v4 2/3] Recycle pages deleted during same VACUUM.

TODO: Respect work_mem in temporary space that remembers the details of
pages deleted during current VACUUM.

Author: Peter Geoghegan <pg@bowt.ie>
Discussion: https://postgr.es/m/CAH2-Wzk76_P=67iUscb1UN44-gyZL-KgpsXbSxq_bdcMa7Q+wQ@mail.gmail.com
---
 src/include/access/nbtree.h         | 25 ++++++++++-
 src/backend/access/nbtree/nbtpage.c | 61 +++++++++++++++----------
 src/backend/access/nbtree/nbtree.c  | 69 ++++++++++++++++++++++-------
 3 files changed, 115 insertions(+), 40 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 44d176255a..0e81ff8356 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -272,6 +272,29 @@ BTPageGetDeleteXid(Page page)
 	return contents->safexid;
 }
 
+/*
+ * BTVacState is nbtree.c state used during VACUUM.  It is exported for use by
+ * page deletion related code in nbtpage.c.
+ */
+typedef struct BTPendingRecycle
+{
+	BlockNumber		  blkno;
+	FullTransactionId safexid;
+} BTPendingRecycle;
+
+typedef struct BTVacState
+{
+	IndexVacuumInfo *info;
+	IndexBulkDeleteResult *stats;
+	IndexBulkDeleteCallback callback;
+	void	   *callback_state;
+	BTCycleId	cycleid;
+	MemoryContext pagedelcontext;
+	BTPendingRecycle *deleted;
+	uint32			 sizedeleted;
+	uint32			 ndeleted;
+} BTVacState;
+
 /*
  *	Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
  *	page.  The high key is not a tuple that is used to visit the heap.  It is
@@ -1142,7 +1165,7 @@ extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
 extern void _bt_delitems_delete_check(Relation rel, Buffer buf,
 									  Relation heapRel,
 									  TM_IndexDeleteOp *delstate);
-extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf);
+extern void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate);
 
 /*
  * prototypes for functions in nbtsearch.c
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index e0d1e585f6..408230cb67 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -50,7 +50,7 @@ static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
 static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
 									 BlockNumber scanblkno,
 									 bool *rightsib_empty,
-									 uint32 *ndeleted);
+									 BTVacState *vstate);
 static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child,
 									BTStack stack,
 									Buffer *subtreeparent,
@@ -1763,20 +1763,22 @@ _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
  * should never pass a buffer containing an existing deleted page here.  The
  * lock and pin on caller's buffer will be dropped before we return.
  *
- * Returns the number of pages successfully deleted (zero if page cannot
- * be deleted now; could be more than one if parent or right sibling pages
- * were deleted too).  Note that this does not include pages that we delete
- * that the btvacuumscan scan has yet to reach; they'll get counted later
- * instead.
+ * Maintains bulk delete stats for caller, which are taken from vstate.  We
+ * need to cooperate closely with caller here so that whole VACUUM operation
+ * reliably avoids any double counting of subsidiary-to-leafbuf pages that we
+ * delete in passing.  If such pages happen to be from a block number that is
+ * ahead of the current scanblkno position, then caller is expected to count
+ * them directly later on.  It's simpler for us to understand caller's
+ * requirements than it would be for caller to understand when or how a
+ * deleted page became deleted after the fact.
  *
  * NOTE: this leaks memory.  Rather than trying to clean up everything
  * carefully, it's better to run it in a temp context that can be reset
  * frequently.
  */
-uint32
-_bt_pagedel(Relation rel, Buffer leafbuf)
+void
+_bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
 {
-	uint32		ndeleted = 0;
 	BlockNumber rightsib;
 	bool		rightsib_empty;
 	Page		page;
@@ -1784,7 +1786,8 @@ _bt_pagedel(Relation rel, Buffer leafbuf)
 
 	/*
 	 * Save original leafbuf block number from caller.  Only deleted blocks
-	 * that are <= scanblkno get counted in ndeleted return value.
+	 * that are <= scanblkno are added to bulk delete stats' pages_deleted
+	 * count.
 	 */
 	BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
 
@@ -1846,7 +1849,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf)
 										 RelationGetRelationName(rel))));
 
 			_bt_relbuf(rel, leafbuf);
-			return ndeleted;
+			return;
 		}
 
 		/*
@@ -1876,7 +1879,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf)
 			Assert(!P_ISHALFDEAD(opaque));
 
 			_bt_relbuf(rel, leafbuf);
-			return ndeleted;
+			return;
 		}
 
 		/*
@@ -1925,8 +1928,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf)
 				if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
 				{
 					ReleaseBuffer(leafbuf);
-					Assert(ndeleted == 0);
-					return ndeleted;
+					return;
 				}
 
 				/* we need an insertion scan key for the search, so build one */
@@ -1967,7 +1969,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf)
 			if (!_bt_mark_page_halfdead(rel, leafbuf, stack))
 			{
 				_bt_relbuf(rel, leafbuf);
-				return ndeleted;
+				return;
 			}
 		}
 
@@ -1982,7 +1984,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf)
 		{
 			/* Check for interrupts in _bt_unlink_halfdead_page */
 			if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
-										  &rightsib_empty, &ndeleted))
+										  &rightsib_empty, vstate))
 			{
 				/*
 				 * _bt_unlink_halfdead_page should never fail, since we
@@ -1993,7 +1995,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf)
 				 * lock and pin on leafbuf for us.
 				 */
 				Assert(false);
-				return ndeleted;
+				return;
 			}
 		}
 
@@ -2030,8 +2032,6 @@ _bt_pagedel(Relation rel, Buffer leafbuf)
 
 		leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
 	}
-
-	return ndeleted;
 }
 
 /*
@@ -2266,9 +2266,10 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
  */
 static bool
 _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
-						 bool *rightsib_empty, uint32 *ndeleted)
+						 bool *rightsib_empty, BTVacState *vstate)
 {
 	BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
+	IndexBulkDeleteResult *stats = vstate->stats;
 	BlockNumber leafleftsib;
 	BlockNumber leafrightsib;
 	BlockNumber target;
@@ -2676,12 +2677,24 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 		_bt_relbuf(rel, buf);
 
 	/*
-	 * If btvacuumscan won't revisit this page in a future btvacuumpage call
-	 * and count it as deleted then, we count it as deleted by current
-	 * btvacuumpage call
+	 * Maintain pages_deleted in a way that takes into account how
+	 * btvacuumpage() will count deleted pages that have yet to become
+	 * scanblkno -- only count the page when it's not going to get that treatment
+	 * later on.
 	 */
 	if (target <= scanblkno)
-		(*ndeleted)++;
+		stats->pages_deleted++;
+
+	if (vstate->ndeleted >= vstate->sizedeleted)
+	{
+		vstate->sizedeleted *= 2;
+		vstate->deleted =
+				repalloc(vstate->deleted,
+						 sizeof(BTPendingRecycle) * vstate->sizedeleted);
+	}
+	vstate->deleted[vstate->ndeleted].blkno = target;
+	vstate->deleted[vstate->ndeleted].safexid = safexid;
+	vstate->ndeleted++;
 
 	return true;
 }
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 1e6da9439f..c3b32bb71c 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -32,23 +32,13 @@
 #include "storage/indexfsm.h"
 #include "storage/ipc.h"
 #include "storage/lmgr.h"
+#include "storage/procarray.h"
 #include "storage/smgr.h"
 #include "utils/builtins.h"
 #include "utils/index_selfuncs.h"
 #include "utils/memutils.h"
 
 
-/* Working state needed by btvacuumpage */
-typedef struct
-{
-	IndexVacuumInfo *info;
-	IndexBulkDeleteResult *stats;
-	IndexBulkDeleteCallback callback;
-	void	   *callback_state;
-	BTCycleId	cycleid;
-	MemoryContext pagedelcontext;
-} BTVacState;
-
 /*
  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
  *
@@ -920,6 +910,34 @@ _bt_page_recyclable(BTPageOpaque opaque, Page page)
 	return false;
 }
 
+/*
+ * _bt_newly_deleted_pages_recycle() -- Are _bt_pagedel pages recyclable now?
+ *
+ * Note that we assume that the array is ordered by safexid.  No further
+ * entries can be safe to recycle once we encounter the first non-recyclable
+ * entry in the deleted array.
+ */
+static inline void
+_bt_newly_deleted_pages_recycle(Relation rel, BTVacState *vstate)
+{
+	IndexBulkDeleteResult *stats = vstate->stats;
+
+	/* Recompute VACUUM XID boundaries */
+	(void) GetOldestNonRemovableTransactionId(NULL);
+
+	for (int i = 0; i < vstate->ndeleted; i++)
+	{
+		BlockNumber		  blkno = vstate->deleted[i].blkno;
+		FullTransactionId safexid = vstate->deleted[i].safexid;
+
+		if (!GlobalVisCheckRemovableFullXid(NULL, safexid))
+			break;
+
+		RecordFreeIndexPage(rel, blkno);
+		stats->pages_free++;
+	}
+}
+
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples.
  * The set of target tuples is specified via a callback routine that tells
@@ -1054,6 +1072,11 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 												  "_bt_pagedel",
 												  ALLOCSET_DEFAULT_SIZES);
 
+	/* Allocate _bt_newly_deleted_pages_recycle related information */
+	vstate.sizedeleted = 16384;
+	vstate.deleted = palloc(sizeof(BTPendingRecycle) * vstate.sizedeleted);
+	vstate.ndeleted = 0;
+
 	/*
 	 * The outer loop iterates over all index pages except the metapage, in
 	 * physical order (we hope the kernel will cooperate in providing
@@ -1122,7 +1145,18 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	 *
 	 * Note that if no recyclable pages exist, we don't bother vacuuming the
 	 * FSM at all.
+	 *
+	 * Before vacuuming the FSM, try to make the most of the pages we
+	 * ourselves deleted: see if they can be recycled already (try to avoid
+	 * waiting until the next VACUUM operation to recycle).  Our approach is
+	 * to check the local array of pages that were newly deleted during this
+	 * VACUUM.
 	 */
+	if (vstate.ndeleted > 0)
+		_bt_newly_deleted_pages_recycle(rel, &vstate);
+
+	pfree(vstate.deleted);
+
 	if (stats->pages_free > 0)
 		IndexFreeSpaceMapVacuum(rel);
 
@@ -1260,6 +1294,13 @@ backtrack:
 		/*
 		 * Already deleted page (which could be leaf or internal).  Can't
 		 * recycle yet.
+		 *
+		 * This is a deleted page that must have been deleted in a previous
+		 * VACUUM operation, that nevertheless cannot be recycled now.  There
+		 * is no good reason to expect that to change any time soon, which is
+		 * why it isn't among the pages that _bt_newly_deleted_pages_recycle
+		 * will consider as candidates to recycle at the end of btvacuumscan
+		 * call.
 		 */
 		stats->pages_deleted++;
 	}
@@ -1479,12 +1520,10 @@ backtrack:
 		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
 
 		/*
-		 * We trust the _bt_pagedel return value because it does not include
-		 * any page that a future call here from btvacuumscan is expected to
-		 * count.  There will be no double-counting.
+		 * _bt_pagedel maintains the bulk delete stats on our behalf
 		 */
 		Assert(blkno == scanblkno);
-		stats->pages_deleted += _bt_pagedel(rel, buf);
+		_bt_pagedel(rel, buf, vstate);
 
 		MemoryContextSwitchTo(oldcontext);
 		/* pagedel released buffer, so we shouldn't */
-- 
2.27.0

