From ba8c73c2aff010703a97b10065ecc978f41cd77d Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Wed, 7 Mar 2018 11:30:11 +0500
Subject: [PATCH 2/2] Physical GiST scan during VACUUM v3

---
 src/backend/access/gist/README       |  55 +++++
 src/backend/access/gist/gistvacuum.c | 337 ++++++++++++++++++++++++++++++-----
 2 files changed, 352 insertions(+), 40 deletions(-)

diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README
index 02228662b8..9548872be8 100644
--- a/src/backend/access/gist/README
+++ b/src/backend/access/gist/README
@@ -413,6 +413,61 @@ emptied yet; tuples never move upwards in the tree. The final emptying loops
 through buffers at a given level until all buffers at that level have been
 emptied, and then moves down to the next level.
 
+Bulk delete algorithm (VACUUM)
+------------------------------
+
+Function gistbulkdelete() is responsible for marking empty leaf pages as free
+so that they can be reused to allocate newly split pages. To find these pages
+the function can choose between two strategies: logical scan or physical scan.
+
+The physical scan reads the entire index from the first page to the last. This
+scan maintains the graph structure of the index in a palloc'ed array to collect
+the block numbers of internal pages that must be cleansed of references to
+empty leaf pages. The array also contains, for each internal page, the offsets
+of entries pointing to potentially free leaf pages. This scan method is chosen
+when maintenance_work_mem is sufficient to hold the necessary graph structure.
+
+The logical scan is chosen when there is not enough maintenance memory to
+execute the physical scan. The logical scan traverses the GiST index
+depth-first, also descending into branches of incomplete splits. The logical
+scan can be slower on hard disk drives because pages are not read sequentially.
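+
+The choice between the two strategies boils down to a memory check. In
+simplified form (arguments elided), the test performed by gistbulkdelete() is:
+
+    /* maintenance_work_mem is measured in kilobytes */
+    if (npages * sizeof(GistPSItem) > maintenance_work_mem * 1024)
+        rescan = gistbulkdeletelogicalscan(...);   /* graph does not fit */
+    else
+        rescan = gistbulkdeletephysicalscan(...);  /* sequential read */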
+
+The result of both scans is the same: a stack of block numbers of internal
+pages, each with a list of offsets potentially referencing empty leaf pages.
+After the scan, each of these internal pages is taken under exclusive lock and
+each potentially free leaf page is examined. gistbulkdelete() never deletes the
+last reference on an internal page, to preserve the balanced tree properties.
+
+The physical scan can return the offsets of empty leaf pages in arbitrary
+order. Thus, before executing PageIndexMultiDelete, the offsets (already locked
+and checked) are sorted. This step is not necessary for the logical scan.
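+
+In pseudocode, the rescan phase looks roughly like this (a simplified sketch;
+locking, WAL-logging and recheck details are omitted):
+
+    for each (blkno, emptyLeafOffsets) produced by the scan:
+        take internal page blkno under exclusive lock
+        recheck that each referenced leaf page is still empty,
+            never deleting the last downlink on the page
+        sort the surviving offsets
+        PageIndexMultiDelete(page, todelete, ntodelete)
+        mark the emptied leaf pages as deleted and WAL-log the changes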
+
+Both scans hold only one lock at a time. The physical scan grabs the exclusive
+lock immediately, while the logical scan takes a shared lock first and then
+upgrades it to exclusive. This is acceptable because the amount of work the
+physical scan does on an internal page is small, and the number of internal
+pages is relatively low compared to the number of leaf pages.
+
 
 Authors:
 	Teodor Sigaev	<teodor@sigaev.ru>
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 213e01202b..e2c37a55a6 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -102,8 +102,9 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 typedef struct GistBDItem
 {
-	GistNSN		parentlsn;
-	BlockNumber blkno;
+	GistNSN		 parentlsn;
+	BlockNumber  blkno;
+	OffsetNumber parentoffset;
 	struct GistBDItem *next;
 } GistBDItem;
 
@@ -128,30 +129,204 @@ pushStackIfSplited(Page page, GistBDItem *stack)
 }
 
 /*
- * Bulk deletion of all index entries pointing to a set of heap tuples and
- * check invalid tuples left after upgrade.
- * The set of target tuples is specified via a callback routine that tells
- * whether any given heap tuple (identified by ItemPointer) is being deleted.
- *
- * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ * During the physical scan, for every parent-child pair we can find either
+ * the parent or the child first. Every time we open an internal page, we
+ * record the parent block number for each child and set GIST_PS_HAS_PARENT.
+ * When the scan reaches a child page that turns out to be empty, we follow
+ * the parent link back. If we find the child first (still without a parent
+ * link), we mark the page GIST_PS_EMPTY_LEAF if it is ready to be deleted;
+ * when we later scan its parent, we add it to the rescan list.
  */
-IndexBulkDeleteResult *
-gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state)
+#define GIST_PS_HAS_PARENT 1
+#define GIST_PS_EMPTY_LEAF 2
+
+
+/* Physical scan item */
+typedef struct GistPSItem
 {
-	Relation	rel = info->index;
-	GistBDItem *stack,
-			   *ptr;
-	BlockNumber recentParent = InvalidBlockNumber;
-	List	   *rescanList = NULL;
-	ListCell   *cell;
+	BlockNumber  parent;
+	List*        emptyLeafOffsets;
+	OffsetNumber parentOffset;
+	uint16       flags;
+} GistPSItem;
+
+/* Block number of an internal page with offsets to rescan for deletion */
+typedef struct GistRescanItem
+{
+	BlockNumber       blkno;
+	List*             emptyLeafOffsets;
+	struct GistRescanItem* next;
+} GistRescanItem;
+
+/* Read all pages sequentially, populating an array of GistPSItem */
+static GistRescanItem*
+gistbulkdeletephysicalscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BlockNumber npages)
+{
+	Relation	     rel = info->index;
+	GistRescanItem *result = NULL;
+	BlockNumber      blkno;
 
-	/* first time through? */
-	if (stats == NULL)
-		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
-	/* we'll re-count the tuples each time */
-	stats->estimated_count = false;
-	stats->num_index_tuples = 0;
+	/* Here we will store the whole graph of the index */
+	GistPSItem *graph = palloc0(npages * sizeof(GistPSItem));
+
+
+	for (blkno = GIST_ROOT_BLKNO; blkno < npages; blkno++)
+	{
+		Buffer		 buffer;
+		Page		 page;
+		OffsetNumber i,
+					 maxoff;
+		IndexTuple   idxtuple;
+		ItemId	     iid;
+
+		vacuum_delay_point();
+
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
+									info->strategy);
+		/*
+		 * We are not going to stay here long and call no recursive algorithms,
+		 * even for an internal page. So, aggressively grab an exclusive lock.
+		 */
+		LockBuffer(buffer, GIST_EXCLUSIVE);
+		page = (Page) BufferGetPage(buffer);
+
+		if (PageIsNew(page) || GistPageIsDeleted(page))
+		{
+			UnlockReleaseBuffer(buffer);
+			/* TODO: Shouldn't we record the free page here? */
+			continue;
+		}
+
+		maxoff = PageGetMaxOffsetNumber(page);
+
+		if (GistPageIsLeaf(page))
+		{
+			OffsetNumber todelete[MaxOffsetNumber];
+			int			ntodelete = 0;
+
+			/*
+			 * Remove deletable tuples from page
+			 */
+
+			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+			{
+				iid = PageGetItemId(page, i);
+				idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+				if (callback(&(idxtuple->t_tid), callback_state))
+					todelete[ntodelete++] = i;
+				else
+					stats->num_index_tuples += 1;
+			}
+
+			stats->tuples_removed += ntodelete;
+
+			/* We have dead tuples on the page */
+			if (ntodelete)
+			{
+				START_CRIT_SECTION();
+
+				MarkBufferDirty(buffer);
+
+				PageIndexMultiDelete(page, todelete, ntodelete);
+				GistMarkTuplesDeleted(page);
+
+				if (RelationNeedsWAL(rel))
+				{
+					XLogRecPtr	recptr;
+
+					recptr = gistXLogUpdate(buffer,
+											todelete, ntodelete,
+											NULL, 0, InvalidBuffer);
+					PageSetLSN(page, recptr);
+				}
+				else
+					PageSetLSN(page, gistGetFakeLSN(rel));
 
+				END_CRIT_SECTION();
+			}
+
+			/* The page is completely empty */
+			if (ntodelete == maxoff)
+			{
+				/* This page is a candidate to be deleted. Remember its parent to rescan it later with an exclusive lock */
+				if (graph[blkno].flags & GIST_PS_HAS_PARENT)
+				{
+					/* Go to parent and append myself */
+					BlockNumber parentblockno = graph[blkno].parent;
+					graph[parentblockno].emptyLeafOffsets = lappend_int(graph[parentblockno].emptyLeafOffsets, (int)graph[blkno].parentOffset);
+				}
+				else
+				{
+					/* Parent will collect me later */
+					graph[blkno].flags |= GIST_PS_EMPTY_LEAF;
+				}
+			}
+		}
+		else
+		{
+			/* For internal pages we remember the structure of the tree */
+			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+			{
+				BlockNumber childblkno;
+				iid = PageGetItemId(page, i);
+				idxtuple = (IndexTuple) PageGetItem(page, iid);
+				childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+				if (graph[childblkno].flags & GIST_PS_EMPTY_LEAF)
+				{
+					/* Child has been scanned earlier and is ready to be picked up */
+					graph[blkno].emptyLeafOffsets = lappend_int(graph[blkno].emptyLeafOffsets, i);
+				}
+				else
+				{
+					/* Remember the parent so the leaf is collected when the scan reaches it */
+					graph[childblkno].parent = blkno;
+					graph[childblkno].parentOffset = i;
+					graph[childblkno].flags |= GIST_PS_HAS_PARENT;
+				}
+
+
+				if (GistTupleIsInvalid(idxtuple))
+					ereport(LOG,
+							(errmsg("index \"%s\" contains an inner tuple marked as invalid",
+									RelationGetRelationName(rel)),
+							 errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
+							 errhint("Please REINDEX it.")));
+			}
+		}
+		UnlockReleaseBuffer(buffer);
+	}
+
+	/* Search for internal pages pointing to empty leaf pages */
+	for (blkno = GIST_ROOT_BLKNO; blkno < npages; blkno++)
+	{
+		if (graph[blkno].emptyLeafOffsets)
+		{
+			GistRescanItem *next = palloc(sizeof(GistRescanItem));
+			next->blkno = blkno;
+			next->emptyLeafOffsets = graph[blkno].emptyLeafOffsets;
+			next->next = result;
+			result = next;
+		}
+	}
+
+	pfree(graph);
+
+	return result;
+}
+
+/* Logical scan descends from the root to the leaves depth-first */
+static GistRescanItem*
+gistbulkdeletelogicalscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
+{
+	Relation        rel = info->index;
+	BlockNumber     recentParent = InvalidBlockNumber;
+	GistBDItem     *stack,
+				   *ptr;
+	GistRescanItem *result = NULL;
+
+	/* This stack is used to organize DFS */
 	stack = (GistBDItem *) palloc0(sizeof(GistBDItem));
 	stack->blkno = GIST_ROOT_BLKNO;
 
@@ -236,11 +411,18 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 				END_CRIT_SECTION();
 			}
 
-			if (ntodelete == maxoff && recentParent!=InvalidBlockNumber &&
-				(rescanList == NULL || (BlockNumber)llast_int(rescanList) != recentParent))
+			if (ntodelete == maxoff && recentParent!=InvalidBlockNumber)
 			{
 				/* This page is a candidate to be deleted. Remember it's parent to rescan it later with xlock */
-				rescanList = lappend_int(rescanList, recentParent);
+				if (result == NULL || result->blkno != recentParent)
+				{
+					GistRescanItem *next = palloc(sizeof(GistRescanItem));
+					next->blkno = recentParent;
+					next->emptyLeafOffsets = NULL;
+					next->next = result;
+					result = next;
+				}
+				result->emptyLeafOffsets = lappend_int(result->emptyLeafOffsets, stack->parentoffset);
 			}
 		}
 		else
@@ -260,6 +442,7 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 				ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
 				ptr->parentlsn = BufferGetLSNAtomic(buffer);
 				ptr->next = stack->next;
+				ptr->parentoffset = i;
 				stack->next = ptr;
 
 				if (GistTupleIsInvalid(idxtuple))
@@ -280,20 +463,82 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 		vacuum_delay_point();
 	}
 
-	/* rescan inner pages that had empty child pages */
-	foreach(cell,rescanList)
+	return result;
+}
+
+/*
+ * qsort comparator used to order offsets for PageIndexMultiDelete.
+ * With the physical scan, rescan offsets may arrive unordered.
+ */
+static int
+compare_offsetnumber(const void *x, const void *y)
+{
+	OffsetNumber a = *((OffsetNumber *)x);
+	OffsetNumber b = *((OffsetNumber *)y);
+	return a - b;
+}
+
+/*
+ * Bulk deletion of all index entries pointing to a set of heap tuples and
+ * check invalid tuples left after upgrade.
+ * The set of target tuples is specified via a callback routine that tells
+ * whether any given heap tuple (identified by ItemPointer) is being deleted.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+IndexBulkDeleteResult *
+gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
+{
+	Relation		rel = info->index;
+	GistRescanItem *rescan;
+	BlockNumber		npages;
+	bool			needLock;
+
+	/* first time through? */
+	if (stats == NULL)
+		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+	/* we'll re-count the tuples each time */
+	stats->estimated_count = false;
+	stats->num_index_tuples = 0;
+
+	/*
+	 * Need lock unless it's local to this backend.
+	 */
+	needLock = !RELATION_IS_LOCAL(rel);
+
+	/* try to find deleted pages */
+	if (needLock)
+		LockRelationForExtension(rel, ExclusiveLock);
+	npages = RelationGetNumberOfBlocks(rel);
+	if (needLock)
+		UnlockRelationForExtension(rel, ExclusiveLock);
+
+	/* If the map of the whole graph fits into maintenance_work_mem, we can read the index sequentially; otherwise fall back to the logical scan */
+	if ((uint64) npages * sizeof(GistPSItem) > (uint64) maintenance_work_mem * 1024)
 	{
-		Buffer		buffer;
-		Page		page;
-		OffsetNumber i,
-					maxoff;
-		IndexTuple	idxtuple;
-		ItemId		iid;
-		OffsetNumber todelete[MaxOffsetNumber];
-		Buffer		buftodelete[MaxOffsetNumber];
-		int			ntodelete = 0;
+		rescan = gistbulkdeletelogicalscan(info, stats, callback, callback_state);
+	}
+	else
+	{
+		rescan = gistbulkdeletephysicalscan(info, stats, callback, callback_state, npages);
+	}
 
-		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, (BlockNumber)lfirst_int(cell),
+	/* rescan inner pages that had empty child pages */
+	while (rescan)
+	{
+		Buffer			 buffer;
+		Page			 page;
+		OffsetNumber 	 i,
+						 maxoff;
+		IndexTuple		 idxtuple;
+		ItemId			 iid;
+		OffsetNumber 	 todelete[MaxOffsetNumber];
+		Buffer			 buftodelete[MaxOffsetNumber];
+		int				 ntodelete = 0;
+		ListCell  		*cell;
+		GistRescanItem	*oldRescan;
+
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, rescan->blkno,
 									RBM_NORMAL, info->strategy);
 		LockBuffer(buffer, GIST_EXCLUSIVE);
 		gistcheckpage(rel, buffer);
@@ -303,11 +548,18 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 
 		maxoff = PageGetMaxOffsetNumber(page);
 
-		for (i = OffsetNumberNext(FirstOffsetNumber); i <= maxoff; i = OffsetNumberNext(i))
+		/* Check that the leaf pages are still empty and decide what to delete */
+		foreach(cell, rescan->emptyLeafOffsets)
 		{
 			Buffer		leafBuffer;
 			Page		leafPage;
 
+			i = (OffsetNumber)lfirst_int(cell);
+			if (i > maxoff)
+			{
+				continue;
+			}
+
 			iid = PageGetItemId(page, i);
 			idxtuple = (IndexTuple) PageGetItem(page, iid);
 
@@ -337,7 +589,9 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 			START_CRIT_SECTION();
 
 			MarkBufferDirty(buffer);
-				PageIndexMultiDelete(page, todelete, ntodelete);
+			/* Sort possibly unordered offsets for PageIndexMultiDelete */
+			qsort(todelete, ntodelete, sizeof(OffsetNumber), compare_offsetnumber);
+			PageIndexMultiDelete(page, todelete, ntodelete);
 
 			if (RelationNeedsWAL(rel))
 			{
@@ -375,11 +629,14 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 		}
 
 		UnlockReleaseBuffer(buffer);
+		oldRescan = rescan;
+		rescan = rescan->next;
+		list_free(oldRescan->emptyLeafOffsets);
+		pfree(oldRescan);
 
 		vacuum_delay_point();
 	}
 
-	list_free(rescanList);
 
 	return stats;
 }
\ No newline at end of file
-- 
2.14.3 (Apple Git-98)

