From 4ce6b43685db913c30f863a87126410c690c4e75 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Fri, 13 Jul 2018 22:12:17 +0400
Subject: [PATCH 1/2] Delete pages during GiST VACUUM v9

---
 src/backend/access/gist/README       |  35 ++++++++++
 src/backend/access/gist/gist.c       |  18 ++++++
 src/backend/access/gist/gistbuild.c  |   5 --
 src/backend/access/gist/gistutil.c   |   3 +-
 src/backend/access/gist/gistvacuum.c | 122 +++++++++++++++++++++++++++++++++--
 src/backend/access/gist/gistxlog.c   |  63 +++++++++++++++++-
 src/include/access/gist.h            |   3 +
 src/include/access/gist_private.h    |  24 +++++--
 src/include/access/gistxlog.h        |  17 ++++-
 src/test/regress/expected/gist.out   |   4 +-
 src/test/regress/sql/gist.sql        |   4 +-
 11 files changed, 272 insertions(+), 26 deletions(-)

diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README
index 02228662b8..9548872be8 100644
--- a/src/backend/access/gist/README
+++ b/src/backend/access/gist/README
@@ -413,6 +413,41 @@ emptied yet; tuples never move upwards in the tree. The final emptying loops
 through buffers at a given level until all buffers at that level have been
 emptied, and then moves down to the next level.
 
+Bulk delete algorithm (VACUUM)
+------------------------------
+
+Function gistbulkdelete() is responsible for marking empty leaf pages as free
+so that they can be used it allocate newly split pages. To find this pages
+function can choose between two strategies: logical scan or physical scan.
+
+Physical scan reads the entire index from the first page to last. This scan
+maintains graph structure in palloc'ed array to collect block numbers of
+internal pages that need cleansing from references to empty leafs. Also, the
+array contains offsets on the internal page to potentially free leaf page. This
+scan method is chosen when maintenance work memory is sufficient to hold
+necessary graph structure.
+
+The logical scan is chosen when there is not enough maintenance memory to
+execute the physical scan. Logical scan traverses GiST index in DFS, looking up
+into incomplete split branches. The logical scan can be slower on hard disk
+drives.
+
+The result of both scans are the same: the stack of block numbers of internal
+pages with the list of offsets potentially referencing empty leaf pages. After
+the scan, for each internal pages under exclusive lock, each potentially free
+leaf page is examined. gistbulkdelete() never delete last one reference from
+internal page to preserve balanced tree properties.
+
+The physical scan can return empty leaf pages offsets unordered. Thus, before
+executing PageIndexMultiDelete offsets (already locked and checked) are sorted.
+This step is not necessary for the logical scan.
+
+Both scans hold only one lock at a time. Physical scan grabs exclusive lock
+instantly, while logical scan takes shared lock and then swaps it to exclusive.
+This is done because amount of work on internal page done by physical scan is
+lower and amount of internal pages is relatively low compared to the amount of
+leaf pages.
+
 
 Authors:
 	Teodor Sigaev	<teodor@sigaev.ru>
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 8a42effdf7..5297e1691d 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -700,6 +700,11 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
 			GISTInsertStack *item;
 			OffsetNumber downlinkoffnum;
 
+			/*
+			 * Currently internal pages are not deleted during vacuum,
+			 * so we do not need to check if page is deleted
+			 */
+
 			downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate);
 			iid = PageGetItemId(stack->page, downlinkoffnum);
 			idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
@@ -834,6 +839,19 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
 				}
 			}
 
+			/*
+			 * Leaf pages can be left deleted but still referenced 
+			 * until it's space is reused. Downlink to this page may be already
+			 * removed from the internal page, but this scan can posess it.
+			 */
+			if(GistPageIsDeleted(stack->page))
+			{
+				UnlockReleaseBuffer(stack->buffer);
+				xlocked = false;
+				state.stack = stack = stack->parent;
+				continue;
+			}
+
 			/* now state.stack->(page, buffer and blkno) points to leaf page */
 
 			gistinserttuple(&state, stack, giststate, itup,
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 434f15f014..f26f139a9e 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -1126,11 +1126,6 @@ gistGetMaxLevel(Relation index)
  * but will be added there the first time we visit them.
  */
 
-typedef struct
-{
-	BlockNumber childblkno;		/* hash key */
-	BlockNumber parentblkno;
-} ParentMapEntry;
 
 static void
 gistInitParentMap(GISTBuildState *buildstate)
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 12804c321c..41978bb5e5 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -23,6 +23,7 @@
 #include "storage/lmgr.h"
 #include "utils/builtins.h"
 #include "utils/syscache.h"
+#include "utils/snapmgr.h"
 
 
 /*
@@ -806,7 +807,7 @@ gistNewBuffer(Relation r)
 
 			gistcheckpage(r, buffer);
 
-			if (GistPageIsDeleted(page))
+			if (GistPageIsDeleted(page) && TransactionIdPrecedes(GistPageGetDeleteXid(page), RecentGlobalDataXmin))
 				return buffer;	/* OK to use */
 
 			LockBuffer(buffer, GIST_UNLOCK);
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 5948218c77..a96d91eb1d 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -16,10 +16,13 @@
 
 #include "access/genam.h"
 #include "access/gist_private.h"
+#include "access/transam.h"
 #include "commands/vacuum.h"
 #include "miscadmin.h"
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
+#include "utils/snapmgr.h"
+#include "access/xact.h"
 
 
 /*
@@ -125,7 +128,6 @@ pushStackIfSplited(Page page, GistBDItem *stack)
 	}
 }
 
-
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples and
  * check invalid tuples left after upgrade.
@@ -135,12 +137,14 @@ pushStackIfSplited(Page page, GistBDItem *stack)
  * Result: a palloc'd struct containing statistical info for VACUUM displays.
  */
 IndexBulkDeleteResult *
-gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
-			   IndexBulkDeleteCallback callback, void *callback_state)
+gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state)
 {
 	Relation	rel = info->index;
 	GistBDItem *stack,
 			   *ptr;
+	BlockNumber recentParent = InvalidBlockNumber;
+	List	   *rescanList = NULL;
+	ListCell   *cell;
 
 	/* first time through? */
 	if (stats == NULL)
@@ -233,9 +237,16 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 				END_CRIT_SECTION();
 			}
 
+			if (ntodelete == maxoff && recentParent!=InvalidBlockNumber &&
+				(rescanList == NULL || (BlockNumber)llast_int(rescanList) != recentParent))
+			{
+				/* This page is a candidate to be deleted. Remember it's parent to rescan it later with xlock */
+				rescanList = lappend_int(rescanList, recentParent);
+			}
 		}
 		else
 		{
+			recentParent = stack->blkno;
 			/* check for split proceeded after look at parent */
 			pushStackIfSplited(page, stack);
 
@@ -270,5 +281,108 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 		vacuum_delay_point();
 	}
 
+	/* rescan inner pages that had empty child pages */
+	foreach(cell,rescanList)
+	{
+		Buffer		buffer;
+		Page		page;
+		OffsetNumber i,
+					maxoff;
+		IndexTuple	idxtuple;
+		ItemId		iid;
+		OffsetNumber todelete[MaxOffsetNumber];
+		Buffer		buftodelete[MaxOffsetNumber];
+		int			ntodelete = 0;
+
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, (BlockNumber)lfirst_int(cell),
+									RBM_NORMAL, info->strategy);
+		LockBuffer(buffer, GIST_EXCLUSIVE);
+		gistcheckpage(rel, buffer);
+		page = (Page) BufferGetPage(buffer);
+
+		Assert(!GistPageIsLeaf(page));
+
+		maxoff = PageGetMaxOffsetNumber(page);
+
+		for (i = OffsetNumberNext(FirstOffsetNumber); i <= maxoff; i = OffsetNumberNext(i))
+		{
+			Buffer		leafBuffer;
+			Page		leafPage;
+
+			iid = PageGetItemId(page, i);
+			idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+			leafBuffer = ReadBufferExtended(rel, MAIN_FORKNUM, ItemPointerGetBlockNumber(&(idxtuple->t_tid)),
+								RBM_NORMAL, info->strategy);
+			LockBuffer(leafBuffer, GIST_EXCLUSIVE);
+			gistcheckpage(rel, leafBuffer);
+			leafPage = (Page) BufferGetPage(leafBuffer);
+			Assert(GistPageIsLeaf(leafPage));
+
+			if (PageGetMaxOffsetNumber(leafPage) == InvalidOffsetNumber /* Nothing left to split */
+				&& !(GistFollowRight(leafPage) || GistPageGetNSN(page) < GistPageGetNSN(leafPage)) /* No follow-right */
+				&& ntodelete < maxoff-1) /* We must keep at least one leaf page per each */
+			{
+				buftodelete[ntodelete] = leafBuffer;
+				todelete[ntodelete++] = i;
+			}
+			else
+				UnlockReleaseBuffer(leafBuffer);
+		}
+
+
+		if (ntodelete)
+		{
+			/*
+			 * Like in _bt_unlink_halfdead_page we need a upper bound on xid
+			 * that could hold downlinks to this page. We use
+			 * ReadNewTransactionId() to instead of GetCurrentTransactionId
+			 * since we are in a VACUUM.
+			 */
+			TransactionId txid = ReadNewTransactionId();
+
+			START_CRIT_SECTION();
+
+			/* Mark pages as deleted dropping references from internal pages */
+			for (i = 0; i < ntodelete; i++)
+			{
+				Page		leafPage = (Page)BufferGetPage(buftodelete[i]);
+
+				GistPageSetDeleteXid(leafPage,txid);
+
+				GistPageSetDeleted(leafPage);
+				MarkBufferDirty(buftodelete[i]);
+				stats->pages_deleted++;
+
+				MarkBufferDirty(buffer);
+				/* Offsets are changed as long as we delete tuples from internal page */
+				PageIndexTupleDelete(page, todelete[i] - i);
+
+				if (RelationNeedsWAL(rel))
+				{
+					XLogRecPtr recptr =
+						gistXLogSetDeleted(rel->rd_node, buftodelete[i],
+						GistPageGetDeleteXid(leafPage), buffer, todelete[i] - i);
+					PageSetLSN(page, recptr);
+					PageSetLSN(leafPage, recptr);
+				}
+				else
+				{
+					PageSetLSN(page, gistGetFakeLSN(rel));
+					PageSetLSN(leafPage, gistGetFakeLSN(rel));
+				}
+
+				UnlockReleaseBuffer(buftodelete[i]);
+			}
+			END_CRIT_SECTION();
+		}
+
+		UnlockReleaseBuffer(buffer);
+
+		vacuum_delay_point();
+	}
+
+	list_free(rescanList);
+
 	return stats;
-}
+}
\ No newline at end of file
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 1e09126978..a48a8341c0 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -60,6 +60,39 @@ gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id)
 		UnlockReleaseBuffer(buffer);
 }
 
+static void
+gistRedoPageSetDeleted(XLogReaderState *record)
+{
+	XLogRecPtr	lsn = record->EndRecPtr;
+	gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record);
+	Buffer		buffer;
+	Page		page;
+
+	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+	{
+		page = (Page) BufferGetPage(buffer);
+
+		GistPageSetDeleteXid(page, xldata->id);
+		GistPageSetDeleted(page);
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+
+		UnlockReleaseBuffer(buffer);
+	}
+
+	if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
+	{
+		page = (Page) BufferGetPage(buffer);
+
+		PageIndexTupleDelete(page, xldata->downlinkOffset);
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+
+		UnlockReleaseBuffer(buffer);
+	}
+}
 /*
  * redo any page update (except page split)
  */
@@ -112,8 +145,8 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
 			data += sizeof(OffsetNumber) * xldata->ntodelete;
 
 			PageIndexMultiDelete(page, todelete, xldata->ntodelete);
-			if (GistPageIsLeaf(page))
-				GistMarkTuplesDeleted(page);
+
+			GistMarkTuplesDeleted(page);
 		}
 
 		/* Add new tuples if any */
@@ -324,6 +357,9 @@ gist_redo(XLogReaderState *record)
 		case XLOG_GIST_CREATE_INDEX:
 			gistRedoCreateIndex(record);
 			break;
+		case XLOG_GIST_PAGE_DELETE:
+			gistRedoPageSetDeleted(record);
+			break;
 		default:
 			elog(PANIC, "gist_redo: unknown op code %u", info);
 	}
@@ -442,6 +478,29 @@ gistXLogSplit(bool page_is_leaf,
 	return recptr;
 }
 
+/*
+ * Write XLOG record describing a page delete. This also includes removal of
+ * downlink from internal page.
+ */
+XLogRecPtr
+gistXLogSetDeleted(RelFileNode node, Buffer buffer, TransactionId xid,
+					Buffer internalPageBuffer, OffsetNumber internalPageOffset) {
+	gistxlogPageDelete xlrec;
+	XLogRecPtr	recptr;
+
+	xlrec.id = xid;
+	xlrec.downlinkOffset = internalPageBuffer;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageDelete));
+
+	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+	XLogRegisterBuffer(1, internalPageBuffer, REGBUF_STANDARD);
+	/* new tuples */
+	recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_DELETE);
+	return recptr;
+}
+
 /*
  * Write XLOG record describing a page update. The update can include any
  * number of deletions and/or insertions of tuples on a single index page.
diff --git a/src/include/access/gist.h b/src/include/access/gist.h
index 827566dc6e..0dd2bf47c8 100644
--- a/src/include/access/gist.h
+++ b/src/include/access/gist.h
@@ -151,6 +151,9 @@ typedef struct GISTENTRY
 #define GistPageGetNSN(page) ( PageXLogRecPtrGet(GistPageGetOpaque(page)->nsn))
 #define GistPageSetNSN(page, val) ( PageXLogRecPtrSet(GistPageGetOpaque(page)->nsn, val))
 
+#define GistPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid )
+#define GistPageSetDeleteXid(page, val) ( ((PageHeader) (page))->pd_prune_xid = val)
+
 /*
  * Vector of GISTENTRY structs; user-defined methods union and picksplit
  * take it as one of their arguments
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 36ed7244ba..1f82695b1d 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -16,6 +16,7 @@
 
 #include "access/amapi.h"
 #include "access/gist.h"
+#include "access/gistxlog.h"
 #include "access/itup.h"
 #include "fmgr.h"
 #include "lib/pairingheap.h"
@@ -51,6 +52,11 @@ typedef struct
 	char		tupledata[FLEXIBLE_ARRAY_MEMBER];
 } GISTNodeBufferPage;
 
+typedef struct
+{
+	BlockNumber childblkno;		/* hash key */
+	BlockNumber parentblkno;
+} ParentMapEntry;
 #define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata))
 /* Returns free space in node buffer page */
 #define PAGE_FREE_SPACE(nbp) (nbp->freespace)
@@ -176,13 +182,6 @@ typedef struct GISTScanOpaqueData
 
 typedef GISTScanOpaqueData *GISTScanOpaque;
 
-/* despite the name, gistxlogPage is not part of any xlog record */
-typedef struct gistxlogPage
-{
-	BlockNumber blkno;
-	int			num;			/* number of index tuples following */
-} gistxlogPage;
-
 /* SplitedPageLayout - gistSplit function result */
 typedef struct SplitedPageLayout
 {
@@ -409,6 +408,17 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
 extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
 		  int len, GISTSTATE *giststate);
 
+/* gistxlog.c */
+extern void gist_redo(XLogReaderState *record);
+extern void gist_desc(StringInfo buf, XLogReaderState *record);
+extern const char *gist_identify(uint8 info);
+extern void gist_xlog_startup(void);
+extern void gist_xlog_cleanup(void);
+
+extern XLogRecPtr gistXLogSetDeleted(RelFileNode node, Buffer buffer,
+					TransactionId xid, Buffer internalPageBuffer,
+					OffsetNumber internalPageOffset);
+
 extern XLogRecPtr gistXLogUpdate(Buffer buffer,
 			   OffsetNumber *todelete, int ntodelete,
 			   IndexTuple *itup, int ntup,
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index 1a2b9496d0..f93024ab25 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -17,12 +17,14 @@
 #include "access/xlogreader.h"
 #include "lib/stringinfo.h"
 
+/* XLog stuff */
+
 #define XLOG_GIST_PAGE_UPDATE		0x00
  /* #define XLOG_GIST_NEW_ROOT			 0x20 */	/* not used anymore */
 #define XLOG_GIST_PAGE_SPLIT		0x30
  /* #define XLOG_GIST_INSERT_COMPLETE	 0x40 */	/* not used anymore */
 #define XLOG_GIST_CREATE_INDEX		0x50
- /* #define XLOG_GIST_PAGE_DELETE		 0x60 */	/* not used anymore */
+#define XLOG_GIST_PAGE_DELETE		 0x60
 
 /*
  * Backup Blk 0: updated page.
@@ -59,6 +61,19 @@ typedef struct gistxlogPageSplit
 	 */
 } gistxlogPageSplit;
 
+typedef struct gistxlogPageDelete
+{
+   TransactionId id;
+   OffsetNumber downlinkOffset; /* Offset of the downlink referencing this page */
+} gistxlogPageDelete;
+
+/* despite the name, gistxlogPage is not part of any xlog record */
+typedef struct gistxlogPage
+{
+   BlockNumber blkno;
+   int			num;			/* number of index tuples following */
+} gistxlogPage;
+
 extern void gist_redo(XLogReaderState *record);
 extern void gist_desc(StringInfo buf, XLogReaderState *record);
 extern const char *gist_identify(uint8 info);
diff --git a/src/test/regress/expected/gist.out b/src/test/regress/expected/gist.out
index f5a2993aaf..5b92f08c74 100644
--- a/src/test/regress/expected/gist.out
+++ b/src/test/regress/expected/gist.out
@@ -27,9 +27,7 @@ insert into gist_point_tbl (id, p)
 select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
 -- To test vacuum, delete some entries from all over the index.
 delete from gist_point_tbl where id % 2 = 1;
--- And also delete some concentration of values. (GiST doesn't currently
--- attempt to delete pages even when they become empty, but if it did, this
--- would exercise it)
+-- And also delete some concentration of values.
 delete from gist_point_tbl where id < 10000;
 vacuum analyze gist_point_tbl;
 -- rebuild the index with a different fillfactor
diff --git a/src/test/regress/sql/gist.sql b/src/test/regress/sql/gist.sql
index bae722fe13..e66396e851 100644
--- a/src/test/regress/sql/gist.sql
+++ b/src/test/regress/sql/gist.sql
@@ -28,9 +28,7 @@ select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
 -- To test vacuum, delete some entries from all over the index.
 delete from gist_point_tbl where id % 2 = 1;
 
--- And also delete some concentration of values. (GiST doesn't currently
--- attempt to delete pages even when they become empty, but if it did, this
--- would exercise it)
+-- And also delete some concentration of values.
 delete from gist_point_tbl where id < 10000;
 
 vacuum analyze gist_point_tbl;
-- 
2.15.2 (Apple Git-101.1)

