This v5 version of the patch is intended to be used with the v3-0001-Make-all-nbtree-index-tuples-have-unique-keys patch [1].

[1] https://www.postgresql.org/message-id/CAH2-WzkmTRXh%3DzyMAUHyG3%3DO-QQip6CJc2VyNijRO-vzgPxmoQ%40mail.gmail.com


On 13.07.2018 00:00, Peter Geoghegan wrote:
On Tue, Jul 3, 2018 at 5:17 AM, Andrey V. Lepikhov
<a.lepik...@postgrespro.ru> wrote:
Done.
The attachment contains an update for use with v2 of the 'Ensure nbtree leaf
tuple keys are always unique' patch.
My v3 is still pending, but is now a lot better than v2. There were
bugs in v2 that were fixed.

One area that might be worth investigating is retail index tuple
deletion performed within the executor in the event of non-HOT
updates. Maybe LP_REDIRECT could be repurposed to mean "ghost record",
at least in unique index tuples with no NULL values. The idea is that
MVCC index scans can skip over those if they've already found a
visible tuple with the same value. Also, when there was about to be a
page split, they could be treated a little bit like LP_DEAD items. Of
course, the ghost bit would have to be treated as a hint that could be
"wrong" (e.g. because the transaction hasn't committed yet), so you'd
have to go to the heap in the context of a page split, to double
check. Also, you'd need heuristics that let you give up on this
strategy when it didn't help.

I think that this could work well enough for OLTP workloads, and might
be more future-proof than doing it in VACUUM. Though, of course, it's
still very complicated.
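
For illustration only (this sketch is not part of the attached patches;
ItemIdIsGhost() and the found_visible_for_this_value flag are hypothetical
names standing in for the repurposed LP_REDIRECT bit and the scan-local
state described above), the skip logic inside an MVCC index scan could look
roughly like this:

	for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
	{
		ItemId	itemid = PageGetItemId(page, offnum);

		/*
		 * If a visible tuple with the same value was already returned, a
		 * ghost item can be skipped without visiting the heap.  The bit is
		 * only a hint: before treating such items like LP_DEAD at a page
		 * split, the heap must still be rechecked, because the deleting
		 * transaction may not have committed yet.
		 */
		if (ItemIdIsGhost(itemid) && found_visible_for_this_value)
			continue;

		/* ... regular processing: fetch the index tuple, check the heap ... */
	}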


--
---
Andrey Lepikhov
Postgres Professional
https://postgrespro.com
The Russian Postgres Company

From 022338178180b9968e365ed4157d35be7fb4c664 Mon Sep 17 00:00:00 2001
From: "Andrey V. Lepikhov" <a.lepik...@postgrespro.ru>
Date: Mon, 23 Jul 2018 09:35:17 +0500
Subject: [PATCH 1/4] v5 Retail IndexTuple Deletion Access Method

---
 contrib/bloom/blutils.c              |   1 +
 src/backend/access/brin/brin.c       |   1 +
 src/backend/access/gin/ginutil.c     |   1 +
 src/backend/access/gist/gist.c       |   1 +
 src/backend/access/hash/hash.c       |   1 +
 src/backend/access/index/indexam.c   |  15 +++
 src/backend/access/nbtree/nbtree.c   | 138 +++++++++++++++++++++++++++
 src/backend/access/spgist/spgutils.c |   1 +
 src/include/access/amapi.h           |   6 ++
 src/include/access/genam.h           |  18 ++++
 src/include/access/nbtree.h          |   4 +
 11 files changed, 187 insertions(+)

diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c
index 6b2b9e3742..96f1d47c70 100644
--- a/contrib/bloom/blutils.c
+++ b/contrib/bloom/blutils.c
@@ -126,6 +126,7 @@ blhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = blbuild;
 	amroutine->ambuildempty = blbuildempty;
 	amroutine->aminsert = blinsert;
+	amroutine->amtargetdelete = NULL;
 	amroutine->ambulkdelete = blbulkdelete;
 	amroutine->amvacuumcleanup = blvacuumcleanup;
 	amroutine->amcanreturn = NULL;
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index e95fbbcea7..a0e06bd868 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -103,6 +103,7 @@ brinhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = brinbuild;
 	amroutine->ambuildempty = brinbuildempty;
 	amroutine->aminsert = brininsert;
+	amroutine->amtargetdelete = NULL;
 	amroutine->ambulkdelete = brinbulkdelete;
 	amroutine->amvacuumcleanup = brinvacuumcleanup;
 	amroutine->amcanreturn = NULL;
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index 0a32182dd7..acf14e7bec 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -58,6 +58,7 @@ ginhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = ginbuild;
 	amroutine->ambuildempty = ginbuildempty;
 	amroutine->aminsert = gininsert;
+	amroutine->amtargetdelete = NULL;
 	amroutine->ambulkdelete = ginbulkdelete;
 	amroutine->amvacuumcleanup = ginvacuumcleanup;
 	amroutine->amcanreturn = NULL;
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 8a42effdf7..d7a53d2ca9 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -80,6 +80,7 @@ gisthandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = gistbuild;
 	amroutine->ambuildempty = gistbuildempty;
 	amroutine->aminsert = gistinsert;
+	amroutine->amtargetdelete = NULL;
 	amroutine->ambulkdelete = gistbulkdelete;
 	amroutine->amvacuumcleanup = gistvacuumcleanup;
 	amroutine->amcanreturn = gistcanreturn;
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 0002df30c0..5fb32d6eba 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -76,6 +76,7 @@ hashhandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = hashbuild;
 	amroutine->ambuildempty = hashbuildempty;
 	amroutine->aminsert = hashinsert;
+	amroutine->amtargetdelete = NULL;
 	amroutine->ambulkdelete = hashbulkdelete;
 	amroutine->amvacuumcleanup = hashvacuumcleanup;
 	amroutine->amcanreturn = NULL;
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 22b5cc921f..9ebeb789fb 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -730,6 +730,21 @@ index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap)
 	return ntids;
 }
 
+IndexTargetDeleteResult *
+index_target_delete(IndexTargetDeleteInfo *info,
+					IndexTargetDeleteResult *stats,
+					Datum *values,
+					bool *isnull)
+{
+	Relation indexRelation = info->indexRelation;
+
+	RELATION_CHECKS;
+
+	CHECK_REL_PROCEDURE(amtargetdelete);
+
+	return indexRelation->rd_amroutine->amtargetdelete(info, stats, values, isnull);
+}
+
 /* ----------------
  *		index_bulk_delete - do mass deletion of index entries
  *
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index e8725fbbe1..44f4d0dc9f 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -34,7 +34,9 @@
 #include "storage/smgr.h"
 #include "utils/builtins.h"
 #include "utils/index_selfuncs.h"
+#include "utils/lsyscache.h"
 #include "utils/memutils.h"
+#include "utils/syscache.h"
 
 
 /* Working state needed by btvacuumpage */
@@ -127,6 +129,7 @@ bthandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = btbuild;
 	amroutine->ambuildempty = btbuildempty;
 	amroutine->aminsert = btinsert;
+	amroutine->amtargetdelete = bttargetdelete;
 	amroutine->ambulkdelete = btbulkdelete;
 	amroutine->amvacuumcleanup = btvacuumcleanup;
 	amroutine->amcanreturn = btcanreturn;
@@ -886,6 +889,141 @@ btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	return stats;
 }
 
+static int
+tid_list_search(ItemPointer tid, ItemPointer tid_list, int ntid)
+{
+	for (int i = 0; i < ntid; i++)
+		if (ItemPointerEquals(tid, &(tid_list[i])))
+			return i;
+	return -1;
+}
+
+IndexTargetDeleteResult*
+bttargetdelete(IndexTargetDeleteInfo *info,
+			   IndexTargetDeleteResult *stats,
+			   Datum *values,
+			   bool *isnull)
+{
+	Relation		irel = info->indexRelation;
+	Relation		hrel = info->heapRelation;
+	ScanKey			skey;
+	int				keysCount = IndexRelationGetNumberOfKeyAttributes(irel);
+	BTStack			stack;
+	Buffer			buf;
+	Page			page;
+	BTPageOpaque	opaque;
+	OffsetNumber	offnum;
+	int				ndeletable = 0;
+	OffsetNumber	deletable[MaxOffsetNumber];
+	IndexTuple		itup;
+
+	if (stats == NULL)
+		stats = (IndexTargetDeleteResult *) palloc0(sizeof(IndexTargetDeleteResult));
+
+	itup = index_form_tuple(RelationGetDescr(irel), values, isnull);
+	skey = _bt_mkscankey(irel, itup);
+
+	/* Descend the tree and position ourselves on the target leaf page. */
+	stack = _bt_search(irel, keysCount, skey, false, &buf, BT_READ, NULL);
+	_bt_freestack(stack);
+
+	/* Prepare to search for matching tuple entries across index pages */
+	Assert(BufferIsValid(buf));
+	offnum = _bt_binsrch(irel, buf, keysCount, skey, false);
+	page = BufferGetPage(buf);
+	_bt_checkpage(irel, buf);
+	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+	for (;;)
+	{
+		int32		cmpval;
+		ItemId		itemid;
+		IndexTuple	itup;
+		int			pos;
+
+		/* Switch to the next page */
+		if (P_IGNORE(opaque) || (offnum > PageGetMaxOffsetNumber(page)))
+		{
+			/*
+			 * Before unlocking index page we need to delete
+			 * all currently found tuples
+			 */
+			if (ndeletable > 0)
+			{
+				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+				LockBufferForCleanup(buf);
+
+				_bt_delitems_delete(irel, buf, deletable, ndeletable, hrel);
+
+				stats->tuples_removed += ndeletable;
+				ndeletable = 0;
+			}
+
+			/*
+			 * Check for end-of-index
+			 */
+			if (P_RIGHTMOST(opaque))
+				/* it is rightmost leaf */
+				break;
+
+			/*
+			 * Switch to the next index page
+			 */
+			buf = _bt_relandgetbuf(irel, buf, opaque->btpo_next, BT_READ);
+			page = BufferGetPage(buf);
+			_bt_checkpage(irel, buf);
+			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+			offnum = P_FIRSTDATAKEY(opaque);
+			continue;
+		}
+
+		/*
+		 * Does this index entry satisfy the scan key?
+		 */
+		cmpval = _bt_compare(irel, keysCount, skey, page, offnum);
+
+		if (cmpval != 0)
+			/* No more index entries satisfying the scan key */
+			break;
+
+		/*
+		 * Load the index tuple and look for its TID in the list of
+		 * dead heap tuples
+		 */
+		itemid = PageGetItemId(page, offnum);
+		itup = (IndexTuple) PageGetItem(page, itemid);
+		pos = tid_list_search(&(itup->t_tid), info->dead_tuples, info->num_dead_tuples);
+
+		if ((pos >= 0) && (!info->found_dead_tuples[pos]))
+		{
+			/* index entry for TID of dead tuple is found */
+			deletable[ndeletable++] = offnum;
+			info->found_dead_tuples[pos] = true;
+		}
+
+		offnum = OffsetNumberNext(offnum);
+	}
+
+	/*
+	 * Delete all found index tuples
+	 */
+	if (ndeletable > 0)
+	{
+		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+		LockBufferForCleanup(buf);
+
+		_bt_delitems_delete(irel, buf, deletable, ndeletable, hrel);
+
+		stats->tuples_removed += ndeletable;
+	}
+
+	/* Release scan key, unpin and unlock buffer */
+	_bt_freeskey(skey);
+	_bt_relbuf(irel, buf);
+
+	return stats;
+}
+
 /*
  * Post-VACUUM cleanup.
  *
diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c
index 4a9b5da268..325728122b 100644
--- a/src/backend/access/spgist/spgutils.c
+++ b/src/backend/access/spgist/spgutils.c
@@ -56,6 +56,7 @@ spghandler(PG_FUNCTION_ARGS)
 	amroutine->ambuild = spgbuild;
 	amroutine->ambuildempty = spgbuildempty;
 	amroutine->aminsert = spginsert;
+	amroutine->amtargetdelete = NULL;
 	amroutine->ambulkdelete = spgbulkdelete;
 	amroutine->amvacuumcleanup = spgvacuumcleanup;
 	amroutine->amcanreturn = spgcanreturn;
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index 14526a6bb2..497b54e8a8 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -76,6 +76,11 @@ typedef bool (*aminsert_function) (Relation indexRelation,
 								   IndexUniqueCheck checkUnique,
 								   struct IndexInfo *indexInfo);
 
+/* target delete */
+typedef IndexTargetDeleteResult *(*amtargetdelete_function) (IndexTargetDeleteInfo *info,
+															 IndexTargetDeleteResult *stats,
+															 Datum *values,
+															 bool *isnull);
 /* bulk delete */
 typedef IndexBulkDeleteResult *(*ambulkdelete_function) (IndexVacuumInfo *info,
 														 IndexBulkDeleteResult *stats,
@@ -207,6 +212,7 @@ typedef struct IndexAmRoutine
 	ambuild_function ambuild;
 	ambuildempty_function ambuildempty;
 	aminsert_function aminsert;
+	amtargetdelete_function amtargetdelete;
 	ambulkdelete_function ambulkdelete;
 	amvacuumcleanup_function amvacuumcleanup;
 	amcanreturn_function amcanreturn;	/* can be NULL */
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 24c720bf42..79b386eebd 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -51,6 +51,15 @@ typedef struct IndexVacuumInfo
 	BufferAccessStrategy strategy;	/* access strategy for reads */
 } IndexVacuumInfo;
 
+typedef struct IndexTargetDeleteInfo
+{
+	Relation	heapRelation;
+	Relation	indexRelation;			/* the index being vacuumed */
+	int			num_dead_tuples;
+	ItemPointer	dead_tuples;
+	bool*		found_dead_tuples;
+} IndexTargetDeleteInfo;
+
 /*
  * Struct for statistics returned by ambulkdelete and amvacuumcleanup
  *
@@ -79,6 +88,11 @@ typedef struct IndexBulkDeleteResult
 	BlockNumber pages_free;		/* # pages available for reuse */
 } IndexBulkDeleteResult;
 
+typedef struct IndexTargetDeleteResult
+{
+	int		tuples_removed; /* # removed during vacuum operation */
+} IndexTargetDeleteResult;
+
 /* Typedef for callback function to determine if a tuple is bulk-deletable */
 typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr, void *state);
 
@@ -163,6 +177,10 @@ extern HeapTuple index_fetch_heap(IndexScanDesc scan);
 extern HeapTuple index_getnext(IndexScanDesc scan, ScanDirection direction);
 extern int64 index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap);
 
+extern IndexTargetDeleteResult *index_target_delete(IndexTargetDeleteInfo *info,
+													IndexTargetDeleteResult *stats,
+													Datum *values,
+													bool *isnull);
 extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info,
 				  IndexBulkDeleteResult *stats,
 				  IndexBulkDeleteCallback callback,
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 04ecb4cbc0..3b9461c3b4 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -508,6 +508,10 @@ extern void btparallelrescan(IndexScanDesc scan);
 extern void btendscan(IndexScanDesc scan);
 extern void btmarkpos(IndexScanDesc scan);
 extern void btrestrpos(IndexScanDesc scan);
+extern IndexTargetDeleteResult *bttargetdelete(IndexTargetDeleteInfo *info,
+		IndexTargetDeleteResult *stats,
+		   Datum *values,
+		   bool *isnull);
 extern IndexBulkDeleteResult *btbulkdelete(IndexVacuumInfo *info,
 			 IndexBulkDeleteResult *stats,
 			 IndexBulkDeleteCallback callback,
-- 
2.17.1

From 7b56036ea5f6817c1f81e3d53a197c50911ec69a Mon Sep 17 00:00:00 2001
From: "Andrey V. Lepikhov" <a.lepik...@postgrespro.ru>
Date: Mon, 23 Jul 2018 09:36:23 +0500
Subject: [PATCH 2/4] v5 Quick Vacuum Strategy

---
 src/backend/access/heap/heapam.c    |  31 ++++++
 src/backend/access/heap/pruneheap.c |  40 ++++++-
 src/backend/commands/vacuumlazy.c   | 161 +++++++++++++++++++++++++++-
 src/backend/utils/misc/guc.c        |  10 +-
 src/include/access/heapam.h         |   1 +
 5 files changed, 234 insertions(+), 9 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 72395a50b8..dd033668b6 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -9392,6 +9392,36 @@ heap_sync(Relation rel)
 	}
 }
 
+/*
+ * Mask DEAD tuples on a heap page.
+ *
+ * We introduce this function to satisfy wal_consistency_checking on a hot
+ * standby node.
+ *
+ * A DEAD tuple on the master still carries t_cid and t_infomask values that
+ * cannot be reconstructed by applying WAL records on a hot standby node.
+ */
+static void
+mask_dead_tuples(Page page)
+{
+	OffsetNumber	offnum;
+
+	for (offnum = FirstOffsetNumber; offnum <= PageGetMaxOffsetNumber(page); offnum = OffsetNumberNext(offnum))
+	{
+		ItemId	lp = PageGetItemId(page, offnum);
+		char*	htup_begin;
+
+		if (!ItemIdIsDead(lp))
+			continue;
+
+		if (!ItemIdHasStorage(lp))
+			continue;
+
+		htup_begin = (char *) PageGetItem(page, lp);
+		memset(htup_begin, MASK_MARKER, ItemIdGetLength(lp));
+	}
+}
+
 /*
  * Mask a heap page before performing consistency checks on it.
  */
@@ -9405,6 +9435,7 @@ heap_mask(char *pagedata, BlockNumber blkno)
 
 	mask_page_hint_bits(page);
 	mask_unused_space(page);
+	mask_dead_tuples(page);
 
 	for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
 	{
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index c2f5343dac..ebbafe3275 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -43,6 +43,9 @@ typedef struct
 	bool		marked[MaxHeapTuplesPerPage + 1];
 } PruneState;
 
+/* Parameter for target deletion strategy in lazy vacuum */
+double target_index_deletion_factor = 0.01;
+
 /* Local functions */
 static int heap_prune_chain(Relation relation, Buffer buffer,
 				 OffsetNumber rootoffnum,
@@ -405,7 +408,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
 			if (HeapTupleSatisfiesVacuum(&tup, OldestXmin, buffer)
 				== HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
 			{
-				heap_prune_record_unused(prstate, rootoffnum);
+				heap_prune_record_dead(prstate, rootoffnum);
 				HeapTupleHeaderAdvanceLatestRemovedXid(htup,
 													   &prstate->latestRemovedXid);
 				ndeleted++;
@@ -580,8 +583,10 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
 		 */
 		for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++)
 		{
-			heap_prune_record_unused(prstate, chainitems[i]);
 			ndeleted++;
+			if (chainitems[i] == latestdead)
+				continue;
+			heap_prune_record_unused(prstate, chainitems[i]);
 		}
 
 		/*
@@ -598,9 +603,28 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
 		 * redirect the root to the correct chain member.
 		 */
 		if (i >= nchain)
+		{
+			if (rootoffnum != latestdead)
+			{
+				if (ItemIdIsNormal(rootlp))
+					heap_prune_record_unused(prstate, latestdead);
+				else
+				{
+					/*
+					 * We allow an item to be both redirected and dead
+					 */
+					heap_prune_record_redirect(prstate, rootoffnum, latestdead);
+					heap_prune_record_dead(prstate, latestdead);
+				}
+			}
 			heap_prune_record_dead(prstate, rootoffnum);
+		}
 		else
+		{
+			if (rootoffnum != latestdead)
+				heap_prune_record_unused(prstate, latestdead);
 			heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]);
+		}
 	}
 	else if (nchain < 2 && ItemIdIsRedirected(rootlp))
 	{
@@ -653,7 +677,12 @@ heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum)
 	Assert(prstate->ndead < MaxHeapTuplesPerPage);
 	prstate->nowdead[prstate->ndead] = offnum;
 	prstate->ndead++;
-	Assert(!prstate->marked[offnum]);
+
+	/*
+	 * We suppress the check of prstate->marked[offnum]. This is not ideal,
+	 * but it is the simplest way to allow the Dead and Redirected states to
+	 * overlap on a single item.
+	 */
 	prstate->marked[offnum] = true;
 }
 
@@ -706,7 +735,10 @@ heap_page_prune_execute(Buffer buffer,
 		OffsetNumber off = *offnum++;
 		ItemId		lp = PageGetItemId(page, off);
 
-		ItemIdSetDead(lp);
+		if (target_index_deletion_factor > 0)
+			ItemIdMarkDead(lp);
+		else
+			ItemIdSetDead(lp);
 	}
 
 	/* Update all now-unused line pointers */
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index 5649a70800..9a625b8dba 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -41,13 +41,16 @@
 #include "access/heapam_xlog.h"
 #include "access/htup_details.h"
 #include "access/multixact.h"
+#include "access/nbtree.h"
 #include "access/transam.h"
 #include "access/visibilitymap.h"
 #include "access/xlog.h"
+#include "catalog/index.h"
 #include "catalog/storage.h"
 #include "commands/dbcommands.h"
 #include "commands/progress.h"
 #include "commands/vacuum.h"
+#include "executor/executor.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "portability/instr_time.h"
@@ -139,7 +142,6 @@ typedef struct LVRelStats
 	bool		lock_waiter_detected;
 } LVRelStats;
 
-
 /* A few variables that don't seem worth passing around as parameters */
 static int	elevel = -1;
 
@@ -156,6 +158,8 @@ static void lazy_scan_heap(Relation onerel, int options,
 			   bool aggressive);
 static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
 static bool lazy_check_needs_freeze(Buffer buf, bool *hastup);
+static void quick_vacuum_index(Relation irel, Relation hrel,
+					LVRelStats *vacrelstats);
 static void lazy_vacuum_index(Relation indrel,
 				  IndexBulkDeleteResult **stats,
 				  LVRelStats *vacrelstats);
@@ -734,10 +738,17 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
 
 			/* Remove index entries */
 			for (i = 0; i < nindexes; i++)
-				lazy_vacuum_index(Irel[i],
+			{
+				bool use_quick_strategy = (vacrelstats->num_dead_tuples/vacrelstats->old_live_tuples < target_index_deletion_factor);
+
+				if (use_quick_strategy && (Irel[i]->rd_amroutine->amtargetdelete != NULL))
+					quick_vacuum_index(Irel[i], onerel, vacrelstats);
+				else
+					lazy_vacuum_index(Irel[i],
 								  &indstats[i],
 								  vacrelstats);
 
+			}
 			/*
 			 * Report that we are now vacuuming the heap.  We also increase
 			 * the number of index scans here; note that by using
@@ -1379,10 +1390,16 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
 
 		/* Remove index entries */
 		for (i = 0; i < nindexes; i++)
-			lazy_vacuum_index(Irel[i],
+		{
+			bool use_quick_strategy = (vacrelstats->num_dead_tuples/vacrelstats->old_live_tuples < target_index_deletion_factor);
+
+			if (use_quick_strategy && (Irel[i]->rd_amroutine->amtargetdelete != NULL))
+				quick_vacuum_index(Irel[i], onerel, vacrelstats);
+			else
+				lazy_vacuum_index(Irel[i],
 							  &indstats[i],
 							  vacrelstats);
-
+		}
 		/* Report that we are now vacuuming the heap */
 		hvp_val[0] = PROGRESS_VACUUM_PHASE_VACUUM_HEAP;
 		hvp_val[1] = vacrelstats->num_index_scans + 1;
@@ -1671,6 +1688,142 @@ lazy_check_needs_freeze(Buffer buf, bool *hastup)
 	return false;
 }
 
+/*
+ * Fetch a heap tuple by TID, for building an index scan key.
+ */
+static HeapTuple
+get_tuple_by_tid(Relation rel, ItemPointer tid)
+{
+	Buffer			buffer;
+	Page			page;
+	OffsetNumber	offnum;
+	ItemId			lp;
+	HeapTuple		tuple;
+	bool			needLock = !RELATION_IS_LOCAL(rel);
+	BlockNumber		npages;
+
+	if (needLock)
+		LockRelationForExtension(rel, ExclusiveLock);
+	npages = RelationGetNumberOfBlocks(rel);
+	if (needLock)
+		UnlockRelationForExtension(rel, ExclusiveLock);
+	if (ItemPointerGetBlockNumber(tid) >= npages)
+		return NULL;
+
+	buffer = ReadBufferExtended(rel, MAIN_FORKNUM, ItemPointerGetBlockNumber(tid), RBM_NORMAL, NULL);
+	LockBuffer(buffer, BUFFER_LOCK_SHARE);
+
+	page = (Page) BufferGetPage(buffer);
+	offnum = ItemPointerGetOffsetNumber(tid);
+	lp = PageGetItemId(page, offnum);
+
+	/*
+	 * VACUUM races: someone already removed the tuple from the heap. Ignore it.
+	 */
+	if (!ItemIdIsUsed(lp))
+	{
+		UnlockReleaseBuffer(buffer);
+		return NULL;
+	}
+	/* Walk along the chain */
+	while (!ItemIdHasStorage(lp))
+	{
+		offnum = ItemIdGetRedirect(lp);
+		lp = PageGetItemId(page, offnum);
+		Assert(ItemIdIsUsed(lp));
+	}
+
+	/* Form a tuple */
+	tuple = palloc(sizeof(HeapTupleData));
+	ItemPointerSet(&(tuple->t_self), BufferGetBlockNumber(buffer), offnum);
+	tuple->t_tableOid = RelationGetRelid(rel);
+	tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
+	tuple->t_len = ItemIdGetLength(lp);
+	UnlockReleaseBuffer(buffer);
+	return tuple;
+}
+
+/*
+ *	quick_vacuum_index() -- quick vacuum one index relation.
+ *
+ *		Delete all the index entries pointing to tuples listed in
+ *		vacrelstats->dead_tuples.
+ */
+static void
+quick_vacuum_index(Relation irel, Relation hrel,
+				   LVRelStats *vacrelstats)
+{
+	int				tnum;
+	bool*			found = palloc0(vacrelstats->num_dead_tuples*sizeof(bool));
+	IndexInfo* 		indexInfo = BuildIndexInfo(irel);
+	EState*			estate = CreateExecutorState();
+	ExprContext*	econtext = GetPerTupleExprContext(estate);
+	ExprState*		predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
+	TupleTableSlot*	slot = MakeSingleTupleTableSlot(RelationGetDescr(hrel));
+
+	IndexTargetDeleteResult	stats;
+	IndexTargetDeleteInfo	ivinfo;
+
+	ivinfo.indexRelation = irel;
+	ivinfo.heapRelation = hrel;
+
+	econtext->ecxt_scantuple = slot;
+
+	/* Get tuple from heap */
+	for (tnum = 0; tnum < vacrelstats->num_dead_tuples; tnum++)
+	{
+		HeapTuple		tuple;
+		Datum			values[INDEX_MAX_KEYS];
+		bool			isnull[INDEX_MAX_KEYS];
+
+		/* Index entries for this TID were already deleted earlier */
+		if (found[tnum])
+			continue;
+
+		/* Get a tuple from heap */
+		if ((tuple = get_tuple_by_tid(hrel, &(vacrelstats->dead_tuples[tnum]))) == NULL)
+		{
+			/*
+			 * The heap item is already unused; nothing to delete for it.
+			 */
+			found[tnum] = true;
+			continue;
+		}
+
+		/*
+		 * Form the values[] and isnull[] arrays for the index tuple
+		 * from the heap tuple
+		 */
+		MemoryContextReset(econtext->ecxt_per_tuple_memory);
+
+		ExecStoreTuple(tuple, slot, InvalidBuffer, false);
+
+		/*
+		 * In a partial index, ignore tuples that don't satisfy the
+		 * predicate.
+		 */
+		if ((predicate != NULL) && (!ExecQual(predicate, econtext)))
+		{
+			found[tnum] = true;
+			continue;
+		}
+
+		FormIndexDatum(indexInfo, slot, estate, values, isnull);
+
+		/*
+		 * Attempt to delete some index entries in a single tree descent.
+		 * Only the part of the TID list that has not been found yet is used.
+		 */
+		ivinfo.dead_tuples = &(vacrelstats->dead_tuples[tnum]);
+		ivinfo.num_dead_tuples = vacrelstats->num_dead_tuples - tnum;
+		ivinfo.found_dead_tuples = found + tnum;
+		index_target_delete(&ivinfo, &stats, values, isnull);
+	}
+
+	pfree(found);
+	ExecDropSingleTupleTableSlot(slot);
+	FreeExecutorState(estate);
+}
 
 /*
  *	lazy_vacuum_index() -- vacuum one index relation.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index a88ea6cfc9..e9e953ece6 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -3243,7 +3243,15 @@ static struct config_real ConfigureNamesReal[] =
 		0.1, 0.0, 100.0,
 		NULL, NULL, NULL
 	},
-
+	{
+		{"target_index_deletion_factor", PGC_SIGHUP, AUTOVACUUM,
+			gettext_noop("Maximum number of dead tuples, as a fraction of reltuples, for which the targeted index deletion strategy can be used."),
+			NULL
+		},
+		&target_index_deletion_factor,
+		0.01, 0.0, 1.0,
+		NULL, NULL, NULL
+	},
 	{
 		{"checkpoint_completion_target", PGC_SIGHUP, WAL_CHECKPOINTS,
 			gettext_noop("Time spent flushing dirty buffers during checkpoint, as fraction of checkpoint interval."),
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index ca5cad7497..c5a7bc8361 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -100,6 +100,7 @@ extern Relation heap_openrv_extended(const RangeVar *relation,
 typedef struct HeapScanDescData *HeapScanDesc;
 typedef struct ParallelHeapScanDescData *ParallelHeapScanDesc;
 
+extern double target_index_deletion_factor;
 /*
  * HeapScanIsValid
  *		True iff the heap scan is valid.
-- 
2.17.1

From ec49e3c00d21b5aa3ff9bca382cd8d745c5fe7f5 Mon Sep 17 00:00:00 2001
From: "Andrey V. Lepikhov" <a.lepik...@postgrespro.ru>
Date: Mon, 23 Jul 2018 09:38:45 +0500
Subject: [PATCH 4/4] v5 Retail IndexTuple Deletion with TID-sorting in Leaf

---
 src/backend/access/nbtree/nbtree.c | 75 ++++++++++++++++++++----------
 src/backend/commands/vacuumlazy.c  |  8 ++--
 src/include/access/genam.h         |  2 +-
 3 files changed, 55 insertions(+), 30 deletions(-)

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 44f4d0dc9f..50378e0f51 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -889,15 +889,19 @@ btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	return stats;
 }
 
-static int
-tid_list_search(ItemPointer tid, ItemPointer tid_list, int ntid)
-{
-	for (int i = 0; i < ntid; i++)
-		if (ItemPointerEquals(tid, &(tid_list[i])))
-			return i;
-	return -1;
-}
-
+/*
+ * Deletion of index entries pointing to heap tuples.
+ *
+ * Constraints:
+ * 1. The TID list info->dead_tuples is arranged in ascending order.
+ * 2. Logical duplicates of index tuples are stored in descending TID order.
+ *
+ * The function builds an insertion scan key and descends the btree to the
+ * first index tuple that satisfies the scan key and the last TID in the
+ * info->dead_tuples list, then deletes every scanned index entry whose TID
+ * matches an entry in that list.
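+ *
+ * Illustrative example (hypothetical TIDs, not from a real run): if
+ * dead_tuples holds {(0,1), (0,3), (2,7)} in ascending order, the descent
+ * targets the duplicate whose heap TID is (2,7), i.e. the entry at
+ * last_dead_tuple. Since duplicates are stored in descending TID order, the
+ * rightward leaf scan and the TID list (walked from last_dead_tuple towards
+ * 0) can be merged in a single pass.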
+ *
+ * Result: a palloc'd struct containing statistical info.
+ */
 IndexTargetDeleteResult*
 bttargetdelete(IndexTargetDeleteInfo *info,
 			   IndexTargetDeleteResult *stats,
@@ -916,33 +920,33 @@ bttargetdelete(IndexTargetDeleteInfo *info,
 	int				ndeletable = 0;
 	OffsetNumber	deletable[MaxOffsetNumber];
 	IndexTuple		itup;
+	int				pos = info->last_dead_tuple;
 
 	if (stats == NULL)
 		stats = (IndexTargetDeleteResult *) palloc0(sizeof(IndexTargetDeleteResult));
 
+	/* Assemble scankey */
 	itup = index_form_tuple(RelationGetDescr(irel), values, isnull);
 	skey = _bt_mkscankey(irel, itup);
 
 	/* Descend the tree and position ourselves on the target leaf page. */
-	stack = _bt_search(irel, keysCount, skey, false, &buf, BT_READ, NULL);
-	_bt_freestack(stack);
+	stack = _bt_search(irel, keysCount, skey, &info->dead_tuples[pos], false, &buf, BT_READ, NULL);
 
 	/* Prepare to search for matching tuple entries across index pages */
 	Assert(BufferIsValid(buf));
-	offnum = _bt_binsrch(irel, buf, keysCount, skey, false);
 	page = BufferGetPage(buf);
 	_bt_checkpage(irel, buf);
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	offnum = _bt_binsrch(irel, buf, keysCount, skey, &info->dead_tuples[pos], P_FIRSTDATAKEY(opaque), false);
 
 	for (;;)
 	{
 		int32		cmpval;
 		ItemId		itemid;
 		IndexTuple	itup;
-		int			pos;
 
 		/* Switch to the next page */
-		if (P_IGNORE(opaque) || (offnum > PageGetMaxOffsetNumber(page)))
+		if (offnum > PageGetMaxOffsetNumber(page))
 		{
 			/*
 			 * Before unlocking index page we need to delete
@@ -967,20 +971,27 @@ bttargetdelete(IndexTargetDeleteInfo *info,
 				break;
 
 			/*
-			 * Switch to the next index page
+			 * Move right to the next live index page
 			 */
-			buf = _bt_relandgetbuf(irel, buf, opaque->btpo_next, BT_READ);
+			buf = _bt_moveright(irel, buf, keysCount, skey, &info->dead_tuples[pos],
+												false, true, stack, BT_READ, NULL);
 			page = BufferGetPage(buf);
 			_bt_checkpage(irel, buf);
 			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-			offnum = P_FIRSTDATAKEY(opaque);
-			continue;
+			Assert(!P_IGNORE(opaque));
+			/* Set offnum to first potentially interesting item */
+			offnum = _bt_binsrch(irel, buf, keysCount, skey, &info->dead_tuples[pos], P_FIRSTDATAKEY(opaque), false);
+
+			if (offnum > PageGetMaxOffsetNumber(page))
+				break;
+			else
+				continue;
 		}
 
 		/*
 		 * Does this index entry satisfy the scan key?
 		 */
-		cmpval = _bt_compare(irel, keysCount, skey, page, offnum);
+		cmpval = _bt_compare(irel, keysCount, skey, NULL, page, offnum);
 
 		if (cmpval != 0)
 			/* No more index entries satisfying the scan key */
@@ -992,15 +1003,28 @@ bttargetdelete(IndexTargetDeleteInfo *info,
 		 */
 		itemid = PageGetItemId(page, offnum);
 		itup = (IndexTuple) PageGetItem(page, itemid);
-		pos = tid_list_search(&(itup->t_tid), info->dead_tuples, info->num_dead_tuples);
 
-		if ((pos >= 0) && (!info->found_dead_tuples[pos]))
+		/*
+		 * Merge the TIDs returned by the btree scan with the presorted
+		 * dead_tuples TID list, advancing the list position as we go
+		 */
+		while (pos >= 0)
 		{
-			/* index entry for TID of dead tuple is found */
-			deletable[ndeletable++] = offnum;
-			info->found_dead_tuples[pos] = true;
+			int res = ItemPointerCompare(&(itup->t_tid), &info->dead_tuples[pos]);
+			if ((res == 0) && (!info->found_dead_tuples[pos]))
+			{
+				/* index entry for TID of dead tuple is found */
+				deletable[ndeletable++] = offnum;
+				info->found_dead_tuples[pos] = true;
+			}
+			else if (res > 0)
+				break;
+			pos--;
 		}
 
+		if (pos < 0)
+			break;
+
 		offnum = OffsetNumberNext(offnum);
 	}
 
@@ -1017,7 +1041,8 @@ bttargetdelete(IndexTargetDeleteInfo *info,
 		stats->tuples_removed += ndeletable;
 	}
 
-	/* Release scan key, unpin and unlock buffer */
+	/* Release stack, scan key, unpin and unlock buffer */
+	_bt_freestack(stack);
 	_bt_freeskey(skey);
 	_bt_relbuf(irel, buf);
 
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index 9a625b8dba..1b952fedd9 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -1770,7 +1770,7 @@ quick_vacuum_index(Relation irel, Relation hrel,
 	econtext->ecxt_scantuple = slot;
 
 	/* Get tuple from heap */
-	for (tnum = 0; tnum < vacrelstats->num_dead_tuples; tnum++)
+	for (tnum = vacrelstats->num_dead_tuples-1; tnum >= 0; tnum--)
 	{
 		HeapTuple		tuple;
 		Datum			values[INDEX_MAX_KEYS];
@@ -1814,9 +1814,9 @@ quick_vacuum_index(Relation irel, Relation hrel,
 		 * Attempt to delete some index entries in a single tree descent.
 		 * Only the part of the TID list that has not been found yet is used.
 		 */
-		ivinfo.dead_tuples = &(vacrelstats->dead_tuples[tnum]);
-		ivinfo.num_dead_tuples = vacrelstats->num_dead_tuples - tnum;
-		ivinfo.found_dead_tuples = found + tnum;
+		ivinfo.dead_tuples = vacrelstats->dead_tuples;
+		ivinfo.last_dead_tuple = tnum;
+		ivinfo.found_dead_tuples = found;
 		index_target_delete(&ivinfo, &stats, values, isnull);
 	}
 
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 79b386eebd..297d357a7d 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -55,7 +55,7 @@ typedef struct IndexTargetDeleteInfo
 {
 	Relation	heapRelation;
 	Relation	indexRelation;			/* the index being vacuumed */
-	int			num_dead_tuples;
+	int			last_dead_tuple;
 	ItemPointer	dead_tuples;
 	bool*		found_dead_tuples;
 } IndexTargetDeleteInfo;
-- 
2.17.1
