From 22f7c69552bcd6b89a586554a1361926925f4332 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Sun, 29 Oct 2023 21:39:23 +0100
Subject: [PATCH v14 1/7] btree: Implement dynamic prefix truncation

Because tuples in the btree are ordered, if the tuples on both sides
of the tuple being compared share some prefix of the scan attributes
with the scankey, then the tuple being compared must also contain
that same prefix, and that prefix must equal the scankey as well.

Due to concurrent page splits and deletes, the key range of a
page may move while we travel down the btree. This means we can't
directly reuse left- and right-key prefixes we established on the
parent page while we descend the btree: the data on the page
may not match the parent page's separator keys anymore (see [0]).

So, we can't always descend the btree while keeping the discovered
prefixes. However, on each page we only see a snapshot of the
page, so we can use dynamic prefix truncation at the page level.
This still improves the worst-case number of compare operations
during btree descent from depth * ceil(log(tuples_per_page)) * natts
to depth * (ceil(log(tuples_per_page)) + 3 * natts).

In passing, move the declaration of _bt_moveright from nbtree.h to
nbtsearch.c: the function was no longer used outside nbtsearch.c.
---
 contrib/amcheck/verify_nbtree.c       |  25 +++++--
 src/backend/access/nbtree/README      |  34 +++++++++
 src/backend/access/nbtree/nbtinsert.c |  34 ++++++---
 src/backend/access/nbtree/nbtsearch.c | 101 +++++++++++++++++++++++---
 src/include/access/nbtree.h           |   9 +--
 5 files changed, 168 insertions(+), 35 deletions(-)

diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 7282cf7fc8..40fc0a8af7 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -1706,6 +1706,8 @@ bt_target_page_check(BtreeCheckState *state)
 		if (state->checkunique && state->indexinfo->ii_Unique &&
 			P_ISLEAF(topaque) && OffsetNumberNext(offset) <= max)
 		{
+			AttrNumber cmpattr = 1;
+
 			/* Save current scankey tid */
 			scantid = skey->scantid;
 
@@ -1722,7 +1724,7 @@ bt_target_page_check(BtreeCheckState *state)
 			 * treated as different to any other key.
 			 */
 			if (_bt_compare(state->rel, skey, state->target,
-							OffsetNumberNext(offset)) != 0 || skey->anynullkeys)
+							OffsetNumberNext(offset), &cmpattr) != 0 || skey->anynullkeys)
 			{
 				lVis_i = -1;
 				lVis_tid = NULL;
@@ -1801,6 +1803,8 @@ bt_target_page_check(BtreeCheckState *state)
 			if (state->checkunique && state->indexinfo->ii_Unique &&
 				rightkey && P_ISLEAF(topaque) && rightblock_number != P_NONE)
 			{
+				AttrNumber cmpattr = 1;
+
 				elog(DEBUG2, "check cross page unique condition");
 
 				/*
@@ -1811,7 +1815,7 @@ bt_target_page_check(BtreeCheckState *state)
 				rightkey->scantid = NULL;
 
 				/* The first key on the next page is the same */
-				if (_bt_compare(state->rel, rightkey, state->target, max) == 0 && !rightkey->anynullkeys)
+				if (_bt_compare(state->rel, rightkey, state->target, max, &cmpattr) == 0 && !rightkey->anynullkeys)
 				{
 					elog(DEBUG2, "cross page equal keys");
 					state->target = palloc_btree_page(state,
@@ -3012,6 +3016,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
 		BTInsertStateData insertstate;
 		OffsetNumber offnum;
 		Page		page;
+		AttrNumber	compareattr = 1;
 
 		insertstate.itup = itup;
 		insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
@@ -3021,13 +3026,13 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
 		insertstate.buf = lbuf;
 
 		/* Get matching tuple on leaf page */
-		offnum = _bt_binsrch_insert(state->rel, &insertstate);
+		offnum = _bt_binsrch_insert(state->rel, &insertstate, 1);
 		/* Compare first >= matching item on leaf page, if any */
 		page = BufferGetPage(lbuf);
 		/* Should match on first heap TID when tuple has a posting list */
 		if (offnum <= PageGetMaxOffsetNumber(page) &&
 			insertstate.postingoff <= 0 &&
-			_bt_compare(state->rel, key, page, offnum) == 0)
+			_bt_compare(state->rel, key, page, offnum, &compareattr) == 0)
 			exists = true;
 		_bt_relbuf(state->rel, lbuf);
 	}
@@ -3089,6 +3094,7 @@ invariant_l_offset(BtreeCheckState *state, BTScanInsert key,
 {
 	ItemId		itemid;
 	int32		cmp;
+	AttrNumber	compareattr = 1;
 
 	Assert(key->pivotsearch);
 
@@ -3099,7 +3105,7 @@ invariant_l_offset(BtreeCheckState *state, BTScanInsert key,
 	if (!key->heapkeyspace)
 		return invariant_leq_offset(state, key, upperbound);
 
-	cmp = _bt_compare(state->rel, key, state->target, upperbound);
+	cmp = _bt_compare(state->rel, key, state->target, upperbound, &compareattr);
 
 	/*
 	 * _bt_compare() is capable of determining that a scankey with a
@@ -3151,10 +3157,11 @@ invariant_leq_offset(BtreeCheckState *state, BTScanInsert key,
 					 OffsetNumber upperbound)
 {
 	int32		cmp;
+	AttrNumber	compareattr = 1;
 
 	Assert(key->pivotsearch);
 
-	cmp = _bt_compare(state->rel, key, state->target, upperbound);
+	cmp = _bt_compare(state->rel, key, state->target, upperbound, &compareattr);
 
 	return cmp <= 0;
 }
@@ -3174,10 +3181,11 @@ invariant_g_offset(BtreeCheckState *state, BTScanInsert key,
 				   OffsetNumber lowerbound)
 {
 	int32		cmp;
+	AttrNumber	compareattr = 1;
 
 	Assert(key->pivotsearch);
 
-	cmp = _bt_compare(state->rel, key, state->target, lowerbound);
+	cmp = _bt_compare(state->rel, key, state->target, lowerbound, &compareattr);
 
 	/* pg_upgrade'd indexes may legally have equal sibling tuples */
 	if (!key->heapkeyspace)
@@ -3212,13 +3220,14 @@ invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key,
 {
 	ItemId		itemid;
 	int32		cmp;
+	AttrNumber	compareattr = 1;
 
 	Assert(key->pivotsearch);
 
 	/* Verify line pointer before checking tuple */
 	itemid = PageGetItemIdCareful(state, nontargetblock, nontarget,
 								  upperbound);
-	cmp = _bt_compare(state->rel, key, nontarget, upperbound);
+	cmp = _bt_compare(state->rel, key, nontarget, upperbound, &compareattr);
 
 	/* pg_upgrade'd indexes may legally have equal sibling tuples */
 	if (!key->heapkeyspace)
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 52e646c7f7..61601aca61 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -901,6 +901,40 @@ large groups of duplicates, maximizing space utilization.  Note also that
 deduplication more efficient.  Deduplication can be performed infrequently,
 without merging together existing posting list tuples too often.
 
+Notes about dynamic prefix truncation
+-------------------------------------
+
+When we do a binary search on a sorted set (such as a BTree), we know that a
+tuple will be no smaller than its left neighbour, and no larger than its
+right neighbour.  If we know that both the left and right neighbour share a
+prefix with the scan key, we deduce that the current tuple must also share
+this prefix with the scan key, which allows us to skip this shared prefix
+during tuple compare operations when we descend the btree.
+
+However, the BTree is not a static sorted list: It is a dynamic data
+structure which changes over time.  Because of this, we cannot just reuse the
+established prefix from the parent page: concurrent page deletions may have
+moved in new tuples from the keyspace left of the left separator of this
+page's downlink on the parent page, and page splits may have moved tuples
+that were previously in this page's indicated keyspace away from the page.
+This means that we have to re-establish the prefix that is shared with the
+scan key at every level of the btree.
+
+Note that this is most effective on indexes where a prefix of key columns has
+many duplicate values.  Compare operations on full index tuples stop at the
+first non-equal attribute: it is quite unlikely that an index on
+(unique, 1, 1, 1) will ever compare the latter columns, while one on
+(1, 1, 1, unique) will essentially always have to compare all attributes to
+hit an attribute that isn't equal to that of the scan key.  The dynamic
+prefix truncation allows PostgreSQL to skip most compare operations on the
+(1, 1, 1 ...) prefix, but it cannot improve the (unique, ...) case because
+that has no "wasteful" prefix that we can skip.
+
+Dynamic prefix truncation improves the worst-case number of attribute compare
+operations per descent from O(tree_height * natts * log(tups_per_page)) to
+O(tree_height * (3*natts + log(tups_per_page))), which can improve
+performance significantly.
+
 Notes about deduplication
 -------------------------
 
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 9cff4f2931..b9f2580c1d 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -328,6 +328,7 @@ _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
 		{
 			Page		page;
 			BTPageOpaque opaque;
+			AttrNumber	prefix = 1;
 
 			_bt_checkpage(rel, insertstate->buf);
 			page = BufferGetPage(insertstate->buf);
@@ -346,7 +347,8 @@ _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
 				!P_IGNORE(opaque) &&
 				PageGetFreeSpace(page) > insertstate->itemsz &&
 				PageGetMaxOffsetNumber(page) >= P_HIKEY &&
-				_bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
+				_bt_compare(rel, insertstate->itup_key, page, P_HIKEY,
+							&prefix) > 0)
 			{
 				/*
 				 * Caller can use the fastpath optimization because cached
@@ -440,7 +442,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 	 * in the fastpath below, but also in the _bt_findinsertloc() call later.
 	 */
 	Assert(!insertstate->bounds_valid);
-	offset = _bt_binsrch_insert(rel, insertstate);
+	offset = _bt_binsrch_insert(rel, insertstate, 1);
 
 	/*
 	 * Scan over all equal tuples, looking for live conflicts.
@@ -450,6 +452,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 	Assert(itup_key->scantid == NULL);
 	for (;;)
 	{
+		AttrNumber	cmpatt = 1;
+
 		/*
 		 * Each iteration of the loop processes one heap TID, not one index
 		 * tuple.  Current offset number for page isn't usually advanced on
@@ -485,7 +489,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 				Assert(insertstate->bounds_valid);
 				Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
 				Assert(insertstate->low <= insertstate->stricthigh);
-				Assert(_bt_compare(rel, itup_key, page, offset) < 0);
+				Assert(_bt_compare(rel, itup_key, page, offset, &cmpatt) < 0);
 				break;
 			}
 
@@ -510,7 +514,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 				if (!inposting)
 				{
 					/* Plain tuple, or first TID in posting list tuple */
-					if (_bt_compare(rel, itup_key, page, offset) != 0)
+					if (_bt_compare(rel, itup_key, page, offset, &cmpatt) != 0)
 						break;	/* we're past all the equal tuples */
 
 					/* Advanced curitup */
@@ -720,11 +724,12 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 		else
 		{
 			int			highkeycmp;
+			cmpatt = 1;
 
 			/* If scankey == hikey we gotta check the next page too */
 			if (P_RIGHTMOST(opaque))
 				break;
-			highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
+			highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY, &cmpatt);
 			Assert(highkeycmp <= 0);
 			if (highkeycmp != 0)
 				break;
@@ -867,6 +872,8 @@ _bt_findinsertloc(Relation rel,
 
 			for (;;)
 			{
+				AttrNumber	cmpatt = 1;
+
 				/*
 				 * Does the new tuple belong on this page?
 				 *
@@ -884,7 +891,7 @@ _bt_findinsertloc(Relation rel,
 
 				/* Test '<=', not '!=', since scantid is set now */
 				if (P_RIGHTMOST(opaque) ||
-					_bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
+					_bt_compare(rel, itup_key, page, P_HIKEY, &cmpatt) <= 0)
 					break;
 
 				_bt_stepright(rel, heapRel, insertstate, stack);
@@ -937,6 +944,8 @@ _bt_findinsertloc(Relation rel,
 		 */
 		while (PageGetFreeSpace(page) < insertstate->itemsz)
 		{
+			AttrNumber	cmpatt = 1;
+
 			/*
 			 * Before considering moving right, see if we can obtain enough
 			 * space by erasing LP_DEAD items
@@ -967,7 +976,7 @@ _bt_findinsertloc(Relation rel,
 				break;
 
 			if (P_RIGHTMOST(opaque) ||
-				_bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
+				_bt_compare(rel, itup_key, page, P_HIKEY, &cmpatt) != 0 ||
 				pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
 				break;
 
@@ -982,10 +991,13 @@ _bt_findinsertloc(Relation rel,
 	 * We should now be on the correct page.  Find the offset within the page
 	 * for the new tuple. (Possibly reusing earlier search bounds.)
 	 */
-	Assert(P_RIGHTMOST(opaque) ||
-		   _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
+	{
+		AttrNumber	cmpatt PG_USED_FOR_ASSERTS_ONLY = 1;
+		Assert(P_RIGHTMOST(opaque) ||
+			   _bt_compare(rel, itup_key, page, P_HIKEY, &cmpatt) <= 0);
+	}
 
-	newitemoff = _bt_binsrch_insert(rel, insertstate);
+	newitemoff = _bt_binsrch_insert(rel, insertstate, 1);
 
 	if (insertstate->postingoff == -1)
 	{
@@ -1004,7 +1016,7 @@ _bt_findinsertloc(Relation rel,
 		 */
 		Assert(!insertstate->bounds_valid);
 		insertstate->postingoff = 0;
-		newitemoff = _bt_binsrch_insert(rel, insertstate);
+		newitemoff = _bt_binsrch_insert(rel, insertstate, 1);
 		Assert(insertstate->postingoff == 0);
 	}
 
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index efc5284e5b..71fd658d15 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -27,6 +27,9 @@
 
 static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
 static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
+static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
+							Buffer buf, bool forupdate, BTStack stack,
+							int access, AttrNumber *prefix);
 static int	_bt_binsrch_posting(BTScanInsert key, Page page,
 								OffsetNumber offnum);
 static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
@@ -98,6 +101,7 @@ _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
 {
 	BTStack		stack_in = NULL;
 	int			page_access = BT_READ;
+	AttrNumber	highkeyprefix = 1;
 
 	/* heaprel must be set whenever _bt_allocbuf is reachable */
 	Assert(access == BT_READ || access == BT_WRITE);
@@ -134,7 +138,7 @@ _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
 		 * opportunity to finish splits of internal pages too.
 		 */
 		*bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
-							  stack_in, page_access);
+							  stack_in, page_access, &highkeyprefix);
 
 		/* if this is a leaf page, we're done */
 		page = BufferGetPage(*bufP);
@@ -185,6 +189,8 @@ _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
 	 */
 	if (access == BT_WRITE && page_access == BT_READ)
 	{
+		highkeyprefix = 1;
+
 		/* trade in our read lock for a write lock */
 		_bt_unlockbuf(rel, *bufP);
 		_bt_lockbuf(rel, *bufP, BT_WRITE);
@@ -194,7 +200,8 @@ _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
 		 * but before we acquired a write lock.  If it has, we may need to
 		 * move right to its new sibling.  Do that.
 		 */
-		*bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
+		*bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE,
+							  &highkeyprefix);
 	}
 
 	return stack_in;
@@ -230,6 +237,8 @@ _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
  * On entry, we have the buffer pinned and a lock of the type specified by
  * 'access'.  If we move right, we release the buffer and lock and acquire
  * the same on the right sibling.  Return value is the buffer we stop at.
+ *
+ * On exit, *prefix is set to the prefix the high key shares with the scan key.
  */
 Buffer
 _bt_moveright(Relation rel,
@@ -238,7 +247,8 @@ _bt_moveright(Relation rel,
 			  Buffer buf,
 			  bool forupdate,
 			  BTStack stack,
-			  int access)
+			  int access,
+			  AttrNumber *prefix)
 {
 	Page		page;
 	BTPageOpaque opaque;
@@ -262,16 +272,36 @@ _bt_moveright(Relation rel,
 	 *
 	 * We also have to move right if we followed a link that brought us to a
 	 * dead page.
+	 *
+	 * Note: It is important that *prefix is set to an accurate value at
+	 * return. Rightmost pages have no prefix, thus set it to 1.  In all other
+	 * cases, it must be updated with the prefix result of this page's
+	 * HIGHKEY's full _bt_compare.
 	 */
 	cmpval = key->nextkey ? 0 : 1;
 
 	for (;;)
 	{
+		/*
+		 * We explicitly don't reuse the prefix argument of this function,
+		 * as we may need to move multiple times, and we don't want to
+		 * overwrite the value stored in *prefix if we could reuse it.
+		 * Additionally, its value will be useless for any of our own
+		 * _bt_compare calls: only if the downlink right separator/HIKEY
+		 * optimization is applicable we can use the value, and when it is
+		 * applicable then we don't have to call _bt_compare.
+		 */
+		AttrNumber	prefixatt = 1;
+
 		page = BufferGetPage(buf);
 		opaque = BTPageGetOpaque(page);
 
 		if (P_RIGHTMOST(opaque))
+		{
+			/* set the compare column when breaking out of the loop */
+			*prefix = 1;
 			break;
+		}
 
 		/*
 		 * Finish any incomplete splits we encounter along the way.
@@ -297,14 +327,19 @@ _bt_moveright(Relation rel,
 			continue;
 		}
 
-		if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
+		if (P_IGNORE(opaque) ||
+			_bt_compare(rel, key, page, P_HIKEY, &prefixatt) >= cmpval)
 		{
-			/* step right one page */
+			/* step right one page; prefixatt is reset on the next iteration */
 			buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
 			continue;
 		}
 		else
+		{
+			/* make sure to set the compare column */
+			*prefix = prefixatt;
 			break;
+		}
 	}
 
 	if (P_IGNORE(opaque))
@@ -345,6 +380,13 @@ _bt_binsrch(Relation rel,
 				high;
 	int32		result,
 				cmpval;
+	/*
+	 * Prefix bounds, for the high/low offset's compare columns.
+	 * "highkeyprefix" is the value for this page's high key (if any) or 1
+	 * (no established shared prefix)
+	 */
+	AttrNumber	highkeyprefix = 1,
+				lowkeyprefix = 1;
 
 	page = BufferGetPage(buf);
 	opaque = BTPageGetOpaque(page);
@@ -377,6 +419,10 @@ _bt_binsrch(Relation rel,
 	 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
 	 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
 	 *
+	 * We maintain highkeyprefix and lowkeyprefix to keep track of prefixes
+	 * that tuples share with the scan key, potentially allowing us to skip a
+	 * prefix in the midpoint comparison.
+	 *
 	 * We can fall out when high == low.
 	 */
 	high++;						/* establish the loop invariant for high */
@@ -386,15 +432,22 @@ _bt_binsrch(Relation rel,
 	while (high > low)
 	{
 		OffsetNumber mid = low + ((high - low) / 2);
+		AttrNumber	prefix = Min(highkeyprefix, lowkeyprefix); /* update prefix bounds */
 
 		/* We have low <= mid < high, so mid points at a real slot */
 
-		result = _bt_compare(rel, key, page, mid);
+		result = _bt_compare(rel, key, page, mid, &prefix);
 
 		if (result >= cmpval)
+		{
 			low = mid + 1;
+			lowkeyprefix = prefix;
+		}
 		else
+		{
 			high = mid;
+			highkeyprefix = prefix;
+		}
 	}
 
 	/*
@@ -439,7 +492,8 @@ _bt_binsrch(Relation rel,
  * list split).
  */
 OffsetNumber
-_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
+_bt_binsrch_insert(Relation rel, BTInsertState insertstate,
+				   AttrNumber highkeyprefix)
 {
 	BTScanInsert key = insertstate->itup_key;
 	Page		page;
@@ -449,6 +503,7 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
 				stricthigh;
 	int32		result,
 				cmpval;
+	AttrNumber	lowkeyprefix = 1;
 
 	page = BufferGetPage(insertstate->buf);
 	opaque = BTPageGetOpaque(page);
@@ -499,16 +554,22 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
 	while (high > low)
 	{
 		OffsetNumber mid = low + ((high - low) / 2);
+		AttrNumber	prefix = Min(highkeyprefix, lowkeyprefix);
 
 		/* We have low <= mid < high, so mid points at a real slot */
 
-		result = _bt_compare(rel, key, page, mid);
+		result = _bt_compare(rel, key, page, mid, &prefix);
 
 		if (result >= cmpval)
+		{
 			low = mid + 1;
+			lowkeyprefix = prefix;
+		}
 		else
 		{
 			high = mid;
+			highkeyprefix = prefix;
+
 			if (result != 0)
 				stricthigh = high;
 		}
@@ -643,6 +704,13 @@ _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
  * matching TID in the posting tuple, which caller must handle
  * themselves (e.g., by splitting the posting list tuple).
  *
+ * NOTE: "*compareattr" must be the number of the first index tuple attribute
+ * that the caller does not know to be equal to the scan key: this means 1
+ * for "no known matching attributes", up to the number of key attributes + 1
+ * if the caller knows that all key attributes of the index tuple match those
+ * of the scan key.  On return, it holds the equivalent value as established
+ * by this comparison.  See backend/access/nbtree/README for details.
+ *
  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
  * "minus infinity": this routine will always claim it is less than the
  * scankey.  The actual key value stored is explicitly truncated to 0
@@ -656,7 +724,8 @@ int32
 _bt_compare(Relation rel,
 			BTScanInsert key,
 			Page page,
-			OffsetNumber offnum)
+			OffsetNumber offnum,
+			AttrNumber *compareattr)
 {
 	TupleDesc	itupdesc = RelationGetDescr(rel);
 	BTPageOpaque opaque = BTPageGetOpaque(page);
@@ -696,8 +765,9 @@ _bt_compare(Relation rel,
 	ncmpkey = Min(ntupatts, key->keysz);
 	Assert(key->heapkeyspace || ncmpkey == key->keysz);
 	Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
-	scankey = key->scankeys;
-	for (int i = 1; i <= ncmpkey; i++)
+
+	scankey = key->scankeys + ((*compareattr) - 1);
+	for (int i = *compareattr; i <= ncmpkey; i++)
 	{
 		Datum		datum;
 		bool		isNull;
@@ -741,11 +811,20 @@ _bt_compare(Relation rel,
 
 		/* if the keys are unequal, return the difference */
 		if (result != 0)
+		{
+			*compareattr = (AttrNumber) i;
 			return result;
+		}
 
 		scankey++;
 	}
 
+	/*
+	 * All compared attributes of the tuple are equal to the scan key; only
+	 * later attributes could potentially not equal the scan key.
+	 */
+	*compareattr = ntupatts + 1;
+
 	/*
 	 * All non-truncated attributes (other than heap TID) were found to be
 	 * equal.  Treat truncated attributes as minus infinity when scankey has a
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 7bfbf3086c..ae4e902ad0 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1237,11 +1237,10 @@ extern void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate);
  */
 extern BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key,
 						  Buffer *bufP, int access);
-extern Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
-							Buffer buf, bool forupdate, BTStack stack,
-							int access);
-extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
-extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
+extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate,
+									   AttrNumber highkeyprefix);
+extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page,
+						 OffsetNumber offnum, AttrNumber *compareattr);
 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
-- 
2.40.1

