From f2859bb02f1c238d3a15e3d9b2de6a01c3543953 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Sat, 4 May 2019 19:22:38 -0700
Subject: [PATCH] Remove downlink variable from nbtree stack.

---
 src/backend/access/nbtree/nbtinsert.c | 40 +++++++++++++++++++--------
 src/backend/access/nbtree/nbtpage.c   |  3 +-
 src/backend/access/nbtree/nbtsearch.c | 17 ++++--------
 src/include/access/nbtree.h           | 15 ++++------
 4 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 602f8849d4..059df74103 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -1795,7 +1795,6 @@ _bt_insert_parent(Relation rel,
 			stack = &fakestack;
 			stack->bts_blkno = BufferGetBlockNumber(pbuf);
 			stack->bts_offset = InvalidOffsetNumber;
-			stack->bts_btentry = InvalidBlockNumber;
 			stack->bts_parent = NULL;
 			_bt_relbuf(rel, pbuf);
 		}
@@ -1817,8 +1816,7 @@ _bt_insert_parent(Relation rel,
 		 * new downlink will be inserted at the correct offset. Even buf's
 		 * parent may have changed.
 		 */
-		stack->bts_btentry = bknum;
-		pbuf = _bt_getstackbuf(rel, stack);
+		pbuf = _bt_getstackbuf(rel, stack, bknum);
 
 		/*
 		 * Now we can unlock the right child. The left child will be unlocked
@@ -1897,13 +1895,31 @@ _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
 }
 
 /*
- *	_bt_getstackbuf() -- Walk back up the tree one step, and find the item
- *						 we last looked at in the parent.
+ *	_bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
+ *						 tuple whose downlink points to child page.
  *
- *		This is possible because we save the downlink from the parent item,
- *		which is enough to uniquely identify it.  Insertions into the parent
- *		level could cause the item to move right; deletions could cause it
- *		to move left, but not left of the page we previously found it in.
+ *		Caller passes child's block number, which is enough to uniquely
+ *		identify it using a linear search that matches it to a downlink in
+ *		parent.  Details of the parent are taken from stack for parent
+ *		level, which was stashed during initial descent.
+ *
+ *		The details of the parent page/downlink from stack are inherently
+ *		approximate, though child's downlink can typically be relocated very
+ *		quickly.  Insertions into the parent level could cause the pivot
+ *		tuple to move right; deletions could cause it to move left, but not
+ *		left of the page we previously found it on.
+ *
+ *		Note also that it's even possible that located pivot tuple is not
+ *		even _bt_compare()-equal to the pivot tuple whose downlink was
+ *		originally followed.  For example, caller may have had to move right
+ *		or step right earlier, and will now require a downlink to a child
+ *		page that is actually a sibling of the page encountered when
+ *		downlink was followed during descent.
+ *
+ *		Caller should have a lock on child's buffer, too, since caller's
+ *		stack might otherwise go stale immediately. (Page deletion caller
+ *		can get away with a lock on leaf level page when locating topparent
+ *		downlink, though.)
  *
  *		Adjusts bts_blkno & bts_offset if changed.
  *
@@ -1911,7 +1927,7 @@ _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
  *		(should not happen).
  */
 Buffer
-_bt_getstackbuf(Relation rel, BTStack stack)
+_bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child)
 {
 	BlockNumber blkno;
 	OffsetNumber start;
@@ -1973,7 +1989,7 @@ _bt_getstackbuf(Relation rel, BTStack stack)
 				itemid = PageGetItemId(page, offnum);
 				item = (IndexTuple) PageGetItem(page, itemid);
 
-				if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry)
+				if (BTreeInnerTupleGetDownLink(item) == child)
 				{
 					/* Return accurate pointer to where link is now */
 					stack->bts_blkno = blkno;
@@ -1989,7 +2005,7 @@ _bt_getstackbuf(Relation rel, BTStack stack)
 				itemid = PageGetItemId(page, offnum);
 				item = (IndexTuple) PageGetItem(page, itemid);
 
-				if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry)
+				if (BTreeInnerTupleGetDownLink(item) == child)
 				{
 					/* Return accurate pointer to where link is now */
 					stack->bts_blkno = blkno;
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 50455db9af..2a65db3613 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -1228,8 +1228,7 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
 	 * non-unique high keys in leaf level pages.  Even heapkeyspace indexes
 	 * can have a stale stack due to insertions into the parent.
 	 */
-	stack->bts_btentry = child;
-	pbuf = _bt_getstackbuf(rel, stack);
+	pbuf = _bt_getstackbuf(rel, stack, child);
 	if (pbuf == InvalidBuffer)
 		elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
 			 RelationGetRelationName(rel), child);
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index c655dadb96..cc6ccf2b4c 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -148,21 +148,16 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
 		/*
 		 * We need to save the location of the index entry we chose in the
 		 * parent page on a stack. In case we split the tree, we'll use the
-		 * stack to work back up to the parent page.  We also save the actual
-		 * downlink (block) to uniquely identify the index entry, in case it
-		 * moves right while we're working lower in the tree.  See the paper
-		 * by Lehman and Yao for how this is detected and handled. (We use the
-		 * child link during the second half of a page split -- if caller ends
-		 * up splitting the child it usually ends up inserting a new pivot
-		 * tuple for child's new right sibling immediately after the original
-		 * bts_offset offset recorded here.  The downlink block will be needed
-		 * to check if bts_offset remains the position of this same pivot
-		 * tuple.)
+		 * stack to work back up to the parent page.  We use the child block
+		 * number (or possibly the block number of a page to its right)
+		 * during the second half of a page split -- if caller ends up
+		 * splitting the child it usually ends up inserting a new pivot tuple
+		 * for child's new right sibling immediately after the original
+		 * bts_offset offset recorded here.
 		 */
 		new_stack = (BTStack) palloc(sizeof(BTStackData));
 		new_stack->bts_blkno = par_blkno;
 		new_stack->bts_offset = offnum;
-		new_stack->bts_btentry = blkno;
 		new_stack->bts_parent = stack_in;
 
 		/*
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index a3583f225b..ad7963cba8 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -403,20 +403,15 @@ typedef struct BTMetaPageData
 #define BT_WRITE		BUFFER_LOCK_EXCLUSIVE
 
 /*
- *	BTStackData -- As we descend a tree, we push the (location, downlink)
- *	pairs from internal pages onto a private stack.  If we split a
- *	leaf, we use this stack to walk back up the tree and insert data
- *	into parent pages (and possibly to split them, too).  Lehman and
- *	Yao's update algorithm guarantees that under no circumstances can
- *	our private stack give us an irredeemably bad picture up the tree.
- *	Again, see the paper for details.
+ * BTStackData -- As we descend a tree, we push the location of pivot
+ * tuples whose downlinks are followed from internal pages onto a private
+ * stack.  If we split a leaf, we use this stack to walk back up the tree
+ * and insert data into parent pages (and possibly to split them, too).
  */
-
 typedef struct BTStackData
 {
 	BlockNumber bts_blkno;
 	OffsetNumber bts_offset;
-	BlockNumber bts_btentry;
 	struct BTStackData *bts_parent;
 } BTStackData;
 
@@ -731,7 +726,7 @@ extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
  */
 extern bool _bt_doinsert(Relation rel, IndexTuple itup,
 						 IndexUniqueCheck checkUnique, Relation heapRel);
-extern Buffer _bt_getstackbuf(Relation rel, BTStack stack);
+extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child);
 extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack);
 
 /*
-- 
2.17.1

