From eafb33b14c702a14b9357927dd2726b77eefc53b Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Mon, 6 May 2019 16:23:40 -0700
Subject: [PATCH v2] Don't leave behind junk nbtree pages during split.

Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR.  At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section.  This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions.  The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog().  There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.

Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes.  This happened at a point
that broke the rule around zeroing the right page in _bt_split().  If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents.  This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.

To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired.  It is now safe for _bt_truncate()
to throw an error -- a junk page/buffer will not be left behind.

Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
---
 src/backend/access/nbtree/nbtinsert.c | 202 +++++++++++++++-----------
 1 file changed, 114 insertions(+), 88 deletions(-)

diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index c6c952dd49..78afdc9866 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -49,8 +49,8 @@ static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
 			   OffsetNumber newitemoff,
 			   bool split_only_page);
 static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
-		  Buffer cbuf, OffsetNumber firstright, OffsetNumber newitemoff,
-		  Size newitemsz, IndexTuple newitem, bool newitemonleft);
+		  Buffer cbuf, OffsetNumber newitemoff, Size newitemsz,
+		  IndexTuple newitem);
 static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
 				  BTStack stack, bool is_root, bool is_only);
 static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
@@ -943,7 +943,6 @@ _bt_insertonpg(Relation rel,
 {
 	Page		page;
 	BTPageOpaque lpageop;
-	OffsetNumber firstright = InvalidOffsetNumber;
 	Size		itemsz;
 
 	page = BufferGetPage(buf);
@@ -979,7 +978,6 @@ _bt_insertonpg(Relation rel,
 	{
 		bool		is_root = P_ISROOT(lpageop);
 		bool		is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
-		bool		newitemonleft;
 		Buffer		rbuf;
 
 		/*
@@ -1000,14 +998,8 @@ _bt_insertonpg(Relation rel,
 		Assert(!(P_ISLEAF(lpageop) &&
 				 BlockNumberIsValid(RelationGetTargetBlock(rel))));
 
-		/* Choose the split point */
-		firstright = _bt_findsplitloc(rel, page,
-									  newitemoff, itemsz, itup,
-									  &newitemonleft);
-
 		/* split the buffer into left and right halves */
-		rbuf = _bt_split(rel, itup_key, buf, cbuf, firstright, newitemoff,
-						 itemsz, itup, newitemonleft);
+		rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup);
 		PredicateLockPageSplit(rel,
 							   BufferGetBlockNumber(buf),
 							   BufferGetBlockNumber(rbuf));
@@ -1211,9 +1203,8 @@ _bt_insertonpg(Relation rel,
  *	_bt_split() -- split a page in the btree.
  *
  *		On entry, buf is the page to split, and is pinned and write-locked.
- *		firstright is the item index of the first item to be moved to the
- *		new right page.  newitemoff etc. tell us about the new item that
- *		must be inserted along with the data from the old page.
+ *		newitemoff etc. tell us about the new item that must be inserted
+ *		along with the data from the original page.
  *
  *		itup_key is used for suffix truncation on leaf pages (internal
  *		page callers pass NULL).  When splitting a non-leaf page, 'cbuf'
@@ -1226,8 +1217,7 @@ _bt_insertonpg(Relation rel,
  */
 static Buffer
 _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
-		  OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz,
-		  IndexTuple newitem, bool newitemonleft)
+		  OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem)
 {
 	Buffer		rbuf;
 	Page		origpage;
@@ -1246,36 +1236,59 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
 	IndexTuple	item;
 	OffsetNumber leftoff,
 				rightoff;
+	OffsetNumber firstright;
 	OffsetNumber maxoff;
 	OffsetNumber i;
-	bool		isleaf;
+	bool		newitemonleft,
+				isleaf;
 	IndexTuple	lefthikey;
 	int			indnatts = IndexRelationGetNumberOfAttributes(rel);
 	int			indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
 
-	/* Acquire a new page to split into */
-	rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
-
 	/*
 	 * origpage is the original page to be split.  leftpage is a temporary
 	 * buffer that receives the left-sibling data, which will be copied back
 	 * into origpage on success.  rightpage is the new page that receives the
-	 * right-sibling data.  If we fail before reaching the critical section,
-	 * origpage hasn't been modified and leftpage is only workspace. In
-	 * principle we shouldn't need to worry about rightpage either, because it
-	 * hasn't been linked into the btree page structure; but to avoid leaving
-	 * possibly-confusing junk behind, we are careful to rewrite rightpage as
-	 * zeroes before throwing any error.
+	 * right-sibling data, which won't be allocated just yet.  It's okay if we
+	 * fail before reaching the point that rightpage is allocated, since
+	 * origpage won't have been modified and leftpage is only workspace.
 	 */
 	origpage = BufferGetPage(buf);
-	leftpage = PageGetTempPage(origpage);
-	rightpage = BufferGetPage(rbuf);
-
+	oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
 	origpagenumber = BufferGetBlockNumber(buf);
-	rightpagenumber = BufferGetBlockNumber(rbuf);
 
+	/*
+	 * Choose a point to split the original page at.
+	 *
+	 * A split point can be thought of as a point between two existing tuples
+	 * on the original page, provided you pretend that the new item that
+	 * didn't fit is already on the original page.
+	 *
+	 * firstright is the item index of the first item to be moved to the new
+	 * right page.  newitemonleft is needed to disambiguate which side of the
+	 * new item to split on in the case where firstright indicates split is
+	 * either to the immediate left or the immediate right of the new item.
+	 */
+	firstright = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
+								  newitem, &newitemonleft);
+
+	/* Allocate temp buffer for left page */
+	leftpage = PageGetTempPage(origpage);
 	_bt_pageinit(leftpage, BufferGetPageSize(buf));
-	/* rightpage was already initialized by _bt_getbuf */
+	lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
+
+	/*
+	 * leftpage won't be the root when we're done.  Also, clear the SPLIT_END
+	 * and HAS_GARBAGE flags.
+	 */
+	lopaque->btpo_flags = oopaque->btpo_flags;
+	lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
+	/* set flag in left page indicating that rightpage has no downlink yet */
+	lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
+	lopaque->btpo_prev = oopaque->btpo_prev;
+	/* handle btpo_next after rightpage allocated */
+	lopaque->btpo.level = oopaque->btpo.level;
+	/* Wait until both buffers are locked before initializing btpo_cycleid */
 
 	/*
 	 * Copy the original page's LSN into leftpage, which will become the
@@ -1283,62 +1296,12 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
 	 * examine the LSN and possibly dump it in a page image.
 	 */
 	PageSetLSN(leftpage, PageGetLSN(origpage));
-
-	/* init btree private data */
-	oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
-	lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
-	ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
-
 	isleaf = P_ISLEAF(oopaque);
 
-	/* if we're splitting this page, it won't be the root when we're done */
-	/* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
-	lopaque->btpo_flags = oopaque->btpo_flags;
-	lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
-	ropaque->btpo_flags = lopaque->btpo_flags;
-	/* set flag in left page indicating that the right page has no downlink */
-	lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
-	lopaque->btpo_prev = oopaque->btpo_prev;
-	lopaque->btpo_next = rightpagenumber;
-	ropaque->btpo_prev = origpagenumber;
-	ropaque->btpo_next = oopaque->btpo_next;
-	lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
-	/* Since we already have write-lock on both pages, ok to read cycleid */
-	lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
-	ropaque->btpo_cycleid = lopaque->btpo_cycleid;
-
-	/*
-	 * If the page we're splitting is not the rightmost page at its level in
-	 * the tree, then the first entry on the page is the high key for the
-	 * page.  We need to copy that to the right half.  Otherwise (meaning the
-	 * rightmost page case), all the items on the right half will be user
-	 * data.
-	 */
-	rightoff = P_HIKEY;
-
-	if (!P_RIGHTMOST(oopaque))
-	{
-		itemid = PageGetItemId(origpage, P_HIKEY);
-		itemsz = ItemIdGetLength(itemid);
-		item = (IndexTuple) PageGetItem(origpage, itemid);
-		Assert(BTreeTupleGetNAtts(item, rel) > 0);
-		Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts);
-		if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
-						false, false) == InvalidOffsetNumber)
-		{
-			memset(rightpage, 0, BufferGetPageSize(rbuf));
-			elog(ERROR, "failed to add hikey to the right sibling"
-				 " while splitting block %u of index \"%s\"",
-				 origpagenumber, RelationGetRelationName(rel));
-		}
-		rightoff = OffsetNumberNext(rightoff);
-	}
-
 	/*
 	 * The "high key" for the new left page will be the first key that's going
-	 * to go into the new right page, or possibly a truncated version if this
-	 * is a leaf page split.  This might be either the existing data item at
-	 * position firstright, or the incoming tuple.
+	 * to go into the new right page, or a truncated version if this is a leaf
+	 * page split.
 	 *
 	 * The high key for the left page is formed using the first item on the
 	 * right page, which may seem to be contrary to Lehman & Yao's approach of
@@ -1360,7 +1323,6 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
 	 * tuple could be physically larger despite being opclass-equal in respect
 	 * of all attributes prior to the heap TID attribute.)
 	 */
-	leftoff = P_HIKEY;
 	if (!newitemonleft && newitemoff == firstright)
 	{
 		/* incoming tuple will become first on right page */
@@ -1416,23 +1378,87 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
 	else
 		lefthikey = item;
 
+	/*
+	 * Add new high key to leftpage
+	 */
+	leftoff = P_HIKEY;
+
 	Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0);
 	Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts);
 	if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
 					false, false) == InvalidOffsetNumber)
-	{
-		memset(rightpage, 0, BufferGetPageSize(rbuf));
 		elog(ERROR, "failed to add hikey to the left sibling"
 			 " while splitting block %u of index \"%s\"",
 			 origpagenumber, RelationGetRelationName(rel));
-	}
 	leftoff = OffsetNumberNext(leftoff);
 	/* be tidy */
 	if (lefthikey != item)
 		pfree(lefthikey);
 
 	/*
-	 * Now transfer all the data items to the appropriate page.
+	 * Acquire a new right page to split into, now that left page has a new
+	 * high key.  From here on, it's not okay to throw an error without
+	 * zeroing rightpage image first.  Zeroing rightpage avoids confusing
+	 * VACUUM, which might otherwise try to re-find a downlink to a leftover
+	 * junk page.
+	 *
+	 * It would be reasonable to start the critical section here instead, but
+	 * it's better to delay doing so until the original page is actually
+	 * modified.  That way, failing to add new items to a page (a known
+	 * symptom of corruption) won't cause us to PANIC.
+	 */
+	rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+	rightpage = BufferGetPage(rbuf);
+	rightpagenumber = BufferGetBlockNumber(rbuf);
+	/* rightpage was initialized by _bt_getbuf */
+	ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
+
+	/* Since we have write-lock on both pages now, ok to read cycleid */
+	lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
+	ropaque->btpo_cycleid = lopaque->btpo_cycleid;
+	/* Set leftpage's sibling pointer to point to new rightpage */
+	lopaque->btpo_next = rightpagenumber;
+
+	/*
+	 * rightpage won't be the root when we're done.  Also, clear the SPLIT_END
+	 * and HAS_GARBAGE flags.
+	 */
+	ropaque->btpo_flags = oopaque->btpo_flags;
+	ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
+	ropaque->btpo_prev = origpagenumber;
+	ropaque->btpo_next = oopaque->btpo_next;
+	ropaque->btpo.level = oopaque->btpo.level;
+
+	/*
+	 * Add new high key to rightpage where necessary.
+	 *
+	 * If the page we're splitting is not the rightmost page at its level in
+	 * the tree, then the first entry on the page is the high key for the
+	 * page.  We need to copy that to the right half.
+	 */
+	rightoff = P_HIKEY;
+
+	if (!P_RIGHTMOST(oopaque))
+	{
+		itemid = PageGetItemId(origpage, P_HIKEY);
+		itemsz = ItemIdGetLength(itemid);
+		item = (IndexTuple) PageGetItem(origpage, itemid);
+		Assert(BTreeTupleGetNAtts(item, rel) > 0);
+		Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts);
+		if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
+						false, false) == InvalidOffsetNumber)
+		{
+			memset(rightpage, 0, BufferGetPageSize(rbuf));
+			elog(ERROR, "failed to add hikey to the right sibling"
+				 " while splitting block %u of index \"%s\"",
+				 origpagenumber, RelationGetRelationName(rel));
+		}
+		rightoff = OffsetNumberNext(rightoff);
+	}
+
+	/*
+	 * Now transfer all the data items (non-pivot tuples in isleaf case, or
+	 * additional pivot tuples in !isleaf case) to the appropriate page.
 	 *
 	 * Note: we *must* insert at least the right page's items in item-number
 	 * order, for the benefit of _bt_restore_page().
-- 
2.17.1

