From 6bf12b80259c48ee613a56e85f37ebb73cfdf819 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Tue, 30 Apr 2019 10:55:52 -0700
Subject: [PATCH v2] Fix nbtsort.c's page space accounting.

Commit dd299df8189 failed to have nbtsort.c conservatively assume that
suffix truncation was ineffective (i.e. that it would represent heap TID
in new high key).  Space for a possible heap TID in new leaf page high
key was budgeted within _bt_check_third_page(), but nbtsort.c's
definition of when a page is completely full didn't explicitly consider
high keys where heap TID is represented.  When the page was deemed full,
it might already be too late: there might be insufficient space, because
the last existing non-pivot tuple on page gets replaced with new high
key that is slightly larger (larger by the space required to store a
MAXALIGN()'d heap TID item pointer).

To fix, bring nbtsort.c in line with nbtsplitloc.c, which already
explicitly assumes that new high key will need to have a heap TID added
when high key is formed on the leaf level.

Reported-By: Andreas Joseph Krogh
Discussion: https://postgr.es/m/VisenaEmail.c5.3ee7fe277d514162.16a6d785bea@tc7-visena
---
 src/backend/access/nbtree/nbtsort.c | 44 ++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 9ac4c1e1c0..7db468bb58 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -841,6 +841,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 	OffsetNumber last_off;
 	Size		pgspc;
 	Size		itupsz;
+	bool		isleaf;
 
 	/*
 	 * This is a handy place to check for cancel interrupts during the btree
@@ -855,9 +856,13 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 	pgspc = PageGetFreeSpace(npage);
 	itupsz = IndexTupleSize(itup);
 	itupsz = MAXALIGN(itupsz);
+	/* Leaf pages use suffix truncation, and need extra heap TID space */
+	isleaf = state->btps_level == 0;
 
 	/*
-	 * Check whether the item can fit on a btree page at all.
+	 * Check whether the item can fit on page, while making sure that page has
+	 * at least two tuples (in addition to page high key) before starting next
+	 * page.
 	 *
 	 * Every newly built index will treat heap TID as part of the keyspace,
 	 * which imposes the requirement that new high keys must occasionally have
@@ -870,16 +875,27 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 	 * the reserved space.  This should never fail on internal pages.
 	 */
 	if (unlikely(itupsz > BTMaxItemSize(npage)))
-		_bt_check_third_page(wstate->index, wstate->heap,
-							 state->btps_level == 0, npage, itup);
+		_bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
+							 itup);
 
 	/*
-	 * Check to see if page is "full".  It's definitely full if the item won't
-	 * fit.  Otherwise, compare to the target freespace derived from the
-	 * fillfactor.  However, we must put at least two items on each page, so
-	 * disregard fillfactor if we don't have that many.
+	 * Page is definitely full if the new item won't fit.  We take into
+	 * account the possible need for heap TID space within _bt_truncate() when
+	 * page is a leaf page.  It is guaranteed that we can fit at least 2
+	 * non-pivot tuples plus a high key with heap TID when finishing off a
+	 * leaf page, because _bt_check_third_page() conservatively rejects
+	 * oversized non-pivot tuples. (On internal pages we can always fit 3
+	 * pivot tuples, including high key.)
+	 *
+	 * Most of the time, page is only "full" in the sense that inserting new
+	 * tuple would cause us to exceed fillfactor-wise limit (no need to take
+	 * heap TID space into account in this soft limit).  However, we must
+	 * always leave at least two items plus high key on each page before
+	 * starting a new page, so disregard fillfactor if we don't have enough
+	 * items to make that work yet.
 	 */
-	if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY))
+	if (pgspc - (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) < itupsz ||
+		(pgspc < state->btps_full && last_off > P_FIRSTKEY))
 	{
 		/*
 		 * Finish off the page and write it out.
@@ -889,7 +905,6 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 		ItemId		ii;
 		ItemId		hii;
 		IndexTuple	oitup;
-		BTPageOpaque opageop = (BTPageOpaque) PageGetSpecialPointer(opage);
 
 		/* Create new page of same level */
 		npage = _bt_blnewpage(state->btps_level);
@@ -917,7 +932,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 		ItemIdSetUnused(ii);	/* redundant */
 		((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
 
-		if (P_ISLEAF(opageop))
+		if (isleaf)
 		{
 			IndexTuple	lastleft;
 			IndexTuple	truncated;
@@ -944,8 +959,9 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 			 * to actually save space on the leaf page).  We delete the
 			 * original high key, and add our own truncated high key at the
 			 * same offset.  It's okay if the truncated tuple is slightly
-			 * larger due to containing a heap TID value, since this case is
-			 * known to _bt_check_third_page(), which reserves space.
+			 * larger due to containing a heap TID value, since that was taken
+			 * into account when we determined that page is full; the extra
+			 * space must already be available on page.
 			 *
 			 * Note that the page layout won't be changed very much.  oitup is
 			 * already located at the physical beginning of tuple space, so we
@@ -979,9 +995,9 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 		Assert((BTreeTupleGetNAtts(state->btps_minkey, wstate->index) <=
 				IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
 				BTreeTupleGetNAtts(state->btps_minkey, wstate->index) > 0) ||
-			   P_LEFTMOST(opageop));
+			   P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
 		Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) == 0 ||
-			   !P_LEFTMOST(opageop));
+			   !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
 		BTreeInnerTupleSetDownLink(state->btps_minkey, oblkno);
 		_bt_buildadd(wstate, state->btps_next, state->btps_minkey);
 		pfree(state->btps_minkey);
-- 
2.17.1

