From 5c35e641de47fa23ed86924c20076919321b6444 Mon Sep 17 00:00:00 2001
From: Floris van Nee <florisvannee@optiver.com>
Date: Tue, 9 Mar 2021 14:21:00 +0100
Subject: [PATCH] Allow inserting tuples into almost-empty pages

When selecting a new page to insert tuples in, the FSM was not
always taken into account. This happens when the tuple to be
inserted is larger than the maximum number of bytes that should
be on the page according to the fill factor. This leads to ever-
growing tables when large tuples are frequently updated, as
these will always be put on a new page. This patch addresses the
bug by properly selecting empty pages for reuse.
---
 src/backend/access/heap/hio.c | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
index 75223c9bea..4433e63b30 100644
--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@@ -335,7 +335,8 @@ RelationGetBufferForTuple(Relation relation, Size len,
 	Buffer		buffer = InvalidBuffer;
 	Page		page;
 	Size		pageFreeSpace = 0,
-				saveFreeSpace = 0;
+				saveFreeSpace = 0,
+				targetFreeSpace = 0;
 	BlockNumber targetBlock,
 				otherBlock;
 	bool		needLock;
@@ -358,6 +359,22 @@ RelationGetBufferForTuple(Relation relation, Size len,
 	saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
 												   HEAP_DEFAULT_FILLFACTOR);
 
+	if (len + saveFreeSpace > MaxHeapTupleSize)
+	{
+		/*
+		 * We want to allow inserting a large tuple into an empty page
+		 * even if that would violate the fillfactor. Otherwise, we would
+		 * unnecessarily extend the relation. Instead, ask the FSM for
+		 * MaxHeapTupleSize bytes, minus a small amount of slack. This
+		 * will allow it to return a page that is not quite empty because
+		 * of free line pointers. We allow 2% slack, which is somewhat
+		 * arbitrary.
+		 */
+		targetFreeSpace = Max(len, MaxHeapTupleSize - (MaxHeapTupleSize * 2 / 100));
+	}
+	else
+		targetFreeSpace = len + saveFreeSpace;
+
 	if (otherBuffer != InvalidBuffer)
 		otherBlock = BufferGetBlockNumber(otherBuffer);
 	else
@@ -376,13 +393,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
 	 * When use_fsm is false, we either put the tuple onto the existing target
 	 * page or extend the relation.
 	 */
-	if (len + saveFreeSpace > MaxHeapTupleSize)
-	{
-		/* can't fit, don't bother asking FSM */
-		targetBlock = InvalidBlockNumber;
-		use_fsm = false;
-	}
-	else if (bistate && bistate->current_buf != InvalidBuffer)
+	if (bistate && bistate->current_buf != InvalidBuffer)
 		targetBlock = BufferGetBlockNumber(bistate->current_buf);
 	else
 		targetBlock = RelationGetTargetBlock(relation);
@@ -393,7 +404,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
 		 * We have no cached target page, so ask the FSM for an initial
 		 * target.
 		 */
-		targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
+		targetBlock = GetPageWithFreeSpace(relation, targetFreeSpace);
 	}
 
 	/*
@@ -517,7 +528,7 @@ loop:
 		}
 
 		pageFreeSpace = PageGetHeapFreeSpace(page);
-		if (len + saveFreeSpace <= pageFreeSpace)
+		if (targetFreeSpace <= pageFreeSpace)
 		{
 			/* use this page as future insert target, too */
 			RelationSetTargetBlock(relation, targetBlock);
@@ -550,7 +561,7 @@ loop:
 		targetBlock = RecordAndGetPageWithFreeSpace(relation,
 													targetBlock,
 													pageFreeSpace,
-													len + saveFreeSpace);
+													targetFreeSpace);
 	}
 
 	/*
@@ -582,7 +593,7 @@ loop:
 			 * Check if some other backend has extended a block for us while
 			 * we were waiting on the lock.
 			 */
-			targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
+			targetBlock = GetPageWithFreeSpace(relation, targetFreeSpace);
 
 			/*
 			 * If some other waiter has already extended the relation, we
-- 
2.29.2

