From 1314d4be1d0d0b5ab472884a01804ae910e3c610 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Sun, 31 Aug 2025 10:00:24 +0500
Subject: [PATCH v2 2/2] btree: Add scan abort mechanism for page merge with
 tuple movement

When B-tree page merge moves tuples between pages, concurrent scans can
miss tuples or see duplicates. This adds a BTP_HAD_TUPLES_MOVED flag to
mark pages that had tuples moved during merge, and aborts scans that
encounter such deleted pages with a serialization failure error.

The default mergefactor is set to 0 to disable merging by default,
ensuring no behavior change unless explicitly configured. Includes
TAP test demonstrating the scan abort mechanism.
---
 src/backend/access/nbtree/nbtpage.c           |  4 ++
 src/backend/access/nbtree/nbtsearch.c         | 61 ++++++++++++++++++-
 src/include/access/nbtree.h                   |  4 +-
 .../t/008_btree_merge_scan_correctness.pl     | 37 ++++++-----
 4 files changed, 83 insertions(+), 23 deletions(-)

diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 792bc52558b..664f5f62b6b 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -2818,6 +2818,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 		Page		leafpage = BufferGetPage(leafbuf);
 		Page		rightpage = BufferGetPage(rbuf);
 		BTPageOpaque rightopaque = BTPageGetOpaque(rightpage);
+		BTPageOpaque leafopaque = BTPageGetOpaque(leafpage);
 		OffsetNumber insert_at = P_FIRSTDATAKEY(rightopaque);
 		int			i;
 
@@ -2838,6 +2839,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 
 		elog(DEBUG1, "Page merge completed in index \"%s\": moved %d tuples from page %u to page %u",
 			 RelationGetRelationName(rel), merge_ntuples, leafblkno, rightsib);
+
+		/* Mark that this page had tuples moved for scan detection */
+		leafopaque->btpo_flags |= BTP_HAD_TUPLES_MOVED;
 	}
 
 	/*
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 6ff31f87033..28ce65faa71 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -2223,6 +2223,8 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 		Assert(!scan->parallel_scan);
 	}
 
+
+
 	BTScanPosUnpinIfPinned(so->currPos);
 
 	/*
@@ -2233,11 +2235,11 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 	 */
 	{
 		Relation rel = scan->indexRelation;
-		BlockNumber blkno = so->currPos.currPage;
+		BlockNumber ip_blkno = so->currPos.currPage;
 		/* Only pause scans on our test index (btree_test_idx has OID around 16400+) */
-		if (rel && RelationGetRelid(rel) > 16384 && blkno == 20)
+		if (rel && RelationGetRelid(rel) > 16384 && ip_blkno == 20)
 		{
-			INJECTION_POINT("btree-scan-between-pages", &blkno);
+			INJECTION_POINT("btree-scan-between-pages", &ip_blkno);
 		}
 	}
 
@@ -2446,6 +2448,19 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
 		page = BufferGetPage(so->currPos.buf);
 		opaque = BTPageGetOpaque(page);
 		lastcurrblkno = blkno;
+
+		/*
+		 * Check if this is a deleted page that had tuples moved during merge.
+		 * If so, abort the scan to prevent incorrect results.
+		 */
+		if (P_ISDELETED(opaque) && P_HAD_TUPLES_MOVED(opaque))
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+					 errmsg("scan aborted due to concurrent page merge with tuple movement"),
+					 errhint("Retry the operation.")));
+		}
+
 		if (likely(!P_IGNORE(opaque)))
 		{
 			/* see if there are any matches on this page */
@@ -2549,6 +2564,20 @@ _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
 				/* Found desired page, return it */
 				return buf;
 			}
+
+			/*
+			 * Check if this is a deleted page that had tuples moved during merge.
+			 * If so, abort the scan to prevent incorrect results.
+			 */
+			if (P_ISDELETED(opaque) && P_HAD_TUPLES_MOVED(opaque))
+			{
+				_bt_relbuf(rel, buf);
+				ereport(ERROR,
+						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+						 errmsg("scan aborted due to concurrent page merge with tuple movement"),
+						 errhint("Retry the operation.")));
+			}
+
 			if (P_RIGHTMOST(opaque) || ++tries > 4)
 				break;
 			/* step right */
@@ -2568,6 +2597,19 @@ _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
 		opaque = BTPageGetOpaque(page);
 		if (P_ISDELETED(opaque))
 		{
+			/*
+			 * Check if this is a deleted page that had tuples moved during merge.
+			 * If so, abort the scan to prevent incorrect results.
+			 */
+			if (P_HAD_TUPLES_MOVED(opaque))
+			{
+				_bt_relbuf(rel, buf);
+				ereport(ERROR,
+						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+						 errmsg("scan aborted due to concurrent page merge with tuple movement"),
+						 errhint("Retry the operation.")));
+			}
+
 			/*
 			 * It was deleted.  Move right to first nondeleted page (there
 			 * must be one); that is the page that has acquired the deleted
@@ -2585,6 +2627,19 @@ _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
 				opaque = BTPageGetOpaque(page);
 				if (!P_ISDELETED(opaque))
 					break;
+
+				/*
+				 * Check if this is a deleted page that had tuples moved during merge.
+				 * If so, abort the scan to prevent incorrect results.
+				 */
+				if (P_HAD_TUPLES_MOVED(opaque))
+				{
+					_bt_relbuf(rel, buf);
+					ereport(ERROR,
+							(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+							 errmsg("scan aborted due to concurrent page merge with tuple movement"),
+							 errhint("Retry the operation.")));
+				}
 			}
 		}
 		else
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 6fd1bcd142d..041bf0596ae 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -83,6 +83,7 @@ typedef BTPageOpaqueData *BTPageOpaque;
 #define BTP_HAS_GARBAGE (1 << 6)	/* page has LP_DEAD tuples (deprecated) */
 #define BTP_INCOMPLETE_SPLIT (1 << 7)	/* right sibling's downlink is missing */
 #define BTP_HAS_FULLXID	(1 << 8)	/* contains BTDeletedPageData */
+#define BTP_HAD_TUPLES_MOVED (1 << 9)	/* page was deleted after moving tuples */
 
 /*
  * The max allowed value of a cycle ID is a bit less than 64K.  This is
@@ -201,7 +202,7 @@ typedef struct BTMetaPageData
 #define BTREE_DEFAULT_FILLFACTOR	90
 #define BTREE_NONLEAF_FILLFACTOR	70
 #define BTREE_SINGLEVAL_FILLFACTOR	96
-#define BTREE_DEFAULT_MERGEFACTOR	5
+#define BTREE_DEFAULT_MERGEFACTOR	0	/* Disabled by default for safety */
 
 /*
  *	In general, the btree code tries to localize its knowledge about
@@ -228,6 +229,7 @@ typedef struct BTMetaPageData
 #define P_HAS_GARBAGE(opaque)	(((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 #define P_INCOMPLETE_SPLIT(opaque)	(((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 #define P_HAS_FULLXID(opaque)	(((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
+#define P_HAD_TUPLES_MOVED(opaque) (((opaque)->btpo_flags & BTP_HAD_TUPLES_MOVED) != 0)
 
 /*
  * BTDeletedPageData is the page contents of a deleted page
diff --git a/src/test/modules/test_misc/t/008_btree_merge_scan_correctness.pl b/src/test/modules/test_misc/t/008_btree_merge_scan_correctness.pl
index abb40f6a4ff..ca06cd2cf28 100644
--- a/src/test/modules/test_misc/t/008_btree_merge_scan_correctness.pl
+++ b/src/test/modules/test_misc/t/008_btree_merge_scan_correctness.pl
@@ -76,7 +76,7 @@ $node->safe_psql('postgres', q{
 my $forward_scan = $node->background_psql('postgres');
 my $backward_scan = $node->background_psql('postgres');
 
-# Start queries without waiting for completion (they'll hang at injection point)
+# Start queries without waiting for completion (they should abort with serialization error)
 $forward_scan->query_until(qr/starting forward scan/, q{
 	SET enable_seqscan = off;
 	SET enable_bitmapscan = off;
@@ -100,9 +100,13 @@ sleep(1);
 
 # Run VACUUM while scans are paused - this may trigger page merge
 $node->safe_psql('postgres', q{
+	SET client_min_messages TO DEBUG1;
 	VACUUM btree_test;
 });
 
+# Get current log position to check for new errors
+my $log_offset = -s $node->logfile;
+
 # Release waiting scans
 $node->safe_psql('postgres', q{
 	SELECT injection_points_detach('btree-scan-between-pages');
@@ -110,28 +114,23 @@ $node->safe_psql('postgres', q{
 	SELECT injection_points_wakeup('btree-scan-between-pages');
 });
 
-$forward_scan->quit;
-$backward_scan->quit;
-
-# Analyze results for scan correctness issues
-my $expected_count = $node->safe_psql('postgres', 
-	'SELECT count(*) FROM btree_test');
+# Wait for scans to abort with serialization errors
+$node->wait_for_log('scan aborted due to concurrent page merge with tuple movement',
+	$log_offset);
 
-my $forward_count = $node->safe_psql('postgres',
-	'SELECT count(*) FROM forward_results');
+# Clean up background processes - they should have failed
+$forward_scan->{run}->finish;
+$backward_scan->{run}->finish;
 
-my $backward_count = $node->safe_psql('postgres', 
-	'SELECT count(*) FROM backward_results');
+$node->stop('fast');
 
-# Report results
-note("Expected rows: $expected_count");
-note("Forward scan rows: $forward_count");
-note("Backward scan rows: $backward_count");
+# Verify that scans were aborted by checking the log file
+my $log_contents = slurp_file($node->logfile);
+my $error_count = () = $log_contents =~ /scan aborted due to concurrent page merge with tuple movement/g;
 
-# Test assertions - these should fail with page merge, showing the race condition
-is($forward_count, $expected_count, 'Forward scan returns correct count');
-is($backward_count, $expected_count, 'Backward scan returns correct count');
+note("Found $error_count scan abort errors in log");
 
-$node->stop('fast');
+# We should see at least two scan abort error (possibly two, one for each scan)
+ok($error_count >= 2, 'At least twp scan was aborted due to tuple movement');
 
 done_testing();
\ No newline at end of file
-- 
2.39.5 (Apple Git-154)

