From b297dc807dd2b59a86734d8fb98f03429c9113cc Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Wed, 20 Aug 2025 23:24:36 +0500
Subject: [PATCH v1] btree: Add page merge during vacuum to reduce index bloat

Implement automatic merging of nearly-empty B-tree leaf pages during vacuum.
When a page exceeds the mergefactor threshold (default 5% = 95% empty), move
remaining tuples to the right sibling and delete the source page, reducing
index bloat.

Changes:
- Add mergefactor reloption (0-50%, default 5%) for configurable merge threshold
- Detect mergefactor-empty pages in btvacuumpage() for merge consideration
- Add merge logic in _bt_unlink_halfdead_page() with tuple movement
- Perform feasibility checks for right sibling space and level
- Handle WAL replay by reading tuples from target page before reinit
- Skip merge when vacuum deletions are pending to avoid WAL conflicts
- Update assertions to allow non-empty pages for merge scenarios

The implementation maintains crash safety through existing critical sections
and WAL logging, preserves B-tree sort order, and coordinates with vacuum
operations to prevent offset conflicts during standby replay.

Usage: CREATE INDEX ON table (col) WITH (mergefactor=10);
---
 src/backend/access/common/reloptions.c |  10 +
 src/backend/access/nbtree/nbtpage.c    | 275 +++++++++++++++++++++++--
 src/backend/access/nbtree/nbtree.c     |  30 ++-
 src/backend/access/nbtree/nbtutils.c   |   3 +-
 src/backend/access/nbtree/nbtxlog.c    |  82 +++++++-
 src/include/access/nbtree.h            |   8 +
 src/include/access/nbtxlog.h           |   4 +-
 7 files changed, 390 insertions(+), 22 deletions(-)

diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 0af3fea68fa..b16fd32dc9b 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -192,6 +192,16 @@ static relopt_int intRelOpts[] =
 		},
 		BTREE_DEFAULT_FILLFACTOR, BTREE_MIN_FILLFACTOR, 100
 	},
+	{
+		{
+			"mergefactor",
+			"Minimum percentage of free space to trigger btree page merge during vacuum",
+			RELOPT_KIND_BTREE,
+			ShareUpdateExclusiveLock	/* since it applies only to vacuum
+										 * operations */
+		},
+		BTREE_DEFAULT_MERGEFACTOR, 0, 50
+	},
 	{
 		{
 			"fillfactor",
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index c79dd38ee18..792bc52558b 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -1805,6 +1805,9 @@ _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
 	bool		rightsib_empty;
 	Page		page;
 	BTPageOpaque opaque;
+	OffsetNumber maxoff;
+	OffsetNumber minoff;
+	bool		page_has_tuples;
 
 	/*
 	 * Save original leafbuf block number from caller.  Only deleted blocks
@@ -1876,8 +1879,12 @@ _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
 
 		/*
 		 * We can never delete rightmost pages nor root pages.  While at it,
-		 * check that page is empty, since it's possible that the leafbuf page
-		 * was empty a moment ago, but has since had some inserts.
+		 * check that page is empty or nearly empty, since it's
+		 * possible that the leafbuf page was empty a moment ago, but has since
+		 * had some inserts.
+		 *
+		 * For pages that have tuples, attempt to move them to the right sibling
+		 * if there's enough space. This enables merging of nearly-empty pages.
 		 *
 		 * To keep the algorithm simple, we also never delete an incompletely
 		 * split page (they should be rare enough that this doesn't make any
@@ -1893,9 +1900,11 @@ _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
 		 * we know we stepped right from a page that passed these tests, so
 		 * it's OK.
 		 */
-		if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
-			P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
-			P_INCOMPLETE_SPLIT(opaque))
+		maxoff = PageGetMaxOffsetNumber(page);
+		minoff = P_FIRSTDATAKEY(opaque);
+		page_has_tuples = (minoff <= maxoff);
+
+		if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_INCOMPLETE_SPLIT(opaque))
 		{
 			/* Should never fail to delete a half-dead page */
 			Assert(!P_ISHALFDEAD(opaque));
@@ -1904,6 +1913,88 @@ _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
 			return;
 		}
 
+		/*
+		 * If page has tuples, check if they can be merged with right sibling.
+		 * Actual merge will be done later in _bt_unlink_halfdead_page() within
+		 * the same WAL record as the page deletion for atomicity.
+		 */
+		if (page_has_tuples)
+		{
+			BlockNumber right_sibling = opaque->btpo_next;
+			Buffer		rbuf;
+			Page		rpage;
+			BTPageOpaque ropaque;
+			Size		space_needed = 0;
+			OffsetNumber i;
+			bool		merge_feasible = false;
+
+			/* Calculate total space needed for all tuples */
+			for (i = minoff; i <= maxoff; i++)
+			{
+				ItemId		itemid = PageGetItemId(page, i);
+				space_needed += ItemIdGetLength(itemid) + sizeof(ItemIdData);
+			}
+
+			/* Check if merge with right sibling is feasible */
+			if (right_sibling != P_NONE)
+			{
+				rbuf = _bt_getbuf(rel, right_sibling, BT_READ);
+				rpage = BufferGetPage(rbuf);
+				ropaque = BTPageGetOpaque(rpage);
+
+				/* Verify right sibling is a leaf page at same level */
+				if (P_ISLEAF(ropaque) && ropaque->btpo_level == opaque->btpo_level &&
+					!P_ISDELETED(ropaque) && !P_ISHALFDEAD(ropaque))
+				{
+					Size freespace = PageGetFreeSpace(rpage);
+
+					if (freespace >= space_needed)
+					{
+						merge_feasible = true;
+						elog(DEBUG1, "Page merge feasible for index \"%s\": can move %d tuples from page %u to page %u (need %zu bytes, have %zu bytes free)",
+							 RelationGetRelationName(rel), (int)(maxoff - minoff + 1),
+							 BufferGetBlockNumber(leafbuf), right_sibling, space_needed, freespace);
+					}
+					else
+					{
+						elog(DEBUG1, "Page merge not feasible for index \"%s\": right sibling page %u has insufficient space (need %zu bytes, have %zu bytes free)",
+							 RelationGetRelationName(rel), right_sibling, space_needed, freespace);
+					}
+				}
+				else
+				{
+					elog(DEBUG1, "Page merge not feasible for index \"%s\": right sibling page %u is not suitable (is_leaf=%d, deleted=%d, halfdead=%d, level=%d vs %d)",
+						 RelationGetRelationName(rel), right_sibling, P_ISLEAF(ropaque),
+						 P_ISDELETED(ropaque), P_ISHALFDEAD(ropaque), ropaque->btpo_level, opaque->btpo_level);
+				}
+
+				_bt_relbuf(rel, rbuf);
+			}
+			else
+			{
+				elog(DEBUG1, "Page merge not feasible for index \"%s\": page %u has no right sibling",
+					 RelationGetRelationName(rel), BufferGetBlockNumber(leafbuf));
+			}
+
+			/* If merge is not feasible, abort the page deletion */
+			if (!merge_feasible)
+			{
+				_bt_relbuf(rel, leafbuf);
+				return;
+			}
+
+			/*
+			 * Merge is feasible. Mark page as candidates for merge and proceed
+			 * with deletion. The actual merge will happen in _bt_unlink_halfdead_page().
+			 */
+		}
+
+		/*
+		 * Page might not be empty if we're doing a merge, but we've validated
+		 * that merge is feasible. Proceed with deletion - the actual merge will
+		 * happen in _bt_unlink_halfdead_page().
+		 */
+
 		/*
 		 * First, remove downlink pointing to the page (or a parent of the
 		 * page, if we are going to delete a taller subtree), and mark the
@@ -2104,9 +2195,21 @@ _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf,
 	page = BufferGetPage(leafbuf);
 	opaque = BTPageGetOpaque(page);
 
+	/*
+	 * Assert page is suitable for deletion. Page must be a leaf, not root,
+	 * not rightmost, and not ignored. For traditional deletion, page must be
+	 * empty. For merge scenarios (nearly empty pages), page may have tuples.
+	 */
 	Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
-		   P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
-		   P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
+		   P_ISLEAF(opaque) && !P_IGNORE(opaque));
+
+	/*
+	 * Page should be either empty (traditional deletion) or nearly empty
+	 * with tuples that can be merged (new merge feature).
+	 */
+	Assert(P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page) ||
+		   (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) &&
+			PageGetFreeSpace(page) >= ((BLCKSZ - SizeOfPageHeaderData) * BTGetMergeFactor(rel)) / 100));
 	Assert(heaprel != NULL);
 
 	/*
@@ -2231,7 +2334,13 @@ _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf,
 	opaque = BTPageGetOpaque(page);
 	opaque->btpo_flags |= BTP_HALF_DEAD;
 
-	Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
+	/*
+	 * Page should only have high key in traditional deletion scenario.
+	 * In merge scenarios, page may have data tuples that will be moved later.
+	 */
+	Assert(PageGetMaxOffsetNumber(page) == P_HIKEY ||
+		   (PageGetMaxOffsetNumber(page) > P_HIKEY &&
+			PageGetFreeSpace(page) >= ((BLCKSZ - SizeOfPageHeaderData) * BTGetMergeFactor(rel)) / 100));
 	MemSet(&trunctuple, 0, sizeof(IndexTupleData));
 	trunctuple.t_info = sizeof(IndexTupleData);
 	if (topparent != leafblkno)
@@ -2333,9 +2442,19 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	FullTransactionId safexid;
 	bool		rightsib_is_rightmost;
 	uint32		targetlevel;
+	IndexTuple	tuple;
+	Size		tupsz;
+	int			idx;
 	IndexTuple	leafhikey;
 	BlockNumber leaftopparent;
 
+	/* Variables for page merge functionality */
+	IndexTuple	*merge_tuples = NULL;
+	Size		*merge_tupsz = NULL;
+	OffsetNumber *merge_deletable = NULL;
+	int			merge_ntuples = 0;
+	bool		do_merge = false;
+
 	page = BufferGetPage(leafbuf);
 	opaque = BTPageGetOpaque(page);
 
@@ -2480,11 +2599,24 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 
 	if (target == leafblkno)
 	{
-		if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
-			!P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
+		/*
+		 * Validate leaf page status. Page must be leaf and half-dead.
+		 * For traditional deletion, page must be empty. For merge scenarios,
+		 * page may have data tuples that will be moved to right sibling.
+		 */
+		if (!P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
 			elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"",
 				 target, RelationGetRelationName(rel));
 
+		/* Check if page is empty (traditional deletion) or suitable for merge */
+		if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
+		{
+			/* Page has tuples - validate it's suitable for merge */
+			if (PageGetFreeSpace(page) < ((BLCKSZ - SizeOfPageHeaderData) * BTGetMergeFactor(rel)) / 100)
+				elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"",
+					 target, RelationGetRelationName(rel));
+		}
+
 		/* Leaf page is also target page: don't set leaftopparent */
 		leaftopparent = InvalidBlockNumber;
 	}
@@ -2591,6 +2723,71 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	 * Here we begin doing the deletion.
 	 */
 
+	/*
+	 * Prepare merge data before critical section if needed.
+	 * We'll use local variables and perform the merge within the critical section.
+	 */
+
+	if (target == leafblkno)
+	{
+		Page		leafpage = BufferGetPage(leafbuf);
+		BTPageOpaque leafopaque = BTPageGetOpaque(leafpage);
+		OffsetNumber minoff = P_FIRSTDATAKEY(leafopaque);
+		OffsetNumber maxoff = PageGetMaxOffsetNumber(leafpage);
+
+		/* Check if leaf page has tuples that need to be merged */
+		if (minoff <= maxoff && P_ISLEAF(leafopaque) && P_ISHALFDEAD(leafopaque))
+		{
+			Page		rightpage = BufferGetPage(rbuf);
+			BTPageOpaque rightopaque = BTPageGetOpaque(rightpage);
+			Size		space_needed = 0;
+			OffsetNumber i;
+
+			merge_ntuples = maxoff - minoff + 1;
+
+			/* Calculate total space needed for all tuples */
+			for (i = minoff; i <= maxoff; i++)
+			{
+				itemid = PageGetItemId(leafpage, i);
+				space_needed += ItemIdGetLength(itemid) + sizeof(ItemIdData);
+			}
+
+			/* Verify that right sibling has enough space (should have been checked earlier) */
+			if (P_ISLEAF(rightopaque) && rightopaque->btpo_level == leafopaque->btpo_level &&
+				!P_ISDELETED(rightopaque) && !P_ISHALFDEAD(rightopaque) &&
+				PageGetFreeSpace(rightpage) >= space_needed)
+			{
+				elog(DEBUG1, "Performing page merge in index \"%s\": moving %d tuples from page %u to page %u",
+					 RelationGetRelationName(rel), merge_ntuples, leafblkno, rightsib);
+
+				/* Pre-allocate and copy all tuples outside critical section */
+				merge_tuples = palloc(sizeof(IndexTuple) * merge_ntuples);
+				merge_tupsz = palloc(sizeof(Size) * merge_ntuples);
+				merge_deletable = palloc(sizeof(OffsetNumber) * merge_ntuples);
+
+				for (i = minoff; i <= maxoff; i++)
+				{
+					itemid = PageGetItemId(leafpage, i);
+					tuple = (IndexTuple) PageGetItem(leafpage, itemid);
+					tupsz = ItemIdGetLength(itemid);
+					idx = i - minoff;
+
+					merge_tuples[idx] = palloc(tupsz);
+					memcpy(merge_tuples[idx], tuple, tupsz);
+					merge_tupsz[idx] = tupsz;
+					merge_deletable[idx] = i;
+				}
+
+				do_merge = true;
+			}
+			else
+			{
+				elog(DEBUG1, "Cannot perform page merge in index \"%s\": right sibling conditions not met",
+					 RelationGetRelationName(rel));
+			}
+		}
+	}
+
 	/* No ereport(ERROR) until changes are logged */
 	START_CRIT_SECTION();
 
@@ -2611,6 +2808,38 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	Assert(opaque->btpo_prev == target);
 	opaque->btpo_prev = leftsib;
 
+	/*
+	 * Perform page merge if needed. Move tuples from the leaf page to the right
+	 * sibling before marking the leaf page as deleted. This must be done within
+	 * the critical section to ensure atomicity with the page deletion.
+	 */
+	if (do_merge)
+	{
+		Page		leafpage = BufferGetPage(leafbuf);
+		Page		rightpage = BufferGetPage(rbuf);
+		BTPageOpaque rightopaque = BTPageGetOpaque(rightpage);
+		OffsetNumber insert_at = P_FIRSTDATAKEY(rightopaque);
+		int			i;
+
+		/* Move all saved tuples to right sibling */
+		for (i = 0; i < merge_ntuples; i++)
+		{
+			if (PageAddItem(rightpage, (Item) merge_tuples[i], merge_tupsz[i],
+							insert_at, false, false) == InvalidOffsetNumber)
+			{
+				elog(PANIC, "failed to move tuple during page merge in index \"%s\" - space should have been checked",
+					 RelationGetRelationName(rel));
+			}
+			insert_at++;
+		}
+
+		/* Clear all tuples from the leaf page using pre-allocated array */
+		PageIndexMultiDelete(leafpage, merge_deletable, merge_ntuples);
+
+		elog(DEBUG1, "Page merge completed in index \"%s\": moved %d tuples from page %u to page %u",
+			 RelationGetRelationName(rel), merge_ntuples, leafblkno, rightsib);
+	}
+
 	/*
 	 * If we deleted a parent of the targeted leaf page, instead of the leaf
 	 * itself, update the leaf to point to the next remaining child in the
@@ -2676,10 +2905,15 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 
 		XLogBeginInsert();
 
-		XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
+		/* Always use REGBUF_STANDARD for target page to allow reading during replay */
+		XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
 		if (BufferIsValid(lbuf))
 			XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
-		XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
+		/* Always use REGBUF_FORCE_IMAGE for right sibling if we're merging tuples */
+		if (merge_ntuples > 0)
+			XLogRegisterBuffer(2, rbuf, REGBUF_FORCE_IMAGE);
+		else
+			XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
 		if (target != leafblkno)
 			XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
 
@@ -2694,8 +2928,12 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 		xlrec.leafrightsib = leafrightsib;
 		xlrec.leaftopparent = leaftopparent;
 
+		/* Note: merge information is inferred during WAL replay by checking target page content */
+
 		XLogRegisterData(&xlrec, SizeOfBtreeUnlinkPage);
 
+		/* Note: merged tuple data is read directly from target page during WAL replay */
+
 		if (BufferIsValid(metabuf))
 		{
 			XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
@@ -2739,6 +2977,19 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 
 	END_CRIT_SECTION();
 
+	/* Clean up merge-related memory allocations */
+	if (do_merge && merge_tuples != NULL)
+	{
+		for (int i = 0; i < merge_ntuples; i++)
+		{
+			if (merge_tuples[i] != NULL)
+				pfree(merge_tuples[i]);
+		}
+		pfree(merge_tuples);
+		pfree(merge_tupsz);
+		pfree(merge_deletable);
+	}
+
 	/* release metapage */
 	if (BufferIsValid(metabuf))
 		_bt_relbuf(rel, metabuf);
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index fdff960c130..281467ff18f 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -1620,6 +1620,9 @@ backtrack:
 		 * since that would require teaching _bt_pagedel() about backtracking
 		 * (doesn't seem worth adding more complexity to deal with that).
 		 *
+		 * We also attempt page deletion if the page is mostly empty (by bytes).
+		 * This enables merging of nearly-empty pages to reduce bloat.
+		 *
 		 * We don't count the number of live TIDs during cleanup-only calls to
 		 * btvacuumscan (i.e. when callback is not set).  We count the number
 		 * of index tuples directly instead.  This avoids the expense of
@@ -1630,12 +1633,35 @@ backtrack:
 		 */
 		if (minoff > maxoff)
 			attempt_pagedel = (blkno == scanblkno);
-		else if (callback)
+		else if (blkno == scanblkno)
+		{
+			/* Check if page meets merge threshold by space usage */
+			Size		freespace = PageGetFreeSpace(page);
+			Size		pagesize = BLCKSZ - SizeOfPageHeaderData;
+			int			mergefactor = BTGetMergeFactor(rel);
+
+					/*
+		 * Only attempt page merge if there were no vacuum deletions
+		 * on this page. If there were deletions, the vacuum WAL record
+		 * was already written with specific offset numbers that would
+		 * become invalid if we merge tuples to another page.
+		 */
+		if (freespace >= (pagesize * mergefactor) / 100 && ndeletable == 0 && nupdatable == 0)
+			attempt_pagedel = true;
+		}
+
+		if (callback)
 			stats->num_index_tuples += nhtidslive;
 		else
 			stats->num_index_tuples += maxoff - minoff + 1;
 
-		Assert(!attempt_pagedel || nhtidslive == 0);
+		/*
+		 * Assert that either we're not attempting page deletion, or if we are,
+		 * it's either because the page is empty (nhtidslive == 0) or because
+		 * we're attempting a merge of a mostly empty page with tuples.
+		 */
+		Assert(!attempt_pagedel || nhtidslive == 0 ||
+			   (blkno == scanblkno && PageGetFreeSpace(page) >= ((BLCKSZ - SizeOfPageHeaderData) * BTGetMergeFactor(rel)) / 100));
 	}
 
 	if (attempt_pagedel)
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 9aed207995f..30b986f35be 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -3649,7 +3649,8 @@ btoptions(Datum reloptions, bool validate)
 		{"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
 		offsetof(BTOptions, vacuum_cleanup_index_scale_factor)},
 		{"deduplicate_items", RELOPT_TYPE_BOOL,
-		offsetof(BTOptions, deduplicate_items)}
+		offsetof(BTOptions, deduplicate_items)},
+		{"mergefactor", RELOPT_TYPE_INT, offsetof(BTOptions, mergefactor)}
 	};
 
 	return (bytea *) build_reloptions(reloptions, validate,
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index d31dd56732d..7ee05d122f8 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -813,6 +813,8 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
 	Buffer		rightbuf;
 	Page		page;
 	BTPageOpaque pageop;
+	IndexTuple *merge_tuples;
+	uint16		merge_ntuples;
 
 	leftsib = xlrec->leftsib;
 	rightsib = xlrec->rightsib;
@@ -823,6 +825,10 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
 	/* No leaftopparent for level 0 (leaf page) or level 1 target */
 	Assert(!BlockNumberIsValid(xlrec->leaftopparent) || level > 1);
 
+	/* Initialize merge variables */
+	merge_tuples = NULL;
+	merge_ntuples = 0;
+
 	/*
 	 * In normal operation, we would lock all the pages this WAL record
 	 * touches before changing any of them.  In WAL replay, we at least lock
@@ -847,12 +853,57 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
 	else
 		leftbuf = InvalidBuffer;
 
-	/* Rewrite target page as empty deleted page */
-	target = XLogInitBufferForRedo(record, 0);
-	page = (Page) BufferGetPage(target);
+	/*
+	 * Handle target page. For page merges, we first read existing content
+	 * to extract tuples, then reinitialize. For simple deletions, we can
+	 * initialize directly.
+	 */
+	if (XLogReadBufferForRedo(record, 0, &target) == BLK_NEEDS_REDO)
+	{
+		page = (Page) BufferGetPage(target);
+		pageop = BTPageGetOpaque(page);
 
-	_bt_pageinit(page, BufferGetPageSize(target));
-	pageop = BTPageGetOpaque(page);
+		/* Check if this is a leaf page merge case */
+		if (isleaf && rightsib != P_NONE && !P_RIGHTMOST(pageop))
+		{
+			OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
+			OffsetNumber minoff = P_FIRSTDATAKEY(pageop);
+
+			/* If page has tuples, save them for merging */
+			if (minoff <= maxoff)
+			{
+				OffsetNumber offnum;
+				uint16		i = 0;
+
+				merge_ntuples = maxoff - minoff + 1;
+				merge_tuples = (IndexTuple *) palloc(merge_ntuples * sizeof(IndexTuple));
+
+				/* Copy all tuples from the page */
+				for (offnum = minoff; offnum <= maxoff; offnum++)
+				{
+					ItemId		itemid = PageGetItemId(page, offnum);
+					IndexTuple	tuple = (IndexTuple) PageGetItem(page, itemid);
+					Size		tupsz = IndexTupleSize(tuple);
+
+					merge_tuples[i] = (IndexTuple) palloc(tupsz);
+					memcpy(merge_tuples[i], tuple, tupsz);
+					i++;
+				}
+			}
+		}
+
+		/* Now reinitialize target page as empty deleted page */
+		_bt_pageinit(page, BufferGetPageSize(target));
+		pageop = BTPageGetOpaque(page);
+	}
+	else
+	{
+		/* Page doesn't need redo, but we still need to get the buffer */
+		target = XLogInitBufferForRedo(record, 0);
+		page = (Page) BufferGetPage(target);
+		_bt_pageinit(page, BufferGetPageSize(target));
+		pageop = BTPageGetOpaque(page);
+	}
 
 	pageop->btpo_prev = leftsib;
 	pageop->btpo_next = rightsib;
@@ -865,17 +916,36 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
 	PageSetLSN(page, lsn);
 	MarkBufferDirty(target);
 
-	/* Fix left-link of right sibling */
+	/* Fix left-link of right sibling and replay merged tuples if any */
 	if (XLogReadBufferForRedo(record, 2, &rightbuf) == BLK_NEEDS_REDO)
 	{
 		page = (Page) BufferGetPage(rightbuf);
 		pageop = BTPageGetOpaque(page);
 		pageop->btpo_prev = leftsib;
 
+		/*
+		 * Note: If tuples were merged during the original operation, the right
+		 * sibling buffer was registered with REGBUF_FORCE_IMAGE, so the page
+		 * is automatically restored to its final post-merge state. No explicit
+		 * tuple insertion is needed during replay.
+		 *
+		 * For non-merge operations, we only need to update the left-link.
+		 */
+
 		PageSetLSN(page, lsn);
 		MarkBufferDirty(rightbuf);
 	}
 
+	/* Clean up merge tuples */
+	if (merge_tuples != NULL)
+	{
+		uint16		i;
+
+		for (i = 0; i < merge_ntuples; i++)
+			pfree(merge_tuples[i]);
+		pfree(merge_tuples);
+	}
+
 	/* Release siblings */
 	if (BufferIsValid(leftbuf))
 		UnlockReleaseBuffer(leftbuf);
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index e709d2e0afe..6fd1bcd142d 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -201,6 +201,7 @@ typedef struct BTMetaPageData
 #define BTREE_DEFAULT_FILLFACTOR	90
 #define BTREE_NONLEAF_FILLFACTOR	70
 #define BTREE_SINGLEVAL_FILLFACTOR	96
+#define BTREE_DEFAULT_MERGEFACTOR	5
 
 /*
  *	In general, the btree code tries to localize its knowledge about
@@ -1153,6 +1154,7 @@ typedef struct BTOptions
 	int			fillfactor;		/* page fill factor in percent (0..100) */
 	float8		vacuum_cleanup_index_scale_factor;	/* deprecated */
 	bool		deduplicate_items;	/* Try to deduplicate items? */
+	int			mergefactor;	/* page merge factor in percent (0..100) */
 } BTOptions;
 
 #define BTGetFillFactor(relation) \
@@ -1168,6 +1170,12 @@ typedef struct BTOptions
 				 relation->rd_rel->relam == BTREE_AM_OID), \
 	((relation)->rd_options ? \
 	 ((BTOptions *) (relation)->rd_options)->deduplicate_items : true))
+#define BTGetMergeFactor(relation) \
+	(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
+				 relation->rd_rel->relam == BTREE_AM_OID), \
+	 (relation)->rd_options ? \
+	 ((BTOptions *) (relation)->rd_options)->mergefactor : \
+	 BTREE_DEFAULT_MERGEFACTOR)
 
 /*
  * Constant definition for progress reporting.  Phase numbers must match
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index fbbe115c771..5ba5eb69b1d 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -325,7 +325,9 @@ typedef struct xl_btree_unlink_page
 	BlockNumber leafrightsib;
 	BlockNumber leaftopparent;	/* next child down in the subtree */
 
-	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
+	/*
+	 * xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META
+	 */
 } xl_btree_unlink_page;
 
 #define SizeOfBtreeUnlinkPage	(offsetof(xl_btree_unlink_page, leaftopparent) + sizeof(BlockNumber))
-- 
2.39.5 (Apple Git-154)

