From bfc1d3b51cd3a3cf21d454e4d73de10ccc8eefbf Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Tue, 27 Aug 2019 11:44:17 -0700
Subject: [PATCH v3 1/2] Use full 64-bit XID for nbtree page deletion.

Otherwise, after a deleted page gets even older, it becomes unrecyclable
again.  This is the nbtree equivalent of commit 6655a729, which did the
same thing within GiST.

Stop storing an XID that tracks the oldest safexid across all deleted
pages in an index.  There is no longer any point in doing this.  It only
ever made sense when btpo.xact fields could wrap around.

The old btm_oldest_btpo_xact metapage field has been repurposed in a way
that preserves on-disk compatibility for pg_upgrade.  Rename this uint32
field, and use it to store the number of deleted pages that we expect to
be able to recycle during the next btvacuumcleanup() that actually scans
the index.  This approach is a little unorthodox, but we were already
using btm_oldest_btpo_xact (now called btm_last_cleanup_num_delpages) in
approximately the same way, and in exactly the same place: inside the
_bt_vacuum_needs_cleanup() function.
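
As a minimal illustration of the on-disk compatibility argument, consider
the following sketch (hypothetical struct name, trimmed to one field --
not the real nbtree.h declaration).  The repurposed field is a plain
uint32 occupying the same metapage offset that the old TransactionId
field did, which is what makes the rename safe for pg_upgrade:

    #include <assert.h>
    #include <stdint.h>

    typedef uint32_t TransactionId;   /* same width as PostgreSQL's typedef */

    /* Trimmed-down metapage sketch: only the repurposed field is shown */
    typedef struct BTMetaPageDataSketch
    {
        /* was: TransactionId btm_oldest_btpo_xact */
        uint32_t    btm_last_cleanup_num_delpages;
    } BTMetaPageDataSketch;

    /* Same width at the same offset, so pg_upgrade'd metapages stay valid */
    static_assert(sizeof(uint32_t) == sizeof(TransactionId),
                  "on-disk compatibility assumption violated");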

The general assumption is that we ought to be able to recycle however
many pages btm_last_cleanup_num_delpages indicates once we decide to
scan the index during a btvacuumcleanup() call (which is
_bt_vacuum_needs_cleanup()'s decision to make).  Note that manually
issued VACUUMs won't be able to recycle btm_last_cleanup_num_delpages
pages (and _bt_vacuum_needs_cleanup() won't instruct btvacuumcleanup()
to skip scanning the index) unless at least one XID is consumed between
VACUUMs.
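
Concretely, the recycle-safety test now reduces to roughly the following
(a simplified sketch of the _bt_page_recyclable() logic from the hunk
below; the real function also handles new/all-zero pages):

    if (P_ISDELETED(opaque))
    {
        /*
         * A pg_upgrade'd page deleted before this patch lacks
         * BTP_HAS_FULLXID: safe to recycle, since the system must have
         * restarted since the deletion took place
         */
        if (!P_HAS_FULLXID(opaque))
            return true;

        /*
         * Full 64-bit safexid comparison -- cannot wrap around, so a
         * deleted page can never become unrecyclable again as it ages
         */
        return GlobalVisCheckRemovableFullXid(NULL, BTPageGetDeleteXid(page));
    }
    return false;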

Bump XLOG_PAGE_MAGIC due to WAL record switch over to full XIDs.
---
 src/include/access/nbtree.h           |  79 ++++++++--
 src/include/access/nbtxlog.h          |  28 ++--
 src/include/storage/standby.h         |   2 +
 src/backend/access/gist/gistxlog.c    |  24 +---
 src/backend/access/nbtree/nbtinsert.c |  25 ++--
 src/backend/access/nbtree/nbtpage.c   | 198 +++++++++++++++-----------
 src/backend/access/nbtree/nbtree.c    | 177 ++++++++++++-----------
 src/backend/access/nbtree/nbtsearch.c |   6 +-
 src/backend/access/nbtree/nbtsort.c   |   2 +-
 src/backend/access/nbtree/nbtxlog.c   |  39 +++--
 src/backend/access/rmgrdesc/nbtdesc.c |  17 ++-
 src/backend/storage/ipc/standby.c     |  28 ++++
 contrib/amcheck/verify_nbtree.c       |  81 +++++++----
 contrib/pageinspect/btreefuncs.c      |  65 ++++++---
 contrib/pgstattuple/pgstatindex.c     |   8 +-
 15 files changed, 485 insertions(+), 294 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index cad4f2bdeb..37d895371a 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -37,8 +37,9 @@ typedef uint16 BTCycleId;
  *
  *	In addition, we store the page's btree level (counting upwards from
  *	zero at a leaf page) as well as some flag bits indicating the page type
- *	and status.  If the page is deleted, we replace the level with the
- *	next-transaction-ID value indicating when it is safe to reclaim the page.
+ *	and status.  If the page is deleted, a BTDeletedPageContents struct is
+ *	stored in the page's tuple area, while a standard BTPageOpaqueData struct
+ *	is stored in the page special area.
  *
  *	We also store a "vacuum cycle ID".  When a page is split while VACUUM is
  *	processing the index, a nonzero value associated with the VACUUM run is
@@ -52,17 +53,17 @@ typedef uint16 BTCycleId;
  *
  *	NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
  *	instead.
+ *
+ *	NOTE: the btpo_level field used to be a union type in order to allow
+ *	deleted pages to store a 32-bit safexid in the same field.  We now store
+ *	64-bit/full safexid values using BTDeletedPageContents instead.
  */
 
 typedef struct BTPageOpaqueData
 {
 	BlockNumber btpo_prev;		/* left sibling, or P_NONE if leftmost */
 	BlockNumber btpo_next;		/* right sibling, or P_NONE if rightmost */
-	union
-	{
-		uint32		level;		/* tree level --- zero for leaf pages */
-		TransactionId xact;		/* next transaction ID, if deleted */
-	}			btpo;
+	uint32		btpo_level;		/* tree level --- zero for leaf pages */
 	uint16		btpo_flags;		/* flag bits, see below */
 	BTCycleId	btpo_cycleid;	/* vacuum cycle ID of latest split */
 } BTPageOpaqueData;
@@ -78,6 +79,7 @@ typedef BTPageOpaqueData *BTPageOpaque;
 #define BTP_SPLIT_END	(1 << 5)	/* rightmost page of split group */
 #define BTP_HAS_GARBAGE (1 << 6)	/* page has LP_DEAD tuples (deprecated) */
 #define BTP_INCOMPLETE_SPLIT (1 << 7)	/* right sibling's downlink is missing */
+#define BTP_HAS_FULLXID	(1 << 8)	/* contains BTDeletedPageContents */
 
 /*
  * The max allowed value of a cycle ID is a bit less than 64K.  This is
@@ -105,10 +107,12 @@ typedef struct BTMetaPageData
 	BlockNumber btm_fastroot;	/* current "fast" root location */
 	uint32		btm_fastlevel;	/* tree level of the "fast" root page */
 	/* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */
-	TransactionId btm_oldest_btpo_xact; /* oldest btpo_xact among all deleted
-										 * pages */
-	float8		btm_last_cleanup_num_heap_tuples;	/* number of heap tuples
-													 * during last cleanup */
+
+	/* number of deleted, non-recyclable pages during last cleanup */
+	uint32		btm_last_cleanup_num_delpages;
+	/* number of heap tuples during last cleanup */
+	float8		btm_last_cleanup_num_heap_tuples;
+
 	bool		btm_allequalimage;	/* are all columns "equalimage"? */
 } BTMetaPageData;
 
@@ -220,6 +224,53 @@ typedef struct BTMetaPageData
 #define P_IGNORE(opaque)		(((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
 #define P_HAS_GARBAGE(opaque)	(((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 #define P_INCOMPLETE_SPLIT(opaque)	(((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
+#define P_HAS_FULLXID(opaque)	(((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
+
+/*
+ * BTDeletedPageContents is the page contents of a deleted page
+ */
+typedef struct BTDeletedPageContents
+{
+	/* last xid which might land on the page and get confused */
+	FullTransactionId safexid;
+} BTDeletedPageContents;
+
+static inline void
+BTPageSetDeleted(Page page, FullTransactionId safexid)
+{
+	BTPageOpaque opaque;
+	PageHeader	header;
+	BTDeletedPageContents *contents;
+
+	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	header = ((PageHeader) page);
+
+	opaque->btpo_flags &= ~BTP_HALF_DEAD;
+	opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID;
+	header->pd_lower =
+		MAXALIGN(SizeOfPageHeaderData) + sizeof(BTDeletedPageContents);
+	header->pd_upper = header->pd_special;
+
+	/* Set safexid in deleted page */
+	contents = ((BTDeletedPageContents *) PageGetContents(page));
+	contents->safexid = safexid;
+}
+
+static inline FullTransactionId
+BTPageGetDeleteXid(Page page)
+{
+	BTPageOpaque opaque PG_USED_FOR_ASSERTS_ONLY;
+	BTDeletedPageContents *contents;
+
+	/* pg_upgrade'd indexes with old BTP_DELETED pages should not call here */
+	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	Assert(P_ISDELETED(opaque) && !P_ISHALFDEAD(opaque) &&
+		   P_HAS_FULLXID(opaque));
+
+	/* Get safexid from deleted page */
+	contents = ((BTDeletedPageContents *) PageGetContents(page));
+	return contents->safexid;
+}
 
 /*
  *	Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
@@ -1067,7 +1118,8 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page origpage,
 extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
 							 bool allequalimage);
 extern void _bt_update_meta_cleanup_info(Relation rel,
-										 TransactionId oldestBtpoXact, float8 numHeapTuples);
+										 BlockNumber pages_deleted_not_free,
+										 float8 num_heap_tuples);
 extern void _bt_upgrademetapage(Page page);
 extern Buffer _bt_getroot(Relation rel, int access);
 extern Buffer _bt_gettrueroot(Relation rel);
@@ -1091,8 +1143,7 @@ extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
 extern void _bt_delitems_delete_check(Relation rel, Buffer buf,
 									  Relation heapRel,
 									  TM_IndexDeleteOp *delstate);
-extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf,
-						  TransactionId *oldestBtpoXact);
+extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf);
 
 /*
  * prototypes for functions in nbtsearch.c
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index 7ae5c98c2b..0cec80e5fa 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -13,6 +13,7 @@
 #ifndef NBTXLOG_H
 #define NBTXLOG_H
 
+#include "access/transam.h"
 #include "access/xlogreader.h"
 #include "lib/stringinfo.h"
 #include "storage/off.h"
@@ -52,7 +53,7 @@ typedef struct xl_btree_metadata
 	uint32		level;
 	BlockNumber fastroot;
 	uint32		fastlevel;
-	TransactionId oldest_btpo_xact;
+	uint32		last_cleanup_num_delpages;
 	float8		last_cleanup_num_heap_tuples;
 	bool		allequalimage;
 } xl_btree_metadata;
@@ -187,7 +188,7 @@ typedef struct xl_btree_reuse_page
 {
 	RelFileNode node;
 	BlockNumber block;
-	TransactionId latestRemovedXid;
+	FullTransactionId latestRemovedFullXid;
 } xl_btree_reuse_page;
 
 #define SizeOfBtreeReusePage	(sizeof(xl_btree_reuse_page))
@@ -282,9 +283,12 @@ typedef struct xl_btree_mark_page_halfdead
 #define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))
 
 /*
- * This is what we need to know about deletion of a btree page.  Note we do
- * not store any content for the deleted page --- it is just rewritten as empty
- * during recovery, apart from resetting the btpo.xact.
+ * This is what we need to know about deletion of a btree page.  Note that we
+ * only leave behind a small amount of bookkeeping information in deleted
+ * pages (deleted pages must be kept around as tombstones for a while).  It is
+ * convenient for the REDO routine to regenerate its target page from scratch.
+ * This is why the WAL record describes certain details that are directly
+ * available from the target page.
  *
  * Backup Blk 0: target block being deleted
  * Backup Blk 1: target block's left sibling, if any
@@ -296,20 +300,24 @@ typedef struct xl_btree_unlink_page
 {
 	BlockNumber leftsib;		/* target block's left sibling, if any */
 	BlockNumber rightsib;		/* target block's right sibling */
+	uint32		level;			/* target block's level */
+	FullTransactionId safexid;	/* target block's BTPageSetDeleted() value */
 
 	/*
-	 * Information needed to recreate the leaf page, when target is an
-	 * internal page.
+	 * Information needed to recreate a half-dead leaf page with a correct
+	 * topparent link.  These fields are only used when the deletion
+	 * operation's target page is an internal page.  The REDO routine
+	 * creates the half-dead page from scratch to keep things simple (the
+	 * same convenient approach used for the target page itself).
 	 */
 	BlockNumber leafleftsib;
 	BlockNumber leafrightsib;
-	BlockNumber topparent;		/* next child down in the subtree */
+	BlockNumber topparent;
 
-	TransactionId btpo_xact;	/* value of btpo.xact for use in recovery */
 	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
 } xl_btree_unlink_page;
 
-#define SizeOfBtreeUnlinkPage	(offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId))
+#define SizeOfBtreeUnlinkPage	(offsetof(xl_btree_unlink_page, topparent) + sizeof(BlockNumber))
 
 /*
  * New root log record.  There are zero tuples if this is to establish an
diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h
index 94d33851d0..38fd85a431 100644
--- a/src/include/storage/standby.h
+++ b/src/include/storage/standby.h
@@ -31,6 +31,8 @@ extern void ShutdownRecoveryTransactionEnvironment(void);
 
 extern void ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid,
 												RelFileNode node);
+extern void ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid,
+													   RelFileNode node);
 extern void ResolveRecoveryConflictWithTablespace(Oid tsid);
 extern void ResolveRecoveryConflictWithDatabase(Oid dbid);
 
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index f2eda79bc1..1c80eae044 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -394,28 +394,8 @@ gistRedoPageReuse(XLogReaderState *record)
 	 * same exclusion effect on primary and standby.
 	 */
 	if (InHotStandby)
-	{
-		FullTransactionId latestRemovedFullXid = xlrec->latestRemovedFullXid;
-		FullTransactionId nextXid = ReadNextFullTransactionId();
-		uint64		diff;
-
-		/*
-		 * ResolveRecoveryConflictWithSnapshot operates on 32-bit
-		 * TransactionIds, so truncate the logged FullTransactionId. If the
-		 * logged value is very old, so that XID wrap-around already happened
-		 * on it, there can't be any snapshots that still see it.
-		 */
-		diff = U64FromFullTransactionId(nextXid) -
-			U64FromFullTransactionId(latestRemovedFullXid);
-		if (diff < MaxTransactionId / 2)
-		{
-			TransactionId latestRemovedXid;
-
-			latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid);
-			ResolveRecoveryConflictWithSnapshot(latestRemovedXid,
-												xlrec->node);
-		}
-	}
+		ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid,
+												   xlrec->node);
 }
 
 void
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index e333603912..1f74181f8d 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -326,6 +326,7 @@ _bt_search_insert(Relation rel, BTInsertState insertstate)
 			Page		page;
 			BTPageOpaque opaque;
 
+			/* Deliberately omit "!_bt_page_recyclable(page)" Assert() */
 			_bt_checkpage(rel, insertstate->buf);
 			page = BufferGetPage(insertstate->buf);
 			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -1241,7 +1242,7 @@ _bt_insertonpg(Relation rel,
 			metapg = BufferGetPage(metabuf);
 			metad = BTPageGetMeta(metapg);
 
-			if (metad->btm_fastlevel >= opaque->btpo.level)
+			if (metad->btm_fastlevel >= opaque->btpo_level)
 			{
 				/* no update wanted */
 				_bt_relbuf(rel, metabuf);
@@ -1268,7 +1269,7 @@ _bt_insertonpg(Relation rel,
 			if (metad->btm_version < BTREE_NOVAC_VERSION)
 				_bt_upgrademetapage(metapg);
 			metad->btm_fastroot = BufferGetBlockNumber(buf);
-			metad->btm_fastlevel = opaque->btpo.level;
+			metad->btm_fastlevel = opaque->btpo_level;
 			MarkBufferDirty(metabuf);
 		}
 
@@ -1331,7 +1332,7 @@ _bt_insertonpg(Relation rel,
 					xlmeta.level = metad->btm_level;
 					xlmeta.fastroot = metad->btm_fastroot;
 					xlmeta.fastlevel = metad->btm_fastlevel;
-					xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
+					xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
 					xlmeta.last_cleanup_num_heap_tuples =
 						metad->btm_last_cleanup_num_heap_tuples;
 					xlmeta.allequalimage = metad->btm_allequalimage;
@@ -1537,7 +1538,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
 	lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
 	lopaque->btpo_prev = oopaque->btpo_prev;
 	/* handle btpo_next after rightpage buffer acquired */
-	lopaque->btpo.level = oopaque->btpo.level;
+	lopaque->btpo_level = oopaque->btpo_level;
 	/* handle btpo_cycleid after rightpage buffer acquired */
 
 	/*
@@ -1722,7 +1723,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
 	ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
 	ropaque->btpo_prev = origpagenumber;
 	ropaque->btpo_next = oopaque->btpo_next;
-	ropaque->btpo.level = oopaque->btpo.level;
+	ropaque->btpo_level = oopaque->btpo_level;
 	ropaque->btpo_cycleid = lopaque->btpo_cycleid;
 
 	/*
@@ -1950,7 +1951,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
 		uint8		xlinfo;
 		XLogRecPtr	recptr;
 
-		xlrec.level = ropaque->btpo.level;
+		xlrec.level = ropaque->btpo_level;
 		/* See comments below on newitem, orignewitem, and posting lists */
 		xlrec.firstrightoff = firstrightoff;
 		xlrec.newitemoff = newitemoff;
@@ -2142,7 +2143,7 @@ _bt_insert_parent(Relation rel,
 					 BlockNumberIsValid(RelationGetTargetBlock(rel))));
 
 			/* Find the leftmost page at the next level up */
-			pbuf = _bt_get_endpoint(rel, opaque->btpo.level + 1, false, NULL);
+			pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL);
 			/* Set up a phony stack entry pointing there */
 			stack = &fakestack;
 			stack->bts_blkno = BufferGetBlockNumber(pbuf);
@@ -2480,15 +2481,15 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
 	rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
 	rootopaque->btpo_flags = BTP_ROOT;
-	rootopaque->btpo.level =
-		((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
+	rootopaque->btpo_level =
+		((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo_level + 1;
 	rootopaque->btpo_cycleid = 0;
 
 	/* update metapage data */
 	metad->btm_root = rootblknum;
-	metad->btm_level = rootopaque->btpo.level;
+	metad->btm_level = rootopaque->btpo_level;
 	metad->btm_fastroot = rootblknum;
-	metad->btm_fastlevel = rootopaque->btpo.level;
+	metad->btm_fastlevel = rootopaque->btpo_level;
 
 	/*
 	 * Insert the left page pointer into the new root page.  The root page is
@@ -2548,7 +2549,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 		md.level = metad->btm_level;
 		md.fastroot = rootblknum;
 		md.fastlevel = metad->btm_level;
-		md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
+		md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
 		md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
 		md.allequalimage = metad->btm_allequalimage;
 
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index ac264a5952..f748e72539 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -37,7 +37,7 @@
 
 static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
 static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
-							   TransactionId latestRemovedXid);
+							   FullTransactionId latestRemovedFullXid);
 static void _bt_delitems_delete(Relation rel, Buffer buf,
 								TransactionId latestRemovedXid,
 								OffsetNumber *deletable, int ndeletable,
@@ -50,7 +50,6 @@ static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
 static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
 									 BlockNumber scanblkno,
 									 bool *rightsib_empty,
-									 TransactionId *oldestBtpoXact,
 									 uint32 *ndeleted);
 static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child,
 									BTStack stack,
@@ -78,7 +77,7 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
 	metad->btm_level = level;
 	metad->btm_fastroot = rootbknum;
 	metad->btm_fastlevel = level;
-	metad->btm_oldest_btpo_xact = InvalidTransactionId;
+	metad->btm_last_cleanup_num_delpages = 0;
 	metad->btm_last_cleanup_num_heap_tuples = -1.0;
 	metad->btm_allequalimage = allequalimage;
 
@@ -118,7 +117,7 @@ _bt_upgrademetapage(Page page)
 
 	/* Set version number and fill extra fields added into version 3 */
 	metad->btm_version = BTREE_NOVAC_VERSION;
-	metad->btm_oldest_btpo_xact = InvalidTransactionId;
+	metad->btm_last_cleanup_num_delpages = 0;
 	metad->btm_last_cleanup_num_heap_tuples = -1.0;
 	/* Only a REINDEX can set this field */
 	Assert(!metad->btm_allequalimage);
@@ -176,8 +175,9 @@ _bt_getmeta(Relation rel, Buffer metabuf)
  *		to those written in the metapage.  On mismatch, metapage is overwritten.
  */
 void
-_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
-							 float8 numHeapTuples)
+_bt_update_meta_cleanup_info(Relation rel,
+							 BlockNumber pages_deleted_not_free,
+							 float8 num_heap_tuples)
 {
 	Buffer		metabuf;
 	Page		metapg;
@@ -190,11 +190,38 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
 	metapg = BufferGetPage(metabuf);
 	metad = BTPageGetMeta(metapg);
 
+	/*
+	 * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
+	 * field started out as a TransactionId field called btm_oldest_btpo_xact.
+	 * Both "versions" are just uint32 fields.  _bt_vacuum_needs_cleanup() has
+	 * used both versions to decide when cleanup-only VACUUMs need to call
+	 * btvacuumscan.  It was convenient to repurpose the field when we began
+	 * to use 64-bit XIDs in deleted pages.
+	 *
+	 * It's possible that a pg_upgrade'd database will contain an XID value in
+	 * what is now recognized as the metapage's btm_last_cleanup_num_delpages
+	 * field.  _bt_vacuum_needs_cleanup() may even believe that this value
+	 * indicates that there are lots of pages that it needs to recycle, when
+	 * in reality there are only one or two.  The worst that can happen is
+	 * that there will be a call to btvacuumscan a little earlier, which will
+	 * end up here -- at which point btm_last_cleanup_num_delpages will
+	 * contain a sane value.
+	 *
+	 * (Besides, this should only happen when there really are some pages that
+	 * we will be able to recycle.  If there are none at all then the metapage
+	 * XID value will be InvalidTransactionId, which is 0 --- we'll manage to
+	 * completely avoid even the minor annoyance of an early btvacuumscan.)
+	 */
+	StaticAssertStmt(sizeof(metad->btm_last_cleanup_num_delpages) ==
+					 sizeof(TransactionId),
+					 "on-disk compatibility assumption violated");
+
 	/* outdated version of metapage always needs rewrite */
 	if (metad->btm_version < BTREE_NOVAC_VERSION)
 		needsRewrite = true;
-	else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
-			 metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
+	else if (metad->btm_last_cleanup_num_delpages != pages_deleted_not_free)
+		needsRewrite = true;
+	else if (metad->btm_last_cleanup_num_heap_tuples != num_heap_tuples)
 		needsRewrite = true;
 
 	if (!needsRewrite)
@@ -214,8 +241,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
 		_bt_upgrademetapage(metapg);
 
 	/* update cleanup-related information */
-	metad->btm_oldest_btpo_xact = oldestBtpoXact;
-	metad->btm_last_cleanup_num_heap_tuples = numHeapTuples;
+	metad->btm_last_cleanup_num_delpages = pages_deleted_not_free;
+	metad->btm_last_cleanup_num_heap_tuples = num_heap_tuples;
 	MarkBufferDirty(metabuf);
 
 	/* write wal record if needed */
@@ -232,8 +259,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
 		md.level = metad->btm_level;
 		md.fastroot = metad->btm_fastroot;
 		md.fastlevel = metad->btm_fastlevel;
-		md.oldest_btpo_xact = oldestBtpoXact;
-		md.last_cleanup_num_heap_tuples = numHeapTuples;
+		md.last_cleanup_num_delpages = pages_deleted_not_free;
+		md.last_cleanup_num_heap_tuples = num_heap_tuples;
 		md.allequalimage = metad->btm_allequalimage;
 
 		XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
@@ -316,7 +343,7 @@ _bt_getroot(Relation rel, int access)
 		 * because that's not set in a "fast root".
 		 */
 		if (!P_IGNORE(rootopaque) &&
-			rootopaque->btpo.level == rootlevel &&
+			rootopaque->btpo_level == rootlevel &&
 			P_LEFTMOST(rootopaque) &&
 			P_RIGHTMOST(rootopaque))
 		{
@@ -377,7 +404,7 @@ _bt_getroot(Relation rel, int access)
 		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
 		rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
 		rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
-		rootopaque->btpo.level = 0;
+		rootopaque->btpo_level = 0;
 		rootopaque->btpo_cycleid = 0;
 		/* Get raw page pointer for metapage */
 		metapg = BufferGetPage(metabuf);
@@ -393,7 +420,7 @@ _bt_getroot(Relation rel, int access)
 		metad->btm_level = 0;
 		metad->btm_fastroot = rootblkno;
 		metad->btm_fastlevel = 0;
-		metad->btm_oldest_btpo_xact = InvalidTransactionId;
+		metad->btm_last_cleanup_num_delpages = 0;
 		metad->btm_last_cleanup_num_heap_tuples = -1.0;
 
 		MarkBufferDirty(rootbuf);
@@ -416,7 +443,7 @@ _bt_getroot(Relation rel, int access)
 			md.level = 0;
 			md.fastroot = rootblkno;
 			md.fastlevel = 0;
-			md.oldest_btpo_xact = InvalidTransactionId;
+			md.last_cleanup_num_delpages = 0;
 			md.last_cleanup_num_heap_tuples = -1.0;
 			md.allequalimage = metad->btm_allequalimage;
 
@@ -481,11 +508,10 @@ _bt_getroot(Relation rel, int access)
 			rootblkno = rootopaque->btpo_next;
 		}
 
-		/* Note: can't check btpo.level on deleted pages */
-		if (rootopaque->btpo.level != rootlevel)
+		if (rootopaque->btpo_level != rootlevel)
 			elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
 				 rootblkno, RelationGetRelationName(rel),
-				 rootopaque->btpo.level, rootlevel);
+				 rootopaque->btpo_level, rootlevel);
 	}
 
 	/*
@@ -585,11 +611,10 @@ _bt_gettrueroot(Relation rel)
 		rootblkno = rootopaque->btpo_next;
 	}
 
-	/* Note: can't check btpo.level on deleted pages */
-	if (rootopaque->btpo.level != rootlevel)
+	if (rootopaque->btpo_level != rootlevel)
 		elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
 			 rootblkno, RelationGetRelationName(rel),
-			 rootopaque->btpo.level, rootlevel);
+			 rootopaque->btpo_level, rootlevel);
 
 	return rootbuf;
 }
@@ -762,7 +787,8 @@ _bt_checkpage(Relation rel, Buffer buf)
  * Log the reuse of a page from the FSM.
  */
 static void
-_bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
+_bt_log_reuse_page(Relation rel, BlockNumber blkno,
+				   FullTransactionId latestRemovedFullXid)
 {
 	xl_btree_reuse_page xlrec_reuse;
 
@@ -775,7 +801,7 @@ _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedX
 	/* XLOG stuff */
 	xlrec_reuse.node = rel->rd_node;
 	xlrec_reuse.block = blkno;
-	xlrec_reuse.latestRemovedXid = latestRemovedXid;
+	xlrec_reuse.latestRemovedFullXid = latestRemovedFullXid;
 
 	XLogBeginInsert();
 	XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
@@ -815,6 +841,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 		buf = ReadBuffer(rel, blkno);
 		_bt_lockbuf(rel, buf, access);
 		_bt_checkpage(rel, buf);
+		Assert(!_bt_page_recyclable(BufferGetPage(buf)));
 	}
 	else
 	{
@@ -862,17 +889,18 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 					 * If we are generating WAL for Hot Standby then create a
 					 * WAL record that will allow us to conflict with queries
 					 * running on standby, in case they have snapshots older
-					 * than btpo.xact.  This can only apply if the page does
-					 * have a valid btpo.xact value, ie not if it's new.  (We
-					 * must check that because an all-zero page has no special
-					 * space.)
+					 * than the safexid value returned by BTPageGetDeleteXid().
+					 * This can only apply if the page does have a valid
+					 * safexid value, ie not if it's new.  (We must check that
+					 * because an all-zero page has no special space.)
 					 */
 					if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) &&
 						!PageIsNew(page))
 					{
-						BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+						FullTransactionId latestRemovedFullXid;
 
-						_bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
+						latestRemovedFullXid = BTPageGetDeleteXid(page);
+						_bt_log_reuse_page(rel, blkno, latestRemovedFullXid);
 					}
 
 					/* Okay to use page.  Re-initialize and return it */
@@ -952,6 +980,7 @@ _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
 	_bt_lockbuf(rel, buf, access);
 
 	_bt_checkpage(rel, buf);
+	Assert(!_bt_page_recyclable(BufferGetPage(buf)));
 	return buf;
 }
 
@@ -1101,9 +1130,31 @@ _bt_page_recyclable(Page page)
 	 * interested in it.
 	 */
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-	if (P_ISDELETED(opaque) &&
-		GlobalVisCheckRemovableXid(NULL, opaque->btpo.xact))
-		return true;
+	if (P_ISDELETED(opaque))
+	{
+		/*
+		 * If this is a pg_upgrade'd index, then this could be a deleted page
+		 * whose XID (which is stored in the special area's level field via
+		 * type punning) is a non-full 32-bit value.  It's safe to just
+		 * assume that we can recycle the page, because the system must have
+		 * been restarted since the time of deletion.
+		 */
+		if (!P_HAS_FULLXID(opaque))
+			return true;
+
+		/*
+		 * The page was deleted, but when? If it was just deleted, a scan
+		 * might have seen the downlink to it, and will read the page later.
+		 * As long as that can happen, we must keep the deleted page around as
+		 * a tombstone.
+		 *
+		 * For that, check whether the deletion XID could still be visible to
+		 * anyone. If not, then no scan that's still in progress could have
+		 * seen its downlink, and we can recycle it.
+		 */
+		return GlobalVisCheckRemovableFullXid(NULL, BTPageGetDeleteXid(page));
+	}
+
 	return false;
 }
 
@@ -1768,16 +1819,12 @@ _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
  * that the btvacuumscan scan has yet to reach; they'll get counted later
  * instead.
  *
- * Maintains *oldestBtpoXact for any pages that get deleted.  Caller is
- * responsible for maintaining *oldestBtpoXact in the case of pages that were
- * deleted by a previous VACUUM.
- *
  * NOTE: this leaks memory.  Rather than trying to clean up everything
  * carefully, it's better to run it in a temp context that can be reset
  * frequently.
  */
 uint32
-_bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
+_bt_pagedel(Relation rel, Buffer leafbuf)
 {
 	uint32		ndeleted = 0;
 	BlockNumber rightsib;
@@ -1985,8 +2032,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
 		{
 			/* Check for interrupts in _bt_unlink_halfdead_page */
 			if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
-										  &rightsib_empty, oldestBtpoXact,
-										  &ndeleted))
+										  &rightsib_empty, &ndeleted))
 			{
 				/*
 				 * _bt_unlink_halfdead_page should never fail, since we
@@ -2001,9 +2047,8 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
 			}
 		}
 
-		Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
-		Assert(TransactionIdFollowsOrEquals(opaque->btpo.xact,
-											*oldestBtpoXact));
+		Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque) &&
+			   P_HAS_FULLXID(opaque));
 
 		rightsib = opaque->btpo_next;
 
@@ -2264,12 +2309,6 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
  * containing leafbuf.  (We always set *rightsib_empty for caller, just to be
  * consistent.)
  *
- * We maintain *oldestBtpoXact for pages that are deleted by the current
- * VACUUM operation here.  This must be handled here because we conservatively
- * assume that there needs to be a new call to ReadNewTransactionId() each
- * time a page gets deleted.  See comments about the underlying assumption
- * below.
- *
  * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
  * On success exit, we'll be holding pin and write lock.  On failure exit,
  * we'll release both pin and lock before returning (we define it that way
@@ -2277,8 +2316,7 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
  */
 static bool
 _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
-						 bool *rightsib_empty, TransactionId *oldestBtpoXact,
-						 uint32 *ndeleted)
+						 bool *rightsib_empty, uint32 *ndeleted)
 {
 	BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
 	BlockNumber leafleftsib;
@@ -2294,12 +2332,12 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	BTMetaPageData *metad = NULL;
 	ItemId		itemid;
 	Page		page;
-	PageHeader	header;
 	BTPageOpaque opaque;
+	FullTransactionId safexid;
 	bool		rightsib_is_rightmost;
-	int			targetlevel;
+	uint32		targetlevel;
 	IndexTuple	leafhikey;
-	BlockNumber nextchild;
+	BlockNumber topparent_in_target;
 
 	page = BufferGetPage(leafbuf);
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -2343,7 +2381,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 		page = BufferGetPage(buf);
 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 		leftsib = opaque->btpo_prev;
-		targetlevel = opaque->btpo.level;
+		targetlevel = opaque->btpo_level;
 		Assert(targetlevel > 0);
 
 		/*
@@ -2450,7 +2488,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 			!P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
 			elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
 				 target, RelationGetRelationName(rel));
-		nextchild = InvalidBlockNumber;
+
+		/* Leaf page is also target page: don't set topparent */
+		topparent_in_target = InvalidBlockNumber;
 	}
 	else
 	{
@@ -2459,11 +2499,13 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 			elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
 				 target, RelationGetRelationName(rel));
 
-		/* Remember the next non-leaf child down in the subtree */
+		/* Internal page is target: we'll set topparent in leaf page... */
 		itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
-		nextchild = BTreeTupleGetDownLink((IndexTuple) PageGetItem(page, itemid));
-		if (nextchild == leafblkno)
-			nextchild = InvalidBlockNumber;
+		topparent_in_target =
+			BTreeTupleGetTopParent((IndexTuple) PageGetItem(page, itemid));
+		/* ...except when it would be a redundant pointer-to-self */
+		if (topparent_in_target == leafblkno)
+			topparent_in_target = InvalidBlockNumber;
 	}
 
 	/*
@@ -2553,13 +2595,13 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	 * no lock was held.
 	 */
 	if (target != leafblkno)
-		BTreeTupleSetTopParent(leafhikey, nextchild);
+		BTreeTupleSetTopParent(leafhikey, topparent_in_target);
 
 	/*
 	 * Mark the page itself deleted.  It can be recycled when all current
 	 * transactions are gone.  Storing GetTopTransactionId() would work, but
 	 * we're in VACUUM and would not otherwise have an XID.  Having already
-	 * updated links to the target, ReadNewTransactionId() suffices as an
+	 * updated links to the target, ReadNextFullTransactionId() suffices as an
 	 * upper bound.  Any scan having retained a now-stale link is advertising
 	 * in its PGPROC an xmin less than or equal to the value we read here.  It
 	 * will continue to do so, holding back the xmin horizon, for the duration
@@ -2568,17 +2610,14 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	page = BufferGetPage(buf);
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 	Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
-	opaque->btpo_flags &= ~BTP_HALF_DEAD;
-	opaque->btpo_flags |= BTP_DELETED;
-	opaque->btpo.xact = ReadNewTransactionId();
 
 	/*
-	 * Remove the remaining tuples on the page.  This keeps things simple for
-	 * WAL consistency checking.
+	 * Store the upper bound XID that's used to determine when the deleted
+	 * page is no longer needed as a tombstone
 	 */
-	header = (PageHeader) page;
-	header->pd_lower = SizeOfPageHeaderData;
-	header->pd_upper = header->pd_special;
+	safexid = ReadNextFullTransactionId();
+	BTPageSetDeleted(page, safexid);
+	opaque->btpo_cycleid = 0;
 
 	/* And update the metapage, if needed */
 	if (BufferIsValid(metabuf))
@@ -2616,15 +2655,16 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 		if (target != leafblkno)
 			XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
 
-		/* information on the unlinked block */
+		/* information stored on the target/to-be-unlinked block */
 		xlrec.leftsib = leftsib;
 		xlrec.rightsib = rightsib;
-		xlrec.btpo_xact = opaque->btpo.xact;
+		xlrec.level = targetlevel;
+		xlrec.safexid = safexid;
 
 		/* information needed to recreate the leaf block (if not the target) */
 		xlrec.leafleftsib = leafleftsib;
 		xlrec.leafrightsib = leafrightsib;
-		xlrec.topparent = nextchild;
+		xlrec.topparent = topparent_in_target;
 
 		XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
 
@@ -2638,7 +2678,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 			xlmeta.level = metad->btm_level;
 			xlmeta.fastroot = metad->btm_fastroot;
 			xlmeta.fastlevel = metad->btm_fastlevel;
-			xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
+			xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
 			xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
 			xlmeta.allequalimage = metad->btm_allequalimage;
 
@@ -2681,9 +2721,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 		_bt_relbuf(rel, lbuf);
 	_bt_relbuf(rel, rbuf);
 
-	if (!TransactionIdIsValid(*oldestBtpoXact) ||
-		TransactionIdPrecedes(opaque->btpo.xact, *oldestBtpoXact))
-		*oldestBtpoXact = opaque->btpo.xact;
+	/* If the target is not leafbuf, we're done with it now -- release it */
+	if (target != leafblkno)
+		_bt_relbuf(rel, buf);
 
 	/*
 	 * If btvacuumscan won't revisit this page in a future btvacuumpage call
@@ -2693,10 +2733,6 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	if (target <= scanblkno)
 		(*ndeleted)++;
 
-	/* If the target is not leafbuf, we're done with it now -- release it */
-	if (target != leafblkno)
-		_bt_relbuf(rel, buf);
-
 	return true;
 }
 
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 289bd3c15d..12ad877b70 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -46,8 +46,6 @@ typedef struct
 	IndexBulkDeleteCallback callback;
 	void	   *callback_state;
 	BTCycleId	cycleid;
-	BlockNumber totFreePages;	/* true total # of free pages */
-	TransactionId oldestBtpoXact;
 	MemoryContext pagedelcontext;
 } BTVacState;
 
@@ -802,66 +800,87 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
 	Buffer		metabuf;
 	Page		metapg;
 	BTMetaPageData *metad;
-	bool		result = false;
+	BTOptions  *relopts;
+	float8		cleanup_scale_factor;
+	uint32		btm_version;
+	BlockNumber prev_pages_deleted_not_free;
+	float8		prev_num_heap_tuples;
 
+	/*
+	 * Copy details from metapage to local variables quickly.
+	 *
+	 * Note that we deliberately avoid using the cached metapage here.
+	 */
 	metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
 	metapg = BufferGetPage(metabuf);
 	metad = BTPageGetMeta(metapg);
+	btm_version = metad->btm_version;
+
+	if (btm_version < BTREE_NOVAC_VERSION)
+	{
+		/*
+		 * Metapage needs to be dynamically upgraded to store fields that are
+		 * only present when btm_version >= BTREE_NOVAC_VERSION
+		 */
+		_bt_relbuf(info->index, metabuf);
+		return true;
+	}
+
+	prev_pages_deleted_not_free = metad->btm_last_cleanup_num_delpages;
+	prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
+	_bt_relbuf(info->index, metabuf);
 
 	/*
-	 * XXX: If IndexVacuumInfo contained the heap relation, we could be more
-	 * aggressive about vacuuming non catalog relations by passing the table
-	 * to GlobalVisCheckRemovableXid().
+	 * If the table receives enough insertions and no cleanup is performed,
+	 * the index would appear to have stale statistics.  If a scale factor is
+	 * set, we avoid that by performing cleanup if the number of inserted
+	 * tuples exceeds the vacuum_cleanup_index_scale_factor fraction of the
+	 * original tuple count.
 	 */
+	relopts = (BTOptions *) info->index->rd_options;
+	cleanup_scale_factor = (relopts &&
+							relopts->vacuum_cleanup_index_scale_factor >= 0)
+		? relopts->vacuum_cleanup_index_scale_factor
+		: vacuum_cleanup_index_scale_factor;
 
-	if (metad->btm_version < BTREE_NOVAC_VERSION)
-	{
-		/*
-		 * Do cleanup if metapage needs upgrade, because we don't have
-		 * cleanup-related meta-information yet.
-		 */
-		result = true;
-	}
-	else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
-			 GlobalVisCheckRemovableXid(NULL, metad->btm_oldest_btpo_xact))
-	{
-		/*
-		 * If any oldest btpo.xact from a previously deleted page in the index
-		 * is visible to everyone, then at least one deleted page can be
-		 * recycled -- don't skip cleanup.
-		 */
-		result = true;
-	}
-	else
-	{
-		BTOptions  *relopts;
-		float8		cleanup_scale_factor;
-		float8		prev_num_heap_tuples;
+	if (cleanup_scale_factor <= 0 ||
+		info->num_heap_tuples < 0 ||
+		prev_num_heap_tuples <= 0 ||
+		(info->num_heap_tuples - prev_num_heap_tuples) /
+		prev_num_heap_tuples >= cleanup_scale_factor)
+		return true;
 
-		/*
-		 * If table receives enough insertions and no cleanup was performed,
-		 * then index would appear have stale statistics.  If scale factor is
-		 * set, we avoid that by performing cleanup if the number of inserted
-		 * tuples exceeds vacuum_cleanup_index_scale_factor fraction of
-		 * original tuples count.
-		 */
-		relopts = (BTOptions *) info->index->rd_options;
-		cleanup_scale_factor = (relopts &&
-								relopts->vacuum_cleanup_index_scale_factor >= 0)
-			? relopts->vacuum_cleanup_index_scale_factor
-			: vacuum_cleanup_index_scale_factor;
-		prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
+	/*
+	 * Trigger cleanup in rare cases where prev_pages_deleted_not_free exceeds
+	 * 2.5% of the total size of the index.  We can reasonably expect (though
+	 * are not guaranteed) to be able to recycle this many pages if we decide
+	 * to do a btvacuumscan call as part of this btvacuumcleanup call.
+	 *
+	 * Our approach won't reliably avoid "wasted" cleanup-only btvacuumscan
+	 * calls.  That is, we can end up scanning the entire index without ever
+	 * placing even 1 of the prev_pages_deleted_not_free pages in the free
+	 * space map, at least in certain narrow cases.
+	 *
+	 * For example, a "wasted" scan will happen when (for whatever reason) no
+	 * XIDs were assigned/allocated since the "# deleted pages" field was last
+	 * set in the metapage by VACUUM.  You can observe this yourself by running
+	 * two VACUUM VERBOSE commands one after the other on an otherwise idle
+	 * system.  When the first VACUUM command manages to delete pages that
+	 * were emptied + deleted during btbulkdelete, the second VACUUM command
+	 * won't be able to place those same deleted pages (which won't be newly
+	 * deleted from the perspective of the second VACUUM command) into the FSM
+	 * during btvacuumcleanup.
+	 *
+	 * Another "wasted FSM-driven cleanup scan" scenario can occur when
+	 * VACUUM's ability to do work is hindered by a long held MVCC snapshot.
+	 * The snapshot prevents page recycling/freeing within btvacuumscan,
+	 * though that will be the least of the DBA's problems.
+	 */
+	if (prev_pages_deleted_not_free >
+		RelationGetNumberOfBlocks(info->index) / 40)
+		return true;
 
-		if (cleanup_scale_factor <= 0 ||
-			info->num_heap_tuples < 0 ||
-			prev_num_heap_tuples <= 0 ||
-			(info->num_heap_tuples - prev_num_heap_tuples) /
-			prev_num_heap_tuples >= cleanup_scale_factor)
-			result = true;
-	}
-
-	_bt_relbuf(info->index, metabuf);
-	return result;
+	return false;
 }
 
 /*
@@ -973,6 +992,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	BlockNumber num_pages;
 	BlockNumber scanblkno;
 	bool		needLock;
+	BlockNumber pages_deleted_not_free;
 
 	/*
 	 * Reset counts that will be incremented during the scan; needed in case
@@ -981,6 +1001,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	stats->estimated_count = false;
 	stats->num_index_tuples = 0;
 	stats->pages_deleted = 0;
+	stats->pages_free = 0;
 
 	/* Set up info to pass down to btvacuumpage */
 	vstate.info = info;
@@ -988,8 +1009,6 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	vstate.callback = callback;
 	vstate.callback_state = callback_state;
 	vstate.cycleid = cycleid;
-	vstate.totFreePages = 0;
-	vstate.oldestBtpoXact = InvalidTransactionId;
 
 	/* Create a temporary memory context to run _bt_pagedel in */
 	vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
@@ -1048,6 +1067,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 		}
 	}
 
+	/* Set statistics num_pages field to final size of index */
+	stats->num_pages = num_pages;
+
 	MemoryContextDelete(vstate.pagedelcontext);
 
 	/*
@@ -1062,27 +1084,26 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	 * Note that if no recyclable pages exist, we don't bother vacuuming the
 	 * FSM at all.
 	 */
-	if (vstate.totFreePages > 0)
+	if (stats->pages_free > 0)
 		IndexFreeSpaceMapVacuum(rel);
 
 	/*
-	 * Maintain the oldest btpo.xact and a count of the current number of heap
-	 * tuples in the metapage (for the benefit of _bt_vacuum_needs_cleanup).
+	 * Maintain the count of the current number of heap tuples in the
+	 * metapage.  Also maintain the latest pages_deleted_not_free.  Both values
+	 * are used within _bt_vacuum_needs_cleanup.
 	 *
-	 * The page with the oldest btpo.xact is typically a page deleted by this
-	 * VACUUM operation, since pages deleted by a previous VACUUM operation
-	 * tend to be placed in the FSM (by the current VACUUM operation) -- such
-	 * pages are not candidates to be the oldest btpo.xact.  (Note that pages
-	 * placed in the FSM are reported as deleted pages in the bulk delete
-	 * statistics, despite not counting as deleted pages for the purposes of
-	 * determining the oldest btpo.xact.)
+	 * pages_deleted_not_free is the number of deleted pages now in the index
+	 * that were not safe to place in the FSM to be recycled just yet.  We
+	 * expect that it will almost certainly be possible to place all of these
+	 * pages in the FSM during the next VACUUM operation.  If the next VACUUM
+	 * operation happens to be cleanup-only, _bt_vacuum_needs_cleanup will be
+	 * called.  We may decide to proceed with a call to btvacuumscan purely
+	 * because there are lots of deleted pages not yet placed in the FSM.
 	 */
-	_bt_update_meta_cleanup_info(rel, vstate.oldestBtpoXact,
+	Assert(stats->pages_deleted >= stats->pages_free);
+	pages_deleted_not_free = stats->pages_deleted - stats->pages_free;
+	_bt_update_meta_cleanup_info(rel, pages_deleted_not_free,
 								 info->num_heap_tuples);
-
-	/* update statistics */
-	stats->num_pages = num_pages;
-	stats->pages_free = vstate.totFreePages;
 }
 
 /*
@@ -1122,9 +1143,10 @@ backtrack:
 
 	/*
 	 * We can't use _bt_getbuf() here because it always applies
-	 * _bt_checkpage(), which will barf on an all-zero page. We want to
-	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
-	 * buffer access strategy.
+	 * _bt_checkpage(), which will barf on an all-zero page.  We want to
+	 * recycle all-zero pages, not fail.  We must also avoid _bt_getbuf()
+	 * because it Assert()s that any existing page is non-recyclable.  And
+	 * we want to use a nondefault buffer access strategy here.
 	 */
 	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
 							 info->strategy);
@@ -1193,8 +1215,8 @@ backtrack:
 	{
 		/* Okay to recycle this page (which could be leaf or internal) */
 		RecordFreeIndexPage(rel, blkno);
-		vstate->totFreePages++;
 		stats->pages_deleted++;
+		stats->pages_free++;
 	}
 	else if (P_ISDELETED(opaque))
 	{
@@ -1203,17 +1225,12 @@ backtrack:
 		 * recycle yet.
 		 */
 		stats->pages_deleted++;
-
-		/* Maintain the oldest btpo.xact */
-		if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
-			TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
-			vstate->oldestBtpoXact = opaque->btpo.xact;
 	}
 	else if (P_ISHALFDEAD(opaque))
 	{
 		/*
 		 * Half-dead leaf page.  Try to delete now.  Might update
-		 * oldestBtpoXact and pages_deleted below.
+		 * pages_deleted below.
 		 */
 		attempt_pagedel = true;
 	}
@@ -1430,7 +1447,7 @@ backtrack:
 		 * count.  There will be no double-counting.
 		 */
 		Assert(blkno == scanblkno);
-		stats->pages_deleted += _bt_pagedel(rel, buf, &vstate->oldestBtpoXact);
+		stats->pages_deleted += _bt_pagedel(rel, buf);
 
 		MemoryContextSwitchTo(oldcontext);
 		/* pagedel released buffer, so we shouldn't */
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 2e3bda8171..d1177d8772 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -169,7 +169,7 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
 		 * we're on the level 1 and asked to lock leaf page in write mode,
 		 * then lock next page in write mode, because it must be a leaf.
 		 */
-		if (opaque->btpo.level == 1 && access == BT_WRITE)
+		if (opaque->btpo_level == 1 && access == BT_WRITE)
 			page_access = BT_WRITE;
 
 		/* drop the read lock on the page, then acquire one on its child */
@@ -2341,9 +2341,9 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
 		}
 
 		/* Done? */
-		if (opaque->btpo.level == level)
+		if (opaque->btpo_level == level)
 			break;
-		if (opaque->btpo.level < level)
+		if (opaque->btpo_level < level)
 			ereport(ERROR,
 					(errcode(ERRCODE_INDEX_CORRUPTED),
 					 errmsg_internal("btree level %u not found in index \"%s\"",
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 5683daa34d..2c4d7f6e25 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -620,7 +620,7 @@ _bt_blnewpage(uint32 level)
 	/* Initialize BT opaque state */
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 	opaque->btpo_prev = opaque->btpo_next = P_NONE;
-	opaque->btpo.level = level;
+	opaque->btpo_level = level;
 	opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
 	opaque->btpo_cycleid = 0;
 
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index c1d578cc01..b6afe9526e 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -112,7 +112,7 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id)
 	md->btm_fastlevel = xlrec->fastlevel;
 	/* Cannot log BTREE_MIN_VERSION index metapage without upgrade */
 	Assert(md->btm_version >= BTREE_NOVAC_VERSION);
-	md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact;
+	md->btm_last_cleanup_num_delpages = xlrec->last_cleanup_num_delpages;
 	md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples;
 	md->btm_allequalimage = xlrec->allequalimage;
 
@@ -297,7 +297,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record)
 
 	ropaque->btpo_prev = origpagenumber;
 	ropaque->btpo_next = spagenumber;
-	ropaque->btpo.level = xlrec->level;
+	ropaque->btpo_level = xlrec->level;
 	ropaque->btpo_flags = isleaf ? BTP_LEAF : 0;
 	ropaque->btpo_cycleid = 0;
 
@@ -773,7 +773,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
 
 	pageop->btpo_prev = xlrec->leftblk;
 	pageop->btpo_next = xlrec->rightblk;
-	pageop->btpo.level = 0;
+	pageop->btpo_level = 0;
 	pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
 	pageop->btpo_cycleid = 0;
 
@@ -802,6 +802,9 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
 	xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record);
 	BlockNumber leftsib;
 	BlockNumber rightsib;
+	uint32		level;
+	bool		isleaf;
+	FullTransactionId safexid;
 	Buffer		leftbuf;
 	Buffer		target;
 	Buffer		rightbuf;
@@ -810,6 +813,12 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
 
 	leftsib = xlrec->leftsib;
 	rightsib = xlrec->rightsib;
+	level = xlrec->level;
+	isleaf = (level == 0);
+	safexid = xlrec->safexid;
+
+	/* No topparent link for leaf page (level 0) or level 1 */
+	Assert(xlrec->topparent == InvalidBlockNumber || level > 1);
 
 	/*
 	 * In normal operation, we would lock all the pages this WAL record
@@ -844,9 +853,9 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
 
 	pageop->btpo_prev = leftsib;
 	pageop->btpo_next = rightsib;
-	pageop->btpo.xact = xlrec->btpo_xact;
-	pageop->btpo_flags = BTP_DELETED;
-	if (!BlockNumberIsValid(xlrec->topparent))
+	pageop->btpo_level = level;
+	BTPageSetDeleted(page, safexid);
+	if (isleaf)
 		pageop->btpo_flags |= BTP_LEAF;
 	pageop->btpo_cycleid = 0;
 
@@ -892,6 +901,8 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
 		Buffer				leafbuf;
 		IndexTupleData		trunctuple;
 
+		Assert(!isleaf);
+
 		leafbuf = XLogInitBufferForRedo(record, 3);
 		page = (Page) BufferGetPage(leafbuf);
 
@@ -901,7 +912,7 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
 		pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
 		pageop->btpo_prev = xlrec->leafleftsib;
 		pageop->btpo_next = xlrec->leafrightsib;
-		pageop->btpo.level = 0;
+		pageop->btpo_level = 0;
 		pageop->btpo_cycleid = 0;
 
 		/* Add a dummy hikey item */
@@ -942,7 +953,7 @@ btree_xlog_newroot(XLogReaderState *record)
 
 	pageop->btpo_flags = BTP_ROOT;
 	pageop->btpo_prev = pageop->btpo_next = P_NONE;
-	pageop->btpo.level = xlrec->level;
+	pageop->btpo_level = xlrec->level;
 	if (xlrec->level == 0)
 		pageop->btpo_flags |= BTP_LEAF;
 	pageop->btpo_cycleid = 0;
@@ -972,17 +983,15 @@ btree_xlog_reuse_page(XLogReaderState *record)
 	 * Btree reuse_page records exist to provide a conflict point when we
 	 * reuse pages in the index via the FSM.  That's all they do though.
 	 *
-	 * latestRemovedXid was the page's btpo.xact.  The
-	 * GlobalVisCheckRemovableXid test in _bt_page_recyclable() conceptually
-	 * mirrors the pgxact->xmin > limitXmin test in
+	 * latestRemovedXid was the page's deleteXid.  The
+	 * GlobalVisCheckRemovableFullXid(deleteXid) test in _bt_page_recyclable()
+	 * conceptually mirrors the PGPROC->xmin > limitXmin test in
 	 * GetConflictingVirtualXIDs().  Consequently, one XID value achieves the
 	 * same exclusion effect on primary and standby.
 	 */
 	if (InHotStandby)
-	{
-		ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid,
-											xlrec->node);
-	}
+		ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid,
+												   xlrec->node);
 }
 
 void
diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c
index 6e0d6a2b72..5cce10a5b6 100644
--- a/src/backend/access/rmgrdesc/nbtdesc.c
+++ b/src/backend/access/rmgrdesc/nbtdesc.c
@@ -80,9 +80,10 @@ btree_desc(StringInfo buf, XLogReaderState *record)
 			{
 				xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) rec;
 
-				appendStringInfo(buf, "left %u; right %u; btpo_xact %u; ",
-								 xlrec->leftsib, xlrec->rightsib,
-								 xlrec->btpo_xact);
+				appendStringInfo(buf, "left %u; right %u; level %u; safexid %u:%u; ",
+								 xlrec->leftsib, xlrec->rightsib, xlrec->level,
+								 EpochFromFullTransactionId(xlrec->safexid),
+								 XidFromFullTransactionId(xlrec->safexid));
 				appendStringInfo(buf, "leafleft %u; leafright %u; topparent %u",
 								 xlrec->leafleftsib, xlrec->leafrightsib,
 								 xlrec->topparent);
@@ -99,9 +100,11 @@ btree_desc(StringInfo buf, XLogReaderState *record)
 			{
 				xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) rec;
 
-				appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u",
+				appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u:%u",
 								 xlrec->node.spcNode, xlrec->node.dbNode,
-								 xlrec->node.relNode, xlrec->latestRemovedXid);
+								 xlrec->node.relNode,
+								 EpochFromFullTransactionId(xlrec->latestRemovedFullXid),
+								 XidFromFullTransactionId(xlrec->latestRemovedFullXid));
 				break;
 			}
 		case XLOG_BTREE_META_CLEANUP:
@@ -110,8 +113,8 @@ btree_desc(StringInfo buf, XLogReaderState *record)
 
 				xlrec = (xl_btree_metadata *) XLogRecGetBlockData(record, 0,
 																  NULL);
-				appendStringInfo(buf, "oldest_btpo_xact %u; last_cleanup_num_heap_tuples: %f",
-								 xlrec->oldest_btpo_xact,
+				appendStringInfo(buf, "last_cleanup_num_delpages %u; last_cleanup_num_heap_tuples: %f",
+								 xlrec->last_cleanup_num_delpages,
 								 xlrec->last_cleanup_num_heap_tuples);
 				break;
 			}
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index 39a30c00f7..0eeb766943 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -452,6 +452,34 @@ ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode
 										   true);
 }
 
+/*
+ * Variant of ResolveRecoveryConflictWithSnapshot that works with
+ * FullTransactionId values
+ */
+void
+ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid,
+										   RelFileNode node)
+{
+	/*
+	 * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
+	 * so truncate the logged FullTransactionId.  If the logged value is very
+	 * old, so that XID wrap-around already happened on it, there can't be any
+	 * snapshots that still see it.
+	 */
+	FullTransactionId nextXid = ReadNextFullTransactionId();
+	uint64			  diff;
+
+	diff = U64FromFullTransactionId(nextXid) -
+		U64FromFullTransactionId(latestRemovedFullXid);
+	if (diff < MaxTransactionId / 2)
+	{
+		TransactionId latestRemovedXid;
+
+		latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid);
+		ResolveRecoveryConflictWithSnapshot(latestRemovedXid, node);
+	}
+}
+
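/*
 * Editorial sketch, not part of the patch: concrete behavior of the
 * wraparound-distance test above.  The helpers come from the real
 * access/transam.h; the function and the epoch/XID values are made up for
 * illustration only.
 */
#include "postgres.h"
#include "access/transam.h"

static bool
conflict_check_needed(FullTransactionId nextXid, FullTransactionId removed)
{
	uint64		diff;

	/* Same arithmetic as ResolveRecoveryConflictWithSnapshotFullXid */
	diff = U64FromFullTransactionId(nextXid) -
		U64FromFullTransactionId(removed);

	/*
	 * With nextXid = {epoch 5, xid 100}:
	 *   removed = {epoch 5, xid 90}: diff is 10, well under 2^31, so the
	 *   truncated 32-bit XID is still meaningful and the conflict check runs;
	 *   removed = {epoch 2, xid 90}: diff is about 3 * 2^32, so no snapshot
	 *   can still see it, and the conflict check is safely skipped.
	 */
	return diff < MaxTransactionId / 2;
}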
 void
 ResolveRecoveryConflictWithTablespace(Oid tsid)
 {
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index b8c7793d9e..0272c6554f 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -769,7 +769,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
 											  P_FIRSTDATAKEY(opaque));
 				itup = (IndexTuple) PageGetItem(state->target, itemid);
 				nextleveldown.leftmost = BTreeTupleGetDownLink(itup);
-				nextleveldown.level = opaque->btpo.level - 1;
+				nextleveldown.level = opaque->btpo_level - 1;
 			}
 			else
 			{
@@ -794,14 +794,14 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
 		if (opaque->btpo_prev != leftcurrent)
 			bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent);
 
-		/* Check level, which must be valid for non-ignorable page */
-		if (level.level != opaque->btpo.level)
+		/* Check level */
+		if (level.level != opaque->btpo_level)
 			ereport(ERROR,
 					(errcode(ERRCODE_INDEX_CORRUPTED),
 					 errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down",
 							RelationGetRelationName(state->rel)),
 					 errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
-										current, level.level, opaque->btpo.level)));
+										current, level.level, opaque->btpo_level)));
 
 		/* Verify invariants for page */
 		bt_target_page_check(state);
@@ -1167,7 +1167,7 @@ bt_target_page_check(BtreeCheckState *state)
 				bt_child_highkey_check(state,
 									   offset,
 									   NULL,
-									   topaque->btpo.level);
+									   topaque->btpo_level);
 			}
 			continue;
 		}
@@ -1529,7 +1529,7 @@ bt_target_page_check(BtreeCheckState *state)
 	if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly)
 	{
 		bt_child_highkey_check(state, InvalidOffsetNumber,
-							   NULL, topaque->btpo.level);
+							   NULL, topaque->btpo_level);
 	}
 }
 
@@ -1606,7 +1606,7 @@ bt_right_page_check_scankey(BtreeCheckState *state)
 		ereport(DEBUG1,
 				(errcode(ERRCODE_NO_DATA),
 				 errmsg("level %u leftmost page of index \"%s\" was found deleted or half dead",
-						opaque->btpo.level, RelationGetRelationName(state->rel)),
+						opaque->btpo_level, RelationGetRelationName(state->rel)),
 				 errdetail_internal("Deleted page found when building scankey from right sibling.")));
 
 		/* Be slightly more pro-active in freeing this memory, just in case */
@@ -1910,14 +1910,15 @@ bt_child_highkey_check(BtreeCheckState *state,
 										(uint32) (state->targetlsn >> 32),
 										(uint32) state->targetlsn)));
 
-		/* Check level for non-ignorable page */
-		if (!P_IGNORE(opaque) && opaque->btpo.level != target_level - 1)
+		/* Do level sanity check */
+		if ((!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque)) &&
+			opaque->btpo_level != target_level - 1)
 			ereport(ERROR,
 					(errcode(ERRCODE_INDEX_CORRUPTED),
 					 errmsg("block found while following rightlinks from child of index \"%s\" has invalid level",
 							RelationGetRelationName(state->rel)),
 					 errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
-										blkno, target_level - 1, opaque->btpo.level)));
+										blkno, target_level - 1, opaque->btpo_level)));
 
 		/* Try to detect circular links */
 		if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev)
@@ -2145,7 +2146,7 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey,
 	 * check for downlink connectivity.
 	 */
 	bt_child_highkey_check(state, downlinkoffnum,
-						   child, topaque->btpo.level);
+						   child, topaque->btpo_level);
 
 	/*
 	 * Since there cannot be a concurrent VACUUM operation in readonly mode,
@@ -2290,7 +2291,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
 				 errmsg("harmless interrupted page split detected in index %s",
 						RelationGetRelationName(state->rel)),
 				 errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.",
-									blkno, opaque->btpo.level,
+									blkno, opaque->btpo_level,
 									opaque->btpo_prev,
 									(uint32) (pagelsn >> 32),
 									(uint32) pagelsn)));
@@ -2321,7 +2322,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
 	elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"",
 		 RelationGetRelationName(state->rel));
 
-	level = opaque->btpo.level;
+	level = opaque->btpo_level;
 	itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque));
 	itup = (IndexTuple) PageGetItem(page, itemid);
 	childblk = BTreeTupleGetDownLink(itup);
@@ -2336,16 +2337,16 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
 			break;
 
 		/* Do an extra sanity check in passing on internal pages */
-		if (copaque->btpo.level != level - 1)
+		if (copaque->btpo_level != level - 1)
 			ereport(ERROR,
 					(errcode(ERRCODE_INDEX_CORRUPTED),
 					 errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down",
 									 RelationGetRelationName(state->rel)),
 					 errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
 										blkno, childblk,
-										level - 1, copaque->btpo.level)));
+										level - 1, copaque->btpo_level)));
 
-		level = copaque->btpo.level;
+		level = copaque->btpo_level;
 		itemid = PageGetItemIdCareful(state, childblk, child,
 									  P_FIRSTDATAKEY(copaque));
 		itup = (IndexTuple) PageGetItem(child, itemid);
@@ -2407,7 +2408,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
 			 errmsg("internal index block lacks downlink in index \"%s\"",
 					RelationGetRelationName(state->rel)),
 			 errdetail_internal("Block=%u level=%u page lsn=%X/%X.",
-								blkno, opaque->btpo.level,
+								blkno, opaque->btpo_level,
 								(uint32) (pagelsn >> 32),
 								(uint32) pagelsn)));
 }
@@ -3002,21 +3003,26 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
 	}
 
 	/*
-	 * Deleted pages have no sane "level" field, so can only check non-deleted
-	 * page level
+	 * Deleted pages that still use the old 32-bit XID representation have no
+	 * sane "level" field because they type pun the field, but all other pages
+	 * (including pages deleted on Postgres 14+) have a valid value.
 	 */
-	if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && opaque->btpo.level != 0)
-		ereport(ERROR,
-				(errcode(ERRCODE_INDEX_CORRUPTED),
-				 errmsg("invalid leaf page level %u for block %u in index \"%s\"",
-						opaque->btpo.level, blocknum, RelationGetRelationName(state->rel))));
+	if (!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque))
+	{
+		/* Okay, no reason not to trust btpo_level field from page */
 
-	if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) &&
-		opaque->btpo.level == 0)
-		ereport(ERROR,
-				(errcode(ERRCODE_INDEX_CORRUPTED),
-				 errmsg("invalid internal page level 0 for block %u in index \"%s\"",
-						blocknum, RelationGetRelationName(state->rel))));
+		if (P_ISLEAF(opaque) && opaque->btpo_level != 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("invalid leaf page level %u for block %u in index \"%s\"",
+							opaque->btpo_level, blocknum, RelationGetRelationName(state->rel))));
+
+		if (!P_ISLEAF(opaque) && opaque->btpo_level == 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("invalid internal page level 0 for block %u in index \"%s\"",
+							blocknum, RelationGetRelationName(state->rel))));
+	}
 
 	/*
 	 * Sanity checks for number of items on page.
@@ -3064,7 +3070,8 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
 	 * from version 9.4 on, so do the same here.  See _bt_pagedel() for full
 	 * details.
 	 *
-	 * Internal pages should never have garbage items, either.
+	 * Also check that internal pages have no garbage items, and that no page
+	 * has an invalid combination of deletion-related page flags.
 	 */
 	if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque))
 		ereport(ERROR,
@@ -3079,6 +3086,18 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
 				 errmsg("internal page block %u in index \"%s\" has garbage items",
 						blocknum, RelationGetRelationName(state->rel))));
 
+	if (P_HAS_FULLXID(opaque) && !P_ISDELETED(opaque))
+		ereport(ERROR,
+				(errcode(ERRCODE_INDEX_CORRUPTED),
+				 errmsg_internal("full transaction id page flag appears in non-deleted block %u in index \"%s\"",
+								 blocknum, RelationGetRelationName(state->rel))));
+
+	if (P_ISDELETED(opaque) && P_ISHALFDEAD(opaque))
+		ereport(ERROR,
+				(errcode(ERRCODE_INDEX_CORRUPTED),
+				 errmsg_internal("deleted page block %u in index \"%s\" is half-dead",
+								 blocknum, RelationGetRelationName(state->rel))));
+
 	return page;
 }
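/*
 * Editorial sketch, not part of the patch: a summary of the page states the
 * new amcheck invariants above accept.  The macros are the real ones from
 * access/nbtree.h (P_HAS_FULLXID is introduced by this patch); the
 * classifier function itself is hypothetical.
 */
#include "postgres.h"
#include "access/nbtree.h"

static const char *
classify_deletion_state(BTPageOpaque opaque)
{
	/* The two combinations palloc_btree_page() now reports as corruption */
	if (P_HAS_FULLXID(opaque) && !P_ISDELETED(opaque))
		return "corrupt: full transaction id flag on non-deleted page";
	if (P_ISDELETED(opaque) && P_ISHALFDEAD(opaque))
		return "corrupt: page marked both deleted and half-dead";

	if (P_ISHALFDEAD(opaque))
		return "half-dead (interrupted deletion)";
	if (!P_ISDELETED(opaque))
		return "live";

	/*
	 * Deleted pages: those deleted on Postgres 14+ carry a full safexid in a
	 * BTDeletedPageContents struct and keep a sane btpo_level, while
	 * pg_upgrade'd pages deleted earlier type pun the level field, so only
	 * the former have a trustworthy level.
	 */
	return P_HAS_FULLXID(opaque) ? "deleted (full safexid on page)" :
		"deleted (pre-14 on-disk representation)";
}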
 
diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c
index 8bb180bbbe..dfac1a9716 100644
--- a/contrib/pageinspect/btreefuncs.c
+++ b/contrib/pageinspect/btreefuncs.c
@@ -75,11 +75,7 @@ typedef struct BTPageStat
 	/* opaque data */
 	BlockNumber btpo_prev;
 	BlockNumber btpo_next;
-	union
-	{
-		uint32		level;
-		TransactionId xact;
-	}			btpo;
+	uint32		btpo_level;
 	uint16		btpo_flags;
 	BTCycleId	btpo_cycleid;
 } BTPageStat;
@@ -112,9 +108,33 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
 	/* page type (flags) */
 	if (P_ISDELETED(opaque))
 	{
-		stat->type = 'd';
-		stat->btpo.xact = opaque->btpo.xact;
-		return;
+		/* We divide deleted pages into leaf ('d') or internal ('D') */
+		if (P_ISLEAF(opaque) || !P_HAS_FULLXID(opaque))
+			stat->type = 'd';
+		else
+			stat->type = 'D';
+
+		/*
+		 * Report safexid in a deleted page.
+		 *
+		 * Handle pg_upgrade'd deleted pages that used the previous safexid
+		 * representation in the btpo_level field (which used to be a union
+		 * type called "btpo").
+		 */
+		if (P_HAS_FULLXID(opaque))
+		{
+			FullTransactionId safexid = BTPageGetDeleteXid(page);
+
+			elog(NOTICE, "deleted page from block %u has safexid %u:%u",
+				 blkno, EpochFromFullTransactionId(safexid),
+				 XidFromFullTransactionId(safexid));
+		}
+		else
+			elog(NOTICE, "deleted page from block %u has safexid %u",
+				 blkno, opaque->btpo_level);
+
+		/* Don't interpret BTDeletedPageContents as index tuples */
+		maxoff = InvalidOffsetNumber;
 	}
 	else if (P_IGNORE(opaque))
 		stat->type = 'e';
@@ -128,7 +148,7 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
 	/* btpage opaque data */
 	stat->btpo_prev = opaque->btpo_prev;
 	stat->btpo_next = opaque->btpo_next;
-	stat->btpo.level = opaque->btpo.level;
+	stat->btpo_level = opaque->btpo_level;
 	stat->btpo_flags = opaque->btpo_flags;
 	stat->btpo_cycleid = opaque->btpo_cycleid;
 
@@ -237,7 +257,8 @@ bt_page_stats_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version)
 	values[j++] = psprintf("%u", stat.free_size);
 	values[j++] = psprintf("%u", stat.btpo_prev);
 	values[j++] = psprintf("%u", stat.btpo_next);
-	values[j++] = psprintf("%u", (stat.type == 'd') ? stat.btpo.xact : stat.btpo.level);
+	/* The "btpo" field now only stores btpo_level, never an xact */
+	values[j++] = psprintf("%u", stat.btpo_level);
 	values[j++] = psprintf("%d", stat.btpo_flags);
 
 	tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
@@ -503,10 +524,14 @@ bt_page_items_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version)
 
 		opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page);
 
-		if (P_ISDELETED(opaque))
-			elog(NOTICE, "page is deleted");
-
-		fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+		if (!P_ISDELETED(opaque))
+			fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+		else
+		{
+			/* Don't interpret BTDeletedPageContents as index tuples */
+			elog(NOTICE, "page from block " INT64_FORMAT " is deleted", blkno);
+			fctx->max_calls = 0;
+		}
 		uargs->leafpage = P_ISLEAF(opaque);
 		uargs->rightmost = P_RIGHTMOST(opaque);
 
@@ -603,7 +628,13 @@ bt_page_items_bytea(PG_FUNCTION_ARGS)
 		if (P_ISDELETED(opaque))
 			elog(NOTICE, "page is deleted");
 
-		fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+		if (!P_ISDELETED(opaque))
+			fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+		else
+		{
+			/* Don't interpret BTDeletedPageContents as index tuples */
+			fctx->max_calls = 0;
+		}
 		uargs->leafpage = P_ISLEAF(opaque);
 		uargs->rightmost = P_RIGHTMOST(opaque);
 
@@ -723,7 +755,8 @@ bt_metap(PG_FUNCTION_ARGS)
 	 */
 	if (metad->btm_version >= BTREE_NOVAC_VERSION)
 	{
-		values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact);
+		/* btm_last_cleanup_num_delpages reuses the btm_oldest_btpo_xact field */
+		values[j++] = psprintf("%u", metad->btm_last_cleanup_num_delpages);
 		values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples);
 		values[j++] = metad->btm_allequalimage ? "t" : "f";
 	}
diff --git a/contrib/pgstattuple/pgstatindex.c b/contrib/pgstattuple/pgstatindex.c
index b1ce0d77d7..5368bb30f0 100644
--- a/contrib/pgstattuple/pgstatindex.c
+++ b/contrib/pgstattuple/pgstatindex.c
@@ -283,8 +283,12 @@ pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo)
 		page = BufferGetPage(buffer);
 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 
-		/* Determine page type, and update totals */
-
+		/*
+		 * Determine page type, and update totals.
+		 *
+		 * Note that we arbitrarily bucket deleted pages together without
+		 * considering if they're leaf pages or internal pages.
+		 */
 		if (P_ISDELETED(opaque))
 			indexStat.deleted_pages++;
 		else if (P_IGNORE(opaque))
-- 
2.27.0

