From d0cf595e7f587b0b8991156c3e08aadc32b81755 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Mon, 18 Jul 2022 14:35:44 -0700
Subject: [PATCH v1 2/4] Teach VACUUM to use visibility map snapshot.

Acquire an in-memory immutable "snapshot" of the target rel's visibility
map at the start of each VACUUM, and use the snapshot to determine when
and how VACUUM will skip pages.

This has significant advantages over the previous approach of using the
authoritative VM fork to decide which pages to skip.  The number of
heap pages processed will no longer increase when some other backend
concurrently modifies a skippable page, since VACUUM will continue to
see the page as skippable (which is correct because the page really is
still skippable "relative to VACUUM's OldestXmin cutoff").  It also
gives VACUUM reliable information about how many pages will be scanned,
before its physical heap scan even begins.  That makes it easier to
model the costs that VACUUM incurs using a top-down, up-front approach.
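
For example, because scanned_pages is known before the first heap page
is read, the VERBOSE output added below can report the scan size up
front (taken directly from the vacuumlazy.c changes in this patch):

    ereport(INFO,
            (errmsg("vacuuming \"%s.%s.%s\"",
                    get_database_name(MyDatabaseId),
                    vacrel->relnamespace, vacrel->relname),
             errdetail_internal("total table size is %u pages, %u pages (%.2f%% of total) must be scanned",
                                orig_rel_pages, scanned_pages,
                                orig_rel_pages == 0 ? 100.0 :
                                100.0 * scanned_pages / orig_rel_pages)));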

Non-aggressive VACUUMs now make an up-front choice about VM skipping
strategy: they decide whether to prioritize early advancement of
relfrozenxid (eager behavior) over avoiding work by skipping all-visible
pages (lazy behavior).  Nothing about the details of how lazy_scan_prune
freezes changes just yet, though a later commit will add the concept of
freezing strategies.
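
In outline, the choice made by the new lazy_scan_strategy function
(added to vacuumlazy.c below) looks like the following sketch.  The
helper name and its exact arguments are illustrative only, and the
adjustment for the always-scanned last heap page is omitted:

    /*
     * Sketch of the up-front choice between eager and lazy skipping.
     * all_visible/all_frozen are block counts from the VM snapshot.
     */
    static bool
    decide_skipallvis(BlockNumber rel_pages,
                      BlockNumber all_visible,
                      BlockNumber all_frozen)
    {
        /* Pages each policy would scan (ignoring the last-page rule) */
        BlockNumber scanned_lazy = rel_pages - all_visible;
        BlockNumber scanned_eager = rel_pages - all_frozen;

        /* All-visible (but not all-frozen) pages only eagerness scans */
        BlockNumber nextra = scanned_eager - scanned_lazy;
        BlockNumber nextra_threshold = Max(32, (BlockNumber) (rel_pages * 0.05));

        /* Be lazy (skip all-visible pages) only when eagerness costs too much */
        return nextra >= nextra_threshold;
    }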

Non-aggressive VACUUMs now explicitly commit to (or decide against)
early relfrozenxid advancement up front.  VACUUM will now either scan
every all-visible page, or none at all.  This replaces lazy_scan_skip's
SKIP_PAGES_THRESHOLD behavior, which was intended to enable early
relfrozenxid advancement (see commit bf136cf6), but left many of the
details to chance.  It was possible that a single all-visible page
located in a range of all-frozen blocks would render it unsafe to
advance relfrozenxid later on; lazy_scan_skip just didn't have enough
high-level context about the table as a whole.  Now the policy for
skipping all-visible pages is exactly the condition that determines
whether it is safe to advance relfrozenxid later on; nothing is left
to chance.
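
The relfrozenxid consequence in heap_vacuum_rel is now equally direct
(simplified from the diff below):

    /*
     * A VACUUM that chose to skip all-visible pages cannot advance
     * relfrozenxid: the skipped pages may contain unfrozen XIDs older
     * than the NewRelfrozenXid tracked while scanning.
     */
    if (vacrel->skipallvis)
    {
        Assert(!vacrel->aggressive);
        vacrel->NewRelfrozenXid = InvalidTransactionId; /* keep old value */
        /* NewRelminMxid is handled the same way */
    }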

TODO: We don't spill VM snapshots to disk just yet (resource management
aspects of VM snapshots still need work).  For now a VM snapshot is just
a copy of the VM's pages kept in a single palloc()'d buffer.
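
For reference, the new visibilitymap.c interface is consumed roughly as
follows.  The caller below is purely hypothetical (VACUUM is the only
real caller), and it ignores the special case for the last heap page:

    #include "postgres.h"

    #include "access/visibilitymap.h"
    #include "storage/bufmgr.h"

    /*
     * Hypothetical example: count the pages that a VACUUM working off a
     * VM snapshot would scan under the skipallfrozen policy.
     */
    static BlockNumber
    count_unskippable_pages(Relation rel)
    {
        BlockNumber rel_pages = RelationGetNumberOfBlocks(rel);
        BlockNumber all_visible,
                    all_frozen,
                    unskippable = 0;
        vmsnapshot *vmsnap;

        /* Copy the VM into local memory, counting all-visible/all-frozen */
        vmsnap = visibilitymap_snap(rel, rel_pages, &all_visible, &all_frozen);

        for (BlockNumber blkno = 0; blkno < rel_pages; blkno++)
        {
            uint8       mapbits = visibilitymap_snap_status(vmsnap, blkno);

            /* Pages not all-frozen in the snapshot must be scanned */
            if ((mapbits & VISIBILITYMAP_ALL_FROZEN) == 0)
                unskippable++;
        }

        visibilitymap_snap_release(vmsnap);
        return unskippable;
    }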
---
 src/include/access/visibilitymap.h      |   7 +
 src/backend/access/heap/vacuumlazy.c    | 323 +++++++++++++++---------
 src/backend/access/heap/visibilitymap.c | 162 ++++++++++++
 3 files changed, 367 insertions(+), 125 deletions(-)

diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h
index 55f67edb6..49f197bb3 100644
--- a/src/include/access/visibilitymap.h
+++ b/src/include/access/visibilitymap.h
@@ -26,6 +26,9 @@
 #define VM_ALL_FROZEN(r, b, v) \
 	((visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_FROZEN) != 0)
 
+/* Snapshot of visibility map at a point in time */
+typedef struct vmsnapshot vmsnapshot;
+
 extern bool visibilitymap_clear(Relation rel, BlockNumber heapBlk,
 								Buffer vmbuf, uint8 flags);
 extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
@@ -35,6 +38,10 @@ extern void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 							  XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid,
 							  uint8 flags);
 extern uint8 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf);
+extern vmsnapshot *visibilitymap_snap(Relation rel, BlockNumber rel_pages,
+									  BlockNumber *all_visible, BlockNumber *all_frozen);
+extern void visibilitymap_snap_release(vmsnapshot *vmsnap);
+extern uint8 visibilitymap_snap_status(vmsnapshot *vmsnap, BlockNumber heapBlk);
 extern void visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen);
 extern BlockNumber visibilitymap_prepare_truncate(Relation rel,
 												  BlockNumber nheapblocks);
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 75cb31e75..2fd703668 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -109,10 +109,10 @@
 	((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ))
 
 /*
- * Before we consider skipping a page that's marked as clean in
- * visibility map, we must've seen at least this many clean pages.
+ * Threshold that controls whether non-aggressive VACUUMs will skip any
+ * all-visible pages
  */
-#define SKIP_PAGES_THRESHOLD	((BlockNumber) 32)
+#define SKIPALLVIS_THRESHOLD_PAGES	0.05	/* i.e. 5% of rel_pages */
 
 /*
  * Size of the prefetch window for lazy vacuum backwards truncation scan.
@@ -146,8 +146,10 @@ typedef struct LVRelState
 
 	/* Aggressive VACUUM? (must set relfrozenxid >= FreezeLimit) */
 	bool		aggressive;
-	/* Use visibility map to skip? (disabled by DISABLE_PAGE_SKIPPING) */
-	bool		skipwithvm;
+	/* Skip (don't scan) all-visible pages? (must be !aggressive) */
+	bool		skipallvis;
+	/* Skip (don't scan) all-frozen pages? */
+	bool		skipallfrozen;
 	/* Wraparound failsafe has been triggered? */
 	bool		failsafe_active;
 	/* Consider index vacuuming bypass optimization? */
@@ -171,13 +173,14 @@ typedef struct LVRelState
 	TransactionId OldestXmin;
 	MultiXactId OldestMxact;
 	GlobalVisState *vistest;
+	/* Snapshot of visibility map, taken just after OldestXmin acquired */
+	vmsnapshot *vmsnap;
 	/* Limits on the age of the oldest unfrozen XID and MXID */
 	TransactionId FreezeLimit;
 	MultiXactId MultiXactCutoff;
 	/* Tracks oldest extant XID/MXID for setting relfrozenxid/relminmxid */
 	TransactionId NewRelfrozenXid;
 	MultiXactId NewRelminMxid;
-	bool		skippedallvis;
 
 	/* Error reporting state */
 	char	   *relnamespace;
@@ -248,10 +251,12 @@ typedef struct LVSavedErrInfo
 
 /* non-export function prototypes */
 static void lazy_scan_heap(LVRelState *vacrel);
-static BlockNumber lazy_scan_skip(LVRelState *vacrel, Buffer *vmbuffer,
+static BlockNumber lazy_scan_strategy(LVRelState *vacrel,
+									  BlockNumber all_visible,
+									  BlockNumber all_frozen);
+static BlockNumber lazy_scan_skip(LVRelState *vacrel,
 								  BlockNumber next_block,
-								  bool *next_unskippable_allvis,
-								  bool *skipping_current_range);
+								  bool *next_unskippable_allvis);
 static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf,
 								   BlockNumber blkno, Page page,
 								   bool sharelock, Buffer vmbuffer);
@@ -314,7 +319,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	bool		verbose,
 				instrument,
 				aggressive,
-				skipwithvm,
+				skipallfrozen,
 				frozenxid_updated,
 				minmulti_updated;
 	TransactionId OldestXmin,
@@ -322,6 +327,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	MultiXactId OldestMxact,
 				MultiXactCutoff;
 	BlockNumber orig_rel_pages,
+				all_visible,
+				all_frozen,
+				scanned_pages,
 				new_rel_pages,
 				new_rel_allvisible;
 	PGRUsage	ru0;
@@ -367,7 +375,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 									   &OldestXmin, &OldestMxact,
 									   &FreezeLimit, &MultiXactCutoff);
 
-	skipwithvm = true;
+	skipallfrozen = true;
 	if (params->options & VACOPT_DISABLE_PAGE_SKIPPING)
 	{
 		/*
@@ -375,7 +383,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 		 * visibility map (even those set all-frozen)
 		 */
 		aggressive = true;
-		skipwithvm = false;
+		skipallfrozen = false;
 	}
 
 	/*
@@ -400,20 +408,6 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	errcallback.arg = vacrel;
 	errcallback.previous = error_context_stack;
 	error_context_stack = &errcallback;
-	if (verbose)
-	{
-		Assert(!IsAutoVacuumWorkerProcess());
-		if (aggressive)
-			ereport(INFO,
-					(errmsg("aggressively vacuuming \"%s.%s.%s\"",
-							get_database_name(MyDatabaseId),
-							vacrel->relnamespace, vacrel->relname)));
-		else
-			ereport(INFO,
-					(errmsg("vacuuming \"%s.%s.%s\"",
-							get_database_name(MyDatabaseId),
-							vacrel->relnamespace, vacrel->relname)));
-	}
 
 	/* Set up high level stuff about rel and its indexes */
 	vacrel->rel = rel;
@@ -440,7 +434,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	Assert(params->truncate != VACOPTVALUE_UNSPECIFIED &&
 		   params->truncate != VACOPTVALUE_AUTO);
 	vacrel->aggressive = aggressive;
-	vacrel->skipwithvm = skipwithvm;
+	/* Set skipallvis/skipallfrozen provisionally (before lazy_scan_strategy) */
+	vacrel->skipallvis = (!aggressive && skipallfrozen);
+	vacrel->skipallfrozen = skipallfrozen;
 	vacrel->failsafe_active = false;
 	vacrel->consider_bypass_optimization = true;
 	vacrel->do_index_vacuuming = true;
@@ -505,11 +501,19 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	 * thereby guaranteed to not miss any tuples with XIDs < OldestXmin. These
 	 * XIDs must at least be considered for freezing (though not necessarily
 	 * frozen) during its scan.
+	 *
+	 * Also acquire a read-only snapshot of the visibility map at this point.
+	 * We can work off of the snapshot when deciding which heap pages are safe
+	 * to skip.  This approach allows VACUUM to avoid scanning pages whose VM
+	 * bit gets unset concurrently, which is important with large tables that
+	 * take a long time to VACUUM.
 	 */
 	vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel);
 	vacrel->OldestXmin = OldestXmin;
 	vacrel->OldestMxact = OldestMxact;
 	vacrel->vistest = GlobalVisTestFor(rel);
+	vacrel->vmsnap = visibilitymap_snap(rel, orig_rel_pages,
+										&all_visible, &all_frozen);
 	/* FreezeLimit controls XID freezing (always <= OldestXmin) */
 	vacrel->FreezeLimit = FreezeLimit;
 	/* MultiXactCutoff controls MXID freezing (always <= OldestMxact) */
@@ -517,7 +521,35 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	/* Initialize state used to track oldest extant XID/MXID */
 	vacrel->NewRelfrozenXid = OldestXmin;
 	vacrel->NewRelminMxid = OldestMxact;
-	vacrel->skippedallvis = false;
+
+	/*
+	 * Use the visibility map snapshot to decide up front whether
+	 * lazy_scan_heap will skip all-visible pages
+	 */
+	scanned_pages = lazy_scan_strategy(vacrel,
+									   all_visible, all_frozen);
+	if (verbose)
+	{
+		Assert(!IsAutoVacuumWorkerProcess());
+		if (aggressive)
+			ereport(INFO,
+					(errmsg("aggressively vacuuming \"%s.%s.%s\"",
+							get_database_name(MyDatabaseId),
+							vacrel->relnamespace, vacrel->relname),
+					 errdetail_internal("total table size is %u pages, %u pages (%.2f%% of total) must be scanned",
+										orig_rel_pages, scanned_pages,
+										orig_rel_pages == 0 ? 100.0 :
+										100.0 * scanned_pages / orig_rel_pages)));
+		else
+			ereport(INFO,
+					(errmsg("vacuuming \"%s.%s.%s\"",
+							get_database_name(MyDatabaseId),
+							vacrel->relnamespace, vacrel->relname),
+					 errdetail_internal("total table size is %u pages, %u pages (%.2f%% of total) must be scanned",
+										orig_rel_pages, scanned_pages,
+										orig_rel_pages == 0 ? 100.0 :
+										100.0 * scanned_pages / orig_rel_pages)));
+	}
 
 	/*
 	 * Allocate dead_items array memory using dead_items_alloc.  This handles
@@ -534,6 +566,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	 * vacuuming, and heap vacuuming (plus related processing)
 	 */
 	lazy_scan_heap(vacrel);
+	Assert(vacrel->scanned_pages == scanned_pages);
 
 	/*
 	 * Free resources managed by dead_items_alloc.  This ends parallel mode in
@@ -580,12 +613,11 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 		   MultiXactIdPrecedesOrEquals(aggressive ? MultiXactCutoff :
 									   vacrel->relminmxid,
 									   vacrel->NewRelminMxid));
-	if (vacrel->skippedallvis)
+	if (vacrel->skipallvis)
 	{
 		/*
-		 * Must keep original relfrozenxid in a non-aggressive VACUUM that
-		 * chose to skip an all-visible page range.  The state that tracks new
-		 * values will have missed unfrozen XIDs from the pages we skipped.
+		 * Must keep original relfrozenxid in a non-aggressive VACUUM whose
+		 * lazy_scan_strategy call determined it would skip all-visible pages
 		 */
 		Assert(!aggressive);
 		vacrel->NewRelfrozenXid = InvalidTransactionId;
@@ -630,6 +662,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 						 vacrel->missed_dead_tuples);
 	pgstat_progress_end_command();
 
+	/* Done with rel's visibility map snapshot */
+	visibilitymap_snap_release(vacrel->vmsnap);
+
 	if (instrument)
 	{
 		TimestampTz endtime = GetCurrentTimestamp();
@@ -853,8 +888,7 @@ lazy_scan_heap(LVRelState *vacrel)
 				next_fsm_block_to_vacuum = 0;
 	VacDeadItems *dead_items = vacrel->dead_items;
 	Buffer		vmbuffer = InvalidBuffer;
-	bool		next_unskippable_allvis,
-				skipping_current_range;
+	bool		next_unskippable_allvis;
 	const int	initprog_index[] = {
 		PROGRESS_VACUUM_PHASE,
 		PROGRESS_VACUUM_TOTAL_HEAP_BLKS,
@@ -868,43 +902,33 @@ lazy_scan_heap(LVRelState *vacrel)
 	initprog_val[2] = dead_items->max_items;
 	pgstat_progress_update_multi_param(3, initprog_index, initprog_val);
 
-	/* Set up an initial range of skippable blocks using the visibility map */
-	next_unskippable_block = lazy_scan_skip(vacrel, &vmbuffer, 0,
-											&next_unskippable_allvis,
-											&skipping_current_range);
+	/* Set up an initial range of skippable blocks using VM snapshot */
+	next_unskippable_block = lazy_scan_skip(vacrel, 0,
+											&next_unskippable_allvis);
 	for (blkno = 0; blkno < rel_pages; blkno++)
 	{
 		Buffer		buf;
 		Page		page;
-		bool		all_visible_according_to_vm;
+		bool		all_visible_according_to_vmsnap;
 		LVPagePruneState prunestate;
 
-		if (blkno == next_unskippable_block)
-		{
-			/*
-			 * Can't skip this page safely.  Must scan the page.  But
-			 * determine the next skippable range after the page first.
-			 */
-			all_visible_according_to_vm = next_unskippable_allvis;
-			next_unskippable_block = lazy_scan_skip(vacrel, &vmbuffer,
-													blkno + 1,
-													&next_unskippable_allvis,
-													&skipping_current_range);
-
-			Assert(next_unskippable_block >= blkno + 1);
-		}
-		else
+		if (blkno < next_unskippable_block)
 		{
 			/* Last page always scanned (may need to set nonempty_pages) */
 			Assert(blkno < rel_pages - 1);
 
-			if (skipping_current_range)
-				continue;
-
-			/* Current range is too small to skip -- just scan the page */
-			all_visible_according_to_vm = true;
+			/* Skip (don't scan) this page */
+			continue;
 		}
 
+		/*
+		 * Can't skip this page safely.  Must scan the page.  But determine
+		 * the next skippable range after the page first.
+		 */
+		all_visible_according_to_vmsnap = next_unskippable_allvis;
+		next_unskippable_block = lazy_scan_skip(vacrel, blkno + 1,
+												&next_unskippable_allvis);
+
 		vacrel->scanned_pages++;
 
 		/* Report as block scanned, update error traceback information */
@@ -1113,10 +1137,10 @@ lazy_scan_heap(LVRelState *vacrel)
 		}
 
 		/*
-		 * Handle setting visibility map bit based on information from the VM
-		 * (as of last lazy_scan_skip() call), and from prunestate
+		 * Handle setting visibility map bit based on information from our VM
+		 * snapshot, and from prunestate
 		 */
-		if (!all_visible_according_to_vm && prunestate.all_visible)
+		if (!all_visible_according_to_vmsnap && prunestate.all_visible)
 		{
 			uint8		flags = VISIBILITYMAP_ALL_VISIBLE;
 
@@ -1145,11 +1169,11 @@ lazy_scan_heap(LVRelState *vacrel)
 
 		/*
 		 * As of PostgreSQL 9.2, the visibility map bit should never be set if
-		 * the page-level bit is clear.  However, it's possible that the bit
-		 * got cleared after lazy_scan_skip() was called, so we must recheck
-		 * with buffer lock before concluding that the VM is corrupt.
+		 * the page-level bit is clear.  However, lazy_scan_skip works off of
+		 * a snapshot of the VM that might be quite old by now.  Recheck with
+		 * a buffer lock held before concluding that the VM is corrupt.
 		 */
-		else if (all_visible_according_to_vm && !PageIsAllVisible(page)
+		else if (all_visible_according_to_vmsnap && !PageIsAllVisible(page)
 				 && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
 		{
 			elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
@@ -1188,7 +1212,7 @@ lazy_scan_heap(LVRelState *vacrel)
 		 * mark it as all-frozen.  Note that all_frozen is only valid if
 		 * all_visible is true, so we must check both prunestate fields.
 		 */
-		else if (all_visible_according_to_vm && prunestate.all_visible &&
+		else if (all_visible_according_to_vmsnap && prunestate.all_visible &&
 				 prunestate.all_frozen &&
 				 !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
 		{
@@ -1281,7 +1305,97 @@ lazy_scan_heap(LVRelState *vacrel)
 }
 
 /*
- *	lazy_scan_skip() -- set up range of skippable blocks using visibility map.
+ *	lazy_scan_strategy() -- Determine skipping strategy.
+ *
+ * Determines whether the ongoing VACUUM will skip all-visible pages.  Only
+ * non-aggressive VACUUMs can; advancing relfrozenxid is optional for them.
+ *
+ * Returns final scanned_pages for the VACUUM operation.
+ */
+static BlockNumber
+lazy_scan_strategy(LVRelState *vacrel,
+				   BlockNumber all_visible,
+				   BlockNumber all_frozen)
+{
+	BlockNumber rel_pages = vacrel->rel_pages,
+				scanned_pages_skipallvis,
+				scanned_pages_skipallfrozen;
+	uint8		mapbits;
+
+	Assert(vacrel->scanned_pages == 0);
+	Assert(rel_pages >= all_visible && all_visible >= all_frozen);
+
+	/*
+	 * First figure out the final scanned_pages for each of the skipping
+	 * policies that lazy_scan_skip might end up using: skipallvis (skip both
+	 * all-frozen and all-visible) and skipallfrozen (just skip all-frozen).
+	 */
+	scanned_pages_skipallvis = rel_pages - all_visible;
+	scanned_pages_skipallfrozen = rel_pages - all_frozen;
+
+	/*
+	 * Even if the last page is skippable, it will still get scanned because
+	 * of lazy_scan_skip's "try to set nonempty_pages for last page" rule.
+	 * Reconcile that rule with what vmsnap says about the last page now.
+	 *
+	 * When vmsnap thinks that we will be skipping the last page (we won't),
+	 * increment scanned_pages to compensate.  Otherwise change nothing.
+	 */
+	mapbits = visibilitymap_snap_status(vacrel->vmsnap, rel_pages - 1);
+	if (mapbits & VISIBILITYMAP_ALL_VISIBLE)
+		scanned_pages_skipallvis++;
+	if (mapbits & VISIBILITYMAP_ALL_FROZEN)
+		scanned_pages_skipallfrozen++;
+
+	/*
+	 * Okay, now we have all the information we need to decide on a strategy
+	 */
+	if (!vacrel->skipallfrozen)
+	{
+		/* DISABLE_PAGE_SKIPPING makes all skipping unsafe */
+		Assert(vacrel->aggressive && !vacrel->skipallvis);
+		return rel_pages;
+	}
+	else if (vacrel->aggressive)
+		Assert(!vacrel->skipallvis);
+	else
+	{
+		BlockNumber nextra,
+					nextra_threshold;
+
+		/*
+		 * Decide on whether or not we'll skip all-visible pages.
+		 *
+		 * In general, VACUUM doesn't necessarily have to freeze anything to
+		 * be able to advance relfrozenxid and/or relminmxid by a significant
+		 * number of XIDs/MXIDs.  The oldest tuples might turn out to have
+		 * been deleted since VACUUM last ran, or this VACUUM might find that
+		 * there simply are no MultiXacts that even need to be considered.
+		 *
+		 * It's hard to predict whether this VACUUM operation will work out
+		 * that way, so be lazy (just skip) unless the added cost is very low.
+		 * We opt for a skipallfrozen-only VACUUM when the number of extra
+		 * pages (extra scanned pages that are all-visible but not all-frozen)
+		 * is less than 5% of rel_pages (or 32 pages when rel_pages is small).
+		 */
+		nextra = scanned_pages_skipallfrozen - scanned_pages_skipallvis;
+		nextra_threshold = (double) rel_pages * SKIPALLVIS_THRESHOLD_PAGES;
+		nextra_threshold = Max(32, nextra_threshold);
+
+		vacrel->skipallvis = nextra >= nextra_threshold;
+	}
+
+	/* Return the appropriate variant of scanned_pages */
+	if (vacrel->skipallvis)
+		return scanned_pages_skipallvis;
+	if (vacrel->skipallfrozen)
+		return scanned_pages_skipallfrozen;
+
+	return rel_pages;			/* keep compiler quiet */
+}
+
+/*
+ *	lazy_scan_skip() -- set up the next range of skippable blocks.
  *
  * lazy_scan_heap() calls here every time it needs to set up a new range of
  * blocks to skip via the visibility map.  Caller passes the next block in
@@ -1289,34 +1403,25 @@ lazy_scan_heap(LVRelState *vacrel)
  * no skippable blocks we just return caller's next_block.  The all-visible
  * status of the returned block is set in *next_unskippable_allvis for caller,
  * too.  Block usually won't be all-visible (since it's unskippable), but it
- * can be during aggressive VACUUMs (as well as in certain edge cases).
+ * can be for rel's last page, and when DISABLE_PAGE_SKIPPING is used.
  *
- * Sets *skipping_current_range to indicate if caller should skip this range.
- * Costs and benefits drive our decision.  Very small ranges won't be skipped.
- *
- * Note: our opinion of which blocks can be skipped can go stale immediately.
- * It's okay if caller "misses" a page whose all-visible or all-frozen marking
- * was concurrently cleared, though.  All that matters is that caller scan all
- * pages whose tuples might contain XIDs < OldestXmin, or MXIDs < OldestMxact.
- * (Actually, non-aggressive VACUUMs can choose to skip all-visible pages with
- * older XIDs/MXIDs.  The vacrel->skippedallvis flag will be set here when the
- * choice to skip such a range is actually made, making everything safe.)
+ * This function operates on a snapshot of the visibility map that was taken
+ * just after OldestXmin was acquired.  VACUUM only needs to scan all pages
+ * whose tuples might contain XIDs < OldestXmin (or MXIDs < OldestMxact),
+ * which excludes pages treated as all-frozen here (pages >= rel_pages, too).
  */
 static BlockNumber
-lazy_scan_skip(LVRelState *vacrel, Buffer *vmbuffer, BlockNumber next_block,
-			   bool *next_unskippable_allvis, bool *skipping_current_range)
+lazy_scan_skip(LVRelState *vacrel, BlockNumber next_block,
+			   bool *next_unskippable_allvis)
 {
 	BlockNumber rel_pages = vacrel->rel_pages,
-				next_unskippable_block = next_block,
-				nskippable_blocks = 0;
-	bool		skipsallvis = false;
+				next_unskippable_block = next_block;
 
 	*next_unskippable_allvis = true;
 	while (next_unskippable_block < rel_pages)
 	{
-		uint8		mapbits = visibilitymap_get_status(vacrel->rel,
-													   next_unskippable_block,
-													   vmbuffer);
+		uint8		mapbits = visibilitymap_snap_status(vacrel->vmsnap,
+														next_unskippable_block);
 
 		if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) == 0)
 		{
@@ -1332,55 +1437,23 @@ lazy_scan_skip(LVRelState *vacrel, Buffer *vmbuffer, BlockNumber next_block,
 		 * lock on rel to attempt a truncation that fails anyway, just because
 		 * there are tuples on the last page (it is likely that there will be
 		 * tuples on other nearby pages as well, but those can be skipped).
-		 *
-		 * Implement this by always treating the last block as unsafe to skip.
 		 */
 		if (next_unskippable_block == rel_pages - 1)
 			break;
 
 		/* DISABLE_PAGE_SKIPPING makes all skipping unsafe */
-		if (!vacrel->skipwithvm)
+		Assert(vacrel->skipallfrozen || !vacrel->skipallvis);
+		if (!vacrel->skipallfrozen)
 			break;
 
 		/*
-		 * Aggressive VACUUM caller can't skip pages just because they are
-		 * all-visible.  They may still skip all-frozen pages, which can't
-		 * contain XIDs < OldestXmin (XIDs that aren't already frozen by now).
+		 * Don't skip all-visible pages when lazy_scan_strategy determined
+		 * that it was more important for this VACUUM to advance relfrozenxid
 		 */
-		if ((mapbits & VISIBILITYMAP_ALL_FROZEN) == 0)
-		{
-			if (vacrel->aggressive)
-				break;
+		if (!vacrel->skipallvis && (mapbits & VISIBILITYMAP_ALL_FROZEN) == 0)
+			break;
 
-			/*
-			 * All-visible block is safe to skip in non-aggressive case.  But
-			 * remember that the final range contains such a block for later.
-			 */
-			skipsallvis = true;
-		}
-
-		vacuum_delay_point();
 		next_unskippable_block++;
-		nskippable_blocks++;
-	}
-
-	/*
-	 * We only skip a range with at least SKIP_PAGES_THRESHOLD consecutive
-	 * pages.  Since we're reading sequentially, the OS should be doing
-	 * readahead for us, so there's no gain in skipping a page now and then.
-	 * Skipping such a range might even discourage sequential detection.
-	 *
-	 * This test also enables more frequent relfrozenxid advancement during
-	 * non-aggressive VACUUMs.  If the range has any all-visible pages then
-	 * skipping makes updating relfrozenxid unsafe, which is a real downside.
-	 */
-	if (nskippable_blocks < SKIP_PAGES_THRESHOLD)
-		*skipping_current_range = false;
-	else
-	{
-		*skipping_current_range = true;
-		if (skipsallvis)
-			vacrel->skippedallvis = true;
 	}
 
 	return next_unskippable_block;
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index ed72eb7b6..6848576fd 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -16,6 +16,9 @@
  *		visibilitymap_pin_ok - check whether correct map page is already pinned
  *		visibilitymap_set	 - set a bit in a previously pinned page
  *		visibilitymap_get_status - get status of bits
+ *		visibilitymap_snap - get read-only snapshot of visibility map
+ *		visibilitymap_snap_release - release previously acquired snapshot
+ *		visibilitymap_snap_status - get status of bits from vm snapshot
  *		visibilitymap_count  - count number of bits set in visibility map
  *		visibilitymap_prepare_truncate -
  *			prepare for truncation of the visibility map
@@ -52,6 +55,9 @@
  *
  * VACUUM will normally skip pages for which the visibility map bit is set;
  * such pages can't contain any dead tuples and therefore don't need vacuuming.
+ * VACUUM uses a snapshot of the visibility map to avoid scanning pages whose
+ * visibility map bit gets concurrently unset (such pages can only contain
+ * tuples deleted after its OldestXmin cutoff, which aren't considered dead).
  *
  * LOCKING
  *
@@ -124,6 +130,22 @@
 #define FROZEN_MASK64	UINT64CONST(0xaaaaaaaaaaaaaaaa) /* The upper bit of each
 														 * bit pair */
 
+/*
+ * Snapshot of visibility map at a point in time.
+ *
+ * TODO This is currently always just a palloc()'d buffer -- give more thought
+ * to resource management (at a minimum add spilling to temp file).
+ */
+struct vmsnapshot
+{
+	/* Snapshot may contain zero or more visibility map pages */
+	BlockNumber nvmpages;
+
+	/* Copy of VM pages from the time that visibilitymap_snap() was called */
+	char		vmpages[FLEXIBLE_ARRAY_MEMBER];
+};
+
+
 /* prototypes for internal routines */
 static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend);
 static void vm_extend(Relation rel, BlockNumber vm_nblocks);
@@ -368,6 +390,146 @@ visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *buf)
 	return result;
 }
 
+/*
+ *	visibilitymap_snap - get read-only snapshot of visibility map
+ *
+ * Initializes caller's snapshot, allocating memory in caller's memory context.
+ * Caller can use visibilitymap_snap_status to get the status of individual
+ * heap pages at the point that we were called.
+ *
+ * Used by VACUUM to determine up front which pages it must scan.  This avoids
+ * useless scans of pages whose VM bit is concurrently unset; VACUUM prefers
+ * to leave such pages to be scanned by the next VACUUM operation.
+ *
+ * rel_pages is the current size of the heap relation.
+ */
+vmsnapshot *
+visibilitymap_snap(Relation rel, BlockNumber rel_pages,
+				   BlockNumber *all_visible, BlockNumber *all_frozen)
+{
+	BlockNumber nvmpages,
+				mapBlockLast;
+	vmsnapshot *vmsnap;
+
+#ifdef TRACE_VISIBILITYMAP
+	elog(DEBUG1, "visibilitymap_snap %s %d", RelationGetRelationName(rel), rel_pages);
+#endif
+
+	/*
+	 * Allocate space for VM pages up to and including the page that holds
+	 * the bits for the would-be heap block just beyond rel_pages
+	 */
+	*all_visible = 0;
+	*all_frozen = 0;
+	mapBlockLast = HEAPBLK_TO_MAPBLOCK(rel_pages);
+	nvmpages = mapBlockLast + 1;
+	vmsnap = palloc(offsetof(vmsnapshot, vmpages) + BLCKSZ * nvmpages);
+	vmsnap->nvmpages = nvmpages;
+
+	for (BlockNumber mapBlock = 0; mapBlock <= mapBlockLast; mapBlock++)
+	{
+		Page		localvmpage;
+		Buffer		mapBuffer;
+		char	   *map;
+		uint64	   *umap;
+
+		mapBuffer = vm_readbuf(rel, mapBlock, false);
+		if (!BufferIsValid(mapBuffer))
+		{
+			/*
+			 * Not all VM pages available.  Remember that, so that we'll treat
+			 * relevant heap pages as not all-visible/all-frozen when asked.
+			 */
+			vmsnap->nvmpages = mapBlock;
+			break;
+		}
+
+		localvmpage = vmsnap->vmpages + mapBlock * BLCKSZ;
+		LockBuffer(mapBuffer, BUFFER_LOCK_SHARE);
+		memcpy(localvmpage, BufferGetPage(mapBuffer), BLCKSZ);
+		UnlockReleaseBuffer(mapBuffer);
+
+		/*
+		 * Visibility map page copied to local buffer for caller's snapshot.
+		 * Caller requires an exact count of all-visible and all-frozen blocks
+		 * in the heap relation.  Handle that now.
+		 *
+		 * Must "truncate" our local copy of the VM to avoid incorrectly
+		 * counting heap pages >= rel_pages as all-visible/all-frozen.  Handle
+		 * this by clearing irrelevant bits on the last VM page copied.
+		 */
+		map = PageGetContents(localvmpage);
+		if (mapBlock == mapBlockLast)
+		{
+			/* byte and bit for the first heap block beyond rel_pages */
+			uint32		truncByte = HEAPBLK_TO_MAPBYTE(rel_pages);
+			uint8		truncOffset = HEAPBLK_TO_OFFSET(rel_pages);
+
+			if (truncByte != 0 || truncOffset != 0)
+			{
+				/* Clear any bits set for heap pages >= rel_pages */
+				MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1));
+				map[truncByte] &= (1 << truncOffset) - 1;
+			}
+
+			/* Now it's safe to tally bits from this final VM page below */
+		}
+
+		/* Tally the all-visible and all-frozen counts from this page */
+		umap = (uint64 *) map;
+		for (int i = 0; i < MAPSIZE / sizeof(uint64); i++)
+		{
+			*all_visible += pg_popcount64(umap[i] & VISIBLE_MASK64);
+			*all_frozen += pg_popcount64(umap[i] & FROZEN_MASK64);
+		}
+	}
+
+	return vmsnap;
+}
+
+/*
+ *	visibilitymap_snap_release - release previously acquired snapshot
+ *
+ * Just frees the memory allocated by visibilitymap_snap for now (presumably
+ * this will need to release temp files in later revisions of the patch)
+ */
+void
+visibilitymap_snap_release(vmsnapshot *vmsnap)
+{
+	pfree(vmsnap);
+}
+
+/*
+ *	visibilitymap_snap_status - get status of bits from vm snapshot
+ *
+ * Are all tuples on heapBlk visible to all or are marked frozen, according
+ * to caller's snapshot of the visibility map?
+ */
+uint8
+visibilitymap_snap_status(vmsnapshot *vmsnap, BlockNumber heapBlk)
+{
+	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
+	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
+	uint8		mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
+	char	   *map;
+	uint8		result;
+	Page		localvmpage;
+
+#ifdef TRACE_VISIBILITYMAP
+	elog(DEBUG1, "visibilitymap_snap_status %d", heapBlk);
+#endif
+
+	/* If we didn't copy the VM page, assume heapBlk not all-visible */
+	if (mapBlock >= vmsnap->nvmpages)
+		return 0;
+
+	localvmpage = ((Page) vmsnap->vmpages) + mapBlock * BLCKSZ;
+	map = PageGetContents(localvmpage);
+
+	result = ((map[mapByte] >> mapOffset) & VISIBILITYMAP_VALID_BITS);
+	return result;
+}
+
 /*
  *	visibilitymap_count  - count number of bits set in visibility map
  *
-- 
2.34.1

