On Sat, Jan 18, 2025 at 11:51 AM Tomas Vondra <to...@vondra.me> wrote:
>
> Sure. I repeated the benchmark with v13, and it seems the behavior did
> change. I no longer see the "big" regression when most of the pages get
> updated (and need vacuuming).
>
> I can't be 100% sure this is due to changes in the patch, because I did
> some significant upgrades to the machine since that time - it has Ryzen
> 9900x instead of the ancient i5-2500k, new mobo/RAM/...  It's pretty
> much a new machine, I only kept the "old" SATA SSD RAID storage so that
> I can do some tests with non-NVMe.
>
> So there's a (small) chance the previous runs were hitting a bottleneck
> that does not exist on the new hardware.
>
> Anyway, just to make this information more complete, the machine now has
> this configuration:
>
> * Ryzen 9 9900x (12/24C), 64GB RAM
> * storage:
>   - data: Samsung SSD 990 PRO 4TB (NVMe)
>   - raid-nvme: RAID0 4x Samsung SSD 990 PRO 1TB (NVMe)
>   - raid-sata: RAID0 6x Intel DC3700 100GB (SATA)
>
> Attached is the script, raw results (CSV) and two PDFs summarizing the
> results as a pivot table for different test parameters. Compared to the
> earlier run I tweaked the script to also vary io_combine_limit (ioc), as
> I wanted to see how it interacts with effective_io_concurrency (eic).
>
> Looking at the new results, I don't see any regressions, except for two
> cases - data (single NVMe) and raid-nvme (4x NVMe). There's a small area
> of regression for eic=32 and perc=0.0005, but only with WAL-logging.
>
> I'm not sure this is worth worrying about too much. It's a heuristics
> and for every heuristics there's some combination parameters where it
> doesn't quite do the optimal thing. The area where the patch brings
> massive improvements (or does not regress) are much more significant.
>
> I personally am happy with this behavior, seems to be performing fine.

Yes, looking at these results, I also feel good about it. I've updated
the commit metadata in attached v14, but I could use a round of review
before pushing it.

- Melanie
From 2397c7fb4b91f907ebcec60d35067c5072a2ae8b Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplage...@gmail.com>
Date: Wed, 5 Feb 2025 17:23:05 -0500
Subject: [PATCH v14 2/2] Use streaming I/O in VACUUM's third phase

Now vacuum's third phase (its second pass over the heap), which removes
dead items referring to dead tuples collected in the first phase, uses a
read stream that looks ahead in the TidStore.

Author: Melanie Plageman <melanieplage...@gmail.com>
Co-authored-by: Thomas Munro <thomas.mu...@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGKN3oy0bN_3yv8hd78a4%2BM1tJC9z7mD8%2Bf%2ByA%2BGeoFUwQ%40mail.gmail.com
---
 src/backend/access/heap/vacuumlazy.c | 42 ++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 70351de403d..222ee01e1ad 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -2250,6 +2250,27 @@ lazy_vacuum_all_indexes(LVRelState *vacrel)
 	return allindexes;
 }
 
+/*
+ * Read stream callback for vacuum's third phase (second pass over the heap).
+ */
+static BlockNumber
+vacuum_reap_lp_read_stream_next(ReadStream *stream,
+								void *callback_private_data,
+								void *per_buffer_data)
+{
+	TidStoreIter *iter = callback_private_data;
+	TidStoreIterResult *iter_result;
+
+	iter_result = TidStoreIterateNext(iter);
+	if (iter_result == NULL)
+		return InvalidBlockNumber;
+
+	/* Save the TidStoreIterResult for later, so we can extract the offsets. */
+	memcpy(per_buffer_data, iter_result, sizeof(*iter_result));
+
+	return iter_result->blkno;
+}
+
 /*
  *	lazy_vacuum_heap_rel() -- second pass over the heap for two pass strategy
  *
@@ -2270,6 +2291,8 @@ lazy_vacuum_all_indexes(LVRelState *vacrel)
 static void
 lazy_vacuum_heap_rel(LVRelState *vacrel)
 {
+	Buffer		buf;
+	ReadStream *stream;
 	BlockNumber vacuumed_pages = 0;
 	Buffer		vmbuffer = InvalidBuffer;
 	LVSavedErrInfo saved_err_info;
@@ -2290,10 +2313,18 @@ lazy_vacuum_heap_rel(LVRelState *vacrel)
 							 InvalidBlockNumber, InvalidOffsetNumber);
 
 	iter = TidStoreBeginIterate(vacrel->dead_items);
-	while ((iter_result = TidStoreIterateNext(iter)) != NULL)
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE,
+										vacrel->bstrategy,
+										vacrel->rel,
+										MAIN_FORKNUM,
+										vacuum_reap_lp_read_stream_next,
+										iter,
+										sizeof(TidStoreIterResult));
+
+	while (BufferIsValid(buf = read_stream_next_buffer(stream,
+													   (void **) &iter_result)))
 	{
 		BlockNumber blkno;
-		Buffer		buf;
 		Page		page;
 		Size		freespace;
 		OffsetNumber offsets[MaxOffsetNumber];
@@ -2301,8 +2332,7 @@ lazy_vacuum_heap_rel(LVRelState *vacrel)
 
 		vacuum_delay_point();
 
-		blkno = iter_result->blkno;
-		vacrel->blkno = blkno;
+		vacrel->blkno = blkno = BufferGetBlockNumber(buf);
 
 		num_offsets = TidStoreGetBlockOffsets(iter_result, offsets, lengthof(offsets));
 		Assert(num_offsets <= lengthof(offsets));
@@ -2315,8 +2345,6 @@ lazy_vacuum_heap_rel(LVRelState *vacrel)
 		visibilitymap_pin(vacrel->rel, blkno, &vmbuffer);
 
 		/* We need a non-cleanup exclusive lock to mark dead_items unused */
-		buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
-								 vacrel->bstrategy);
 		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 		lazy_vacuum_heap_page(vacrel, blkno, buf, offsets,
 							  num_offsets, vmbuffer);
@@ -2329,6 +2357,8 @@ lazy_vacuum_heap_rel(LVRelState *vacrel)
 		RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
 		vacuumed_pages++;
 	}
+
+	read_stream_end(stream);
 	TidStoreEndIterate(iter);
 
 	vacrel->blkno = InvalidBlockNumber;
-- 
2.34.1

From 6d61028e0900d6b96bfe72cb0c262e22b8bde39a Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplage...@gmail.com>
Date: Wed, 5 Feb 2025 17:21:41 -0500
Subject: [PATCH v14 1/2] Use streaming I/O in VACUUM's first phase

Now vacuum's first phase, which HOT-prunes and records the TIDs of
non-removable dead tuples, uses the streaming read API by converting
heap_vac_scan_next_block() to a read stream callback.

Discussion: https://postgr.es/m/CA%2BhUKGKN3oy0bN_3yv8hd78a4%2BM1tJC9z7mD8%2Bf%2ByA%2BGeoFUwQ%40mail.gmail.com
---
 src/backend/access/heap/vacuumlazy.c | 100 +++++++++++++++++----------
 1 file changed, 62 insertions(+), 38 deletions(-)

diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 075af385cd1..70351de403d 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -108,6 +108,7 @@
 #include "storage/bufmgr.h"
 #include "storage/freespace.h"
 #include "storage/lmgr.h"
+#include "storage/read_stream.h"
 #include "utils/lsyscache.h"
 #include "utils/pg_rusage.h"
 #include "utils/timestamp.h"
@@ -296,8 +297,9 @@ typedef struct LVSavedErrInfo
 
 /* non-export function prototypes */
 static void lazy_scan_heap(LVRelState *vacrel);
-static bool heap_vac_scan_next_block(LVRelState *vacrel, BlockNumber *blkno,
-									 bool *all_visible_according_to_vm);
+static BlockNumber heap_vac_scan_next_block(ReadStream *stream,
+											void *callback_private_data,
+											void *per_buffer_data);
 static void find_next_unskippable_block(LVRelState *vacrel, bool *skipsallvis);
 static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf,
 								   BlockNumber blkno, Page page,
@@ -907,10 +909,11 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 static void
 lazy_scan_heap(LVRelState *vacrel)
 {
+	ReadStream *stream;
 	BlockNumber rel_pages = vacrel->rel_pages,
-				blkno,
+				blkno = 0,
 				next_fsm_block_to_vacuum = 0;
-	bool		all_visible_according_to_vm;
+	bool	   *all_visible_according_to_vm = NULL;
 
 	Buffer		vmbuffer = InvalidBuffer;
 	const int	initprog_index[] = {
@@ -926,26 +929,27 @@ lazy_scan_heap(LVRelState *vacrel)
 	initprog_val[2] = vacrel->dead_items_info->max_bytes;
 	pgstat_progress_update_multi_param(3, initprog_index, initprog_val);
 
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE,
+										vacrel->bstrategy,
+										vacrel->rel,
+										MAIN_FORKNUM,
+										heap_vac_scan_next_block,
+										vacrel,
+										sizeof(bool));
+
 	/* Initialize for the first heap_vac_scan_next_block() call */
 	vacrel->current_block = InvalidBlockNumber;
 	vacrel->next_unskippable_block = InvalidBlockNumber;
 	vacrel->next_unskippable_allvis = false;
 	vacrel->next_unskippable_vmbuffer = InvalidBuffer;
 
-	while (heap_vac_scan_next_block(vacrel, &blkno, &all_visible_according_to_vm))
+	while (true)
 	{
 		Buffer		buf;
 		Page		page;
 		bool		has_lpdead_items;
 		bool		got_cleanup_lock = false;
 
-		vacrel->scanned_pages++;
-
-		/* Report as block scanned, update error traceback information */
-		pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
-		update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_SCAN_HEAP,
-								 blkno, InvalidOffsetNumber);
-
 		vacuum_delay_point();
 
 		/*
@@ -986,7 +990,8 @@ lazy_scan_heap(LVRelState *vacrel)
 
 			/*
 			 * Vacuum the Free Space Map to make newly-freed space visible on
-			 * upper-level FSM pages.  Note we have not yet processed blkno.
+			 * upper-level FSM pages.  Note that blkno is the previously
+			 * processed block.
 			 */
 			FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
 									blkno);
@@ -997,6 +1002,24 @@ lazy_scan_heap(LVRelState *vacrel)
 										 PROGRESS_VACUUM_PHASE_SCAN_HEAP);
 		}
 
+		buf = read_stream_next_buffer(stream, (void **) &all_visible_according_to_vm);
+
+		if (!BufferIsValid(buf))
+			break;
+
+		Assert(all_visible_according_to_vm);
+		CheckBufferIsPinnedOnce(buf);
+		page = BufferGetPage(buf);
+
+		vacrel->scanned_pages++;
+
+		blkno = BufferGetBlockNumber(buf);
+
+		/* Report as block scanned, update error traceback information */
+		pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
+		update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_SCAN_HEAP,
+								 blkno, InvalidOffsetNumber);
+
 		/*
 		 * Pin the visibility map page in case we need to mark the page
 		 * all-visible.  In most cases this will be very cheap, because we'll
@@ -1004,10 +1027,6 @@ lazy_scan_heap(LVRelState *vacrel)
 		 */
 		visibilitymap_pin(vacrel->rel, blkno, &vmbuffer);
 
-		buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
-								 vacrel->bstrategy);
-		page = BufferGetPage(buf);
-
 		/*
 		 * We need a buffer cleanup lock to prune HOT chains and defragment
 		 * the page in lazy_scan_prune.  But when it's not possible to acquire
@@ -1063,7 +1082,7 @@ lazy_scan_heap(LVRelState *vacrel)
 		 */
 		if (got_cleanup_lock)
 			lazy_scan_prune(vacrel, buf, blkno, page,
-							vmbuffer, all_visible_according_to_vm,
+							vmbuffer, *all_visible_according_to_vm,
 							&has_lpdead_items);
 
 		/*
@@ -1117,7 +1136,7 @@ lazy_scan_heap(LVRelState *vacrel)
 		ReleaseBuffer(vmbuffer);
 
 	/* report that everything is now scanned */
-	pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
+	pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, rel_pages);
 
 	/* now we can compute the new value for pg_class.reltuples */
 	vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, rel_pages,
@@ -1132,6 +1151,8 @@ lazy_scan_heap(LVRelState *vacrel)
 		Max(vacrel->new_live_tuples, 0) + vacrel->recently_dead_tuples +
 		vacrel->missed_dead_tuples;
 
+	read_stream_end(stream);
+
 	/*
 	 * Do index vacuuming (call each index's ambulkdelete routine), then do
 	 * related heap vacuuming
@@ -1143,11 +1164,11 @@ lazy_scan_heap(LVRelState *vacrel)
 	 * Vacuum the remainder of the Free Space Map.  We must do this whether or
 	 * not there were indexes, and whether or not we bypassed index vacuuming.
 	 */
-	if (blkno > next_fsm_block_to_vacuum)
-		FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, blkno);
+	if (rel_pages > next_fsm_block_to_vacuum)
+		FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, rel_pages);
 
 	/* report all blocks vacuumed */
-	pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
+	pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, rel_pages);
 
 	/* Do final index cleanup (call each index's amvacuumcleanup routine) */
 	if (vacrel->nindexes > 0 && vacrel->do_index_cleanup)
@@ -1157,14 +1178,14 @@ lazy_scan_heap(LVRelState *vacrel)
 /*
  *	heap_vac_scan_next_block() -- get next block for vacuum to process
  *
- * lazy_scan_heap() calls here every time it needs to get the next block to
- * prune and vacuum.  The function uses the visibility map, vacuum options,
- * and various thresholds to skip blocks which do not need to be processed and
- * sets blkno to the next block to process.
+ * The streaming read callback invokes heap_vac_scan_next_block() every time
+ * lazy_scan_heap() needs the next block to prune and vacuum.  The function
+ * uses the visibility map, vacuum options, and various thresholds to skip
+ * blocks which do not need to be processed and returns the next block to
+ * process or InvalidBlockNumber if there are no remaining blocks.
  *
- * The block number and visibility status of the next block to process are set
- * in *blkno and *all_visible_according_to_vm.  The return value is false if
- * there are no further blocks to process.
+ * The visibility status of the next block to process is set in the
+ * per_buffer_data.
  *
  * vacrel is an in/out parameter here.  Vacuum options and information about
  * the relation are read.  vacrel->skippedallvis is set if we skip a block
@@ -1172,11 +1193,14 @@ lazy_scan_heap(LVRelState *vacrel)
  * relfrozenxid in that case.  vacrel also holds information about the next
  * unskippable block, as bookkeeping for this function.
  */
-static bool
-heap_vac_scan_next_block(LVRelState *vacrel, BlockNumber *blkno,
-						 bool *all_visible_according_to_vm)
+static BlockNumber
+heap_vac_scan_next_block(ReadStream *stream,
+						 void *callback_private_data,
+						 void *per_buffer_data)
 {
 	BlockNumber next_block;
+	LVRelState *vacrel = callback_private_data;
+	bool	   *all_visible_according_to_vm = per_buffer_data;
 
 	/* relies on InvalidBlockNumber + 1 overflowing to 0 on first call */
 	next_block = vacrel->current_block + 1;
@@ -1189,8 +1213,8 @@ heap_vac_scan_next_block(LVRelState *vacrel, BlockNumber *blkno,
 			ReleaseBuffer(vacrel->next_unskippable_vmbuffer);
 			vacrel->next_unskippable_vmbuffer = InvalidBuffer;
 		}
-		*blkno = vacrel->rel_pages;
-		return false;
+		vacrel->current_block = vacrel->rel_pages;
+		return InvalidBlockNumber;
 	}
 
 	/*
@@ -1239,9 +1263,9 @@ heap_vac_scan_next_block(LVRelState *vacrel, BlockNumber *blkno,
 		 * but chose not to.  We know that they are all-visible in the VM,
 		 * otherwise they would've been unskippable.
 		 */
-		*blkno = vacrel->current_block = next_block;
+		vacrel->current_block = next_block;
 		*all_visible_according_to_vm = true;
-		return true;
+		return vacrel->current_block;
 	}
 	else
 	{
@@ -1251,9 +1275,9 @@ heap_vac_scan_next_block(LVRelState *vacrel, BlockNumber *blkno,
 		 */
 		Assert(next_block == vacrel->next_unskippable_block);
 
-		*blkno = vacrel->current_block = next_block;
+		vacrel->current_block = next_block;
 		*all_visible_according_to_vm = vacrel->next_unskippable_allvis;
-		return true;
+		return vacrel->current_block;
 	}
 }
 
-- 
2.34.1

Reply via email to