On Wed, Mar 19, 2025 at 5:26 AM Andrey Borodin <x4...@yandex-team.ru> wrote: > > So, yes, your change to the test seems correct to me. We can do the test with > just one injection point.
Attached 0001 is what I plan to commit first thing tomorrow morning. I moved the vacuum_delay_point() so that we would call pgstat_progress_update_param() as soon as we were done processing the block (instead of doing vacuum_delay_point() first). This also makes it consistent with the other vacuum streaming read users. What do we even use the progress update for here, though? For heap vacuuming it makes sense because we report heap blocks vacuumed in pg_stat_progress_vacuum, but we don't do that for index blocks vacuumed... I plan to commit 0001 (the read stream user) without 0002 and then solicit more feedback on 0002. 0002 is a draft of the test. I want us to actually test the backtracking behavior. To do this, I think we need two injection points with the current injection point infrastructure. Which I don't love. I think if we could pass variables into INJECTION_POINT(), we could do the test with one injection point. I also don't love how "failing" for this test is just it hanging because the nbtree-vacuum-page-backtrack wait event never happens (there is no specific assertion or anything in the test). I do like that I moved the nbtree-vacuum-page injection point to the top of nbtreevacuumpage because I think it can be used for other tests in the future. I also think it is worth making a btree directory in src/test instead of adding this to src/test/modules/test_misc. In fact it might be worth moving the gin and brin tests out of src/test/modules and making a new "indexes" directory in src/test with gin, brin, and btree (also some spgist tests are in there somewhere) subdirectories. - Melanie
From 06689ada5011e1abeadaa0d33538e0b4069e5723 Mon Sep 17 00:00:00 2001 From: Melanie Plageman <melanieplage...@gmail.com> Date: Thu, 20 Mar 2025 16:09:41 -0400 Subject: [PATCH v10 2/2] test backtrack --- src/backend/access/nbtree/nbtree.c | 4 + src/test/modules/test_misc/meson.build | 1 + .../modules/test_misc/t/008_vacuum_btree.pl | 96 +++++++++++++++++++ 3 files changed, 101 insertions(+) create mode 100644 src/test/modules/test_misc/t/008_vacuum_btree.pl diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 353600414a2..be29c4e70e2 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -32,6 +32,7 @@ #include "storage/lmgr.h" #include "utils/fmgrprotos.h" #include "utils/index_selfuncs.h" +#include "utils/injection_point.h" #include "utils/memutils.h" @@ -1178,6 +1179,7 @@ backtrack: attempt_pagedel = false; backtrack_to = P_NONE; + INJECTION_POINT("nbtree-vacuum-page"); _bt_lockbuf(rel, buf, BT_READ); page = BufferGetPage(buf); opaque = NULL; @@ -1468,6 +1470,8 @@ backtrack: /* check for vacuum delay while not holding any buffer lock */ vacuum_delay_point(false); + INJECTION_POINT("nbtree-vacuum-page-backtrack"); + /* * We can't use _bt_getbuf() here because it always applies * _bt_checkpage(), which will barf on an all-zero page. 
We want to diff --git a/src/test/modules/test_misc/meson.build b/src/test/modules/test_misc/meson.build index 9c50de7efb0..ed0822f19ee 100644 --- a/src/test/modules/test_misc/meson.build +++ b/src/test/modules/test_misc/meson.build @@ -16,6 +16,7 @@ tests += { 't/005_timeouts.pl', 't/006_signal_autovacuum.pl', 't/007_catcache_inval.pl', + 't/008_vacuum_btree.pl', ], }, } diff --git a/src/test/modules/test_misc/t/008_vacuum_btree.pl b/src/test/modules/test_misc/t/008_vacuum_btree.pl new file mode 100644 index 00000000000..d393e1809ce --- /dev/null +++ b/src/test/modules/test_misc/t/008_vacuum_btree.pl @@ -0,0 +1,96 @@ +# Copyright (c) 2024, PostgreSQL Global Development Group + +# Test btree vacuum + +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use Test::More; + +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +my $node = PostgreSQL::Test::Cluster->new('node'); +$node->init; + +# Turn autovacuum off for the cluster. This could be done at the table level +# instead. However, since this file exercises vacuum, turn off autovacuum +# globally. This also allows use of non-local injection points in vacuum code. +$node->append_conf('postgresql.conf', 'autovacuum = off'); +$node->start; + +# Check if extension injection_points is available +if (!$node->check_extension('injection_points')) +{ + plan skip_all => 'Extension injection_points not installed'; +} + +$node->safe_psql('postgres', 'CREATE EXTENSION injection_points;'); + +my $psql_session = $node->background_psql('postgres'); + +# Create a table with an index filled with dead index entries +$node->safe_psql('postgres', q[ + create table test_backtrack (col1 int); + insert into test_backtrack select generate_series(1,800); + create index on test_backtrack(col1); + delete from test_backtrack; + ] +); + +# Attach to two injection points. The first one will allow us to stop between +# vacuuming each index page. 
The second is our validation that we did backtrack. +$psql_session->query_safe( + qq[ + SELECT injection_points_set_local(); + SELECT injection_points_attach('nbtree-vacuum-page', 'wait'); + SELECT injection_points_attach('nbtree-vacuum-page-backtrack', 'wait'); +]); + +# Start a vacuum of the table and index. +$psql_session->query_until( + qr/starting_bg_psql/, + q(\echo starting_bg_psql + vacuum (index_cleanup on, parallel 0) test_backtrack; + )); + +# The index vacuum should be waiting on our first injection point and is yet to +# process any pages. +$node->wait_for_event('client backend', 'nbtree-vacuum-page'); + +# Wake up vacuum so it can process the first index leaf page. +$node->safe_psql('postgres', "SELECT injection_points_wakeup('nbtree-vacuum-page');"); + +# The first index leaf page is now vacuumed and vacuum should be waiting again +# on the first injection point. +$node->wait_for_event('client backend', 'nbtree-vacuum-page'); + +# Insert data while vacuum is waiting to process the next leaf page. The +# inserted data will force a page split in which some tuples from unprocessed +# leaf pages will be moved to the first already vacuumed leaf page. +$node->safe_psql('postgres', + "insert into test_backtrack select generate_series(1,300);"); + +# Now we want the vacuum to continue. We don't want to wait on our first break +# point again. +# We need to make sure we are waiting before detaching and issuing a wakeup. +# Otherwise there could be a race and the backend may not get woken up. +$node->wait_for_event('client backend', 'nbtree-vacuum-page'); +$node->safe_psql('postgres', "SELECT injection_points_detach('nbtree-vacuum-page');"); +$node->safe_psql('postgres', "SELECT injection_points_wakeup('nbtree-vacuum-page');"); + +# Wait on our second break point. Vacuum should have been forced to backtrack +# and vacuum the first leaf page again to ensure it removed all dead index +# entries. 
+$node->wait_for_event('client backend', 'nbtree-vacuum-page-backtrack'); + +# Once vacuum reaches our second break point, we're done. Detach the injection +# point and wake up the waiting backend. +$node->safe_psql('postgres', "SELECT injection_points_detach('nbtree-vacuum-page-backtrack');"); +$node->safe_psql('postgres', "SELECT injection_points_wakeup('nbtree-vacuum-page-backtrack');"); + +ok($psql_session->quit); + +done_testing(); -- 2.34.1
From e558d78483d36ee510f13389029663e6bace9a27 Mon Sep 17 00:00:00 2001 From: Melanie Plageman <melanieplage...@gmail.com> Date: Thu, 20 Mar 2025 16:09:11 -0400 Subject: [PATCH v10 1/2] Use streaming read I/O in btree vacuuming Btree vacuum processes all index pages in physical order. Now it uses the read stream API to get the next buffer instead of explicitly invoking ReadBuffer(). It is possible for concurrent indexes to cause page splits during index vacuuming. This can lead to index entries that have yet to be vacuumed being moved to pages that have already been vacuumed. Btree vacuum code handles this by backtracking to reprocess those pages. So, while normal, sequentially encountered pages are now read through the read stream API, backtracked pages are still read with explicit ReadBuffer() calls. Author: Andrey Borodin <x4...@yandex-team.ru> Reviewed-by: Melanie Plageman <melanieplage...@gmail.com> Reviewed-by: Junwang Zhao <zhjw...@gmail.com> Reviewed-by: Kirill Reshke <reshkekir...@gmail.com> Discussion: https://postgr.es/m/flat/CAAKRu_bW1UOyup%3DjdFw%2BkOF9bCaAm%3D9UpiyZtbPMn8n_vnP%2Big%40mail.gmail.com#3b3a84132fc683b3ee5b40bc4c2ea2a5 --- src/backend/access/nbtree/nbtree.c | 91 ++++++++++++++++++++++-------- 1 file changed, 66 insertions(+), 25 deletions(-) diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index c0a8833e068..353600414a2 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -86,7 +86,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc; static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid); -static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno); +static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf); static BTVacuumPosting btreevacuumposting(BTVacState *vstate, IndexTuple posting, OffsetNumber updatedoffset, @@ -991,8 +991,9 @@ 
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Relation rel = info->index; BTVacState vstate; BlockNumber num_pages; - BlockNumber scanblkno; bool needLock; + BlockRangeReadStreamPrivate p; + ReadStream *stream = NULL; /* * Reset fields that track information about the entire index now. This @@ -1061,9 +1062,18 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, */ needLock = !RELATION_IS_LOCAL(rel); - scanblkno = BTREE_METAPAGE + 1; + p.current_blocknum = BTREE_METAPAGE + 1; + stream = read_stream_begin_relation(READ_STREAM_FULL, + info->strategy, + rel, + MAIN_FORKNUM, + block_range_read_stream_cb, + &p, + 0); for (;;) { + Buffer buf; + /* Get the current relation length */ if (needLock) LockRelationForExtension(rel, ExclusiveLock); @@ -1076,18 +1086,44 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, num_pages); /* Quit if we've scanned the whole relation */ - if (scanblkno >= num_pages) + if (p.current_blocknum >= num_pages) break; - /* Iterate over pages, then loop back to recheck length */ - for (; scanblkno < num_pages; scanblkno++) + + + p.last_exclusive = num_pages; + + /* Iterate over pages, then loop back to recheck relation length */ + while (true) { - btvacuumpage(&vstate, scanblkno); + BlockNumber current_block; + + /* call vacuum_delay_point while not holding any buffer lock */ + vacuum_delay_point(false); + + buf = read_stream_next_buffer(stream, NULL); + + if (!BufferIsValid(buf)) + break; + + current_block = btvacuumpage(&vstate, buf); + if (info->report_progress) pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, - scanblkno); + current_block); } + + Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer); + + /* + * We have to reset the read stream to use it again. After returning + * InvalidBuffer, the read stream API won't invoke our callback again + * until the stream has been reset. 
+ */ + read_stream_reset(stream); } + read_stream_end(stream); + /* Set statistics num_pages field to final size of index */ stats->num_pages = num_pages; @@ -1111,14 +1147,16 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, * btvacuumpage --- VACUUM one page * * This processes a single page for btvacuumscan(). In some cases we must - * backtrack to re-examine and VACUUM pages that were the scanblkno during + * backtrack to re-examine and VACUUM pages that were on buf's page during * a previous call here. This is how we handle page splits (that happened * after our cycleid was acquired) whose right half page happened to reuse * a block that we might have processed at some point before it was * recycled (i.e. before the page split). + * + * Returns BlockNumber of a scanned page (not backtracked). */ -static void -btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) +static BlockNumber +btvacuumpage(BTVacState *vstate, Buffer buf) { IndexVacuumInfo *info = vstate->info; IndexBulkDeleteResult *stats = vstate->stats; @@ -1129,7 +1167,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) bool attempt_pagedel; BlockNumber blkno, backtrack_to; - Buffer buf; + BlockNumber scanblkno = BufferGetBlockNumber(buf); Page page; BTPageOpaque opaque; @@ -1140,17 +1178,6 @@ backtrack: attempt_pagedel = false; backtrack_to = P_NONE; - /* call vacuum_delay_point while not holding any buffer lock */ - vacuum_delay_point(false); - - /* - * We can't use _bt_getbuf() here because it always applies - * _bt_checkpage(), which will barf on an all-zero page. We want to - * recycle all-zero pages, not fail. Also, we want to use a nondefault - * buffer access strategy. 
- */ - buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, - info->strategy); _bt_lockbuf(rel, buf, BT_READ); page = BufferGetPage(buf); opaque = NULL; @@ -1186,7 +1213,7 @@ backtrack: errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"", blkno, scanblkno, RelationGetRelationName(rel)))); _bt_relbuf(rel, buf); - return; + return scanblkno; } /* @@ -1206,7 +1233,7 @@ backtrack: { /* Done with current scanblkno (and all lower split pages) */ _bt_relbuf(rel, buf); - return; + return scanblkno; } } @@ -1437,8 +1464,22 @@ backtrack: if (backtrack_to != P_NONE) { blkno = backtrack_to; + + /* check for vacuum delay while not holding any buffer lock */ + vacuum_delay_point(false); + + /* + * We can't use _bt_getbuf() here because it always applies + * _bt_checkpage(), which will barf on an all-zero page. We want to + * recycle all-zero pages, not fail. Also, we want to use a + * nondefault buffer access strategy. + */ + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, + info->strategy); goto backtrack; } + + return scanblkno; } /* -- 2.34.1