Suppose we start with this nbtree (subset of a diagram from verify_nbtree.c):
* 1 * / \ * 2 <-> 3 We're deleting 2, the leftmost leaf under a leftmost internal page. After the MARK_PAGE_HALFDEAD record, the first downlink from 1 will lead to 3, which still has a btpo_prev pointing to 2. bt_index_parent_check() complains here: /* The first page we visit at the level should be leftmost */ if (first && !BlockNumberIsValid(state->prevrightlink) && !P_LEFTMOST(opaque)) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("the first child of leftmost target page is not leftmost of its level in index \"%s\"", RelationGetRelationName(state->rel)), errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", state->targetblock, blkno, LSN_FORMAT_ARGS(state->targetlsn)))); One can encounter this if recovery ends between a MARK_PAGE_HALFDEAD record and its corresponding UNLINK_PAGE record. See the attached test case. The index is actually fine in such a state, right? I lean toward fixing this by having amcheck scan left; if left links reach only half-dead or deleted pages, that's as good as the present child block being P_LEFTMOST. There's a different error from bt_index_check(), and I've not yet studied how to fix that: ERROR: left link/right link pair in index "not_leftmost_pk" not in agreement DETAIL: Block=0 left block=0 left link from block=4294967295. Alternatively, one could view this as a need for the user to VACUUM between recovery and amcheck. The documentation could direct users to "VACUUM (DISABLE_PAGE_SKIPPING off, INDEX_CLEANUP on, TRUNCATE off)" if not done since last recovery. Does anyone prefer that or some other alternative? For some other amcheck expectations, the comments suggest reliance on the bt_index_parent_check() ShareLock. I haven't tried to make test cases for them, but perhaps recovery can trick them the same way. Examples: errmsg("downlink or sibling link points to deleted block in index \"%s\"", errmsg("block %u is not leftmost in index \"%s\"", errmsg("block %u is not true root in index \"%s\"", Thanks, nm
Author: Noah Misch <n...@leadboat.com> Commit: Noah Misch <n...@leadboat.com> diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile index b82f221..9a7f4f7 100644 --- a/contrib/amcheck/Makefile +++ b/contrib/amcheck/Makefile @@ -13,6 +13,7 @@ PGFILEDESC = "amcheck - function for verifying relation integrity" REGRESS = check check_btree check_heap TAP_TESTS = 1 +EXTRA_INSTALL=contrib/pg_walinspect ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/contrib/amcheck/t/004_pitr.pl b/contrib/amcheck/t/004_pitr.pl new file mode 100644 index 0000000..ec6d87e --- /dev/null +++ b/contrib/amcheck/t/004_pitr.pl @@ -0,0 +1,65 @@ + +# Copyright (c) 2021-2023, PostgreSQL Global Development Group + +# Test integrity of intermediate states by PITR to those states +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# origin node: generate WAL records of interest. +my $origin = PostgreSQL::Test::Cluster->new('origin'); +$origin->init(has_archiving => 1, allows_streaming => 1); +$origin->append_conf('postgresql.conf', 'autovacuum = off'); +$origin->start; +$origin->backup('my_backup'); +# Create a table with each of 6 PK values spanning 1/4 of a block. Delete the +# first four, so one index leaf is eligible for deletion. Make a replication +# slot just so pg_walinspect will always have access to later WAL. +my $setup = <<EOSQL; +BEGIN; +CREATE EXTENSION amcheck; +CREATE EXTENSION pg_walinspect; +CREATE TABLE not_leftmost (c text STORAGE PLAIN); +INSERT INTO not_leftmost + SELECT repeat(n::text, database_block_size / 4) + FROM generate_series(1,6) t(n), pg_control_init(); +ALTER TABLE not_leftmost ADD CONSTRAINT not_leftmost_pk PRIMARY KEY (c); +DELETE FROM not_leftmost WHERE c ~ '^[1-4]'; +SELECT pg_create_physical_replication_slot('for_walinspect', true, false); +COMMIT; +EOSQL +$origin->safe_psql('postgres', $setup); +my $before_vacuum_lsn = + $origin->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); +# VACUUM to delete the aforementioned leaf page. Find the LSN of that +# UNLINK_PAGE record. +my $exec = <<EOSQL; +VACUUM VERBOSE not_leftmost; +SELECT max(start_lsn) + FROM pg_get_wal_records_info('$before_vacuum_lsn', 'FFFFFFFF/FFFFFFFF') + WHERE resource_manager = 'Btree' AND record_type = 'UNLINK_PAGE'; +EOSQL +my $unlink_lsn = $origin->safe_psql('postgres', $exec); +$origin->stop; + +# replica node: amcheck at notable points in the WAL stream +my $replica = PostgreSQL::Test::Cluster->new('replica'); +$replica->init_from_backup($origin, 'my_backup', has_restoring => 1); +$replica->append_conf('postgresql.conf', + "recovery_target_lsn = '$unlink_lsn'"); +$replica->append_conf('postgresql.conf', 'recovery_target_inclusive = off'); +$replica->append_conf('postgresql.conf', 'recovery_target_action = promote'); +$replica->start; +$replica->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';") + or die "Timed out while waiting for PITR promotion"; +# recovery done; run amcheck +is( $replica->psql( + 'postgres', "SELECT bt_index_parent_check('not_leftmost_pk', true)"), + 0); +is( $replica->psql( + 'postgres', "SELECT bt_index_check('not_leftmost_pk', true)"), + 0); + +done_testing();