Suppose we start with this nbtree (subset of a diagram from verify_nbtree.c):

 *               1
 *           /       \
 *        2     <->     3

We're deleting 2, the leftmost leaf under a leftmost internal page.  After the
MARK_PAGE_HALFDEAD record, the first downlink from 1 will lead to 3, which
still has a btpo_prev pointing to 2.  bt_index_parent_check() complains here:

                /* The first page we visit at the level should be leftmost */
                if (first && !BlockNumberIsValid(state->prevrightlink) && 
!P_LEFTMOST(opaque))
                        ereport(ERROR,
                                        (errcode(ERRCODE_INDEX_CORRUPTED),
                                         errmsg("the first child of leftmost 
target page is not leftmost of its level in index \"%s\"",
                                                        
RelationGetRelationName(state->rel)),
                                         errdetail_internal("Target block=%u 
child block=%u target page lsn=%X/%X.",
                                                                                
state->targetblock, blkno,
                                                                                
LSN_FORMAT_ARGS(state->targetlsn))));

One can encounter this if recovery ends between a MARK_PAGE_HALFDEAD record
and its corresponding UNLINK_PAGE record.  See the attached test case.  The
index is actually fine in such a state, right?  I lean toward fixing this by
having amcheck scan left; if left links reach only half-dead or deleted pages,
that's as good as the present child block being P_LEFTMOST.  There's a
different error from bt_index_check(), and I've not yet studied how to fix
that:

  ERROR:  left link/right link pair in index "not_leftmost_pk" not in agreement
  DETAIL:  Block=0 left block=0 left link from block=4294967295.

Alternatively, one could view this as a need for the user to VACUUM between
recovery and amcheck.  The documentation could direct users to "VACUUM
(DISABLE_PAGE_SKIPPING off, INDEX_CLEANUP on, TRUNCATE off)" if not done since
last recovery.  Does anyone prefer that or some other alternative?

For some other amcheck expectations, the comments suggest reliance on the
bt_index_parent_check() ShareLock.  I haven't tried to make test cases for
them, but perhaps recovery can trick them the same way.  Examples:

  errmsg("downlink or sibling link points to deleted block in index \"%s\"",
  errmsg("block %u is not leftmost in index \"%s\"",
  errmsg("block %u is not true root in index \"%s\"",

Thanks,
nm
Author:     Noah Misch <n...@leadboat.com>
Commit:     Noah Misch <n...@leadboat.com>

    

diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile
index b82f221..9a7f4f7 100644
--- a/contrib/amcheck/Makefile
+++ b/contrib/amcheck/Makefile
@@ -13,6 +13,7 @@ PGFILEDESC = "amcheck - function for verifying relation 
integrity"
 REGRESS = check check_btree check_heap
 
 TAP_TESTS = 1
+EXTRA_INSTALL=contrib/pg_walinspect
 
 ifdef USE_PGXS
 PG_CONFIG = pg_config
diff --git a/contrib/amcheck/t/004_pitr.pl b/contrib/amcheck/t/004_pitr.pl
new file mode 100644
index 0000000..ec6d87e
--- /dev/null
+++ b/contrib/amcheck/t/004_pitr.pl
@@ -0,0 +1,65 @@
+
+# Copyright (c) 2021-2023, PostgreSQL Global Development Group
+
+# Test integrity of intermediate states by PITR to those states
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# origin node: generate WAL records of interest.
+my $origin = PostgreSQL::Test::Cluster->new('origin');
+$origin->init(has_archiving => 1, allows_streaming => 1);
+$origin->append_conf('postgresql.conf', 'autovacuum = off');
+$origin->start;
+$origin->backup('my_backup');
+# Create a table with each of 6 PK values spanning 1/4 of a block.  Delete the
+# first four, so one index leaf is eligible for deletion.  Make a replication
+# slot just so pg_walinspect will always have access to later WAL.
+my $setup = <<EOSQL;
+BEGIN;
+CREATE EXTENSION amcheck;
+CREATE EXTENSION pg_walinspect;
+CREATE TABLE not_leftmost (c text STORAGE PLAIN);
+INSERT INTO not_leftmost
+  SELECT repeat(n::text, database_block_size / 4)
+  FROM generate_series(1,6) t(n), pg_control_init();
+ALTER TABLE not_leftmost ADD CONSTRAINT not_leftmost_pk PRIMARY KEY (c);
+DELETE FROM not_leftmost WHERE c ~ '^[1-4]';
+SELECT pg_create_physical_replication_slot('for_walinspect', true, false);
+COMMIT;
+EOSQL
+$origin->safe_psql('postgres', $setup);
+my $before_vacuum_lsn =
+  $origin->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+# VACUUM to delete the aforementioned leaf page.  Find the LSN of that
+# UNLINK_PAGE record.
+my $exec = <<EOSQL;
+VACUUM VERBOSE not_leftmost;
+SELECT max(start_lsn)
+  FROM pg_get_wal_records_info('$before_vacuum_lsn', 'FFFFFFFF/FFFFFFFF')
+  WHERE resource_manager = 'Btree' AND record_type = 'UNLINK_PAGE';
+EOSQL
+my $unlink_lsn = $origin->safe_psql('postgres', $exec);
+$origin->stop;
+
+# replica node: amcheck at notable points in the WAL stream
+my $replica = PostgreSQL::Test::Cluster->new('replica');
+$replica->init_from_backup($origin, 'my_backup', has_restoring => 1);
+$replica->append_conf('postgresql.conf',
+       "recovery_target_lsn = '$unlink_lsn'");
+$replica->append_conf('postgresql.conf', 'recovery_target_inclusive = off');
+$replica->append_conf('postgresql.conf', 'recovery_target_action = promote');
+$replica->start;
+$replica->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';")
+  or die "Timed out while waiting for PITR promotion";
+# recovery done; run amcheck
+is( $replica->psql(
+               'postgres', "SELECT bt_index_parent_check('not_leftmost_pk', 
true)"),
+       0);
+is( $replica->psql(
+               'postgres', "SELECT bt_index_check('not_leftmost_pk', true)"),
+       0);
+
+done_testing();

Reply via email to