Hi, On Tue, Jan 09, 2024 at 01:59:08PM +0900, Michael Paquier wrote: > On Mon, Jan 08, 2024 at 08:00:00PM +0300, Alexander Lakhin wrote: > > Yes, I've added (VERBOSE) and also cut down the test to catch the failure > > faster.
Thanks Alexander! > > The difference between a successful and a failed run: > > tuples: 1 removed, 15 remain, 0 are dead but not yet removable > > [...] > > tuples: 0 removed, 16 remain, 1 are dead but not yet removable > > Yep, it's clear that the horizon is not stable. Yeap. > > > With FREEZE, 10 iterations with 20 tests in parallel succeeded for me > > (while without it, I get failures on iterations 1,2). > > > > [1] > > https://www.postgresql.org/message-id/b2a1f7d0-7629-72c0-2da7-e9c4e336b010%40gmail.com > > Alexander, does the test gain in stability once you begin using the > patch posted on [2], mentioned by Bertrand? Alexander, please find attached v3 which is more or less a rebased version of it. Regards, -- Bertrand Drouvot PostgreSQL Contributors Team RDS Open Source Databases Amazon Web Services: https://aws.amazon.com
>From 055349da9eba780caadeb3615749b3b5453a59c4 Mon Sep 17 00:00:00 2001 From: bdrouvot <[email protected]> Date: Tue, 9 Jan 2024 05:08:35 +0000 Subject: [PATCH v3] Fix 035_standby_logical_decoding.pl race condition We want to ensure that vacuum was able to remove dead rows (aka no other transactions holding back global xmin) before testing for slots invalidation on the standby. --- .../t/035_standby_logical_decoding.pl | 56 +++++++++---------- 1 file changed, 27 insertions(+), 29 deletions(-) 100.0% src/test/recovery/t/ diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl index 8bc39a5f03..9be1ff9b58 100644 --- a/src/test/recovery/t/035_standby_logical_decoding.pl +++ b/src/test/recovery/t/035_standby_logical_decoding.pl @@ -238,6 +238,22 @@ sub check_for_invalidation ) or die "Timed out waiting confl_active_logicalslot to be updated"; } +# Launch $sql and wait for a new snapshot that has a newer horizon before +# doing the vacuum with $vac_option on $to_vac. 
+sub wait_until_vacuum_can_remove +{ + my ($vac_option, $sql, $to_vac) = @_; + + my $xid = $node_primary->safe_psql('testdb', qq[$sql + select txid_current();]); + + $node_primary->poll_query_until('testdb', + "SELECT (select txid_snapshot_xmin(txid_current_snapshot()) - $xid) > 0") + or die "new snapshot does not have a newer horizon"; + + $node_primary->safe_psql('testdb', qq[VACUUM $vac_option verbose $to_vac; + INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal;]); +} ######################## # Initialize primary node ######################## @@ -248,6 +264,7 @@ $node_primary->append_conf( wal_level = 'logical' max_replication_slots = 4 max_wal_senders = 4 +autovacuum = off }); $node_primary->dump_info; $node_primary->start; @@ -468,13 +485,8 @@ reactive_slots_change_hfs_and_wait_for_xmins('behaves_ok_', 'vacuum_full_', 0, 1); # This should trigger the conflict -$node_primary->safe_psql( - 'testdb', qq[ - CREATE TABLE conflict_test(x integer, y text); - DROP TABLE conflict_test; - VACUUM full pg_class; - INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal -]); +wait_until_vacuum_can_remove('full', 'CREATE TABLE conflict_test(x integer, y text); + DROP TABLE conflict_test;', 'pg_class'); $node_primary->wait_for_replay_catchup($node_standby); @@ -550,13 +562,8 @@ reactive_slots_change_hfs_and_wait_for_xmins('vacuum_full_', 'row_removal_', 0, 1); # This should trigger the conflict -$node_primary->safe_psql( - 'testdb', qq[ - CREATE TABLE conflict_test(x integer, y text); - DROP TABLE conflict_test; - VACUUM pg_class; - INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal -]); +wait_until_vacuum_can_remove('', 'CREATE TABLE conflict_test(x integer, y text); + DROP TABLE conflict_test;', 'pg_class'); $node_primary->wait_for_replay_catchup($node_standby); @@ -588,13 +595,8 @@ reactive_slots_change_hfs_and_wait_for_xmins('row_removal_', 'shared_row_removal_', 0, 1); # Trigger the conflict -$node_primary->safe_psql( - 
'testdb', qq[ - CREATE ROLE create_trash; - DROP ROLE create_trash; - VACUUM pg_authid; - INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal -]); +wait_until_vacuum_can_remove('', 'CREATE ROLE create_trash; + DROP ROLE create_trash;', 'pg_authid'); $node_primary->wait_for_replay_catchup($node_standby); @@ -625,14 +627,10 @@ reactive_slots_change_hfs_and_wait_for_xmins('shared_row_removal_', 'no_conflict_', 0, 1); # This should not trigger a conflict -$node_primary->safe_psql( - 'testdb', qq[ - CREATE TABLE conflict_test(x integer, y text); - INSERT INTO conflict_test(x,y) SELECT s, s::text FROM generate_series(1,4) s; - UPDATE conflict_test set x=1, y=1; - VACUUM conflict_test; - INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal -]); +wait_until_vacuum_can_remove('', 'CREATE TABLE conflict_test(x integer, y text); + INSERT INTO conflict_test(x,y) SELECT s, s::text FROM generate_series(1,4) s; + UPDATE conflict_test set x=1, y=1;', 'conflict_test'); + $node_primary->wait_for_replay_catchup($node_standby); # message should not be issued -- 2.34.1
