On Wed, Apr 02, 2025 at 12:13:52PM +0000, Hayato Kuroda (Fujitsu) wrote:
> Dear Amit, Bertrand,
> 
> > The other idea to simplify the changes for backbranches:
> > sub reactive_slots_change_hfs_and_wait_for_xmins
> > {
> > ...
> > +  my ($slot_prefix, $hsf, $invalidated, $needs_active_slot) = @_;
> > 
> >   # create the logical slots
> > -  create_logical_slots($node_standby, $slot_prefix);
> > +  create_logical_slots($node_standby, $slot_prefix, $needs_active_slot);
> > 
> > ...
> > +  if ($needs_active_slot)
> > +  {
> > +    $handle =
> > +      make_slot_active($node_standby, $slot_prefix, 1, \$stdout, \$stderr);
> > +  }
> > 
> > What if this function doesn't take input parameter needs_active_slot
> > and rather removes the call to make_slot_active? We will call
> > make_slot_active only at the required places. This should make the
> > changes much less because after that, we don't need to make changes
> > related to drop and create. Sure, in some cases, we will test two
> > inactive slots instead of one, but I guess that would be the price to
> > keep the tests simple and more like HEAD.
> 
> Actually, I could not decide which one is better, so let me share both drafts.

Thanks!

> V5-PG17-1 uses the previous approach, and v5-PG17-2 uses the newly proposed one.
> Bertrand, which one do you like?

I do prefer v5-PG17-2 as it is "closer" to HEAD. That said, I think that we
should keep the slots active and only skip the checks on them (if they end up
invalidated that's fine, and if they don't that's fine too).
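
Concretely, that would mean something along these lines in
check_slots_conflict_reason() (just a sketch to show the idea, reusing the
script's existing $node_standby and $res; the attached diff has the actual
change):

sub check_slots_conflict_reason
{
	my ($slot_prefix, $reason, $checks_active_slot) = @_;

	# The inactive slot is not subject to the xl_running_xacts race,
	# so it can always be checked.
	my $inactive_slot = $slot_prefix . 'inactiveslot';
	$res = $node_standby->safe_psql('postgres',
		qq(select invalidation_reason from pg_replication_slots where slot_name = '$inactive_slot' and conflicting;));
	is($res, "$reason", "$inactive_slot reason for conflict is $reason");

	# Only look at the active slot when the caller asks for it.
	return unless $checks_active_slot;

	my $active_slot = $slot_prefix . 'activeslot';
	$res = $node_standby->safe_psql('postgres',
		qq(select invalidation_reason from pg_replication_slots where slot_name = '$active_slot' and conflicting;));
	is($res, "$reason", "$active_slot reason for conflict is $reason");
}

The slot activation itself would then stay unconditional in
reactive_slots_change_hfs_and_wait_for_xmins().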

Also I think that we should change this part:

"
 # Verify that invalidated logical slots do not lead to retaining WAL.
@@ -602,7 +610,7 @@ check_slots_conflict_reason('vacuum_full_', 'rows_removed');
 my $restart_lsn = $node_standby->safe_psql(
        'postgres',
        "SELECT restart_lsn FROM pg_replication_slots
-               WHERE slot_name = 'vacuum_full_activeslot' AND conflicting;"
+               WHERE slot_name = 'vacuum_full_inactiveslot' AND conflicting;"
 );

" to be on the safe side of thing.

What do you think of the attached (to apply on top of v5-PG17-2)?

Regards,

-- 
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl
index 2b7ae4b74e9..d1be179fed6 100644
--- a/src/test/recovery/t/035_standby_logical_decoding.pl
+++ b/src/test/recovery/t/035_standby_logical_decoding.pl
@@ -171,24 +171,28 @@ sub change_hot_standby_feedback_and_wait_for_xmins
 # Check reason for conflict in pg_replication_slots.
 sub check_slots_conflict_reason
 {
-       my ($slot_prefix, $reason) = @_;
+       my ($slot_prefix, $reason, $checks_active_slot) = @_;
 
-       my $active_slot = $slot_prefix . 'activeslot';
        my $inactive_slot = $slot_prefix . 'inactiveslot';
-
-       $res = $node_standby->safe_psql(
-               'postgres', qq(
-                        select invalidation_reason from pg_replication_slots where slot_name = '$active_slot' and conflicting;)
-       );
-
-       is($res, "$reason", "$active_slot reason for conflict is $reason");
-
        $res = $node_standby->safe_psql(
                'postgres', qq(
                         select invalidation_reason from pg_replication_slots where slot_name = '$inactive_slot' and conflicting;)
        );
 
        is($res, "$reason", "$inactive_slot reason for conflict is $reason");
+
+       if ($checks_active_slot)
+       {
+               my $active_slot = $slot_prefix . 'activeslot';
+
+               $res = $node_standby->safe_psql(
+                       'postgres', qq(
+                                select invalidation_reason from pg_replication_slots where slot_name = '$active_slot' and conflicting;)
+               );
+
+               is($res, "$reason", "$active_slot reason for conflict is $reason");
+
+       }
 }
 
 # Drop the slots, re-create them, change hot_standby_feedback,
@@ -205,6 +209,9 @@ sub reactive_slots_change_hfs_and_wait_for_xmins
 
        change_hot_standby_feedback_and_wait_for_xmins($hsf, $invalidated);
 
+       $handle =
+         make_slot_active($node_standby, $slot_prefix, 1, \$stdout, \$stderr);
+
        # reset stat: easier to check for confl_active_logicalslot in pg_stat_database_conflicts
        $node_standby->psql('testdb', q[select pg_stat_reset();]);
 }
@@ -252,8 +259,8 @@ sub check_for_invalidation
 # seeing a xl_running_xacts that would advance an active replication slot's
 # catalog_xmin.  Advancing the active replication slot's catalog_xmin
 # would break some tests that expect the active slot to conflict with
-# the catalog xmin horizon. We ensure that replication slots are not activated
-# for tests that might produce this race condition though.
+# the catalog xmin horizon. We therefore do not check the active slot for
+# invalidation in such cases.
 sub wait_until_vacuum_can_remove
 {
        my ($vac_option, $sql, $to_vac) = @_;
@@ -553,6 +560,10 @@ reactive_slots_change_hfs_and_wait_for_xmins('behaves_ok_', 'vacuum_full_',
 $node_primary->safe_psql('testdb',
        qq[INSERT INTO decoding_test(x,y) SELECT 100,'100';]);
 
+$node_standby->poll_query_until('testdb',
+       qq[SELECT total_txns > 0 FROM pg_stat_replication_slots WHERE slot_name = 'vacuum_full_activeslot']
+) or die "replication slot stats of vacuum_full_activeslot not updated";
+
 # This should trigger the conflict
 wait_until_vacuum_can_remove(
        'full', 'CREATE TABLE conflict_test(x integer, y text);
@@ -564,16 +575,19 @@ $node_primary->wait_for_replay_catchup($node_standby);
 check_for_invalidation('vacuum_full_', 1, 'with vacuum FULL on pg_class', 0);
 
 # Verify reason for conflict is 'rows_removed' in pg_replication_slots
-check_slots_conflict_reason('vacuum_full_', 'rows_removed');
+check_slots_conflict_reason('vacuum_full_', 'rows_removed', 0);
+
+# Ensure that replication slot stats are not removed after invalidation.
+is( $node_standby->safe_psql(
+               'testdb',
+               qq[SELECT total_txns > 0 FROM pg_stat_replication_slots WHERE slot_name = 'vacuum_full_activeslot']
+       ),
+       't',
+       'replication slot stats not removed after invalidation');
 
 $handle =
   make_slot_active($node_standby, 'vacuum_full_', 0, \$stdout, \$stderr);
 
-# We are not able to read from the slot as it has been invalidated
-check_pg_recvlogical_stderr($handle,
-       "can no longer get changes from replication slot \"vacuum_full_activeslot\""
-);
-
 # Turn hot_standby_feedback back on
 change_hot_standby_feedback_and_wait_for_xmins(1, 1);
 
@@ -583,7 +597,7 @@ change_hot_standby_feedback_and_wait_for_xmins(1, 1);
 $node_standby->restart;
 
 # Verify reason for conflict is retained across a restart.
-check_slots_conflict_reason('vacuum_full_', 'rows_removed');
+check_slots_conflict_reason('vacuum_full_', 'rows_removed', 0);
 
 ##################################################
 # Verify that invalidated logical slots do not lead to retaining WAL.
@@ -593,7 +607,7 @@ check_slots_conflict_reason('vacuum_full_', 'rows_removed');
 my $restart_lsn = $node_standby->safe_psql(
        'postgres',
        "SELECT restart_lsn FROM pg_replication_slots
-               WHERE slot_name = 'vacuum_full_activeslot' AND conflicting;"
+               WHERE slot_name = 'vacuum_full_inactiveslot' AND conflicting;"
 );
 
 chomp($restart_lsn);
@@ -645,16 +659,11 @@ $node_primary->wait_for_replay_catchup($node_standby);
 check_for_invalidation('row_removal_', $logstart, 'with vacuum on pg_class', 0);
 
 # Verify reason for conflict is 'rows_removed' in pg_replication_slots
-check_slots_conflict_reason('row_removal_', 'rows_removed');
+check_slots_conflict_reason('row_removal_', 'rows_removed', 0);
 
 $handle =
   make_slot_active($node_standby, 'row_removal_', 0, \$stdout, \$stderr);
 
-# We are not able to read from the slot as it has been invalidated
-check_pg_recvlogical_stderr($handle,
-       "can no longer get changes from replication slot \"row_removal_activeslot\""
-);
-
 ##################################################
 # Recovery conflict: Same as Scenario 2 but on a shared catalog table
 # Scenario 3: conflict due to row removal with hot_standby_feedback off.
@@ -681,16 +690,11 @@ check_for_invalidation('shared_row_removal_', $logstart,
        'with vacuum on pg_authid', 0);
 
 # Verify reason for conflict is 'rows_removed' in pg_replication_slots
-check_slots_conflict_reason('shared_row_removal_', 'rows_removed');
+check_slots_conflict_reason('shared_row_removal_', 'rows_removed', 0);
 
 $handle = make_slot_active($node_standby, 'shared_row_removal_', 0, \$stdout,
        \$stderr);
 
-# We are not able to read from the slot as it has been invalidated
-check_pg_recvlogical_stderr($handle,
-       "can no longer get changes from replication slot \"shared_row_removal_activeslot\""
-);
-
 ##################################################
 # Recovery conflict: Same as Scenario 2 but on a non catalog table
 # Scenario 4: No conflict expected.
@@ -702,11 +706,6 @@ $logstart = -s $node_standby->logfile;
 reactive_slots_change_hfs_and_wait_for_xmins('shared_row_removal_',
        'no_conflict_', 0, 1);
 
-# This scenario won't produce the race condition by a xl_running_xacts, so
-# activate the slot. See comments atop wait_until_vacuum_can_remove().
-make_slot_active($node_standby, 'no_conflict_', 1, \$stdout,
-       \$stderr);
-
 # This should not trigger a conflict
 wait_until_vacuum_can_remove(
        '', 'CREATE TABLE conflict_test(x integer, y text);
@@ -778,14 +777,10 @@ $node_primary->wait_for_replay_catchup($node_standby);
 check_for_invalidation('pruning_', $logstart, 'with on-access pruning', 0);
 
 # Verify reason for conflict is 'rows_removed' in pg_replication_slots
-check_slots_conflict_reason('pruning_', 'rows_removed');
+check_slots_conflict_reason('pruning_', 'rows_removed', 0);
 
 $handle = make_slot_active($node_standby, 'pruning_', 0, \$stdout, \$stderr);
 
-# We are not able to read from the slot as it has been invalidated
-check_pg_recvlogical_stderr($handle,
-       "can no longer get changes from replication slot \"pruning_activeslot\"");
-
 # Turn hot_standby_feedback back on
 change_hot_standby_feedback_and_wait_for_xmins(1, 1);
 
@@ -822,7 +817,7 @@ $node_primary->wait_for_replay_catchup($node_standby);
 check_for_invalidation('wal_level_', $logstart, 'due to wal_level', 1);
 
 # Verify reason for conflict is 'wal_level_insufficient' in pg_replication_slots
-check_slots_conflict_reason('wal_level_', 'wal_level_insufficient');
+check_slots_conflict_reason('wal_level_', 'wal_level_insufficient', 1);
 
 $handle =
   make_slot_active($node_standby, 'wal_level_', 0, \$stdout, \$stderr);
