Re: Race condition in recovery?

Dilip Kumar Wed, 26 May 2021 23:26:42 -0700

On Wed, May 26, 2021 at 9:40 PM Robert Haas <robertmh...@gmail.com> wrote:


> ...which has a clear race condition.
> src/test/recovery/t/023_pitr_prepared_xact.pl has logic to wait for a
> WAL file to be archived, so maybe we can steal that logic and use it
> here.

Yeah, done.  I think we can use the exact same logic for history files
as well: if a WAL file has been archived, then the history file must
have been archived too, because a) the history file is created during
promotion, i.e. before any WAL file on the new timeline is ready for
archiving, and b) the archiver archives history files before any WAL files.

src/test/recovery/t/025_stuck_on_old_timeline.pl

> I suggest we rename the test to something a bit more descriptive. Like
> instead of 025_timeline_issue.pl, perhaps
> 025_stuck_on_old_timeline.pl? Or I'm open to other suggestions, but
> "timeline issue" is a bit too vague for my taste.

Changed as suggested.

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

From 5bbea2f5a21c28729580ec2b7b8cf50fd67e7ed0 Mon Sep 17 00:00:00 2001
From: Dilip Kumar <dilipkumar@localhost.localdomain>
Date: Sun, 23 May 2021 21:27:58 +0530
Subject: [PATCH v3] Test for new standby not following promoted standby

---
 src/test/recovery/t/025_stuck_on_old_timeline.pl | 101 +++++++++++++++++++++++
 src/test/recovery/t/cp_history_files             |   8 ++
 2 files changed, 109 insertions(+)
 create mode 100644 src/test/recovery/t/025_stuck_on_old_timeline.pl
 create mode 100755 src/test/recovery/t/cp_history_files

diff --git a/src/test/recovery/t/025_stuck_on_old_timeline.pl b/src/test/recovery/t/025_stuck_on_old_timeline.pl
new file mode 100644
index 0000000..e44b774
--- /dev/null
+++ b/src/test/recovery/t/025_stuck_on_old_timeline.pl
@@ -0,0 +1,101 @@
+
+# Copyright (c) 2021, PostgreSQL Global Development Group
+
+# Test streaming replication where a standby is promoted and a new cascading
+# standby (without any WAL) is then connected to the promoted standby.  Both
+# archiving and streaming are enabled, but only the history files reach the
+# archive, not the WAL files, so the cascading standby has to fetch the
+# checkpoint record from the promoted standby through streaming.  Verify that
+# the cascading standby is able to follow the new primary (promoted standby).
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use FindBin;
+use Test::More tests => 1;
+
+# Initialize primary node
+my $node_primary = get_new_node('primary');
+
+# Set archive command using 'cp_history_files' (custom command).  The command
+# copies only history files and ignores all other WAL files.  This is needed
+# to reproduce the scenario where the history files have reached the archive
+# but the WAL files have not when a standby tries to restore from the archive,
+# so that it needs to stream the checkpoint record from its upstream node.
+$node_primary->init(allows_streaming => 1, has_archiving => 1);
+my $archivedir_primary = $node_primary->archive_dir;
+$node_primary->append_conf(
+	'postgresql.conf', qq(
+archive_command = '"$FindBin::RealBin/cp_history_files" "%p" "$archivedir_primary/%f"'
+));
+$node_primary->start;
+
+my $backup_name = 'my_backup';
+
+# Take backup from primary
+$node_primary->backup($backup_name);
+
+# Create streaming standby linking to primary
+my $node_standby = get_new_node('standby');
+$node_standby->init_from_backup($node_primary, $backup_name,
+	allows_streaming => 1, has_streaming => 1, has_archiving => 1);
+$node_standby->start;
+
+# Take backup of standby, use -Xnone so that pg_wal is empty.
+$node_standby->backup($backup_name, backup_options => ['-Xnone']);
+
+# Create cascading standby but don't start it yet.
+# It is set up for streaming here; a restore_command is added just below.
+my $node_cascade = get_new_node('cascade');
+$node_cascade->init_from_backup($node_standby, $backup_name,
+	has_streaming => 1);
+
+# Set up restore command reading from the primary's archive directory
+my $copy_command =
+  $TestLib::windows_os ? 'copy' : 'cp';
+
+$node_cascade->append_conf(
+	'postgresql.conf', qq(
+restore_command = '$copy_command "$archivedir_primary/%f" "%p"'
+));
+
+# Promote the standby; pg_promote() returns false if it gave up waiting.
+$node_standby->safe_psql('postgres', 'SELECT pg_promote()') eq 't'
+  or die "Timed out while waiting for promotion";
+
+# Find next WAL segment to be archived
+my $walfile_to_be_archived = $node_standby->safe_psql('postgres',
+	"SELECT pg_walfile_name(pg_current_wal_lsn());");
+
+# Make WAL segment eligible for archival
+$node_standby->safe_psql('postgres', 'SELECT pg_switch_wal()');
+
+# Wait until the WAL segment has been archived.
+# The history file is created on promotion and the archiver archives history
+# files before any WAL segment, so once this segment is reported as archived
+# the history file must have been archived as well.
+my $archive_wait_query =
+  "SELECT '$walfile_to_be_archived' <= last_archived_wal FROM pg_stat_archiver;";
+$node_standby->poll_query_until('postgres', $archive_wait_query)
+  or die "Timed out while waiting for WAL segment to be archived";
+
+# Start cascade node
+$node_cascade->start;
+
+# Create some content on promoted standby and check its presence in cascade standby
+$node_standby->safe_psql('postgres', "CREATE TABLE tab_int AS SELECT 1 AS a");
+
+# Wait for the cascade standby to catch up with the new primary's insert LSN
+$node_standby->wait_for_catchup($node_cascade, 'replay',
+	$node_standby->lsn('insert'));
+
+# Check cascade standby is able to follow the new primary
+my $result =
+  $node_cascade->safe_psql('postgres', "SELECT count(*) FROM tab_int");
+note "cascade: $result";
+is($result, qq(1), 'check streamed content on cascade standby');
+
+# clean up
+$node_primary->teardown_node;
+$node_standby->teardown_node;
+$node_cascade->teardown_node;
diff --git a/src/test/recovery/t/cp_history_files b/src/test/recovery/t/cp_history_files
new file mode 100755
index 0000000..8dfe019
--- /dev/null
+++ b/src/test/recovery/t/cp_history_files
@@ -0,0 +1,8 @@
+#!/usr/bin/perl
+# Test-only archive_command: copy SOURCE to TARGET, but only history files.
+# Any other file is skipped with exit status 0, so it counts as archived.
+use File::Copy;
+die "wrong number of arguments" if @ARGV != 2;
+my ($source, $target) = @ARGV;
+exit 0 if $source !~ /history/;    # 'return' is illegal outside a subroutine
+copy($source, $target) or die "couldn't copy $source to $target: $!";
-- 
1.8.3.1

Re: Race condition in recovery?

Reply via email to