On Mon, May 14, 2018 at 01:14:22PM +0530, Pavan Deolasee wrote:
> Looks like I didn't understand Alvaro's comment when he mentioned it to me
> off-list. But I now see what Michael and Alvaro mean and that indeed seems
> like a problem. I was thinking that the test for (ControlFile->state ==
> DB_IN_ARCHIVE_RECOVERY) will ensure that minRecoveryPoint can't be updated
> after the standby is promoted. While that's true for a DB_IN_PRODUCTION,  the
> RestartPoint may finish after we have written end-of-recovery record, but
> before we're in production and thus the minRecoveryPoint may again be set.

Yeah, this has been something I considered as well first, but I was not
confident enough that setting up minRecoveryPoint to InvalidXLogRecPtr
was actually a safe thing for timeline switches.

So I have spent a good portion of today testing and playing with it to
be confident enough that this was right, and I have finished with the
attached.  The patch adds a new flag to XLogCtl which marks if the
control file has been updated after the end-of-recovery record has been
written, so as minRecoveryPoint does not get updated because of a
restart point running in parallel.

I have also reworked the test case you sent, removing the manuals sleeps
and replacing them with correct wait points.  There is also no point to
wait after promotion as pg_ctl promote implies a wait.  Another
important thing is that you need to use wal_log_hints = off to see a 
crash, which is something that allows_streaming actually enables.

Comments are welcome.
--
Michael
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index adbd6a2126..230409ced6 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -701,6 +701,15 @@ typedef struct XLogCtlData
 	 */
 	XLogRecPtr	lastFpwDisableRecPtr;
 
+	/*
+	 * Track if end-of-recovery record has been written and that the
+	 * control file has been updated, so as the minimum recovery LSN
+	 * does not get updated anymore.  This way, WAL is replayed up to
+	 * the end if a crash happens before the first post-end-recovery
+	 * checkpoint.
+	 */
+	bool		endOfRecoveryDone;
+
 	slock_t		info_lck;		/* locks shared variables shown above */
 } XLogCtlData;
 
@@ -2707,12 +2716,25 @@ XLogGetReplicationSlotMinimumLSN(void)
 static void
 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
 {
+	bool	endOfRecoveryDone;
+
 	/* Quick check using our local copy of the variable */
 	if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
 		return;
 
 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 
+	/*
+	 * If the end of recovery record has already been written and that
+	 * the control file has been accordingly updated, then minRecoveryPoint
+	 * should not be updated anymore.
+	 */
+	SpinLockAcquire(&XLogCtl->info_lck);
+	endOfRecoveryDone = XLogCtl->endOfRecoveryDone;
+	SpinLockRelease(&XLogCtl->info_lck);
+	if (endOfRecoveryDone)
+		return;
+
 	/* update local copy */
 	minRecoveryPoint = ControlFile->minRecoveryPoint;
 	minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
@@ -7032,6 +7054,7 @@ StartupXLOG(void)
 		XLogCtl->recoveryLastXTime = 0;
 		XLogCtl->currentChunkStartTime = 0;
 		XLogCtl->recoveryPause = false;
+		XLogCtl->endOfRecoveryDone = false;
 		SpinLockRelease(&XLogCtl->info_lck);
 
 		/* Also ensure XLogReceiptTime has a sane value */
@@ -9040,10 +9063,21 @@ CreateEndOfRecoveryRecord(void)
 	/*
 	 * Update the control file so that crash recovery can follow the timeline
 	 * changes to this point.
+	 *
+	 * Set minRecoveryPoint to InvalidXLogRecPtr so that a crash will force
+	 * redo recovery to the end of WAL. Otherwise a crash immediately after
+	 * promotion may lead to recovery to an inconsistent point and in the worst
+	 * case, recovery failing because of invalid page references.
+	 *
+	 * endOfRecoveryDone is updated to reflect the fact that minRecoveryPoint
+	 * cannot be updated anymore from this point.
 	 */
 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+	SpinLockAcquire(&XLogCtl->info_lck);
+	XLogCtl->endOfRecoveryDone = true;
+	SpinLockRelease(&XLogCtl->info_lck);
 	ControlFile->time = (pg_time_t) time(NULL);
-	ControlFile->minRecoveryPoint = recptr;
+	ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
 	ControlFile->minRecoveryPointTLI = ThisTimeLineID;
 	UpdateControlFile();
 	LWLockRelease(ControlFileLock);
@@ -9137,6 +9171,7 @@ CreateRestartPoint(int flags)
 	CheckPoint	lastCheckPoint;
 	XLogRecPtr	PriorRedoPtr;
 	TimestampTz xtime;
+	bool		endOfRecoveryDone;
 
 	/*
 	 * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
@@ -9244,6 +9279,9 @@ CreateRestartPoint(int flags)
 	 * we get here after the end-of-recovery checkpoint.
 	 */
 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+	SpinLockAcquire(&XLogCtl->info_lck);
+	endOfRecoveryDone = XLogCtl->endOfRecoveryDone;
+	SpinLockRelease(&XLogCtl->info_lck);
 	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
 		ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
 	{
@@ -9261,8 +9299,12 @@ CreateRestartPoint(int flags)
 		 * at a minimum. Note that for an ordinary restart of recovery there's
 		 * no value in having the minimum recovery point any earlier than this
 		 * anyway, because redo will begin just after the checkpoint record.
+		 * It could be possible that minRecoveryPoint has already been updated
+		 * when generating the end-of-recovery record, in which case we want
+		 * WAL replay to happen up to the end.
 		 */
-		if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
+		if (!endOfRecoveryDone &&
+			ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
 		{
 			ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
 			ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
diff --git a/src/test/recovery/t/015_promotion.pl b/src/test/recovery/t/015_promotion.pl
new file mode 100644
index 0000000000..2f932d5acb
--- /dev/null
+++ b/src/test/recovery/t/015_promotion.pl
@@ -0,0 +1,83 @@
+# Test for promotion handling with WAL records generated post-promotion
+# before the first checkpoint is generated.  This test case checks for
+# invalid page references at replay based on the minimum consistent
+# recovery point defined.
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More tests => 1;
+
+# Initialize primary node
+my $alpha = get_new_node('alpha');
+$alpha->init(allows_streaming => 1);
+# Setting wal_log_hints to off is important to get invalid page
+# references.
+$alpha->append_conf("postgresql.conf", <<EOF);
+wal_log_hints = off
+EOF
+
+# Start the primary
+$alpha->start;
+
+# setup/start a standby
+$alpha->backup('bkp');
+my $bravo = get_new_node('bravo');
+$bravo->init_from_backup($alpha, 'bkp', has_streaming => 1);
+$bravo->append_conf('postgresql.conf', <<EOF);
+checkpoint_timeout=1h
+checkpoint_completion_target=0.9
+EOF
+$bravo->start;
+
+# Dummy table for the upcoming tests.
+$alpha->safe_psql('postgres', 'create table test1 (a int)');
+$alpha->safe_psql('postgres', 'insert into test1 select generate_series(1, 10000)');
+
+# take a checkpoint
+$alpha->safe_psql('postgres', 'checkpoint');
+
+# The following vacuum will set visibility map bits and create
+# problematic WAL records.
+$alpha->safe_psql('postgres', 'vacuum verbose test1');
+# Wait for last record to have been replayed on the standby.
+$alpha->wait_for_catchup($bravo, 'replay',
+						 $alpha->lsn('insert'));
+
+# Now force a checkpoint on the standby. This seems unnecessary but for "some"
+# reason, the previous checkpoint on the primary does not reflect on the standby
+# and without an explicit checkpoint, it may start redo recovery from a much
+# older point, which includes even create table and initial page additions.
+$bravo->safe_psql('postgres', 'checkpoint');
+
+# Now just use a dummy table and run some operations to move minRecoveryPoint
+# beyond the previous vacuum.
+$alpha->safe_psql('postgres', 'create table test2 (a int, b text)');
+$alpha->safe_psql('postgres', 'insert into test2 select generate_series(1,10000), md5(random()::text)');
+$alpha->safe_psql('postgres', 'truncate test2');
+
+# Do the promotion, which reinitializes minRecoveryPoint in the control
+# file so as WAL is replayed up to the end.
+$bravo->promote;
+
+# Truncate the table on the promoted standby, vacuum and extend it
+# again to create new page references.  The first post-recovery checkpoint
+# has not happened yet.
+$bravo->safe_psql('postgres', 'truncate test1');
+$bravo->safe_psql('postgres', 'vacuum verbose test1');
+$bravo->safe_psql('postgres', 'insert into test1 select generate_series(1,1000)');
+
+# Now crash-stop the promoted standby and restart.  This makes sure that
+# replay does not see invalid page references because of an invalid
+# minimum consistent recovery point.
+$bravo->stop('immediate');
+$bravo->start;
+
+# Check state of the table after full crash recovery.  All its data should
+# be here.
+my $psql_out;
+$bravo->psql(
+	'postgres',
+	"SELECT count(*) FROM test1",
+	stdout => \$psql_out);
+is($psql_out, '1000', "Check that table state is correct");

Attachment: signature.asc
Description: PGP signature

Reply via email to