On 10/10/2016 05:25 PM, Michael Paquier wrote:
On Fri, Oct 7, 2016 at 2:59 AM, Pavan Deolasee <pavan.deola...@gmail.com> wrote:
I believe the fix is very simple. The FSM change during truncation is
critical and the buffer must be marked by MarkBufferDirty() i.e. those
changes must make to the disk. I think it's alright not to WAL log them
because XLOG_SMGR_TRUNCATE will redo() them if a crash occurs. But it must
not be lost across a checkpoint. Also, since it happens only during relation
truncation, I don't see any problem from performance perspective.

Agreed. I happen to notice that VM is similalry careful when it comes
to truncate it (visibilitymap_truncate).

visibilitymap_truncate is actually also wrong, in a different way. The truncation WAL record is written only after the VM (and FSM) are truncated. But visibilitymap_truncate() has already modified and dirtied the page. If the VM page change is flushed to disk before the WAL record, and you crash, you might have a torn VM page and a checksum failure.

Simply replacing the MarkBufferDirtyHint() call with MarkBufferDirty() in FreeSpaceMapTruncateRel would have the same issue. If you call MarkBufferDirty(), you must WAL-log the change, and also set the page's LSN to make sure the WAL record is flushed first.

I think we need something like the attached.

- Heikki

>From 7b100e951236c21656eb7bd63b4631c0c32cc573 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Mon, 17 Oct 2016 14:02:52 +0300
Subject: [PATCH 1/1] WIP: Fix "FSM corruption leading to errors"

---
 src/backend/access/heap/visibilitymap.c   |   9 ++-
 src/backend/catalog/storage.c             | 110 ++++++++++++++++++------------
 src/backend/storage/freespace/freespace.c |  14 +++-
 src/include/access/visibilitymap.h        |   2 +-
 src/include/storage/freespace.h           |   3 +-
 src/test/recovery/t/008_fsm_check.pl      |  90 ++++++++++++++++++++++++
 6 files changed, 179 insertions(+), 49 deletions(-)
 create mode 100644 src/test/recovery/t/008_fsm_check.pl

diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 3ad4a9f..377e511 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -457,9 +457,14 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
  * before they access the VM again.
  *
  * nheapblocks is the new size of the heap.
+ *
+ * The caller must have WAL-logged the truncation already (if the relation
+ * needs WAL-logging at all). 'lsn' is the LSN of the XLOG record. It is
+ * used to stamp the last remaining VM page, so that it doesn't get flushed
+ * to disk before the WAL record.
  */
 void
-visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
+visibilitymap_truncate(Relation rel, BlockNumber nheapblocks, XLogRecPtr lsn)
 {
 	BlockNumber newnblocks;
 
@@ -524,6 +529,8 @@ visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
 		map[truncByte] &= (1 << truncOffset) - 1;
 
 		MarkBufferDirty(mapBuffer);
+		if (lsn != InvalidXLogRecPtr)
+			PageSetLSN(page, lsn);
 		UnlockReleaseBuffer(mapBuffer);
 	}
 	else
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 0d8311c..5478953 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -28,6 +28,7 @@
 #include "catalog/storage.h"
 #include "catalog/storage_xlog.h"
 #include "storage/freespace.h"
+#include "storage/proc.h"
 #include "storage/smgr.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
@@ -228,6 +229,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 {
 	bool		fsm;
 	bool		vm;
+	XLogRecPtr	lsn = InvalidXLogRecPtr;
 
 	/* Open it at the smgr level if not already done */
 	RelationOpenSmgr(rel);
@@ -239,56 +241,76 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 	rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber;
 	rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;
 
-	/* Truncate the FSM first if it exists */
-	fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM);
-	if (fsm)
-		FreeSpaceMapTruncateRel(rel, nblocks);
-
-	/* Truncate the visibility map too if it exists. */
-	vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
-	if (vm)
-		visibilitymap_truncate(rel, nblocks);
-
-	/*
-	 * We WAL-log the truncation before actually truncating, which means
-	 * trouble if the truncation fails. If we then crash, the WAL replay
-	 * likely isn't going to succeed in the truncation either, and cause a
-	 * PANIC. It's tempting to put a critical section here, but that cure
-	 * would be worse than the disease. It would turn a usually harmless
-	 * failure to truncate, that might spell trouble at WAL replay, into a
-	 * certain PANIC.
-	 */
-	if (RelationNeedsWAL(rel))
+	PG_TRY();
 	{
 		/*
-		 * Make an XLOG entry reporting the file truncation.
+		 * We WAL-log the truncation before actually truncating, which means
+		 * trouble if the truncation fails. If we then crash, the WAL replay
+		 * likely isn't going to succeed in the truncation either, and cause a
+		 * PANIC. It's tempting to put a critical section here, but that cure
+		 * would be worse than the disease. It would turn a usually harmless
+		 * failure to truncate, that might spell trouble at WAL replay, into a
+		 * certain PANIC.
 		 */
-		XLogRecPtr	lsn;
-		xl_smgr_truncate xlrec;
-
-		xlrec.blkno = nblocks;
-		xlrec.rnode = rel->rd_node;
-		xlrec.flags = SMGR_TRUNCATE_ALL;
+		if (RelationNeedsWAL(rel))
+		{
+			/*
+			 * Make an XLOG entry reporting the file truncation.
+			 */
+			xl_smgr_truncate xlrec;
+
+			/*
+			 * Since we're not holding a buffer lock, use delayChkpt to
+			 * prevent a checkpoint from happening between WAL-logging, and
+			 * performing the truncation.
+			 */
+			MyPgXact->delayChkpt = true;
+
+			xlrec.blkno = nblocks;
+			xlrec.rnode = rel->rd_node;
+			xlrec.flags = SMGR_TRUNCATE_ALL;
+
+			XLogBeginInsert();
+			XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+
+			lsn = XLogInsert(RM_SMGR_ID,
+							 XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
+
+			/*
+			 * Flush, because otherwise the truncation of the main relation
+			 * might hit the disk before the WAL record, and the truncation of
+			 * the FSM or visibility map. If we crashed during that window,
+			 * we'd be left with a truncated heap, but the FSM or visibility
+			 * map would still contain entries for the non-existent heap
+			 * pages.
+			 */
+			if (fsm || vm)
+				XLogFlush(lsn);
+		}
 
-		XLogBeginInsert();
-		XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+		/* Truncate the FSM first if it exists */
+		fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM);
+		if (fsm)
+			FreeSpaceMapTruncateRel(rel, nblocks, lsn);
 
-		lsn = XLogInsert(RM_SMGR_ID,
-						 XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
+		/* Truncate the visibility map too if it exists. */
+		vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
+		if (vm)
+			visibilitymap_truncate(rel, nblocks, lsn);
 
-		/*
-		 * Flush, because otherwise the truncation of the main relation might
-		 * hit the disk before the WAL record, and the truncation of the FSM
-		 * or visibility map. If we crashed during that window, we'd be left
-		 * with a truncated heap, but the FSM or visibility map would still
-		 * contain entries for the non-existent heap pages.
-		 */
-		if (fsm || vm)
-			XLogFlush(lsn);
+		/* Do the real work */
+		smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks);
 	}
+	PG_CATCH();
+	{
+		if (RelationNeedsWAL(rel))
+			MyPgXact->delayChkpt = false;
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
 
-	/* Do the real work */
-	smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks);
+	if (RelationNeedsWAL(rel))
+		MyPgXact->delayChkpt = false;
 }
 
 /*
@@ -536,10 +558,10 @@ smgr_redo(XLogReaderState *record)
 
 		if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 &&
 			smgrexists(reln, FSM_FORKNUM))
-			FreeSpaceMapTruncateRel(rel, xlrec->blkno);
+			FreeSpaceMapTruncateRel(rel, xlrec->blkno, lsn);
 		if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 &&
 			smgrexists(reln, VISIBILITYMAP_FORKNUM))
-			visibilitymap_truncate(rel, xlrec->blkno);
+			visibilitymap_truncate(rel, xlrec->blkno, lsn);
 
 		FreeFakeRelcacheEntry(rel);
 	}
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index bbd90c9..4483984 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -294,9 +294,14 @@ GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
  * before they access the FSM again.
  *
  * nblocks is the new size of the heap.
+ *
+ * The caller must have WAL-logged the truncation already (if the relation
+ * needs WAL-logging at all). 'lsn' is the LSN of the XLOG record. It is
+ * used to stamp the last remaining VM page, so that it doesn't get flushed
+ * to disk before the WAL record.
  */
 void
-FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
+FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks, XLogRecPtr lsn)
 {
 	BlockNumber new_nfsmblocks;
 	FSMAddress	first_removed_address;
@@ -327,8 +332,13 @@ FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
 		if (!BufferIsValid(buf))
 			return;				/* nothing to do; the FSM was already smaller */
 		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
 		fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
-		MarkBufferDirtyHint(buf, false);
+
+		MarkBufferDirty(buf);
+		if (lsn != InvalidXLogRecPtr)
+			PageSetLSN(BufferGetPage(buf), lsn);
+
 		UnlockReleaseBuffer(buf);
 
 		new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h
index 00bbd4c..8fe8906 100644
--- a/src/include/access/visibilitymap.h
+++ b/src/include/access/visibilitymap.h
@@ -44,6 +44,6 @@ extern void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 				  uint8 flags);
 extern uint8 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf);
 extern void visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen);
-extern void visibilitymap_truncate(Relation rel, BlockNumber nheapblocks);
+extern void visibilitymap_truncate(Relation rel, BlockNumber nheapblocks, XLogRecPtr lsn);
 
 #endif   /* VISIBILITYMAP_H */
diff --git a/src/include/storage/freespace.h b/src/include/storage/freespace.h
index 77b3bc3..ac014a4 100644
--- a/src/include/storage/freespace.h
+++ b/src/include/storage/freespace.h
@@ -14,6 +14,7 @@
 #ifndef FREESPACE_H_
 #define FREESPACE_H_
 
+#include "access/xlogdefs.h"
 #include "storage/block.h"
 #include "storage/relfilenode.h"
 #include "utils/relcache.h"
@@ -30,7 +31,7 @@ extern void RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk,
 extern void XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
 							Size spaceAvail);
 
-extern void FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks);
+extern void FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks, XLogRecPtr lsn);
 extern void FreeSpaceMapVacuum(Relation rel);
 extern void UpdateFreeSpaceMap(Relation rel,
 				   BlockNumber startBlkNum,
diff --git a/src/test/recovery/t/008_fsm_check.pl b/src/test/recovery/t/008_fsm_check.pl
new file mode 100644
index 0000000..2c58e31
--- /dev/null
+++ b/src/test/recovery/t/008_fsm_check.pl
@@ -0,0 +1,90 @@
+# Set of tests to check file-space map consistency on standbys
+use strict;
+use warnings;
+
+use PostgresNode;
+use TestLib;
+use Test::More tests => 1;
+
+my $node_master = get_new_node('master');
+$node_master->init(allows_streaming => 1);
+
+$node_master->append_conf('postgresql.conf', qq{
+fsync = on
+wal_level = replica
+wal_log_hints = on
+max_prepared_transactions = 5
+autovacuum = off
+});
+
+# Create a master node and its standby, initializing both with
+# some data at the same time.
+$node_master->start;
+
+$node_master->backup('master_backup');
+my $node_standby = get_new_node('standby');
+$node_standby->init_from_backup($node_master, 'master_backup',
+	has_streaming => 1);
+$node_standby->start;
+
+$node_master->psql('postgres', qq{
+create table testtab (a int, b char(100));
+insert into testtab select generate_series(1,1000), 'foo';
+insert into testtab select generate_series(1,1000), 'foo';
+delete from testtab where ctid > '(8,0)';
+});
+
+# Take a lock on the table to prevent following vacuum from truncating it
+$node_master->psql('postgres', qq{
+begin;
+lock table testtab in row share mode;
+prepare transaction 'p1';
+});
+
+# Vacuum, update FSM without truncation
+$node_master->psql('postgres', 'vacuum verbose testtab');
+
+# Force a checkpoint
+$node_master->psql('postgres', 'checkpoint');
+
+# Now do some more insert/deletes, another vacuum to ensure FPW writes are
+# done
+$node_master->psql('postgres', qq{
+insert into testtab select generate_series(1,1000), 'foo';
+delete from testtab where ctid > '(8,0)';
+vacuum verbose testtab;
+});
+
+# Ensure all buffers are now clean on the standby
+$node_standby->psql('postgres', 'checkpoint');
+
+# Release the lock, vacuum again which should lead to truncation
+$node_master->psql('postgres', qq{
+rollback prepared 'p1';
+vacuum verbose testtab;
+});
+
+$node_master->psql('postgres', 'checkpoint');
+my $until_lsn =
+	$node_master->safe_psql('postgres', "SELECT pg_current_xlog_location();");
+
+# Wait long enough for standby to receive and apply all WAL
+my $caughtup_query =
+	"SELECT '$until_lsn'::pg_lsn <= pg_last_xlog_replay_location()";
+$node_standby->poll_query_until('postgres', $caughtup_query)
+	or die "Timed out while waiting for standby to catch up";
+
+# now promote the standby
+$node_standby->promote;
+$node_standby->poll_query_until('postgres',
+	"SELECT NOT pg_is_in_recovery()")
+  or die "Timed out while waiting for promotion of standby";
+$node_standby->psql('postgres', 'checkpoint');
+
+# restart to discard in-memory copy of FSM
+$node_standby->restart;
+
+# Insert should work on standby
+is($node_standby->psql('postgres',
+   qq{insert into testtab select generate_series(1,1000), 'foo';}),
+   0, 'INSERT succeeds with truncated relation FSM');
-- 
2.9.3

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to