On Thu, Jan 26, 2017 at 9:14 AM, Michael Paquier
<michael.paqu...@gmail.com> wrote:
> So the patch attached fixes the problem by changing BufferAlloc() in
> such a way that initialization forks are permanently written to disk,
> which is what you are suggesting. As a simple fix for back-branches
> that's enough, though on HEAD I think that we should really rework the
> empty() routines so that the writes go through shared buffers first;
> that seems more solid than relying on the smgr routines to do this
> work. Robert, what do you think?

Attached is what I have in mind for HEAD. btree, gist, spgist and
bloom indexes are changed so that the init forks they create go
through shared buffers instead of having their empty() routines handle
the flush of the pages themselves. This removes any race condition
between the checkpointer and the creation of the init forks, which is
a good thing, I think.
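
For reference, the reworked empty() routines all follow roughly the
same pattern (a simplified sketch, not the exact code of any single
AM; the page-filling step differs per index type):

    Buffer      buf;

    /* Extend the init fork by one page, going through shared buffers */
    buf = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

    /* Initialize the page, then dirty and WAL-log it */
    START_CRIT_SECTION();
    /* ... fill the page via BufferGetPage(buf) ... */
    MarkBufferDirty(buf);
    log_newpage_buffer(buf, false);  /* second arg: standard page layout? */
    END_CRIT_SECTION();

    UnlockReleaseBuffer(buf);

As the buffer lives in shared buffers and is marked BM_PERMANENT (see
the bufmgr.c hunk below), the checkpointer takes care of flushing it,
and the WAL record restores the page at replay after a crash.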

Here are the tests I have done. First, run these commands to create
all the index types:
create extension bloom;
create extension btree_gist;
create extension btree_gin;
create unlogged table foo (a int);
create index foo_bt on foo(a);
create index foo_bloom on foo using bloom(a);
create index foo_gin on foo using gin (a);
create index foo_gist on foo using gist (a);
create index foo_brin on foo using brin (a);
create unlogged table foo_geo (a box);
create index foo_spgist ON foo_geo using spgist(a);
checkpoint;

Then crash the server and restart it. The checkpoint above matters:
it moves the redo pointer past the records that WAL-logged the init
forks, so those pages need to actually be on disk to survive the
crash. After restart, the following vacuums are able to complete:
vacuum foo;
vacuum foo_geo;
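
(One way to simulate the crash, assuming a standard pg_ctl setup with
$PGDATA pointing to the data directory:

    pg_ctl -D $PGDATA stop -m immediate
    pg_ctl -D $PGDATA start

Without the fix, the init-fork pages logged before the checkpoint may
never have reached disk, and these vacuums fail after recovery.)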

I have also created a CF entry for this set of patches:
https://commitfest.postgresql.org/13/971/

Thanks,
-- 
Michael
diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c
index 29bc5cea79..d04298add5 100644
--- a/contrib/bloom/blinsert.c
+++ b/contrib/bloom/blinsert.c
@@ -158,31 +158,24 @@ blbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 void
 blbuildempty(Relation index)
 {
-	Page		metapage;
+	Buffer		MetaBuffer;
 
-	/* Construct metapage. */
-	metapage = (Page) palloc(BLCKSZ);
-	BloomFillMetapage(index, metapage);
+	/* An empty bloom index has one meta page */
+	MetaBuffer =
+		ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
+	LockBuffer(MetaBuffer, BUFFER_LOCK_EXCLUSIVE);
 
-	/*
-	 * Write the page and log it.  It might seem that an immediate sync
-	 * would be sufficient to guarantee that the file exists on disk, but
-	 * recovery itself might remove it while replaying, for example, an
-	 * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we
-	 * need this even when wal_level=minimal.
-	 */
-	PageSetChecksumInplace(metapage, BLOOM_METAPAGE_BLKNO);
-	smgrwrite(index->rd_smgr, INIT_FORKNUM, BLOOM_METAPAGE_BLKNO,
-			  (char *) metapage, true);
-	log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
-				BLOOM_METAPAGE_BLKNO, metapage, false);
+	/* Initialize the meta buffer */
+	BloomFillMetapage(index, BufferGetPage(MetaBuffer));
 
-	/*
-	 * An immediate sync is required even if we xlog'd the page, because the
-	 * write did not go through shared_buffers and therefore a concurrent
-	 * checkpoint may have moved the redo pointer past our xlog record.
-	 */
-	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
+	/* And log it */
+	START_CRIT_SECTION();
+	MarkBufferDirty(MetaBuffer);
+	log_newpage_buffer(MetaBuffer, false);
+	END_CRIT_SECTION();
+
+	/* Unlock and release the buffer */
+	UnlockReleaseBuffer(MetaBuffer);
 }
 
 /*
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 1bb1acfea6..215d3c6c02 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -23,6 +23,7 @@
 #include "access/xlog.h"
 #include "catalog/index.h"
 #include "commands/vacuum.h"
+#include "miscadmin.h"
 #include "storage/indexfsm.h"
 #include "storage/ipc.h"
 #include "storage/lmgr.h"
@@ -237,31 +238,22 @@ btbuildCallback(Relation index,
 void
 btbuildempty(Relation index)
 {
-	Page		metapage;
-
-	/* Construct metapage. */
-	metapage = (Page) palloc(BLCKSZ);
-	_bt_initmetapage(metapage, P_NONE, 0);
-
-	/*
-	 * Write the page and log it.  It might seem that an immediate sync
-	 * would be sufficient to guarantee that the file exists on disk, but
-	 * recovery itself might remove it while replaying, for example, an
-	 * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we
-	 * need this even when wal_level=minimal.
-	 */
-	PageSetChecksumInplace(metapage, BTREE_METAPAGE);
-	smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
-			  (char *) metapage, true);
-	log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
-				BTREE_METAPAGE, metapage, false);
-
-	/*
-	 * An immediate sync is required even if we xlog'd the page, because the
-	 * write did not go through shared_buffers and therefore a concurrent
-	 * checkpoint may have moved the redo pointer past our xlog record.
-	 */
-	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
+	Buffer		metabuffer;
+
+	/* An empty btree index has one meta page */
+	metabuffer =
+		ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
+	LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE);
+
+	/* Initialize and xlog meta buffer */
+	START_CRIT_SECTION();
+	_bt_initmetapage(BufferGetPage(metabuffer), P_NONE, 0);
+	MarkBufferDirty(metabuffer);
+	log_newpage_buffer(metabuffer, false);
+	END_CRIT_SECTION();
+
+	/* Unlock and release the buffer */
+	UnlockReleaseBuffer(metabuffer);
 }
 
 /*
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index b42f4b7139..7f2997c92a 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -155,49 +155,50 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 void
 spgbuildempty(Relation index)
 {
-	Page		page;
-
-	/* Construct metapage. */
-	page = (Page) palloc(BLCKSZ);
-	SpGistInitMetapage(page);
+	Buffer		MetaBuffer,
+				RootBuffer,
+				TupleBuffer;
 
 	/*
-	 * Write the page and log it unconditionally.  This is important
-	 * particularly for indexes created on tablespaces and databases
-	 * whose creation happened after the last redo pointer as recovery
-	 * removes any of their existing content when the corresponding
-	 * create records are replayed.
+	 * An empty SP-GiST index has three pages:
+	 * - one meta page.
+	 * - one root page.
+	 * - one null-tuples root page.
 	 */
-	PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO);
-	smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO,
-			  (char *) page, true);
-	log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
-				SPGIST_METAPAGE_BLKNO, page, false);
-
-	/* Likewise for the root page. */
-	SpGistInitPage(page, SPGIST_LEAF);
-
-	PageSetChecksumInplace(page, SPGIST_ROOT_BLKNO);
-	smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_ROOT_BLKNO,
-			  (char *) page, true);
-	log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
-				SPGIST_ROOT_BLKNO, page, true);
-
-	/* Likewise for the null-tuples root page. */
-	SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS);
-
-	PageSetChecksumInplace(page, SPGIST_NULL_BLKNO);
-	smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_NULL_BLKNO,
-			  (char *) page, true);
-	log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
-				SPGIST_NULL_BLKNO, page, true);
+	MetaBuffer =
+		ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
+	LockBuffer(MetaBuffer, BUFFER_LOCK_EXCLUSIVE);
+	RootBuffer =
+		ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
+	LockBuffer(RootBuffer, BUFFER_LOCK_EXCLUSIVE);
+	TupleBuffer =
+		ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
+	LockBuffer(TupleBuffer, BUFFER_LOCK_EXCLUSIVE);
+
+	/* Initialize and log all the pages */
+	START_CRIT_SECTION();
 
-	/*
-	 * An immediate sync is required even if we xlog'd the pages, because the
-	 * writes did not go through shared buffers and therefore a concurrent
-	 * checkpoint may have moved the redo pointer past our xlog record.
-	 */
-	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
+	/* Construct and log the meta page */
+	SpGistInitMetapage(BufferGetPage(MetaBuffer));
+	MarkBufferDirty(MetaBuffer);
+	log_newpage_buffer(MetaBuffer, false);
+
+	/* Likewise for the root page */
+	SpGistInitPage(BufferGetPage(RootBuffer), SPGIST_LEAF);
+	MarkBufferDirty(RootBuffer);
+	log_newpage_buffer(RootBuffer, true);
+
+	/* Likewise for the null-tuples root page */
+	SpGistInitPage(BufferGetPage(TupleBuffer), SPGIST_LEAF | SPGIST_NULLS);
+	MarkBufferDirty(TupleBuffer);
+	log_newpage_buffer(TupleBuffer, true);
+
+	END_CRIT_SECTION();
+
+	/* Unlock and release the buffers. */
+	UnlockReleaseBuffer(MetaBuffer);
+	UnlockReleaseBuffer(RootBuffer);
+	UnlockReleaseBuffer(TupleBuffer);
 }
 
 /*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 3cb51204dc..66fe9ea529 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1302,6 +1302,14 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	else
 		buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
 
+	/*
+	 * Buffers of the init fork of an unlogged relation must be flushed
+	 * like permanent buffers, so that the init fork survives a crash
+	 * even if the redo pointer has moved past the page's WAL record.
+	 */
+	if (forkNum == INIT_FORKNUM)
+		buf_state |= BM_PERMANENT;
+
 	UnlockBufHdr(buf, buf_state);
 
 	if (oldPartitionLock != NULL)