On 2013-04-04 17:39:16 -0700, Jeff Davis wrote:
> On Thu, 2013-04-04 at 22:39 +0200, Andres Freund wrote:
> > I don't think it's really slower. Earlier the code took WalInsertLock
> > every time, even if we ended up not logging anything. That's far more
> > expensive than a single spinlock. And the copy should also only be taken
> > in the case we need to log. So I think we end up ahead of the current
> > state.
> 
> Good point.
> 
> > > The code looks good to me except that we should be consistent about the
> > > page hole -- XLogCheckBuffer is calculating it, but then we copy the
> > > entire page. I don't think anything can change the size of the page hole
> > > while we have a shared lock on the buffer, so it seems OK to skip the
> > > page hole during the copy.
> > 
> > I don't think it can change either, but I doubt that there's a
> > performance advantage by not copying the hole. I'd guess the simpler
> > code ends up faster.
> 
> I was thinking more about the WAL size, but I don't have a strong
> opinion.

I was just a bit dense. No idea what I missed there.

How does the attached version look? I verified that it survives
recovery, but not more.

Jeff, any chance you can run this for a round with your suite?

Greetings,

Andres Freund

-- 
 Andres Freund                     http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services
>From 3c086daffb994665b23bc65a1017cb71abef17bf Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Fri, 5 Apr 2013 15:02:35 +0200
Subject: [PATCH] checksums: Log hint bit writes in a concurrency safe manner

The previous manner was racy since share-locked pages - which
XLogSaveBufferForHint() gets passed - may still be modified. Namely, further
hint bit updates and PageSetLSN() can be performed. That could lead to checksum
failures in WAL since the page changed between checksum computation and the
actual write to WAL.

Instead, and only if required, write a copy of the page as a normal
record. struct BkpBlock is still used so we can use most of the usual backup
block restoration code.
---
 src/backend/access/rmgrdesc/xlogdesc.c |    6 +-
 src/backend/access/transam/xlog.c      |  233 ++++++++++++++++++--------------
 src/include/catalog/catversion.h       |    2 +-
 3 files changed, 136 insertions(+), 105 deletions(-)

diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index 52cf759..4c68b6a 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -17,6 +17,7 @@
 #include "access/xlog.h"
 #include "access/xlog_internal.h"
 #include "catalog/pg_control.h"
+#include "common/relpath.h"
 #include "utils/guc.h"
 #include "utils/timestamp.h"
 
@@ -83,7 +84,10 @@ xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
 	}
 	else if (info == XLOG_HINT)
 	{
-		appendStringInfo(buf, "page hint");
+		BkpBlock *bkp = (BkpBlock *) rec;
+		appendStringInfo(buf, "page hint: %s block %u",
+						 relpathperm(bkp->node, bkp->fork),
+						 bkp->block);
 	}
 	else if (info == XLOG_BACKUP_END)
 	{
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 4a9462e..3e59f94 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -648,6 +648,8 @@ static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
 
 static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
 				XLogRecPtr *lsn, BkpBlock *bkpb);
+static Buffer RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb,
+				char *blk, bool get_cleanup_lock, bool keep_buffer);
 static bool AdvanceXLInsertBuffer(bool new_segment);
 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
@@ -731,7 +733,6 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 	bool		updrqst;
 	bool		doPageWrites;
 	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
-	bool		isHint = (rmid == RM_XLOG_ID && info == XLOG_HINT);
 	uint8		info_orig = info;
 	static XLogRecord *rechdr;
 
@@ -1002,18 +1003,6 @@ begin:;
 	}
 
 	/*
-	 * If this is a hint record and we don't need a backup block then
-	 * we have no more work to do and can exit quickly without inserting
-	 * a WAL record at all. In that case return InvalidXLogRecPtr.
-	 */
-	if (isHint && !(info & XLR_BKP_BLOCK_MASK))
-	{
-		LWLockRelease(WALInsertLock);
-		END_CRIT_SECTION();
-		return InvalidXLogRecPtr;
-	}
-
-	/*
 	 * If the current page is completely full, the record goes to the next
 	 * page, right after the page header.
 	 */
@@ -3158,8 +3147,6 @@ Buffer
 RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
 				   bool get_cleanup_lock, bool keep_buffer)
 {
-	Buffer		buffer;
-	Page		page;
 	BkpBlock	bkpb;
 	char	   *blk;
 	int			i;
@@ -3177,42 +3164,8 @@ RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
 		if (i == block_index)
 		{
 			/* Found it, apply the update */
-			buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
-											RBM_ZERO);
-			Assert(BufferIsValid(buffer));
-			if (get_cleanup_lock)
-				LockBufferForCleanup(buffer);
-			else
-				LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-
-			page = (Page) BufferGetPage(buffer);
-
-			if (bkpb.hole_length == 0)
-			{
-				memcpy((char *) page, blk, BLCKSZ);
-			}
-			else
-			{
-				memcpy((char *) page, blk, bkpb.hole_offset);
-				/* must zero-fill the hole */
-				MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
-				memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
-					   blk + bkpb.hole_offset,
-					   BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
-			}
-
-			/*
-			 * Any checksum set on this page will be invalid. We don't need
-			 * to reset it here since it will be set before being written.
-			 */
-
-			PageSetLSN(page, lsn);
-			MarkBufferDirty(buffer);
-
-			if (!keep_buffer)
-				UnlockReleaseBuffer(buffer);
-
-			return buffer;
+			return RestoreBackupBlockContents(lsn, bkpb, blk, get_cleanup_lock,
+											  keep_buffer);
 		}
 
 		blk += BLCKSZ - bkpb.hole_length;
@@ -3224,6 +3177,56 @@ RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
 }
 
 /*
+ * Workhorse for RestoreBackupBlock usable without an xlog record
+ *
+ * Restores a full-page image from BkpBlock and a data pointer.
+ */
+static Buffer
+RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk,
+						   bool get_cleanup_lock, bool keep_buffer)
+{
+	Buffer		buffer;
+	Page		page;
+
+	buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
+									RBM_ZERO);
+	Assert(BufferIsValid(buffer));
+	if (get_cleanup_lock)
+		LockBufferForCleanup(buffer);
+	else
+		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+	page = (Page) BufferGetPage(buffer);
+
+	if (bkpb.hole_length == 0)
+	{
+		memcpy((char *) page, blk, BLCKSZ);
+	}
+	else
+	{
+		memcpy((char *) page, blk, bkpb.hole_offset);
+		/* must zero-fill the hole */
+		MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
+		memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
+			   blk + bkpb.hole_offset,
+			   BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
+	}
+
+	/*
+	 * Any checksum set on this page will be invalid. We don't need
+	 * to reset it here since it will be set before being written.
+	 */
+
+	PageSetLSN(page, lsn);
+	MarkBufferDirty(buffer);
+
+	if (!keep_buffer)
+		UnlockReleaseBuffer(buffer);
+
+	return buffer;
+}
+
+/*
  * Attempt to read an XLOG record.
  *
  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
@@ -7638,45 +7641,74 @@ XLogRestorePoint(const char *rpName)
  * Write a backup block if needed when we are setting a hint. Note that
  * this may be called for a variety of page types, not just heaps.
  *
- * Deciding the "if needed" part is delicate and requires us to either
- * grab WALInsertLock or check the info_lck spinlock. If we check the
- * spinlock and it says Yes then we will need to get WALInsertLock as well,
- * so the design choice here is to just go straight for the WALInsertLock
- * and trust that calls to this function are minimised elsewhere.
- *
  * Callable while holding just share lock on the buffer content.
  *
- * Possible that multiple concurrent backends could attempt to write
- * WAL records. In that case, more than one backup block may be recorded
- * though that isn't important to the outcome and the backup blocks are
- * likely to be identical anyway.
+ * We can't use the plain backup block mechanism since that relies on the
+ * Buffer being exclusively locked. Since some modifications (setting LSN, hint
+ * bits) are allowed in a sharelocked buffer that can lead to wal checksum
+ * failures. So instead we copy the page and insert the copied data as normal
+ * record data.
+ *
+ * We only need to do something if page has not yet been full page written in
+ * this checkpoint round. The LSN of the inserted wal record is inserted if we
+ * had to write, InvalidXLogRecPtr otherwise.
+ *
+ * It is possible that multiple concurrent backends could attempt to write WAL
+ * records. In that case, more than one block image may be recorded but that's
+ * ok from a correctness POV.
  */
-#define	XLOG_HINT_WATERMARK		13579
 XLogRecPtr
 XLogSaveBufferForHint(Buffer buffer)
 {
-	/*
-	 * Make an XLOG entry reporting the hint
-	 */
-	XLogRecData rdata[2];
-	int			watermark = XLOG_HINT_WATERMARK;
-
-	/*
-	 * Not allowed to have zero-length records, so use a small watermark
-	 */
-	rdata[0].data = (char *) (&watermark);
-	rdata[0].len = sizeof(int);
-	rdata[0].buffer = InvalidBuffer;
-	rdata[0].buffer_std = false;
-	rdata[0].next = &(rdata[1]);
-
-	rdata[1].data = NULL;
-	rdata[1].len = 0;
-	rdata[1].buffer = buffer;
-	rdata[1].buffer_std = true;
-	rdata[1].next = NULL;
-
-	return XLogInsert(RM_XLOG_ID, XLOG_HINT, rdata);
+    XLogRecPtr recptr = InvalidXLogRecPtr;
+    XLogRecPtr lsn;
+    XLogRecData rdata[2];
+    BkpBlock bkpb;
+
+    /*
+     * Ensure no checkpoint can happen while we decide about logging something
+     * based on RedoRecPtr, since we don't hold any lock preventing that
+     * otherwise.
+     */
+    Assert(MyPgXact->delayChkpt);
+
+    /* update RedoRecPtr so XLogCheckBuffer can make the right decision */
+    GetRedoRecPtr();
+
+    /* setup phony rdata element for usage of XLogCheckBuffer */
+    rdata[0].buffer = buffer;
+    rdata[0].buffer_std = true; /* is that actually ok? E.g. fsm might not */
+
+    /* force full pages on, otherwise checksums won't work? */
+    if (XLogCheckBuffer(rdata, true, &lsn, &bkpb))
+    {
+        char copied_buffer[BLCKSZ];
+		char *origdata = (char *) BufferGetBlock(buffer);
+
+        /*
+         * copy buffer so we don't have to worry about concurrent hint bit or
+         * lsn updates. We assume pd_lower/upper cannot be changed without an
+         * exclusive lock, so the contents bkp are not racy.
+         */
+        memcpy(copied_buffer, origdata, bkpb.hole_offset);
+        memcpy(copied_buffer + bkpb.hole_offset,
+			   origdata + bkpb.hole_offset + bkpb.hole_length,
+			   BLCKSZ - bkpb.hole_offset - bkpb.hole_length);
+
+        rdata[0].data = (char *) &bkpb;
+        rdata[0].len = sizeof(BkpBlock);
+        rdata[0].buffer = InvalidBuffer;
+        rdata[0].next = &(rdata[1]);
+
+        rdata[1].data = copied_buffer;
+        rdata[1].len = BLCKSZ - bkpb.hole_length;
+        rdata[1].buffer = InvalidBuffer;
+        rdata[1].next = NULL;
+
+        recptr = XLogInsert(RM_XLOG_ID, XLOG_HINT, rdata);
+    }
+
+    return recptr;
 }
 
 /*
@@ -7845,8 +7877,8 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 {
 	uint8		info = record->xl_info & ~XLR_INFO_MASK;
 
-	/* Backup blocks are not used in most xlog records */
-	Assert(info == XLOG_HINT || !(record->xl_info & XLR_BKP_BLOCK_MASK));
+	/* Backup blocks are not used in xlog xlog records */
+	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
 
 	if (info == XLOG_NEXTOID)
 	{
@@ -8041,31 +8073,26 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 	}
 	else if (info == XLOG_HINT)
 	{
-#ifdef USE_ASSERT_CHECKING
-		int	*watermark = (int *) XLogRecGetData(record);
-#endif
-
-		/* Check the watermark is correct for the hint record */
-		Assert(*watermark == XLOG_HINT_WATERMARK);
-
-		/* Backup blocks must be present for smgr hint records */
-		Assert(record->xl_info & XLR_BKP_BLOCK_MASK);
+		char *data;
+		BkpBlock bkpb;
 
 		/*
-		 * Hint records have no information that needs to be replayed.
-		 * The sole purpose of them is to ensure that a hint bit does
-		 * not cause a checksum invalidation if a hint bit write should
-		 * cause a torn page. So the body of the record is empty but
-		 * there must be one backup block.
+		 * Hint bit writes contain a backup block stored "inline" in the normal
+		 * data since the locking when writing HINT records isn't sufficient to
+		 * use the normal backup block mechanism.
 		 *
-		 * Since the only change in the backup block is a hint bit,
-		 * there is no confict with Hot Standby.
+		 * Since the only change in the backup block is a hint bit, there is no
+		 * confict with Hot Standby.
 		 *
 		 * This also means there is no corresponding API call for this,
 		 * so an smgr implementation has no need to implement anything.
 		 * Which means nothing is needed in md.c etc
 		 */
-		RestoreBackupBlock(lsn, record, 0, false, false);
+		data = XLogRecGetData(record);
+		memcpy(&bkpb, XLogRecGetData(record), sizeof(BkpBlock));
+		data += sizeof(BkpBlock);
+
+		RestoreBackupBlockContents(lsn, bkpb, data, false, false);
 	}
 	else if (info == XLOG_BACKUP_END)
 	{
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index aa8b715..8d05c61 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
  */
 
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	201303291
+#define CATALOG_VERSION_NO	201304051
 
 #endif
-- 
1.7.10.4

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to