nbtxlog.c:btree_xlog_vacuum() contains the following comment:

     * XXX we don't actually need to read the block, we just need to
     * confirm it is unpinned. If we had a special call into the
     * buffer manager we could optimise this so that if the block is
     * not in shared_buffers we confirm it as unpinned.

The attached patch introduces an XLogLockBlockRangeForCleanup() function
that addresses this, as well as the "special call into the buffer
manager" (a new GetBufferWithoutRelcache()) that makes it possible to
lock buffers without reading them in.

The patch is by Simon Riggs, with some minor edits and testing by me.

I'm adding the patch to the CommitFest, and will follow up with some
performance numbers soon.

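In outline, the replay-side loop becomes the following (simplified from
the patch below; the strategy setup and the clamping of the block range
to the relation's EOF are omitted):

    for (blkno = startBlkno; blkno < endingBlkno; blkno++)
    {
        /* returns a pinned buffer iff the block is in shared_buffers */
        buffer = GetBufferWithoutRelcache(rnode, forkNum, blkno, bstrategy);
        if (BufferIsValid(buffer))
        {
            LockBufferForCleanup(buffer);
            UnlockReleaseBuffer(buffer);
        }
    }

If a block is not in shared_buffers, no backend can hold a pin on it, so
it is already as good as cleanup-locked, and we need not do any I/O to
prove that.
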
-- Abhijit
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 5f9fc49..dc90c02 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -501,33 +501,9 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
 	 * here during crash recovery.
 	 */
 	if (HotStandbyActiveInReplay())
-	{
-		BlockNumber blkno;
-
-		for (blkno = xlrec->lastBlockVacuumed + 1; blkno < xlrec->block; blkno++)
-		{
-			/*
-			 * We use RBM_NORMAL_NO_LOG mode because it's not an error
-			 * condition to see all-zero pages.  The original btvacuumpage
-			 * scan would have skipped over all-zero pages, noting them in FSM
-			 * but not bothering to initialize them just yet; so we mustn't
-			 * throw an error here.  (We could skip acquiring the cleanup lock
-			 * if PageIsNew, but it's probably not worth the cycles to test.)
-			 *
-			 * XXX we don't actually need to read the block, we just need to
-			 * confirm it is unpinned. If we had a special call into the
-			 * buffer manager we could optimise this so that if the block is
-			 * not in shared_buffers we confirm it as unpinned.
-			 */
-			buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno,
-											RBM_NORMAL_NO_LOG);
-			if (BufferIsValid(buffer))
-			{
-				LockBufferForCleanup(buffer);
-				UnlockReleaseBuffer(buffer);
-			}
-		}
-	}
+		XLogLockBlockRangeForCleanup(xlrec->node, MAIN_FORKNUM,
+									 xlrec->lastBlockVacuumed + 1,
+									 xlrec->block);
 
 	/*
 	 * If we have a full-page image, restore it (using a cleanup lock) and
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index b7829ff..302551f 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -287,10 +287,6 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
  *
  * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
  * relation is extended with all-zeroes pages up to the given block number.
- *
- * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
- * exist, and we don't check for all-zeroes.  Thus, no log entry is made
- * to imply that the page should be dropped or truncated later.
  */
 Buffer
 XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
@@ -331,8 +327,6 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 			log_invalid_page(rnode, forknum, blkno, false);
 			return InvalidBuffer;
 		}
-		if (mode == RBM_NORMAL_NO_LOG)
-			return InvalidBuffer;
 		/* OK to extend the file */
 		/* we do this in recovery only - no rel-extension lock needed */
 		Assert(InRecovery);
@@ -375,6 +369,73 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 	return buffer;
 }
 
+/*
+ * XLogLockBlockRangeForCleanup is used in Hot Standby mode to emulate
+ * the locking effects imposed by VACUUM for btree indexes.
+ */
+void
+XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum,
+							 BlockNumber startBlkno,
+							 BlockNumber uptoBlkno)
+{
+	BlockNumber blkno;
+	BlockNumber nblocks;
+	BlockNumber endingBlkno;
+	Buffer		buffer;
+	BufferAccessStrategy bstrategy;
+	SMgrRelation smgr;
+
+	Assert(startBlkno != P_NEW);
+	Assert(uptoBlkno != P_NEW);
+
+	/* Open the relation at smgr level */
+	smgr = smgropen(rnode, InvalidBackendId);
+
+	/*
+	 * Create the target file if it doesn't already exist.  This lets us cope
+	 * if the replay sequence contains writes to a relation that is later
+	 * deleted.  (The original coding of this routine would instead suppress
+	 * the writes, but that seems like it risks losing valuable data if the
+	 * filesystem loses an inode during a crash.  Better to write the data
+	 * until we are actually told to delete the file.)
+	 */
+	smgrcreate(smgr, forkNum, true);
+
+	nblocks = smgrnblocks(smgr, forkNum);
+
+	endingBlkno = uptoBlkno;
+	if (nblocks < endingBlkno)
+		endingBlkno = nblocks;
+
+	/*
+	 * We need a BufferAccessStrategy because we do not want to increment
+	 * the buffers' usage_count as we pin them.  There is no ring
+	 * associated with this strategy, since we never actually read or
+	 * write the buffer contents, just lock them.
+	 */
+	bstrategy = GetAccessStrategy(BAS_DISCARD);
+
+	for (blkno = startBlkno; blkno < endingBlkno; blkno++)
+	{
+		/*
+		 * All we need to do here is prove that we can lock each block
+		 * with a cleanup lock. It's not an error to see all-zero pages
+		 * here because the original btvacuumpage would not have thrown
+		 * an error either.
+		 *
+		 * We don't actually need to read the block, we just need to
+		 * confirm it is unpinned.
+		 */
+		buffer = GetBufferWithoutRelcache(rnode, forkNum, blkno, bstrategy);
+		if (BufferIsValid(buffer))
+		{
+			LockBufferForCleanup(buffer);
+			UnlockReleaseBuffer(buffer);
+		}
+	}
+
+	FreeAccessStrategy(bstrategy);
+}
 
 /*
  * Struct actually returned by XLogFakeRelcacheEntry, though the declared
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index c070278..2c8d374 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -221,8 +221,6 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
  * current physical EOF; that is likely to cause problems in md.c when
  * the page is modified and written out. P_NEW is OK, though.
  *
- * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
- *
  * If strategy is not NULL, a nondefault buffer access strategy is used.
  * See buffer/README for details.
  */
@@ -283,6 +281,58 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
 							 mode, strategy, &hit);
 }
 
+/*
+ * GetBufferWithoutRelcache returns a pinned buffer if the requested block is
+ * already present in shared_buffers, and InvalidBuffer otherwise; the block
+ * is never read in from disk.  This special-purpose routine is only executed
+ * during recovery, so it uses a cut-down execution path rather than
+ * complicating ReadBuffer/BufferAlloc.
+ */
+Buffer
+GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
+						 BlockNumber blockNum, BufferAccessStrategy strategy)
+{
+	BufferTag	bufTag;			/* identity of requested block */
+	uint32		bufHash;		/* hash value for bufTag */
+	LWLock	   *bufPartitionLock;		/* buffer partition lock for it */
+	int			buf_id;
+	volatile BufferDesc *buf = NULL;
+	SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
+	bool		pinned = false;
+
+	Assert(InRecovery);
+
+	/* Make sure we will have room to remember the buffer pin */
+	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+	/* create a tag so we can look up the buffer */
+	INIT_BUFFERTAG(bufTag, smgr->smgr_rnode.node, forkNum, blockNum);
+
+	/* determine its hash code and partition lock ID */
+	bufHash = BufTableHashCode(&bufTag);
+	bufPartitionLock = BufMappingPartitionLock(bufHash);
+
+	/* see if the block is in the buffer pool already */
+	LWLockAcquire(bufPartitionLock, LW_SHARED);
+	buf_id = BufTableLookup(&bufTag, bufHash);
+	if (buf_id >= 0)
+	{
+		/*
+		 * Found it.  Pin the buffer even if it isn't BM_VALID yet:
+		 * the caller needs the pin only to await a cleanup lock, and
+		 * discarding PinBuffer's result here would leak the pin.
+		 */
+		buf = &BufferDescriptors[buf_id];
+		(void) PinBuffer(buf, strategy);
+		pinned = true;
+	}
+	LWLockRelease(bufPartitionLock);
+
+	if (pinned)
+		return BufferDescriptorGetBuffer(buf);
+
+	return InvalidBuffer;
+}
 
 /*
  * ReadBuffer_common -- common logic for all ReadBuffer variants
@@ -1076,12 +1122,7 @@ ReleaseAndReadBuffer(Buffer buffer,
  * PinBuffer -- make buffer unavailable for replacement.
  *
  * For the default access strategy, the buffer's usage_count is incremented
- * when we first pin it; for other strategies we just make sure the usage_count
- * isn't zero.  (The idea of the latter is that we don't want synchronized
- * heap scans to inflate the count, but we need it to not be zero to discourage
- * other backends from stealing buffers from our ring.  As long as we cycle
- * through the ring faster than the global clock-sweep cycles, buffers in
- * our ring won't be chosen as victims for replacement by other backends.)
+ * when we first pin it; otherwise StrategySetBufferUsage() decides.
  *
  * This should be applied only to shared buffers, never local ones.
  *
@@ -1106,10 +1147,7 @@ PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
 				buf->usage_count++;
 		}
 		else
-		{
-			if (buf->usage_count == 0)
-				buf->usage_count = 1;
-		}
+			StrategySetBufferUsage(strategy, buf);
 		result = (buf->flags & BM_VALID) != 0;
 		UnlockBufHdr(buf);
 	}
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 4befab0..c7daf47 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -76,6 +76,11 @@ typedef struct BufferAccessStrategyData
 	bool		current_was_in_ring;
 
 	/*
+	 * How we handle usage_count for this strategy.
+	 */
+	bool		increment_on_zero;
+
+	/*
 	 * Array of buffer numbers.  InvalidBuffer (that is, zero) indicates we
 	 * have not yet selected a buffer for this ring slot.  For allocation
 	 * simplicity this is palloc'd together with the fixed fields of the
@@ -408,6 +413,7 @@ GetAccessStrategy(BufferAccessStrategyType btype)
 {
 	BufferAccessStrategy strategy;
 	int			ring_size;
+	bool		increment_on_zero = true;
 
 	/*
 	 * Select ring size to use.  See buffer/README for rationales.
@@ -430,6 +436,10 @@ GetAccessStrategy(BufferAccessStrategyType btype)
 		case BAS_VACUUM:
 			ring_size = 256 * 1024 / BLCKSZ;
 			break;
+		case BAS_DISCARD:
+			ring_size = 0;	/* We mustn't ever call GetBufferFromRing() */
+			increment_on_zero = false;
+			break;
 
 		default:
 			elog(ERROR, "unrecognized buffer access strategy: %d",
@@ -437,8 +447,9 @@ GetAccessStrategy(BufferAccessStrategyType btype)
 			return NULL;		/* keep compiler quiet */
 	}
 
-	/* Make sure ring isn't an undue fraction of shared buffers */
-	ring_size = Min(NBuffers / 8, ring_size);
+	/* Make sure ring isn't an undue fraction of shared buffers */
+	if (ring_size > 0)
+		ring_size = Min(NBuffers / 8, ring_size);
 
 	/* Allocate the object and initialize all elements to zeroes */
 	strategy = (BufferAccessStrategy)
@@ -448,6 +459,7 @@ GetAccessStrategy(BufferAccessStrategyType btype)
 	/* Set fields that don't start out zero */
 	strategy->btype = btype;
 	strategy->ring_size = ring_size;
+	strategy->increment_on_zero = increment_on_zero;
 
 	return strategy;
 }
@@ -478,6 +490,8 @@ GetBufferFromRing(BufferAccessStrategy strategy)
 	volatile BufferDesc *buf;
 	Buffer		bufnum;
 
+	Assert(strategy->ring_size > 0);
+
 	/* Advance to next ring slot */
 	if (++strategy->current >= strategy->ring_size)
 		strategy->current = 0;
@@ -563,3 +577,22 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, volatile BufferDesc *buf)
 
 	return true;
 }
+
+/*
+ * Set the usage_count according to the strategy
+ *
+ * For ring strategies we just make sure the usage_count isn't zero.  (We
+ * don't want synchronized heap scans to inflate the count, but we need it
+ * to not be zero to discourage other backends from stealing buffers from
+ * our ring.  As long as we cycle through the ring faster than the global
+ * clock sweep, buffers in our ring won't be chosen as victims.)  For
+ * BAS_DISCARD we leave a zero usage_count alone.
+ *
+ * Called while holding buffer header spinlock, so make it quick.
+ */
+void
+StrategySetBufferUsage(BufferAccessStrategy strategy, volatile BufferDesc *buf)
+{
+	if (buf->usage_count == 0 && strategy->increment_on_zero)
+		buf->usage_count = 1;
+}
diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h
index 58f11d9..1ecbeb3 100644
--- a/src/include/access/xlogutils.h
+++ b/src/include/access/xlogutils.h
@@ -25,7 +25,9 @@ extern void XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
 extern Buffer XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init);
 extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 					   BlockNumber blkno, ReadBufferMode mode);
-
+extern void XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum,
+										 BlockNumber startBlkno,
+										 BlockNumber uptoBlkno);
 extern Relation CreateFakeRelcacheEntry(RelFileNode rnode);
 extern void FreeFakeRelcacheEntry(Relation fakerel);
 
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index c019013..720db12 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -190,6 +190,8 @@ extern volatile BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
 extern void StrategyFreeBuffer(volatile BufferDesc *buf);
 extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
 					 volatile BufferDesc *buf);
+extern void StrategySetBufferUsage(BufferAccessStrategy strategy,
+					 volatile BufferDesc *buf);
 
 extern int	StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
 extern void StrategyNotifyBgWriter(Latch *bgwriterLatch);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 89447d0..bdba547 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -29,7 +29,8 @@ typedef enum BufferAccessStrategyType
 	BAS_BULKREAD,				/* Large read-only scan (hint bit updates are
 								 * ok) */
 	BAS_BULKWRITE,				/* Large multi-block write (e.g. COPY IN) */
-	BAS_VACUUM					/* VACUUM */
+	BAS_VACUUM,					/* VACUUM */
+	BAS_DISCARD					/* Only for use with GetBufferWithoutRelcache() */
 } BufferAccessStrategyType;
 
 /* Possible modes for ReadBufferExtended() */
@@ -38,9 +39,7 @@ typedef enum
 	RBM_NORMAL,					/* Normal read */
 	RBM_ZERO,					/* Don't read from disk, caller will
 								 * initialize */
-	RBM_ZERO_ON_ERROR,			/* Read, but return an all-zeros page on error */
-	RBM_NORMAL_NO_LOG			/* Don't log page as invalid during WAL
-								 * replay; otherwise same as RBM_NORMAL */
+	RBM_ZERO_ON_ERROR			/* Read, but return an all-zeros page on error */
 } ReadBufferMode;
 
 /* in globals.c ... this duplicates miscadmin.h */
@@ -170,6 +169,9 @@ extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum,
 extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode,
 						  ForkNumber forkNum, BlockNumber blockNum,
 						  ReadBufferMode mode, BufferAccessStrategy strategy);
+extern Buffer GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
+									   BlockNumber blockNum,
+									   BufferAccessStrategy strategy);
 extern void ReleaseBuffer(Buffer buffer);
 extern void UnlockReleaseBuffer(Buffer buffer);
 extern void MarkBufferDirty(Buffer buffer);