I whipped up the attached patch tonight.  It's pretty quick and dirty,
so it's possible I've missed something, but the intent is to suppress
writing of hint bits by backends allocating buffers, and by
checkpoints, and to write them only from the background writer's
cleaning scan.  It therefore should (and does) avoid the problem that
the first scan of a relation after a bulk load is much slower than
subsequent scans.  I used this test case:

create table s as select g,
random()::text||random()::text||random()::text||random()::text from
generate_series(1,1000000) g;
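
For anyone wanting to double-check the sizes involved, something like
this should do it (pg_relation_size and pg_size_pretty are standard
system functions):

select pg_size_pretty(pg_relation_size('s'));
show shared_buffers;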

I didn't do any special configuration, so this was large enough to not
fit in shared_buffers, but small enough to fit in the OS cache.  Then
I did this repeatedly:

select sum(1) from s;
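
The times reported below are per-query wall-clock times; psql's
\timing, for example, prints an elapsed-time line after each query:

\timing
select sum(1) from s;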

Without the patch, the first run took 1602 ms, and subsequent runs
took 207-216 ms.

With the patch, the first run took 270 ms, and subsequent runs
declined very, very slowly.  I got bored after getting down into the
240 ms range and ran VACUUM FREEZE, after which times dropped to about
197 ms.  (This also happens without the patch - VACUUM FREEZE seems to
speed things up a bit more than just setting all the hint bits.)

I find these results pretty depressing.  Obviously, the ~6x speedup on
the first run is great, but even after many runs it was still 10-15%
slower.  Certainly, for some people this patch might
be an improvement, but on the whole I can't see applying it, unless
someone can spot something I've done wrong that casts a different
light on the situation.  I am a little bit at a loss to explain how
I'm getting these numbers when others have posted results that
appeared to show hint bits making very little difference.

Any insights appreciated.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 1f89e52..9a43e7f 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -602,9 +602,10 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		/*
 		 * If the buffer was dirty, try to write it out.  There is a race
 		 * condition here, in that someone might dirty it after we released it
-		 * above, or even while we are writing it out (since our share-lock
-		 * won't prevent hint-bit updates).  We will recheck the dirty bit
-		 * after re-locking the buffer header.
+		 * above.  (Once we acquire a share-lock on the buffer contents, it's
+		 * no longer possible for it to get marked dirty, though hint bit
+		 * updates could still occur.) We will recheck the dirty bit after
+		 * re-locking the buffer header.
 		 */
 		if (oldFlags & BM_DIRTY)
 		{
@@ -774,10 +775,11 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		LockBufHdr(buf);
 
 		/*
-		 * Somebody could have pinned or re-dirtied the buffer while we were
-		 * doing the I/O and making the new hashtable entry.  If so, we can't
-		 * recycle this buffer; we must undo everything we've done and start
-		 * over with a new victim buffer.
+		 * Somebody could have re-dirtied the buffer before we acquired the
+		 * shared content lock, or pinned it at any time up until we
+		 * re-locked the buffer header.  If so, we can't recycle this
+		 * buffer; we must undo everything we've done and start over with a
+		 * new victim buffer.
 		 */
 		oldFlags = buf->flags;
 		if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
@@ -801,7 +803,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	 * 1 so that the buffer can survive one clock-sweep pass.)
 	 */
 	buf->tag = newTag;
-	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
+	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_HINT_BITS | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
 	if (relpersistence == RELPERSISTENCE_PERMANENT)
 		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
 	else
@@ -976,7 +978,7 @@ MarkBufferDirty(Buffer buffer)
 	if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive)
 		VacuumCostBalance += VacuumCostPageDirty;
 
-	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+	bufHdr->flags |= BM_DIRTY;
 
 	UnlockBufHdr(bufHdr);
 }
@@ -1612,7 +1614,16 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
 		return result;
 	}
 
-	if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
+	/*
+	 * We can get here either because we're checkpointing, or from the
+	 * background writer cleaning scan.  In the latter case, we want to
+	 * write out buffers with hint bit changes so that such changes eventually
+	 * make their way to disk.  In the former case we normally won't get called
+	 * if only hint bit changes have occurred, but if it somehow happens we'll
+	 * just write the page anyway.
+	 */
+	if (!(bufHdr->flags & BM_VALID)
+		|| !(bufHdr->flags & (BM_DIRTY | BM_HINT_BITS)))
 	{
 		/* It's clean, so nothing to do */
 		UnlockBufHdr(bufHdr);
@@ -1878,13 +1889,11 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
 	 * Now it's safe to write buffer to disk. Note that no one else should
 	 * have been able to write it while we were busy with log flushing because
 	 * we have the io_in_progress lock.
+	 *
+	 * The buffer might get marked BM_HINT_BITS while we're doing this, but
+	 * our share-lock on the buffer contents is enough to prevent it from
+	 * being marked BM_DIRTY.
 	 */
-
-	/* To check if block content changes while flushing. - vadim 01/17/97 */
-	LockBufHdr(buf);
-	buf->flags &= ~BM_JUST_DIRTIED;
-	UnlockBufHdr(buf);
-
 	smgrwrite(reln,
 			  buf->tag.forkNum,
 			  buf->tag.blockNum,
@@ -1894,8 +1903,7 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
 	pgBufferUsage.shared_blks_written++;
 
 	/*
-	 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
-	 * end the io_in_progress state.
+	 * Mark the buffer as clean and end the io_in_progress state.
 	 */
 	TerminateBufferIO(buf, true, 0);
 
@@ -2065,7 +2073,10 @@ PrintPinnedBufs(void)
  *
  *		This function writes all dirty pages of a relation out to disk
  *		(or more accurately, out to kernel disk buffers), ensuring that the
- *		kernel has an up-to-date view of the relation.
+ *		kernel has an up-to-date view of the relation.  We flush pages with
+ *		only hint bit changes, too: this probably isn't critical, but since
+ *		this only happens during relatively heavyweight operations, it
+ *		shouldn't cost much.
  *
  *		Generally, the caller should be holding AccessExclusiveLock on the
  *		target relation to ensure that no other backend is busy dirtying
@@ -2094,7 +2105,8 @@ FlushRelationBuffers(Relation rel)
 		{
 			bufHdr = &LocalBufferDescriptors[i];
 			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
-				(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
+				(bufHdr->flags & BM_VALID) &&
+				(bufHdr->flags & (BM_DIRTY | BM_HINT_BITS)))
 			{
 				ErrorContextCallback errcontext;
 
@@ -2110,7 +2122,7 @@ FlushRelationBuffers(Relation rel)
 						  (char *) LocalBufHdrGetBlock(bufHdr),
 						  false);
 
-				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
+				bufHdr->flags &= ~BM_DIRTY;
 
 				/* Pop the error context stack */
 				error_context_stack = errcontext.previous;
@@ -2128,7 +2140,8 @@ FlushRelationBuffers(Relation rel)
 		bufHdr = &BufferDescriptors[i];
 		LockBufHdr(bufHdr);
 		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
-			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
+			(bufHdr->flags & BM_VALID) &&
+			(bufHdr->flags & (BM_DIRTY | BM_HINT_BITS)))
 		{
 			PinBuffer_Locked(bufHdr);
 			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
@@ -2146,7 +2159,10 @@ FlushRelationBuffers(Relation rel)
  *
  *		This function writes all dirty pages of a database out to disk
  *		(or more accurately, out to kernel disk buffers), ensuring that the
- *		kernel has an up-to-date view of the database.
+ *		kernel has an up-to-date view of the database.  We flush pages with
+ *		only hint bit changes, too: this probably isn't critical, but since
+ *		this only happens during relatively heavyweight operations, it
+ *		shouldn't cost much.
  *
  *		Generally, the caller should be holding an appropriate lock to ensure
  *		no other backend is active in the target database; otherwise more
@@ -2170,7 +2186,8 @@ FlushDatabaseBuffers(Oid dbid)
 		bufHdr = &BufferDescriptors[i];
 		LockBufHdr(bufHdr);
 		if (bufHdr->tag.rnode.dbNode == dbid &&
-			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
+			(bufHdr->flags & BM_VALID) &&
+			(bufHdr->flags & (BM_DIRTY | BM_HINT_BITS)))
 		{
 			PinBuffer_Locked(bufHdr);
 			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
@@ -2248,15 +2265,20 @@ IncrBufferRefCount(Buffer buffer)
 /*
  * SetBufferCommitInfoNeedsSave
  *
- *	Mark a buffer dirty when we have updated tuple commit-status bits in it.
- *
- * This is essentially the same as MarkBufferDirty, except that the caller
- * might have only share-lock instead of exclusive-lock on the buffer's
- * content lock.  We preserve the distinction mainly as a way of documenting
- * that the caller has not made a critical data change --- the status-bit
- * update could be redone by someone else just as easily.  Therefore, no WAL
- * log record need be generated, whereas calls to MarkBufferDirty really ought
- * to be associated with a WAL-entry-creating action.
+ * Mark a buffer as containing hint-bit changes.  For local buffers, this
+ * is no different from MarkBufferDirty, except that the caller might have
+ * only share-lock instead of exclusive-lock on the buffer content.  But for
+ * shared buffers, we set BM_HINT_BITS rather than BM_DIRTY.  The background
+ * writer's cleaning scan will still write such buffers to speed up future
+ * access to the tuples stored in those blocks, but checkpoints and writes
+ * forced by buffer allocations will skip such writes, since they are
+ * non-critical.  This allows some hint bit changes to make their way to disk
+ * without (hopefully) consuming too much I/O bandwidth.
+ *
+ * Note that a status-bit update could be redone by someone else just as
+ * easily.  Therefore, no WAL log record need be generated, whereas calls to
+ * MarkBufferDirty really ought to be associated with a WAL-entry-creating
+ * action.
  */
 void
 SetBufferCommitInfoNeedsSave(Buffer buffer)
@@ -2281,20 +2303,19 @@ SetBufferCommitInfoNeedsSave(Buffer buffer)
 	/*
 	 * This routine might get called many times on the same page, if we are
 	 * making the first scan after commit of an xact that added/deleted many
-	 * tuples.	So, be as quick as we can if the buffer is already dirty.  We
-	 * do this by not acquiring spinlock if it looks like the status bits are
-	 * already OK.	(Note it is okay if someone else clears BM_JUST_DIRTIED
-	 * immediately after we look, because the buffer content update is already
-	 * done and will be reflected in the I/O.)
+	 * tuples.	So, be as quick as we can if the buffer is already marked.  We
+	 * do this by not acquiring spinlock if it looks like BM_HINT_BITS is
+	 * already set.  (If memory-ordering effects cause us to see a stale value,
+	 * the worst thing that can happen is we fail to set BM_HINT_BITS.  But
+	 * that won't result in data corruption, since hint bits are not critical.)
 	 */
-	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
-		(BM_DIRTY | BM_JUST_DIRTIED))
+	if (!(bufHdr->flags & BM_HINT_BITS))
 	{
 		LockBufHdr(bufHdr);
 		Assert(bufHdr->refcount > 0);
-		if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive)
+		if (!(bufHdr->flags & BM_HINT_BITS) && VacuumCostActive)
 			VacuumCostBalance += VacuumCostPageDirty;
-		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+		bufHdr->flags |= BM_HINT_BITS;
 		UnlockBufHdr(bufHdr);
 	}
 }
@@ -2583,8 +2604,8 @@ WaitIO(volatile BufferDesc *buf)
  * io_in_progress lock until he's done.
  *
  * Input operations are only attempted on buffers that are not BM_VALID,
- * and output operations only on buffers that are BM_VALID and BM_DIRTY,
- * so we can always tell if the work is already done.
+ * and output operations only on buffers that are BM_VALID and either BM_DIRTY
+ * or BM_HINT_BITS, so we can always tell if the work is already done.
  *
  * Returns TRUE if we successfully marked the buffer as I/O busy,
  * FALSE if someone else already did the work.
@@ -2620,7 +2641,8 @@ StartBufferIO(volatile BufferDesc *buf, bool forInput)
 
 	/* Once we get here, there is definitely no I/O active on this buffer */
 
-	if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
+	if (forInput ? (buf->flags & BM_VALID) :
+		!(buf->flags & (BM_DIRTY | BM_HINT_BITS)))
 	{
 		/* someone else already did the I/O */
 		UnlockBufHdr(buf);
@@ -2646,10 +2668,11 @@ StartBufferIO(volatile BufferDesc *buf, bool forInput)
  *	We hold the buffer's io_in_progress lock
  *	The buffer is Pinned
  *
- * If clear_dirty is TRUE and BM_JUST_DIRTIED is not set, we clear the
- * buffer's BM_DIRTY flag.  This is appropriate when terminating a
- * successful write.  The check on BM_JUST_DIRTIED is necessary to avoid
- * marking the buffer clean if it was re-dirtied while we were writing.
+ * If clear_dirty is TRUE, we clear the buffer's BM_DIRTY, BM_HINT_BITS,
+ * and BM_CHECKPOINT_NEEDED flags.  It's possible that more hint bits
+ * have been set while the buffer was being written, and that those changes
+ * were not included in the I/O.  But we don't really care, because hint bit
+ * updates are not critical and can be redone later if they're lost.
  *
  * set_flag_bits gets ORed into the buffer's flags.  It must include
  * BM_IO_ERROR in a failure case.  For successful completion it could
@@ -2665,8 +2688,8 @@ TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
 
 	Assert(buf->flags & BM_IO_IN_PROGRESS);
 	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
-	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
-		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
+	if (clear_dirty)
+		buf->flags &= ~(BM_DIRTY | BM_HINT_BITS | BM_CHECKPOINT_NEEDED);
 	buf->flags |= set_flag_bits;
 
 	UnlockBufHdr(buf);
@@ -2704,7 +2727,7 @@ AbortBufferIO(void)
 		Assert(buf->flags & BM_IO_IN_PROGRESS);
 		if (IsForInput)
 		{
-			Assert(!(buf->flags & BM_DIRTY));
+			Assert(!(buf->flags & (BM_DIRTY | BM_HINT_BITS)));
 			/* We'd better not think buffer is valid yet */
 			Assert(!(buf->flags & BM_VALID));
 			UnlockBufHdr(buf);
@@ -2714,7 +2737,7 @@ AbortBufferIO(void)
 			BufFlags	sv_flags;
 
 			sv_flags = buf->flags;
-			Assert(sv_flags & BM_DIRTY);
+			Assert(sv_flags & (BM_DIRTY | BM_HINT_BITS));
 			UnlockBufHdr(buf);
 			/* Issue notice if this is not the first failure... */
 			if (sv_flags & BM_IO_ERROR)
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 0a01ed5..d1d544e 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -248,7 +248,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	 * it's all ours now.
 	 */
 	bufHdr->tag = newTag;
-	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
+	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_HINT_BITS | BM_IO_ERROR);
 	bufHdr->flags |= BM_TAG_VALID;
 	bufHdr->usage_count = 1;
 
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 0652bdf..78a4761 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -34,7 +34,7 @@
 #define BM_TAG_VALID			(1 << 2)		/* tag is assigned */
 #define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in progress */
 #define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
-#define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
+#define BM_HINT_BITS			(1 << 5)		/* hint bits changed */
 #define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
 #define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
 #define BM_PERMANENT			(1 << 8)		/* permanent relation (not unlogged) */