On Thu, Jan 5, 2012 at 6:26 PM, Simon Riggs <si...@2ndquadrant.com> wrote:

> Patch to remove clog contention caused by dirty clog LRU.

v2, minor changes, updated for recent commits

-- 
 Simon Riggs                   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 69b6ef3..f3e08e6 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -594,6 +594,26 @@ CheckPointCLOG(void)
 	TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
 }
 
+/*
+ * Conditionally flush the CLOG LRU.
+ *
+ * When a backend does ExtendCLOG we need to write the CLOG LRU if it is
+ * dirty. Performing I/O while holding XidGenLock prevents new write
+ * transactions from starting. To avoid that we flush the CLOG LRU, if
+ * we think that a page write is due soon, according to a heuristic.
+ *
+ * Note that we're reading ShmemVariableCache->nextXid without a lock
+ * since the exact value doesn't matter as input into our heuristic.
+ */
+void
+CLOGBackgroundFlushLRU(void)
+{
+	TransactionId xid = ShmemVariableCache->nextXid;
+	int		threshold = (CLOG_XACTS_PER_PAGE - (CLOG_XACTS_PER_PAGE / 4));
+
+	if (TransactionIdToPgIndex(xid) > threshold)
+		SlruBackgroundFlushLRUPage(ClogCtl);
+}
 
 /*
  * Make sure that CLOG has room for a newly-allocated XID.
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 30538ff..aea6c09 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -885,6 +885,82 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
 }
 
 /*
+ * Identify the LRU slot but just leave it as it is.
+ *
+ * Control lock must be held at entry, and will be held at exit.
+ */
+static int
+SlruIdentifyLRUSlot(SlruCtl ctl)
+{
+	SlruShared	shared = ctl->shared;
+	int			slotno;
+	int			cur_count;
+	int			bestslot;
+	int			best_delta;
+	int			best_page_number;
+
+	/*
+	 * If we find any EMPTY slot, just select that one. Else locate the
+	 * least-recently-used slot.
+	 *
+	 * Normally the page_lru_count values will all be different and so
+	 * there will be a well-defined LRU page.  But since we allow
+	 * concurrent execution of SlruRecentlyUsed() within
+	 * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
+	 * acquire the same lru_count values.  In that case we break ties by
+	 * choosing the furthest-back page.
+	 *
+	 * In no case will we select the slot containing latest_page_number
+	 * for replacement, even if it appears least recently used.
+	 *
+	 * Notice that this next line forcibly advances cur_lru_count to a
+	 * value that is certainly beyond any value that will be in the
+	 * page_lru_count array after the loop finishes.  This ensures that
+	 * the next execution of SlruRecentlyUsed will mark the page newly
+	 * used, even if it's for a page that has the current counter value.
+	 * That gets us back on the path to having good data when there are
+	 * multiple pages with the same lru_count.
+	 */
+	cur_count = (shared->cur_lru_count)++;
+	best_delta = -1;
+	bestslot = 0;			/* no-op, just keeps compiler quiet */
+	best_page_number = 0;	/* ditto */
+	for (slotno = 0; slotno < shared->num_slots; slotno++)
+	{
+		int			this_delta;
+		int			this_page_number;
+
+		if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
+			return slotno;
+		this_delta = cur_count - shared->page_lru_count[slotno];
+		if (this_delta < 0)
+		{
+			/*
+			 * Clean up in case shared updates have caused cur_count
+			 * increments to get "lost".  We back off the page counts,
+			 * rather than trying to increase cur_count, to avoid any
+			 * question of infinite loops or failure in the presence of
+			 * wrapped-around counts.
+			 */
+			shared->page_lru_count[slotno] = cur_count;
+			this_delta = 0;
+		}
+		this_page_number = shared->page_number[slotno];
+		if ((this_delta > best_delta ||
+			 (this_delta == best_delta &&
+			  ctl->PagePrecedes(this_page_number, best_page_number))) &&
+			this_page_number != shared->latest_page_number)
+		{
+			bestslot = slotno;
+			best_delta = this_delta;
+			best_page_number = this_page_number;
+		}
+	}
+
+	return bestslot;
+}
+
+/*
  * Select the slot to re-use when we need a free slot.
  *
  * The target page number is passed because we need to consider the
@@ -905,11 +981,8 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 	/* Outer loop handles restart after I/O */
 	for (;;)
 	{
-		int			slotno;
-		int			cur_count;
 		int			bestslot;
-		int			best_delta;
-		int			best_page_number;
+		int			slotno;
 
 		/* See if page already has a buffer assigned */
 		for (slotno = 0; slotno < shared->num_slots; slotno++)
@@ -919,69 +992,14 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 				return slotno;
 		}
 
-		/*
-		 * If we find any EMPTY slot, just select that one. Else locate the
-		 * least-recently-used slot to replace.
-		 *
-		 * Normally the page_lru_count values will all be different and so
-		 * there will be a well-defined LRU page.  But since we allow
-		 * concurrent execution of SlruRecentlyUsed() within
-		 * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
-		 * acquire the same lru_count values.  In that case we break ties by
-		 * choosing the furthest-back page.
-		 *
-		 * In no case will we select the slot containing latest_page_number
-		 * for replacement, even if it appears least recently used.
-		 *
-		 * Notice that this next line forcibly advances cur_lru_count to a
-		 * value that is certainly beyond any value that will be in the
-		 * page_lru_count array after the loop finishes.  This ensures that
-		 * the next execution of SlruRecentlyUsed will mark the page newly
-		 * used, even if it's for a page that has the current counter value.
-		 * That gets us back on the path to having good data when there are
-		 * multiple pages with the same lru_count.
-		 */
-		cur_count = (shared->cur_lru_count)++;
-		best_delta = -1;
-		bestslot = 0;			/* no-op, just keeps compiler quiet */
-		best_page_number = 0;	/* ditto */
-		for (slotno = 0; slotno < shared->num_slots; slotno++)
-		{
-			int			this_delta;
-			int			this_page_number;
-
-			if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
-				return slotno;
-			this_delta = cur_count - shared->page_lru_count[slotno];
-			if (this_delta < 0)
-			{
-				/*
-				 * Clean up in case shared updates have caused cur_count
-				 * increments to get "lost".  We back off the page counts,
-				 * rather than trying to increase cur_count, to avoid any
-				 * question of infinite loops or failure in the presence of
-				 * wrapped-around counts.
-				 */
-				shared->page_lru_count[slotno] = cur_count;
-				this_delta = 0;
-			}
-			this_page_number = shared->page_number[slotno];
-			if ((this_delta > best_delta ||
-				 (this_delta == best_delta &&
-				  ctl->PagePrecedes(this_page_number, best_page_number))) &&
-				this_page_number != shared->latest_page_number)
-			{
-				bestslot = slotno;
-				best_delta = this_delta;
-				best_page_number = this_page_number;
-			}
-		}
+		bestslot = SlruIdentifyLRUSlot(ctl);
 
 		/*
-		 * If the selected page is clean, we're set.
+		 * If the selected page is clean or empty, we're set.
 		 */
-		if (shared->page_status[bestslot] == SLRU_PAGE_VALID &&
-			!shared->page_dirty[bestslot])
+		if (shared->page_status[bestslot] == SLRU_PAGE_EMPTY ||
+			(shared->page_status[bestslot] == SLRU_PAGE_VALID &&
+			!shared->page_dirty[bestslot]))
 			return bestslot;
 
 		/*
@@ -1067,6 +1085,39 @@ SimpleLruFlush(SlruCtl ctl, bool checkpoint)
 }
 
 /*
+ * Make sure the next victim buffer is clean, so that the next caller of
+ * SlruSelectLRUPage does not require I/O.
+ */
+void
+SlruBackgroundFlushLRUPage(SlruCtl ctl)
+{
+	SlruShared	shared = ctl->shared;
+	int			bestslot;
+
+	/*
+	 * Notice this takes only a shared lock on the ControlLock.
+	 * We aren't going to change the page/slot allocation, only
+	 * write if needed and reset the dirty status. This is OK
+	 * as long as only one process ever calls this, the bgwriter.
+	 */
+	LWLockAcquire(shared->ControlLock, LW_SHARED);
+
+	bestslot = SlruIdentifyLRUSlot(ctl);
+
+	/*
+	 * If the selected page is valid and dirty then write it out.
+	 * It's possible that the page is already write-busy, or in the worst
+	 * case still read-busy.  In those cases assume that the write we
+	 * wanted to do just happened and we can go.
+	 */
+	if (shared->page_status[bestslot] == SLRU_PAGE_VALID &&
+		shared->page_dirty[bestslot])
+		SlruInternalWritePage(ctl, bestslot, NULL);
+
+	LWLockRelease(shared->ControlLock);
+}
+
+/*
  * Remove all segments before the one holding the passed page number
  */
 void
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 1f8d2d6..66eee36 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -265,6 +265,7 @@ BackgroundWriterMain(void)
 		 * Do one cycle of dirty-buffer writing.
 		 */
 		BgBufferSync();
+		CLOGBackgroundFlushLRU();
 
 		/* Nap for the configured time. */
 		BgWriterNap();
diff --git a/src/include/access/clog.h b/src/include/access/clog.h
index bed3b8c..6376464 100644
--- a/src/include/access/clog.h
+++ b/src/include/access/clog.h
@@ -40,6 +40,7 @@ extern void StartupCLOG(void);
 extern void TrimCLOG(void);
 extern void ShutdownCLOG(void);
 extern void CheckPointCLOG(void);
+extern void CLOGBackgroundFlushLRU(void);
 extern void ExtendCLOG(TransactionId newestXact);
 extern void TruncateCLOG(TransactionId oldestXact);
 
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 41cd484..94d4247 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -144,6 +144,7 @@ extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno,
 						   TransactionId xid);
 extern void SimpleLruWritePage(SlruCtl ctl, int slotno);
 extern void SimpleLruFlush(SlruCtl ctl, bool checkpoint);
+extern void SlruBackgroundFlushLRUPage(SlruCtl ctl);
 extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage);
 
 typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage,
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index db6380f..c17ae29 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -264,6 +264,11 @@ extern pg_time_t GetLastSegSwitchTime(void);
 extern XLogRecPtr RequestXLogSwitch(void);
 
 /*
+ * Exported to support background writing
+ */
+extern void CLOGBackgroundFlushLRU(void);
+
+/*
  * These aren't in xlog.h because I'd rather not include fmgr.h there.
  */
 extern Datum pg_start_backup(PG_FUNCTION_ARGS);
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to