On Thu, Jan 5, 2012 at 6:26 PM, Simon Riggs <[email protected]> wrote:
> Patch to remove clog contention caused by dirty clog LRU.
v2, minor changes, updated for recent commits
--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 69b6ef3..f3e08e6 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -594,6 +594,26 @@ CheckPointCLOG(void)
TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
}
+/*
+ * Flush the CLOG LRU page ahead of time when a page switch looks near.
+ *
+ * When a backend does ExtendCLOG we need to write the CLOG LRU if it is
+ * dirty. Performing I/O while holding XidGenLock prevents new write
+ * transactions from starting. To avoid that, we flush the CLOG LRU early,
+ * once TransactionIdToPgIndex(nextXid) exceeds 3/4 of CLOG_XACTS_PER_PAGE.
+ *
+ * Note that we're reading ShmemVariableCache->nextXid without a lock
+ * since the exact value doesn't matter as input into our heuristic.
+ */
+void
+CLOGBackgroundFlushLRU(void)
+{
+ TransactionId xid = ShmemVariableCache->nextXid;
+ int threshold = (CLOG_XACTS_PER_PAGE - (CLOG_XACTS_PER_PAGE / 4));
+
+ if (TransactionIdToPgIndex(xid) > threshold)
+ SlruBackgroundFlushLRUPage(ClogCtl);
+}
/*
* Make sure that CLOG has room for a newly-allocated XID.
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 30538ff..aea6c09 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -885,6 +885,82 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
}
/*
+ * Find an EMPTY slot, else the LRU slot; the page itself is left as it is.
+ * Only LRU bookkeeping (cur_lru_count, page_lru_count) may be updated here.
+ * Control lock must be held at entry, and will be held at exit.
+ */
+static int
+SlruIdentifyLRUSlot(SlruCtl ctl)
+{
+ SlruShared shared = ctl->shared;
+ int slotno;
+ int cur_count;
+ int bestslot;
+ int best_delta;
+ int best_page_number;
+
+ /*
+ * If we find any EMPTY slot, just select that one. Else locate the
+ * least-recently-used slot.
+ *
+ * Normally the page_lru_count values will all be different and so
+ * there will be a well-defined LRU page. But since we allow
+ * concurrent execution of SlruRecentlyUsed() within
+ * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
+ * acquire the same lru_count values. In that case we break ties by
+ * choosing the furthest-back page.
+ *
+ * In no case will we select the slot containing latest_page_number
+ * for replacement, even if it appears least recently used.
+ *
+ * Notice that the post-increment below forcibly advances cur_lru_count
+ * to a value that is certainly beyond any value that will be in the
+ * page_lru_count array after the loop finishes. This ensures that
+ * the next execution of SlruRecentlyUsed will mark the page newly
+ * used, even if it's for a page that has the current counter value.
+ * That gets us back on the path to having good data when there are
+ * multiple pages with the same lru_count.
+ */
+ cur_count = (shared->cur_lru_count)++;
+ best_delta = -1;
+ bestslot = 0; /* no-op, just keeps compiler quiet */
+ best_page_number = 0; /* ditto */
+ for (slotno = 0; slotno < shared->num_slots; slotno++)
+ {
+ int this_delta;
+ int this_page_number;
+
+ if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
+ return slotno;
+ this_delta = cur_count - shared->page_lru_count[slotno];
+ if (this_delta < 0)
+ {
+ /*
+ * Clean up in case shared updates have caused cur_count
+ * increments to get "lost". We back off the page counts,
+ * rather than trying to increase cur_count, to avoid any
+ * question of infinite loops or failure in the presence of
+ * wrapped-around counts.
+ */
+ shared->page_lru_count[slotno] = cur_count;
+ this_delta = 0;
+ }
+ this_page_number = shared->page_number[slotno];
+ if ((this_delta > best_delta ||
+ (this_delta == best_delta &&
+ ctl->PagePrecedes(this_page_number, best_page_number))) &&
+ this_page_number != shared->latest_page_number)
+ {
+ bestslot = slotno;
+ best_delta = this_delta;
+ best_page_number = this_page_number;
+ }
+ }
+
+ return bestslot;
+}
+
+/*
* Select the slot to re-use when we need a free slot.
*
* The target page number is passed because we need to consider the
@@ -905,11 +981,8 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
/* Outer loop handles restart after I/O */
for (;;)
{
- int slotno;
- int cur_count;
int bestslot;
- int best_delta;
- int best_page_number;
+ int slotno;
/* See if page already has a buffer assigned */
for (slotno = 0; slotno < shared->num_slots; slotno++)
@@ -919,69 +992,14 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
return slotno;
}
- /*
- * If we find any EMPTY slot, just select that one. Else locate the
- * least-recently-used slot to replace.
- *
- * Normally the page_lru_count values will all be different and so
- * there will be a well-defined LRU page. But since we allow
- * concurrent execution of SlruRecentlyUsed() within
- * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
- * acquire the same lru_count values. In that case we break ties by
- * choosing the furthest-back page.
- *
- * In no case will we select the slot containing latest_page_number
- * for replacement, even if it appears least recently used.
- *
- * Notice that this next line forcibly advances cur_lru_count to a
- * value that is certainly beyond any value that will be in the
- * page_lru_count array after the loop finishes. This ensures that
- * the next execution of SlruRecentlyUsed will mark the page newly
- * used, even if it's for a page that has the current counter value.
- * That gets us back on the path to having good data when there are
- * multiple pages with the same lru_count.
- */
- cur_count = (shared->cur_lru_count)++;
- best_delta = -1;
- bestslot = 0; /* no-op, just keeps compiler quiet */
- best_page_number = 0; /* ditto */
- for (slotno = 0; slotno < shared->num_slots; slotno++)
- {
- int this_delta;
- int this_page_number;
-
- if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
- return slotno;
- this_delta = cur_count - shared->page_lru_count[slotno];
- if (this_delta < 0)
- {
- /*
- * Clean up in case shared updates have caused cur_count
- * increments to get "lost". We back off the page counts,
- * rather than trying to increase cur_count, to avoid any
- * question of infinite loops or failure in the presence of
- * wrapped-around counts.
- */
- shared->page_lru_count[slotno] = cur_count;
- this_delta = 0;
- }
- this_page_number = shared->page_number[slotno];
- if ((this_delta > best_delta ||
- (this_delta == best_delta &&
- ctl->PagePrecedes(this_page_number, best_page_number))) &&
- this_page_number != shared->latest_page_number)
- {
- bestslot = slotno;
- best_delta = this_delta;
- best_page_number = this_page_number;
- }
- }
+ bestslot = SlruIdentifyLRUSlot(ctl);
/*
- * If the selected page is clean, we're set.
+ * If the selected page is clean or empty, we're set.
*/
- if (shared->page_status[bestslot] == SLRU_PAGE_VALID &&
- !shared->page_dirty[bestslot])
+ if (shared->page_status[bestslot] == SLRU_PAGE_EMPTY ||
+ (shared->page_status[bestslot] == SLRU_PAGE_VALID &&
+ !shared->page_dirty[bestslot]))
return bestslot;
/*
@@ -1067,6 +1085,39 @@ SimpleLruFlush(SlruCtl ctl, bool checkpoint)
}
/*
+ * Make sure the next victim buffer is clean, so that the next caller of
+ * SlruSelectLRUPage does not require I/O.
+ */
+void
+SlruBackgroundFlushLRUPage(SlruCtl ctl)
+{
+ SlruShared shared = ctl->shared;
+ int bestslot;
+
+ /*
+ * Notice this takes only a shared lock on the ControlLock, on the
+ * grounds that we don't change the page/slot allocation, only write
+ * if needed and reset the dirty status. Only the bgwriter may call this.
+ * NOTE(review): SlruIdentifyLRUSlot updates LRU counts under this lock.
+ */
+ LWLockAcquire(shared->ControlLock, LW_SHARED);
+
+ bestslot = SlruIdentifyLRUSlot(ctl);
+
+ /*
+ * If the selected page is valid and dirty then write it out. An EMPTY
+ * or clean slot needs nothing. The page may also be write-busy or, in
+ * the worst case, still read-busy; in those cases assume that the write
+ * we wanted to do just happened and we can go.
+ */
+ if (shared->page_status[bestslot] == SLRU_PAGE_VALID &&
+ shared->page_dirty[bestslot])
+ SlruInternalWritePage(ctl, bestslot, NULL);
+
+ LWLockRelease(shared->ControlLock);
+}
+
+/*
* Remove all segments before the one holding the passed page number
*/
void
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 1f8d2d6..66eee36 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -265,6 +265,7 @@ BackgroundWriterMain(void)
* Do one cycle of dirty-buffer writing.
*/
BgBufferSync();
+ CLOGBackgroundFlushLRU();
/* Nap for the configured time. */
BgWriterNap();
diff --git a/src/include/access/clog.h b/src/include/access/clog.h
index bed3b8c..6376464 100644
--- a/src/include/access/clog.h
+++ b/src/include/access/clog.h
@@ -40,6 +40,7 @@ extern void StartupCLOG(void);
extern void TrimCLOG(void);
extern void ShutdownCLOG(void);
extern void CheckPointCLOG(void);
+extern void CLOGBackgroundFlushLRU(void);
extern void ExtendCLOG(TransactionId newestXact);
extern void TruncateCLOG(TransactionId oldestXact);
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 41cd484..94d4247 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -144,6 +144,7 @@ extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno,
TransactionId xid);
extern void SimpleLruWritePage(SlruCtl ctl, int slotno);
extern void SimpleLruFlush(SlruCtl ctl, bool checkpoint);
+extern void SlruBackgroundFlushLRUPage(SlruCtl ctl);
extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage);
typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage,
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index db6380f..c17ae29 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -264,6 +264,11 @@ extern pg_time_t GetLastSegSwitchTime(void);
extern XLogRecPtr RequestXLogSwitch(void);
/*
+ * Exported for the bgwriter. NOTE(review): also declared in access/clog.h.
+ */
+extern void CLOGBackgroundFlushLRU(void);
+
+/*
* These aren't in xlog.h because I'd rather not include fmgr.h there.
*/
extern Datum pg_start_backup(PG_FUNCTION_ARGS);
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers