On Thu, Jan 5, 2012 at 6:26 PM, Simon Riggs <si...@2ndquadrant.com> wrote:
> Patch to remove clog contention caused by dirty clog LRU. v2, minor changes, updated for recent commits -- Simon Riggs http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 69b6ef3..f3e08e6 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -594,6 +594,26 @@ CheckPointCLOG(void) TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true); } +/* + * Conditionally flush the CLOG LRU. + * + * When a backend does ExtendCLOG we need to write the CLOG LRU if it is + * dirty. Performing I/O while holding XidGenLock prevents new write + * transactions from starting. To avoid that we flush the CLOG LRU, if + * we think that a page write is due soon, according to a heuristic. + * + * Note that we're reading ShmemVariableCache->nextXid without a lock + * since the exact value doesn't matter as input into our heuristic. + */ +void +CLOGBackgroundFlushLRU(void) +{ + TransactionId xid = ShmemVariableCache->nextXid; + int threshold = (CLOG_XACTS_PER_PAGE - (CLOG_XACTS_PER_PAGE / 4)); + + if (TransactionIdToPgIndex(xid) > threshold) + SlruBackgroundFlushLRUPage(ClogCtl); +} /* * Make sure that CLOG has room for a newly-allocated XID. diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 30538ff..aea6c09 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -885,6 +885,82 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid) } /* + * Identify the LRU slot but just leave it as it is. + * + * Control lock must be held at entry, and will be held at exit. + */ +static int +SlruIdentifyLRUSlot(SlruCtl ctl) +{ + SlruShared shared = ctl->shared; + int slotno; + int cur_count; + int bestslot; + int best_delta; + int best_page_number; + + /* + * If we find any EMPTY slot, just select that one. Else locate the + * least-recently-used slot. + * + * Normally the page_lru_count values will all be different and so + * there will be a well-defined LRU page. But since we allow + * concurrent execution of SlruRecentlyUsed() within + * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages + * acquire the same lru_count values. In that case we break ties by + * choosing the furthest-back page. + * + * In no case will we select the slot containing latest_page_number + * for replacement, even if it appears least recently used. + * + * Notice that this next line forcibly advances cur_lru_count to a + * value that is certainly beyond any value that will be in the + * page_lru_count array after the loop finishes. This ensures that + * the next execution of SlruRecentlyUsed will mark the page newly + * used, even if it's for a page that has the current counter value. + * That gets us back on the path to having good data when there are + * multiple pages with the same lru_count. + */ + cur_count = (shared->cur_lru_count)++; + best_delta = -1; + bestslot = 0; /* no-op, just keeps compiler quiet */ + best_page_number = 0; /* ditto */ + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + int this_delta; + int this_page_number; + + if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) + return slotno; + this_delta = cur_count - shared->page_lru_count[slotno]; + if (this_delta < 0) + { + /* + * Clean up in case shared updates have caused cur_count + * increments to get "lost". We back off the page counts, + * rather than trying to increase cur_count, to avoid any + * question of infinite loops or failure in the presence of + * wrapped-around counts. + */ + shared->page_lru_count[slotno] = cur_count; + this_delta = 0; + } + this_page_number = shared->page_number[slotno]; + if ((this_delta > best_delta || + (this_delta == best_delta && + ctl->PagePrecedes(this_page_number, best_page_number))) && + this_page_number != shared->latest_page_number) + { + bestslot = slotno; + best_delta = this_delta; + best_page_number = this_page_number; + } + } + + return bestslot; +} + +/* * Select the slot to re-use when we need a free slot. * * The target page number is passed because we need to consider the @@ -905,11 +981,8 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno) /* Outer loop handles restart after I/O */ for (;;) { - int slotno; - int cur_count; int bestslot; - int best_delta; - int best_page_number; + int slotno; /* See if page already has a buffer assigned */ for (slotno = 0; slotno < shared->num_slots; slotno++) @@ -919,69 +992,14 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno) return slotno; } - /* - * If we find any EMPTY slot, just select that one. Else locate the - * least-recently-used slot to replace. - * - * Normally the page_lru_count values will all be different and so - * there will be a well-defined LRU page. But since we allow - * concurrent execution of SlruRecentlyUsed() within - * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages - * acquire the same lru_count values. In that case we break ties by - * choosing the furthest-back page. - * - * In no case will we select the slot containing latest_page_number - * for replacement, even if it appears least recently used. - * - * Notice that this next line forcibly advances cur_lru_count to a - * value that is certainly beyond any value that will be in the - * page_lru_count array after the loop finishes. This ensures that - * the next execution of SlruRecentlyUsed will mark the page newly - * used, even if it's for a page that has the current counter value. - * That gets us back on the path to having good data when there are - * multiple pages with the same lru_count. - */ - cur_count = (shared->cur_lru_count)++; - best_delta = -1; - bestslot = 0; /* no-op, just keeps compiler quiet */ - best_page_number = 0; /* ditto */ - for (slotno = 0; slotno < shared->num_slots; slotno++) - { - int this_delta; - int this_page_number; - - if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) - return slotno; - this_delta = cur_count - shared->page_lru_count[slotno]; - if (this_delta < 0) - { - /* - * Clean up in case shared updates have caused cur_count - * increments to get "lost". We back off the page counts, - * rather than trying to increase cur_count, to avoid any - * question of infinite loops or failure in the presence of - * wrapped-around counts. - */ - shared->page_lru_count[slotno] = cur_count; - this_delta = 0; - } - this_page_number = shared->page_number[slotno]; - if ((this_delta > best_delta || - (this_delta == best_delta && - ctl->PagePrecedes(this_page_number, best_page_number))) && - this_page_number != shared->latest_page_number) - { - bestslot = slotno; - best_delta = this_delta; - best_page_number = this_page_number; - } - } + bestslot = SlruIdentifyLRUSlot(ctl); /* - * If the selected page is clean, we're set. + * If the selected page is clean or empty, we're set. */ - if (shared->page_status[bestslot] == SLRU_PAGE_VALID && - !shared->page_dirty[bestslot]) + if (shared->page_status[bestslot] == SLRU_PAGE_EMPTY || + (shared->page_status[bestslot] == SLRU_PAGE_VALID && + !shared->page_dirty[bestslot])) return bestslot; /* @@ -1067,6 +1085,39 @@ SimpleLruFlush(SlruCtl ctl, bool checkpoint) } /* + * Make sure the next victim buffer is clean, so that the next caller of + * SlruSelectLRUPage does not require I/O. + */ +void +SlruBackgroundFlushLRUPage(SlruCtl ctl) +{ + SlruShared shared = ctl->shared; + int bestslot; + + /* + * Notice this takes only a shared lock on the ControlLock. + * We aren't going to change the page/slot allocation, only + * write if needed and reset the dirty status. This is OK + * as long as only one process ever calls this, the bgwriter. + */ + LWLockAcquire(shared->ControlLock, LW_SHARED); + + bestslot = SlruIdentifyLRUSlot(ctl); + + /* + * If the selected page is valid and dirty then write it out. + * It's possible that the page is already write-busy, or in the worst + * case still read-busy. In those cases assume that the write we + * wanted to do just happened and we can go. + */ + if (shared->page_status[bestslot] == SLRU_PAGE_VALID && + shared->page_dirty[bestslot]) + SlruInternalWritePage(ctl, bestslot, NULL); + + LWLockRelease(shared->ControlLock); +} + +/* * Remove all segments before the one holding the passed page number */ void diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 1f8d2d6..66eee36 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -265,6 +265,7 @@ BackgroundWriterMain(void) * Do one cycle of dirty-buffer writing. */ BgBufferSync(); + CLOGBackgroundFlushLRU(); /* Nap for the configured time. */ BgWriterNap(); diff --git a/src/include/access/clog.h b/src/include/access/clog.h index bed3b8c..6376464 100644 --- a/src/include/access/clog.h +++ b/src/include/access/clog.h @@ -40,6 +40,7 @@ extern void StartupCLOG(void); extern void TrimCLOG(void); extern void ShutdownCLOG(void); extern void CheckPointCLOG(void); +extern void CLOGBackgroundFlushLRU(void); extern void ExtendCLOG(TransactionId newestXact); extern void TruncateCLOG(TransactionId oldestXact); diff --git a/src/include/access/slru.h b/src/include/access/slru.h index 41cd484..94d4247 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -144,6 +144,7 @@ extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid); extern void SimpleLruWritePage(SlruCtl ctl, int slotno); extern void SimpleLruFlush(SlruCtl ctl, bool checkpoint); +extern void SlruBackgroundFlushLRUPage(SlruCtl ctl); extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage); typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage, diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index db6380f..c17ae29 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -264,6 +264,11 @@ extern pg_time_t GetLastSegSwitchTime(void); extern XLogRecPtr RequestXLogSwitch(void); /* + * Exported to support background writing + */ +extern void CLOGBackgroundFlushLRU(void); + +/* * These aren't in xlog.h because I'd rather not include fmgr.h there. */ extern Datum pg_start_backup(PG_FUNCTION_ARGS);
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers