Hello,

As a very simple exploration of the possible gains from batching redo records during replay, I tried to avoid acquiring and releasing buffer pins and locks while replaying records that touch the same page as the previous record. The attached experiment-grade patch works by trying to give a locked buffer to the next redo handler, which then releases it if it wants a different buffer. Crash recovery on my dev machine went from 62s to 34s (1.8x speedup) for:
create table t (i int, foo text);
insert into t select generate_series(1, 50000000),
                     'the quick brown fox jumped over the lazy dog';
delete from t;

Of course that workload was contrived to produce a suitable WAL history for this demo. The patch doesn't help more common histories from the real world, involving (non-HOT) UPDATEs and indexes etc, because then you have various kinds of interleaving that defeat this simple-minded optimisation.

To get a more general improvement, it seems that we'd need a smarter redo loop that could figure out what can safely be reordered to maximise the page-level batching and locality effects. I haven't studied the complications of reordering yet, and I'm not working on that for PostgreSQL 14, but I wanted to see if others have thoughts about it. The WAL prefetching patch that I am planning to get into 14 opens up these possibilities by decoding many records into a circular WAL decode buffer, so you can see a whole chain of them at once.
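To make the hand-off concrete before diving into the diff, here is a minimal sketch of the calling pattern the patch introduces; the routine name is hypothetical, but the shape mirrors the heapam.c and nbtxlog.c changes below:

/*
 * Hypothetical single-buffer redo routine, illustrating the pattern from the
 * attached patch: finish with XLogKeepBufferForRedo() instead of
 * UnlockReleaseBuffer(), so that the next record touching the same page can
 * skip the buffer lookup, pin and lock.
 */
#include "postgres.h"
#include "access/xlogutils.h"
#include "storage/bufmgr.h"

static void
hypothetical_xlog_redo(XLogReaderState *record)
{
	Buffer		buffer;

	/* May reuse the buffer kept from the previous record if the tag matches. */
	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		/* ... apply the record's change to BufferGetPage(buffer) ... */
		MarkBufferDirty(buffer);
	}

	/* Keep the pin and exclusive content lock for the next record. */
	if (BufferIsValid(buffer))
		XLogKeepBufferForRedo(record, buffer);
}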
From b481d7727d1732705b85e5935887ff4404eb014b Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Fri, 12 Feb 2021 01:08:04 +1300
Subject: [PATCH] Try to hold onto buffers between WAL records.

When replaying a sequence of redo records that touch the same page, try
to avoid having to look up, pin, lock, unlock and unpin the buffer each
time by holding onto one buffer between records.  This aims to speed up
replay of bulk operations.

XXX Are there really many interesting workload where records are
adjacent without (for example) interleaving Heap and Btree records?
Maybe for this to pay off for interleaving cases you might need to look
ahead and find suitable records, and replay them out of order!

XXX There is probably a better word than "keep"

WIP experiment; may contain nuts
---
 src/backend/access/heap/heapam.c        | 12 ++--
 src/backend/access/nbtree/nbtxlog.c     |  8 +--
 src/backend/access/transam/xlog.c       |  6 ++
 src/backend/access/transam/xlogreader.c |  2 +
 src/backend/access/transam/xlogutils.c  | 80 ++++++++++++++++++++++---
 src/backend/catalog/storage.c           |  3 +
 src/include/access/xlogreader.h         |  3 +
 src/include/access/xlogutils.h          |  1 +
 8 files changed, 96 insertions(+), 19 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 9926e2bd54..4750c8ba2d 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -8448,7 +8448,7 @@ heap_xlog_clean(XLogReaderState *record)
 	{
 		Size		freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
 
-		UnlockReleaseBuffer(buffer);
+		XLogKeepBufferForRedo(record, buffer);
 
 		/*
 		 * After cleaning records from a page, it's useful to update the FSM
@@ -8660,7 +8660,7 @@ heap_xlog_freeze_page(XLogReaderState *record)
 		MarkBufferDirty(buffer);
 	}
 	if (BufferIsValid(buffer))
-		UnlockReleaseBuffer(buffer);
+		XLogKeepBufferForRedo(record, buffer);
 }
 
 /*
@@ -8760,7 +8760,7 @@ heap_xlog_delete(XLogReaderState *record)
 		MarkBufferDirty(buffer);
 	}
 	if (BufferIsValid(buffer))
-		UnlockReleaseBuffer(buffer);
+		XLogKeepBufferForRedo(record, buffer);
 }
 
 static void
@@ -8869,7 +8869,7 @@ heap_xlog_insert(XLogReaderState *record)
 		MarkBufferDirty(buffer);
 	}
 	if (BufferIsValid(buffer))
-		UnlockReleaseBuffer(buffer);
+		XLogKeepBufferForRedo(record, buffer);
 
 	/*
 	 * If the page is running low on free space, update the FSM as well.
@@ -9410,7 +9410,7 @@ heap_xlog_lock(XLogReaderState *record)
 		MarkBufferDirty(buffer);
 	}
 	if (BufferIsValid(buffer))
-		UnlockReleaseBuffer(buffer);
+		XLogKeepBufferForRedo(record, buffer);
 }
 
 static void
@@ -9511,7 +9511,7 @@ heap_xlog_inplace(XLogReaderState *record)
 		MarkBufferDirty(buffer);
 	}
 	if (BufferIsValid(buffer))
-		UnlockReleaseBuffer(buffer);
+		XLogKeepBufferForRedo(record, buffer);
 }
 
 void
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index c1d578cc01..4025f97227 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -237,7 +237,7 @@ btree_xlog_insert(bool isleaf, bool ismeta, bool posting,
 		MarkBufferDirty(buffer);
 	}
 	if (BufferIsValid(buffer))
-		UnlockReleaseBuffer(buffer);
+		XLogKeepBufferForRedo(record, buffer);
 
 	/*
 	 * Note: in normal operation, we'd update the metapage while still holding
@@ -553,7 +553,7 @@ btree_xlog_dedup(XLogReaderState *record)
 	}
 
 	if (BufferIsValid(buf))
-		UnlockReleaseBuffer(buf);
+		XLogKeepBufferForRedo(record, buf);
 }
 
 static void
@@ -647,7 +647,7 @@ btree_xlog_vacuum(XLogReaderState *record)
 		MarkBufferDirty(buffer);
 	}
 	if (BufferIsValid(buffer))
-		UnlockReleaseBuffer(buffer);
+		XLogKeepBufferForRedo(record, buffer);
 }
 
 static void
@@ -707,7 +707,7 @@ btree_xlog_delete(XLogReaderState *record)
 		MarkBufferDirty(buffer);
 	}
 	if (BufferIsValid(buffer))
-		UnlockReleaseBuffer(buffer);
+		XLogKeepBufferForRedo(record, buffer);
 }
 
 static void
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 8e3b5df7dc..c626b8a732 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7469,6 +7469,9 @@ StartupXLOG(void)
 				record = ReadRecord(xlogreader, LOG, false);
 			} while (record != NULL);
 
+			/* Release any buffer cached by xlogutil.c. */
+			XLogKeepBufferForRedo(xlogreader, InvalidBuffer);
+
 			/*
 			 * end of main redo apply loop
 			 */
@@ -12017,6 +12020,9 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
 	XLogSegNo	targetSegNo PG_USED_FOR_ASSERTS_ONLY;
 	int			r;
 
+	/* Release any buffer we might be holding onto while we do I/O. */
+	XLogKeepBufferForRedo(xlogreader, InvalidBuffer);
+
 	XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
 	targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
 
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index bb95e0e527..cacc20ab20 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -132,6 +132,8 @@ XLogReaderAllocate(int wal_segment_size, const char *waldir,
 		return NULL;
 	}
 
+	state->kept_buffer = InvalidBuffer;
+
 	return state;
 }
 
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index e723253297..e27f5e2693 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -25,6 +25,7 @@
 #include "access/xlogutils.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "storage/buf_internals.h"
 #include "storage/smgr.h"
 #include "utils/guc.h"
 #include "utils/hsearch.h"
@@ -296,6 +297,26 @@ XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id,
 										 false, buf);
 }
 
+/*
+ * Keep a buffer pinned and locked, so that we can perhaps use it directly when
+ * processing the next record.  This can be used instead of
+ * UnlockReleaseBuffer() during recovery, in simple redo routines that don't
+ * take cleanup locks and only touch one buffer.  Call with InvalidBuffer to
+ * release.
+ */
+void
+XLogKeepBufferForRedo(XLogReaderState *record, Buffer buffer)
+{
+	Assert(!BufferIsValid(buffer) ||
+		   LWLockHeldByMeInMode(BufferDescriptorGetContentLock(GetBufferDescriptor(buffer - 1)),
+								LW_EXCLUSIVE));
+
+	/* Release any buffer we might already be holding onto. */
+	if (BufferIsValid(record->kept_buffer) && record->kept_buffer != buffer)
+		UnlockReleaseBuffer(record->kept_buffer);
+	record->kept_buffer = buffer;
+}
+
 /*
  * Pin and lock a buffer referenced by a WAL record, for the purpose of
  * re-initializing it.
@@ -345,6 +366,27 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
 		elog(PANIC, "failed to locate backup block with ID %d", block_id);
 	}
 
+	/* Do we already have this buffer pinned and suitably locked? */
+	if (BufferIsValid(record->kept_buffer))
+	{
+		bool		can_reuse = false;
+
+		if (mode == RBM_NORMAL && !get_cleanup_lock)
+		{
+			BufferDesc *bufHdr;
+			BufferTag	tag;
+
+			bufHdr = GetBufferDescriptor(record->kept_buffer - 1);
+			Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
+										LW_EXCLUSIVE));
+			INIT_BUFFERTAG(tag, rnode, forknum, blkno);
+			if (BUFFERTAGS_EQUAL(tag, bufHdr->tag))
+				can_reuse = true;
+		}
+		if (!can_reuse)
+			XLogKeepBufferForRedo(record, InvalidBuffer);
+	}
+
 	/*
 	 * Make sure that if the block is marked with WILL_INIT, the caller is
 	 * going to initialize it.  And vice versa.
@@ -360,8 +402,14 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
 	if (XLogRecBlockImageApply(record, block_id))
 	{
 		Assert(XLogRecHasBlockImage(record, block_id));
-		*buf = XLogReadBufferExtended(rnode, forknum, blkno,
-									  get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
+		if (BufferIsValid(record->kept_buffer))
+		{
+			*buf = record->kept_buffer; /* XXX and zero? */
+			record->kept_buffer = InvalidBuffer;	/* transferring ownership */
+		}
+		else
+			*buf = XLogReadBufferExtended(rnode, forknum, blkno,
+										  get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
 		page = BufferGetPage(*buf);
 		if (!RestoreBlockImage(record, block_id, page))
 			elog(ERROR, "failed to restore block image");
@@ -390,16 +438,30 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
 	}
 	else
 	{
-		*buf = XLogReadBufferExtended(rnode, forknum, blkno, mode);
-		if (BufferIsValid(*buf))
+		if (BufferIsValid(record->kept_buffer))
 		{
-			if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK)
+			*buf = record->kept_buffer;
+			record->kept_buffer = InvalidBuffer;	/* transferring ownership */
+			Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(GetBufferDescriptor(*buf - 1)), LW_EXCLUSIVE));
+			Assert(!get_cleanup_lock);
+			Assert(mode == RBM_NORMAL);
+		}
+		else
+		{
+			*buf = XLogReadBufferExtended(rnode, forknum, blkno, mode);
+			if (BufferIsValid(*buf))
 			{
-				if (get_cleanup_lock)
-					LockBufferForCleanup(*buf);
-				else
-					LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);
+				if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK)
+				{
+					if (get_cleanup_lock)
+						LockBufferForCleanup(*buf);
+					else
+						LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);
+				}
 			}
+		}
+		if (BufferIsValid(*buf))
+		{
 			if (lsn <= PageGetLSN(BufferGetPage(*buf)))
 				return BLK_DONE;
 			else
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index cba7a9ada0..6863ace1cb 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -906,6 +906,9 @@ smgr_redo(XLogReaderState *record)
 	XLogRecPtr	lsn = record->EndRecPtr;
 	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
 
+	/* Make sure we don't have any buffers pinned. */
+	XLogKeepBufferForRedo(record, InvalidBuffer);
+
 	/* Backup blocks are not used in smgr records */
 	Assert(!XLogRecHasAnyBlockRefs(record));
 
diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h
index 21d200d3df..d2155f405b 100644
--- a/src/include/access/xlogreader.h
+++ b/src/include/access/xlogreader.h
@@ -252,6 +252,9 @@ struct XLogReaderState
 
 	/* Buffer to hold error message */
 	char	   *errormsg_buf;
+
+	/* Private storage workspace for xlogutil.c. */
+	int			kept_buffer;	/* XXX should be Buffer, but ... ugh */
 };
 
 /* Get a new XLogReader */
diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h
index 9ac602b674..750acdba99 100644
--- a/src/include/access/xlogutils.h
+++ b/src/include/access/xlogutils.h
@@ -40,6 +40,7 @@ extern XLogRedoAction XLogReadBufferForRedoExtended(XLogReaderState *record,
 													uint8 buffer_id,
 													ReadBufferMode mode, bool get_cleanup_lock,
 													Buffer *buf);
+extern void XLogKeepBufferForRedo(XLogReaderState *record, Buffer buffer);
 
 extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 									 BlockNumber blkno, ReadBufferMode mode);
-- 
2.30.0
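In case it is useful while reviewing, the reuse test added to XLogReadBufferForRedoExtended() above can be restated in isolation roughly as follows. kept_buffer_matches() is an invented name for illustration only (not a function the patch adds), and the patch only attempts reuse for RBM_NORMAL without a cleanup lock:

/*
 * Simplified restatement of the reuse check in the patch above, pulled out
 * into a standalone (hypothetical) helper for readability.
 */
#include "postgres.h"
#include "access/xlogreader.h"
#include "storage/buf_internals.h"

static bool
kept_buffer_matches(XLogReaderState *record, RelFileNode rnode,
					ForkNumber forknum, BlockNumber blkno)
{
	BufferDesc *bufHdr;
	BufferTag	tag;

	if (!BufferIsValid(record->kept_buffer))
		return false;

	/* The kept buffer is always held pinned with an exclusive content lock. */
	bufHdr = GetBufferDescriptor(record->kept_buffer - 1);
	Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
								LW_EXCLUSIVE));

	/* Reusable only if it is exactly the page this block reference names. */
	INIT_BUFFERTAG(tag, rnode, forknum, blkno);
	return BUFFERTAGS_EQUAL(tag, bufHdr->tag);
}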