On Sat, Sep 19, 2020 at 5:06 PM Thomas Munro <thomas.mu...@gmail.com> wrote: > In the meantime, from the low-hanging-fruit department, here's a new > version of the SLRU-fsync-offload patch. The only changes are a > tweaked commit message, and adoption of C99 designated initialisers > for the function table, so { [SYNC_HANDLER_CLOG] = ... } instead of > relying on humans to make the array order match the enum values. If > there are no comments or objections, I'm planning to commit this quite > soon.
... and CI told me that Windows didn't like my array syntax with the extra trailing comma. Here's one without.
From 89bc77827a73c93914f018fc862439f53ff54a39 Mon Sep 17 00:00:00 2001 From: Thomas Munro <thomas.mu...@gmail.com> Date: Sat, 19 Sep 2020 16:22:06 +1200 Subject: [PATCH v4 1/2] Defer flushing of SLRU files. Previously, we called fsync() after writing out pg_xact, multixact and commit_ts pages due to cache pressure, leading to an I/O stall in user backends and recovery. Collapse requests for the same file into a single system call as part of the next checkpoint, as we already do for relation files. This causes a significant improvement to recovery times. Tested-by: Jakub Wartak <jakub.war...@tomtom.com> Discussion: https://postgr.es/m/CA+hUKGLJ=84yt+nvhkeedauutvhmfq9i-n7k_o50jmq6rpj...@mail.gmail.com --- src/backend/access/transam/clog.c | 13 +++- src/backend/access/transam/commit_ts.c | 12 ++- src/backend/access/transam/multixact.c | 24 +++++- src/backend/access/transam/slru.c | 101 +++++++++++++++++++------ src/backend/access/transam/subtrans.c | 4 +- src/backend/commands/async.c | 5 +- src/backend/storage/lmgr/predicate.c | 4 +- src/backend/storage/sync/sync.c | 28 ++++++- src/include/access/clog.h | 3 + src/include/access/commit_ts.h | 3 + src/include/access/multixact.h | 4 + src/include/access/slru.h | 12 ++- src/include/storage/sync.h | 7 +- 13 files changed, 174 insertions(+), 46 deletions(-) diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 65aa8841f7..304612c159 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -42,6 +42,7 @@ #include "pg_trace.h" #include "pgstat.h" #include "storage/proc.h" +#include "storage/sync.h" /* * Defines for CLOG page sizes. A page is the same BLCKSZ as is used @@ -691,7 +692,8 @@ CLOGShmemInit(void) { XactCtl->PagePrecedes = CLOGPagePrecedes; SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE, - XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER); + XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER, + SYNC_HANDLER_CLOG); } /* @@ -1033,3 +1035,12 @@ clog_redo(XLogReaderState *record) else elog(PANIC, "clog_redo: unknown op code %u", info); } + +/* + * Entrypoint for sync.c to sync clog files. + */ +int +clogsyncfiletag(const FileTag *ftag, char *path) +{ + return slrusyncfiletag(XactCtl, ftag, path); +} diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index 5244b06a2b..913ec9e48d 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -555,7 +555,8 @@ CommitTsShmemInit(void) CommitTsCtl->PagePrecedes = CommitTsPagePrecedes; SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0, CommitTsSLRULock, "pg_commit_ts", - LWTRANCHE_COMMITTS_BUFFER); + LWTRANCHE_COMMITTS_BUFFER, + SYNC_HANDLER_COMMIT_TS); commitTsShared = ShmemInitStruct("CommitTs shared", sizeof(CommitTimestampShared), @@ -1083,3 +1084,12 @@ commit_ts_redo(XLogReaderState *record) else elog(PANIC, "commit_ts_redo: unknown op code %u", info); } + +/* + * Entrypoint for sync.c to sync commit_ts files. + */ +int +committssyncfiletag(const FileTag *ftag, char *path) +{ + return slrusyncfiletag(CommitTsCtl, ftag, path); +} diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index b8bedca04a..344006b0f5 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -1831,11 +1831,13 @@ MultiXactShmemInit(void) SimpleLruInit(MultiXactOffsetCtl, "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0, MultiXactOffsetSLRULock, "pg_multixact/offsets", - LWTRANCHE_MULTIXACTOFFSET_BUFFER); + LWTRANCHE_MULTIXACTOFFSET_BUFFER, + SYNC_HANDLER_MULTIXACT_OFFSET); SimpleLruInit(MultiXactMemberCtl, "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0, MultiXactMemberSLRULock, "pg_multixact/members", - LWTRANCHE_MULTIXACTMEMBER_BUFFER); + LWTRANCHE_MULTIXACTMEMBER_BUFFER, + SYNC_HANDLER_MULTIXACT_MEMBER); /* Initialize our shared state struct */ MultiXactState = ShmemInitStruct("Shared MultiXact State", @@ -3386,3 +3388,21 @@ pg_get_multixact_members(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funccxt); } + +/* + * Entrypoint for sync.c to sync offsets files. + */ +int +multixactoffsetssyncfiletag(const FileTag *ftag, char *path) +{ + return slrusyncfiletag(MultiXactOffsetCtl, ftag, path); +} + +/* + * Entrypoint for sync.c to sync members files. + */ +int +multixactmemberssyncfiletag(const FileTag *ftag, char *path) +{ + return slrusyncfiletag(MultiXactMemberCtl, ftag, path); +} diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 7640f153c2..21d5aa2dc0 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -80,6 +80,18 @@ typedef struct SlruFlushData typedef struct SlruFlushData *SlruFlush; +/* + * Populate a file tag describing a segment file. We only use the segment + * number, since we can derive everything else we need by having separate + * sync handler functions for clog, multixact etc. + */ +#define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \ +( \ + memset(&(a), 0, sizeof(FileTag)), \ + (a).handler = (xx_handler), \ + (a).segno = (xx_segno) \ +) + /* * Macro to mark a buffer slot "most recently used". Note multiple evaluation * of arguments! @@ -173,7 +185,8 @@ SimpleLruShmemSize(int nslots, int nlsns) */ void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, - LWLock *ctllock, const char *subdir, int tranche_id) + LWLock *ctllock, const char *subdir, int tranche_id, + SyncRequestHandler sync_handler) { SlruShared shared; bool found; @@ -251,7 +264,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, * assume caller set PagePrecedes. */ ctl->shared = shared; - ctl->do_fsync = true; /* default behavior */ + ctl->sync_handler = sync_handler; strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir)); } @@ -870,23 +883,31 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) } pgstat_report_wait_end(); - /* - * If not part of Flush, need to fsync now. We assume this happens - * infrequently enough that it's not a performance issue. - */ - if (!fdata) + /* Queue up a sync request for the checkpointer. */ + if (ctl->sync_handler != SYNC_HANDLER_NONE) { - pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC); - if (ctl->do_fsync && pg_fsync(fd) != 0) + FileTag tag; + + INIT_SLRUFILETAG(tag, ctl->sync_handler, segno); + if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false)) { + /* No space to enqueue sync request. Do it synchronously. */ + pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC); + if (pg_fsync(fd) != 0) + { + pgstat_report_wait_end(); + slru_errcause = SLRU_FSYNC_FAILED; + slru_errno = errno; + CloseTransientFile(fd); + return false; + } pgstat_report_wait_end(); - slru_errcause = SLRU_FSYNC_FAILED; - slru_errno = errno; - CloseTransientFile(fd); - return false; } - pgstat_report_wait_end(); + } + /* Close file, unless part of flush request. */ + if (!fdata) + { if (CloseTransientFile(fd) != 0) { slru_errcause = SLRU_CLOSE_FAILED; @@ -1162,21 +1183,11 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied) LWLockRelease(shared->ControlLock); /* - * Now fsync and close any files that were open + * Now close any files that were open */ ok = true; for (i = 0; i < fdata.num_files; i++) { - pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC); - if (ctl->do_fsync && pg_fsync(fdata.fd[i]) != 0) - { - slru_errcause = SLRU_FSYNC_FAILED; - slru_errno = errno; - pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT; - ok = false; - } - pgstat_report_wait_end(); - if (CloseTransientFile(fdata.fd[i]) != 0) { slru_errcause = SLRU_CLOSE_FAILED; @@ -1303,6 +1314,7 @@ SlruDeleteSegment(SlruCtl ctl, int segno) int slotno; char path[MAXPGPATH]; bool did_write; + FileTag tag; /* Clean out any possibly existing references to the segment. */ LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); @@ -1346,6 +1358,17 @@ restart: snprintf(path, MAXPGPATH, "%s/%04X", ctl->Dir, segno); ereport(DEBUG2, (errmsg("removing file \"%s\"", path))); + + /* + * Tell the checkpointer to forget any sync requests, before we unlink the + * file. + */ + if (ctl->sync_handler != SYNC_HANDLER_NONE) + { + INIT_SLRUFILETAG(tag, ctl->sync_handler, segno); + RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true); + } + unlink(path); LWLockRelease(shared->ControlLock); @@ -1444,3 +1467,31 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data) return retval; } + +/* + * Individual SLRUs (clog, ...) have to provide a sync.c handler function so + * that they can provide the correct "SlruCtl" (otherwise we don't know how to + * build the path), but they just forward to this common implementation that + * performs the fsync. + */ +int +slrusyncfiletag(SlruCtl ctl, const FileTag *ftag, char *path) +{ + int fd; + int save_errno; + int result; + + SlruFileName(ctl, path, ftag->segno); + + fd = OpenTransientFile(path, O_RDWR | PG_BINARY); + if (fd < 0) + return -1; + + result = pg_fsync(fd); + save_errno = errno; + + CloseTransientFile(fd); + + errno = save_errno; + return result; +} diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index a50f60b99a..16867c9d1e 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -193,9 +193,7 @@ SUBTRANSShmemInit(void) SubTransCtl->PagePrecedes = SubTransPagePrecedes; SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0, SubtransSLRULock, "pg_subtrans", - LWTRANCHE_SUBTRANS_BUFFER); - /* Override default assumption that writes should be fsync'd */ - SubTransCtl->do_fsync = false; + LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE); } /* diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index cb341365df..8dbcace3f9 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -554,9 +554,8 @@ AsyncShmemInit(void) */ NotifyCtl->PagePrecedes = asyncQueuePagePrecedes; SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS, 0, - NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER); - /* Override default assumption that writes should be fsync'd */ - NotifyCtl->do_fsync = false; + NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER, + SYNC_HANDLER_NONE); if (!found) { diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index a2f8e7524b..577914a2b8 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -821,9 +821,7 @@ SerialInit(void) SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically; SimpleLruInit(SerialSlruCtl, "Serial", NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial", - LWTRANCHE_SERIAL_BUFFER); - /* Override default assumption that writes should be fsync'd */ - SerialSlruCtl->do_fsync = false; + LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE); /* * Create or attach to the SerialControl structure. diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c index 3ded2cdd71..1d635d596c 100644 --- a/src/backend/storage/sync/sync.c +++ b/src/backend/storage/sync/sync.c @@ -18,6 +18,9 @@ #include <fcntl.h> #include <sys/file.h> +#include "access/commit_ts.h" +#include "access/clog.h" +#include "access/multixact.h" #include "access/xlog.h" #include "access/xlogutils.h" #include "commands/tablespace.h" @@ -90,12 +93,31 @@ typedef struct SyncOps const FileTag *candidate); } SyncOps; +/* + * These indexes must correspond to the values of the SyncRequestHandler enum. + */ static const SyncOps syncsw[] = { /* magnetic disk */ - { + [SYNC_HANDLER_MD] = { .sync_syncfiletag = mdsyncfiletag, .sync_unlinkfiletag = mdunlinkfiletag, .sync_filetagmatches = mdfiletagmatches + }, + /* pg_xact */ + [SYNC_HANDLER_CLOG] = { + .sync_syncfiletag = clogsyncfiletag + }, + /* pg_commit_ts */ + [SYNC_HANDLER_COMMIT_TS] = { + .sync_syncfiletag = committssyncfiletag + }, + /* pg_multixact/offsets */ + [SYNC_HANDLER_MULTIXACT_OFFSET] = { + .sync_syncfiletag = multixactoffsetssyncfiletag + }, + /* pg_multixact/members */ + [SYNC_HANDLER_MULTIXACT_MEMBER] = { + .sync_syncfiletag = multixactmemberssyncfiletag } }; @@ -505,8 +527,8 @@ RememberSyncRequest(const FileTag *ftag, SyncRequestType type) (void *) ftag, HASH_ENTER, &found); - /* if new entry, initialize it */ - if (!found) + /* if new entry, or was previously canceled, initialize it */ + if (!found || entry->canceled) { entry->cycle_ctr = sync_cycle_ctr; entry->canceled = false; diff --git a/src/include/access/clog.h b/src/include/access/clog.h index 2db8acb189..d97b9042dc 100644 --- a/src/include/access/clog.h +++ b/src/include/access/clog.h @@ -12,6 +12,7 @@ #define CLOG_H #include "access/xlogreader.h" +#include "storage/sync.h" #include "lib/stringinfo.h" /* @@ -50,6 +51,8 @@ extern void CheckPointCLOG(void); extern void ExtendCLOG(TransactionId newestXact); extern void TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid); +extern int clogsyncfiletag(const FileTag *ftag, char *path); + /* XLOG stuff */ #define CLOG_ZEROPAGE 0x00 #define CLOG_TRUNCATE 0x10 diff --git a/src/include/access/commit_ts.h b/src/include/access/commit_ts.h index 2740c02a84..27900ce430 100644 --- a/src/include/access/commit_ts.h +++ b/src/include/access/commit_ts.h @@ -14,6 +14,7 @@ #include "access/xlog.h" #include "datatype/timestamp.h" #include "replication/origin.h" +#include "storage/sync.h" #include "utils/guc.h" @@ -45,6 +46,8 @@ extern void SetCommitTsLimit(TransactionId oldestXact, TransactionId newestXact); extern void AdvanceOldestCommitTsXid(TransactionId oldestXact); +extern int committssyncfiletag(const FileTag *ftag, char *path); + /* XLOG stuff */ #define COMMIT_TS_ZEROPAGE 0x00 #define COMMIT_TS_TRUNCATE 0x10 diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index 6d729008c6..71d6e78063 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -13,6 +13,7 @@ #include "access/xlogreader.h" #include "lib/stringinfo.h" +#include "storage/sync.h" /* @@ -116,6 +117,9 @@ extern bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2); extern bool MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2); +extern int multixactoffsetssyncfiletag(const FileTag *ftag, char *path); +extern int multixactmemberssyncfiletag(const FileTag *ftag, char *path); + extern void AtEOXact_MultiXact(void); extern void AtPrepare_MultiXact(void); extern void PostPrepare_MultiXact(TransactionId xid); diff --git a/src/include/access/slru.h b/src/include/access/slru.h index 61fbc80ef0..2720284157 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -15,6 +15,7 @@ #include "access/xlogdefs.h" #include "storage/lwlock.h" +#include "storage/sync.h" /* @@ -111,10 +112,10 @@ typedef struct SlruCtlData SlruShared shared; /* - * This flag tells whether to fsync writes (true for pg_xact and multixact - * stuff, false for pg_subtrans and pg_notify). + * Which sync handler function to use when handing sync requests over to + * the checkpointer. SYNC_HANDLER_NONE to disable fsync (eg pg_notify). */ - bool do_fsync; + SyncRequestHandler sync_handler; /* * Decide which of two page numbers is "older" for truncation purposes. We @@ -135,7 +136,8 @@ typedef SlruCtlData *SlruCtl; extern Size SimpleLruShmemSize(int nslots, int nlsns); extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, - LWLock *ctllock, const char *subdir, int tranche_id); + LWLock *ctllock, const char *subdir, int tranche_id, + SyncRequestHandler sync_handler); extern int SimpleLruZeroPage(SlruCtl ctl, int pageno); extern int SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, TransactionId xid); @@ -151,6 +153,8 @@ typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage, extern bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data); extern void SlruDeleteSegment(SlruCtl ctl, int segno); +extern int slrusyncfiletag(SlruCtl ctl, const FileTag *ftag, char *path); + /* SlruScanDirectory public callbacks */ extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int segpage, void *data); diff --git a/src/include/storage/sync.h b/src/include/storage/sync.h index e16ab8e711..f32e412e75 100644 --- a/src/include/storage/sync.h +++ b/src/include/storage/sync.h @@ -34,7 +34,12 @@ typedef enum SyncRequestType */ typedef enum SyncRequestHandler { - SYNC_HANDLER_MD = 0 /* md smgr */ + SYNC_HANDLER_MD = 0, + SYNC_HANDLER_CLOG, + SYNC_HANDLER_COMMIT_TS, + SYNC_HANDLER_MULTIXACT_OFFSET, + SYNC_HANDLER_MULTIXACT_MEMBER, + SYNC_HANDLER_NONE } SyncRequestHandler; /* -- 2.20.1