On Thu, Mar 17, 2022 at 04:12:12PM -0700, Nathan Bossart wrote: > It seems unlikely that this will be committed for v15, so I've adjusted the > commitfest entry to v16 and moved it to the next commitfest.
Attached is a rebased version of the patch set (v8). -- Nathan Bossart Amazon Web Services: https://aws.amazon.com
>From 3781795f9b4e448df6bdd24d5cd7c0743b5e2944 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <bossa...@amazon.com> Date: Wed, 10 Nov 2021 18:35:14 +0000 Subject: [PATCH v8 1/2] Move WAL segment creation logic to its own function. --- src/backend/access/transam/xlog.c | 103 +-------------------------- src/backend/storage/file/fd.c | 114 ++++++++++++++++++++++++++++++ src/include/storage/fd.h | 1 + 3 files changed, 116 insertions(+), 102 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index a7814d4019..87d71e2008 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2918,11 +2918,9 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, bool *added, char *path) { char tmppath[MAXPGPATH]; - PGAlignedXLogBlock zbuffer; XLogSegNo installed_segno; XLogSegNo max_segno; int fd; - int save_errno; Assert(logtli != 0); @@ -2952,106 +2950,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, elog(DEBUG2, "creating and filling new WAL file"); snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); - - unlink(tmppath); - - /* do not use get_sync_bit() here --- want to fsync only at end of fill */ - fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); - if (fd < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create file \"%s\": %m", tmppath))); - - memset(zbuffer.data, 0, XLOG_BLCKSZ); - - pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE); - save_errno = 0; - if (wal_init_zero) - { - struct iovec iov[PG_IOV_MAX]; - int blocks; - - /* - * Zero-fill the file. With this setting, we do this the hard way to - * ensure that all the file space has really been allocated. On - * platforms that allow "holes" in files, just seeking to the end - * doesn't allocate intermediate space. This way, we know that we - * have all the space and (after the fsync below) that all the - * indirect blocks are down on disk. 
Therefore, fdatasync(2) or - * O_DSYNC will be sufficient to sync future writes to the log file. - */ - - /* Prepare to write out a lot of copies of our zero buffer at once. */ - for (int i = 0; i < lengthof(iov); ++i) - { - iov[i].iov_base = zbuffer.data; - iov[i].iov_len = XLOG_BLCKSZ; - } - - /* Loop, writing as many blocks as we can for each system call. */ - blocks = wal_segment_size / XLOG_BLCKSZ; - for (int i = 0; i < blocks;) - { - int iovcnt = Min(blocks - i, lengthof(iov)); - off_t offset = i * XLOG_BLCKSZ; - - if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0) - { - save_errno = errno; - break; - } - - i += iovcnt; - } - } - else - { - /* - * Otherwise, seeking to the end and writing a solitary byte is - * enough. - */ - errno = 0; - if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1) - { - /* if write didn't set errno, assume no disk space */ - save_errno = errno ? errno : ENOSPC; - } - } - pgstat_report_wait_end(); - - if (save_errno) - { - /* - * If we fail to make the file, delete it to release disk space - */ - unlink(tmppath); - - close(fd); - - errno = save_errno; - - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not write to file \"%s\": %m", tmppath))); - } - - pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC); - if (pg_fsync(fd) != 0) - { - int save_errno = errno; - - close(fd); - errno = save_errno; - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not fsync file \"%s\": %m", tmppath))); - } - pgstat_report_wait_end(); - - if (close(fd) != 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not close file \"%s\": %m", tmppath))); + CreateEmptyWalSegment(tmppath); /* * Now move the segment into place with its final name. 
Cope with diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 14b77f2861..4efc46460e 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -3891,3 +3891,117 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) return sum; } + +/* + * CreateEmptyWalSegment + * + * Create a new file that can be used as a new WAL segment. The caller is + * responsible for installing the new file in pg_wal. + */ +void +CreateEmptyWalSegment(const char *path) +{ + PGAlignedXLogBlock zbuffer; + int fd; + int save_errno; + + unlink(path); + + /* do not use get_sync_bit() here --- want to fsync only at end of fill */ + fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", path))); + + memset(zbuffer.data, 0, XLOG_BLCKSZ); + + pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE); + save_errno = 0; + if (wal_init_zero) + { + struct iovec iov[PG_IOV_MAX]; + int blocks; + + /* + * Zero-fill the file. With this setting, we do this the hard way to + * ensure that all the file space has really been allocated. On + * platforms that allow "holes" in files, just seeking to the end + * doesn't allocate intermediate space. This way, we know that we + * have all the space and (after the fsync below) that all the + * indirect blocks are down on disk. Therefore, fdatasync(2) or + * O_DSYNC will be sufficient to sync future writes to the log file. + */ + + /* Prepare to write out a lot of copies of our zero buffer at once. */ + for (int i = 0; i < lengthof(iov); ++i) + { + iov[i].iov_base = zbuffer.data; + iov[i].iov_len = XLOG_BLCKSZ; + } + + /* Loop, writing as many blocks as we can for each system call. 
*/ + blocks = wal_segment_size / XLOG_BLCKSZ; + for (int i = 0; i < blocks;) + { + int iovcnt = Min(blocks - i, lengthof(iov)); + off_t offset = i * XLOG_BLCKSZ; + + if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0) + { + save_errno = errno; + break; + } + + i += iovcnt; + } + } + else + { + /* + * Otherwise, seeking to the end and writing a solitary byte is + * enough. + */ + errno = 0; + if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1) + { + /* if write didn't set errno, assume no disk space */ + save_errno = errno ? errno : ENOSPC; + } + } + pgstat_report_wait_end(); + + if (save_errno) + { + /* + * If we fail to make the file, delete it to release disk space + */ + unlink(path); + + close(fd); + + errno = save_errno; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", path))); + } + + pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC); + if (pg_fsync(fd) != 0) + { + int save_errno = errno; + + close(fd); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", path))); + } + pgstat_report_wait_end(); + + if (close(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); +} diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 69549b000f..6bb9e3525b 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -190,6 +190,7 @@ extern int durable_unlink(const char *fname, int loglevel); extern int durable_rename_excl(const char *oldfile, const char *newfile, int loglevel); extern void SyncDataDirectory(void); extern int data_sync_elevel(int elevel); +extern void CreateEmptyWalSegment(const char *path); /* Filename components */ #define PG_TEMP_FILES_DIR "pgsql_tmp" -- 2.25.1
>From 6423b98e2f367623791288f1ce2b0d98887eb605 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nathandboss...@gmail.com> Date: Fri, 8 Apr 2022 13:27:31 -0700 Subject: [PATCH v8 2/2] WAL segment pre-allocation. --- doc/src/sgml/config.sgml | 23 ++ src/backend/access/transam/xlog.c | 60 ++++- src/backend/postmaster/checkpointer.c | 234 ++++++++++++++++++ src/backend/replication/basebackup.c | 6 +- src/backend/storage/file/fd.c | 22 +- src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/utils/misc/guc.c | 11 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/bin/initdb/initdb.c | 1 + src/bin/pg_basebackup/bbstreamer_file.c | 3 +- src/bin/pg_basebackup/pg_basebackup.c | 13 + src/bin/pg_basebackup/t/010_pg_basebackup.pl | 4 +- src/bin/pg_resetwal/pg_resetwal.c | 48 ++++ src/include/postmaster/bgwriter.h | 5 + src/include/storage/fd.h | 2 +- 15 files changed, 414 insertions(+), 20 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 6e3e27bed7..ae55038b6c 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -3219,6 +3219,29 @@ include_dir 'conf.d' </listitem> </varlistentry> + <varlistentry id="guc-wal-preallocate-max-size" xreflabel="wal_preallocate_max_size"> + <term><varname>wal_preallocate_max_size</varname> (<type>integer</type>) + <indexterm> + <primary><varname>wal_preallocate_max_size</varname> configuration parameter</primary> + </indexterm> + </term> + <listitem> + <para> + The maximum amount of WAL to pre-allocate for use when a new WAL segment + must be initialized. Setting this parameter to a size less than the WAL + segment size disables this behavior. The default value is 64 megabytes + (<literal>64MB</literal>). This parameter can only be set in the + <filename>postgresql.conf</filename> file or on the server command line. 
+ </para> + + <para> + WAL pre-allocation may improve performance in scenarios where new WAL + segments must be created (e.g., during checkpoints when WAL segments + cannot be recycled). + </para> + </listitem> + </varlistentry> + <varlistentry id="guc-wal-buffers" xreflabel="wal_buffers"> <term><varname>wal_buffers</varname> (<type>integer</type>) <indexterm> diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 87d71e2008..f7fc1c2fb4 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2921,6 +2921,8 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, XLogSegNo installed_segno; XLogSegNo max_segno; int fd; + int prealloc_segs; + bool found_prealloc = false; Assert(logtli != 0); @@ -2942,15 +2944,45 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, return fd; /* - * Initialize an empty (all zeroes) segment. NOTE: it is possible that - * another process is doing the same thing. If so, we will end up - * pre-creating an extra log segment. That seems OK, and better than - * holding the lock throughout this lengthy process. + * Try to use a pre-allocated segment, if one exists. If none are + * available, we fall back to creating a new segment on our own. + * + * Note that we still look for a pre-allocated segment even if the pre- + * allocation functionality is disabled via the GUCs. This ensures that any + * pre-allocated segments left over after turning off the pre-allocation + * functionality are still eligible for use. 
*/ - elog(DEBUG2, "creating and filling new WAL file"); + LWLockAcquire(WALPreallocationLock, LW_EXCLUSIVE); + prealloc_segs = GetNumPreallocatedWalSegs(); + if (prealloc_segs > 0) + { + elog(DEBUG2, "using pre-allocated WAL file"); - snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); - CreateEmptyWalSegment(tmppath); + found_prealloc = true; + prealloc_segs--; + SetNumPreallocatedWalSegs(prealloc_segs); + snprintf(tmppath, MAXPGPATH, "%s/preallocated_segments/xlogtemp.%d", + XLOGDIR, prealloc_segs); + } + else + { + /* + * We're not using a pre-allocated segment, so there's no need to keep + * holding the WALPreallocationLock. + */ + LWLockRelease(WALPreallocationLock); + + /* + * Initialize an empty (all zeroes) segment. NOTE: it is possible that + * another process is doing the same thing. If so, we will end up + * pre-creating an extra log segment. That seems OK, and better than + * holding the lock throughout this lengthy process. + */ + elog(DEBUG2, "creating and filling new WAL file"); + + snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); + CreateEmptyWalSegment(tmppath, ERROR); + } /* * Now move the segment into place with its final name. Cope with @@ -2973,7 +3005,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, logtli)) { *added = true; - elog(DEBUG2, "done creating and filling new WAL file"); + elog(DEBUG2, "done installing new WAL file"); } else { @@ -2986,6 +3018,18 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, elog(DEBUG2, "abandoned new WAL file"); } + /* + * If we are using a pre-allocated segment, we've been holding onto the + * WALPreallocationLock all this time so that the checkpointer process can't + * overwrite the file before we've installed it. + * + * While we're at it, also nudge the checkpointer process so that it pre- + * allocates new segments if possible. 
+ */ + if (found_prealloc) + LWLockRelease(WALPreallocationLock); + RequestWalPreallocation(); + return -1; } diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index c937c39f50..78ff4d14b2 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -128,6 +128,9 @@ typedef struct uint32 num_backend_writes; /* counts user backend buffer writes */ uint32 num_backend_fsync; /* counts user backend fsync calls */ + bool wal_prealloc_requested; /* protected by ckpt_lck */ + int num_prealloc_segs; /* protected by WALPreallocationLock */ + int num_requests; /* current # of requests */ int max_requests; /* allocated array size */ CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER]; @@ -144,6 +147,7 @@ static CheckpointerShmemStruct *CheckpointerShmem; int CheckPointTimeout = 300; int CheckPointWarning = 30; double CheckPointCompletionTarget = 0.9; +int wal_prealloc_max_size_mb = 0; /* * Private state @@ -166,6 +170,10 @@ static bool IsCheckpointOnSchedule(double progress); static bool ImmediateCheckpointRequested(void); static bool CompactCheckpointerRequestQueue(void); static void UpdateSharedMemoryConfig(void); +static void ScanForExistingPreallocatedSegments(void); +static bool InstallPreallocatedWalSeg(const char *path); +static void DoWalPreAllocation(void); +static int GetMaxPreallocatedWalSegs(void); /* Signal handlers */ static void ReqCheckpointHandler(SIGNAL_ARGS); @@ -340,6 +348,11 @@ CheckpointerMain(void) */ ProcGlobal->checkpointerLatch = &MyProc->procLatch; + /* + * Look for any leftover pre-allocated segments we can use. + */ + ScanForExistingPreallocatedSegments(); + /* * Loop forever */ @@ -360,6 +373,11 @@ CheckpointerMain(void) AbsorbSyncRequests(); HandleCheckpointerInterrupts(); + /* + * First do WAL pre-allocation. + */ + DoWalPreAllocation(); + /* * Detect a pending checkpoint request by checking whether the flags * word in shared memory is nonzero. 
We shouldn't need to acquire the @@ -513,6 +531,12 @@ CheckpointerMain(void) if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags) continue; + /* + * Also skip sleeping if WAL pre-allocation has been requested. + */ + if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->wal_prealloc_requested) + continue; + /* * Sleep until we are signaled or it's time for another checkpoint or * xlog file switch. @@ -692,6 +716,8 @@ ImmediateCheckpointRequested(void) * * 'progress' is an estimate of how much of the work has been done, as a * fraction between 0.0 meaning none, and 1.0 meaning all done. + * + * This function also takes care of handling any WAL pre-allocation requests. */ void CheckpointWriteDelay(int flags, double progress) @@ -702,6 +728,15 @@ CheckpointWriteDelay(int flags, double progress) if (!AmCheckpointerProcess()) return; + /* + * WAL pre-allocation has nothing to do with throttling BufferSync()'s write + * rate. However, since this function is called frequently during + * checkpoints, it is a convenient place to handle any pending WAL pre- + * allocation requests. + */ + if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->wal_prealloc_requested) + DoWalPreAllocation(); + /* * Perform the usual duties and take a nap, unless we're behind schedule, * in which case we just try to catch up as quickly as possible. @@ -834,6 +869,203 @@ IsCheckpointOnSchedule(double progress) return true; } +/* + * GetNumPreallocatedWalSegs + * + * Returns the number of pre-allocated WAL segments in the preallocated_segments + * directory. Callers are expected to hold the WALPreallocationLock. + */ +int +GetNumPreallocatedWalSegs(void) +{ + Assert(LWLockHeldByMe(WALPreallocationLock)); + return CheckpointerShmem->num_prealloc_segs; +} + +/* + * SetNumPreallocatedWalSegs + * + * Sets the number of pre-allocated WAL segments in the preallocated_segments + * directory. Callers are expected to hold the WALPreallocationLock + * exclusively. 
+ */ +void +SetNumPreallocatedWalSegs(int i) +{ + Assert(LWLockHeldByMeInMode(WALPreallocationLock, LW_EXCLUSIVE)); + CheckpointerShmem->num_prealloc_segs = i; +} + +/* + * ScanForExistingPreallocatedSegments + * + * This function searches through pg_wal/preallocated_segments for any segments + * that were left over and sets the tracking variable in shared memory + * accordingly. + */ +static void +ScanForExistingPreallocatedSegments(void) +{ + int i = 0; + + /* + * fsync the preallocated_segments directory in case any renames have yet to + * be flushed to disk. + */ + fsync_fname_ext(XLOGDIR "/preallocated_segments", true, false, FATAL); + + /* + * Gather all the preallocated segments we can find. + */ + while (true) + { + FILE *fd; + char path[MAXPGPATH]; + + snprintf(path, MAXPGPATH, "%s/preallocated_segments/xlogtemp.%d", + XLOGDIR, i); + + fd = AllocateFile(path, "r"); + if (fd != NULL) + { + FreeFile(fd); + i++; + } + else + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + break; + } + } + + LWLockAcquire(WALPreallocationLock, LW_EXCLUSIVE); + SetNumPreallocatedWalSegs(i); + LWLockRelease(WALPreallocationLock); + + elog(DEBUG2, "found %d preallocated segments during startup", i); +} + +/* + * InstallPreallocatedWalSeg + * + * Renames the file at "path" to the next open pre-allocated segment slot and + * bumps up num_prealloc_segs. If there is a problem, a WARNING is emitted, we + * attempt to delete the file, and false is returned. Otherwise, true is + * returned. 
+ */ +static bool +InstallPreallocatedWalSeg(const char *path) +{ + char newpath[MAXPGPATH]; + int rc; + + LWLockAcquire(WALPreallocationLock, LW_EXCLUSIVE); + + snprintf(newpath, MAXPGPATH, "%s/preallocated_segments/xlogtemp.%d", + XLOGDIR, CheckpointerShmem->num_prealloc_segs); + + rc = durable_rename(path, newpath, DEBUG1); + if (rc == 0) + CheckpointerShmem->num_prealloc_segs++; + else + { + ereport(WARNING, + (errcode_for_file_access(), + errmsg("file \"%s\" could not be renamed to \"%s\": %m", + path, newpath))); + + (void) durable_unlink(path, DEBUG1); + (void) durable_unlink(newpath, DEBUG1); + } + + LWLockRelease(WALPreallocationLock); + + return (rc == 0); +} + +/* + * DoWalPreAllocation + * + * Tries to allocate up to wal_preallocate_max_size worth of WAL. + */ +static void +DoWalPreAllocation(void) +{ + int segs_to_prealloc; + + /* + * Reset the request flag. + */ + SpinLockAcquire(&CheckpointerShmem->ckpt_lck); + CheckpointerShmem->wal_prealloc_requested = false; + SpinLockRelease(&CheckpointerShmem->ckpt_lck); + + /* + * Determine how many segments to pre-allocate. + */ + LWLockAcquire(WALPreallocationLock, LW_SHARED); + segs_to_prealloc = GetMaxPreallocatedWalSegs() - GetNumPreallocatedWalSegs(); + LWLockRelease(WALPreallocationLock); + + /* + * Do the pre-allocation. + */ + for (int i = 0; i < segs_to_prealloc; i++) + { + char tmppath[MAXPGPATH]; + + snprintf(tmppath, MAXPGPATH, "%s/preallocated_segments/xlogtemp", XLOGDIR); + + if (!CreateEmptyWalSegment(tmppath, WARNING) || + !InstallPreallocatedWalSeg(tmppath)) + { + elog(DEBUG2, "failed to pre-allocate WAL segment"); + return; + } + else + elog(DEBUG2, "pre-allocated WAL segment"); + + /* Check for barrier events. */ + if (ProcSignalBarrierPending) + ProcessProcSignalBarrier(); + + if (ShutdownRequestPending) + return; + } +} + +/* + * GetMaxPreallocatedWalSegs + * + * Returns the maximum number of pre-allocated WAL segments to create based on + * the current value of wal_preallocate_max_size. 
+ */ +static int +GetMaxPreallocatedWalSegs(void) +{ + return wal_prealloc_max_size_mb / (wal_segment_size / (1024 * 1024)); +} + +/* + * RequestWalPreallocation + * + * Requests that more segments be pre-allocated for future use. + */ +void +RequestWalPreallocation(void) +{ + SpinLockAcquire(&CheckpointerShmem->ckpt_lck); + CheckpointerShmem->wal_prealloc_requested = true; + SpinLockRelease(&CheckpointerShmem->ckpt_lck); + + if (ProcGlobal->checkpointerLatch && + GetMaxPreallocatedWalSegs() > 0) + SetLatch(ProcGlobal->checkpointerLatch); +} + /* -------------------------------- * signal handler routines @@ -907,6 +1139,8 @@ CheckpointerShmemInit(void) CheckpointerShmem->max_requests = NBuffers; ConditionVariableInit(&CheckpointerShmem->start_cv); ConditionVariableInit(&CheckpointerShmem->done_cv); + CheckpointerShmem->wal_prealloc_requested = false; + CheckpointerShmem->num_prealloc_segs = 0; } } diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index 27e4152446..2be9d199e3 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -1304,11 +1304,13 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, &statbuf, sizeonly); /* - * Also send archive_status directory (by hackishly reusing - * statbuf from above ...). + * Also send archive_status and preallocated_segments (by hackishly + * reusing statbuf from above ...). 
*/ size += _tarWriteHeader(sink, "./pg_wal/archive_status", NULL, &statbuf, sizeonly); + size += _tarWriteHeader(sink, "./pg_wal/preallocated_segments", NULL, + &statbuf, sizeonly); continue; /* don't recurse into pg_wal */ } diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 4efc46460e..05ab0b4c66 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -3898,8 +3898,8 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) * Create a new file that can be used as a new WAL segment. The caller is * responsible for installing the new file in pg_wal. */ -void -CreateEmptyWalSegment(const char *path) +bool +CreateEmptyWalSegment(const char *path, int elevel) { PGAlignedXLogBlock zbuffer; int fd; @@ -3910,9 +3910,12 @@ CreateEmptyWalSegment(const char *path) /* do not use get_sync_bit() here --- want to fsync only at end of fill */ fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); if (fd < 0) - ereport(ERROR, + { + ereport(elevel, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", path))); + return false; + } memset(zbuffer.data, 0, XLOG_BLCKSZ); @@ -3982,9 +3985,10 @@ CreateEmptyWalSegment(const char *path) errno = save_errno; - ereport(ERROR, + ereport(elevel, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", path))); + return false; } pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC); @@ -3994,14 +3998,20 @@ CreateEmptyWalSegment(const char *path) close(fd); errno = save_errno; - ereport(ERROR, + ereport(elevel, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); + return false; } pgstat_report_wait_end(); if (close(fd) != 0) - ereport(ERROR, + { + ereport(elevel, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", path))); + return false; + } + + return true; } diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 
6c7cf6c295..212bda8e7d 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -53,3 +53,4 @@ XactTruncationLock 44 # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 +WALPreallocationLock 48 diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 22b5571a70..0b496ddc9b 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2918,6 +2918,17 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"wal_preallocate_max_size", PGC_SIGHUP, WAL_SETTINGS, + gettext_noop("Sets the maximum amount of WAL to pre-allocate."), + NULL, + GUC_UNIT_MB + }, + &wal_prealloc_max_size_mb, + 64, 0, 102400, + NULL, NULL, NULL + }, + { {"wal_buffers", PGC_POSTMASTER, WAL_SETTINGS, gettext_noop("Sets the number of disk-page buffers in shared memory for WAL."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 94270eb0ec..8238679d25 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -223,6 +223,7 @@ # off, pglz, lz4, zstd, or on #wal_init_zero = on # zero-fill new WAL files #wal_recycle = on # recycle WAL files +#wal_preallocate_max_size = 64MB # amount of WAL to pre-allocate #wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers # (change requires restart) #wal_writer_delay = 200ms # 1-10000 milliseconds diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index ab826da650..0867bd30af 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -212,6 +212,7 @@ static char *extra_options = ""; static const char *const subdirs[] = { "global", "pg_wal/archive_status", + "pg_wal/preallocated_segments", "pg_commit_ts", "pg_dynshmem", "pg_notify", diff --git a/src/bin/pg_basebackup/bbstreamer_file.c b/src/bin/pg_basebackup/bbstreamer_file.c index 393e9f340c..3b932b80a6 
100644 --- a/src/bin/pg_basebackup/bbstreamer_file.c +++ b/src/bin/pg_basebackup/bbstreamer_file.c @@ -293,7 +293,8 @@ extract_directory(const char *filename, mode_t mode) */ if (!((pg_str_endswith(filename, "/pg_wal") || pg_str_endswith(filename, "/pg_xlog") || - pg_str_endswith(filename, "/archive_status")) && + pg_str_endswith(filename, "/archive_status") || + pg_str_endswith(filename, "/preallocated_segments")) && errno == EEXIST)) pg_fatal("could not create directory \"%s\": %m", filename); diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 65dcfff0a0..b9d990cf7c 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -683,6 +683,19 @@ StartLogStreamer(char *startpos, uint32 timeline, char *sysidentifier, if (pg_mkdir_p(statusdir, pg_dir_create_mode) != 0 && errno != EEXIST) pg_fatal("could not create directory \"%s\": %m", statusdir); + + /* + * Also create pg_wal/preallocated_segments if necessary. + */ + if (PQserverVersion(conn) >= 150000) + { + char prealloc_dir[MAXPGPATH]; + + snprintf(prealloc_dir, sizeof(prealloc_dir), "%s/pg_wal/preallocated_segments", + basedir); + if (pg_mkdir_p(prealloc_dir, pg_dir_create_mode) != 0 && errno != EEXIST) + pg_fatal("could not create directory \"%s\": %m", prealloc_dir); + } } /* diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl index 7309ebddea..b3baee781c 100644 --- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl +++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl @@ -204,10 +204,10 @@ SKIP: "check backup dir permissions"); } -# Only archive_status directory should be copied in pg_wal/. +# Only archive_status and preallocated_segments directories should be copied in pg_wal/. is_deeply( [ sort(slurp_dir("$tempdir/backup/pg_wal/")) ], - [ sort qw(. .. archive_status) ], + [ sort qw(. .. 
archive_status preallocated_segments) ], 'no WAL files copied'); # Contents of these directories should not be copied. diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index d4772a2965..dfae77fb78 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -85,6 +85,7 @@ static void RewriteControlFile(void); static void FindEndOfXLOG(void); static void KillExistingXLOG(void); static void KillExistingArchiveStatus(void); +static void KillExistingPreallocatedSegments(void); static void WriteEmptyXLOG(void); static void usage(void); @@ -488,6 +489,7 @@ main(int argc, char *argv[]) RewriteControlFile(); KillExistingXLOG(); KillExistingArchiveStatus(); + KillExistingPreallocatedSegments(); WriteEmptyXLOG(); printf(_("Write-ahead log reset\n")); @@ -1037,6 +1039,52 @@ KillExistingArchiveStatus(void) } +/* + * Remove existing preallocated segments + */ +static void +KillExistingPreallocatedSegments(void) +{ +#define PREALLOCSEGDIR XLOGDIR "/preallocated_segments" + + DIR *xldir; + struct dirent *xlde; + char path[MAXPGPATH + sizeof(PREALLOCSEGDIR)]; + + xldir = opendir(PREALLOCSEGDIR); + if (xldir == NULL) + { + pg_log_error("could not open directory \"%s\": %m", PREALLOCSEGDIR); + exit(1); + } + + while (errno = 0, (xlde = readdir(xldir)) != NULL) + { + if (strncmp(xlde->d_name, "xlogtemp", strlen("xlogtemp")) == 0) + { + snprintf(path, sizeof(path), "%s/%s", PREALLOCSEGDIR, xlde->d_name); + if (unlink(path) < 0) + { + pg_log_error("could not delete file \"%s\": %m", path); + exit(1); + } + } + } + + if (errno) + { + pg_log_error("could not read directory \"%s\": %m", PREALLOCSEGDIR); + exit(1); + } + + if (closedir(xldir)) + { + pg_log_error("could not close directory \"%s\": %m", PREALLOCSEGDIR); + exit(1); + } +} + + /* * Write an empty XLOG file, containing only the checkpoint record * already set up in ControlFile. 
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h index 2511ef451e..9f284fa76b 100644 --- a/src/include/postmaster/bgwriter.h +++ b/src/include/postmaster/bgwriter.h @@ -26,6 +26,7 @@ extern PGDLLIMPORT int BgWriterDelay; extern PGDLLIMPORT int CheckPointTimeout; extern PGDLLIMPORT int CheckPointWarning; extern PGDLLIMPORT double CheckPointCompletionTarget; +extern PGDLLIMPORT int wal_prealloc_max_size_mb; extern void BackgroundWriterMain(void) pg_attribute_noreturn(); extern void CheckpointerMain(void) pg_attribute_noreturn(); @@ -42,4 +43,8 @@ extern void CheckpointerShmemInit(void); extern bool FirstCallSinceLastCheckpoint(void); +extern int GetNumPreallocatedWalSegs(void); +extern void SetNumPreallocatedWalSegs(int i); +extern void RequestWalPreallocation(void); + #endif /* _BGWRITER_H */ diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 6bb9e3525b..bfc32a9b39 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -190,7 +190,7 @@ extern int durable_unlink(const char *fname, int loglevel); extern int durable_rename_excl(const char *oldfile, const char *newfile, int loglevel); extern void SyncDataDirectory(void); extern int data_sync_elevel(int elevel); -extern void CreateEmptyWalSegment(const char *path); +extern bool CreateEmptyWalSegment(const char *path, int elevel); /* Filename components */ #define PG_TEMP_FILES_DIR "pgsql_tmp" -- 2.25.1