On Thu, Mar 17, 2022 at 04:12:12PM -0700, Nathan Bossart wrote:
> It seems unlikely that this will be committed for v15, so I've adjusted the
> commitfest entry to v16 and moved it to the next commitfest.

rebased

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
From 3781795f9b4e448df6bdd24d5cd7c0743b5e2944 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <bossa...@amazon.com>
Date: Wed, 10 Nov 2021 18:35:14 +0000
Subject: [PATCH v8 1/2] Move WAL segment creation logic to its own function.

---
 src/backend/access/transam/xlog.c | 103 +--------------------------
 src/backend/storage/file/fd.c     | 114 ++++++++++++++++++++++++++++++
 src/include/storage/fd.h          |   1 +
 3 files changed, 116 insertions(+), 102 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index a7814d4019..87d71e2008 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2918,11 +2918,9 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 					 bool *added, char *path)
 {
 	char		tmppath[MAXPGPATH];
-	PGAlignedXLogBlock zbuffer;
 	XLogSegNo	installed_segno;
 	XLogSegNo	max_segno;
 	int			fd;
-	int			save_errno;
 
 	Assert(logtli != 0);
 
@@ -2952,106 +2950,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 	elog(DEBUG2, "creating and filling new WAL file");
 
 	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
-
-	unlink(tmppath);
-
-	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
-	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
-	if (fd < 0)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not create file \"%s\": %m", tmppath)));
-
-	memset(zbuffer.data, 0, XLOG_BLCKSZ);
-
-	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
-	save_errno = 0;
-	if (wal_init_zero)
-	{
-		struct iovec iov[PG_IOV_MAX];
-		int			blocks;
-
-		/*
-		 * Zero-fill the file.  With this setting, we do this the hard way to
-		 * ensure that all the file space has really been allocated.  On
-		 * platforms that allow "holes" in files, just seeking to the end
-		 * doesn't allocate intermediate space.  This way, we know that we
-		 * have all the space and (after the fsync below) that all the
-		 * indirect blocks are down on disk.  Therefore, fdatasync(2) or
-		 * O_DSYNC will be sufficient to sync future writes to the log file.
-		 */
-
-		/* Prepare to write out a lot of copies of our zero buffer at once. */
-		for (int i = 0; i < lengthof(iov); ++i)
-		{
-			iov[i].iov_base = zbuffer.data;
-			iov[i].iov_len = XLOG_BLCKSZ;
-		}
-
-		/* Loop, writing as many blocks as we can for each system call. */
-		blocks = wal_segment_size / XLOG_BLCKSZ;
-		for (int i = 0; i < blocks;)
-		{
-			int			iovcnt = Min(blocks - i, lengthof(iov));
-			off_t		offset = i * XLOG_BLCKSZ;
-
-			if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0)
-			{
-				save_errno = errno;
-				break;
-			}
-
-			i += iovcnt;
-		}
-	}
-	else
-	{
-		/*
-		 * Otherwise, seeking to the end and writing a solitary byte is
-		 * enough.
-		 */
-		errno = 0;
-		if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
-		{
-			/* if write didn't set errno, assume no disk space */
-			save_errno = errno ? errno : ENOSPC;
-		}
-	}
-	pgstat_report_wait_end();
-
-	if (save_errno)
-	{
-		/*
-		 * If we fail to make the file, delete it to release disk space
-		 */
-		unlink(tmppath);
-
-		close(fd);
-
-		errno = save_errno;
-
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not write to file \"%s\": %m", tmppath)));
-	}
-
-	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
-	if (pg_fsync(fd) != 0)
-	{
-		int			save_errno = errno;
-
-		close(fd);
-		errno = save_errno;
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not fsync file \"%s\": %m", tmppath)));
-	}
-	pgstat_report_wait_end();
-
-	if (close(fd) != 0)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not close file \"%s\": %m", tmppath)));
+	CreateEmptyWalSegment(tmppath);
 
 	/*
 	 * Now move the segment into place with its final name.  Cope with
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 14b77f2861..4efc46460e 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -3891,3 +3891,117 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
 
 	return sum;
 }
+
+/*
+ * CreateEmptyWalSegment
+ *
+ * Create a new file that can be used as a new WAL segment.  The caller is
+ * responsible for installing the new file in pg_wal.
+ */
+void
+CreateEmptyWalSegment(const char *path)
+{
+	PGAlignedXLogBlock zbuffer;
+	int			fd;
+	int			save_errno;
+
+	unlink(path);
+
+	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
+	fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+	if (fd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not create file \"%s\": %m", path)));
+
+	memset(zbuffer.data, 0, XLOG_BLCKSZ);
+
+	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
+	save_errno = 0;
+	if (wal_init_zero)
+	{
+		struct iovec iov[PG_IOV_MAX];
+		int			blocks;
+
+		/*
+		 * Zero-fill the file.  With this setting, we do this the hard way to
+		 * ensure that all the file space has really been allocated.  On
+		 * platforms that allow "holes" in files, just seeking to the end
+		 * doesn't allocate intermediate space.  This way, we know that we
+		 * have all the space and (after the fsync below) that all the
+		 * indirect blocks are down on disk.  Therefore, fdatasync(2) or
+		 * O_DSYNC will be sufficient to sync future writes to the log file.
+		 */
+
+		/* Prepare to write out a lot of copies of our zero buffer at once. */
+		for (int i = 0; i < lengthof(iov); ++i)
+		{
+			iov[i].iov_base = zbuffer.data;
+			iov[i].iov_len = XLOG_BLCKSZ;
+		}
+
+		/* Loop, writing as many blocks as we can for each system call. */
+		blocks = wal_segment_size / XLOG_BLCKSZ;
+		for (int i = 0; i < blocks;)
+		{
+			int			iovcnt = Min(blocks - i, lengthof(iov));
+			off_t		offset = i * XLOG_BLCKSZ;
+
+			if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0)
+			{
+				save_errno = errno;
+				break;
+			}
+
+			i += iovcnt;
+		}
+	}
+	else
+	{
+		/*
+		 * Otherwise, seeking to the end and writing a solitary byte is
+		 * enough.
+		 */
+		errno = 0;
+		if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
+		{
+			/* if write didn't set errno, assume no disk space */
+			save_errno = errno ? errno : ENOSPC;
+		}
+	}
+	pgstat_report_wait_end();
+
+	if (save_errno)
+	{
+		/*
+		 * If we fail to make the file, delete it to release disk space
+		 */
+		unlink(path);
+
+		close(fd);
+
+		errno = save_errno;
+
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not write to file \"%s\": %m", path)));
+	}
+
+	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
+	if (pg_fsync(fd) != 0)
+	{
+		int			save_errno = errno;
+
+		close(fd);
+		errno = save_errno;
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not fsync file \"%s\": %m", path)));
+	}
+	pgstat_report_wait_end();
+
+	if (close(fd) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not close file \"%s\": %m", path)));
+}
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 69549b000f..6bb9e3525b 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -190,6 +190,7 @@ extern int	durable_unlink(const char *fname, int loglevel);
 extern int	durable_rename_excl(const char *oldfile, const char *newfile, int loglevel);
 extern void SyncDataDirectory(void);
 extern int	data_sync_elevel(int elevel);
+extern void CreateEmptyWalSegment(const char *path);
 
 /* Filename components */
 #define PG_TEMP_FILES_DIR "pgsql_tmp"
-- 
2.25.1

From 6423b98e2f367623791288f1ce2b0d98887eb605 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandboss...@gmail.com>
Date: Fri, 8 Apr 2022 13:27:31 -0700
Subject: [PATCH v8 2/2] WAL segment pre-allocation.

---
 doc/src/sgml/config.sgml                      |  23 ++
 src/backend/access/transam/xlog.c             |  60 ++++-
 src/backend/postmaster/checkpointer.c         | 234 ++++++++++++++++++
 src/backend/replication/basebackup.c          |   6 +-
 src/backend/storage/file/fd.c                 |  22 +-
 src/backend/storage/lmgr/lwlocknames.txt      |   1 +
 src/backend/utils/misc/guc.c                  |  11 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/bin/initdb/initdb.c                       |   1 +
 src/bin/pg_basebackup/bbstreamer_file.c       |   3 +-
 src/bin/pg_basebackup/pg_basebackup.c         |  13 +
 src/bin/pg_basebackup/t/010_pg_basebackup.pl  |   4 +-
 src/bin/pg_resetwal/pg_resetwal.c             |  48 ++++
 src/include/postmaster/bgwriter.h             |   5 +
 src/include/storage/fd.h                      |   2 +-
 15 files changed, 414 insertions(+), 20 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 6e3e27bed7..ae55038b6c 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -3219,6 +3219,29 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-wal-preallocate-max-size" xreflabel="wal_preallocate_max_size">
+      <term><varname>wal_preallocate_max_size</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>wal_preallocate_max_size</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        The maximum amount of WAL to pre-allocate for use when a new WAL segment
+        must be initialized.  Setting this parameter to a size less than the WAL
+        segment size disables this behavior.  The default value is 64 megabytes
+        (<literal>64MB</literal>).  This parameter can only be set in the
+        <filename>postgresql.conf</filename> file or on the server command line.
+       </para>
+
+       <para>
+        WAL pre-allocation may improve performance in scenarios where new WAL
+        segments must be created (e.g., during checkpoints when WAL segments
+        cannot be recycled).
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-wal-buffers" xreflabel="wal_buffers">
       <term><varname>wal_buffers</varname> (<type>integer</type>)
       <indexterm>
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 87d71e2008..f7fc1c2fb4 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2921,6 +2921,8 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 	XLogSegNo	installed_segno;
 	XLogSegNo	max_segno;
 	int			fd;
+	int			prealloc_segs;
+	bool		found_prealloc = false;
 
 	Assert(logtli != 0);
 
@@ -2942,15 +2944,45 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 		return fd;
 
 	/*
-	 * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
-	 * another process is doing the same thing.  If so, we will end up
-	 * pre-creating an extra log segment.  That seems OK, and better than
-	 * holding the lock throughout this lengthy process.
+	 * Try to use a pre-allocated segment, if one exists.  If none are
+	 * available, we fall back to creating a new segment on our own.
+	 *
+	 * Note that we still look for a pre-allocated segment even if the pre-
+	 * allocation functionality is disabled via the GUCs.  This ensures that any
+	 * pre-allocated segments left over after turning off the pre-allocation
+	 * functionality are still eligible for use.
 	 */
-	elog(DEBUG2, "creating and filling new WAL file");
+	LWLockAcquire(WALPreallocationLock, LW_EXCLUSIVE);
+	prealloc_segs = GetNumPreallocatedWalSegs();
+	if (prealloc_segs > 0)
+	{
+		elog(DEBUG2, "using pre-allocated WAL file");
 
-	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
-	CreateEmptyWalSegment(tmppath);
+		found_prealloc = true;
+		prealloc_segs--;
+		SetNumPreallocatedWalSegs(prealloc_segs);
+		snprintf(tmppath, MAXPGPATH, "%s/preallocated_segments/xlogtemp.%d",
+				 XLOGDIR, prealloc_segs);
+	}
+	else
+	{
+		/*
+		 * We're not using a pre-allocated segment, so there's no need to keep
+		 * holding the WALPreallocationLock.
+		 */
+		LWLockRelease(WALPreallocationLock);
+
+		/*
+		 * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
+		 * another process is doing the same thing.  If so, we will end up
+		 * pre-creating an extra log segment.  That seems OK, and better than
+		 * holding the lock throughout this lengthy process.
+		 */
+		elog(DEBUG2, "creating and filling new WAL file");
+
+		snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
+		CreateEmptyWalSegment(tmppath, ERROR);
+	}
 
 	/*
 	 * Now move the segment into place with its final name.  Cope with
@@ -2973,7 +3005,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 							   logtli))
 	{
 		*added = true;
-		elog(DEBUG2, "done creating and filling new WAL file");
+		elog(DEBUG2, "done installing new WAL file");
 	}
 	else
 	{
@@ -2986,6 +3018,18 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 		elog(DEBUG2, "abandoned new WAL file");
 	}
 
+	/*
+	 * If we are using a pre-allocated segment, we've been holding onto the
+	 * WALPreallocationLock all this time so that the checkpointer process can't
+	 * overwrite the file before we've installed it.
+	 *
+	 * While we're at it, also nudge the checkpointer process so that it pre-
+	 * allocates new segments if possible.
+	 */
+	if (found_prealloc)
+		LWLockRelease(WALPreallocationLock);
+	RequestWalPreallocation();
+
 	return -1;
 }
 
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index c937c39f50..78ff4d14b2 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -128,6 +128,9 @@ typedef struct
 	uint32		num_backend_writes; /* counts user backend buffer writes */
 	uint32		num_backend_fsync;	/* counts user backend fsync calls */
 
+	bool		wal_prealloc_requested;	/* protected by ckpt_lck */
+	int			num_prealloc_segs;	/* protected by WALPreallocationLock */
+
 	int			num_requests;	/* current # of requests */
 	int			max_requests;	/* allocated array size */
 	CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER];
@@ -144,6 +147,7 @@ static CheckpointerShmemStruct *CheckpointerShmem;
 int			CheckPointTimeout = 300;
 int			CheckPointWarning = 30;
 double		CheckPointCompletionTarget = 0.9;
+int			wal_prealloc_max_size_mb = 0;
 
 /*
  * Private state
@@ -166,6 +170,10 @@ static bool IsCheckpointOnSchedule(double progress);
 static bool ImmediateCheckpointRequested(void);
 static bool CompactCheckpointerRequestQueue(void);
 static void UpdateSharedMemoryConfig(void);
+static void ScanForExistingPreallocatedSegments(void);
+static bool InstallPreallocatedWalSeg(const char *path);
+static void DoWalPreAllocation(void);
+static int GetMaxPreallocatedWalSegs(void);
 
 /* Signal handlers */
 static void ReqCheckpointHandler(SIGNAL_ARGS);
@@ -340,6 +348,11 @@ CheckpointerMain(void)
 	 */
 	ProcGlobal->checkpointerLatch = &MyProc->procLatch;
 
+	/*
+	 * Look for any leftover pre-allocated segments we can use.
+	 */
+	ScanForExistingPreallocatedSegments();
+
 	/*
 	 * Loop forever
 	 */
@@ -360,6 +373,11 @@ CheckpointerMain(void)
 		AbsorbSyncRequests();
 		HandleCheckpointerInterrupts();
 
+		/*
+		 * First do WAL pre-allocation.
+		 */
+		DoWalPreAllocation();
+
 		/*
 		 * Detect a pending checkpoint request by checking whether the flags
 		 * word in shared memory is nonzero.  We shouldn't need to acquire the
@@ -513,6 +531,12 @@ CheckpointerMain(void)
 		if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags)
 			continue;
 
+		/*
+		 * Also skip sleeping if WAL pre-allocation has been requested.
+		 */
+		if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->wal_prealloc_requested)
+			continue;
+
 		/*
 		 * Sleep until we are signaled or it's time for another checkpoint or
 		 * xlog file switch.
@@ -692,6 +716,8 @@ ImmediateCheckpointRequested(void)
  *
  * 'progress' is an estimate of how much of the work has been done, as a
  * fraction between 0.0 meaning none, and 1.0 meaning all done.
+ *
+ * This function also takes care of handling any WAL pre-allocation requests.
  */
 void
 CheckpointWriteDelay(int flags, double progress)
@@ -702,6 +728,15 @@ CheckpointWriteDelay(int flags, double progress)
 	if (!AmCheckpointerProcess())
 		return;
 
+	/*
+	 * WAL pre-allocation has nothing to do with throttling BufferSync()'s write
+	 * rate.  However, since this function is called frequently during
+	 * checkpoints, it is a convenient place to handle any pending WAL pre-
+	 * allocation requests.
+	 */
+	if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->wal_prealloc_requested)
+		DoWalPreAllocation();
+
 	/*
 	 * Perform the usual duties and take a nap, unless we're behind schedule,
 	 * in which case we just try to catch up as quickly as possible.
@@ -834,6 +869,203 @@ IsCheckpointOnSchedule(double progress)
 	return true;
 }
 
+/*
+ * GetNumPreallocatedWalSegs
+ *
+ * Returns the number of pre-allocated WAL segments in the preallocated_segments
+ * directory.  Callers are expected to hold the WALPreallocationLock.
+ */
+int
+GetNumPreallocatedWalSegs(void)
+{
+	Assert(LWLockHeldByMe(WALPreallocationLock));
+	return CheckpointerShmem->num_prealloc_segs;
+}
+
+/*
+ * SetNumPreallocatedWalSegs
+ *
+ * Sets the number of pre-allocated WAL segments in the preallocated_segments
+ * directory.  Callers are expected to hold the WALPreallocationLock
+ * exclusively.
+ */
+void
+SetNumPreallocatedWalSegs(int i)
+{
+	Assert(LWLockHeldByMeInMode(WALPreallocationLock, LW_EXCLUSIVE));
+	CheckpointerShmem->num_prealloc_segs = i;
+}
+
+/*
+ * ScanForExistingPreallocatedSegments
+ *
+ * This function searches through pg_wal/preallocated_segments for any segments
+ * that were left over and sets the tracking variable in shared memory
+ * accordingly.
+ */
+static void
+ScanForExistingPreallocatedSegments(void)
+{
+	int		i = 0;
+
+	/*
+	 * fsync the preallocated_segments directory in case any renames have yet to
+	 * be flushed to disk.
+	 */
+	fsync_fname_ext(XLOGDIR "/preallocated_segments", true, false, FATAL);
+
+	/*
+	 * Gather all the preallocated segments we can find.
+	 */
+	while (true)
+	{
+		FILE *fd;
+		char path[MAXPGPATH];
+
+		snprintf(path, MAXPGPATH, "%s/preallocated_segments/xlogtemp.%d",
+				 XLOGDIR, i);
+
+		fd = AllocateFile(path, "r");
+		if (fd != NULL)
+		{
+			FreeFile(fd);
+			i++;
+		}
+		else
+		{
+			if (errno != ENOENT)
+				ereport(FATAL,
+						(errcode_for_file_access(),
+						 errmsg("could not open file \"%s\": %m", path)));
+			break;
+		}
+	}
+
+	LWLockAcquire(WALPreallocationLock, LW_EXCLUSIVE);
+	SetNumPreallocatedWalSegs(i);
+	LWLockRelease(WALPreallocationLock);
+
+	elog(DEBUG2, "found %d preallocated segments during startup", i);
+}
+
+/*
+ * InstallPreallocatedWalSeg
+ *
+ * Renames the file at "path" to the next open pre-allocated segment slot and
+ * bumps up num_prealloc_segs.  If there is a problem, a WARNING is emitted, we
+ * attempt to delete the file, and false is returned.  Otherwise, true is
+ * returned.
+ */
+static bool
+InstallPreallocatedWalSeg(const char *path)
+{
+	char	newpath[MAXPGPATH];
+	int		rc;
+
+	LWLockAcquire(WALPreallocationLock, LW_EXCLUSIVE);
+
+	snprintf(newpath, MAXPGPATH, "%s/preallocated_segments/xlogtemp.%d",
+	XLOGDIR, CheckpointerShmem->num_prealloc_segs);
+
+	rc = durable_rename(path, newpath, DEBUG1);
+	if (rc == 0)
+		CheckpointerShmem->num_prealloc_segs++;
+	else
+	{
+		ereport(WARNING,
+				(errcode_for_file_access(),
+				 errmsg("file \"%s\" could not be renamed to \"%s\": %m",
+						path, newpath)));
+
+		(void) durable_unlink(path, DEBUG1);
+		(void) durable_unlink(newpath, DEBUG1);
+	}
+
+	LWLockRelease(WALPreallocationLock);
+
+	return (rc == 0);
+}
+
+/*
+ * DoWalPreAllocation
+ *
+ * Tries to allocate up to wal_preallocate_max_size worth of WAL.
+ */
+static void
+DoWalPreAllocation(void)
+{
+	int		segs_to_prealloc;
+
+	/*
+	 * Reset the request flag.
+	 */
+	SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
+	CheckpointerShmem->wal_prealloc_requested = false;
+	SpinLockRelease(&CheckpointerShmem->ckpt_lck);
+
+	/*
+	 * Determine how many segments to pre-allocate.
+	 */
+	LWLockAcquire(WALPreallocationLock, LW_SHARED);
+	segs_to_prealloc = GetMaxPreallocatedWalSegs() - GetNumPreallocatedWalSegs();
+	LWLockRelease(WALPreallocationLock);
+
+	/*
+	 * Do the pre-allocation.
+	 */
+	for (int i = 0; i < segs_to_prealloc; i++)
+	{
+		char	tmppath[MAXPGPATH];
+
+		snprintf(tmppath, MAXPGPATH, "%s/preallocated_segments/xlogtemp", XLOGDIR);
+
+		if (!CreateEmptyWalSegment(tmppath, WARNING) ||
+			!InstallPreallocatedWalSeg(tmppath))
+		{
+			elog(DEBUG2, "failed to pre-allocate WAL segment");
+			return;
+		}
+		else
+			elog(DEBUG2, "pre-allocated WAL segment");
+
+		/* Check for barrier events. */
+		if (ProcSignalBarrierPending)
+			ProcessProcSignalBarrier();
+
+		if (ShutdownRequestPending)
+			return;
+	}
+}
+
+/*
+ * GetMaxPreallocatedWalSegs
+ *
+ * Returns the maximum number of pre-allocated WAL segments to create based on
+ * the current value of wal_preallocate_max_size.
+ */
+static int
+GetMaxPreallocatedWalSegs(void)
+{
+	return wal_prealloc_max_size_mb / (wal_segment_size / (1024 * 1024));
+}
+
+/*
+ * RequestWalPreallocation
+ *
+ * Requests that more segments be pre-allocated for future use.
+ */
+void
+RequestWalPreallocation(void)
+{
+	SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
+	CheckpointerShmem->wal_prealloc_requested = true;
+	SpinLockRelease(&CheckpointerShmem->ckpt_lck);
+
+	if (ProcGlobal->checkpointerLatch &&
+		GetMaxPreallocatedWalSegs() > 0)
+		SetLatch(ProcGlobal->checkpointerLatch);
+}
+
 
 /* --------------------------------
  *		signal handler routines
@@ -907,6 +1139,8 @@ CheckpointerShmemInit(void)
 		CheckpointerShmem->max_requests = NBuffers;
 		ConditionVariableInit(&CheckpointerShmem->start_cv);
 		ConditionVariableInit(&CheckpointerShmem->done_cv);
+		CheckpointerShmem->wal_prealloc_requested = false;
+		CheckpointerShmem->num_prealloc_segs = 0;
 	}
 }
 
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 27e4152446..2be9d199e3 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -1304,11 +1304,13 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly,
 									&statbuf, sizeonly);
 
 			/*
-			 * Also send archive_status directory (by hackishly reusing
-			 * statbuf from above ...).
+			 * Also send archive_status and preallocated_segments (by hackishly
+			 * reusing statbuf from above ...).
 			 */
 			size += _tarWriteHeader(sink, "./pg_wal/archive_status", NULL,
 									&statbuf, sizeonly);
+			size += _tarWriteHeader(sink, "./pg_wal/preallocated_segments", NULL,
+									&statbuf, sizeonly);
 
 			continue;			/* don't recurse into pg_wal */
 		}
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 4efc46460e..05ab0b4c66 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -3898,8 +3898,8 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
  * Create a new file that can be used as a new WAL segment.  The caller is
  * responsible for installing the new file in pg_wal.
  */
-void
-CreateEmptyWalSegment(const char *path)
+bool
+CreateEmptyWalSegment(const char *path, int elevel)
 {
 	PGAlignedXLogBlock zbuffer;
 	int			fd;
@@ -3910,9 +3910,12 @@ CreateEmptyWalSegment(const char *path)
 	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
 	fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
 	if (fd < 0)
-		ereport(ERROR,
+	{
+		ereport(elevel,
 				(errcode_for_file_access(),
 				 errmsg("could not create file \"%s\": %m", path)));
+		return false;
+	}
 
 	memset(zbuffer.data, 0, XLOG_BLCKSZ);
 
@@ -3982,9 +3985,10 @@ CreateEmptyWalSegment(const char *path)
 
 		errno = save_errno;
 
-		ereport(ERROR,
+		ereport(elevel,
 				(errcode_for_file_access(),
 				 errmsg("could not write to file \"%s\": %m", path)));
+		return false;
 	}
 
 	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
@@ -3994,14 +3998,26 @@ CreateEmptyWalSegment(const char *path)
 
 		close(fd);
+
+		/*
+		 * With elevel below ERROR, ereport() returns and we hand control back
+		 * to the caller, so the WAIT_EVENT_WAL_INIT_SYNC wait event started
+		 * above would otherwise stay set.  Clear it before reporting.
+		 */
+		pgstat_report_wait_end();
 		errno = save_errno;
-		ereport(ERROR,
+		ereport(elevel,
 				(errcode_for_file_access(),
 				 errmsg("could not fsync file \"%s\": %m", path)));
+		return false;
 	}
 	pgstat_report_wait_end();
 
 	if (close(fd) != 0)
-		ereport(ERROR,
+	{
+		ereport(elevel,
 				(errcode_for_file_access(),
 				 errmsg("could not close file \"%s\": %m", path)));
+		return false;
+	}
+
+	return true;
 }
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index 6c7cf6c295..212bda8e7d 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -53,3 +53,4 @@ XactTruncationLock					44
 # 45 was XactTruncationLock until removal of BackendRandomLock
 WrapLimitsVacuumLock				46
 NotifyQueueTailLock					47
+WALPreallocationLock				48
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 22b5571a70..0b496ddc9b 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2918,6 +2918,17 @@ static struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"wal_preallocate_max_size", PGC_SIGHUP, WAL_SETTINGS,
+			gettext_noop("Sets the maximum amount of WAL to pre-allocate."),
+			NULL,
+			GUC_UNIT_MB
+		},
+		&wal_prealloc_max_size_mb,
+		64, 0, 102400,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"wal_buffers", PGC_POSTMASTER, WAL_SETTINGS,
 			gettext_noop("Sets the number of disk-page buffers in shared memory for WAL."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 94270eb0ec..8238679d25 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -223,6 +223,7 @@
 					# off, pglz, lz4, zstd, or on
 #wal_init_zero = on			# zero-fill new WAL files
 #wal_recycle = on			# recycle WAL files
+#wal_preallocate_max_size = 64MB	# amount of WAL to pre-allocate
 #wal_buffers = -1			# min 32kB, -1 sets based on shared_buffers
 					# (change requires restart)
 #wal_writer_delay = 200ms		# 1-10000 milliseconds
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index ab826da650..0867bd30af 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -212,6 +212,7 @@ static char *extra_options = "";
 static const char *const subdirs[] = {
 	"global",
 	"pg_wal/archive_status",
+	"pg_wal/preallocated_segments",
 	"pg_commit_ts",
 	"pg_dynshmem",
 	"pg_notify",
diff --git a/src/bin/pg_basebackup/bbstreamer_file.c b/src/bin/pg_basebackup/bbstreamer_file.c
index 393e9f340c..3b932b80a6 100644
--- a/src/bin/pg_basebackup/bbstreamer_file.c
+++ b/src/bin/pg_basebackup/bbstreamer_file.c
@@ -293,7 +293,8 @@ extract_directory(const char *filename, mode_t mode)
 		 */
 		if (!((pg_str_endswith(filename, "/pg_wal") ||
 			   pg_str_endswith(filename, "/pg_xlog") ||
-			   pg_str_endswith(filename, "/archive_status")) &&
+			   pg_str_endswith(filename, "/archive_status") ||
+			   pg_str_endswith(filename, "/preallocated_segments")) &&
 			  errno == EEXIST))
 			pg_fatal("could not create directory \"%s\": %m",
 					 filename);
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 65dcfff0a0..b9d990cf7c 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -683,6 +683,19 @@ StartLogStreamer(char *startpos, uint32 timeline, char *sysidentifier,
 
 		if (pg_mkdir_p(statusdir, pg_dir_create_mode) != 0 && errno != EEXIST)
 			pg_fatal("could not create directory \"%s\": %m", statusdir);
+
+		/*
+		 * Also create pg_wal/preallocated_segments if necessary.
+		 */
+		if (PQserverVersion(conn) >= 150000)
+		{
+			char prealloc_dir[MAXPGPATH];
+
+			snprintf(prealloc_dir, sizeof(prealloc_dir), "%s/pg_wal/preallocated_segments",
+					 basedir);
+			if (pg_mkdir_p(prealloc_dir, pg_dir_create_mode) != 0 && errno != EEXIST)
+				pg_fatal("could not create directory \"%s\": %m", prealloc_dir);
+		}
 	}
 
 	/*
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index 7309ebddea..b3baee781c 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -204,10 +204,10 @@ SKIP:
 		"check backup dir permissions");
 }
 
-# Only archive_status directory should be copied in pg_wal/.
+# Only archive_status and preallocated_segments directories should be copied in pg_wal/.
 is_deeply(
 	[ sort(slurp_dir("$tempdir/backup/pg_wal/")) ],
-	[ sort qw(. .. archive_status) ],
+	[ sort qw(. .. archive_status preallocated_segments) ],
 	'no WAL files copied');
 
 # Contents of these directories should not be copied.
diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c
index d4772a2965..dfae77fb78 100644
--- a/src/bin/pg_resetwal/pg_resetwal.c
+++ b/src/bin/pg_resetwal/pg_resetwal.c
@@ -85,6 +85,7 @@ static void RewriteControlFile(void);
 static void FindEndOfXLOG(void);
 static void KillExistingXLOG(void);
 static void KillExistingArchiveStatus(void);
+static void KillExistingPreallocatedSegments(void);
 static void WriteEmptyXLOG(void);
 static void usage(void);
 
@@ -488,6 +489,7 @@ main(int argc, char *argv[])
 	RewriteControlFile();
 	KillExistingXLOG();
 	KillExistingArchiveStatus();
+	KillExistingPreallocatedSegments();
 	WriteEmptyXLOG();
 
 	printf(_("Write-ahead log reset\n"));
@@ -1037,6 +1039,52 @@ KillExistingArchiveStatus(void)
 }
 
 
+/*
+ * Remove existing preallocated segments
+ */
+static void
+KillExistingPreallocatedSegments(void)
+{
+#define PREALLOCSEGDIR XLOGDIR "/preallocated_segments"
+
+	DIR		   *xldir;
+	struct dirent *xlde;
+	char		path[MAXPGPATH + sizeof(PREALLOCSEGDIR)];
+
+	xldir = opendir(PREALLOCSEGDIR);
+	if (xldir == NULL)
+	{
+		pg_log_error("could not open directory \"%s\": %m", PREALLOCSEGDIR);
+		exit(1);
+	}
+
+	while (errno = 0, (xlde = readdir(xldir)) != NULL)
+	{
+		if (strncmp(xlde->d_name, "xlogtemp", strlen("xlogtemp")) ==  0)
+		{
+			snprintf(path, sizeof(path), "%s/%s", PREALLOCSEGDIR, xlde->d_name);
+			if (unlink(path) < 0)
+			{
+				pg_log_error("could not delete file \"%s\": %m", path);
+				exit(1);
+			}
+		}
+	}
+
+	if (errno)
+	{
+		pg_log_error("could not read directory \"%s\": %m", PREALLOCSEGDIR);
+		exit(1);
+	}
+
+	if (closedir(xldir))
+	{
+		pg_log_error("could not close directory \"%s\": %m", PREALLOCSEGDIR);
+		exit(1);
+	}
+}
+
+
 /*
  * Write an empty XLOG file, containing only the checkpoint record
  * already set up in ControlFile.
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index 2511ef451e..9f284fa76b 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -26,6 +26,7 @@ extern PGDLLIMPORT int BgWriterDelay;
 extern PGDLLIMPORT int CheckPointTimeout;
 extern PGDLLIMPORT int CheckPointWarning;
 extern PGDLLIMPORT double CheckPointCompletionTarget;
+extern PGDLLIMPORT int wal_prealloc_max_size_mb;
 
 extern void BackgroundWriterMain(void) pg_attribute_noreturn();
 extern void CheckpointerMain(void) pg_attribute_noreturn();
@@ -42,4 +43,8 @@ extern void CheckpointerShmemInit(void);
 
 extern bool FirstCallSinceLastCheckpoint(void);
 
+extern int GetNumPreallocatedWalSegs(void);
+extern void SetNumPreallocatedWalSegs(int i);
+extern void RequestWalPreallocation(void);
+
 #endif							/* _BGWRITER_H */
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 6bb9e3525b..bfc32a9b39 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -190,7 +190,7 @@ extern int	durable_unlink(const char *fname, int loglevel);
 extern int	durable_rename_excl(const char *oldfile, const char *newfile, int loglevel);
 extern void SyncDataDirectory(void);
 extern int	data_sync_elevel(int elevel);
-extern void CreateEmptyWalSegment(const char *path);
+extern bool CreateEmptyWalSegment(const char *path, int elevel);
 
 /* Filename components */
 #define PG_TEMP_FILES_DIR "pgsql_tmp"
-- 
2.25.1

Reply via email to