Here also is a first attempt at improving bulk_write.c's memory allocation and layout.
I wonder whether bulk logging should trigger larger WAL writes in the "Have to write it ourselves" case in AdvanceXLInsertBuffer(): writing 8kB of WAL at a time is an unnecessarily degraded level of performance, especially with wal_sync_method=open_datasync. Of course the real answer is "make sure wal_buffers is high enough for your workload" (usually indirectly, by automatic scaling from shared_buffers), but the problem jumps out when tracing bulk_write.c with default settings: we write out the index 128kB at a time, but the WAL only 8kB at a time.
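To make the coalescing idea concrete, here is a rough standalone sketch. All names in it are invented for illustration; it is not the real AdvanceXLInsertBuffer()/XLogWrite() logic, which also has locking and write-request bookkeeping to worry about. The point is only that a run of contiguous dirty WAL pages could be submitted as one vectored write instead of one 8kB write each:

/*
 * Sketch only: invented names, not PostgreSQL's xlog.c code.  Submit a
 * run of contiguous dirty WAL pages as a single vectored write.
 */
#include <stdbool.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>

#define WAL_PAGE_SIZE 8192
#define MAX_WRITE_RUN 16            /* up to 128kB per write */

typedef struct WalPageBuf
{
    char        data[WAL_PAGE_SIZE];
    bool        dirty;
    off_t       offset;             /* file position of this page */
} WalPageBuf;

/*
 * Write the dirty page at pages[first] plus any contiguous dirty pages
 * following it, using one pwritev() instead of one pwrite() per page.
 * The caller must ensure pages[first] is dirty.  Returns the number of
 * pages written, or -1 on error.
 */
static int
flush_wal_run(int fd, WalPageBuf *pages, int npages, int first)
{
    struct iovec iov[MAX_WRITE_RUN];
    int         n = 0;

    /* Grow the run while the next page is dirty and file-contiguous. */
    while (first + n < npages && n < MAX_WRITE_RUN &&
           pages[first + n].dirty &&
           pages[first + n].offset ==
           pages[first].offset + (off_t) n * WAL_PAGE_SIZE)
    {
        iov[n].iov_base = pages[first + n].data;
        iov[n].iov_len = WAL_PAGE_SIZE;
        n++;
    }

    if (pwritev(fd, iov, n, pages[first].offset) !=
        (ssize_t) n * WAL_PAGE_SIZE)
        return -1;

    for (int i = 0; i < n; i++)
        pages[first + i].dirty = false;
    return n;
}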
From 793caf2db6c00314f5bd8f7146d4797508f2f627 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Sat, 9 Mar 2024 16:04:21 +1300
Subject: [PATCH v3 1/3] Provide vectored variant of smgrextend().

Since mdwrite() and mdextend() were basically the same, merge them.
They had different assertions, but those would surely apply to any
implementation of smgr, so move them up to the smgr.c wrapper level.
The other difference was the policy on segment creation, but that can
be captured by having smgrwritev() and smgrextendv() call a single
mdwritev() function with a new "extend" flag.

Discussion: https://postgr.es/m/CA%2BhUKGLx5bLwezZKAYB2O_qHj%3Dov10RpgRVY7e8TSJVE74oVjg%40mail.gmail.com
---
 src/backend/storage/smgr/md.c   | 106 +++++---------------------
 src/backend/storage/smgr/smgr.c |  32 ++++++----
 src/include/storage/md.h        |   3 +-
 src/include/storage/smgr.h      |  12 +++-
 4 files changed, 47 insertions(+), 106 deletions(-)

diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index bf0f3ca76d1..9b125841226 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -447,79 +447,10 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
     pfree(path);
 }
 
-/*
- * mdextend() -- Add a block to the specified relation.
- *
- * The semantics are nearly the same as mdwrite(): write at the
- * specified position.  However, this is to be used for the case of
- * extending a relation (i.e., blocknum is at or beyond the current
- * EOF).  Note that we assume writing a block beyond current EOF
- * causes intervening file space to become filled with zeroes.
- */
-void
-mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-         const void *buffer, bool skipFsync)
-{
-    off_t       seekpos;
-    int         nbytes;
-    MdfdVec    *v;
-
-    /* If this build supports direct I/O, the buffer must be I/O aligned. */
-    if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
-        Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
-
-    /* This assert is too expensive to have on normally ... */
-#ifdef CHECK_WRITE_VS_EXTEND
-    Assert(blocknum >= mdnblocks(reln, forknum));
-#endif
-
-    /*
-     * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
-     * more --- we mustn't create a block whose number actually is
-     * InvalidBlockNumber.  (Note that this failure should be unreachable
-     * because of upstream checks in bufmgr.c.)
-     */
-    if (blocknum == InvalidBlockNumber)
-        ereport(ERROR,
-                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-                 errmsg("cannot extend file \"%s\" beyond %u blocks",
-                        relpath(reln->smgr_rlocator, forknum),
-                        InvalidBlockNumber)));
-
-    v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
-
-    seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
-
-    Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
-
-    if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
-    {
-        if (nbytes < 0)
-            ereport(ERROR,
-                    (errcode_for_file_access(),
-                     errmsg("could not extend file \"%s\": %m",
-                            FilePathName(v->mdfd_vfd)),
-                     errhint("Check free disk space.")));
-        /* short write: complain appropriately */
-        ereport(ERROR,
-                (errcode(ERRCODE_DISK_FULL),
-                 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
-                        FilePathName(v->mdfd_vfd),
-                        nbytes, BLCKSZ, blocknum),
-                 errhint("Check free disk space.")));
-    }
-
-    if (!skipFsync && !SmgrIsTemp(reln))
-        register_dirty_segment(reln, forknum, v);
-
-    Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
-}
-
 /*
  * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
  *
- * Similar to mdextend(), except the relation can be extended by multiple
- * blocks at once and the added blocks will be filled with zeroes.
+ * The added blocks will be filled with zeroes.
  */
 void
 mdzeroextend(SMgrRelation reln, ForkNumber forknum,
@@ -595,13 +526,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum,
         {
             int         ret;
 
-            /*
-             * Even if we don't want to use fallocate, we can still extend a
-             * bit more efficiently than writing each 8kB block individually.
-             * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry()
-             * to avoid multiple writes or needing a zeroed buffer for the
-             * whole length of the extension.
-             */
+            /* Fall back to writing out zeroes. */
             ret = FileZero(v->mdfd_vfd, seekpos,
                            (off_t) BLCKSZ * numblocks,
                            WAIT_EVENT_DATA_FILE_EXTEND);
@@ -920,19 +845,14 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 /*
  * mdwritev() -- Write the supplied blocks at the appropriate location.
  *
- * This is to be used only for updating already-existing blocks of a
- * relation (ie, those before the current EOF).  To extend a relation,
- * use mdextend().
+ * Note that smgrextendv() and smgrwritev() are different operations, but both
+ * are handled here.
  */
 void
 mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-         const void **buffers, BlockNumber nblocks, bool skipFsync)
+         const void **buffers, BlockNumber nblocks,
+         bool extend, bool skipFsync)
 {
-    /* This assert is too expensive to have on normally ... */
-#ifdef CHECK_WRITE_VS_EXTEND
-    Assert(blocknum < mdnblocks(reln, forknum));
-#endif
-
     while (nblocks > 0)
     {
         struct iovec iov[PG_IOV_MAX];
@@ -945,6 +865,8 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
         size_t      size_this_segment;
 
         v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
+                         extend ?
+                         EXTENSION_CREATE :
                          EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
 
         seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
@@ -992,7 +914,9 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
             ereport(ERROR,
                     (errcode_for_file_access(),
-                     errmsg("could not write blocks %u..%u in file \"%s\": %m",
+                     errmsg(extend ?
+                            "could not extend blocks %u..%u in file \"%s\": %m" :
+                            "could not write blocks %u..%u in file \"%s\": %m",
                             blocknum,
                             blocknum + nblocks_this_segment - 1,
                             FilePathName(v->mdfd_vfd)),
@@ -1638,7 +1562,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
         {
             /*
              * Normally we will create new segments only if authorized by the
-             * caller (i.e., we are doing mdextend()).  But when doing WAL
+             * caller (i.e., we are doing smgrextend()).  But when doing WAL
              * recovery, create segments anyway; this allows cases such as
              * replaying WAL data that has a write into a high-numbered
              * segment of a relation that was later deleted.  We want to go
@@ -1655,9 +1579,9 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
                 char       *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
                                                      MCXT_ALLOC_ZERO);
 
-                mdextend(reln, forknum,
-                         nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
-                         zerobuf, skipFsync);
+                smgrextend(reln, forknum,
+                           nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
+                           zerobuf, skipFsync);
                 pfree(zerobuf);
             }
             flags = O_CREAT;
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index a5b18328b89..ad0f02f205e 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -82,8 +82,6 @@ typedef struct f_smgr
     bool        (*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
     void        (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum,
                                 bool isRedo);
-    void        (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
-                                BlockNumber blocknum, const void *buffer, bool skipFsync);
     void        (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
                                     BlockNumber blocknum, int nblocks, bool skipFsync);
     bool        (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
@@ -94,7 +92,7 @@ typedef struct f_smgr
     void        (*smgr_writev) (SMgrRelation reln, ForkNumber forknum,
                                 BlockNumber blocknum, const void **buffers,
                                 BlockNumber nblocks,
-                                bool skipFsync);
+                                bool extend, bool skipFsync);
     void        (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
                                    BlockNumber blocknum, BlockNumber nblocks);
     BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
@@ -114,7 +112,6 @@ static const f_smgr smgrsw[] = {
         .smgr_create = mdcreate,
         .smgr_exists = mdexists,
         .smgr_unlink = mdunlink,
-        .smgr_extend = mdextend,
         .smgr_zeroextend = mdzeroextend,
         .smgr_prefetch = mdprefetch,
         .smgr_readv = mdreadv,
@@ -525,7 +522,7 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
 
 /*
- * smgrextend() -- Add a new block to a file.
+ * smgrextendv() -- Add new blocks to a file.
  *
  * The semantics are nearly the same as smgrwrite(): write at the
  * specified position.  However, this is to be used for the case of
@@ -534,19 +531,24 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
  * causes intervening file space to become filled with zeroes.
  */
 void
-smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-           const void *buffer, bool skipFsync)
+smgrextendv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+            const void **buffers, BlockNumber nblocks, bool skipFsync)
 {
-    smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum,
-                                         buffer, skipFsync);
+    /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+    Assert(blocknum >= smgrnblocks(reln, forknum));
+#endif
+
+    smgrsw[reln->smgr_which].smgr_writev(reln, forknum, blocknum,
+                                         buffers, nblocks, true, skipFsync);
 
     /*
-     * Normally we expect this to increase nblocks by one, but if the cached
+     * Normally we expect this to increase nblocks by N, but if the cached
      * value isn't as expected, just invalidate it so the next call asks the
      * kernel.
      */
     if (reln->smgr_cached_nblocks[forknum] == blocknum)
-        reln->smgr_cached_nblocks[forknum] = blocknum + 1;
+        reln->smgr_cached_nblocks[forknum] = blocknum + nblocks;
     else
         reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
 }
@@ -633,8 +635,14 @@ void
 smgrwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
            const void **buffers, BlockNumber nblocks, bool skipFsync)
 {
+    /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+    Assert(blocknum + nblocks <= smgrnblocks(reln, forknum));
+#endif
+
     smgrsw[reln->smgr_which].smgr_writev(reln, forknum, blocknum,
-                                         buffers, nblocks, skipFsync);
+                                         buffers, nblocks, false,
+                                         skipFsync);
 }
 
 /*
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
index 620f10abdeb..8746a48a5fb 100644
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -36,7 +36,8 @@ extern void mdreadv(SMgrRelation reln, ForkNumber forknum,
                     BlockNumber blocknum,
                     void **buffers, BlockNumber nblocks);
 extern void mdwritev(SMgrRelation reln, ForkNumber forknum,
                      BlockNumber blocknum,
-                     const void **buffers, BlockNumber nblocks, bool skipFsync);
+                     const void **buffers, BlockNumber nblocks,
+                     bool extend, bool skipFsync);
 extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
                         BlockNumber blocknum, BlockNumber nblocks);
 extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index fc5f883ce14..7ffea4332d2 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -86,8 +86,9 @@ extern void smgrreleaserellocator(RelFileLocatorBackend rlocator);
 extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern void smgrdosyncall(SMgrRelation *rels, int nrels);
 extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
-extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
-                       BlockNumber blocknum, const void *buffer, bool skipFsync);
+extern void smgrextendv(SMgrRelation reln, ForkNumber forknum,
+                        BlockNumber blocknum,
+                        const void **buffers, BlockNumber nblocks, bool skipFsync);
 extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum,
                            BlockNumber blocknum, int nblocks, bool skipFsync);
 extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum,
@@ -124,4 +125,11 @@ smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
     smgrwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
 }
 
+static inline void
+smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+           const void *buffer, bool skipFsync)
+{
+    smgrextendv(reln, forknum, blocknum, &buffer, 1, skipFsync);
+}
+
 #endif                          /* SMGR_H */
-- 
2.39.2
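For illustration, here is what a caller of the new interface could look like. The wrapper function below is hypothetical, but smgrextendv() and PGIOAlignedBlock are as defined above, and the same I/O-alignment rule applies as for the existing single-block functions:

/*
 * Hypothetical caller of smgrextendv(): append two already-prepared
 * pages with one vectored call instead of two smgrextend() calls.
 */
#include "postgres.h"

#include "storage/smgr.h"

static void
extend_with_two_pages(SMgrRelation reln, ForkNumber forknum,
                      BlockNumber first_block)
{
    PGIOAlignedBlock page1;     /* I/O-aligned, as the smgr layer asserts */
    PGIOAlignedBlock page2;
    const void *pages[2];

    /* ... fill page1.data and page2.data with valid page contents ... */

    pages[0] = page1.data;
    pages[1] = page2.data;

    smgrextendv(reln, forknum, first_block, pages, 2, false);
}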
From 9f679c914cc81907ba7ae9bbbf6e08fe81c7ec2d Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Sat, 9 Mar 2024 16:54:56 +1300
Subject: [PATCH v3 2/3] Use vectored I/O for bulk writes.

Zero-extend using smgrzeroextend().  Extend using smgrextendv().  Write
using smgrwritev().

Discussion: https://postgr.es/m/CA%2BhUKGLx5bLwezZKAYB2O_qHj%3Dov10RpgRVY7e8TSJVE74oVjg%40mail.gmail.com
---
 src/backend/storage/smgr/bulk_write.c | 83 +++++++++++++++++++--------
 1 file changed, 58 insertions(+), 25 deletions(-)

diff --git a/src/backend/storage/smgr/bulk_write.c b/src/backend/storage/smgr/bulk_write.c
index 4a10ece4c39..2a566a80f60 100644
--- a/src/backend/storage/smgr/bulk_write.c
+++ b/src/backend/storage/smgr/bulk_write.c
@@ -8,7 +8,7 @@
  * the regular buffer manager and the bulk loading interface!
  *
  * We bypass the buffer manager to avoid the locking overhead, and call
- * smgrextend() directly.  A downside is that the pages will need to be
+ * smgrextendv() directly.  A downside is that the pages will need to be
  * re-read into shared buffers on first use after the build finishes.  That's
  * usually a good tradeoff for large relations, and for small relations, the
  * overhead isn't very significant compared to creating the relation in the
@@ -45,8 +45,6 @@
 
 #define MAX_PENDING_WRITES XLR_MAX_BLOCK_ID
 
-static const PGIOAlignedBlock zero_buffer = {{0}};  /* worth BLCKSZ */
-
 typedef struct PendingWrite
 {
     BulkWriteBuffer buf;
@@ -225,35 +223,70 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
 
     for (int i = 0; i < npending; i++)
     {
-        BlockNumber blkno = pending_writes[i].blkno;
-        Page        page = pending_writes[i].buf->data;
-
+        Page        page;
+        const void *pages[16];
+        BlockNumber blkno;
+        int         nblocks;
+        int         max_nblocks;
+
+        /* Prepare to write the first block. */
+        blkno = pending_writes[i].blkno;
+        page = pending_writes[i].buf->data;
         PageSetChecksumInplace(page, blkno);
+        pages[0] = page;
+        nblocks = 1;
 
-        if (blkno >= bulkstate->pages_written)
+        /* Zero-extend any missing space before the first block. */
+        if (blkno > bulkstate->pages_written)
+        {
+            int         nzeroblocks;
+
+            nzeroblocks = blkno - bulkstate->pages_written;
+            smgrzeroextend(bulkstate->smgr, bulkstate->forknum,
+                           bulkstate->pages_written, nzeroblocks, true);
+            bulkstate->pages_written += nzeroblocks;
+        }
+
+        if (blkno < bulkstate->pages_written)
         {
             /*
-             * If we have to write pages nonsequentially, fill in the space
-             * with zeroes until we come back and overwrite.  This is not
-             * logically necessary on standard Unix filesystems (unwritten
-             * space will read as zeroes anyway), but it should help to avoid
-             * fragmentation.  The dummy pages aren't WAL-logged though.
+             * We're overwriting.  Clamp at the existing size, because we
+             * can't mix writing and extending in a single operation.
              */
-            while (blkno > bulkstate->pages_written)
-            {
-                /* don't set checksum for all-zero page */
-                smgrextend(bulkstate->smgr, bulkstate->forknum,
-                           bulkstate->pages_written++,
-                           &zero_buffer,
-                           true);
-            }
-
-            smgrextend(bulkstate->smgr, bulkstate->forknum, blkno, page, true);
-            bulkstate->pages_written = pending_writes[i].blkno + 1;
+            max_nblocks = Min(lengthof(pages),
+                              bulkstate->pages_written - blkno);
+        }
+        else
+        {
+            /* We're extending. */
+            Assert(blkno == bulkstate->pages_written);
+            max_nblocks = lengthof(pages);
+        }
+
+        /* Find as many consecutive blocks as we can. */
+        while (i + 1 < npending &&
+               pending_writes[i + 1].blkno == blkno + nblocks &&
+               nblocks < max_nblocks)
+        {
+            page = pending_writes[++i].buf->data;
+            PageSetChecksumInplace(page, pending_writes[i].blkno);
+            pages[nblocks++] = page;
+        }
+
+        /* Extend or overwrite. */
+        if (blkno == bulkstate->pages_written)
+        {
+            smgrextendv(bulkstate->smgr, bulkstate->forknum, blkno, pages, nblocks, true);
+            bulkstate->pages_written += nblocks;
         }
         else
-            smgrwrite(bulkstate->smgr, bulkstate->forknum, blkno, page, true);
-        pfree(page);
+        {
+            Assert(blkno + nblocks <= bulkstate->pages_written);
+            smgrwritev(bulkstate->smgr, bulkstate->forknum, blkno, pages, nblocks, true);
+        }
+
+        for (int j = 0; j < nblocks; ++j)
+            pfree(pending_writes[i - j].buf->data);
     }
 
     bulkstate->npending = 0;
-- 
2.39.2
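For context, this is the usage pattern the patch speeds up. The function below is a hypothetical caller, but smgr_bulk_start_rel(), smgr_bulk_get_buf(), smgr_bulk_write() and smgr_bulk_finish() are the existing bulk_write.h API; pages queued with sequential block numbers like this now reach the smgr layer as vectored writes of up to 16 blocks (128kB) at a time:

/*
 * Hypothetical bulk_write.c caller (the smgr_bulk_* API itself is real).
 * Each buffer is owned by the bulk write machinery after
 * smgr_bulk_write(); sequential block numbers let smgr_bulk_flush()
 * build large vectored writes.
 */
#include "postgres.h"

#include "storage/bufpage.h"
#include "storage/bulk_write.h"
#include "utils/rel.h"

static void
bulk_build_pages(Relation rel, BlockNumber npages)
{
    BulkWriteState *bulkstate;

    bulkstate = smgr_bulk_start_rel(rel, MAIN_FORKNUM);
    for (BlockNumber blkno = 0; blkno < npages; blkno++)
    {
        BulkWriteBuffer buf = smgr_bulk_get_buf(bulkstate);

        PageInit((Page) buf, BLCKSZ, 0);
        /* ... fill the page ... */
        smgr_bulk_write(bulkstate, blkno, buf, true);
    }
    smgr_bulk_finish(bulkstate);
}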
From 6d807a3c85b77d7daf7b8345ab988b6f44fd4c87 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Mon, 11 Mar 2024 11:44:41 +1300
Subject: [PATCH v3 3/3] Improve bulk_write.c memory management.

Instead of allocating buffers one at a time with palloc(), allocate an
array full of them up front, and then manage them in a FIFO freelist.
Aside from avoiding allocator overheads, this means that callers who
write sequential blocks will tend to fill up sequential memory, which
hopefully generates more efficient vectored writes.

Discussion: https://postgr.es/m/CA%2BhUKGLx5bLwezZKAYB2O_qHj%3Dov10RpgRVY7e8TSJVE74oVjg%40mail.gmail.com
---
 src/backend/storage/smgr/bulk_write.c | 62 +++++++++++++++++++--------
 1 file changed, 43 insertions(+), 19 deletions(-)

diff --git a/src/backend/storage/smgr/bulk_write.c b/src/backend/storage/smgr/bulk_write.c
index 2a566a80f60..d9c38ffa88b 100644
--- a/src/backend/storage/smgr/bulk_write.c
+++ b/src/backend/storage/smgr/bulk_write.c
@@ -36,6 +36,7 @@
 
 #include "access/xloginsert.h"
 #include "access/xlogrecord.h"
+#include "lib/ilist.h"
 #include "storage/bufmgr.h"
 #include "storage/bufpage.h"
 #include "storage/bulk_write.h"
@@ -45,9 +46,15 @@
 
 #define MAX_PENDING_WRITES XLR_MAX_BLOCK_ID
 
+typedef union BufferSlot
+{
+    PGIOAlignedBlock buffer;
+    dlist_node  freelist_node;
+} BufferSlot;
+
 typedef struct PendingWrite
 {
-    BulkWriteBuffer buf;
+    BufferSlot *slot;
     BlockNumber blkno;
     bool        page_std;
 } PendingWrite;
@@ -57,6 +64,10 @@ typedef struct PendingWrite
  */
 struct BulkWriteState
 {
+    /* Comes first so we can align it correctly. */
+    BufferSlot  buffer_slots[MAX_PENDING_WRITES + 2];
+    dlist_head  buffer_slots_freelist;
+
     /* Information about the target relation we're writing */
     SMgrRelation smgr;
     ForkNumber  forknum;
@@ -71,8 +82,6 @@ struct BulkWriteState
 
     /* The RedoRecPtr at the time that the bulk operation started */
     XLogRecPtr  start_RedoRecPtr;
-
-    MemoryContext memcxt;
 };
 
 static void smgr_bulk_flush(BulkWriteState *bulkstate);
@@ -98,7 +107,7 @@ smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal)
 {
     BulkWriteState *state;
 
-    state = palloc(sizeof(BulkWriteState));
+    state = palloc_aligned(sizeof(BulkWriteState), PG_IO_ALIGN_SIZE, 0);
     state->smgr = smgr;
     state->forknum = forknum;
     state->use_wal = use_wal;
@@ -108,11 +117,11 @@ smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal)
 
     state->start_RedoRecPtr = GetRedoRecPtr();
 
-    /*
-     * Remember the memory context.  We will use it to allocate all the
-     * buffers later.
-     */
-    state->memcxt = CurrentMemoryContext;
+    /* Set up the free-list of buffers. */
+    dlist_init(&state->buffer_slots_freelist);
+    for (int i = 0; i < lengthof(state->buffer_slots); ++i)
+        dlist_push_tail(&state->buffer_slots_freelist,
+                        &state->buffer_slots[i].freelist_node);
 
     return state;
 }
@@ -206,7 +215,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
         for (int i = 0; i < npending; i++)
         {
             blknos[i] = pending_writes[i].blkno;
-            pages[i] = pending_writes[i].buf->data;
+            pages[i] = pending_writes[i].slot->buffer.data;
 
             /*
              * If any of the pages use !page_std, we log them all as such.
@@ -231,7 +240,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
 
         /* Prepare to write the first block. */
         blkno = pending_writes[i].blkno;
-        page = pending_writes[i].buf->data;
+        page = pending_writes[i].slot->buffer.data;
         PageSetChecksumInplace(page, blkno);
         pages[0] = page;
         nblocks = 1;
@@ -268,7 +277,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
                pending_writes[i + 1].blkno == blkno + nblocks &&
                nblocks < max_nblocks)
         {
-            page = pending_writes[++i].buf->data;
+            page = pending_writes[++i].slot->buffer.data;
             PageSetChecksumInplace(page, pending_writes[i].blkno);
             pages[nblocks++] = page;
         }
@@ -285,8 +294,14 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
             smgrwritev(bulkstate->smgr, bulkstate->forknum, blkno, pages, nblocks, true);
         }
 
-        for (int j = 0; j < nblocks; ++j)
-            pfree(pending_writes[i - j].buf->data);
+        /*
+         * Maintain FIFO ordering in the free list, so that users who write
+         * blocks in sequential order tend to get sequential chunks of buffer
+         * memory, which may be slightly more efficient for vectored writes.
+         */
+        for (int j = i - nblocks + 1; j <= i; ++j)
+            dlist_push_tail(&bulkstate->buffer_slots_freelist,
+                            &pending_writes[j].slot->freelist_node);
     }
 
     bulkstate->npending = 0;
@@ -306,7 +321,7 @@ smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer
     PendingWrite *w;
 
     w = &bulkstate->pending_writes[bulkstate->npending++];
-    w->buf = buf;
+    w->slot = (BufferSlot *) buf;
     w->blkno = blocknum;
     w->page_std = page_std;
 
@@ -320,12 +335,21 @@ smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer
  * There is no function to free the buffer.  When you pass it to
  * smgr_bulk_write(), it takes ownership and frees it when it's no longer
  * needed.
- *
- * This is currently implemented as a simple palloc, but could be implemented
- * using a ring buffer or larger chunks in the future, so don't rely on it.
  */
 BulkWriteBuffer
 smgr_bulk_get_buf(BulkWriteState *bulkstate)
 {
-    return MemoryContextAllocAligned(bulkstate->memcxt, BLCKSZ, PG_IO_ALIGN_SIZE, 0);
+    BufferSlot *slot;
+
+    if (dlist_is_empty(&bulkstate->buffer_slots_freelist))
+    {
+        smgr_bulk_flush(bulkstate);
+        if (dlist_is_empty(&bulkstate->buffer_slots_freelist))
+            elog(ERROR, "too many bulk write buffers used but not yet written");
+    }
+
+    slot = dlist_head_element(BufferSlot, freelist_node, &bulkstate->buffer_slots_freelist);
+    dlist_pop_head_node(&bulkstate->buffer_slots_freelist);
+
+    return &slot->buffer;
 }
-- 
2.39.2
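One remark on the BufferSlot union: while a slot sits on the freelist, its memory is reused as the list link, so the freelist costs no extra space, and the FIFO discipline (push at the tail, pop at the head) means sequentially recycled buffers tend to come back out at sequential addresses. Here is a standalone sketch of the same pattern with invented names (the patch itself uses PostgreSQL's dlist, which is doubly linked):

/*
 * Standalone illustration of the slot-as-freelist-node pattern; invented
 * names, not PostgreSQL code.  A free slot's bytes double as its queue
 * link, and FIFO order favors handing out adjacent slots in sequence.
 */
#include <stddef.h>

#define SLOT_SIZE 8192

typedef union Slot
{
    char        buffer[SLOT_SIZE];  /* payload while the slot is in use */
    union Slot *next_free;          /* link while the slot is free */
} Slot;

typedef struct SlotQueue
{
    Slot       *head;
    Slot       *tail;
} SlotQueue;

static void
slot_queue_push_tail(SlotQueue *q, Slot *slot)
{
    slot->next_free = NULL;
    if (q->tail)
        q->tail->next_free = slot;
    else
        q->head = slot;
    q->tail = slot;
}

static Slot *
slot_queue_pop_head(SlotQueue *q)
{
    Slot       *slot = q->head;

    if (slot)
    {
        q->head = slot->next_free;
        if (q->head == NULL)
            q->tail = NULL;
    }
    return slot;
}

/* Initialize: all slots free, in array order. */
static void
slot_queue_init(SlotQueue *q, Slot *slots, int nslots)
{
    q->head = q->tail = NULL;
    for (int i = 0; i < nslots; i++)
        slot_queue_push_tail(q, &slots[i]);
}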