Hi,

I am working on using read streams in the CREATE DATABASE command when the
strategy is wal_log. RelationCopyStorageUsingBuffer() function is used in
this context. This function reads source buffers then copies them to the
destination buffers. I used read streams only when reading source buffers
because the destination buffers are read by 'RBM_ZERO_AND_LOCK' option, so
it is not important.

I created a ~6 GB table [1] and created a new database with the wal_log
strategy using the database that table was created in as a template [2]. My
benchmarking results are:

a. Timings:

patched:
12955.027 ms
12917.475 ms
13177.846 ms
12971.308 ms
13059.985 ms

master:
13156.375 ms
13054.071 ms
13151.607 ms
13152.633 ms
13160.538 ms

There is no difference in timings, the patched version is a tiny bit better
but it is negligible. I actually expected the patched version to be better
because there was no prefetching before, but the read stream API detects
sequential access and disables prefetching.

b. strace:

patched:
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 68.02    3.749359           2   1285782           pwrite64
 18.54    1.021734          21     46730           preadv
  9.49    0.522889         826       633           fdatasync
  2.55    0.140339          59      2368           pwritev
  1.14    0.062583         409       153           fsync

master:
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 59.71    3.831542           2   1288365           pwrite64
 29.84    1.914540           2    747936           pread64
  7.90    0.506843         837       605           fdatasync
  1.58    0.101575          54      1856           pwritev
  0.75    0.048431         400       121           fsync

There are fewer (~1/16) read system calls in the patched version.

c. perf:

patched:

-   97.83%     1.13%  postgres  postgres           [.]
RelationCopyStorageUsingBuffer

   - 97.83% RelationCopyStorageUsingBuffer

      - 44.28% ReadBufferWithoutRelcache

         + 42.20% GetVictimBuffer

           0.81% ZeroBuffer

      + 31.86% log_newpage_buffer

      - 19.51% read_stream_next_buffer

         - 17.92% WaitReadBuffers

            + 17.61% mdreadv

         - 1.47% read_stream_start_pending_read

            + 1.46% StartReadBuffers

master:

-   97.68%     0.57%  postgres  postgres           [.]
RelationCopyStorageUsingBuffer

   - RelationCopyStorageUsingBuffer

      - 65.48% ReadBufferWithoutRelcache

         + 41.16% GetVictimBuffer

         - 20.42% WaitReadBuffers

            + 19.90% mdreadv

         + 1.85% StartReadBuffer

           0.75% ZeroBuffer

      + 30.82% log_newpage_buffer

Patched version spends less CPU time in read calls and more CPU time in
other calls such as write.

There are three patch files attached. First two are optimization and adding
a way to create a read stream object by using SMgrRelation, these are
already proposed in the streaming I/O thread [3]. The third one is the
actual patch file.

Any kind of feedback would be appreciated.

[1] CREATE TABLE t as select repeat('a', 100) || i || repeat('b', 500) as
filler from generate_series(1, 9000000) as i;
[2] CREATE DATABASE test_1 STRATEGY 'wal_log' TEMPLATE test;
[3]
https://www.postgresql.org/message-id/CAN55FZ1yGvCzCW_aufu83VimdEYHbG_zuOY3J9JL-nBptyJyKA%40mail.gmail.com

-- 
Regards,
Nazir Bilal Yavuz
Microsoft
From edcfacec40a70a8747c5f18777b6c28b0fccba7a Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavu...@gmail.com>
Date: Sun, 7 Apr 2024 22:33:36 +0300
Subject: [PATCH v1 1/3] Refactor PinBufferForBlock() to remove if checks about
 persistence

There are if checks in PinBufferForBlock() function to set persistence
of the relation and this function is called for the each block in the
relation. Instead of that, set persistence of the relation before
PinBufferForBlock() function.
---
 src/backend/storage/aio/read_stream.c |  2 +-
 src/backend/storage/buffer/bufmgr.c   | 31 +++++++++++----------------
 2 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index f54dacdd914..d155dde5ce3 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -549,7 +549,7 @@ read_stream_begin_relation(int flags,
 	{
 		stream->ios[i].op.rel = rel;
 		stream->ios[i].op.smgr = RelationGetSmgr(rel);
-		stream->ios[i].op.smgr_persistence = 0;
+		stream->ios[i].op.smgr_persistence = rel->rd_rel->relpersistence;
 		stream->ios[i].op.forknum = forknum;
 		stream->ios[i].op.strategy = strategy;
 	}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 901b7230fb9..17cb7127f99 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1067,24 +1067,10 @@ PinBufferForBlock(Relation rel,
 	BufferDesc *bufHdr;
 	IOContext	io_context;
 	IOObject	io_object;
-	char		persistence;
 
 	Assert(blockNum != P_NEW);
 
-	/*
-	 * If there is no Relation it usually implies recovery and thus permanent,
-	 * but we take an argmument because CreateAndCopyRelationData can reach us
-	 * with only an SMgrRelation for an unlogged relation that we don't want
-	 * to flag with BM_PERMANENT.
-	 */
-	if (rel)
-		persistence = rel->rd_rel->relpersistence;
-	else if (smgr_persistence == 0)
-		persistence = RELPERSISTENCE_PERMANENT;
-	else
-		persistence = smgr_persistence;
-
-	if (persistence == RELPERSISTENCE_TEMP)
+	if (smgr_persistence == RELPERSISTENCE_TEMP)
 	{
 		io_context = IOCONTEXT_NORMAL;
 		io_object = IOOBJECT_TEMP_RELATION;
@@ -1101,7 +1087,7 @@ PinBufferForBlock(Relation rel,
 									   smgr->smgr_rlocator.locator.relNumber,
 									   smgr->smgr_rlocator.backend);
 
-	if (persistence == RELPERSISTENCE_TEMP)
+	if (smgr_persistence == RELPERSISTENCE_TEMP)
 	{
 		bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
 		if (*foundPtr)
@@ -1109,7 +1095,7 @@ PinBufferForBlock(Relation rel,
 	}
 	else
 	{
-		bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
+		bufHdr = BufferAlloc(smgr, smgr_persistence, forkNum, blockNum,
 							 strategy, foundPtr, io_context);
 		if (*foundPtr)
 			pgBufferUsage.shared_blks_hit++;
@@ -1157,6 +1143,7 @@ ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
 	ReadBuffersOperation operation;
 	Buffer		buffer;
 	int			flags;
+	char		persistence;
 
 	/*
 	 * Backward compatibility path, most code should use ExtendBufferedRel()
@@ -1195,7 +1182,15 @@ ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
 		flags = 0;
 	operation.smgr = smgr;
 	operation.rel = rel;
-	operation.smgr_persistence = smgr_persistence;
+
+	if (rel)
+		persistence = rel->rd_rel->relpersistence;
+	else if (smgr_persistence == 0)
+		persistence = RELPERSISTENCE_PERMANENT;
+	else
+		persistence = smgr_persistence;
+	operation.smgr_persistence = persistence;
+
 	operation.forknum = forkNum;
 	operation.strategy = strategy;
 	if (StartReadBuffer(&operation,
-- 
2.43.0

From 2c6947231ba6377f291a1bd7c9cee55e5833fdbf Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavu...@gmail.com>
Date: Mon, 8 Apr 2024 00:14:52 +0300
Subject: [PATCH v1 2/3] Add a way to create read stream object by using
 SMgrRelation

Currently read stream object can be created only by using Relation,
there is no way to create it by using SMgrRelation.

To achieve that, read_stream_begin_impl() function is created and
contents of the read_stream_begin_relation() is moved into this
function.

Both read_stream_begin_relation() and read_stream_begin_smgr_relation()
calls read_stream_begin_impl() by Relation and SMgrRelation
respectively.
---
 src/include/storage/read_stream.h     |  9 +++
 src/backend/storage/aio/read_stream.c | 83 ++++++++++++++++++++++-----
 2 files changed, 78 insertions(+), 14 deletions(-)

diff --git a/src/include/storage/read_stream.h b/src/include/storage/read_stream.h
index fae09d2b4cc..c7d688d8c9c 100644
--- a/src/include/storage/read_stream.h
+++ b/src/include/storage/read_stream.h
@@ -15,6 +15,7 @@
 #define READ_STREAM_H
 
 #include "storage/bufmgr.h"
+#include "storage/smgr.h"
 
 /* Default tuning, reasonable for many users. */
 #define READ_STREAM_DEFAULT 0x00
@@ -56,6 +57,14 @@ extern ReadStream *read_stream_begin_relation(int flags,
 											  ReadStreamBlockNumberCB callback,
 											  void *callback_private_data,
 											  size_t per_buffer_data_size);
+extern ReadStream *read_stream_begin_smgr_relation(int flags,
+												   BufferAccessStrategy strategy,
+												   SMgrRelation smgr,
+												   char smgr_persistence,
+												   ForkNumber forknum,
+												   ReadStreamBlockNumberCB callback,
+												   void *callback_private_data,
+												   size_t per_buffer_data_size);
 extern Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_private);
 extern void read_stream_reset(ReadStream *stream);
 extern void read_stream_end(ReadStream *stream);
diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index d155dde5ce3..8dd100f016c 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -406,14 +406,16 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice)
  * write extra data for each block into the space provided to it.  It will
  * also receive callback_private_data for its own purposes.
  */
-ReadStream *
-read_stream_begin_relation(int flags,
-						   BufferAccessStrategy strategy,
-						   Relation rel,
-						   ForkNumber forknum,
-						   ReadStreamBlockNumberCB callback,
-						   void *callback_private_data,
-						   size_t per_buffer_data_size)
+static ReadStream *
+read_stream_begin_impl(int flags,
+					   BufferAccessStrategy strategy,
+					   Relation rel,
+					   SMgrRelation smgr,
+					   char smgr_persistence,
+					   ForkNumber forknum,
+					   ReadStreamBlockNumberCB callback,
+					   void *callback_private_data,
+					   size_t per_buffer_data_size)
 {
 	ReadStream *stream;
 	size_t		size;
@@ -422,9 +424,6 @@ read_stream_begin_relation(int flags,
 	int			strategy_pin_limit;
 	uint32		max_pinned_buffers;
 	Oid			tablespace_id;
-	SMgrRelation smgr;
-
-	smgr = RelationGetSmgr(rel);
 
 	/*
 	 * Decide how many I/Os we will allow to run at the same time.  That
@@ -434,7 +433,7 @@ read_stream_begin_relation(int flags,
 	 */
 	tablespace_id = smgr->smgr_rlocator.locator.spcOid;
 	if (!OidIsValid(MyDatabaseId) ||
-		IsCatalogRelation(rel) ||
+		(rel && IsCatalogRelation(rel)) ||
 		IsCatalogRelationOid(smgr->smgr_rlocator.locator.relNumber))
 	{
 		/*
@@ -548,8 +547,8 @@ read_stream_begin_relation(int flags,
 	for (int i = 0; i < max_ios; ++i)
 	{
 		stream->ios[i].op.rel = rel;
-		stream->ios[i].op.smgr = RelationGetSmgr(rel);
-		stream->ios[i].op.smgr_persistence = rel->rd_rel->relpersistence;
+		stream->ios[i].op.smgr = smgr;
+		stream->ios[i].op.smgr_persistence = smgr_persistence;
 		stream->ios[i].op.forknum = forknum;
 		stream->ios[i].op.strategy = strategy;
 	}
@@ -557,6 +556,62 @@ read_stream_begin_relation(int flags,
 	return stream;
 }
 
+/*
+ * Create a new read stream for reading a relation.
+ * See read_stream_begin_impl() for the detailed explanation.
+ */
+ReadStream *
+read_stream_begin_relation(int flags,
+						   BufferAccessStrategy strategy,
+						   Relation rel,
+						   ForkNumber forknum,
+						   ReadStreamBlockNumberCB callback,
+						   void *callback_private_data,
+						   size_t per_buffer_data_size)
+{
+	return read_stream_begin_impl(flags,
+								  strategy,
+								  rel,
+								  RelationGetSmgr(rel),
+								  rel->rd_rel->relpersistence,
+								  forknum,
+								  callback,
+								  callback_private_data,
+								  per_buffer_data_size);
+}
+
+/*
+ * Create a new read stream for reading a SMgr relation.
+ * See read_stream_begin_impl() for the detailed explanation.
+ */
+ReadStream *
+read_stream_begin_smgr_relation(int flags,
+								BufferAccessStrategy strategy,
+								SMgrRelation smgr,
+								char smgr_persistence,
+								ForkNumber forknum,
+								ReadStreamBlockNumberCB callback,
+								void *callback_private_data,
+								size_t per_buffer_data_size)
+{
+	char		persistence;
+
+	if (smgr_persistence == 0)
+		persistence = RELPERSISTENCE_PERMANENT;
+	else
+		persistence = smgr_persistence;
+
+	return read_stream_begin_impl(flags,
+								  strategy,
+								  NULL,
+								  smgr,
+								  persistence,
+								  forknum,
+								  callback,
+								  callback_private_data,
+								  per_buffer_data_size);
+}
+
 /*
  * Pull one pinned buffer out of a stream.  Each call returns successive
  * blocks in the order specified by the callback.  If per_buffer_data_size was
-- 
2.43.0

From b97e1f39ef53c331090597aaf2d38d66a9b49e89 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavu...@gmail.com>
Date: Tue, 16 Apr 2024 11:36:49 +0300
Subject: [PATCH v1 3/3] Use read streams in CREATE DATABASE when strategy is
 wal_log

CREATE DABASE command uses RelationCopyStorageUsingBuffer() function to
copy buffers when the strategy is wal_log. This function reads source
buffers then copies them to the destination buffers. Read streams are
used only only while reading source buffers because the destination
buffer is read by 'RBM_ZERO_AND_LOCK' option, so it is not important.
---
 src/backend/storage/buffer/bufmgr.c | 53 ++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 4 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 17cb7127f99..e02b699f6cf 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -54,6 +54,7 @@
 #include "storage/ipc.h"
 #include "storage/lmgr.h"
 #include "storage/proc.h"
+#include "storage/read_stream.h"
 #include "storage/smgr.h"
 #include "storage/standby.h"
 #include "utils/memdebug.h"
@@ -135,6 +136,33 @@ typedef struct SMgrSortArray
 	SMgrRelation srel;
 } SMgrSortArray;
 
+/*
+ * Helper struct for read stream object used in
+ * RelationCopyStorageUsingBuffer() function.
+ */
+struct copy_storage_using_buffer_read_stream_private
+{
+	BlockNumber blocknum;
+	int64		last_block;
+};
+
+/*
+ * Callback function to get next block for read stream object used in
+ * RelationCopyStorageUsingBuffer() function.
+ */
+static BlockNumber
+copy_storage_using_buffer_read_stream_next_block(ReadStream *stream,
+												 void *callback_private_data,
+												 void *per_buffer_data)
+{
+	struct copy_storage_using_buffer_read_stream_private *p = callback_private_data;
+
+	if (p->blocknum < p->last_block)
+		return p->blocknum++;
+
+	return InvalidBlockNumber;
+}
+
 /* GUC variables */
 bool		zero_damaged_pages = false;
 int			bgwriter_lru_maxpages = 100;
@@ -4639,6 +4667,9 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
 	PGIOAlignedBlock buf;
 	BufferAccessStrategy bstrategy_src;
 	BufferAccessStrategy bstrategy_dst;
+	struct copy_storage_using_buffer_read_stream_private p;
+	ReadStream *src_stream;
+	SMgrRelation src_smgr;
 
 	/*
 	 * In general, we want to write WAL whenever wal_level > 'minimal', but we
@@ -4667,19 +4698,31 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
 	bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
 	bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
 
+	/* Initalize streaming read */
+	p.blocknum = 0;
+	p.last_block = nblocks;
+	src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
+	src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL,
+												 bstrategy_src,
+												 src_smgr,
+												 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
+												 forkNum,
+												 copy_storage_using_buffer_read_stream_next_block,
+												 &p,
+												 0);
+
 	/* Iterate over each block of the source relation file. */
 	for (blkno = 0; blkno < nblocks; blkno++)
 	{
 		CHECK_FOR_INTERRUPTS();
 
 		/* Read block from source relation. */
-		srcBuf = ReadBufferWithoutRelcache(srclocator, forkNum, blkno,
-										   RBM_NORMAL, bstrategy_src,
-										   permanent);
+		srcBuf = read_stream_next_buffer(src_stream, NULL);
 		LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
 		srcPage = BufferGetPage(srcBuf);
 
-		dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum, blkno,
+		dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
+										   BufferGetBlockNumber(srcBuf),
 										   RBM_ZERO_AND_LOCK, bstrategy_dst,
 										   permanent);
 		dstPage = BufferGetPage(dstBuf);
@@ -4699,6 +4742,8 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
 		UnlockReleaseBuffer(dstBuf);
 		UnlockReleaseBuffer(srcBuf);
 	}
+	Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
+	read_stream_end(src_stream);
 
 	FreeAccessStrategy(bstrategy_src);
 	FreeAccessStrategy(bstrategy_dst);
-- 
2.43.0

Reply via email to