Hi, I am working on using read streams in the CREATE DATABASE command when the strategy is wal_log. RelationCopyStorageUsingBuffer() function is used in this context. This function reads source buffers then copies them to the destination buffers. I used read streams only when reading source buffers because the destination buffers are read by 'RBM_ZERO_AND_LOCK' option, so it is not important.
I created a ~6 GB table [1] and created a new database with the wal_log strategy using the database that table was created in as a template [2]. My benchmarking results are: a. Timings: patched: 12955.027 ms 12917.475 ms 13177.846 ms 12971.308 ms 13059.985 ms master: 13156.375 ms 13054.071 ms 13151.607 ms 13152.633 ms 13160.538 ms There is no difference in timings, the patched version is a tiny bit better but it is negligible. I actually expected the patched version to be better because there was no prefetching before, but the read stream API detects sequential access and disables prefetching. b. strace: patched: % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 68.02 3.749359 2 1285782 pwrite64 18.54 1.021734 21 46730 preadv 9.49 0.522889 826 633 fdatasync 2.55 0.140339 59 2368 pwritev 1.14 0.062583 409 153 fsync master: % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 59.71 3.831542 2 1288365 pwrite64 29.84 1.914540 2 747936 pread64 7.90 0.506843 837 605 fdatasync 1.58 0.101575 54 1856 pwritev 0.75 0.048431 400 121 fsync There are fewer (~1/16) read system calls in the patched version. c. perf: patched: - 97.83% 1.13% postgres postgres [.] RelationCopyStorageUsingBuffer - 97.83% RelationCopyStorageUsingBuffer - 44.28% ReadBufferWithoutRelcache + 42.20% GetVictimBuffer 0.81% ZeroBuffer + 31.86% log_newpage_buffer - 19.51% read_stream_next_buffer - 17.92% WaitReadBuffers + 17.61% mdreadv - 1.47% read_stream_start_pending_read + 1.46% StartReadBuffers master: - 97.68% 0.57% postgres postgres [.] RelationCopyStorageUsingBuffer - RelationCopyStorageUsingBuffer - 65.48% ReadBufferWithoutRelcache + 41.16% GetVictimBuffer - 20.42% WaitReadBuffers + 19.90% mdreadv + 1.85% StartReadBuffer 0.75% ZeroBuffer + 30.82% log_newpage_buffer Patched version spends less CPU time in read calls and more CPU time in other calls such as write. There are three patch files attached. First two are optimization and adding a way to create a read stream object by using SMgrRelation, these are already proposed in the streaming I/O thread [3]. The third one is the actual patch file. Any kind of feedback would be appreciated. [1] CREATE TABLE t as select repeat('a', 100) || i || repeat('b', 500) as filler from generate_series(1, 9000000) as i; [2] CREATE DATABASE test_1 STRATEGY 'wal_log' TEMPLATE test; [3] https://www.postgresql.org/message-id/CAN55FZ1yGvCzCW_aufu83VimdEYHbG_zuOY3J9JL-nBptyJyKA%40mail.gmail.com -- Regards, Nazir Bilal Yavuz Microsoft
From edcfacec40a70a8747c5f18777b6c28b0fccba7a Mon Sep 17 00:00:00 2001 From: Nazir Bilal Yavuz <byavu...@gmail.com> Date: Sun, 7 Apr 2024 22:33:36 +0300 Subject: [PATCH v1 1/3] Refactor PinBufferForBlock() to remove if checks about persistence There are if checks in PinBufferForBlock() function to set persistence of the relation and this function is called for the each block in the relation. Instead of that, set persistence of the relation before PinBufferForBlock() function. --- src/backend/storage/aio/read_stream.c | 2 +- src/backend/storage/buffer/bufmgr.c | 31 +++++++++++---------------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index f54dacdd914..d155dde5ce3 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -549,7 +549,7 @@ read_stream_begin_relation(int flags, { stream->ios[i].op.rel = rel; stream->ios[i].op.smgr = RelationGetSmgr(rel); - stream->ios[i].op.smgr_persistence = 0; + stream->ios[i].op.smgr_persistence = rel->rd_rel->relpersistence; stream->ios[i].op.forknum = forknum; stream->ios[i].op.strategy = strategy; } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 901b7230fb9..17cb7127f99 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1067,24 +1067,10 @@ PinBufferForBlock(Relation rel, BufferDesc *bufHdr; IOContext io_context; IOObject io_object; - char persistence; Assert(blockNum != P_NEW); - /* - * If there is no Relation it usually implies recovery and thus permanent, - * but we take an argmument because CreateAndCopyRelationData can reach us - * with only an SMgrRelation for an unlogged relation that we don't want - * to flag with BM_PERMANENT. - */ - if (rel) - persistence = rel->rd_rel->relpersistence; - else if (smgr_persistence == 0) - persistence = RELPERSISTENCE_PERMANENT; - else - persistence = smgr_persistence; - - if (persistence == RELPERSISTENCE_TEMP) + if (smgr_persistence == RELPERSISTENCE_TEMP) { io_context = IOCONTEXT_NORMAL; io_object = IOOBJECT_TEMP_RELATION; @@ -1101,7 +1087,7 @@ PinBufferForBlock(Relation rel, smgr->smgr_rlocator.locator.relNumber, smgr->smgr_rlocator.backend); - if (persistence == RELPERSISTENCE_TEMP) + if (smgr_persistence == RELPERSISTENCE_TEMP) { bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr); if (*foundPtr) @@ -1109,7 +1095,7 @@ PinBufferForBlock(Relation rel, } else { - bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum, + bufHdr = BufferAlloc(smgr, smgr_persistence, forkNum, blockNum, strategy, foundPtr, io_context); if (*foundPtr) pgBufferUsage.shared_blks_hit++; @@ -1157,6 +1143,7 @@ ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ReadBuffersOperation operation; Buffer buffer; int flags; + char persistence; /* * Backward compatibility path, most code should use ExtendBufferedRel() @@ -1195,7 +1182,15 @@ ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, flags = 0; operation.smgr = smgr; operation.rel = rel; - operation.smgr_persistence = smgr_persistence; + + if (rel) + persistence = rel->rd_rel->relpersistence; + else if (smgr_persistence == 0) + persistence = RELPERSISTENCE_PERMANENT; + else + persistence = smgr_persistence; + operation.smgr_persistence = persistence; + operation.forknum = forkNum; operation.strategy = strategy; if (StartReadBuffer(&operation, -- 2.43.0
From 2c6947231ba6377f291a1bd7c9cee55e5833fdbf Mon Sep 17 00:00:00 2001 From: Nazir Bilal Yavuz <byavu...@gmail.com> Date: Mon, 8 Apr 2024 00:14:52 +0300 Subject: [PATCH v1 2/3] Add a way to create read stream object by using SMgrRelation Currently read stream object can be created only by using Relation, there is no way to create it by using SMgrRelation. To achieve that, read_stream_begin_impl() function is created and contents of the read_stream_begin_relation() is moved into this function. Both read_stream_begin_relation() and read_stream_begin_smgr_relation() calls read_stream_begin_impl() by Relation and SMgrRelation respectively. --- src/include/storage/read_stream.h | 9 +++ src/backend/storage/aio/read_stream.c | 83 ++++++++++++++++++++++----- 2 files changed, 78 insertions(+), 14 deletions(-) diff --git a/src/include/storage/read_stream.h b/src/include/storage/read_stream.h index fae09d2b4cc..c7d688d8c9c 100644 --- a/src/include/storage/read_stream.h +++ b/src/include/storage/read_stream.h @@ -15,6 +15,7 @@ #define READ_STREAM_H #include "storage/bufmgr.h" +#include "storage/smgr.h" /* Default tuning, reasonable for many users. */ #define READ_STREAM_DEFAULT 0x00 @@ -56,6 +57,14 @@ extern ReadStream *read_stream_begin_relation(int flags, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size); +extern ReadStream *read_stream_begin_smgr_relation(int flags, + BufferAccessStrategy strategy, + SMgrRelation smgr, + char smgr_persistence, + ForkNumber forknum, + ReadStreamBlockNumberCB callback, + void *callback_private_data, + size_t per_buffer_data_size); extern Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_private); extern void read_stream_reset(ReadStream *stream); extern void read_stream_end(ReadStream *stream); diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index d155dde5ce3..8dd100f016c 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -406,14 +406,16 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice) * write extra data for each block into the space provided to it. It will * also receive callback_private_data for its own purposes. */ -ReadStream * -read_stream_begin_relation(int flags, - BufferAccessStrategy strategy, - Relation rel, - ForkNumber forknum, - ReadStreamBlockNumberCB callback, - void *callback_private_data, - size_t per_buffer_data_size) +static ReadStream * +read_stream_begin_impl(int flags, + BufferAccessStrategy strategy, + Relation rel, + SMgrRelation smgr, + char smgr_persistence, + ForkNumber forknum, + ReadStreamBlockNumberCB callback, + void *callback_private_data, + size_t per_buffer_data_size) { ReadStream *stream; size_t size; @@ -422,9 +424,6 @@ read_stream_begin_relation(int flags, int strategy_pin_limit; uint32 max_pinned_buffers; Oid tablespace_id; - SMgrRelation smgr; - - smgr = RelationGetSmgr(rel); /* * Decide how many I/Os we will allow to run at the same time. That @@ -434,7 +433,7 @@ read_stream_begin_relation(int flags, */ tablespace_id = smgr->smgr_rlocator.locator.spcOid; if (!OidIsValid(MyDatabaseId) || - IsCatalogRelation(rel) || + (rel && IsCatalogRelation(rel)) || IsCatalogRelationOid(smgr->smgr_rlocator.locator.relNumber)) { /* @@ -548,8 +547,8 @@ read_stream_begin_relation(int flags, for (int i = 0; i < max_ios; ++i) { stream->ios[i].op.rel = rel; - stream->ios[i].op.smgr = RelationGetSmgr(rel); - stream->ios[i].op.smgr_persistence = rel->rd_rel->relpersistence; + stream->ios[i].op.smgr = smgr; + stream->ios[i].op.smgr_persistence = smgr_persistence; stream->ios[i].op.forknum = forknum; stream->ios[i].op.strategy = strategy; } @@ -557,6 +556,62 @@ read_stream_begin_relation(int flags, return stream; } +/* + * Create a new read stream for reading a relation. + * See read_stream_begin_impl() for the detailed explanation. + */ +ReadStream * +read_stream_begin_relation(int flags, + BufferAccessStrategy strategy, + Relation rel, + ForkNumber forknum, + ReadStreamBlockNumberCB callback, + void *callback_private_data, + size_t per_buffer_data_size) +{ + return read_stream_begin_impl(flags, + strategy, + rel, + RelationGetSmgr(rel), + rel->rd_rel->relpersistence, + forknum, + callback, + callback_private_data, + per_buffer_data_size); +} + +/* + * Create a new read stream for reading a SMgr relation. + * See read_stream_begin_impl() for the detailed explanation. + */ +ReadStream * +read_stream_begin_smgr_relation(int flags, + BufferAccessStrategy strategy, + SMgrRelation smgr, + char smgr_persistence, + ForkNumber forknum, + ReadStreamBlockNumberCB callback, + void *callback_private_data, + size_t per_buffer_data_size) +{ + char persistence; + + if (smgr_persistence == 0) + persistence = RELPERSISTENCE_PERMANENT; + else + persistence = smgr_persistence; + + return read_stream_begin_impl(flags, + strategy, + NULL, + smgr, + persistence, + forknum, + callback, + callback_private_data, + per_buffer_data_size); +} + /* * Pull one pinned buffer out of a stream. Each call returns successive * blocks in the order specified by the callback. If per_buffer_data_size was -- 2.43.0
From b97e1f39ef53c331090597aaf2d38d66a9b49e89 Mon Sep 17 00:00:00 2001 From: Nazir Bilal Yavuz <byavu...@gmail.com> Date: Tue, 16 Apr 2024 11:36:49 +0300 Subject: [PATCH v1 3/3] Use read streams in CREATE DATABASE when strategy is wal_log CREATE DABASE command uses RelationCopyStorageUsingBuffer() function to copy buffers when the strategy is wal_log. This function reads source buffers then copies them to the destination buffers. Read streams are used only only while reading source buffers because the destination buffer is read by 'RBM_ZERO_AND_LOCK' option, so it is not important. --- src/backend/storage/buffer/bufmgr.c | 53 ++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 17cb7127f99..e02b699f6cf 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -54,6 +54,7 @@ #include "storage/ipc.h" #include "storage/lmgr.h" #include "storage/proc.h" +#include "storage/read_stream.h" #include "storage/smgr.h" #include "storage/standby.h" #include "utils/memdebug.h" @@ -135,6 +136,33 @@ typedef struct SMgrSortArray SMgrRelation srel; } SMgrSortArray; +/* + * Helper struct for read stream object used in + * RelationCopyStorageUsingBuffer() function. + */ +struct copy_storage_using_buffer_read_stream_private +{ + BlockNumber blocknum; + int64 last_block; +}; + +/* + * Callback function to get next block for read stream object used in + * RelationCopyStorageUsingBuffer() function. + */ +static BlockNumber +copy_storage_using_buffer_read_stream_next_block(ReadStream *stream, + void *callback_private_data, + void *per_buffer_data) +{ + struct copy_storage_using_buffer_read_stream_private *p = callback_private_data; + + if (p->blocknum < p->last_block) + return p->blocknum++; + + return InvalidBlockNumber; +} + /* GUC variables */ bool zero_damaged_pages = false; int bgwriter_lru_maxpages = 100; @@ -4639,6 +4667,9 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, PGIOAlignedBlock buf; BufferAccessStrategy bstrategy_src; BufferAccessStrategy bstrategy_dst; + struct copy_storage_using_buffer_read_stream_private p; + ReadStream *src_stream; + SMgrRelation src_smgr; /* * In general, we want to write WAL whenever wal_level > 'minimal', but we @@ -4667,19 +4698,31 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, bstrategy_src = GetAccessStrategy(BAS_BULKREAD); bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE); + /* Initalize streaming read */ + p.blocknum = 0; + p.last_block = nblocks; + src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER); + src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL, + bstrategy_src, + src_smgr, + permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED, + forkNum, + copy_storage_using_buffer_read_stream_next_block, + &p, + 0); + /* Iterate over each block of the source relation file. */ for (blkno = 0; blkno < nblocks; blkno++) { CHECK_FOR_INTERRUPTS(); /* Read block from source relation. */ - srcBuf = ReadBufferWithoutRelcache(srclocator, forkNum, blkno, - RBM_NORMAL, bstrategy_src, - permanent); + srcBuf = read_stream_next_buffer(src_stream, NULL); LockBuffer(srcBuf, BUFFER_LOCK_SHARE); srcPage = BufferGetPage(srcBuf); - dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum, blkno, + dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum, + BufferGetBlockNumber(srcBuf), RBM_ZERO_AND_LOCK, bstrategy_dst, permanent); dstPage = BufferGetPage(dstBuf); @@ -4699,6 +4742,8 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, UnlockReleaseBuffer(dstBuf); UnlockReleaseBuffer(srcBuf); } + Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer); + read_stream_end(src_stream); FreeAccessStrategy(bstrategy_src); FreeAccessStrategy(bstrategy_dst); -- 2.43.0