On Wed, Feb 15, 2023 at 6:25 AM Andres Freund <and...@anarazel.de> wrote: > > > Done, PSA v2 patch. > > This feels way too complicated to me. How about something more like the > attached?
Thanks. I kind of did cost analysis of v2 and v3: Input: zero-fill a file of size 256*8K. v2 patch: iovec initialization with zerobuf for loop - 1 time zero-fill 32 blocks at once - 8 times stack memory - sizeof(PGAlignedBlock) + sizeof(struct iovec) * PG_IOV_MAX v3 patch: iovec initialization with zerobuf for loop - 8 times (7 times more than v2 patch) zero-fill 32 blocks at once - 8 times (no change from v2 patch) stack memory - sizeof(PGAlignedBlock) + sizeof(struct iovec) * PG_IOV_MAX (no change from v2 patch) The v3 patch reduces initialization of iovec array elements which is a clear win when pg_pwrite_zeros is called for sizes less than BLCKSZ many times (I assume this is what is needed for the relation extension lock improvements feature). However, it increases the number of iovec initialization with zerobuf for the cases when pg_pwrite_zeros is called for sizes far greater than BLCKSZ (for instance, WAL file initialization). FWIW, I attached v4 patch, a simplified version of the v2 - it initializes all the iovec array elements if the total blocks to be written crosses lengthof(iovec array), otherwise it initializes only the needed blocks. -- Bharath Rupireddy PostgreSQL Contributors Team RDS Open Source Databases Amazon Web Services: https://aws.amazon.com
From a941ea3de79cbe890196fa16f60d1e0797a9b119 Mon Sep 17 00:00:00 2001 From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com> Date: Wed, 15 Feb 2023 05:17:22 +0000 Subject: [PATCH v4] Refactor pg_pwrite_zeros() This commit refactors pg_pwrite_zeros() as follows: - add offset parameter - avoid memset() of zerobuf on every call - don't initialize the whole IOV array unnecessarily - don't handle the smaller trailing write in a separate write call Suggested-by: Andres Freund Author: Bharath Rupireddy Discussion: https://www.postgresql.org/message-id/20230211224424.r25uw4rsv6taukxk%40awork3.anarazel.d --- src/backend/access/transam/xlog.c | 2 +- src/bin/pg_basebackup/walmethods.c | 2 +- src/common/file_utils.c | 102 +++++++++++++++-------------- src/include/common/file_utils.h | 2 +- 4 files changed, 55 insertions(+), 53 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index f9f0f6db8d..786b26054c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2981,7 +2981,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, * indirect blocks are down on disk. Therefore, fdatasync(2) or * O_DSYNC will be sufficient to sync future writes to the log file. */ - rc = pg_pwrite_zeros(fd, wal_segment_size); + rc = pg_pwrite_zeros(fd, wal_segment_size, 0); if (rc < 0) save_errno = errno; diff --git a/src/bin/pg_basebackup/walmethods.c b/src/bin/pg_basebackup/walmethods.c index 54014e2b84..6d14b988cb 100644 --- a/src/bin/pg_basebackup/walmethods.c +++ b/src/bin/pg_basebackup/walmethods.c @@ -222,7 +222,7 @@ dir_open_for_write(WalWriteMethod *wwmethod, const char *pathname, { ssize_t rc; - rc = pg_pwrite_zeros(fd, pad_to_size); + rc = pg_pwrite_zeros(fd, pad_to_size, 0); if (rc < 0) { diff --git a/src/common/file_utils.c b/src/common/file_utils.c index 4dae534152..303bb07543 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -531,72 +531,74 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) /* * pg_pwrite_zeros * - * Writes zeros to file worth "size" bytes, using vectored I/O. + * Writes zeros to file worth "size" bytes from starting file offset 'offset', + * using vectored I/O. * * Returns the total amount of data written. On failure, a negative value * is returned with errno set. + * + * Note that pg_pwrite(), which gets called from this function, may change the + * file position (see win32pwrite.c). If needed, callers must deal with it. */ ssize_t -pg_pwrite_zeros(int fd, size_t size) +pg_pwrite_zeros(int fd, size_t size, off_t offset) { - PGAlignedBlock zbuffer; /* worth BLCKSZ */ - size_t zbuffer_sz; - struct iovec iov[PG_IOV_MAX]; - int blocks; - size_t remaining_size = 0; - int i; - ssize_t written; - ssize_t total_written = 0; - - zbuffer_sz = sizeof(zbuffer.data); - - /* Zero-fill the buffer. */ - memset(zbuffer.data, 0, zbuffer_sz); - - /* Prepare to write out a lot of copies of our zero buffer at once. */ - for (i = 0; i < lengthof(iov); ++i) + /* + * Zero-filled buffer worth BLCKSZ. We use static variable to avoid zeroing + * the buffer on every call. Also, with the const qualifier, it is ensured + * that the buffer is placed in read-only section of the process's memory. + */ + static const PGAlignedBlock zbuf = {0}; + static size_t zbuf_sz = BLCKSZ; + struct iovec iov[PG_IOV_MAX]; + int blocks; + int init_upto; + int i; + ssize_t ntotalwritten = 0; + size_t nremaining; + size_t nlast; + + blocks = size / zbuf_sz; + nremaining = size; + nlast = size % zbuf_sz; + + /* Initialize only the needed iovec elements with zero buffer. */ + init_upto = (blocks == 0) ? 1 : ((blocks >= lengthof(iov)) ? lengthof(iov) : blocks); + for (i = 0; i < init_upto; ++i) { - iov[i].iov_base = zbuffer.data; - iov[i].iov_len = zbuffer_sz; + iov[i].iov_base = (void *) (unconstify(PGAlignedBlock *, &zbuf)->data); + iov[i].iov_len = zbuf_sz; } - /* Loop, writing as many blocks as we can for each system call. */ - blocks = size / zbuffer_sz; - remaining_size = size % zbuffer_sz; - for (i = 0; i < blocks;) - { - int iovcnt = Min(blocks - i, lengthof(iov)); - off_t offset = i * zbuffer_sz; - - written = pg_pwritev_with_retry(fd, iov, iovcnt, offset); - - if (written < 0) - return written; - - i += iovcnt; - total_written += written; - } + i = 0; - /* Now, write the remaining size, if any, of the file with zeros. */ - if (remaining_size > 0) + /* Loop, writing as many blocks as we can for each system call. */ + while (nremaining > 0) { - /* We'll never write more than one block here */ - int iovcnt = 1; - - /* Jump on to the end of previously written blocks */ - off_t offset = i * zbuffer_sz; + int iovcnt; + ssize_t nwritten; - iov[0].iov_len = remaining_size; + /* Determine if we need to write last bytes (< zbuf_sz) */ + if (nlast != 0 && nlast == nremaining) + { + iov[0].iov_len = nlast; + iovcnt = 1; /* We will write not more than one block */ + } + else + iovcnt = Min(blocks - i, lengthof(iov)); - written = pg_pwritev_with_retry(fd, iov, iovcnt, offset); + nwritten = pg_pwritev_with_retry(fd, iov, iovcnt, offset); - if (written < 0) - return written; + if (nwritten < 0) + return nwritten; - total_written += written; + i += iovcnt; + ntotalwritten += nwritten; + nremaining -= nwritten; + offset += nwritten; } - Assert(total_written == size); + Assert(ntotalwritten == size); - return total_written; + return ntotalwritten; } diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index bda6d3a541..b7efa1226d 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -44,6 +44,6 @@ extern ssize_t pg_pwritev_with_retry(int fd, int iovcnt, off_t offset); -extern ssize_t pg_pwrite_zeros(int fd, size_t size); +extern ssize_t pg_pwrite_zeros(int fd, size_t size, off_t offset); #endif /* FILE_UTILS_H */ -- 2.34.1