Hi,

On 2023-02-14 18:00:00 +0530, Bharath Rupireddy wrote:
> On Mon, Feb 13, 2023 at 11:09 PM Andres Freund <and...@anarazel.de> wrote:
> >
> > > Later code assigns iov[0].iov_len thus we need to provide a separate
> > > iov non-const variable, or can we use pwrite instead there?  (I didn't
> > > find pg_pwrite_with_retry(), though)
> >
> > Given that we need to do that, and given that we already need to loop to
> > handle writes that are longer than PG_IOV_MAX * BLCKSZ, it's probably not
> > worth avoiding iov initialization.
> >
> > But I think it's worth limiting the initialization to blocks.
>
> We can still optimize away the for loop by using a single iovec for
> remaining size, like the attached v2 patch.
>
> > I'd also try to combine the first pg_writev_* with the second one.
>
> Done, PSA v2 patch.

This feels way too complicated to me.  How about something more like the
attached?


> 2) A small test module passing in a file with the size to write isn't
> multiple of block size, meaning, the code we have in the function to
> write last remaining bytes (less than BLCKSZ) gets covered which isn't
> covered right now -

FWIW, I tested this locally by just specifying a smaller size than BLCKSZ for
the write size.

Greetings,

Andres Freund
>From ba0a8697c639870b76e92a285644a87d36cf4496 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 13 Feb 2023 17:11:29 -0800
Subject: [PATCH v3 01/12] Revise pg_pwrite_zeros()

- add offset parameter
- avoid memset() of zerobuf on every call
- don't initialize the whole IOV array unnecessarily
- don't handle the smaller trailing write in a separate write call
---
 src/include/common/file_utils.h    |  2 +-
 src/common/file_utils.c            | 67 ++++++++++--------------------
 src/backend/access/transam/xlog.c  |  2 +-
 src/bin/pg_basebackup/walmethods.c |  2 +-
 4 files changed, 26 insertions(+), 47 deletions(-)

diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h
index bda6d3a5413..95b5cc8b3a1 100644
--- a/src/include/common/file_utils.h
+++ b/src/include/common/file_utils.h
@@ -44,6 +44,6 @@ extern ssize_t pg_pwritev_with_retry(int fd,
 									 int iovcnt,
 									 off_t offset);
 
-extern ssize_t pg_pwrite_zeros(int fd, size_t size);
+extern ssize_t pg_pwrite_zeros(int fd, off_t offset, size_t size);
 
 #endif							/* FILE_UTILS_H */
diff --git a/src/common/file_utils.c b/src/common/file_utils.c
index 4dae534152f..a161b1c7bbe 100644
--- a/src/common/file_utils.c
+++ b/src/common/file_utils.c
@@ -537,62 +537,41 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
  * is returned with errno set.
  */
 ssize_t
-pg_pwrite_zeros(int fd, size_t size)
+pg_pwrite_zeros(int fd, off_t offset, size_t size)
 {
-	PGAlignedBlock zbuffer;		/* worth BLCKSZ */
-	size_t		zbuffer_sz;
+	const static PGAlignedBlock zbuffer = {0};		/* worth BLCKSZ */
+	void *zerobuf_addr = unconstify(PGAlignedBlock *, &zbuffer)->data;
 	struct iovec iov[PG_IOV_MAX];
-	int			blocks;
-	size_t		remaining_size = 0;
-	int			i;
-	ssize_t		written;
+	size_t		remaining_size = size;
 	ssize_t		total_written = 0;
 
-	zbuffer_sz = sizeof(zbuffer.data);
-
-	/* Zero-fill the buffer. */
-	memset(zbuffer.data, 0, zbuffer_sz);
-
-	/* Prepare to write out a lot of copies of our zero buffer at once. */
-	for (i = 0; i < lengthof(iov); ++i)
-	{
-		iov[i].iov_base = zbuffer.data;
-		iov[i].iov_len = zbuffer_sz;
-	}
-
 	/* Loop, writing as many blocks as we can for each system call. */
-	blocks = size / zbuffer_sz;
-	remaining_size = size % zbuffer_sz;
-	for (i = 0; i < blocks;)
+	while (remaining_size > 0)
 	{
-		int			iovcnt = Min(blocks - i, lengthof(iov));
-		off_t		offset = i * zbuffer_sz;
-
-		written = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
-
-		if (written < 0)
-			return written;
-
-		i += iovcnt;
-		total_written += written;
-	}
-
-	/* Now, write the remaining size, if any, of the file with zeros. */
-	if (remaining_size > 0)
-	{
-		/* We'll never write more than one block here */
-		int			iovcnt = 1;
-
-		/* Jump on to the end of previously written blocks */
-		off_t		offset = i * zbuffer_sz;
-
-		iov[0].iov_len = remaining_size;
+		int			iovcnt = 0;
+		ssize_t		written;
+
+		for (; iovcnt < PG_IOV_MAX && remaining_size > 0; iovcnt++)
+		{
+			size_t this_iov_size;
+
+			iov[iovcnt].iov_base = zerobuf_addr;
+
+			if (remaining_size < BLCKSZ)
+				this_iov_size = remaining_size;
+			else
+				this_iov_size = BLCKSZ;
+
+			iov[iovcnt].iov_len = this_iov_size;
+			remaining_size -= this_iov_size;
+		}
 
 		written = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
 
 		if (written < 0)
 			return written;
 
+		offset += written;
 		total_written += written;
 	}
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f9f0f6db8d1..cffe72e2386 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2981,7 +2981,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 		 * indirect blocks are down on disk.  Therefore, fdatasync(2) or
 		 * O_DSYNC will be sufficient to sync future writes to the log file.
 		 */
-		rc = pg_pwrite_zeros(fd, wal_segment_size);
+		rc = pg_pwrite_zeros(fd, 0, wal_segment_size);
 
 		if (rc < 0)
 			save_errno = errno;
diff --git a/src/bin/pg_basebackup/walmethods.c b/src/bin/pg_basebackup/walmethods.c
index 54014e2b84d..9e055f2b793 100644
--- a/src/bin/pg_basebackup/walmethods.c
+++ b/src/bin/pg_basebackup/walmethods.c
@@ -222,7 +222,7 @@ dir_open_for_write(WalWriteMethod *wwmethod, const char *pathname,
 	{
 		ssize_t		rc;
 
-		rc = pg_pwrite_zeros(fd, pad_to_size);
+		rc = pg_pwrite_zeros(fd, 0, pad_to_size);
 
 		if (rc < 0)
 		{
-- 
2.38.0

Reply via email to