Re: Streaming I/O, vectored I/O (WIP)

Thomas Munro Tue, 02 Apr 2024 17:32:05 -0700

On Tue, Apr 2, 2024 at 9:39 PM Thomas Munro <thomas.mu...@gmail.com> wrote:
> So this is the version I'm going to commit shortly, barring objections.


And done, after fixing a small snafu with smgr-only reads coming from
CreateAndCopyRelationData() (BM_PERMANENT would be
incorrectly/unnecessarily set for unlogged tables).

Here are the remaining patches discussed in this thread.  They give
tablespace-specific io_combine_limit, effective_io_readahead_window
(is this useful?), and up-to-1MB io_combine_limit (is this useful?).
I think the first two would probably require teaching reloptions.c how
to use guc.c's parse_int() and unit flags, but I won't have time to
look at that for this release so I'll just leave these here.

On the subject of guc.c, this is a terrible error message... did I do
something wrong?

postgres=# set io_combine_limit = '42MB';
ERROR:  5376 8kB is outside the valid range for parameter
"io_combine_limit" (1 .. 32)

From 84b8280481312cdd1efcb7efa1182d4647cbe00a Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Sat, 30 Mar 2024 19:09:44 +1300
Subject: [PATCH v16 1/4] ALTER TABLESPACE ... SET (io_combine_limit = ...).

This is the per-tablespace version of the GUC of the same name.

XXX reloptions.c lacks the ability to accept units, e.g. '64kB', which is
why I haven't included this with the main feature commit.

Suggested-by: Tomas Vondra <tomas.von...@enterprisedb.com>
Discussion: https://postgr.es/m/f603ac51-a7ff-496a-99c1-76673635692e%40enterprisedb.com
---
 doc/src/sgml/ref/alter_tablespace.sgml |  9 +++---
 src/backend/access/common/reloptions.c | 12 +++++++-
 src/backend/storage/aio/read_stream.c  | 39 ++++++++++++++++----------
 src/backend/utils/cache/spccache.c     | 14 +++++++++
 src/bin/psql/tab-complete.c            |  3 +-
 src/include/commands/tablespace.h      |  1 +
 src/include/utils/spccache.h           |  1 +
 7 files changed, 58 insertions(+), 21 deletions(-)

diff --git a/doc/src/sgml/ref/alter_tablespace.sgml b/doc/src/sgml/ref/alter_tablespace.sgml
index 6ec863400d1..faf0c6e7fbc 100644
--- a/doc/src/sgml/ref/alter_tablespace.sgml
+++ b/doc/src/sgml/ref/alter_tablespace.sgml
@@ -84,16 +84,17 @@ ALTER TABLESPACE <replaceable>name</replaceable> RESET ( <replaceable class="par
      <para>
       A tablespace parameter to be set or reset.  Currently, the only
       available parameters are <varname>seq_page_cost</varname>,
-      <varname>random_page_cost</varname>, <varname>effective_io_concurrency</varname>
-      and <varname>maintenance_io_concurrency</varname>.
+      <varname>random_page_cost</varname>, <varname>effective_io_concurrency</varname>,
+      <varname>maintenance_io_concurrency</varname> and <varname>io_combine_limit</varname>.
       Setting these values for a particular tablespace will override the
       planner's usual estimate of the cost of reading pages from tables in
-      that tablespace, and the executor's prefetching behavior, as established
+      that tablespace, and the executor's prefetching and I/O sizing behavior, as established
       by the configuration parameters of the
       same name (see <xref linkend="guc-seq-page-cost"/>,
       <xref linkend="guc-random-page-cost"/>,
       <xref linkend="guc-effective-io-concurrency"/>,
-      <xref linkend="guc-maintenance-io-concurrency"/>).  This may be useful if
+      <xref linkend="guc-maintenance-io-concurrency"/>),
+      <xref linkend="guc-io-combine-limit"/>).  This may be useful if
       one tablespace is located on a disk which is faster or slower than the
       remainder of the I/O subsystem.
      </para>
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index d6eb5d85599..1e1c611fab2 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -371,6 +371,15 @@ static relopt_int intRelOpts[] =
 		0, 0, 0
 #endif
 	},
+	{
+		{
+			"io_combine_limit",
+			"Limit on the size of data reads and writes.",
+			RELOPT_KIND_TABLESPACE,
+			ShareUpdateExclusiveLock
+		},
+		-1, 1, MAX_IO_COMBINE_LIMIT
+	},
 	{
 		{
 			"parallel_workers",
@@ -2089,7 +2098,8 @@ tablespace_reloptions(Datum reloptions, bool validate)
 		{"random_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, random_page_cost)},
 		{"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)},
 		{"effective_io_concurrency", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, effective_io_concurrency)},
-		{"maintenance_io_concurrency", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, maintenance_io_concurrency)}
+		{"maintenance_io_concurrency", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, maintenance_io_concurrency)},
+		{"io_combine_limit", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, io_combine_limit)},
 	};
 
 	return (bytea *) build_reloptions(reloptions, validate,
diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index 4f21262ff5e..907c80e6bf9 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -114,6 +114,7 @@ struct ReadStream
 	int16		max_pinned_buffers;
 	int16		pinned_buffers;
 	int16		distance;
+	int16		io_combine_limit;
 	bool		advice_enabled;
 
 	/*
@@ -241,7 +242,7 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice)
 
 	/* This should only be called with a pending read. */
 	Assert(stream->pending_read_nblocks > 0);
-	Assert(stream->pending_read_nblocks <= io_combine_limit);
+	Assert(stream->pending_read_nblocks <= stream->io_combine_limit);
 
 	/* We had better not exceed the pin limit by starting this read. */
 	Assert(stream->pinned_buffers + stream->pending_read_nblocks <=
@@ -329,7 +330,7 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice)
 		int16		buffer_index;
 		void	   *per_buffer_data;
 
-		if (stream->pending_read_nblocks == io_combine_limit)
+		if (stream->pending_read_nblocks == stream->io_combine_limit)
 		{
 			read_stream_start_pending_read(stream, suppress_advice);
 			suppress_advice = false;
@@ -389,7 +390,7 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice)
 	 * signaled end-of-stream, we start the read immediately.
 	 */
 	if (stream->pending_read_nblocks > 0 &&
-		(stream->pending_read_nblocks == io_combine_limit ||
+		(stream->pending_read_nblocks == stream->io_combine_limit ||
 		 (stream->pending_read_nblocks == stream->distance &&
 		  stream->pinned_buffers == 0) ||
 		 stream->distance == 0) &&
@@ -419,6 +420,7 @@ read_stream_begin_relation(int flags,
 	size_t		size;
 	int16		queue_size;
 	int16		max_ios;
+	int16		my_io_combine_limit;
 	uint32		max_pinned_buffers;
 	Oid			tablespace_id;
 	SMgrRelation smgr;
@@ -437,15 +439,21 @@ read_stream_begin_relation(int flags,
 		IsCatalogRelationOid(smgr->smgr_rlocator.locator.relNumber))
 	{
 		/*
-		 * Avoid circularity while trying to look up tablespace settings or
-		 * before spccache.c is ready.
+		 * Avoid circularity while trying to look up tablespace catalog itself
+		 * or before spccache.c is ready: just use the plain GUC values in
+		 * those cases.
 		 */
 		max_ios = effective_io_concurrency;
+		my_io_combine_limit = io_combine_limit;
 	}
-	else if (flags & READ_STREAM_MAINTENANCE)
-		max_ios = get_tablespace_maintenance_io_concurrency(tablespace_id);
 	else
-		max_ios = get_tablespace_io_concurrency(tablespace_id);
+	{
+		if (flags & READ_STREAM_MAINTENANCE)
+			max_ios = get_tablespace_maintenance_io_concurrency(tablespace_id);
+		else
+			max_ios = get_tablespace_io_concurrency(tablespace_id);
+		my_io_combine_limit = get_tablespace_io_combine_limit(tablespace_id);
+	}
 	max_ios = Min(max_ios, PG_INT16_MAX);
 
 	/*
@@ -456,9 +464,9 @@ read_stream_begin_relation(int flags,
 	 * overflow (even though that's not possible with the current GUC range
 	 * limits), allowing also for the spare entry and the overflow space.
 	 */
-	max_pinned_buffers = Max(max_ios * 4, io_combine_limit);
+	max_pinned_buffers = Max(max_ios * 4, my_io_combine_limit);
 	max_pinned_buffers = Min(max_pinned_buffers,
-							 PG_INT16_MAX - io_combine_limit - 1);
+							 PG_INT16_MAX - my_io_combine_limit - 1);
 
 	/* Don't allow this backend to pin more than its share of buffers. */
 	if (SmgrIsTemp(smgr))
@@ -484,14 +492,14 @@ read_stream_begin_relation(int flags,
 	 * io_combine_limit - 1 elements.
 	 */
 	size = offsetof(ReadStream, buffers);
-	size += sizeof(Buffer) * (queue_size + io_combine_limit - 1);
+	size += sizeof(Buffer) * (queue_size + my_io_combine_limit - 1);
 	size += sizeof(InProgressIO) * Max(1, max_ios);
 	size += per_buffer_data_size * queue_size;
 	size += MAXIMUM_ALIGNOF * 2;
 	stream = (ReadStream *) palloc(size);
 	memset(stream, 0, offsetof(ReadStream, buffers));
 	stream->ios = (InProgressIO *)
-		MAXALIGN(&stream->buffers[queue_size + io_combine_limit - 1]);
+		MAXALIGN(&stream->buffers[queue_size + my_io_combine_limit - 1]);
 	if (per_buffer_data_size > 0)
 		stream->per_buffer_data = (void *)
 			MAXALIGN(&stream->ios[Max(1, max_ios)]);
@@ -519,6 +527,7 @@ read_stream_begin_relation(int flags,
 		max_ios = 1;
 
 	stream->max_ios = max_ios;
+	stream->io_combine_limit = my_io_combine_limit;
 	stream->per_buffer_data_size = per_buffer_data_size;
 	stream->max_pinned_buffers = max_pinned_buffers;
 	stream->queue_size = queue_size;
@@ -531,7 +540,7 @@ read_stream_begin_relation(int flags,
 	 * doing full io_combine_limit sized reads (behavior B).
 	 */
 	if (flags & READ_STREAM_FULL)
-		stream->distance = Min(max_pinned_buffers, io_combine_limit);
+		stream->distance = Min(max_pinned_buffers, my_io_combine_limit);
 	else
 		stream->distance = 1;
 
@@ -713,14 +722,14 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 		else
 		{
 			/* No advice; move towards io_combine_limit (behavior B). */
-			if (stream->distance > io_combine_limit)
+			if (stream->distance > stream->io_combine_limit)
 			{
 				stream->distance--;
 			}
 			else
 			{
 				distance = stream->distance * 2;
-				distance = Min(distance, io_combine_limit);
+				distance = Min(distance, stream->io_combine_limit);
 				distance = Min(distance, stream->max_pinned_buffers);
 				stream->distance = distance;
 			}
diff --git a/src/backend/utils/cache/spccache.c b/src/backend/utils/cache/spccache.c
index ec63cdc8e52..97664824a4d 100644
--- a/src/backend/utils/cache/spccache.c
+++ b/src/backend/utils/cache/spccache.c
@@ -235,3 +235,17 @@ get_tablespace_maintenance_io_concurrency(Oid spcid)
 	else
 		return spc->opts->maintenance_io_concurrency;
 }
+
+/*
+ * get_tablespace_io_combine_limit
+ */
+int
+get_tablespace_io_combine_limit(Oid spcid)
+{
+	TableSpaceCacheEntry *spc = get_tablespace(spcid);
+
+	if (!spc->opts || spc->opts->io_combine_limit < 0)
+		return io_combine_limit;
+	else
+		return spc->opts->io_combine_limit;
+}
diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index 82eb3955abf..e2115e01ad4 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -2633,7 +2633,8 @@ psql_completion(const char *text, int start, int end)
 	/* ALTER TABLESPACE <foo> SET|RESET ( */
 	else if (Matches("ALTER", "TABLESPACE", MatchAny, "SET|RESET", "("))
 		COMPLETE_WITH("seq_page_cost", "random_page_cost",
-					  "effective_io_concurrency", "maintenance_io_concurrency");
+					  "effective_io_concurrency", "maintenance_io_concurrency",
+					  "io_combine_limit");
 
 	/* ALTER TEXT SEARCH */
 	else if (Matches("ALTER", "TEXT", "SEARCH"))
diff --git a/src/include/commands/tablespace.h b/src/include/commands/tablespace.h
index b6cec632db9..c187283a6dd 100644
--- a/src/include/commands/tablespace.h
+++ b/src/include/commands/tablespace.h
@@ -43,6 +43,7 @@ typedef struct TableSpaceOpts
 	float8		seq_page_cost;
 	int			effective_io_concurrency;
 	int			maintenance_io_concurrency;
+	int			io_combine_limit;
 } TableSpaceOpts;
 
 extern Oid	CreateTableSpace(CreateTableSpaceStmt *stmt);
diff --git a/src/include/utils/spccache.h b/src/include/utils/spccache.h
index 11cfa719955..f8a19764a8f 100644
--- a/src/include/utils/spccache.h
+++ b/src/include/utils/spccache.h
@@ -17,5 +17,6 @@ extern void get_tablespace_page_costs(Oid spcid, float8 *spc_random_page_cost,
 									  float8 *spc_seq_page_cost);
 extern int	get_tablespace_io_concurrency(Oid spcid);
 extern int	get_tablespace_maintenance_io_concurrency(Oid spcid);
+extern int	get_tablespace_io_combine_limit(Oid spcid);
 
 #endif							/* SPCCACHE_H */
-- 
2.44.0

From 166a81b4d4639ddf5ee89c6f97634a453988e2bb Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Sun, 31 Mar 2024 05:48:20 +1300
Subject: [PATCH v16 2/4] Add effective_io_readahead_window setting.

In a few places we estimate whether the kernel will consider a pattern
of reads to be sequential, for readahead purposes.  If we think so, we
don't issue POSIX_FADV_WILLNEED advice, because we know that it's better
to get out of the kernel's way.

Linux uses a sliding window to detect sequential access, defaulting to
128kB (see blockdev --report).  This setting allows the user to tell
PostgreSQL that size, so we can try to model kernel behavior more
accurately.  Several other known operating systems seem to use a strict
next-block-only detection, so for other systems we just use 0 for now
(it must be exactly sequential), until we have specific information
about other OSes.  It's not a very critical question, because most other
systems either lack or ignore POSIX_FADV_WILLNEED anyway.

XXX Is this a good idea?
XXX This doesn't support units like '128kB' in ALTER TABLESPACE, which it
would need in order to be like the GUC.

Suggested-by: Tomas Vondra <tomas.von...@enterprisedb.com>
Discussion: https://postgr.es/m/f603ac51-a7ff-496a-99c1-76673635692e%40enterprisedb.com
---
 doc/src/sgml/config.sgml                      | 22 +++++++++++++++++++
 doc/src/sgml/ref/alter_tablespace.sgml        |  6 +++--
 src/backend/access/common/reloptions.c        | 10 +++++++++
 src/backend/storage/aio/read_stream.c         | 14 ++++++++++--
 src/backend/storage/buffer/bufmgr.c           |  7 ++++++
 src/backend/utils/cache/spccache.c            | 14 ++++++++++++
 src/backend/utils/misc/guc_tables.c           | 14 ++++++++++++
 src/backend/utils/misc/postgresql.conf.sample |  1 +
 src/bin/psql/tab-complete.c                   |  2 +-
 src/include/commands/tablespace.h             |  1 +
 src/include/storage/bufmgr.h                  |  7 ++++++
 src/include/utils/spccache.h                  |  1 +
 12 files changed, 94 insertions(+), 5 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 624518e0b01..fcfe38a823a 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2722,6 +2722,28 @@ include_dir 'conf.d'
        </listitem>
       </varlistentry>
 
+      <varlistentry id="guc-effective-io-readahead-window" xreflabel="effective_io_readahead_window">
+       <term><varname>effective_io_readahead_window</varname> (<type>integer</type>)
+       <indexterm>
+        <primary><varname>effective_io_readahead_window</varname> configuration parameter</primary>
+       </indexterm>
+       </term>
+       <listitem>
+        <para>
+         Sets the size of the window that <productname>PostgreSQL</productname>
+         expects the OS to use to detect sequential reads.  This is used to
+         make decisions about when to issue hints to the kernel on future
+         random reads.  Does not apply when direct I/O is used.
+        </para>
+        <para>
+         The default is 128kB on Linux, and 0 otherwise.  This value can
+         be overridden for tables in a particular tablespace by setting the
+         tablespace parameter of the same name (see
+         <xref linkend="sql-altertablespace"/>).
+        </para>
+       </listitem>
+      </varlistentry>
+
       <varlistentry id="guc-max-worker-processes" xreflabel="max_worker_processes">
        <term><varname>max_worker_processes</varname> (<type>integer</type>)
        <indexterm>
diff --git a/doc/src/sgml/ref/alter_tablespace.sgml b/doc/src/sgml/ref/alter_tablespace.sgml
index faf0c6e7fbc..25dbbea66ac 100644
--- a/doc/src/sgml/ref/alter_tablespace.sgml
+++ b/doc/src/sgml/ref/alter_tablespace.sgml
@@ -85,7 +85,8 @@ ALTER TABLESPACE <replaceable>name</replaceable> RESET ( <replaceable class="par
       A tablespace parameter to be set or reset.  Currently, the only
       available parameters are <varname>seq_page_cost</varname>,
       <varname>random_page_cost</varname>, <varname>effective_io_concurrency</varname>,
-      <varname>maintenance_io_concurrency</varname> and <varname>io_combine_limit</varname>.
+      <varname>maintenance_io_concurrency</varname>,
+      <varname>effective_io_readahead_window</varname> and <varname>io_combine_limit</varname>.
       Setting these values for a particular tablespace will override the
       planner's usual estimate of the cost of reading pages from tables in
       that tablespace, and the executor's prefetching and I/O sizing behavior, as established
@@ -94,7 +95,8 @@ ALTER TABLESPACE <replaceable>name</replaceable> RESET ( <replaceable class="par
       <xref linkend="guc-random-page-cost"/>,
       <xref linkend="guc-effective-io-concurrency"/>,
-      <xref linkend="guc-maintenance-io-concurrency"/>),
-      <xref linkend="guc-io-combine-limit"/>).  This may be useful if
+      <xref linkend="guc-maintenance-io-concurrency"/>,
+      <xref linkend="guc-io-combine-limit"/>,
+      <xref linkend="guc-effective-io-readahead-window"/>).  This may be useful if
       one tablespace is located on a disk which is faster or slower than the
       remainder of the I/O subsystem.
      </para>
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 1e1c611fab2..5a5bc7c5a2e 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -380,6 +380,15 @@ static relopt_int intRelOpts[] =
 		},
 		-1, 1, MAX_IO_COMBINE_LIMIT
 	},
+	{
+		{
+			"effective_io_readahead_window",
+			"Size of the window used by the OS to detect sequential buffered access.",
+			RELOPT_KIND_TABLESPACE,
+			ShareUpdateExclusiveLock
+		},
+		-1, 1, PG_INT16_MAX,
+	},
 	{
 		{
 			"parallel_workers",
@@ -2100,6 +2109,7 @@ tablespace_reloptions(Datum reloptions, bool validate)
 		{"effective_io_concurrency", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, effective_io_concurrency)},
 		{"maintenance_io_concurrency", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, maintenance_io_concurrency)},
 		{"io_combine_limit", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, io_combine_limit)},
+		{"effective_io_readahead_window", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, effective_io_readahead_window)},
 	};
 
 	return (bytea *) build_reloptions(reloptions, validate,
diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index 907c80e6bf9..84a5b714b11 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -115,6 +115,7 @@ struct ReadStream
 	int16		pinned_buffers;
 	int16		distance;
 	int16		io_combine_limit;
+	int16		effective_io_readahead_window;
 	bool		advice_enabled;
 
 	/*
@@ -256,11 +257,15 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice)
 
 	/*
 	 * If advice hasn't been suppressed, this system supports it, and this
-	 * isn't a strictly sequential pattern, then we'll issue advice.
+	 * isn't sequential according to the effective_io_readahead_window
+	 * setting, (which should ideally tell us how the OS detects sequential
+	 * buffered access), then we'll issue advice.
 	 */
 	if (!suppress_advice &&
 		stream->advice_enabled &&
-		stream->pending_read_blocknum != stream->seq_blocknum)
+		!(stream->pending_read_blocknum >= stream->seq_blocknum &&
+		  stream->pending_read_blocknum <= (stream->seq_blocknum +
+											stream->effective_io_readahead_window)))
 		flags = READ_BUFFERS_ISSUE_ADVICE;
 	else
 		flags = 0;
@@ -422,6 +427,7 @@ read_stream_begin_relation(int flags,
 	int16		max_ios;
 	int16		my_io_combine_limit;
 	uint32		max_pinned_buffers;
+	int16		my_effective_io_readahead_window;
 	Oid			tablespace_id;
 	SMgrRelation smgr;
 
@@ -445,6 +451,7 @@ read_stream_begin_relation(int flags,
 		 */
 		max_ios = effective_io_concurrency;
 		my_io_combine_limit = io_combine_limit;
+		my_effective_io_readahead_window = effective_io_readahead_window;
 	}
 	else
 	{
@@ -453,6 +460,8 @@ read_stream_begin_relation(int flags,
 		else
 			max_ios = get_tablespace_io_concurrency(tablespace_id);
 		my_io_combine_limit = get_tablespace_io_combine_limit(tablespace_id);
+		my_effective_io_readahead_window =
+			get_tablespace_io_readahead_window(tablespace_id);
 	}
 	max_ios = Min(max_ios, PG_INT16_MAX);
 
@@ -531,6 +540,7 @@ read_stream_begin_relation(int flags,
 	stream->per_buffer_data_size = per_buffer_data_size;
 	stream->max_pinned_buffers = max_pinned_buffers;
 	stream->queue_size = queue_size;
+	stream->effective_io_readahead_window = my_effective_io_readahead_window;
 	stream->callback = callback;
 	stream->callback_private_data = callback_private_data;
 
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 929eb8f175f..502e93fd7af 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -163,6 +163,13 @@ int			maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
  */
 int			io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
 
+/*
+ * In order to detect sequential access in the same way as the OS would, we
+ * need to know the size of the window it uses.  Overridden by the tablespace
+ * setting of the same name.
+ */
+int			effective_io_readahead_window = DEFAULT_EFFECTIVE_IO_READAHEAD_WINDOW;
+
 /*
  * GUC variables about triggering kernel writeback for buffers written; OS
  * dependent defaults are set via the GUC mechanism.
diff --git a/src/backend/utils/cache/spccache.c b/src/backend/utils/cache/spccache.c
index 97664824a4d..b29d7abda67 100644
--- a/src/backend/utils/cache/spccache.c
+++ b/src/backend/utils/cache/spccache.c
@@ -249,3 +249,17 @@ get_tablespace_io_combine_limit(Oid spcid)
 	else
 		return spc->opts->io_combine_limit;
 }
+
+/*
+ * get_tablespace_io_readahead_window
+ */
+int
+get_tablespace_io_readahead_window(Oid spcid)
+{
+	TableSpaceCacheEntry *spc = get_tablespace(spcid);
+
+	if (!spc->opts || spc->opts->effective_io_readahead_window < 0)
+		return effective_io_readahead_window;
+	else
+		return spc->opts->effective_io_readahead_window;
+}
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index c12784cbec8..ab5e5f50df9 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -3143,6 +3143,20 @@ struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"effective_io_readahead_window",
+			PGC_USERSET,
+			RESOURCES_ASYNCHRONOUS,
+			gettext_noop("Size of the window the OS uses to detect sequential buffered file access."),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&effective_io_readahead_window,
+		DEFAULT_EFFECTIVE_IO_READAHEAD_WINDOW,
+		0, PG_INT16_MAX,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"backend_flush_after", PGC_USERSET, RESOURCES_ASYNCHRONOUS,
 			gettext_noop("Number of pages after which previously performed writes are flushed to disk."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index baecde28410..41598104b32 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -204,6 +204,7 @@
 #effective_io_concurrency = 1		# 1-1000; 0 disables prefetching
 #maintenance_io_concurrency = 10	# 1-1000; 0 disables prefetching
 #io_combine_limit = 128kB		# usually 1-32 blocks (depends on OS)
+#effective_io_readahead_window = 16	# 1-32767; expected system readahead window
 #max_worker_processes = 8		# (change requires restart)
 #max_parallel_workers_per_gather = 2	# limited by max_parallel_workers
 #max_parallel_maintenance_workers = 2	# limited by max_parallel_workers
diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index e2115e01ad4..2af489ec447 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -2634,7 +2634,7 @@ psql_completion(const char *text, int start, int end)
 	else if (Matches("ALTER", "TABLESPACE", MatchAny, "SET|RESET", "("))
 		COMPLETE_WITH("seq_page_cost", "random_page_cost",
 					  "effective_io_concurrency", "maintenance_io_concurrency",
-					  "io_combine_limit");
+					  "io_combine_limit", "effective_io_readahead_window");
 
 	/* ALTER TEXT SEARCH */
 	else if (Matches("ALTER", "TEXT", "SEARCH"))
diff --git a/src/include/commands/tablespace.h b/src/include/commands/tablespace.h
index c187283a6dd..94cd1a29985 100644
--- a/src/include/commands/tablespace.h
+++ b/src/include/commands/tablespace.h
@@ -44,6 +44,7 @@ typedef struct TableSpaceOpts
 	int			effective_io_concurrency;
 	int			maintenance_io_concurrency;
 	int			io_combine_limit;
+	int			effective_io_readahead_window;
 } TableSpaceOpts;
 
 extern Oid	CreateTableSpace(CreateTableSpaceStmt *stmt);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index f380f9d9a6c..aa1ef8bf797 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -173,6 +173,13 @@ extern PGDLLIMPORT int maintenance_io_concurrency;
 #define DEFAULT_IO_COMBINE_LIMIT Min(MAX_IO_COMBINE_LIMIT, (128 * 1024) / BLCKSZ)
 extern PGDLLIMPORT int io_combine_limit;
 
+#ifdef __linux__
+#define DEFAULT_EFFECTIVE_IO_READAHEAD_WINDOW ((128 * 1024) / BLCKSZ)
+#else
+#define DEFAULT_EFFECTIVE_IO_READAHEAD_WINDOW 0
+#endif
+extern PGDLLIMPORT int effective_io_readahead_window;
+
 extern PGDLLIMPORT int checkpoint_flush_after;
 extern PGDLLIMPORT int backend_flush_after;
 extern PGDLLIMPORT int bgwriter_flush_after;
diff --git a/src/include/utils/spccache.h b/src/include/utils/spccache.h
index f8a19764a8f..a9d163400f7 100644
--- a/src/include/utils/spccache.h
+++ b/src/include/utils/spccache.h
@@ -18,5 +18,6 @@ extern void get_tablespace_page_costs(Oid spcid, float8 *spc_random_page_cost,
 extern int	get_tablespace_io_concurrency(Oid spcid);
 extern int	get_tablespace_maintenance_io_concurrency(Oid spcid);
 extern int	get_tablespace_io_combine_limit(Oid spcid);
+extern int	get_tablespace_io_readahead_window(Oid spcid);
 
 #endif							/* SPCCACHE_H */
-- 
2.44.0

From 7f0d9adfeec4db374970996c53fd3ccdd0ee857a Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Sun, 31 Mar 2024 17:24:44 +1300
Subject: [PATCH v16 3/4] Increase PG_IOV_MAX for bigger io_combine_limit.

PG_IOV_MAX, our clamped version of IOV_MAX, constrains io_combine_limit.
Change the clamp from 32 up to 128 vectors so that we can set
io_combine_limit up to 1MB instead of 256kB (assuming BLCKSZ == 8kB).
These numbers are fairly arbitrary and it's not at all clear that it's a
good idea to run with large numbers, but we might as well at least allow
experimentation.

This will also affect the size of the writes when initializing WAL
segments, which will change from 256kB to 1MB (assuming BLCKSZ == 8kB).
---
 src/backend/utils/misc/postgresql.conf.sample | 2 +-
 src/include/port/pg_iovec.h                   | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 41598104b32..26612c938ac 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -203,7 +203,7 @@
 #backend_flush_after = 0		# measured in pages, 0 disables
 #effective_io_concurrency = 1		# 1-1000; 0 disables prefetching
 #maintenance_io_concurrency = 10	# 1-1000; 0 disables prefetching
-#io_combine_limit = 128kB		# usually 1-32 blocks (depends on OS)
+#io_combine_limit = 128kB		# usually 1-128 blocks (depends on OS)
 #effective_io_readahead_window = 16	# 1-32767; expected system readahead window
 #max_worker_processes = 8		# (change requires restart)
 #max_parallel_workers_per_gather = 2	# limited by max_parallel_workers
diff --git a/src/include/port/pg_iovec.h b/src/include/port/pg_iovec.h
index 7255c1bd911..5055ede2c8e 100644
--- a/src/include/port/pg_iovec.h
+++ b/src/include/port/pg_iovec.h
@@ -33,8 +33,11 @@ struct iovec
 
 #endif
 
-/* Define a reasonable maximum that is safe to use on the stack. */
-#define PG_IOV_MAX Min(IOV_MAX, 32)
+/*
+ * Define a reasonable maximum that is safe to use on the stack but also allows
+ * io_combine_limit to reach large sizes.
+ */
+#define PG_IOV_MAX Min(IOV_MAX, 128)
 
 /*
  * Like preadv(), but with a prefix to remind us of a side-effect: on Windows
-- 
2.44.0

Re: Streaming I/O, vectored I/O (WIP)

Reply via email to