Hi,

I've pushed the first two parts earlier today. Here's a v13 with the
remaining pieces rebased on top of that, with updated commit messages
and some additional fixes.

1) There was a bogus else-if in explain.c that would make it impossible
to enable "wal" and "io" at the same time.

2) Replaced one of the "typedef IOStats" with just forward declaration,
to prevent possible conflicting typedefs in the future (even if that
wasn't an issue now).

3) Used (es_instrument & INSTRUMENT_IO) more consistently. A couple
places in the executor still checked just es_instrument, and so would
collect stats even if not needed.  Be consistent.

4) Various comment additions / improvements.


I intend to push this later today, after giving it one more round of
review etc.


regards

-- 
Tomas Vondra
From 1270fb4c87b964021a66ffb261a901e0fdc929af Mon Sep 17 00:00:00 2001
From: Tomas Vondra <[email protected]>
Date: Tue, 31 Mar 2026 13:44:23 +0200
Subject: [PATCH v13 1/5] Switch EXPLAIN to unaligned output for json/xml/yaml

Use unaligned output for multiple EXPLAIN queries using non-text format
in regression tests. With aligned output adding/removing explain fields
can be very disruptive, as it often modifies the whole block because of
padding. Unaligned output does not have this issue.

Reviewed-by: Melanie Plageman <[email protected]>
Reviewed-by: Lukas Fittl <[email protected]>
Reviewed-by: Andres Freund <[email protected]>
Discussion: https://postgr.es/m/flat/a177a6dd-240b-455a-8f25-aca0b1c08c6e%40vondra.me
---
 src/test/regress/expected/explain.out | 296 +++++++++++++-------------
 src/test/regress/sql/explain.sql      |   5 +-
 2 files changed, 150 insertions(+), 151 deletions(-)

diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out
index 7c1f26b182c..dc31c7ce9f9 100644
--- a/src/test/regress/expected/explain.out
+++ b/src/test/regress/expected/explain.out
@@ -93,164 +93,160 @@ select explain_filter('explain (analyze, buffers, format text) select * from int
  Execution Time: N.N ms
 (3 rows)
 
-select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8');
-                     explain_filter                     
---------------------------------------------------------
- <explain xmlns="http://www.postgresql.org/N/explain";> +
-   <Query>                                             +
-     <Plan>                                            +
-       <Node-Type>Seq Scan</Node-Type>                 +
-       <Parallel-Aware>false</Parallel-Aware>          +
-       <Async-Capable>false</Async-Capable>            +
-       <Relation-Name>int8_tbl</Relation-Name>         +
-       <Alias>i8</Alias>                               +
-       <Startup-Cost>N.N</Startup-Cost>                +
-       <Total-Cost>N.N</Total-Cost>                    +
-       <Plan-Rows>N</Plan-Rows>                        +
-       <Plan-Width>N</Plan-Width>                      +
-       <Actual-Startup-Time>N.N</Actual-Startup-Time>  +
-       <Actual-Total-Time>N.N</Actual-Total-Time>      +
-       <Actual-Rows>N.N</Actual-Rows>                  +
-       <Actual-Loops>N</Actual-Loops>                  +
-       <Disabled>false</Disabled>                      +
-       <Shared-Hit-Blocks>N</Shared-Hit-Blocks>        +
-       <Shared-Read-Blocks>N</Shared-Read-Blocks>      +
-       <Shared-Dirtied-Blocks>N</Shared-Dirtied-Blocks>+
-       <Shared-Written-Blocks>N</Shared-Written-Blocks>+
-       <Local-Hit-Blocks>N</Local-Hit-Blocks>          +
-       <Local-Read-Blocks>N</Local-Read-Blocks>        +
-       <Local-Dirtied-Blocks>N</Local-Dirtied-Blocks>  +
-       <Local-Written-Blocks>N</Local-Written-Blocks>  +
-       <Temp-Read-Blocks>N</Temp-Read-Blocks>          +
-       <Temp-Written-Blocks>N</Temp-Written-Blocks>    +
-     </Plan>                                           +
-     <Planning>                                        +
-       <Shared-Hit-Blocks>N</Shared-Hit-Blocks>        +
-       <Shared-Read-Blocks>N</Shared-Read-Blocks>      +
-       <Shared-Dirtied-Blocks>N</Shared-Dirtied-Blocks>+
-       <Shared-Written-Blocks>N</Shared-Written-Blocks>+
-       <Local-Hit-Blocks>N</Local-Hit-Blocks>          +
-       <Local-Read-Blocks>N</Local-Read-Blocks>        +
-       <Local-Dirtied-Blocks>N</Local-Dirtied-Blocks>  +
-       <Local-Written-Blocks>N</Local-Written-Blocks>  +
-       <Temp-Read-Blocks>N</Temp-Read-Blocks>          +
-       <Temp-Written-Blocks>N</Temp-Written-Blocks>    +
-     </Planning>                                       +
-     <Planning-Time>N.N</Planning-Time>                +
-     <Triggers>                                        +
-     </Triggers>                                       +
-     <Execution-Time>N.N</Execution-Time>              +
-   </Query>                                            +
- </explain>
-(1 row)
-
-select explain_filter('explain (analyze, serialize, buffers, format yaml) select * from int8_tbl i8');
-        explain_filter         
--------------------------------
- - Plan:                      +
-     Node Type: "Seq Scan"    +
-     Parallel Aware: false    +
-     Async Capable: false     +
-     Relation Name: "int8_tbl"+
-     Alias: "i8"              +
-     Startup Cost: N.N        +
-     Total Cost: N.N          +
-     Plan Rows: N             +
-     Plan Width: N            +
-     Actual Startup Time: N.N +
-     Actual Total Time: N.N   +
-     Actual Rows: N.N         +
-     Actual Loops: N          +
-     Disabled: false          +
-     Shared Hit Blocks: N     +
-     Shared Read Blocks: N    +
-     Shared Dirtied Blocks: N +
-     Shared Written Blocks: N +
-     Local Hit Blocks: N      +
-     Local Read Blocks: N     +
-     Local Dirtied Blocks: N  +
-     Local Written Blocks: N  +
-     Temp Read Blocks: N      +
-     Temp Written Blocks: N   +
-   Planning:                  +
-     Shared Hit Blocks: N     +
-     Shared Read Blocks: N    +
-     Shared Dirtied Blocks: N +
-     Shared Written Blocks: N +
-     Local Hit Blocks: N      +
-     Local Read Blocks: N     +
-     Local Dirtied Blocks: N  +
-     Local Written Blocks: N  +
-     Temp Read Blocks: N      +
-     Temp Written Blocks: N   +
-   Planning Time: N.N         +
-   Triggers:                  +
-   Serialization:             +
-     Time: N.N                +
-     Output Volume: N         +
-     Format: "text"           +
-     Shared Hit Blocks: N     +
-     Shared Read Blocks: N    +
-     Shared Dirtied Blocks: N +
-     Shared Written Blocks: N +
-     Local Hit Blocks: N      +
-     Local Read Blocks: N     +
-     Local Dirtied Blocks: N  +
-     Local Written Blocks: N  +
-     Temp Read Blocks: N      +
-     Temp Written Blocks: N   +
-   Execution Time: N.N
-(1 row)
-
 select explain_filter('explain (buffers, format text) select * from int8_tbl i8');
                      explain_filter                      
 ---------------------------------------------------------
  Seq Scan on int8_tbl i8  (cost=N.N..N.N rows=N width=N)
 (1 row)
 
+\a
+select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8');
+explain_filter
+<explain xmlns="http://www.postgresql.org/N/explain";>
+  <Query>
+    <Plan>
+      <Node-Type>Seq Scan</Node-Type>
+      <Parallel-Aware>false</Parallel-Aware>
+      <Async-Capable>false</Async-Capable>
+      <Relation-Name>int8_tbl</Relation-Name>
+      <Alias>i8</Alias>
+      <Startup-Cost>N.N</Startup-Cost>
+      <Total-Cost>N.N</Total-Cost>
+      <Plan-Rows>N</Plan-Rows>
+      <Plan-Width>N</Plan-Width>
+      <Actual-Startup-Time>N.N</Actual-Startup-Time>
+      <Actual-Total-Time>N.N</Actual-Total-Time>
+      <Actual-Rows>N.N</Actual-Rows>
+      <Actual-Loops>N</Actual-Loops>
+      <Disabled>false</Disabled>
+      <Shared-Hit-Blocks>N</Shared-Hit-Blocks>
+      <Shared-Read-Blocks>N</Shared-Read-Blocks>
+      <Shared-Dirtied-Blocks>N</Shared-Dirtied-Blocks>
+      <Shared-Written-Blocks>N</Shared-Written-Blocks>
+      <Local-Hit-Blocks>N</Local-Hit-Blocks>
+      <Local-Read-Blocks>N</Local-Read-Blocks>
+      <Local-Dirtied-Blocks>N</Local-Dirtied-Blocks>
+      <Local-Written-Blocks>N</Local-Written-Blocks>
+      <Temp-Read-Blocks>N</Temp-Read-Blocks>
+      <Temp-Written-Blocks>N</Temp-Written-Blocks>
+    </Plan>
+    <Planning>
+      <Shared-Hit-Blocks>N</Shared-Hit-Blocks>
+      <Shared-Read-Blocks>N</Shared-Read-Blocks>
+      <Shared-Dirtied-Blocks>N</Shared-Dirtied-Blocks>
+      <Shared-Written-Blocks>N</Shared-Written-Blocks>
+      <Local-Hit-Blocks>N</Local-Hit-Blocks>
+      <Local-Read-Blocks>N</Local-Read-Blocks>
+      <Local-Dirtied-Blocks>N</Local-Dirtied-Blocks>
+      <Local-Written-Blocks>N</Local-Written-Blocks>
+      <Temp-Read-Blocks>N</Temp-Read-Blocks>
+      <Temp-Written-Blocks>N</Temp-Written-Blocks>
+    </Planning>
+    <Planning-Time>N.N</Planning-Time>
+    <Triggers>
+    </Triggers>
+    <Execution-Time>N.N</Execution-Time>
+  </Query>
+</explain>
+(1 row)
+select explain_filter('explain (analyze, serialize, buffers, format yaml) select * from int8_tbl i8');
+explain_filter
+- Plan: 
+    Node Type: "Seq Scan"
+    Parallel Aware: false
+    Async Capable: false
+    Relation Name: "int8_tbl"
+    Alias: "i8"
+    Startup Cost: N.N
+    Total Cost: N.N
+    Plan Rows: N
+    Plan Width: N
+    Actual Startup Time: N.N
+    Actual Total Time: N.N
+    Actual Rows: N.N
+    Actual Loops: N
+    Disabled: false
+    Shared Hit Blocks: N
+    Shared Read Blocks: N
+    Shared Dirtied Blocks: N
+    Shared Written Blocks: N
+    Local Hit Blocks: N
+    Local Read Blocks: N
+    Local Dirtied Blocks: N
+    Local Written Blocks: N
+    Temp Read Blocks: N
+    Temp Written Blocks: N
+  Planning: 
+    Shared Hit Blocks: N
+    Shared Read Blocks: N
+    Shared Dirtied Blocks: N
+    Shared Written Blocks: N
+    Local Hit Blocks: N
+    Local Read Blocks: N
+    Local Dirtied Blocks: N
+    Local Written Blocks: N
+    Temp Read Blocks: N
+    Temp Written Blocks: N
+  Planning Time: N.N
+  Triggers: 
+  Serialization: 
+    Time: N.N
+    Output Volume: N
+    Format: "text"
+    Shared Hit Blocks: N
+    Shared Read Blocks: N
+    Shared Dirtied Blocks: N
+    Shared Written Blocks: N
+    Local Hit Blocks: N
+    Local Read Blocks: N
+    Local Dirtied Blocks: N
+    Local Written Blocks: N
+    Temp Read Blocks: N
+    Temp Written Blocks: N
+  Execution Time: N.N
+(1 row)
 select explain_filter('explain (buffers, format json) select * from int8_tbl i8');
-           explain_filter           
-------------------------------------
- [                                 +
-   {                               +
-     "Plan": {                     +
-       "Node Type": "Seq Scan",    +
-       "Parallel Aware": false,    +
-       "Async Capable": false,     +
-       "Relation Name": "int8_tbl",+
-       "Alias": "i8",              +
-       "Startup Cost": N.N,        +
-       "Total Cost": N.N,          +
-       "Plan Rows": N,             +
-       "Plan Width": N,            +
-       "Disabled": false,          +
-       "Shared Hit Blocks": N,     +
-       "Shared Read Blocks": N,    +
-       "Shared Dirtied Blocks": N, +
-       "Shared Written Blocks": N, +
-       "Local Hit Blocks": N,      +
-       "Local Read Blocks": N,     +
-       "Local Dirtied Blocks": N,  +
-       "Local Written Blocks": N,  +
-       "Temp Read Blocks": N,      +
-       "Temp Written Blocks": N    +
-     },                            +
-     "Planning": {                 +
-       "Shared Hit Blocks": N,     +
-       "Shared Read Blocks": N,    +
-       "Shared Dirtied Blocks": N, +
-       "Shared Written Blocks": N, +
-       "Local Hit Blocks": N,      +
-       "Local Read Blocks": N,     +
-       "Local Dirtied Blocks": N,  +
-       "Local Written Blocks": N,  +
-       "Temp Read Blocks": N,      +
-       "Temp Written Blocks": N    +
-     }                             +
-   }                               +
- ]
+explain_filter
+[
+  {
+    "Plan": {
+      "Node Type": "Seq Scan",
+      "Parallel Aware": false,
+      "Async Capable": false,
+      "Relation Name": "int8_tbl",
+      "Alias": "i8",
+      "Startup Cost": N.N,
+      "Total Cost": N.N,
+      "Plan Rows": N,
+      "Plan Width": N,
+      "Disabled": false,
+      "Shared Hit Blocks": N,
+      "Shared Read Blocks": N,
+      "Shared Dirtied Blocks": N,
+      "Shared Written Blocks": N,
+      "Local Hit Blocks": N,
+      "Local Read Blocks": N,
+      "Local Dirtied Blocks": N,
+      "Local Written Blocks": N,
+      "Temp Read Blocks": N,
+      "Temp Written Blocks": N
+    },
+    "Planning": {
+      "Shared Hit Blocks": N,
+      "Shared Read Blocks": N,
+      "Shared Dirtied Blocks": N,
+      "Shared Written Blocks": N,
+      "Local Hit Blocks": N,
+      "Local Read Blocks": N,
+      "Local Dirtied Blocks": N,
+      "Local Written Blocks": N,
+      "Temp Read Blocks": N,
+      "Temp Written Blocks": N
+    }
+  }
+]
 (1 row)
-
+\a
 -- Check expansion of window definitions
 select explain_filter('explain verbose select sum(unique1) over w, sum(unique2) over (w order by hundred), sum(tenthous) over (w order by hundred) from tenk1 window w as (partition by ten)');
                                             explain_filter                                             
diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql
index ebdab42604b..8f10e1aff55 100644
--- a/src/test/regress/sql/explain.sql
+++ b/src/test/regress/sql/explain.sql
@@ -66,10 +66,13 @@ select explain_filter('explain select * from int8_tbl i8');
 select explain_filter('explain (analyze, buffers off) select * from int8_tbl i8');
 select explain_filter('explain (analyze, buffers off, verbose) select * from int8_tbl i8');
 select explain_filter('explain (analyze, buffers, format text) select * from int8_tbl i8');
+select explain_filter('explain (buffers, format text) select * from int8_tbl i8');
+
+\a
 select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8');
 select explain_filter('explain (analyze, serialize, buffers, format yaml) select * from int8_tbl i8');
-select explain_filter('explain (buffers, format text) select * from int8_tbl i8');
 select explain_filter('explain (buffers, format json) select * from int8_tbl i8');
+\a
 
 -- Check expansion of window definitions
 
-- 
2.53.0

From 1d79ca14128b7f1b805bef51880c9603c9c02b8e Mon Sep 17 00:00:00 2001
From: Melanie Plageman <[email protected]>
Date: Sun, 5 Apr 2026 17:17:22 -0400
Subject: [PATCH v13 2/5] Add EXPLAIN (IO) infrastructure with BitmapHeapScan
 support

Allows collecting details about AIO / prefetch for scan nodes backed by
a ReadStream. This may be enabled by a new "IO" option in EXPLAIN, and
it shows information about the prefetch distance and I/O requests.

As of this commit this applies only to BitmapHeapScan, because that's
the only scan node using a ReadStream and collecting instrumentation
from workers in a parallel query. Support for SeqScan and TidRangeScan,
the other scan nodes using ReadStream, will be added in subsequent
commits.

The stats are collected only when required by EXPLAIN ANALYZE, with the
IO option (disabled by default). The amount of collected statistics is
very limited, but we don't want to clutter EXPLAIN with too much data.

The IOStats struct is stored in the scan descriptor as a field, next to
other fields used by table AMs. A pointer to the field is passed to the
ReadStream, and updated directly.

It's the responsibility of the table AM to allocate the struct (e.g. in
ambeginscan) whenever the flag SO_SCAN_INSTRUMENT flag is passed to the
scan, so that the executor and ReadStream has access to it.

The collected stats are designed for ReadStream, but are meant to be
reasonably generic in case a TAM manages I/Os in different ways.

Reviewed-by: Melanie Plageman <[email protected]>
Reviewed-by: Lukas Fittl <[email protected]>
Reviewed-by: Andres Freund <[email protected]>
Discussion: https://postgr.es/m/flat/a177a6dd-240b-455a-8f25-aca0b1c08c6e%40vondra.me
---
 doc/src/sgml/ref/explain.sgml             |  12 ++
 src/backend/access/heap/heapam.c          |  10 ++
 src/backend/commands/explain.c            | 128 ++++++++++++++++++++++
 src/backend/commands/explain_state.c      |   8 ++
 src/backend/executor/nodeBitmapHeapscan.c |  19 +++-
 src/backend/storage/aio/read_stream.c     |  87 +++++++++++++++
 src/include/access/relscan.h              |   6 +
 src/include/access/tableam.h              |   3 +
 src/include/commands/explain_state.h      |   1 +
 src/include/executor/instrument.h         |   1 +
 src/include/executor/instrument_node.h    |  50 +++++++++
 src/include/storage/read_stream.h         |   2 +
 src/tools/pgindent/typedefs.list          |   2 +
 13 files changed, 327 insertions(+), 2 deletions(-)

diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml
index 5b8b521802e..a854c41e963 100644
--- a/doc/src/sgml/ref/explain.sgml
+++ b/doc/src/sgml/ref/explain.sgml
@@ -46,6 +46,7 @@ EXPLAIN [ ( <replaceable class="parameter">option</replaceable> [, ...] ) ] <rep
     TIMING [ <replaceable class="parameter">boolean</replaceable> ]
     SUMMARY [ <replaceable class="parameter">boolean</replaceable> ]
     MEMORY [ <replaceable class="parameter">boolean</replaceable> ]
+    IO [ <replaceable class="parameter">boolean</replaceable> ]
     FORMAT { TEXT | XML | JSON | YAML }
 </synopsis>
  </refsynopsisdiv>
@@ -298,6 +299,17 @@ ROLLBACK;
     </listitem>
    </varlistentry>
 
+   <varlistentry>
+    <term><literal>IO</literal></term>
+    <listitem>
+     <para>
+      Include information on I/O performed by each node.
+      This parameter may only be used when <literal>ANALYZE</literal> is also
+      enabled.  It defaults to <literal>FALSE</literal>.
+     </para>
+    </listitem>
+   </varlistentry>
+
    <varlistentry>
     <term><literal>FORMAT</literal></term>
     <listitem>
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index f6ac5a0897c..89ab9742aa5 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -43,6 +43,7 @@
 #include "catalog/pg_database.h"
 #include "catalog/pg_database_d.h"
 #include "commands/vacuum.h"
+#include "executor/instrument_node.h"
 #include "pgstat.h"
 #include "port/pg_bitutils.h"
 #include "storage/lmgr.h"
@@ -1200,6 +1201,7 @@ heap_beginscan(Relation relation, Snapshot snapshot,
 	scan->rs_base.rs_nkeys = nkeys;
 	scan->rs_base.rs_flags = flags;
 	scan->rs_base.rs_parallel = parallel_scan;
+	scan->rs_base.rs_instrument = NULL;
 	scan->rs_strategy = NULL;	/* set in initscan */
 	scan->rs_cbuf = InvalidBuffer;
 
@@ -1312,6 +1314,14 @@ heap_beginscan(Relation relation, Snapshot snapshot,
 														  sizeof(TBMIterateResult));
 	}
 
+	/* enable read stream instrumentation */
+	if (flags & SO_SCAN_INSTRUMENT)
+	{
+		scan->rs_base.rs_instrument = palloc0_object(TableScanInstrumentation);
+		read_stream_enable_stats(scan->rs_read_stream,
+								 &scan->rs_base.rs_instrument->io);
+	}
+
 	scan->rs_vmbuffer = InvalidBuffer;
 
 	return (TableScanDesc) scan;
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index f151f21f9b3..e9aafdf6e55 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -13,6 +13,7 @@
  */
 #include "postgres.h"
 
+#include "access/relscan.h"
 #include "access/xact.h"
 #include "catalog/pg_type.h"
 #include "commands/createas.h"
@@ -139,6 +140,8 @@ static void show_hashagg_info(AggState *aggstate, ExplainState *es);
 static void show_indexsearches_info(PlanState *planstate, ExplainState *es);
 static void show_tidbitmap_info(BitmapHeapScanState *planstate,
 								ExplainState *es);
+static void show_scan_io_usage(ScanState *planstate,
+							   ExplainState *es);
 static void show_instrumentation_count(const char *qlabel, int which,
 									   PlanState *planstate, ExplainState *es);
 static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es);
@@ -519,6 +522,8 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
 		instrument_option |= INSTRUMENT_BUFFERS;
 	if (es->wal)
 		instrument_option |= INSTRUMENT_WAL;
+	if (es->io)
+		instrument_option |= INSTRUMENT_IO;
 
 	/*
 	 * We always collect timing for the entire statement, even when node-level
@@ -2008,6 +2013,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
 				show_instrumentation_count("Rows Removed by Filter", 1,
 										   planstate, es);
 			show_tidbitmap_info((BitmapHeapScanState *) planstate, es);
+			show_scan_io_usage((ScanState *) planstate, es);
 			break;
 		case T_SampleScan:
 			show_tablesample(((SampleScan *) plan)->tablesample,
@@ -3984,6 +3990,128 @@ show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es)
 	}
 }
 
+/*
+ * Print I/O stats - prefetching and I/O performed
+ *
+ * This prints two types of stats - "prefetch" about the prefetching done by
+ * ReadStream, and "I/O" issued by the stream. The prefetch stats are based
+ * on buffers pulled from the stream (even if no I/O is needed). The I/O
+ * information is related to I/O requests issued by the stream.
+ *
+ * The prefetch stats are printed if any buffer was pulled from the stream.
+ * For the I/O stats it depend on the output format. In non-text formats the
+ * information is printed always, while in text format only if there were
+ * any actual I/O requests.
+ */
+static void
+print_io_usage(ExplainState *es, IOStats *stats)
+{
+	/* don't print prefetch stats if there's nothing to report */
+	if (stats->prefetch_count > 0)
+	{
+		if (es->format == EXPLAIN_FORMAT_TEXT)
+		{
+			/* prefetch distance info */
+			ExplainIndentText(es);
+			appendStringInfo(es->str, "Prefetch: avg=%.2f max=%d capacity=%d\n",
+							 (stats->distance_sum * 1.0 / stats->prefetch_count),
+							 stats->distance_max,
+							 stats->distance_capacity);
+
+			/* prefetch I/O info (only if there were actual I/Os) */
+			if (stats->io_count > 0)
+			{
+				ExplainIndentText(es);
+				appendStringInfo(es->str, "I/O: count=%" PRIu64 " waits=%" PRIu64
+								 " size=%.2f inprogress=%.2f\n",
+								 stats->io_count, stats->wait_count,
+								 (stats->io_nblocks * 1.0 / stats->io_count),
+								 (stats->io_in_progress * 1.0 / stats->io_count));
+			}
+		}
+		else
+		{
+			ExplainPropertyFloat("Average Prefetch Distance", NULL,
+								 (stats->distance_sum * 1.0 / stats->prefetch_count), 3, es);
+			ExplainPropertyInteger("Max Prefetch Distance", NULL,
+								   stats->distance_max, es);
+			ExplainPropertyInteger("Prefetch Capacity", NULL,
+								   stats->distance_capacity, es);
+
+			ExplainPropertyUInteger("I/O Count", NULL,
+									stats->io_count, es);
+			ExplainPropertyUInteger("I/O Waits", NULL,
+									stats->wait_count, es);
+			ExplainPropertyFloat("Average I/O Size", NULL,
+								 (stats->io_nblocks * 1.0 / Max(1, stats->io_count)), 3, es);
+			ExplainPropertyFloat("Average I/Os In Progress", NULL,
+								 (stats->io_in_progress * 1.0 / Max(1, stats->io_count)), 3, es);
+		}
+	}
+}
+
+/*
+ * Show information about prefetch and I/O in a scan node.
+ */
+static void
+show_scan_io_usage(ScanState *planstate, ExplainState *es)
+{
+	Plan	   *plan = planstate->ps.plan;
+	IOStats		stats = {0};
+
+	if (!es->io)
+		return;
+
+	/*
+	 * Initialize counters with stats from the local process first.
+	 *
+	 * The scan descriptor may not exist, e.g. if the scan did not start, or
+	 * because of debug_parallel_query=regress. We still want to collect data
+	 * from workers.
+	 */
+	if (planstate->ss_currentScanDesc &&
+		planstate->ss_currentScanDesc->rs_instrument)
+	{
+		stats = planstate->ss_currentScanDesc->rs_instrument->io;
+	}
+
+	/*
+	 * Accumulate data from parallel workers (if any).
+	 */
+	switch (nodeTag(plan))
+	{
+		case T_BitmapHeapScan:
+			{
+				SharedBitmapHeapInstrumentation *sinstrument
+				= ((BitmapHeapScanState *) planstate)->sinstrument;
+
+				if (sinstrument)
+				{
+					for (int i = 0; i < sinstrument->num_workers; ++i)
+					{
+						BitmapHeapScanInstrumentation *winstrument = &sinstrument->sinstrument[i];
+
+						AccumulateIOStats(&stats, &winstrument->stats.io);
+
+						if (!es->workers_state)
+							continue;
+
+						ExplainOpenWorker(i, es);
+						print_io_usage(es, &winstrument->stats.io);
+						ExplainCloseWorker(i, es);
+					}
+				}
+
+				break;
+			}
+		default:
+			/* ignore other plans */
+			return;
+	}
+
+	print_io_usage(es, &stats);
+}
+
 /*
  * If it's EXPLAIN ANALYZE, show instrumentation information for a plan node
  *
diff --git a/src/backend/commands/explain_state.c b/src/backend/commands/explain_state.c
index 65dd4111459..0e07a63fca6 100644
--- a/src/backend/commands/explain_state.c
+++ b/src/backend/commands/explain_state.c
@@ -162,6 +162,8 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate)
 								"EXPLAIN", opt->defname, p),
 						 parser_errposition(pstate, opt->location)));
 		}
+		else if (strcmp(opt->defname, "io") == 0)
+			es->io = defGetBoolean(opt);
 		else if (!ApplyExtensionExplainOption(es, opt, pstate))
 			ereport(ERROR,
 					(errcode(ERRCODE_SYNTAX_ERROR),
@@ -188,6 +190,12 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate)
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("EXPLAIN option %s requires ANALYZE", "TIMING")));
 
+	/* check that IO is used with EXPLAIN ANALYZE */
+	if (es->io && !es->analyze)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("EXPLAIN option %s requires ANALYZE", "IO")));
+
 	/* check that serialize is used with EXPLAIN ANALYZE */
 	if (es->serialize != EXPLAIN_SERIALIZE_NONE && !es->analyze)
 		ereport(ERROR,
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index d65e2a87b42..83d6478bc2b 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -144,13 +144,20 @@ BitmapTableScanSetup(BitmapHeapScanState *node)
 	 */
 	if (!node->ss.ss_currentScanDesc)
 	{
+		uint32		flags = SO_NONE;
+
+		if (ScanRelIsReadOnly(&node->ss))
+			flags |= SO_HINT_REL_READ_ONLY;
+
+		if (node->ss.ps.state->es_instrument & INSTRUMENT_IO)
+			flags |= SO_SCAN_INSTRUMENT;
+
 		node->ss.ss_currentScanDesc =
 			table_beginscan_bm(node->ss.ss_currentRelation,
 							   node->ss.ps.state->es_snapshot,
 							   0,
 							   NULL,
-							   ScanRelIsReadOnly(&node->ss) ?
-							   SO_HINT_REL_READ_ONLY : SO_NONE);
+							   flags);
 	}
 
 	node->ss.ss_currentScanDesc->st.rs_tbmiterator = tbmiterator;
@@ -330,6 +337,14 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
 		 */
 		si->exact_pages += node->stats.exact_pages;
 		si->lossy_pages += node->stats.lossy_pages;
+
+		/* collect I/O instrumentation for this process */
+		if (node->ss.ss_currentScanDesc &&
+			node->ss.ss_currentScanDesc->rs_instrument)
+		{
+			AccumulateIOStats(&si->stats.io,
+							  &node->ss.ss_currentScanDesc->rs_instrument->io);
+		}
 	}
 
 	/*
diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index 0b6cdf7c873..936e08ea450 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -74,6 +74,7 @@
 #include "postgres.h"
 
 #include "miscadmin.h"
+#include "executor/instrument_node.h"
 #include "storage/aio.h"
 #include "storage/fd.h"
 #include "storage/smgr.h"
@@ -123,6 +124,9 @@ struct ReadStream
 	bool		advice_enabled;
 	bool		temporary;
 
+	/* scan stats counters */
+	IOStats    *stats;
+
 	/*
 	 * One-block buffer to support 'ungetting' a block number, to resolve flow
 	 * control problems when I/Os are split.
@@ -188,6 +192,73 @@ block_range_read_stream_cb(ReadStream *stream,
 	return InvalidBlockNumber;
 }
 
+/*
+ * Update stream stats with current pinned buffer depth.
+ *
+ * Called once per buffer returned to the consumer in read_stream_next_buffer().
+ * Records the number of pinned buffers at that moment, so we can compute the
+ * average look-ahead depth.
+ */
+static inline void
+read_stream_count_prefetch(ReadStream *stream)
+{
+	IOStats    *stats = stream->stats;
+
+	if (stats == NULL)
+		return;
+
+	stats->prefetch_count++;
+	stats->distance_sum += stream->pinned_buffers;
+	if (stream->pinned_buffers > stats->distance_max)
+		stats->distance_max = stream->pinned_buffers;
+}
+
+/*
+ * Update stream stats about size of I/O requests.
+ *
+ * We count the number of I/O requests, size of requests (counted in blocks)
+ * and number of in-progress I/Os.
+ */
+static inline void
+read_stream_count_io(ReadStream *stream, int nblocks, int in_progress)
+{
+	IOStats    *stats = stream->stats;
+
+	if (stats == NULL)
+		return;
+
+	stats->io_count++;
+	stats->io_nblocks += nblocks;
+	stats->io_in_progress += in_progress;
+}
+
+/*
+ * Update stream stats about waits for I/O when consuming buffers.
+ *
+ * We count the number of I/O waits while pulling buffers out of a stream.
+ */
+static inline void
+read_stream_count_wait(ReadStream *stream)
+{
+	IOStats    *stats = stream->stats;
+
+	if (stats == NULL)
+		return;
+
+	stats->wait_count++;
+}
+
+/*
+ * Enable collection of stats into the provided IOStats.
+ */
+void
+read_stream_enable_stats(ReadStream *stream, IOStats *stats)
+{
+	stream->stats = stats;
+	if (stream->stats)
+		stream->stats->distance_capacity = stream->max_pinned_buffers;
+}
+
 /*
  * Ask the callback which block it would like us to read next, with a one block
  * buffer in front to allow read_stream_unget_block() to work.
@@ -426,6 +497,9 @@ read_stream_start_pending_read(ReadStream *stream)
 		Assert(stream->ios_in_progress < stream->max_ios);
 		stream->ios_in_progress++;
 		stream->seq_blocknum = stream->pending_read_blocknum + nblocks;
+
+		/* update I/O stats */
+		read_stream_count_io(stream, nblocks, stream->ios_in_progress);
 	}
 
 	/*
@@ -1021,6 +1095,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 										flags)))
 			{
 				/* Fast return. */
+				read_stream_count_prefetch(stream);
 				return buffer;
 			}
 
@@ -1036,6 +1111,12 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 			 * to avoid having to effectively do another synchronous IO for
 			 * the next block (if it were also a miss).
 			 */
+
+			/* update I/O stats */
+			read_stream_count_io(stream, 1, stream->ios_in_progress);
+
+			/* update prefetch distance */
+			read_stream_count_prefetch(stream);
 		}
 		else
 		{
@@ -1100,6 +1181,10 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 
 		needed_wait = WaitReadBuffers(&stream->ios[io_index].op);
 
+		/* Count it as a wait if we need to wait for IO */
+		if (needed_wait)
+			read_stream_count_wait(stream);
+
 		Assert(stream->ios_in_progress > 0);
 		stream->ios_in_progress--;
 		if (++stream->oldest_io_index == stream->max_ios)
@@ -1228,6 +1313,8 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 	}
 #endif
 
+	read_stream_count_prefetch(stream);
+
 	/* Pin transferred to caller. */
 	Assert(stream->pinned_buffers > 0);
 	stream->pinned_buffers--;
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index fd2076c582a..2ea06a67a63 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -24,6 +24,7 @@
 
 
 struct ParallelTableScanDescData;
+struct TableScanInstrumentation;
 
 /*
  * Generic descriptor for table scans. This is the base-class for table scans,
@@ -64,6 +65,11 @@ typedef struct TableScanDescData
 
 	struct ParallelTableScanDescData *rs_parallel;	/* parallel scan
 													 * information */
+
+	/*
+	 * Instrumentation counters maintained by all table AMs.
+	 */
+	struct TableScanInstrumentation *rs_instrument;
 } TableScanDescData;
 typedef struct TableScanDescData *TableScanDesc;
 
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index a21c7db5439..c13f05d39db 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -69,6 +69,9 @@ typedef enum ScanOptions
 
 	/* set if the query doesn't modify the relation */
 	SO_HINT_REL_READ_ONLY = 1 << 10,
+
+	/* collect scan instrumentation */
+	SO_SCAN_INSTRUMENT = 1 << 11,
 }			ScanOptions;
 
 /*
diff --git a/src/include/commands/explain_state.h b/src/include/commands/explain_state.h
index 6252fe11f15..97bc7ed49f6 100644
--- a/src/include/commands/explain_state.h
+++ b/src/include/commands/explain_state.h
@@ -55,6 +55,7 @@ typedef struct ExplainState
 	bool		summary;		/* print total planning and execution timing */
 	bool		memory;			/* print planner's memory usage information */
 	bool		settings;		/* print modified settings */
+	bool		io;				/* print info about IO (prefetch, ...) */
 	bool		generic;		/* generate a generic plan */
 	ExplainSerializeOption serialize;	/* serialize the query's output? */
 	ExplainFormat format;		/* output format */
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index cc9fbb0e2f0..f093a52aae0 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -64,6 +64,7 @@ typedef enum InstrumentOption
 	INSTRUMENT_BUFFERS = 1 << 1,	/* needs buffer usage */
 	INSTRUMENT_ROWS = 1 << 2,	/* needs row count */
 	INSTRUMENT_WAL = 1 << 3,	/* needs WAL usage */
+	INSTRUMENT_IO = 1 << 4,		/* needs IO usage */
 	INSTRUMENT_ALL = PG_INT32_MAX
 } InstrumentOption;
 
diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h
index e6a3f9f1941..22a75ccd863 100644
--- a/src/include/executor/instrument_node.h
+++ b/src/include/executor/instrument_node.h
@@ -48,6 +48,55 @@ typedef struct SharedAggInfo
 } SharedAggInfo;
 
 
+/* ---------------------
+ *	Instrumentation information about read streams and I/O
+ * ---------------------
+ */
+typedef struct IOStats
+{
+	/* number of buffers returned to consumer (for averaging distance) */
+	uint64		prefetch_count;
+
+	/* sum of pinned_buffers sampled at each buffer return */
+	uint64		distance_sum;
+
+	/* maximum actual pinned_buffers observed during the scan */
+	int16		distance_max;
+
+	/* maximum possible look-ahead distance (max_pinned_buffers) */
+	int16		distance_capacity;
+
+	/* number of waits for a read (for the I/O) */
+	uint64		wait_count;
+
+	/* I/O stats */
+	uint64		io_count;		/* number of I/Os */
+	uint64		io_nblocks;		/* sum of blocks for all I/Os */
+	uint64		io_in_progress; /* sum of in-progress I/Os */
+} IOStats;
+
+typedef struct TableScanInstrumentation
+{
+	IOStats		io;
+} TableScanInstrumentation;
+
+/* merge IO statistics from 'src' into 'dst' */
+static inline void
+AccumulateIOStats(IOStats *dst, IOStats *src)
+{
+	dst->prefetch_count += src->prefetch_count;
+	dst->distance_sum += src->distance_sum;
+	if (src->distance_max > dst->distance_max)
+		dst->distance_max = src->distance_max;
+	if (src->distance_capacity > dst->distance_capacity)
+		dst->distance_capacity = src->distance_capacity;
+	dst->wait_count += src->wait_count;
+	dst->io_count += src->io_count;
+	dst->io_nblocks += src->io_nblocks;
+	dst->io_in_progress += src->io_in_progress;
+}
+
+
 /* ---------------------
  *	Instrumentation information for indexscans (amgettuple and amgetbitmap)
  * ---------------------
@@ -79,6 +128,7 @@ typedef struct BitmapHeapScanInstrumentation
 {
 	uint64		exact_pages;
 	uint64		lossy_pages;
+	TableScanInstrumentation stats;
 } BitmapHeapScanInstrumentation;
 
 /*
diff --git a/src/include/storage/read_stream.h b/src/include/storage/read_stream.h
index c9359b29b0f..48995c6d534 100644
--- a/src/include/storage/read_stream.h
+++ b/src/include/storage/read_stream.h
@@ -65,6 +65,7 @@
 
 struct ReadStream;
 typedef struct ReadStream ReadStream;
+struct IOStats;
 
 /* for block_range_read_stream_cb */
 typedef struct BlockRangeReadStreamPrivate
@@ -103,5 +104,6 @@ extern BlockNumber read_stream_pause(ReadStream *stream);
 extern void read_stream_resume(ReadStream *stream);
 extern void read_stream_reset(ReadStream *stream);
 extern void read_stream_end(ReadStream *stream);
+extern void read_stream_enable_stats(ReadStream *stream, struct IOStats *stats);
 
 #endif							/* READ_STREAM_H */
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 9e6a39f5608..98b8d78e693 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1279,6 +1279,7 @@ IOContext
 IOFuncSelector
 IOObject
 IOOp
+IOStats
 IO_STATUS_BLOCK
 IPCompareMethod
 ITEM
@@ -3127,6 +3128,7 @@ TableLikeClause
 TableSampleClause
 TableScanDesc
 TableScanDescData
+TableScanInstrumentation
 TableSpaceCacheEntry
 TableSpaceOpts
 TableToProcess
-- 
2.53.0

From 38d3f24ac878be825d80edcc2c90d466791b51b5 Mon Sep 17 00:00:00 2001
From: test <test>
Date: Tue, 7 Apr 2026 02:04:06 +0200
Subject: [PATCH v13 3/5] auto_explain: Add new GUC auto_explain.log_io

Allows enabling the new EXPLAIN "IO" option for auto_explain.

Reviewed-by: Melanie Plageman <[email protected]>
Reviewed-by: Lukas Fittl <[email protected]>
Discussion: https://postgr.es/m/flat/a177a6dd-240b-455a-8f25-aca0b1c08c6e%40vondra.me
---
 contrib/auto_explain/auto_explain.c | 15 +++++++++++++++
 doc/src/sgml/auto-explain.sgml      | 20 ++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c
index 6ceae1c69ce..9114ced45a7 100644
--- a/contrib/auto_explain/auto_explain.c
+++ b/contrib/auto_explain/auto_explain.c
@@ -38,6 +38,7 @@ static int	auto_explain_log_parameter_max_length = -1; /* bytes or -1 */
 static bool auto_explain_log_analyze = false;
 static bool auto_explain_log_verbose = false;
 static bool auto_explain_log_buffers = false;
+static bool auto_explain_log_io = false;
 static bool auto_explain_log_wal = false;
 static bool auto_explain_log_triggers = false;
 static bool auto_explain_log_timing = true;
@@ -203,6 +204,17 @@ _PG_init(void)
 							 NULL,
 							 NULL);
 
+	DefineCustomBoolVariable("auto_explain.log_io",
+							 "Log I/O statistics.",
+							 NULL,
+							 &auto_explain_log_io,
+							 false,
+							 PGC_SUSET,
+							 0,
+							 NULL,
+							 NULL,
+							 NULL);
+
 	DefineCustomBoolVariable("auto_explain.log_wal",
 							 "Log WAL usage.",
 							 NULL,
@@ -343,6 +355,8 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags)
 				queryDesc->instrument_options |= INSTRUMENT_ROWS;
 			if (auto_explain_log_buffers)
 				queryDesc->instrument_options |= INSTRUMENT_BUFFERS;
+			if (auto_explain_log_io)
+				queryDesc->instrument_options |= INSTRUMENT_IO;
 			if (auto_explain_log_wal)
 				queryDesc->instrument_options |= INSTRUMENT_WAL;
 		}
@@ -440,6 +454,7 @@ explain_ExecutorEnd(QueryDesc *queryDesc)
 			es->analyze = (queryDesc->instrument_options && auto_explain_log_analyze);
 			es->verbose = auto_explain_log_verbose;
 			es->buffers = (es->analyze && auto_explain_log_buffers);
+			es->io = (es->analyze && auto_explain_log_io);
 			es->wal = (es->analyze && auto_explain_log_wal);
 			es->timing = (es->analyze && auto_explain_log_timing);
 			es->summary = es->analyze;
diff --git a/doc/src/sgml/auto-explain.sgml b/doc/src/sgml/auto-explain.sgml
index ee85a67eb2e..06a8fcc6c5b 100644
--- a/doc/src/sgml/auto-explain.sgml
+++ b/doc/src/sgml/auto-explain.sgml
@@ -128,6 +128,26 @@ LOAD 'auto_explain';
     </listitem>
    </varlistentry>
 
+   <varlistentry id="auto-explain-configuration-parameters-log-io">
+    <term>
+     <varname>auto_explain.log_io</varname> (<type>boolean</type>)
+     <indexterm>
+      <primary><varname>auto_explain.log_io</varname> configuration parameter</primary>
+     </indexterm>
+    </term>
+    <listitem>
+     <para>
+      <varname>auto_explain.log_io</varname> controls whether I/O usage
+      statistics are printed when an execution plan is logged; it's
+      equivalent to the <literal>IO</literal> option of <command>EXPLAIN</command>.
+      This parameter has no effect
+      unless <varname>auto_explain.log_analyze</varname> is enabled.
+      This parameter is off by default.
+      Only superusers can change this setting.
+     </para>
+    </listitem>
+   </varlistentry>
+
    <varlistentry id="auto-explain-configuration-parameters-log-wal">
     <term>
      <varname>auto_explain.log_wal</varname> (<type>boolean</type>)
-- 
2.53.0

From 7f0269e4307f5bb8bd850092aba8a5df8411c587 Mon Sep 17 00:00:00 2001
From: test <test>
Date: Tue, 7 Apr 2026 01:38:53 +0200
Subject: [PATCH v13 4/5] Add EXPLAIN (IO) instrumentation for SeqScan

Adds support for EXPLAIN (IO) instrumentation for sequential scans. This
requires adding shared instrumentation for parallel scans, using the
separate DSM approach introduced by dd78e69cfc33.

Reviewed-by: Melanie Plageman <[email protected]>
Reviewed-by: Lukas Fittl <[email protected]>
Reviewed-by: Andres Freund <[email protected]>
Discussion: https://postgr.es/m/flat/a177a6dd-240b-455a-8f25-aca0b1c08c6e%40vondra.me
---
 src/backend/commands/explain.c         |  25 +++++
 src/backend/executor/execParallel.c    |  11 +++
 src/backend/executor/nodeSeqscan.c     | 127 +++++++++++++++++++++++--
 src/include/executor/instrument_node.h |  19 ++++
 src/include/executor/nodeSeqscan.h     |   9 ++
 src/include/nodes/execnodes.h          |   1 +
 src/test/regress/expected/explain.out  |  18 +++-
 src/test/regress/sql/explain.sql       |   4 +-
 src/tools/pgindent/typedefs.list       |   2 +
 9 files changed, 203 insertions(+), 13 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index e9aafdf6e55..18d424da944 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -2032,6 +2032,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
 										   planstate, es);
 			if (IsA(plan, CteScan))
 				show_ctescan_info(castNode(CteScanState, planstate), es);
+			show_scan_io_usage((ScanState *) planstate, es);
 			break;
 		case T_Gather:
 			{
@@ -4102,6 +4103,30 @@ show_scan_io_usage(ScanState *planstate, ExplainState *es)
 					}
 				}
 
+				break;
+			}
+		case T_SeqScan:
+			{
+				SharedSeqScanInstrumentation *sinstrument
+				= ((SeqScanState *) planstate)->sinstrument;
+
+				if (sinstrument)
+				{
+					for (int i = 0; i < sinstrument->num_workers; ++i)
+					{
+						SeqScanInstrumentation *winstrument = &sinstrument->sinstrument[i];
+
+						AccumulateIOStats(&stats, &winstrument->stats.io);
+
+						if (!es->workers_state)
+							continue;
+
+						ExplainOpenWorker(i, es);
+						print_io_usage(es, &winstrument->stats.io);
+						ExplainCloseWorker(i, es);
+					}
+				}
+
 				break;
 			}
 		default:
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index 1a5ec0c305f..9690f0938ae 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -257,6 +257,9 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
 			if (planstate->plan->parallel_aware)
 				ExecSeqScanEstimate((SeqScanState *) planstate,
 									e->pcxt);
+			/* even when not parallel-aware, for EXPLAIN ANALYZE */
+			ExecSeqScanInstrumentEstimate((SeqScanState *) planstate,
+										  e->pcxt);
 			break;
 		case T_IndexScanState:
 			if (planstate->plan->parallel_aware)
@@ -500,6 +503,9 @@ ExecParallelInitializeDSM(PlanState *planstate,
 			if (planstate->plan->parallel_aware)
 				ExecSeqScanInitializeDSM((SeqScanState *) planstate,
 										 d->pcxt);
+			/* even when not parallel-aware, for EXPLAIN ANALYZE */
+			ExecSeqScanInstrumentInitDSM((SeqScanState *) planstate,
+										 d->pcxt);
 			break;
 		case T_IndexScanState:
 			if (planstate->plan->parallel_aware)
@@ -1148,6 +1154,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
 		case T_BitmapHeapScanState:
 			ExecBitmapHeapRetrieveInstrumentation((BitmapHeapScanState *) planstate);
 			break;
+		case T_SeqScanState:
+			ExecSeqScanRetrieveInstrumentation((SeqScanState *) planstate);
+			break;
 		default:
 			break;
 	}
@@ -1388,6 +1397,8 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt)
 		case T_SeqScanState:
 			if (planstate->plan->parallel_aware)
 				ExecSeqScanInitializeWorker((SeqScanState *) planstate, pwcxt);
+			/* even when not parallel-aware, for EXPLAIN ANALYZE */
+			ExecSeqScanInstrumentInitWorker((SeqScanState *) planstate, pwcxt);
 			break;
 		case T_IndexScanState:
 			if (planstate->plan->parallel_aware)
diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c
index 04803b0e37d..a4fcf7c7ad4 100644
--- a/src/backend/executor/nodeSeqscan.c
+++ b/src/backend/executor/nodeSeqscan.c
@@ -29,6 +29,7 @@
 
 #include "access/relscan.h"
 #include "access/tableam.h"
+#include "executor/execParallel.h"
 #include "executor/execScan.h"
 #include "executor/executor.h"
 #include "executor/nodeSeqscan.h"
@@ -65,15 +66,21 @@ SeqNext(SeqScanState *node)
 
 	if (scandesc == NULL)
 	{
+		uint32		flags = SO_NONE;
+
+		if (ScanRelIsReadOnly(&node->ss))
+			flags |= SO_HINT_REL_READ_ONLY;
+
+		if (estate->es_instrument & INSTRUMENT_IO)
+			flags |= SO_SCAN_INSTRUMENT;
+
 		/*
 		 * We reach here if the scan is not parallel, or if we're serially
 		 * executing a scan that was planned to be parallel.
 		 */
 		scandesc = table_beginscan(node->ss.ss_currentRelation,
 								   estate->es_snapshot,
-								   0, NULL,
-								   ScanRelIsReadOnly(&node->ss) ?
-								   SO_HINT_REL_READ_ONLY : SO_NONE);
+								   0, NULL, flags);
 		node->ss.ss_currentScanDesc = scandesc;
 	}
 
@@ -302,6 +309,22 @@ ExecEndSeqScan(SeqScanState *node)
 	 */
 	scanDesc = node->ss.ss_currentScanDesc;
 
+	/*
+	 * Collect I/O stats for this process into shared instrumentation.
+	 */
+	if (node->sinstrument != NULL && IsParallelWorker())
+	{
+		SeqScanInstrumentation *si;
+
+		Assert(ParallelWorkerNumber < node->sinstrument->num_workers);
+		si = &node->sinstrument->sinstrument[ParallelWorkerNumber];
+
+		if (scanDesc && scanDesc->rs_instrument)
+		{
+			AccumulateIOStats(&si->stats.io, &scanDesc->rs_instrument->io);
+		}
+	}
+
 	/*
 	 * close heap scan
 	 */
@@ -370,6 +393,13 @@ ExecSeqScanInitializeDSM(SeqScanState *node,
 {
 	EState	   *estate = node->ss.ps.state;
 	ParallelTableScanDesc pscan;
+	uint32		flags = SO_NONE;
+
+	if (ScanRelIsReadOnly(&node->ss))
+		flags |= SO_HINT_REL_READ_ONLY;
+
+	if (estate->es_instrument & INSTRUMENT_IO)
+		flags |= SO_SCAN_INSTRUMENT;
 
 	pscan = shm_toc_allocate(pcxt->toc, node->pscan_len);
 	table_parallelscan_initialize(node->ss.ss_currentRelation,
@@ -378,9 +408,7 @@ ExecSeqScanInitializeDSM(SeqScanState *node,
 	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan);
 
 	node->ss.ss_currentScanDesc =
-		table_beginscan_parallel(node->ss.ss_currentRelation, pscan,
-								 ScanRelIsReadOnly(&node->ss) ?
-								 SO_HINT_REL_READ_ONLY : SO_NONE);
+		table_beginscan_parallel(node->ss.ss_currentRelation, pscan, flags);
 }
 
 /* ----------------------------------------------------------------
@@ -410,10 +438,91 @@ ExecSeqScanInitializeWorker(SeqScanState *node,
 							ParallelWorkerContext *pwcxt)
 {
 	ParallelTableScanDesc pscan;
+	uint32		flags = SO_NONE;
+
+	if (ScanRelIsReadOnly(&node->ss))
+		flags |= SO_HINT_REL_READ_ONLY;
+
+	if (node->ss.ps.state->es_instrument & INSTRUMENT_IO)
+		flags |= SO_SCAN_INSTRUMENT;
 
 	pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
 	node->ss.ss_currentScanDesc =
-		table_beginscan_parallel(node->ss.ss_currentRelation, pscan,
-								 ScanRelIsReadOnly(&node->ss) ?
-								 SO_HINT_REL_READ_ONLY : SO_NONE);
+		table_beginscan_parallel(node->ss.ss_currentRelation, pscan, flags);
+}
+
+/*
+ * Compute the amount of space we'll need for the shared instrumentation and
+ * inform pcxt->estimator.
+ */
+void
+ExecSeqScanInstrumentEstimate(SeqScanState *node, ParallelContext *pcxt)
+{
+	EState	   *estate = node->ss.ps.state;
+
+	if (!estate->es_instrument || pcxt->nworkers == 0)
+		return;
+
+	shm_toc_estimate_chunk(&pcxt->estimator,
+						   offsetof(SharedSeqScanInstrumentation, sinstrument) +
+						   sizeof(SeqScanInstrumentation) * pcxt->nworkers);
+	shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/*
+ * Set up parallel sequential scan instrumentation.
+ */
+void
+ExecSeqScanInstrumentInitDSM(SeqScanState *node, ParallelContext *pcxt)
+{
+	EState	   *estate = node->ss.ps.state;
+	SharedSeqScanInstrumentation *sinstrument;
+	Size		size;
+
+	if (!estate->es_instrument || pcxt->nworkers == 0)
+		return;
+
+	size = offsetof(SharedSeqScanInstrumentation, sinstrument) +
+		sizeof(SeqScanInstrumentation) * pcxt->nworkers;
+	sinstrument = shm_toc_allocate(pcxt->toc, size);
+	memset(sinstrument, 0, size);
+	sinstrument->num_workers = pcxt->nworkers;
+	shm_toc_insert(pcxt->toc,
+				   node->ss.ps.plan->plan_node_id + PARALLEL_KEY_SCAN_INSTRUMENT_OFFSET,
+				   sinstrument);
+	node->sinstrument = sinstrument;
+}
+
+/*
+ * Look up and save the location of the shared instrumentation.
+ */
+void
+ExecSeqScanInstrumentInitWorker(SeqScanState *node,
+								ParallelWorkerContext *pwcxt)
+{
+	if (!node->ss.ps.state->es_instrument)
+		return;
+
+	node->sinstrument = shm_toc_lookup(pwcxt->toc,
+									   node->ss.ps.plan->plan_node_id + PARALLEL_KEY_SCAN_INSTRUMENT_OFFSET,
+									   true);
+}
+
+/*
+ * Transfer sequential scan instrumentation from DSM to private memory.
+ */
+void
+ExecSeqScanRetrieveInstrumentation(SeqScanState *node)
+{
+	SharedSeqScanInstrumentation *sinstrument = node->sinstrument;
+	Size		size;
+
+	if (sinstrument == NULL)
+		return;
+
+	size = offsetof(SharedSeqScanInstrumentation, sinstrument)
+		+ sinstrument->num_workers * sizeof(SeqScanInstrumentation);
+
+	node->sinstrument = palloc(size);
+	memcpy(node->sinstrument, sinstrument, size);
 }
diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h
index 22a75ccd863..003dc262b5d 100644
--- a/src/include/executor/instrument_node.h
+++ b/src/include/executor/instrument_node.h
@@ -266,4 +266,23 @@ typedef struct SharedIncrementalSortInfo
 	IncrementalSortInfo sinfo[FLEXIBLE_ARRAY_MEMBER];
 } SharedIncrementalSortInfo;
 
+
+/* ---------------------
+ *	Instrumentation information for sequential scans
+ * ---------------------
+ */
+typedef struct SeqScanInstrumentation
+{
+	TableScanInstrumentation stats;
+} SeqScanInstrumentation;
+
+/*
+ * Shared memory container for per-worker information
+ */
+typedef struct SharedSeqScanInstrumentation
+{
+	int			num_workers;
+	SeqScanInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];
+} SharedSeqScanInstrumentation;
+
 #endif							/* INSTRUMENT_NODE_H */
diff --git a/src/include/executor/nodeSeqscan.h b/src/include/executor/nodeSeqscan.h
index 7a1490596fb..9c0ad4879d7 100644
--- a/src/include/executor/nodeSeqscan.h
+++ b/src/include/executor/nodeSeqscan.h
@@ -28,4 +28,13 @@ extern void ExecSeqScanReInitializeDSM(SeqScanState *node, ParallelContext *pcxt
 extern void ExecSeqScanInitializeWorker(SeqScanState *node,
 										ParallelWorkerContext *pwcxt);
 
+/* instrument support */
+extern void ExecSeqScanInstrumentEstimate(SeqScanState *node,
+										  ParallelContext *pcxt);
+extern void ExecSeqScanInstrumentInitDSM(SeqScanState *node,
+										 ParallelContext *pcxt);
+extern void ExecSeqScanInstrumentInitWorker(SeqScanState *node,
+											ParallelWorkerContext *pwcxt);
+extern void ExecSeqScanRetrieveInstrumentation(SeqScanState *node);
+
 #endif							/* NODESEQSCAN_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 3ecae7552fc..56febb3204c 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1670,6 +1670,7 @@ typedef struct SeqScanState
 {
 	ScanState	ss;				/* its first field is NodeTag */
 	Size		pscan_len;		/* size of parallel heap scan descriptor */
+	struct SharedSeqScanInstrumentation *sinstrument;
 } SeqScanState;
 
 /* ----------------
diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out
index dc31c7ce9f9..74a4d87801e 100644
--- a/src/test/regress/expected/explain.out
+++ b/src/test/regress/expected/explain.out
@@ -100,7 +100,7 @@ select explain_filter('explain (buffers, format text) select * from int8_tbl i8'
 (1 row)
 
 \a
-select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8');
+select explain_filter('explain (analyze, buffers, io, format xml) select * from int8_tbl i8');
 explain_filter
 <explain xmlns="http://www.postgresql.org/N/explain";>
   <Query>
@@ -119,6 +119,13 @@ explain_filter
       <Actual-Rows>N.N</Actual-Rows>
       <Actual-Loops>N</Actual-Loops>
       <Disabled>false</Disabled>
+      <Average-Prefetch-Distance>N.N</Average-Prefetch-Distance>
+      <Max-Prefetch-Distance>N</Max-Prefetch-Distance>
+      <Prefetch-Capacity>N</Prefetch-Capacity>
+      <I-O-Count>N</I-O-Count>
+      <I-O-Waits>N</I-O-Waits>
+      <Average-I-O-Size>N.N</Average-I-O-Size>
+      <Average-I-Os-In-Progress>N.N</Average-I-Os-In-Progress>
       <Shared-Hit-Blocks>N</Shared-Hit-Blocks>
       <Shared-Read-Blocks>N</Shared-Read-Blocks>
       <Shared-Dirtied-Blocks>N</Shared-Dirtied-Blocks>
@@ -149,7 +156,7 @@ explain_filter
   </Query>
 </explain>
 (1 row)
-select explain_filter('explain (analyze, serialize, buffers, format yaml) select * from int8_tbl i8');
+select explain_filter('explain (analyze, serialize, buffers, io, format yaml) select * from int8_tbl i8');
 explain_filter
 - Plan: 
     Node Type: "Seq Scan"
@@ -166,6 +173,13 @@ explain_filter
     Actual Rows: N.N
     Actual Loops: N
     Disabled: false
+    Average Prefetch Distance: N.N
+    Max Prefetch Distance: N
+    Prefetch Capacity: N
+    I/O Count: N
+    I/O Waits: N
+    Average I/O Size: N.N
+    Average I/Os In Progress: N.N
     Shared Hit Blocks: N
     Shared Read Blocks: N
     Shared Dirtied Blocks: N
diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql
index 8f10e1aff55..2f163c64bf6 100644
--- a/src/test/regress/sql/explain.sql
+++ b/src/test/regress/sql/explain.sql
@@ -69,8 +69,8 @@ select explain_filter('explain (analyze, buffers, format text) select * from int
 select explain_filter('explain (buffers, format text) select * from int8_tbl i8');
 
 \a
-select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8');
-select explain_filter('explain (analyze, serialize, buffers, format yaml) select * from int8_tbl i8');
+select explain_filter('explain (analyze, buffers, io, format xml) select * from int8_tbl i8');
+select explain_filter('explain (analyze, serialize, buffers, io, format yaml) select * from int8_tbl i8');
 select explain_filter('explain (buffers, format json) select * from int8_tbl i8');
 \a
 
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 98b8d78e693..c0436a13ac3 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2800,6 +2800,7 @@ SelfJoinCandidate
 SemTPadded
 SemiAntiJoinFactors
 SeqScan
+SeqScanInstrumentation
 SeqScanState
 SeqTable
 SeqTableData
@@ -2864,6 +2865,7 @@ SharedMemoizeInfo
 SharedRecordTableEntry
 SharedRecordTableKey
 SharedRecordTypmodRegistry
+SharedSeqScanInstrumentation
 SharedSortInfo
 SharedTuplestore
 SharedTuplestoreAccessor
-- 
2.53.0

From 5facc503442201929c416bd37233ccd3d85ee45d Mon Sep 17 00:00:00 2001
From: Melanie Plageman <[email protected]>
Date: Sun, 5 Apr 2026 17:31:18 -0400
Subject: [PATCH v13 5/5] Add EXPLAIN (IO) instrumentation for TidRangeScan

Adds support for EXPLAIN (IO) instrumentation for TidRange scans. This
requires adding shared instrumentation for parallel scans, using the
separate DSM approach introduced by dd78e69cfc33.

Reviewed-by: Melanie Plageman <[email protected]>
Reviewed-by: Lukas Fittl <[email protected]>
Reviewed-by: Andres Freund <[email protected]>
Discussion: https://postgr.es/m/flat/a177a6dd-240b-455a-8f25-aca0b1c08c6e%40vondra.me
---
 src/backend/commands/explain.c          |  25 +++++
 src/backend/executor/execParallel.c     |  12 +++
 src/backend/executor/nodeTidrangescan.c | 127 ++++++++++++++++++++++--
 src/include/executor/instrument_node.h  |  18 ++++
 src/include/executor/nodeTidrangescan.h |   9 ++
 src/include/nodes/execnodes.h           |   1 +
 src/tools/pgindent/typedefs.list        |   2 +
 7 files changed, 186 insertions(+), 8 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 18d424da944..b8e4c4eb2da 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -2149,6 +2149,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
 				if (plan->qual)
 					show_instrumentation_count("Rows Removed by Filter", 1,
 											   planstate, es);
+				show_scan_io_usage((ScanState *) planstate, es);
 			}
 			break;
 		case T_ForeignScan:
@@ -4127,6 +4128,30 @@ show_scan_io_usage(ScanState *planstate, ExplainState *es)
 					}
 				}
 
+				break;
+			}
+		case T_TidRangeScan:
+			{
+				SharedTidRangeScanInstrumentation *sinstrument
+				= ((TidRangeScanState *) planstate)->trss_sinstrument;
+
+				if (sinstrument)
+				{
+					for (int i = 0; i < sinstrument->num_workers; ++i)
+					{
+						TidRangeScanInstrumentation *winstrument = &sinstrument->sinstrument[i];
+
+						AccumulateIOStats(&stats, &winstrument->stats.io);
+
+						if (!es->workers_state)
+							continue;
+
+						ExplainOpenWorker(i, es);
+						print_io_usage(es, &winstrument->stats.io);
+						ExplainCloseWorker(i, es);
+					}
+				}
+
 				break;
 			}
 		default:
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index 9690f0938ae..81b87d82fab 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -291,6 +291,9 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
 			if (planstate->plan->parallel_aware)
 				ExecTidRangeScanEstimate((TidRangeScanState *) planstate,
 										 e->pcxt);
+			/* even when not parallel-aware, for EXPLAIN ANALYZE */
+			ExecTidRangeScanInstrumentEstimate((TidRangeScanState *) planstate,
+											   e->pcxt);
 			break;
 		case T_AppendState:
 			if (planstate->plan->parallel_aware)
@@ -536,6 +539,9 @@ ExecParallelInitializeDSM(PlanState *planstate,
 			if (planstate->plan->parallel_aware)
 				ExecTidRangeScanInitializeDSM((TidRangeScanState *) planstate,
 											  d->pcxt);
+			/* even when not parallel-aware, for EXPLAIN ANALYZE */
+			ExecTidRangeScanInstrumentInitDSM((TidRangeScanState *) planstate,
+											  d->pcxt);
 			break;
 		case T_AppendState:
 			if (planstate->plan->parallel_aware)
@@ -1157,6 +1163,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
 		case T_SeqScanState:
 			ExecSeqScanRetrieveInstrumentation((SeqScanState *) planstate);
 			break;
+		case T_TidRangeScanState:
+			ExecTidRangeScanRetrieveInstrumentation((TidRangeScanState *) planstate);
+			break;
 		default:
 			break;
 	}
@@ -1430,6 +1439,9 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt)
 			if (planstate->plan->parallel_aware)
 				ExecTidRangeScanInitializeWorker((TidRangeScanState *) planstate,
 												 pwcxt);
+			/* even when not parallel-aware, for EXPLAIN ANALYZE */
+			ExecTidRangeScanInstrumentInitWorker((TidRangeScanState *) planstate,
+												 pwcxt);
 			break;
 		case T_AppendState:
 			if (planstate->plan->parallel_aware)
diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c
index 4a8fe91b2b3..550916928cb 100644
--- a/src/backend/executor/nodeTidrangescan.c
+++ b/src/backend/executor/nodeTidrangescan.c
@@ -18,7 +18,9 @@
 #include "access/sysattr.h"
 #include "access/tableam.h"
 #include "catalog/pg_operator.h"
+#include "executor/execParallel.h"
 #include "executor/executor.h"
+#include "executor/instrument.h"
 #include "executor/nodeTidrangescan.h"
 #include "nodes/nodeFuncs.h"
 #include "utils/rel.h"
@@ -242,12 +244,19 @@ TidRangeNext(TidRangeScanState *node)
 
 		if (scandesc == NULL)
 		{
+			uint32		flags = SO_NONE;
+
+			if (ScanRelIsReadOnly(&node->ss))
+				flags |= SO_HINT_REL_READ_ONLY;
+
+			if (estate->es_instrument & INSTRUMENT_IO)
+				flags |= SO_SCAN_INSTRUMENT;
+
 			scandesc = table_beginscan_tidrange(node->ss.ss_currentRelation,
 												estate->es_snapshot,
 												&node->trss_mintid,
 												&node->trss_maxtid,
-												ScanRelIsReadOnly(&node->ss) ?
-												SO_HINT_REL_READ_ONLY : SO_NONE);
+												flags);
 			node->ss.ss_currentScanDesc = scandesc;
 		}
 		else
@@ -342,6 +351,20 @@ ExecEndTidRangeScan(TidRangeScanState *node)
 {
 	TableScanDesc scan = node->ss.ss_currentScanDesc;
 
+	/* Collect IO stats for this process into shared instrumentation */
+	if (node->trss_sinstrument != NULL && IsParallelWorker())
+	{
+		TidRangeScanInstrumentation *si;
+
+		Assert(ParallelWorkerNumber < node->trss_sinstrument->num_workers);
+		si = &node->trss_sinstrument->sinstrument[ParallelWorkerNumber];
+
+		if (scan && scan->rs_instrument)
+		{
+			AccumulateIOStats(&si->stats.io, &scan->rs_instrument->io);
+		}
+	}
+
 	if (scan != NULL)
 		table_endscan(scan);
 }
@@ -454,6 +477,13 @@ ExecTidRangeScanInitializeDSM(TidRangeScanState *node, ParallelContext *pcxt)
 {
 	EState	   *estate = node->ss.ps.state;
 	ParallelTableScanDesc pscan;
+	uint32		flags = SO_NONE;
+
+	if (ScanRelIsReadOnly(&node->ss))
+		flags |= SO_HINT_REL_READ_ONLY;
+
+	if (estate->es_instrument & INSTRUMENT_IO)
+		flags |= SO_SCAN_INSTRUMENT;
 
 	pscan = shm_toc_allocate(pcxt->toc, node->trss_pscanlen);
 	table_parallelscan_initialize(node->ss.ss_currentRelation,
@@ -462,9 +492,7 @@ ExecTidRangeScanInitializeDSM(TidRangeScanState *node, ParallelContext *pcxt)
 	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan);
 	node->ss.ss_currentScanDesc =
 		table_beginscan_parallel_tidrange(node->ss.ss_currentRelation,
-										  pscan,
-										  ScanRelIsReadOnly(&node->ss) ?
-										  SO_HINT_REL_READ_ONLY : SO_NONE);
+										  pscan, flags);
 }
 
 /* ----------------------------------------------------------------
@@ -494,11 +522,94 @@ ExecTidRangeScanInitializeWorker(TidRangeScanState *node,
 								 ParallelWorkerContext *pwcxt)
 {
 	ParallelTableScanDesc pscan;
+	uint32		flags = SO_NONE;
+
+	if (ScanRelIsReadOnly(&node->ss))
+		flags |= SO_HINT_REL_READ_ONLY;
+
+	if (node->ss.ps.state->es_instrument)
+		flags |= SO_SCAN_INSTRUMENT;
 
 	pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
 	node->ss.ss_currentScanDesc =
 		table_beginscan_parallel_tidrange(node->ss.ss_currentRelation,
-										  pscan,
-										  ScanRelIsReadOnly(&node->ss) ?
-										  SO_HINT_REL_READ_ONLY : SO_NONE);
+										  pscan, flags);
+}
+
+/*
+ * Compute the amount of space we'll need for the shared instrumentation and
+ * inform pcxt->estimator.
+ */
+void
+ExecTidRangeScanInstrumentEstimate(TidRangeScanState *node,
+								   ParallelContext *pcxt)
+{
+	EState	   *estate = node->ss.ps.state;
+
+	if (!estate->es_instrument || pcxt->nworkers == 0)
+		return;
+
+	shm_toc_estimate_chunk(&pcxt->estimator,
+						   offsetof(SharedTidRangeScanInstrumentation, sinstrument) +
+						   sizeof(TidRangeScanInstrumentation) * pcxt->nworkers);
+	shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/*
+ * Set up parallel scan instrumentation.
+ */
+void
+ExecTidRangeScanInstrumentInitDSM(TidRangeScanState *node,
+								  ParallelContext *pcxt)
+{
+	EState	   *estate = node->ss.ps.state;
+	SharedTidRangeScanInstrumentation *sinstrument;
+	Size		size;
+
+	if (!estate->es_instrument || pcxt->nworkers == 0)
+		return;
+
+	size = offsetof(SharedTidRangeScanInstrumentation, sinstrument) +
+		sizeof(TidRangeScanInstrumentation) * pcxt->nworkers;
+	sinstrument = shm_toc_allocate(pcxt->toc, size);
+	memset(sinstrument, 0, size);
+	sinstrument->num_workers = pcxt->nworkers;
+	shm_toc_insert(pcxt->toc,
+				   node->ss.ps.plan->plan_node_id + PARALLEL_KEY_SCAN_INSTRUMENT_OFFSET,
+				   sinstrument);
+	node->trss_sinstrument = sinstrument;
+}
+
+/*
+ * Look up and save the location of the shared instrumentation.
+ */
+void
+ExecTidRangeScanInstrumentInitWorker(TidRangeScanState *node,
+									 ParallelWorkerContext *pwcxt)
+{
+	if (!node->ss.ps.state->es_instrument)
+		return;
+
+	node->trss_sinstrument = shm_toc_lookup(pwcxt->toc,
+											node->ss.ps.plan->plan_node_id + PARALLEL_KEY_SCAN_INSTRUMENT_OFFSET,
+											true);
+}
+
+/*
+ * Transfer scan instrumentation from DSM to private memory.
+ */
+void
+ExecTidRangeScanRetrieveInstrumentation(TidRangeScanState *node)
+{
+	SharedTidRangeScanInstrumentation *sinstrument = node->trss_sinstrument;
+	Size		size;
+
+	if (sinstrument == NULL)
+		return;
+
+	size = offsetof(SharedTidRangeScanInstrumentation, sinstrument)
+		+ sinstrument->num_workers * sizeof(TidRangeScanInstrumentation);
+
+	node->trss_sinstrument = palloc(size);
+	memcpy(node->trss_sinstrument, sinstrument, size);
 }
diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h
index 003dc262b5d..4076990408e 100644
--- a/src/include/executor/instrument_node.h
+++ b/src/include/executor/instrument_node.h
@@ -285,4 +285,22 @@ typedef struct SharedSeqScanInstrumentation
 	SeqScanInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];
 } SharedSeqScanInstrumentation;
 
+
+/*
+ *	Instrumentation information for TID range scans
+ */
+typedef struct TidRangeScanInstrumentation
+{
+	TableScanInstrumentation stats;
+} TidRangeScanInstrumentation;
+
+/*
+ * Shared memory container for per-worker information
+ */
+typedef struct SharedTidRangeScanInstrumentation
+{
+	int			num_workers;
+	TidRangeScanInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];
+} SharedTidRangeScanInstrumentation;
+
 #endif							/* INSTRUMENT_NODE_H */
diff --git a/src/include/executor/nodeTidrangescan.h b/src/include/executor/nodeTidrangescan.h
index 8752d1ea8c4..9e7d0a357bb 100644
--- a/src/include/executor/nodeTidrangescan.h
+++ b/src/include/executor/nodeTidrangescan.h
@@ -28,4 +28,13 @@ extern void ExecTidRangeScanInitializeDSM(TidRangeScanState *node, ParallelConte
 extern void ExecTidRangeScanReInitializeDSM(TidRangeScanState *node, ParallelContext *pcxt);
 extern void ExecTidRangeScanInitializeWorker(TidRangeScanState *node, ParallelWorkerContext *pwcxt);
 
+/* instrument support */
+extern void ExecTidRangeScanInstrumentEstimate(TidRangeScanState *node,
+											   ParallelContext *pcxt);
+extern void ExecTidRangeScanInstrumentInitDSM(TidRangeScanState *node,
+											  ParallelContext *pcxt);
+extern void ExecTidRangeScanInstrumentInitWorker(TidRangeScanState *node,
+												 ParallelWorkerContext *pwcxt);
+extern void ExecTidRangeScanRetrieveInstrumentation(TidRangeScanState *node);
+
 #endif							/* NODETIDRANGESCAN_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 56febb3204c..13359180d25 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1922,6 +1922,7 @@ typedef struct TidRangeScanState
 	ItemPointerData trss_maxtid;
 	bool		trss_inScan;
 	Size		trss_pscanlen;
+	struct SharedTidRangeScanInstrumentation *trss_sinstrument;
 } TidRangeScanState;
 
 /* ----------------
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index c0436a13ac3..0cd41106975 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2867,6 +2867,7 @@ SharedRecordTableKey
 SharedRecordTypmodRegistry
 SharedSeqScanInstrumentation
 SharedSortInfo
+SharedTidRangeScanInstrumentation
 SharedTuplestore
 SharedTuplestoreAccessor
 SharedTuplestoreChunk
@@ -3171,6 +3172,7 @@ TidOpExpr
 TidPath
 TidRangePath
 TidRangeScan
+TidRangeScanInstrumentation
 TidRangeScanState
 TidScan
 TidScanState
-- 
2.53.0

Reply via email to