Hi, I've pushed the first two parts earlier today. Here's a v13 with the remaining pieces rebased on top of that, with updated commit messages and some additional fixes.
1) There was a bogus else-if in explain.c that would make it impossible to enable "wal" and "io" at the same time. 2) Replaced one of the "typedef IOStats" with just forward declaration, to prevent possible conflicting typedefs in the future (even if that wasn't an issue now). 3) Used (es_instrument & INSTRUMENT_IO) more consistently. A couple places in the executor still checked just es_instrument, and so would collect stats even if not needed. Be consistent. 4) Various comment additions / improvements. I intend to push this later today, after giving it one more round of review etc. regards -- Tomas Vondra
From 1270fb4c87b964021a66ffb261a901e0fdc929af Mon Sep 17 00:00:00 2001 From: Tomas Vondra <[email protected]> Date: Tue, 31 Mar 2026 13:44:23 +0200 Subject: [PATCH v13 1/5] Switch EXPLAIN to unaligned output for json/xml/yaml Use unaligned output for multiple EXPLAIN queries using non-text format in regression tests. With aligned output adding/removing explain fields can be very disruptive, as it often modifies the whole block because of padding. Unaligned output does not have this issue. Reviewed-by: Melanie Plageman <[email protected]> Reviewed-by: Lukas Fittl <[email protected]> Reviewed-by: Andres Freund <[email protected]> Discussion: https://postgr.es/m/flat/a177a6dd-240b-455a-8f25-aca0b1c08c6e%40vondra.me --- src/test/regress/expected/explain.out | 296 +++++++++++++------------- src/test/regress/sql/explain.sql | 5 +- 2 files changed, 150 insertions(+), 151 deletions(-) diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out index 7c1f26b182c..dc31c7ce9f9 100644 --- a/src/test/regress/expected/explain.out +++ b/src/test/regress/expected/explain.out @@ -93,164 +93,160 @@ select explain_filter('explain (analyze, buffers, format text) select * from int Execution Time: N.N ms (3 rows) -select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8'); - explain_filter --------------------------------------------------------- - <explain xmlns="http://www.postgresql.org/N/explain"> + - <Query> + - <Plan> + - <Node-Type>Seq Scan</Node-Type> + - <Parallel-Aware>false</Parallel-Aware> + - <Async-Capable>false</Async-Capable> + - <Relation-Name>int8_tbl</Relation-Name> + - <Alias>i8</Alias> + - <Startup-Cost>N.N</Startup-Cost> + - <Total-Cost>N.N</Total-Cost> + - <Plan-Rows>N</Plan-Rows> + - <Plan-Width>N</Plan-Width> + - <Actual-Startup-Time>N.N</Actual-Startup-Time> + - <Actual-Total-Time>N.N</Actual-Total-Time> + - <Actual-Rows>N.N</Actual-Rows> + - <Actual-Loops>N</Actual-Loops> + - <Disabled>false</Disabled> + - <Shared-Hit-Blocks>N</Shared-Hit-Blocks> + - <Shared-Read-Blocks>N</Shared-Read-Blocks> + - <Shared-Dirtied-Blocks>N</Shared-Dirtied-Blocks>+ - <Shared-Written-Blocks>N</Shared-Written-Blocks>+ - <Local-Hit-Blocks>N</Local-Hit-Blocks> + - <Local-Read-Blocks>N</Local-Read-Blocks> + - <Local-Dirtied-Blocks>N</Local-Dirtied-Blocks> + - <Local-Written-Blocks>N</Local-Written-Blocks> + - <Temp-Read-Blocks>N</Temp-Read-Blocks> + - <Temp-Written-Blocks>N</Temp-Written-Blocks> + - </Plan> + - <Planning> + - <Shared-Hit-Blocks>N</Shared-Hit-Blocks> + - <Shared-Read-Blocks>N</Shared-Read-Blocks> + - <Shared-Dirtied-Blocks>N</Shared-Dirtied-Blocks>+ - <Shared-Written-Blocks>N</Shared-Written-Blocks>+ - <Local-Hit-Blocks>N</Local-Hit-Blocks> + - <Local-Read-Blocks>N</Local-Read-Blocks> + - <Local-Dirtied-Blocks>N</Local-Dirtied-Blocks> + - <Local-Written-Blocks>N</Local-Written-Blocks> + - <Temp-Read-Blocks>N</Temp-Read-Blocks> + - <Temp-Written-Blocks>N</Temp-Written-Blocks> + - </Planning> + - <Planning-Time>N.N</Planning-Time> + - <Triggers> + - </Triggers> + - <Execution-Time>N.N</Execution-Time> + - </Query> + - </explain> -(1 row) - -select explain_filter('explain (analyze, serialize, buffers, format yaml) select * from int8_tbl i8'); - explain_filter -------------------------------- - - Plan: + - Node Type: "Seq Scan" + - Parallel Aware: false + - Async Capable: false + - Relation Name: "int8_tbl"+ - Alias: "i8" + - Startup Cost: N.N + - Total Cost: N.N + - Plan Rows: N + - Plan Width: N + - Actual Startup Time: N.N + - Actual Total Time: N.N + - Actual Rows: N.N + - Actual Loops: N + - Disabled: false + - Shared Hit Blocks: N + - Shared Read Blocks: N + - Shared Dirtied Blocks: N + - Shared Written Blocks: N + - Local Hit Blocks: N + - Local Read Blocks: N + - Local Dirtied Blocks: N + - Local Written Blocks: N + - Temp Read Blocks: N + - Temp Written Blocks: N + - Planning: + - Shared Hit Blocks: N + - Shared Read Blocks: N + - Shared Dirtied Blocks: N + - Shared Written Blocks: N + - Local Hit Blocks: N + - Local Read Blocks: N + - Local Dirtied Blocks: N + - Local Written Blocks: N + - Temp Read Blocks: N + - Temp Written Blocks: N + - Planning Time: N.N + - Triggers: + - Serialization: + - Time: N.N + - Output Volume: N + - Format: "text" + - Shared Hit Blocks: N + - Shared Read Blocks: N + - Shared Dirtied Blocks: N + - Shared Written Blocks: N + - Local Hit Blocks: N + - Local Read Blocks: N + - Local Dirtied Blocks: N + - Local Written Blocks: N + - Temp Read Blocks: N + - Temp Written Blocks: N + - Execution Time: N.N -(1 row) - select explain_filter('explain (buffers, format text) select * from int8_tbl i8'); explain_filter --------------------------------------------------------- Seq Scan on int8_tbl i8 (cost=N.N..N.N rows=N width=N) (1 row) +\a +select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8'); +explain_filter +<explain xmlns="http://www.postgresql.org/N/explain"> + <Query> + <Plan> + <Node-Type>Seq Scan</Node-Type> + <Parallel-Aware>false</Parallel-Aware> + <Async-Capable>false</Async-Capable> + <Relation-Name>int8_tbl</Relation-Name> + <Alias>i8</Alias> + <Startup-Cost>N.N</Startup-Cost> + <Total-Cost>N.N</Total-Cost> + <Plan-Rows>N</Plan-Rows> + <Plan-Width>N</Plan-Width> + <Actual-Startup-Time>N.N</Actual-Startup-Time> + <Actual-Total-Time>N.N</Actual-Total-Time> + <Actual-Rows>N.N</Actual-Rows> + <Actual-Loops>N</Actual-Loops> + <Disabled>false</Disabled> + <Shared-Hit-Blocks>N</Shared-Hit-Blocks> + <Shared-Read-Blocks>N</Shared-Read-Blocks> + <Shared-Dirtied-Blocks>N</Shared-Dirtied-Blocks> + <Shared-Written-Blocks>N</Shared-Written-Blocks> + <Local-Hit-Blocks>N</Local-Hit-Blocks> + <Local-Read-Blocks>N</Local-Read-Blocks> + <Local-Dirtied-Blocks>N</Local-Dirtied-Blocks> + <Local-Written-Blocks>N</Local-Written-Blocks> + <Temp-Read-Blocks>N</Temp-Read-Blocks> + <Temp-Written-Blocks>N</Temp-Written-Blocks> + </Plan> + <Planning> + <Shared-Hit-Blocks>N</Shared-Hit-Blocks> + <Shared-Read-Blocks>N</Shared-Read-Blocks> + <Shared-Dirtied-Blocks>N</Shared-Dirtied-Blocks> + <Shared-Written-Blocks>N</Shared-Written-Blocks> + <Local-Hit-Blocks>N</Local-Hit-Blocks> + <Local-Read-Blocks>N</Local-Read-Blocks> + <Local-Dirtied-Blocks>N</Local-Dirtied-Blocks> + <Local-Written-Blocks>N</Local-Written-Blocks> + <Temp-Read-Blocks>N</Temp-Read-Blocks> + <Temp-Written-Blocks>N</Temp-Written-Blocks> + </Planning> + <Planning-Time>N.N</Planning-Time> + <Triggers> + </Triggers> + <Execution-Time>N.N</Execution-Time> + </Query> +</explain> +(1 row) +select explain_filter('explain (analyze, serialize, buffers, format yaml) select * from int8_tbl i8'); +explain_filter +- Plan: + Node Type: "Seq Scan" + Parallel Aware: false + Async Capable: false + Relation Name: "int8_tbl" + Alias: "i8" + Startup Cost: N.N + Total Cost: N.N + Plan Rows: N + Plan Width: N + Actual Startup Time: N.N + Actual Total Time: N.N + Actual Rows: N.N + Actual Loops: N + Disabled: false + Shared Hit Blocks: N + Shared Read Blocks: N + Shared Dirtied Blocks: N + Shared Written Blocks: N + Local Hit Blocks: N + Local Read Blocks: N + Local Dirtied Blocks: N + Local Written Blocks: N + Temp Read Blocks: N + Temp Written Blocks: N + Planning: + Shared Hit Blocks: N + Shared Read Blocks: N + Shared Dirtied Blocks: N + Shared Written Blocks: N + Local Hit Blocks: N + Local Read Blocks: N + Local Dirtied Blocks: N + Local Written Blocks: N + Temp Read Blocks: N + Temp Written Blocks: N + Planning Time: N.N + Triggers: + Serialization: + Time: N.N + Output Volume: N + Format: "text" + Shared Hit Blocks: N + Shared Read Blocks: N + Shared Dirtied Blocks: N + Shared Written Blocks: N + Local Hit Blocks: N + Local Read Blocks: N + Local Dirtied Blocks: N + Local Written Blocks: N + Temp Read Blocks: N + Temp Written Blocks: N + Execution Time: N.N +(1 row) select explain_filter('explain (buffers, format json) select * from int8_tbl i8'); - explain_filter ------------------------------------- - [ + - { + - "Plan": { + - "Node Type": "Seq Scan", + - "Parallel Aware": false, + - "Async Capable": false, + - "Relation Name": "int8_tbl",+ - "Alias": "i8", + - "Startup Cost": N.N, + - "Total Cost": N.N, + - "Plan Rows": N, + - "Plan Width": N, + - "Disabled": false, + - "Shared Hit Blocks": N, + - "Shared Read Blocks": N, + - "Shared Dirtied Blocks": N, + - "Shared Written Blocks": N, + - "Local Hit Blocks": N, + - "Local Read Blocks": N, + - "Local Dirtied Blocks": N, + - "Local Written Blocks": N, + - "Temp Read Blocks": N, + - "Temp Written Blocks": N + - }, + - "Planning": { + - "Shared Hit Blocks": N, + - "Shared Read Blocks": N, + - "Shared Dirtied Blocks": N, + - "Shared Written Blocks": N, + - "Local Hit Blocks": N, + - "Local Read Blocks": N, + - "Local Dirtied Blocks": N, + - "Local Written Blocks": N, + - "Temp Read Blocks": N, + - "Temp Written Blocks": N + - } + - } + - ] +explain_filter +[ + { + "Plan": { + "Node Type": "Seq Scan", + "Parallel Aware": false, + "Async Capable": false, + "Relation Name": "int8_tbl", + "Alias": "i8", + "Startup Cost": N.N, + "Total Cost": N.N, + "Plan Rows": N, + "Plan Width": N, + "Disabled": false, + "Shared Hit Blocks": N, + "Shared Read Blocks": N, + "Shared Dirtied Blocks": N, + "Shared Written Blocks": N, + "Local Hit Blocks": N, + "Local Read Blocks": N, + "Local Dirtied Blocks": N, + "Local Written Blocks": N, + "Temp Read Blocks": N, + "Temp Written Blocks": N + }, + "Planning": { + "Shared Hit Blocks": N, + "Shared Read Blocks": N, + "Shared Dirtied Blocks": N, + "Shared Written Blocks": N, + "Local Hit Blocks": N, + "Local Read Blocks": N, + "Local Dirtied Blocks": N, + "Local Written Blocks": N, + "Temp Read Blocks": N, + "Temp Written Blocks": N + } + } +] (1 row) - +\a -- Check expansion of window definitions select explain_filter('explain verbose select sum(unique1) over w, sum(unique2) over (w order by hundred), sum(tenthous) over (w order by hundred) from tenk1 window w as (partition by ten)'); explain_filter diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql index ebdab42604b..8f10e1aff55 100644 --- a/src/test/regress/sql/explain.sql +++ b/src/test/regress/sql/explain.sql @@ -66,10 +66,13 @@ select explain_filter('explain select * from int8_tbl i8'); select explain_filter('explain (analyze, buffers off) select * from int8_tbl i8'); select explain_filter('explain (analyze, buffers off, verbose) select * from int8_tbl i8'); select explain_filter('explain (analyze, buffers, format text) select * from int8_tbl i8'); +select explain_filter('explain (buffers, format text) select * from int8_tbl i8'); + +\a select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8'); select explain_filter('explain (analyze, serialize, buffers, format yaml) select * from int8_tbl i8'); -select explain_filter('explain (buffers, format text) select * from int8_tbl i8'); select explain_filter('explain (buffers, format json) select * from int8_tbl i8'); +\a -- Check expansion of window definitions -- 2.53.0
From 1d79ca14128b7f1b805bef51880c9603c9c02b8e Mon Sep 17 00:00:00 2001 From: Melanie Plageman <[email protected]> Date: Sun, 5 Apr 2026 17:17:22 -0400 Subject: [PATCH v13 2/5] Add EXPLAIN (IO) infrastructure with BitmapHeapScan support Allows collecting details about AIO / prefetch for scan nodes backed by a ReadStream. This may be enabled by a new "IO" option in EXPLAIN, and it shows information about the prefetch distance and I/O requests. As of this commit this applies only to BitmapHeapScan, because that's the only scan node using a ReadStream and collecting instrumentation from workers in a parallel query. Support for SeqScan and TidRangeScan, the other scan nodes using ReadStream, will be added in subsequent commits. The stats are collected only when required by EXPLAIN ANALYZE, with the IO option (disabled by default). The amount of collected statistics is very limited, but we don't want to clutter EXPLAIN with too much data. The IOStats struct is stored in the scan descriptor as a field, next to other fields used by table AMs. A pointer to the field is passed to the ReadStream, and updated directly. It's the responsibility of the table AM to allocate the struct (e.g. in ambeginscan) whenever the flag SO_SCAN_INSTRUMENT flag is passed to the scan, so that the executor and ReadStream has access to it. The collected stats are designed for ReadStream, but are meant to be reasonably generic in case a TAM manages I/Os in different ways. Reviewed-by: Melanie Plageman <[email protected]> Reviewed-by: Lukas Fittl <[email protected]> Reviewed-by: Andres Freund <[email protected]> Discussion: https://postgr.es/m/flat/a177a6dd-240b-455a-8f25-aca0b1c08c6e%40vondra.me --- doc/src/sgml/ref/explain.sgml | 12 ++ src/backend/access/heap/heapam.c | 10 ++ src/backend/commands/explain.c | 128 ++++++++++++++++++++++ src/backend/commands/explain_state.c | 8 ++ src/backend/executor/nodeBitmapHeapscan.c | 19 +++- src/backend/storage/aio/read_stream.c | 87 +++++++++++++++ src/include/access/relscan.h | 6 + src/include/access/tableam.h | 3 + src/include/commands/explain_state.h | 1 + src/include/executor/instrument.h | 1 + src/include/executor/instrument_node.h | 50 +++++++++ src/include/storage/read_stream.h | 2 + src/tools/pgindent/typedefs.list | 2 + 13 files changed, 327 insertions(+), 2 deletions(-) diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml index 5b8b521802e..a854c41e963 100644 --- a/doc/src/sgml/ref/explain.sgml +++ b/doc/src/sgml/ref/explain.sgml @@ -46,6 +46,7 @@ EXPLAIN [ ( <replaceable class="parameter">option</replaceable> [, ...] ) ] <rep TIMING [ <replaceable class="parameter">boolean</replaceable> ] SUMMARY [ <replaceable class="parameter">boolean</replaceable> ] MEMORY [ <replaceable class="parameter">boolean</replaceable> ] + IO [ <replaceable class="parameter">boolean</replaceable> ] FORMAT { TEXT | XML | JSON | YAML } </synopsis> </refsynopsisdiv> @@ -298,6 +299,17 @@ ROLLBACK; </listitem> </varlistentry> + <varlistentry> + <term><literal>IO</literal></term> + <listitem> + <para> + Include information on I/O performed by each node. + This parameter may only be used when <literal>ANALYZE</literal> is also + enabled. It defaults to <literal>FALSE</literal>. + </para> + </listitem> + </varlistentry> + <varlistentry> <term><literal>FORMAT</literal></term> <listitem> diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index f6ac5a0897c..89ab9742aa5 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -43,6 +43,7 @@ #include "catalog/pg_database.h" #include "catalog/pg_database_d.h" #include "commands/vacuum.h" +#include "executor/instrument_node.h" #include "pgstat.h" #include "port/pg_bitutils.h" #include "storage/lmgr.h" @@ -1200,6 +1201,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, scan->rs_base.rs_nkeys = nkeys; scan->rs_base.rs_flags = flags; scan->rs_base.rs_parallel = parallel_scan; + scan->rs_base.rs_instrument = NULL; scan->rs_strategy = NULL; /* set in initscan */ scan->rs_cbuf = InvalidBuffer; @@ -1312,6 +1314,14 @@ heap_beginscan(Relation relation, Snapshot snapshot, sizeof(TBMIterateResult)); } + /* enable read stream instrumentation */ + if (flags & SO_SCAN_INSTRUMENT) + { + scan->rs_base.rs_instrument = palloc0_object(TableScanInstrumentation); + read_stream_enable_stats(scan->rs_read_stream, + &scan->rs_base.rs_instrument->io); + } + scan->rs_vmbuffer = InvalidBuffer; return (TableScanDesc) scan; diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index f151f21f9b3..e9aafdf6e55 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include "access/relscan.h" #include "access/xact.h" #include "catalog/pg_type.h" #include "commands/createas.h" @@ -139,6 +140,8 @@ static void show_hashagg_info(AggState *aggstate, ExplainState *es); static void show_indexsearches_info(PlanState *planstate, ExplainState *es); static void show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es); +static void show_scan_io_usage(ScanState *planstate, + ExplainState *es); static void show_instrumentation_count(const char *qlabel, int which, PlanState *planstate, ExplainState *es); static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); @@ -519,6 +522,8 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, instrument_option |= INSTRUMENT_BUFFERS; if (es->wal) instrument_option |= INSTRUMENT_WAL; + if (es->io) + instrument_option |= INSTRUMENT_IO; /* * We always collect timing for the entire statement, even when node-level @@ -2008,6 +2013,7 @@ ExplainNode(PlanState *planstate, List *ancestors, show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); show_tidbitmap_info((BitmapHeapScanState *) planstate, es); + show_scan_io_usage((ScanState *) planstate, es); break; case T_SampleScan: show_tablesample(((SampleScan *) plan)->tablesample, @@ -3984,6 +3990,128 @@ show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es) } } +/* + * Print I/O stats - prefetching and I/O performed + * + * This prints two types of stats - "prefetch" about the prefetching done by + * ReadStream, and "I/O" issued by the stream. The prefetch stats are based + * on buffers pulled from the stream (even if no I/O is needed). The I/O + * information is related to I/O requests issued by the stream. + * + * The prefetch stats are printed if any buffer was pulled from the stream. + * For the I/O stats it depend on the output format. In non-text formats the + * information is printed always, while in text format only if there were + * any actual I/O requests. + */ +static void +print_io_usage(ExplainState *es, IOStats *stats) +{ + /* don't print prefetch stats if there's nothing to report */ + if (stats->prefetch_count > 0) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + /* prefetch distance info */ + ExplainIndentText(es); + appendStringInfo(es->str, "Prefetch: avg=%.2f max=%d capacity=%d\n", + (stats->distance_sum * 1.0 / stats->prefetch_count), + stats->distance_max, + stats->distance_capacity); + + /* prefetch I/O info (only if there were actual I/Os) */ + if (stats->io_count > 0) + { + ExplainIndentText(es); + appendStringInfo(es->str, "I/O: count=%" PRIu64 " waits=%" PRIu64 + " size=%.2f inprogress=%.2f\n", + stats->io_count, stats->wait_count, + (stats->io_nblocks * 1.0 / stats->io_count), + (stats->io_in_progress * 1.0 / stats->io_count)); + } + } + else + { + ExplainPropertyFloat("Average Prefetch Distance", NULL, + (stats->distance_sum * 1.0 / stats->prefetch_count), 3, es); + ExplainPropertyInteger("Max Prefetch Distance", NULL, + stats->distance_max, es); + ExplainPropertyInteger("Prefetch Capacity", NULL, + stats->distance_capacity, es); + + ExplainPropertyUInteger("I/O Count", NULL, + stats->io_count, es); + ExplainPropertyUInteger("I/O Waits", NULL, + stats->wait_count, es); + ExplainPropertyFloat("Average I/O Size", NULL, + (stats->io_nblocks * 1.0 / Max(1, stats->io_count)), 3, es); + ExplainPropertyFloat("Average I/Os In Progress", NULL, + (stats->io_in_progress * 1.0 / Max(1, stats->io_count)), 3, es); + } + } +} + +/* + * Show information about prefetch and I/O in a scan node. + */ +static void +show_scan_io_usage(ScanState *planstate, ExplainState *es) +{ + Plan *plan = planstate->ps.plan; + IOStats stats = {0}; + + if (!es->io) + return; + + /* + * Initialize counters with stats from the local process first. + * + * The scan descriptor may not exist, e.g. if the scan did not start, or + * because of debug_parallel_query=regress. We still want to collect data + * from workers. + */ + if (planstate->ss_currentScanDesc && + planstate->ss_currentScanDesc->rs_instrument) + { + stats = planstate->ss_currentScanDesc->rs_instrument->io; + } + + /* + * Accumulate data from parallel workers (if any). + */ + switch (nodeTag(plan)) + { + case T_BitmapHeapScan: + { + SharedBitmapHeapInstrumentation *sinstrument + = ((BitmapHeapScanState *) planstate)->sinstrument; + + if (sinstrument) + { + for (int i = 0; i < sinstrument->num_workers; ++i) + { + BitmapHeapScanInstrumentation *winstrument = &sinstrument->sinstrument[i]; + + AccumulateIOStats(&stats, &winstrument->stats.io); + + if (!es->workers_state) + continue; + + ExplainOpenWorker(i, es); + print_io_usage(es, &winstrument->stats.io); + ExplainCloseWorker(i, es); + } + } + + break; + } + default: + /* ignore other plans */ + return; + } + + print_io_usage(es, &stats); +} + /* * If it's EXPLAIN ANALYZE, show instrumentation information for a plan node * diff --git a/src/backend/commands/explain_state.c b/src/backend/commands/explain_state.c index 65dd4111459..0e07a63fca6 100644 --- a/src/backend/commands/explain_state.c +++ b/src/backend/commands/explain_state.c @@ -162,6 +162,8 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate) "EXPLAIN", opt->defname, p), parser_errposition(pstate, opt->location))); } + else if (strcmp(opt->defname, "io") == 0) + es->io = defGetBoolean(opt); else if (!ApplyExtensionExplainOption(es, opt, pstate)) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -188,6 +190,12 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate) (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("EXPLAIN option %s requires ANALYZE", "TIMING"))); + /* check that IO is used with EXPLAIN ANALYZE */ + if (es->io && !es->analyze) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("EXPLAIN option %s requires ANALYZE", "IO"))); + /* check that serialize is used with EXPLAIN ANALYZE */ if (es->serialize != EXPLAIN_SERIALIZE_NONE && !es->analyze) ereport(ERROR, diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index d65e2a87b42..83d6478bc2b 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -144,13 +144,20 @@ BitmapTableScanSetup(BitmapHeapScanState *node) */ if (!node->ss.ss_currentScanDesc) { + uint32 flags = SO_NONE; + + if (ScanRelIsReadOnly(&node->ss)) + flags |= SO_HINT_REL_READ_ONLY; + + if (node->ss.ps.state->es_instrument & INSTRUMENT_IO) + flags |= SO_SCAN_INSTRUMENT; + node->ss.ss_currentScanDesc = table_beginscan_bm(node->ss.ss_currentRelation, node->ss.ps.state->es_snapshot, 0, NULL, - ScanRelIsReadOnly(&node->ss) ? - SO_HINT_REL_READ_ONLY : SO_NONE); + flags); } node->ss.ss_currentScanDesc->st.rs_tbmiterator = tbmiterator; @@ -330,6 +337,14 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node) */ si->exact_pages += node->stats.exact_pages; si->lossy_pages += node->stats.lossy_pages; + + /* collect I/O instrumentation for this process */ + if (node->ss.ss_currentScanDesc && + node->ss.ss_currentScanDesc->rs_instrument) + { + AccumulateIOStats(&si->stats.io, + &node->ss.ss_currentScanDesc->rs_instrument->io); + } } /* diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 0b6cdf7c873..936e08ea450 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -74,6 +74,7 @@ #include "postgres.h" #include "miscadmin.h" +#include "executor/instrument_node.h" #include "storage/aio.h" #include "storage/fd.h" #include "storage/smgr.h" @@ -123,6 +124,9 @@ struct ReadStream bool advice_enabled; bool temporary; + /* scan stats counters */ + IOStats *stats; + /* * One-block buffer to support 'ungetting' a block number, to resolve flow * control problems when I/Os are split. @@ -188,6 +192,73 @@ block_range_read_stream_cb(ReadStream *stream, return InvalidBlockNumber; } +/* + * Update stream stats with current pinned buffer depth. + * + * Called once per buffer returned to the consumer in read_stream_next_buffer(). + * Records the number of pinned buffers at that moment, so we can compute the + * average look-ahead depth. + */ +static inline void +read_stream_count_prefetch(ReadStream *stream) +{ + IOStats *stats = stream->stats; + + if (stats == NULL) + return; + + stats->prefetch_count++; + stats->distance_sum += stream->pinned_buffers; + if (stream->pinned_buffers > stats->distance_max) + stats->distance_max = stream->pinned_buffers; +} + +/* + * Update stream stats about size of I/O requests. + * + * We count the number of I/O requests, size of requests (counted in blocks) + * and number of in-progress I/Os. + */ +static inline void +read_stream_count_io(ReadStream *stream, int nblocks, int in_progress) +{ + IOStats *stats = stream->stats; + + if (stats == NULL) + return; + + stats->io_count++; + stats->io_nblocks += nblocks; + stats->io_in_progress += in_progress; +} + +/* + * Update stream stats about waits for I/O when consuming buffers. + * + * We count the number of I/O waits while pulling buffers out of a stream. + */ +static inline void +read_stream_count_wait(ReadStream *stream) +{ + IOStats *stats = stream->stats; + + if (stats == NULL) + return; + + stats->wait_count++; +} + +/* + * Enable collection of stats into the provided IOStats. + */ +void +read_stream_enable_stats(ReadStream *stream, IOStats *stats) +{ + stream->stats = stats; + if (stream->stats) + stream->stats->distance_capacity = stream->max_pinned_buffers; +} + /* * Ask the callback which block it would like us to read next, with a one block * buffer in front to allow read_stream_unget_block() to work. @@ -426,6 +497,9 @@ read_stream_start_pending_read(ReadStream *stream) Assert(stream->ios_in_progress < stream->max_ios); stream->ios_in_progress++; stream->seq_blocknum = stream->pending_read_blocknum + nblocks; + + /* update I/O stats */ + read_stream_count_io(stream, nblocks, stream->ios_in_progress); } /* @@ -1021,6 +1095,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) flags))) { /* Fast return. */ + read_stream_count_prefetch(stream); return buffer; } @@ -1036,6 +1111,12 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) * to avoid having to effectively do another synchronous IO for * the next block (if it were also a miss). */ + + /* update I/O stats */ + read_stream_count_io(stream, 1, stream->ios_in_progress); + + /* update prefetch distance */ + read_stream_count_prefetch(stream); } else { @@ -1100,6 +1181,10 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) needed_wait = WaitReadBuffers(&stream->ios[io_index].op); + /* Count it as a wait if we need to wait for IO */ + if (needed_wait) + read_stream_count_wait(stream); + Assert(stream->ios_in_progress > 0); stream->ios_in_progress--; if (++stream->oldest_io_index == stream->max_ios) @@ -1228,6 +1313,8 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) } #endif + read_stream_count_prefetch(stream); + /* Pin transferred to caller. */ Assert(stream->pinned_buffers > 0); stream->pinned_buffers--; diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index fd2076c582a..2ea06a67a63 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -24,6 +24,7 @@ struct ParallelTableScanDescData; +struct TableScanInstrumentation; /* * Generic descriptor for table scans. This is the base-class for table scans, @@ -64,6 +65,11 @@ typedef struct TableScanDescData struct ParallelTableScanDescData *rs_parallel; /* parallel scan * information */ + + /* + * Instrumentation counters maintained by all table AMs. + */ + struct TableScanInstrumentation *rs_instrument; } TableScanDescData; typedef struct TableScanDescData *TableScanDesc; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index a21c7db5439..c13f05d39db 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -69,6 +69,9 @@ typedef enum ScanOptions /* set if the query doesn't modify the relation */ SO_HINT_REL_READ_ONLY = 1 << 10, + + /* collect scan instrumentation */ + SO_SCAN_INSTRUMENT = 1 << 11, } ScanOptions; /* diff --git a/src/include/commands/explain_state.h b/src/include/commands/explain_state.h index 6252fe11f15..97bc7ed49f6 100644 --- a/src/include/commands/explain_state.h +++ b/src/include/commands/explain_state.h @@ -55,6 +55,7 @@ typedef struct ExplainState bool summary; /* print total planning and execution timing */ bool memory; /* print planner's memory usage information */ bool settings; /* print modified settings */ + bool io; /* print info about IO (prefetch, ...) */ bool generic; /* generate a generic plan */ ExplainSerializeOption serialize; /* serialize the query's output? */ ExplainFormat format; /* output format */ diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index cc9fbb0e2f0..f093a52aae0 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -64,6 +64,7 @@ typedef enum InstrumentOption INSTRUMENT_BUFFERS = 1 << 1, /* needs buffer usage */ INSTRUMENT_ROWS = 1 << 2, /* needs row count */ INSTRUMENT_WAL = 1 << 3, /* needs WAL usage */ + INSTRUMENT_IO = 1 << 4, /* needs IO usage */ INSTRUMENT_ALL = PG_INT32_MAX } InstrumentOption; diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index e6a3f9f1941..22a75ccd863 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -48,6 +48,55 @@ typedef struct SharedAggInfo } SharedAggInfo; +/* --------------------- + * Instrumentation information about read streams and I/O + * --------------------- + */ +typedef struct IOStats +{ + /* number of buffers returned to consumer (for averaging distance) */ + uint64 prefetch_count; + + /* sum of pinned_buffers sampled at each buffer return */ + uint64 distance_sum; + + /* maximum actual pinned_buffers observed during the scan */ + int16 distance_max; + + /* maximum possible look-ahead distance (max_pinned_buffers) */ + int16 distance_capacity; + + /* number of waits for a read (for the I/O) */ + uint64 wait_count; + + /* I/O stats */ + uint64 io_count; /* number of I/Os */ + uint64 io_nblocks; /* sum of blocks for all I/Os */ + uint64 io_in_progress; /* sum of in-progress I/Os */ +} IOStats; + +typedef struct TableScanInstrumentation +{ + IOStats io; +} TableScanInstrumentation; + +/* merge IO statistics from 'src' into 'dst' */ +static inline void +AccumulateIOStats(IOStats *dst, IOStats *src) +{ + dst->prefetch_count += src->prefetch_count; + dst->distance_sum += src->distance_sum; + if (src->distance_max > dst->distance_max) + dst->distance_max = src->distance_max; + if (src->distance_capacity > dst->distance_capacity) + dst->distance_capacity = src->distance_capacity; + dst->wait_count += src->wait_count; + dst->io_count += src->io_count; + dst->io_nblocks += src->io_nblocks; + dst->io_in_progress += src->io_in_progress; +} + + /* --------------------- * Instrumentation information for indexscans (amgettuple and amgetbitmap) * --------------------- @@ -79,6 +128,7 @@ typedef struct BitmapHeapScanInstrumentation { uint64 exact_pages; uint64 lossy_pages; + TableScanInstrumentation stats; } BitmapHeapScanInstrumentation; /* diff --git a/src/include/storage/read_stream.h b/src/include/storage/read_stream.h index c9359b29b0f..48995c6d534 100644 --- a/src/include/storage/read_stream.h +++ b/src/include/storage/read_stream.h @@ -65,6 +65,7 @@ struct ReadStream; typedef struct ReadStream ReadStream; +struct IOStats; /* for block_range_read_stream_cb */ typedef struct BlockRangeReadStreamPrivate @@ -103,5 +104,6 @@ extern BlockNumber read_stream_pause(ReadStream *stream); extern void read_stream_resume(ReadStream *stream); extern void read_stream_reset(ReadStream *stream); extern void read_stream_end(ReadStream *stream); +extern void read_stream_enable_stats(ReadStream *stream, struct IOStats *stats); #endif /* READ_STREAM_H */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 9e6a39f5608..98b8d78e693 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1279,6 +1279,7 @@ IOContext IOFuncSelector IOObject IOOp +IOStats IO_STATUS_BLOCK IPCompareMethod ITEM @@ -3127,6 +3128,7 @@ TableLikeClause TableSampleClause TableScanDesc TableScanDescData +TableScanInstrumentation TableSpaceCacheEntry TableSpaceOpts TableToProcess -- 2.53.0
From 38d3f24ac878be825d80edcc2c90d466791b51b5 Mon Sep 17 00:00:00 2001 From: test <test> Date: Tue, 7 Apr 2026 02:04:06 +0200 Subject: [PATCH v13 3/5] auto_explain: Add new GUC auto_explain.log_io Allows enabling the new EXPLAIN "IO" option for auto_explain. Reviewed-by: Melanie Plageman <[email protected]> Reviewed-by: Lukas Fittl <[email protected]> Discussion: https://postgr.es/m/flat/a177a6dd-240b-455a-8f25-aca0b1c08c6e%40vondra.me --- contrib/auto_explain/auto_explain.c | 15 +++++++++++++++ doc/src/sgml/auto-explain.sgml | 20 ++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c index 6ceae1c69ce..9114ced45a7 100644 --- a/contrib/auto_explain/auto_explain.c +++ b/contrib/auto_explain/auto_explain.c @@ -38,6 +38,7 @@ static int auto_explain_log_parameter_max_length = -1; /* bytes or -1 */ static bool auto_explain_log_analyze = false; static bool auto_explain_log_verbose = false; static bool auto_explain_log_buffers = false; +static bool auto_explain_log_io = false; static bool auto_explain_log_wal = false; static bool auto_explain_log_triggers = false; static bool auto_explain_log_timing = true; @@ -203,6 +204,17 @@ _PG_init(void) NULL, NULL); + DefineCustomBoolVariable("auto_explain.log_io", + "Log I/O statistics.", + NULL, + &auto_explain_log_io, + false, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + DefineCustomBoolVariable("auto_explain.log_wal", "Log WAL usage.", NULL, @@ -343,6 +355,8 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags) queryDesc->instrument_options |= INSTRUMENT_ROWS; if (auto_explain_log_buffers) queryDesc->instrument_options |= INSTRUMENT_BUFFERS; + if (auto_explain_log_io) + queryDesc->instrument_options |= INSTRUMENT_IO; if (auto_explain_log_wal) queryDesc->instrument_options |= INSTRUMENT_WAL; } @@ -440,6 +454,7 @@ explain_ExecutorEnd(QueryDesc *queryDesc) es->analyze = (queryDesc->instrument_options && auto_explain_log_analyze); es->verbose = auto_explain_log_verbose; es->buffers = (es->analyze && auto_explain_log_buffers); + es->io = (es->analyze && auto_explain_log_io); es->wal = (es->analyze && auto_explain_log_wal); es->timing = (es->analyze && auto_explain_log_timing); es->summary = es->analyze; diff --git a/doc/src/sgml/auto-explain.sgml b/doc/src/sgml/auto-explain.sgml index ee85a67eb2e..06a8fcc6c5b 100644 --- a/doc/src/sgml/auto-explain.sgml +++ b/doc/src/sgml/auto-explain.sgml @@ -128,6 +128,26 @@ LOAD 'auto_explain'; </listitem> </varlistentry> + <varlistentry id="auto-explain-configuration-parameters-log-io"> + <term> + <varname>auto_explain.log_io</varname> (<type>boolean</type>) + <indexterm> + <primary><varname>auto_explain.log_io</varname> configuration parameter</primary> + </indexterm> + </term> + <listitem> + <para> + <varname>auto_explain.log_io</varname> controls whether I/O usage + statistics are printed when an execution plan is logged; it's + equivalent to the <literal>IO</literal> option of <command>EXPLAIN</command>. + This parameter has no effect + unless <varname>auto_explain.log_analyze</varname> is enabled. + This parameter is off by default. + Only superusers can change this setting. + </para> + </listitem> + </varlistentry> + <varlistentry id="auto-explain-configuration-parameters-log-wal"> <term> <varname>auto_explain.log_wal</varname> (<type>boolean</type>) -- 2.53.0
From 7f0269e4307f5bb8bd850092aba8a5df8411c587 Mon Sep 17 00:00:00 2001 From: test <test> Date: Tue, 7 Apr 2026 01:38:53 +0200 Subject: [PATCH v13 4/5] Add EXPLAIN (IO) instrumentation for SeqScan Adds support for EXPLAIN (IO) instrumentation for sequential scans. This requires adding shared instrumentation for parallel scans, using the separate DSM approach introduced by dd78e69cfc33. Reviewed-by: Melanie Plageman <[email protected]> Reviewed-by: Lukas Fittl <[email protected]> Reviewed-by: Andres Freund <[email protected]> Discussion: https://postgr.es/m/flat/a177a6dd-240b-455a-8f25-aca0b1c08c6e%40vondra.me --- src/backend/commands/explain.c | 25 +++++ src/backend/executor/execParallel.c | 11 +++ src/backend/executor/nodeSeqscan.c | 127 +++++++++++++++++++++++-- src/include/executor/instrument_node.h | 19 ++++ src/include/executor/nodeSeqscan.h | 9 ++ src/include/nodes/execnodes.h | 1 + src/test/regress/expected/explain.out | 18 +++- src/test/regress/sql/explain.sql | 4 +- src/tools/pgindent/typedefs.list | 2 + 9 files changed, 203 insertions(+), 13 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index e9aafdf6e55..18d424da944 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -2032,6 +2032,7 @@ ExplainNode(PlanState *planstate, List *ancestors, planstate, es); if (IsA(plan, CteScan)) show_ctescan_info(castNode(CteScanState, planstate), es); + show_scan_io_usage((ScanState *) planstate, es); break; case T_Gather: { @@ -4102,6 +4103,30 @@ show_scan_io_usage(ScanState *planstate, ExplainState *es) } } + break; + } + case T_SeqScan: + { + SharedSeqScanInstrumentation *sinstrument + = ((SeqScanState *) planstate)->sinstrument; + + if (sinstrument) + { + for (int i = 0; i < sinstrument->num_workers; ++i) + { + SeqScanInstrumentation *winstrument = &sinstrument->sinstrument[i]; + + AccumulateIOStats(&stats, &winstrument->stats.io); + + if (!es->workers_state) + continue; + + ExplainOpenWorker(i, es); + print_io_usage(es, &winstrument->stats.io); + ExplainCloseWorker(i, es); + } + } + break; } default: diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 1a5ec0c305f..9690f0938ae 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -257,6 +257,9 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) if (planstate->plan->parallel_aware) ExecSeqScanEstimate((SeqScanState *) planstate, e->pcxt); + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecSeqScanInstrumentEstimate((SeqScanState *) planstate, + e->pcxt); break; case T_IndexScanState: if (planstate->plan->parallel_aware) @@ -500,6 +503,9 @@ ExecParallelInitializeDSM(PlanState *planstate, if (planstate->plan->parallel_aware) ExecSeqScanInitializeDSM((SeqScanState *) planstate, d->pcxt); + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecSeqScanInstrumentInitDSM((SeqScanState *) planstate, + d->pcxt); break; case T_IndexScanState: if (planstate->plan->parallel_aware) @@ -1148,6 +1154,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, case T_BitmapHeapScanState: ExecBitmapHeapRetrieveInstrumentation((BitmapHeapScanState *) planstate); break; + case T_SeqScanState: + ExecSeqScanRetrieveInstrumentation((SeqScanState *) planstate); + break; default: break; } @@ -1388,6 +1397,8 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt) case T_SeqScanState: if (planstate->plan->parallel_aware) ExecSeqScanInitializeWorker((SeqScanState *) planstate, pwcxt); + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecSeqScanInstrumentInitWorker((SeqScanState *) planstate, pwcxt); break; case T_IndexScanState: if (planstate->plan->parallel_aware) diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 04803b0e37d..a4fcf7c7ad4 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -29,6 +29,7 @@ #include "access/relscan.h" #include "access/tableam.h" +#include "executor/execParallel.h" #include "executor/execScan.h" #include "executor/executor.h" #include "executor/nodeSeqscan.h" @@ -65,15 +66,21 @@ SeqNext(SeqScanState *node) if (scandesc == NULL) { + uint32 flags = SO_NONE; + + if (ScanRelIsReadOnly(&node->ss)) + flags |= SO_HINT_REL_READ_ONLY; + + if (estate->es_instrument & INSTRUMENT_IO) + flags |= SO_SCAN_INSTRUMENT; + /* * We reach here if the scan is not parallel, or if we're serially * executing a scan that was planned to be parallel. */ scandesc = table_beginscan(node->ss.ss_currentRelation, estate->es_snapshot, - 0, NULL, - ScanRelIsReadOnly(&node->ss) ? - SO_HINT_REL_READ_ONLY : SO_NONE); + 0, NULL, flags); node->ss.ss_currentScanDesc = scandesc; } @@ -302,6 +309,22 @@ ExecEndSeqScan(SeqScanState *node) */ scanDesc = node->ss.ss_currentScanDesc; + /* + * Collect I/O stats for this process into shared instrumentation. + */ + if (node->sinstrument != NULL && IsParallelWorker()) + { + SeqScanInstrumentation *si; + + Assert(ParallelWorkerNumber < node->sinstrument->num_workers); + si = &node->sinstrument->sinstrument[ParallelWorkerNumber]; + + if (scanDesc && scanDesc->rs_instrument) + { + AccumulateIOStats(&si->stats.io, &scanDesc->rs_instrument->io); + } + } + /* * close heap scan */ @@ -370,6 +393,13 @@ ExecSeqScanInitializeDSM(SeqScanState *node, { EState *estate = node->ss.ps.state; ParallelTableScanDesc pscan; + uint32 flags = SO_NONE; + + if (ScanRelIsReadOnly(&node->ss)) + flags |= SO_HINT_REL_READ_ONLY; + + if (estate->es_instrument & INSTRUMENT_IO) + flags |= SO_SCAN_INSTRUMENT; pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); table_parallelscan_initialize(node->ss.ss_currentRelation, @@ -378,9 +408,7 @@ ExecSeqScanInitializeDSM(SeqScanState *node, shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = - table_beginscan_parallel(node->ss.ss_currentRelation, pscan, - ScanRelIsReadOnly(&node->ss) ? - SO_HINT_REL_READ_ONLY : SO_NONE); + table_beginscan_parallel(node->ss.ss_currentRelation, pscan, flags); } /* ---------------------------------------------------------------- @@ -410,10 +438,91 @@ ExecSeqScanInitializeWorker(SeqScanState *node, ParallelWorkerContext *pwcxt) { ParallelTableScanDesc pscan; + uint32 flags = SO_NONE; + + if (ScanRelIsReadOnly(&node->ss)) + flags |= SO_HINT_REL_READ_ONLY; + + if (node->ss.ps.state->es_instrument & INSTRUMENT_IO) + flags |= SO_SCAN_INSTRUMENT; pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->ss.ss_currentScanDesc = - table_beginscan_parallel(node->ss.ss_currentRelation, pscan, - ScanRelIsReadOnly(&node->ss) ? - SO_HINT_REL_READ_ONLY : SO_NONE); + table_beginscan_parallel(node->ss.ss_currentRelation, pscan, flags); +} + +/* + * Compute the amount of space we'll need for the shared instrumentation and + * inform pcxt->estimator. + */ +void +ExecSeqScanInstrumentEstimate(SeqScanState *node, ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + + if (!estate->es_instrument || pcxt->nworkers == 0) + return; + + shm_toc_estimate_chunk(&pcxt->estimator, + offsetof(SharedSeqScanInstrumentation, sinstrument) + + sizeof(SeqScanInstrumentation) * pcxt->nworkers); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* + * Set up parallel sequential scan instrumentation. + */ +void +ExecSeqScanInstrumentInitDSM(SeqScanState *node, ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + SharedSeqScanInstrumentation *sinstrument; + Size size; + + if (!estate->es_instrument || pcxt->nworkers == 0) + return; + + size = offsetof(SharedSeqScanInstrumentation, sinstrument) + + sizeof(SeqScanInstrumentation) * pcxt->nworkers; + sinstrument = shm_toc_allocate(pcxt->toc, size); + memset(sinstrument, 0, size); + sinstrument->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, + node->ss.ps.plan->plan_node_id + PARALLEL_KEY_SCAN_INSTRUMENT_OFFSET, + sinstrument); + node->sinstrument = sinstrument; +} + +/* + * Look up and save the location of the shared instrumentation. + */ +void +ExecSeqScanInstrumentInitWorker(SeqScanState *node, + ParallelWorkerContext *pwcxt) +{ + if (!node->ss.ps.state->es_instrument) + return; + + node->sinstrument = shm_toc_lookup(pwcxt->toc, + node->ss.ps.plan->plan_node_id + PARALLEL_KEY_SCAN_INSTRUMENT_OFFSET, + true); +} + +/* + * Transfer sequential scan instrumentation from DSM to private memory. + */ +void +ExecSeqScanRetrieveInstrumentation(SeqScanState *node) +{ + SharedSeqScanInstrumentation *sinstrument = node->sinstrument; + Size size; + + if (sinstrument == NULL) + return; + + size = offsetof(SharedSeqScanInstrumentation, sinstrument) + + sinstrument->num_workers * sizeof(SeqScanInstrumentation); + + node->sinstrument = palloc(size); + memcpy(node->sinstrument, sinstrument, size); } diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index 22a75ccd863..003dc262b5d 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -266,4 +266,23 @@ typedef struct SharedIncrementalSortInfo IncrementalSortInfo sinfo[FLEXIBLE_ARRAY_MEMBER]; } SharedIncrementalSortInfo; + +/* --------------------- + * Instrumentation information for sequential scans + * --------------------- + */ +typedef struct SeqScanInstrumentation +{ + TableScanInstrumentation stats; +} SeqScanInstrumentation; + +/* + * Shared memory container for per-worker information + */ +typedef struct SharedSeqScanInstrumentation +{ + int num_workers; + SeqScanInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER]; +} SharedSeqScanInstrumentation; + #endif /* INSTRUMENT_NODE_H */ diff --git a/src/include/executor/nodeSeqscan.h b/src/include/executor/nodeSeqscan.h index 7a1490596fb..9c0ad4879d7 100644 --- a/src/include/executor/nodeSeqscan.h +++ b/src/include/executor/nodeSeqscan.h @@ -28,4 +28,13 @@ extern void ExecSeqScanReInitializeDSM(SeqScanState *node, ParallelContext *pcxt extern void ExecSeqScanInitializeWorker(SeqScanState *node, ParallelWorkerContext *pwcxt); +/* instrument support */ +extern void ExecSeqScanInstrumentEstimate(SeqScanState *node, + ParallelContext *pcxt); +extern void ExecSeqScanInstrumentInitDSM(SeqScanState *node, + ParallelContext *pcxt); +extern void ExecSeqScanInstrumentInitWorker(SeqScanState *node, + ParallelWorkerContext *pwcxt); +extern void ExecSeqScanRetrieveInstrumentation(SeqScanState *node); + #endif /* NODESEQSCAN_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 3ecae7552fc..56febb3204c 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1670,6 +1670,7 @@ typedef struct SeqScanState { ScanState ss; /* its first field is NodeTag */ Size pscan_len; /* size of parallel heap scan descriptor */ + struct SharedSeqScanInstrumentation *sinstrument; } SeqScanState; /* ---------------- diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out index dc31c7ce9f9..74a4d87801e 100644 --- a/src/test/regress/expected/explain.out +++ b/src/test/regress/expected/explain.out @@ -100,7 +100,7 @@ select explain_filter('explain (buffers, format text) select * from int8_tbl i8' (1 row) \a -select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8'); +select explain_filter('explain (analyze, buffers, io, format xml) select * from int8_tbl i8'); explain_filter <explain xmlns="http://www.postgresql.org/N/explain"> <Query> @@ -119,6 +119,13 @@ explain_filter <Actual-Rows>N.N</Actual-Rows> <Actual-Loops>N</Actual-Loops> <Disabled>false</Disabled> + <Average-Prefetch-Distance>N.N</Average-Prefetch-Distance> + <Max-Prefetch-Distance>N</Max-Prefetch-Distance> + <Prefetch-Capacity>N</Prefetch-Capacity> + <I-O-Count>N</I-O-Count> + <I-O-Waits>N</I-O-Waits> + <Average-I-O-Size>N.N</Average-I-O-Size> + <Average-I-Os-In-Progress>N.N</Average-I-Os-In-Progress> <Shared-Hit-Blocks>N</Shared-Hit-Blocks> <Shared-Read-Blocks>N</Shared-Read-Blocks> <Shared-Dirtied-Blocks>N</Shared-Dirtied-Blocks> @@ -149,7 +156,7 @@ explain_filter </Query> </explain> (1 row) -select explain_filter('explain (analyze, serialize, buffers, format yaml) select * from int8_tbl i8'); +select explain_filter('explain (analyze, serialize, buffers, io, format yaml) select * from int8_tbl i8'); explain_filter - Plan: Node Type: "Seq Scan" @@ -166,6 +173,13 @@ explain_filter Actual Rows: N.N Actual Loops: N Disabled: false + Average Prefetch Distance: N.N + Max Prefetch Distance: N + Prefetch Capacity: N + I/O Count: N + I/O Waits: N + Average I/O Size: N.N + Average I/Os In Progress: N.N Shared Hit Blocks: N Shared Read Blocks: N Shared Dirtied Blocks: N diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql index 8f10e1aff55..2f163c64bf6 100644 --- a/src/test/regress/sql/explain.sql +++ b/src/test/regress/sql/explain.sql @@ -69,8 +69,8 @@ select explain_filter('explain (analyze, buffers, format text) select * from int select explain_filter('explain (buffers, format text) select * from int8_tbl i8'); \a -select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8'); -select explain_filter('explain (analyze, serialize, buffers, format yaml) select * from int8_tbl i8'); +select explain_filter('explain (analyze, buffers, io, format xml) select * from int8_tbl i8'); +select explain_filter('explain (analyze, serialize, buffers, io, format yaml) select * from int8_tbl i8'); select explain_filter('explain (buffers, format json) select * from int8_tbl i8'); \a diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 98b8d78e693..c0436a13ac3 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2800,6 +2800,7 @@ SelfJoinCandidate SemTPadded SemiAntiJoinFactors SeqScan +SeqScanInstrumentation SeqScanState SeqTable SeqTableData @@ -2864,6 +2865,7 @@ SharedMemoizeInfo SharedRecordTableEntry SharedRecordTableKey SharedRecordTypmodRegistry +SharedSeqScanInstrumentation SharedSortInfo SharedTuplestore SharedTuplestoreAccessor -- 2.53.0
From 5facc503442201929c416bd37233ccd3d85ee45d Mon Sep 17 00:00:00 2001 From: Melanie Plageman <[email protected]> Date: Sun, 5 Apr 2026 17:31:18 -0400 Subject: [PATCH v13 5/5] Add EXPLAIN (IO) instrumentation for TidRangeScan Adds support for EXPLAIN (IO) instrumentation for TidRange scans. This requires adding shared instrumentation for parallel scans, using the separate DSM approach introduced by dd78e69cfc33. Reviewed-by: Melanie Plageman <[email protected]> Reviewed-by: Lukas Fittl <[email protected]> Reviewed-by: Andres Freund <[email protected]> Discussion: https://postgr.es/m/flat/a177a6dd-240b-455a-8f25-aca0b1c08c6e%40vondra.me --- src/backend/commands/explain.c | 25 +++++ src/backend/executor/execParallel.c | 12 +++ src/backend/executor/nodeTidrangescan.c | 127 ++++++++++++++++++++++-- src/include/executor/instrument_node.h | 18 ++++ src/include/executor/nodeTidrangescan.h | 9 ++ src/include/nodes/execnodes.h | 1 + src/tools/pgindent/typedefs.list | 2 + 7 files changed, 186 insertions(+), 8 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 18d424da944..b8e4c4eb2da 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -2149,6 +2149,7 @@ ExplainNode(PlanState *planstate, List *ancestors, if (plan->qual) show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); + show_scan_io_usage((ScanState *) planstate, es); } break; case T_ForeignScan: @@ -4127,6 +4128,30 @@ show_scan_io_usage(ScanState *planstate, ExplainState *es) } } + break; + } + case T_TidRangeScan: + { + SharedTidRangeScanInstrumentation *sinstrument + = ((TidRangeScanState *) planstate)->trss_sinstrument; + + if (sinstrument) + { + for (int i = 0; i < sinstrument->num_workers; ++i) + { + TidRangeScanInstrumentation *winstrument = &sinstrument->sinstrument[i]; + + AccumulateIOStats(&stats, &winstrument->stats.io); + + if (!es->workers_state) + continue; + + ExplainOpenWorker(i, es); + print_io_usage(es, &winstrument->stats.io); + ExplainCloseWorker(i, es); + } + } + break; } default: diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 9690f0938ae..81b87d82fab 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -291,6 +291,9 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) if (planstate->plan->parallel_aware) ExecTidRangeScanEstimate((TidRangeScanState *) planstate, e->pcxt); + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecTidRangeScanInstrumentEstimate((TidRangeScanState *) planstate, + e->pcxt); break; case T_AppendState: if (planstate->plan->parallel_aware) @@ -536,6 +539,9 @@ ExecParallelInitializeDSM(PlanState *planstate, if (planstate->plan->parallel_aware) ExecTidRangeScanInitializeDSM((TidRangeScanState *) planstate, d->pcxt); + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecTidRangeScanInstrumentInitDSM((TidRangeScanState *) planstate, + d->pcxt); break; case T_AppendState: if (planstate->plan->parallel_aware) @@ -1157,6 +1163,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, case T_SeqScanState: ExecSeqScanRetrieveInstrumentation((SeqScanState *) planstate); break; + case T_TidRangeScanState: + ExecTidRangeScanRetrieveInstrumentation((TidRangeScanState *) planstate); + break; default: break; } @@ -1430,6 +1439,9 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt) if (planstate->plan->parallel_aware) ExecTidRangeScanInitializeWorker((TidRangeScanState *) planstate, pwcxt); + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecTidRangeScanInstrumentInitWorker((TidRangeScanState *) planstate, + pwcxt); break; case T_AppendState: if (planstate->plan->parallel_aware) diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c index 4a8fe91b2b3..550916928cb 100644 --- a/src/backend/executor/nodeTidrangescan.c +++ b/src/backend/executor/nodeTidrangescan.c @@ -18,7 +18,9 @@ #include "access/sysattr.h" #include "access/tableam.h" #include "catalog/pg_operator.h" +#include "executor/execParallel.h" #include "executor/executor.h" +#include "executor/instrument.h" #include "executor/nodeTidrangescan.h" #include "nodes/nodeFuncs.h" #include "utils/rel.h" @@ -242,12 +244,19 @@ TidRangeNext(TidRangeScanState *node) if (scandesc == NULL) { + uint32 flags = SO_NONE; + + if (ScanRelIsReadOnly(&node->ss)) + flags |= SO_HINT_REL_READ_ONLY; + + if (estate->es_instrument & INSTRUMENT_IO) + flags |= SO_SCAN_INSTRUMENT; + scandesc = table_beginscan_tidrange(node->ss.ss_currentRelation, estate->es_snapshot, &node->trss_mintid, &node->trss_maxtid, - ScanRelIsReadOnly(&node->ss) ? - SO_HINT_REL_READ_ONLY : SO_NONE); + flags); node->ss.ss_currentScanDesc = scandesc; } else @@ -342,6 +351,20 @@ ExecEndTidRangeScan(TidRangeScanState *node) { TableScanDesc scan = node->ss.ss_currentScanDesc; + /* Collect IO stats for this process into shared instrumentation */ + if (node->trss_sinstrument != NULL && IsParallelWorker()) + { + TidRangeScanInstrumentation *si; + + Assert(ParallelWorkerNumber < node->trss_sinstrument->num_workers); + si = &node->trss_sinstrument->sinstrument[ParallelWorkerNumber]; + + if (scan && scan->rs_instrument) + { + AccumulateIOStats(&si->stats.io, &scan->rs_instrument->io); + } + } + if (scan != NULL) table_endscan(scan); } @@ -454,6 +477,13 @@ ExecTidRangeScanInitializeDSM(TidRangeScanState *node, ParallelContext *pcxt) { EState *estate = node->ss.ps.state; ParallelTableScanDesc pscan; + uint32 flags = SO_NONE; + + if (ScanRelIsReadOnly(&node->ss)) + flags |= SO_HINT_REL_READ_ONLY; + + if (estate->es_instrument & INSTRUMENT_IO) + flags |= SO_SCAN_INSTRUMENT; pscan = shm_toc_allocate(pcxt->toc, node->trss_pscanlen); table_parallelscan_initialize(node->ss.ss_currentRelation, @@ -462,9 +492,7 @@ ExecTidRangeScanInitializeDSM(TidRangeScanState *node, ParallelContext *pcxt) shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = table_beginscan_parallel_tidrange(node->ss.ss_currentRelation, - pscan, - ScanRelIsReadOnly(&node->ss) ? - SO_HINT_REL_READ_ONLY : SO_NONE); + pscan, flags); } /* ---------------------------------------------------------------- @@ -494,11 +522,94 @@ ExecTidRangeScanInitializeWorker(TidRangeScanState *node, ParallelWorkerContext *pwcxt) { ParallelTableScanDesc pscan; + uint32 flags = SO_NONE; + + if (ScanRelIsReadOnly(&node->ss)) + flags |= SO_HINT_REL_READ_ONLY; + + if (node->ss.ps.state->es_instrument) + flags |= SO_SCAN_INSTRUMENT; pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->ss.ss_currentScanDesc = table_beginscan_parallel_tidrange(node->ss.ss_currentRelation, - pscan, - ScanRelIsReadOnly(&node->ss) ? - SO_HINT_REL_READ_ONLY : SO_NONE); + pscan, flags); +} + +/* + * Compute the amount of space we'll need for the shared instrumentation and + * inform pcxt->estimator. + */ +void +ExecTidRangeScanInstrumentEstimate(TidRangeScanState *node, + ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + + if (!estate->es_instrument || pcxt->nworkers == 0) + return; + + shm_toc_estimate_chunk(&pcxt->estimator, + offsetof(SharedTidRangeScanInstrumentation, sinstrument) + + sizeof(TidRangeScanInstrumentation) * pcxt->nworkers); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* + * Set up parallel scan instrumentation. + */ +void +ExecTidRangeScanInstrumentInitDSM(TidRangeScanState *node, + ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + SharedTidRangeScanInstrumentation *sinstrument; + Size size; + + if (!estate->es_instrument || pcxt->nworkers == 0) + return; + + size = offsetof(SharedTidRangeScanInstrumentation, sinstrument) + + sizeof(TidRangeScanInstrumentation) * pcxt->nworkers; + sinstrument = shm_toc_allocate(pcxt->toc, size); + memset(sinstrument, 0, size); + sinstrument->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, + node->ss.ps.plan->plan_node_id + PARALLEL_KEY_SCAN_INSTRUMENT_OFFSET, + sinstrument); + node->trss_sinstrument = sinstrument; +} + +/* + * Look up and save the location of the shared instrumentation. + */ +void +ExecTidRangeScanInstrumentInitWorker(TidRangeScanState *node, + ParallelWorkerContext *pwcxt) +{ + if (!node->ss.ps.state->es_instrument) + return; + + node->trss_sinstrument = shm_toc_lookup(pwcxt->toc, + node->ss.ps.plan->plan_node_id + PARALLEL_KEY_SCAN_INSTRUMENT_OFFSET, + true); +} + +/* + * Transfer scan instrumentation from DSM to private memory. + */ +void +ExecTidRangeScanRetrieveInstrumentation(TidRangeScanState *node) +{ + SharedTidRangeScanInstrumentation *sinstrument = node->trss_sinstrument; + Size size; + + if (sinstrument == NULL) + return; + + size = offsetof(SharedTidRangeScanInstrumentation, sinstrument) + + sinstrument->num_workers * sizeof(TidRangeScanInstrumentation); + + node->trss_sinstrument = palloc(size); + memcpy(node->trss_sinstrument, sinstrument, size); } diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index 003dc262b5d..4076990408e 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -285,4 +285,22 @@ typedef struct SharedSeqScanInstrumentation SeqScanInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER]; } SharedSeqScanInstrumentation; + +/* + * Instrumentation information for TID range scans + */ +typedef struct TidRangeScanInstrumentation +{ + TableScanInstrumentation stats; +} TidRangeScanInstrumentation; + +/* + * Shared memory container for per-worker information + */ +typedef struct SharedTidRangeScanInstrumentation +{ + int num_workers; + TidRangeScanInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER]; +} SharedTidRangeScanInstrumentation; + #endif /* INSTRUMENT_NODE_H */ diff --git a/src/include/executor/nodeTidrangescan.h b/src/include/executor/nodeTidrangescan.h index 8752d1ea8c4..9e7d0a357bb 100644 --- a/src/include/executor/nodeTidrangescan.h +++ b/src/include/executor/nodeTidrangescan.h @@ -28,4 +28,13 @@ extern void ExecTidRangeScanInitializeDSM(TidRangeScanState *node, ParallelConte extern void ExecTidRangeScanReInitializeDSM(TidRangeScanState *node, ParallelContext *pcxt); extern void ExecTidRangeScanInitializeWorker(TidRangeScanState *node, ParallelWorkerContext *pwcxt); +/* instrument support */ +extern void ExecTidRangeScanInstrumentEstimate(TidRangeScanState *node, + ParallelContext *pcxt); +extern void ExecTidRangeScanInstrumentInitDSM(TidRangeScanState *node, + ParallelContext *pcxt); +extern void ExecTidRangeScanInstrumentInitWorker(TidRangeScanState *node, + ParallelWorkerContext *pwcxt); +extern void ExecTidRangeScanRetrieveInstrumentation(TidRangeScanState *node); + #endif /* NODETIDRANGESCAN_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 56febb3204c..13359180d25 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1922,6 +1922,7 @@ typedef struct TidRangeScanState ItemPointerData trss_maxtid; bool trss_inScan; Size trss_pscanlen; + struct SharedTidRangeScanInstrumentation *trss_sinstrument; } TidRangeScanState; /* ---------------- diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index c0436a13ac3..0cd41106975 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2867,6 +2867,7 @@ SharedRecordTableKey SharedRecordTypmodRegistry SharedSeqScanInstrumentation SharedSortInfo +SharedTidRangeScanInstrumentation SharedTuplestore SharedTuplestoreAccessor SharedTuplestoreChunk @@ -3171,6 +3172,7 @@ TidOpExpr TidPath TidRangePath TidRangeScan +TidRangeScanInstrumentation TidRangeScanState TidScan TidScanState -- 2.53.0
