On 7/23/25 02:37, Tomas Vondra wrote: > ... > >>> Thanks. I wonder how difficult would it be to add something like this to >>> pgstattuple. I mean, it shouldn't be difficult to look at leaf pages and >>> count distinct blocks, right? Seems quite useful. >> >> I agree that that would be quite useful. >> > > Good first patch for someone ;-) >
I got a bit bored yesterday, so I gave this a try and whipped up a patch that adds two pgstattuple functins that I think could be useful for analyzing index metrics that matter for prefetching. The patch adds two functions, that are meant to provide data for additional analysis rather than computing something final. Each function splits the index into a sequence of block ranges (of given length), and calculates some metrics on that. pgstatindex_nheap - number of leafs in the range - number of block numbers - number of distinct block numbers - number of runs (of the same block) pgstatindex_runs - number of leafs in the range - run length - number of runs with the length It's trivial to summarize this into a per-index statistic (of course, there may be some inaccuracies when the run spans multiple ranges), but it also seems useful to be able to look at parts of the index. This is meant as a quick experimental patch, to help with generating better datasets for the evaluation. And I think it works for that, and I don't have immediate plans to work on this outside that context. There are a couple things we'd need to address before actually merging this, I think. Two that I can think of right now: First, the "range length" determines memory usage. Right now it's a bit naive, and just extracts all blocks (for the range) into an array. That might be an issue for larger ranges, I'm sure there are strategies to mitigate that - doing some of the processing when reading block numbers, using hyperloglog to estimate distincts, etc. Second, the index is walked sequentially in physical order, from block 0 to the last block. But that's not really what the index prefetch sees. To make it "more accurate" it'd be better to just scan the leaf pages as if during a "full index scan". Also, I haven't updated the docs. That'd also need to be done. regards -- Tomas Vondra
From dd36f2194986e5366e1f0800eff6b8a61611dd0f Mon Sep 17 00:00:00 2001 From: Tomas Vondra <to...@vondra.me> Date: Thu, 24 Jul 2025 13:00:24 +0200 Subject: [PATCH v1] pgstattuple: analyze TIDs on btree leaf pages --- contrib/pgstattuple/Makefile | 3 +- contrib/pgstattuple/pgstatindex.c | 447 ++++++++++++++++++ contrib/pgstattuple/pgstattuple--1.5--1.6.sql | 30 ++ contrib/pgstattuple/pgstattuple.control | 2 +- 4 files changed, 480 insertions(+), 2 deletions(-) create mode 100644 contrib/pgstattuple/pgstattuple--1.5--1.6.sql diff --git a/contrib/pgstattuple/Makefile b/contrib/pgstattuple/Makefile index c5b17fc703e..d5c62ba36f9 100644 --- a/contrib/pgstattuple/Makefile +++ b/contrib/pgstattuple/Makefile @@ -10,7 +10,8 @@ OBJS = \ EXTENSION = pgstattuple DATA = pgstattuple--1.4.sql pgstattuple--1.4--1.5.sql \ pgstattuple--1.3--1.4.sql pgstattuple--1.2--1.3.sql \ - pgstattuple--1.1--1.2.sql pgstattuple--1.0--1.1.sql + pgstattuple--1.1--1.2.sql pgstattuple--1.0--1.1.sql \ + pgstattuple--1.5--1.6.sql PGFILEDESC = "pgstattuple - tuple-level statistics" REGRESS = pgstattuple diff --git a/contrib/pgstattuple/pgstatindex.c b/contrib/pgstattuple/pgstatindex.c index 4b9d76ec4e4..aa4431075b0 100644 --- a/contrib/pgstattuple/pgstatindex.c +++ b/contrib/pgstattuple/pgstatindex.c @@ -62,6 +62,9 @@ PG_FUNCTION_INFO_V1(pg_relpages_v1_5); PG_FUNCTION_INFO_V1(pg_relpagesbyid_v1_5); PG_FUNCTION_INFO_V1(pgstatginindex_v1_5); +PG_FUNCTION_INFO_V1(pgstatindex_nheap_v1_6); +PG_FUNCTION_INFO_V1(pgstatindex_runs_v1_6); + Datum pgstatginindex_internal(Oid relid, FunctionCallInfo fcinfo); #define IS_INDEX(r) ((r)->rd_rel->relkind == RELKIND_INDEX) @@ -128,6 +131,9 @@ static Datum pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo); static int64 pg_relpages_impl(Relation rel); static void GetHashPageStats(Page page, HashIndexStat *stats); +static Datum pgstatindex_nheap_impl(Relation rel, int64 nblocks, FunctionCallInfo fcinfo); +static Datum pgstatindex_runs_impl(Relation rel, int64 nblocks, FunctionCallInfo fcinfo); + /* ------------------------------------------------------ * pgstatindex() * @@ -756,3 +762,444 @@ GetHashPageStats(Page page, HashIndexStat *stats) } stats->free_space += PageGetExactFreeSpace(page); } + +/* + */ +Datum +pgstatindex_nheap_v1_6(PG_FUNCTION_ARGS) +{ + text *relname = PG_GETARG_TEXT_PP(0); + int64 nblocks = PG_GETARG_INT64(1); + Relation rel; + RangeVar *relrv; + + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + rel = relation_openrv(relrv, AccessShareLock); + + PG_RETURN_DATUM(pgstatindex_nheap_impl(rel, nblocks, fcinfo)); +} + +/* + */ +Datum +pgstatindex_runs_v1_6(PG_FUNCTION_ARGS) +{ + text *relname = PG_GETARG_TEXT_PP(0); + int64 nblocks = PG_GETARG_INT64(1); + Relation rel; + RangeVar *relrv; + + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + rel = relation_openrv(relrv, AccessShareLock); + + PG_RETURN_DATUM(pgstatindex_runs_impl(rel, nblocks, fcinfo)); +} + +static int +count_block_runs(int range_nblocks, BlockNumber *range_blocks) +{ + int nruns = 1; + + for (int i = 1; i < range_nblocks; i++) + { + if (range_blocks[i] != range_blocks[i-1]) + nruns++; + } + + return nruns; +} + +static int +blocknum_cmp(const void *a, const void *b) +{ + return memcmp(a, b, sizeof(BlockNumber)); +} + +static int +count_blocks_distinct(int range_nblocks, BlockNumber *range_blocks) +{ + pg_qsort(range_blocks, range_nblocks, sizeof(BlockNumber), blocknum_cmp); + + return count_block_runs(range_nblocks, range_blocks); +} + +static Datum +pgstatindex_nheap_impl(Relation rel, int64 range_len, FunctionCallInfo fcinfo) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + BlockNumber nblocks; + BlockNumber blkno; + BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD); + + BlockNumber range_start = 0; + BlockNumber *range_blocks = NULL; + int range_nblocks; + int range_leafs; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + + Datum values[5]; + bool nulls[5]; + + /* no NULLs */ + memset(nulls, 0, sizeof(nulls)); + + if (!IS_INDEX(rel) || !IS_BTREE(rel)) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("relation \"%s\" is not a btree index", + RelationGetRelationName(rel)))); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. + */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + /* + * A !indisready index could lead to ERRCODE_DATA_CORRUPTED later, so exit + * early. We're capable of assessing an indisready&&!indisvalid index, + * but the results could be confusing. For example, the index's size + * could be too low for a valid index of the table. + */ + if (!rel->rd_index->indisvalid) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("index \"%s\" is not valid", + RelationGetRelationName(rel)))); + + InitMaterializedSRF(fcinfo, 0); + tupdesc = rsinfo->setDesc; + tupstore = rsinfo->setResult; + + /* pre-allocate space for the maximum number of TIDs we might see + * on range_len pages */ + range_nblocks = 0; + range_blocks = palloc_array(BlockNumber, range_len * MaxTIDsPerBTreePage); + + /* + * Scan all blocks except the metapage + */ + nblocks = RelationGetNumberOfBlocks(rel); + + for (blkno = 1; blkno < nblocks; blkno++) + { + Buffer buffer; + Page page; + BTPageOpaque opaque; + + CHECK_FOR_INTERRUPTS(); + + /* + * If we started a new range, count the distinct blocks and add + * a tuple into the result set (if there are any TIDs). + */ + if (range_start + range_len <= blkno) + { + if (range_nblocks > 0) + { + HeapTuple tuple; + int ndistinct; + int nruns; + + nruns = count_block_runs(range_nblocks, range_blocks); + + /* this modifies the array, so do once */ + ndistinct = count_blocks_distinct(range_nblocks, range_blocks); + + values[0] = Int64GetDatum(range_start); + values[1] = Int64GetDatum(range_leafs); + values[2] = Int64GetDatum(range_nblocks); + values[3] = Int64GetDatum(ndistinct); + values[4] = Int64GetDatum(nruns); + + /* Build and return the result tuple */ + tuple = heap_form_tuple(tupdesc, values, nulls); + + tuplestore_puttuple(tupstore, tuple); + + range_nblocks = 0; + range_leafs = 0; + } + + while (range_start + range_len <= blkno) + range_start += range_len; + } + + /* Read and lock buffer */ + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + + page = BufferGetPage(buffer); + opaque = BTPageGetOpaque(page); + + /* + * Ignore deleted/dead pages, and internal (non-leaf) pages. + */ + if (!P_ISDELETED(opaque) && !P_IGNORE(opaque) && P_ISLEAF(opaque)) + { + OffsetNumber offset = FirstOffsetNumber; + OffsetNumber maxoffset = PageGetMaxOffsetNumber(page); + bool rightmost = P_RIGHTMOST(opaque); + + while (offset < maxoffset) + { + ItemId id = PageGetItemId(page, offset); + IndexTuple itup = (IndexTuple) PageGetItem(page, id); + BlockNumber block = ItemPointerGetBlockNumber(&itup->t_tid); + bool ispivot = (!rightmost && offset == P_HIKEY); + + Assert(range_nblocks >= 0); + Assert(range_nblocks < range_len * MaxTIDsPerBTreePage); + + /* ignore pivot tuples */ + if (!ispivot) + range_blocks[range_nblocks++] = block; + + offset++; + } + + range_leafs++; + } + + /* Unlock and release buffer */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + } + + /* output the last range */ + if (range_nblocks > 0) + { + HeapTuple tuple; + int ndistinct; + int nruns; + + nruns = count_block_runs(range_nblocks, range_blocks); + + /* this modifies the array, so do once */ + ndistinct = count_blocks_distinct(range_nblocks, range_blocks); + + values[0] = Int64GetDatum(range_start); + values[1] = Int64GetDatum(range_leafs); + values[2] = Int64GetDatum(range_nblocks); + values[3] = Int64GetDatum(ndistinct); + values[4] = Int64GetDatum(nruns); + + /* Build and return the result tuple */ + tuple = heap_form_tuple(tupdesc, values, nulls); + + tuplestore_puttuple(tupstore, tuple); + + range_nblocks = 0; + range_leafs = 0; + } + + relation_close(rel, AccessShareLock); + + PG_RETURN_NULL(); +} + +static void +count_run_lengths(int range_nblocks, BlockNumber *range_blocks, int *lengths) +{ + int len = 1; + BlockNumber curr = range_blocks[0]; + + for (int i = 0; i < range_nblocks; i++) + { + if (range_blocks[i] != curr) + { + lengths[len]++; + len = 1; + curr = range_blocks[i]; + continue; + } + + len++; + } + + lengths[len]++; +} + +static Datum +pgstatindex_runs_impl(Relation rel, int64 range_len, FunctionCallInfo fcinfo) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + BlockNumber nblocks; + BlockNumber blkno; + BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD); + + BlockNumber range_start = 0; + BlockNumber *range_blocks = NULL; + int *run_lengths = NULL; + int range_nblocks; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + + Datum values[3]; + bool nulls[3]; + + /* no NULLs */ + memset(nulls, 0, sizeof(nulls)); + + if (!IS_INDEX(rel) || !IS_BTREE(rel)) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("relation \"%s\" is not a btree index", + RelationGetRelationName(rel)))); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. + */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + /* + * A !indisready index could lead to ERRCODE_DATA_CORRUPTED later, so exit + * early. We're capable of assessing an indisready&&!indisvalid index, + * but the results could be confusing. For example, the index's size + * could be too low for a valid index of the table. + */ + if (!rel->rd_index->indisvalid) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("index \"%s\" is not valid", + RelationGetRelationName(rel)))); + + InitMaterializedSRF(fcinfo, 0); + tupdesc = rsinfo->setDesc; + tupstore = rsinfo->setResult; + + /* pre-allocate space for the maximum number of TIDs we might see + * on range_len pages */ + range_nblocks = 0; + range_blocks = palloc_array(BlockNumber, range_len * MaxTIDsPerBTreePage); + + /* lengths of runs (the range could be one long run) */ + run_lengths = palloc_array(int, (range_len * MaxTIDsPerBTreePage + 1)); + + /* + * Scan all blocks except the metapage + */ + nblocks = RelationGetNumberOfBlocks(rel); + + for (blkno = 1; blkno < nblocks; blkno++) + { + Buffer buffer; + Page page; + BTPageOpaque opaque; + + CHECK_FOR_INTERRUPTS(); + + /* + * If we started a new range, count the distinct blocks and add + * a tuple into the result set (if there are any TIDs). + */ + if (range_start + range_len <= blkno) + { + if (range_nblocks > 0) + { + HeapTuple tuple; + + memset(run_lengths, 0, sizeof(int) * (range_len * MaxTIDsPerBTreePage + 1)); + count_run_lengths(range_nblocks, range_blocks, run_lengths); + + for (int i = 1; i <= (range_len * MaxTIDsPerBTreePage); i++) + { + if (run_lengths[i] > 0) + { + values[0] = Int64GetDatum(range_start); + values[1] = Int32GetDatum(i); + values[2] = Int32GetDatum(run_lengths[i]); + + /* Build and return the result tuple */ + tuple = heap_form_tuple(tupdesc, values, nulls); + + tuplestore_puttuple(tupstore, tuple); + } + } + + range_nblocks = 0; + } + + while (range_start + range_len <= blkno) + range_start += range_len; + } + + /* Read and lock buffer */ + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + + page = BufferGetPage(buffer); + opaque = BTPageGetOpaque(page); + + /* + * Ignore deleted/dead pages, and internal (non-leaf) pages. + */ + if (!P_ISDELETED(opaque) && !P_IGNORE(opaque) && P_ISLEAF(opaque)) + { + OffsetNumber offset = FirstOffsetNumber; + OffsetNumber maxoffset = PageGetMaxOffsetNumber(page); + bool rightmost = P_RIGHTMOST(opaque); + + while (offset < maxoffset) + { + ItemId id = PageGetItemId(page, offset); + IndexTuple itup = (IndexTuple) PageGetItem(page, id); + BlockNumber block = ItemPointerGetBlockNumber(&itup->t_tid); + bool ispivot = (!rightmost && offset == P_HIKEY); + + Assert(range_nblocks >= 0); + Assert(range_nblocks < range_len * MaxTIDsPerBTreePage); + + /* ignore pivot tuples */ + if (!ispivot) + range_blocks[range_nblocks++] = block; + + offset++; + } + } + + /* Unlock and release buffer */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + } + + /* output the last range */ + if (range_nblocks > 0) + { + HeapTuple tuple; + + memset(run_lengths, 0, sizeof(int) * (range_len * MaxTIDsPerBTreePage + 1)); + count_run_lengths(range_nblocks, range_blocks, run_lengths); + + for (int i = 1; i <= (range_len * MaxTIDsPerBTreePage); i++) + { + if (run_lengths[i] > 0) + { + values[0] = Int64GetDatum(range_start); + values[1] = Int32GetDatum(i); + values[2] = Int32GetDatum(run_lengths[i]); + + /* Build and return the result tuple */ + tuple = heap_form_tuple(tupdesc, values, nulls); + + tuplestore_puttuple(tupstore, tuple); + } + } + + range_nblocks = 0; + } + + relation_close(rel, AccessShareLock); + + PG_RETURN_NULL(); +} diff --git a/contrib/pgstattuple/pgstattuple--1.5--1.6.sql b/contrib/pgstattuple/pgstattuple--1.5--1.6.sql new file mode 100644 index 00000000000..c6a450a301b --- /dev/null +++ b/contrib/pgstattuple/pgstattuple--1.5--1.6.sql @@ -0,0 +1,30 @@ +/* contrib/pgstattuple/pgstattuple--1.5--1.6.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pgstattuple UPDATE TO '1.6'" to load this file. \quit + +CREATE OR REPLACE FUNCTION pgstatindex_nheap(IN relname text, + IN blocks BIGINT, -- length of ranges to analyze + OUT block BIGINT, -- first block of a range + OUT num_leafs BIGINT, -- number of leaf pages + OUT num_items BIGINT, -- number of heap TIDs + OUT num_blocks BIGINT, -- number of distinct blocks + OUT num_runs BIGINT) -- number of continuous runs +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pgstatindex_nheap_v1_6' +LANGUAGE C STRICT PARALLEL SAFE; + +REVOKE EXECUTE ON FUNCTION pgstatindex_nheap(text, bigint) FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pgstatindex_nheap(text, bigint) TO pg_stat_scan_tables; + +CREATE OR REPLACE FUNCTION pgstatindex_runs(IN relname text, + IN blocks BIGINT, -- length of ranges to analyze + OUT block BIGINT, -- first block of a range + OUT run_length INT, -- number of leaf pages + OUT run_count INT) -- number of heap TIDs +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pgstatindex_runs_v1_6' +LANGUAGE C STRICT PARALLEL SAFE; + +REVOKE EXECUTE ON FUNCTION pgstatindex_runs(text, bigint) FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pgstatindex_runs(text, bigint) TO pg_stat_scan_tables; diff --git a/contrib/pgstattuple/pgstattuple.control b/contrib/pgstattuple/pgstattuple.control index 6af40757b27..80d06958e90 100644 --- a/contrib/pgstattuple/pgstattuple.control +++ b/contrib/pgstattuple/pgstattuple.control @@ -1,5 +1,5 @@ # pgstattuple extension comment = 'show tuple-level statistics' -default_version = '1.5' +default_version = '1.6' module_pathname = '$libdir/pgstattuple' relocatable = true -- 2.50.1