Re: index prefetching

Tomas Vondra Thu, 24 Jul 2025 04:19:45 -0700

On 7/23/25 02:37, Tomas Vondra wrote:
> ...
> 
>>> Thanks. I wonder how difficult would it be to add something like this to
>>> pgstattuple. I mean, it shouldn't be difficult to look at leaf pages and
>>> count distinct blocks, right? Seems quite useful.
>>
>> I agree that that would be quite useful.
>>
> 
> Good first patch for someone ;-)
>


I got a bit bored yesterday, so I gave this a try and whipped up a patch
that adds two pgstattuple functins that I think could be useful for
analyzing index metrics that matter for prefetching.

The patch adds two functions, that are meant to provide data for
additional analysis rather than computing something final. Each function
splits the index into a sequence of block ranges (of given length), and
calculates some metrics on that.

pgstatindex_nheap
  - number of leafs in the range
  - number of block numbers
  - number of distinct block numbers
  - number of runs (of the same block)

pgstatindex_runs
  - number of leafs in the range
  - run length
  - number of runs with the length

It's trivial to summarize this into a per-index statistic (of course,
there may be some inaccuracies when the run spans multiple ranges), but
it also seems useful to be able to look at parts of the index.

This is meant as a quick experimental patch, to help with generating
better datasets for the evaluation. And I think it works for that, and I
don't have immediate plans to work on this outside that context.

There are a couple things we'd need to address before actually merging
this, I think. Two that I can think of right now:

First, the "range length" determines memory usage. Right now it's a bit
naive, and just extracts all blocks (for the range) into an array. That
might be an issue for larger ranges, I'm sure there are strategies to
mitigate that - doing some of the processing when reading block numbers,
using hyperloglog to estimate distincts, etc.

Second, the index is walked sequentially in physical order, from block 0
to the last block. But that's not really what the index prefetch sees.
To make it "more accurate" it'd be better to just scan the leaf pages as
if during a "full index scan".

Also, I haven't updated the docs. That'd also need to be done.


regards

-- 
Tomas Vondra

From dd36f2194986e5366e1f0800eff6b8a61611dd0f Mon Sep 17 00:00:00 2001
From: Tomas Vondra <to...@vondra.me>
Date: Thu, 24 Jul 2025 13:00:24 +0200
Subject: [PATCH v1] pgstattuple: analyze TIDs on btree leaf pages

---
 contrib/pgstattuple/Makefile                  |   3 +-
 contrib/pgstattuple/pgstatindex.c             | 447 ++++++++++++++++++
 contrib/pgstattuple/pgstattuple--1.5--1.6.sql |  30 ++
 contrib/pgstattuple/pgstattuple.control       |   2 +-
 4 files changed, 480 insertions(+), 2 deletions(-)
 create mode 100644 contrib/pgstattuple/pgstattuple--1.5--1.6.sql

diff --git a/contrib/pgstattuple/Makefile b/contrib/pgstattuple/Makefile
index c5b17fc703e..d5c62ba36f9 100644
--- a/contrib/pgstattuple/Makefile
+++ b/contrib/pgstattuple/Makefile
@@ -10,7 +10,8 @@ OBJS = \
 EXTENSION = pgstattuple
 DATA = pgstattuple--1.4.sql pgstattuple--1.4--1.5.sql \
 	pgstattuple--1.3--1.4.sql pgstattuple--1.2--1.3.sql \
-	pgstattuple--1.1--1.2.sql pgstattuple--1.0--1.1.sql
+	pgstattuple--1.1--1.2.sql pgstattuple--1.0--1.1.sql \
+	pgstattuple--1.5--1.6.sql
 PGFILEDESC = "pgstattuple - tuple-level statistics"
 
 REGRESS = pgstattuple
diff --git a/contrib/pgstattuple/pgstatindex.c b/contrib/pgstattuple/pgstatindex.c
index 4b9d76ec4e4..aa4431075b0 100644
--- a/contrib/pgstattuple/pgstatindex.c
+++ b/contrib/pgstattuple/pgstatindex.c
@@ -62,6 +62,9 @@ PG_FUNCTION_INFO_V1(pg_relpages_v1_5);
 PG_FUNCTION_INFO_V1(pg_relpagesbyid_v1_5);
 PG_FUNCTION_INFO_V1(pgstatginindex_v1_5);
 
+PG_FUNCTION_INFO_V1(pgstatindex_nheap_v1_6);
+PG_FUNCTION_INFO_V1(pgstatindex_runs_v1_6);
+
 Datum		pgstatginindex_internal(Oid relid, FunctionCallInfo fcinfo);
 
 #define IS_INDEX(r) ((r)->rd_rel->relkind == RELKIND_INDEX)
@@ -128,6 +131,9 @@ static Datum pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo);
 static int64 pg_relpages_impl(Relation rel);
 static void GetHashPageStats(Page page, HashIndexStat *stats);
 
+static Datum pgstatindex_nheap_impl(Relation rel, int64 nblocks, FunctionCallInfo fcinfo);
+static Datum pgstatindex_runs_impl(Relation rel, int64 nblocks, FunctionCallInfo fcinfo);
+
 /* ------------------------------------------------------
  * pgstatindex()
  *
@@ -756,3 +762,444 @@ GetHashPageStats(Page page, HashIndexStat *stats)
 	}
 	stats->free_space += PageGetExactFreeSpace(page);
 }
+
+/*
+ */
+Datum
+pgstatindex_nheap_v1_6(PG_FUNCTION_ARGS)
+{
+	text	   *relname = PG_GETARG_TEXT_PP(0);
+	int64		nblocks = PG_GETARG_INT64(1);
+	Relation	rel;
+	RangeVar   *relrv;
+
+	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
+	rel = relation_openrv(relrv, AccessShareLock);
+
+	PG_RETURN_DATUM(pgstatindex_nheap_impl(rel, nblocks, fcinfo));
+}
+
+/*
+ */
+Datum
+pgstatindex_runs_v1_6(PG_FUNCTION_ARGS)
+{
+	text	   *relname = PG_GETARG_TEXT_PP(0);
+	int64		nblocks = PG_GETARG_INT64(1);
+	Relation	rel;
+	RangeVar   *relrv;
+
+	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
+	rel = relation_openrv(relrv, AccessShareLock);
+
+	PG_RETURN_DATUM(pgstatindex_runs_impl(rel, nblocks, fcinfo));
+}
+
+static int
+count_block_runs(int range_nblocks, BlockNumber *range_blocks)
+{
+	int			nruns = 1;
+
+	for (int i = 1; i < range_nblocks; i++)
+	{
+		if (range_blocks[i] != range_blocks[i-1])
+			nruns++;
+	}
+
+	return nruns;
+}
+
+static int
+blocknum_cmp(const void *a, const void *b)
+{
+	return memcmp(a, b, sizeof(BlockNumber));
+}
+
+static int
+count_blocks_distinct(int range_nblocks, BlockNumber *range_blocks)
+{
+	pg_qsort(range_blocks, range_nblocks, sizeof(BlockNumber), blocknum_cmp);
+
+	return count_block_runs(range_nblocks, range_blocks);
+}
+
+static Datum
+pgstatindex_nheap_impl(Relation rel, int64 range_len, FunctionCallInfo fcinfo)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	BlockNumber nblocks;
+	BlockNumber blkno;
+	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
+
+	BlockNumber range_start = 0;
+	BlockNumber	*range_blocks = NULL;
+	int			range_nblocks;
+	int			range_leafs;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+
+	Datum		values[5];
+	bool		nulls[5];
+
+	/* no NULLs */
+	memset(nulls, 0, sizeof(nulls));
+
+	if (!IS_INDEX(rel) || !IS_BTREE(rel))
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("relation \"%s\" is not a btree index",
+						RelationGetRelationName(rel))));
+
+	/*
+	 * Reject attempts to read non-local temporary relations; we would be
+	 * likely to get wrong data since we have no visibility into the owning
+	 * session's local buffers.
+	 */
+	if (RELATION_IS_OTHER_TEMP(rel))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("cannot access temporary tables of other sessions")));
+
+	/*
+	 * A !indisready index could lead to ERRCODE_DATA_CORRUPTED later, so exit
+	 * early.  We're capable of assessing an indisready&&!indisvalid index,
+	 * but the results could be confusing.  For example, the index's size
+	 * could be too low for a valid index of the table.
+	 */
+	if (!rel->rd_index->indisvalid)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("index \"%s\" is not valid",
+						RelationGetRelationName(rel))));
+
+	InitMaterializedSRF(fcinfo, 0);
+	tupdesc = rsinfo->setDesc;
+	tupstore = rsinfo->setResult;
+
+	/* pre-allocate space for the maximum number of TIDs we might see
+	 * on range_len pages */
+	range_nblocks = 0;
+	range_blocks = palloc_array(BlockNumber, range_len * MaxTIDsPerBTreePage);
+
+	/*
+	 * Scan all blocks except the metapage
+	 */
+	nblocks = RelationGetNumberOfBlocks(rel);
+
+	for (blkno = 1; blkno < nblocks; blkno++)
+	{
+		Buffer		buffer;
+		Page		page;
+		BTPageOpaque opaque;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * If we started a new range, count the distinct blocks and add
+		 * a tuple into the result set (if there are any TIDs).
+		 */
+		if (range_start + range_len <= blkno)
+		{
+			if (range_nblocks > 0)
+			{
+				HeapTuple	tuple;
+				int			ndistinct;
+				int			nruns;
+
+				nruns = count_block_runs(range_nblocks, range_blocks);
+
+				/* this modifies the array, so do once */
+				ndistinct = count_blocks_distinct(range_nblocks, range_blocks);
+
+				values[0] = Int64GetDatum(range_start);
+				values[1] = Int64GetDatum(range_leafs);
+				values[2] = Int64GetDatum(range_nblocks);
+				values[3] = Int64GetDatum(ndistinct);
+				values[4] = Int64GetDatum(nruns);
+
+				/* Build and return the result tuple */
+				tuple = heap_form_tuple(tupdesc, values, nulls);
+
+				tuplestore_puttuple(tupstore, tuple);
+
+				range_nblocks = 0;
+				range_leafs = 0;
+			}
+
+			while (range_start + range_len <= blkno)
+				range_start += range_len;
+		}
+
+		/* Read and lock buffer */
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
+		LockBuffer(buffer, BUFFER_LOCK_SHARE);
+
+		page = BufferGetPage(buffer);
+		opaque = BTPageGetOpaque(page);
+
+		/*
+		 * Ignore deleted/dead pages, and internal (non-leaf) pages.
+		 */
+		if (!P_ISDELETED(opaque) && !P_IGNORE(opaque) && P_ISLEAF(opaque))
+		{
+			OffsetNumber	offset = FirstOffsetNumber;
+			OffsetNumber	maxoffset = PageGetMaxOffsetNumber(page);
+			bool			rightmost = P_RIGHTMOST(opaque);
+
+			while (offset < maxoffset)
+			{
+				ItemId		id = PageGetItemId(page, offset);
+				IndexTuple	itup = (IndexTuple) PageGetItem(page, id);
+				BlockNumber	block = ItemPointerGetBlockNumber(&itup->t_tid);
+				bool		ispivot = (!rightmost && offset == P_HIKEY);
+
+				Assert(range_nblocks >= 0);
+				Assert(range_nblocks < range_len * MaxTIDsPerBTreePage);
+
+				/* ignore pivot tuples */
+				if (!ispivot)
+					range_blocks[range_nblocks++] = block;
+
+				offset++;
+			}
+
+			range_leafs++;
+		}
+
+		/* Unlock and release buffer */
+		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+		ReleaseBuffer(buffer);
+	}
+
+	/* output the last range */
+	if (range_nblocks > 0)
+	{
+		HeapTuple	tuple;
+		int			ndistinct;
+		int			nruns;
+	
+		nruns = count_block_runs(range_nblocks, range_blocks);
+	
+		/* this modifies the array, so do once */
+		ndistinct = count_blocks_distinct(range_nblocks, range_blocks);
+	
+		values[0] = Int64GetDatum(range_start);
+		values[1] = Int64GetDatum(range_leafs);
+		values[2] = Int64GetDatum(range_nblocks);
+		values[3] = Int64GetDatum(ndistinct);
+		values[4] = Int64GetDatum(nruns);
+	
+		/* Build and return the result tuple */
+		tuple = heap_form_tuple(tupdesc, values, nulls);
+	
+		tuplestore_puttuple(tupstore, tuple);
+	
+		range_nblocks = 0;
+		range_leafs = 0;
+	}
+
+	relation_close(rel, AccessShareLock);
+
+	PG_RETURN_NULL();
+}
+
+static void
+count_run_lengths(int range_nblocks, BlockNumber *range_blocks, int *lengths)
+{
+	int			len = 1;
+	BlockNumber	curr = range_blocks[0];
+
+	for (int i = 0; i < range_nblocks; i++)
+	{
+		if (range_blocks[i] != curr)
+		{
+			lengths[len]++;
+			len = 1;
+			curr = range_blocks[i];
+			continue;
+		}
+
+		len++;
+	}
+
+	lengths[len]++;
+}
+
+static Datum
+pgstatindex_runs_impl(Relation rel, int64 range_len, FunctionCallInfo fcinfo)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	BlockNumber nblocks;
+	BlockNumber blkno;
+	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
+
+	BlockNumber range_start = 0;
+	BlockNumber	*range_blocks = NULL;
+	int			*run_lengths = NULL;
+	int			range_nblocks;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+
+	Datum		values[3];
+	bool		nulls[3];
+
+	/* no NULLs */
+	memset(nulls, 0, sizeof(nulls));
+
+	if (!IS_INDEX(rel) || !IS_BTREE(rel))
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("relation \"%s\" is not a btree index",
+						RelationGetRelationName(rel))));
+
+	/*
+	 * Reject attempts to read non-local temporary relations; we would be
+	 * likely to get wrong data since we have no visibility into the owning
+	 * session's local buffers.
+	 */
+	if (RELATION_IS_OTHER_TEMP(rel))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("cannot access temporary tables of other sessions")));
+
+	/*
+	 * A !indisready index could lead to ERRCODE_DATA_CORRUPTED later, so exit
+	 * early.  We're capable of assessing an indisready&&!indisvalid index,
+	 * but the results could be confusing.  For example, the index's size
+	 * could be too low for a valid index of the table.
+	 */
+	if (!rel->rd_index->indisvalid)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("index \"%s\" is not valid",
+						RelationGetRelationName(rel))));
+
+	InitMaterializedSRF(fcinfo, 0);
+	tupdesc = rsinfo->setDesc;
+	tupstore = rsinfo->setResult;
+
+	/* pre-allocate space for the maximum number of TIDs we might see
+	 * on range_len pages */
+	range_nblocks = 0;
+	range_blocks = palloc_array(BlockNumber, range_len * MaxTIDsPerBTreePage);
+
+	/* lengths of runs (the range could be one long run) */
+	run_lengths = palloc_array(int, (range_len * MaxTIDsPerBTreePage + 1));
+
+	/*
+	 * Scan all blocks except the metapage
+	 */
+	nblocks = RelationGetNumberOfBlocks(rel);
+
+	for (blkno = 1; blkno < nblocks; blkno++)
+	{
+		Buffer		buffer;
+		Page		page;
+		BTPageOpaque opaque;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * If we started a new range, count the distinct blocks and add
+		 * a tuple into the result set (if there are any TIDs).
+		 */
+		if (range_start + range_len <= blkno)
+		{
+			if (range_nblocks > 0)
+			{
+				HeapTuple	tuple;
+
+				memset(run_lengths, 0, sizeof(int) * (range_len * MaxTIDsPerBTreePage + 1));
+				count_run_lengths(range_nblocks, range_blocks, run_lengths);
+
+				for (int i = 1; i <= (range_len * MaxTIDsPerBTreePage); i++)
+				{
+					if (run_lengths[i] > 0)
+					{
+						values[0] = Int64GetDatum(range_start);
+						values[1] = Int32GetDatum(i);
+						values[2] = Int32GetDatum(run_lengths[i]);
+
+						/* Build and return the result tuple */
+						tuple = heap_form_tuple(tupdesc, values, nulls);
+
+						tuplestore_puttuple(tupstore, tuple);
+					}
+				}
+
+				range_nblocks = 0;
+			}
+
+			while (range_start + range_len <= blkno)
+				range_start += range_len;
+		}
+
+		/* Read and lock buffer */
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
+		LockBuffer(buffer, BUFFER_LOCK_SHARE);
+
+		page = BufferGetPage(buffer);
+		opaque = BTPageGetOpaque(page);
+
+		/*
+		 * Ignore deleted/dead pages, and internal (non-leaf) pages.
+		 */
+		if (!P_ISDELETED(opaque) && !P_IGNORE(opaque) && P_ISLEAF(opaque))
+		{
+			OffsetNumber	offset = FirstOffsetNumber;
+			OffsetNumber	maxoffset = PageGetMaxOffsetNumber(page);
+			bool			rightmost = P_RIGHTMOST(opaque);
+
+			while (offset < maxoffset)
+			{
+				ItemId		id = PageGetItemId(page, offset);
+				IndexTuple	itup = (IndexTuple) PageGetItem(page, id);
+				BlockNumber	block = ItemPointerGetBlockNumber(&itup->t_tid);
+				bool		ispivot = (!rightmost && offset == P_HIKEY);
+
+				Assert(range_nblocks >= 0);
+				Assert(range_nblocks < range_len * MaxTIDsPerBTreePage);
+
+				/* ignore pivot tuples */
+				if (!ispivot)
+					range_blocks[range_nblocks++] = block;
+
+				offset++;
+			}
+		}
+
+		/* Unlock and release buffer */
+		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+		ReleaseBuffer(buffer);
+	}
+
+	/* output the last range */
+	if (range_nblocks > 0)
+	{
+		HeapTuple	tuple;
+
+		memset(run_lengths, 0, sizeof(int) * (range_len * MaxTIDsPerBTreePage + 1));
+		count_run_lengths(range_nblocks, range_blocks, run_lengths);
+
+		for (int i = 1; i <= (range_len * MaxTIDsPerBTreePage); i++)
+		{
+			if (run_lengths[i] > 0)
+			{
+				values[0] = Int64GetDatum(range_start);
+				values[1] = Int32GetDatum(i);
+				values[2] = Int32GetDatum(run_lengths[i]);
+
+				/* Build and return the result tuple */
+				tuple = heap_form_tuple(tupdesc, values, nulls);
+
+				tuplestore_puttuple(tupstore, tuple);
+			}
+		}
+
+		range_nblocks = 0;
+	}
+
+	relation_close(rel, AccessShareLock);
+
+	PG_RETURN_NULL();
+}
diff --git a/contrib/pgstattuple/pgstattuple--1.5--1.6.sql b/contrib/pgstattuple/pgstattuple--1.5--1.6.sql
new file mode 100644
index 00000000000..c6a450a301b
--- /dev/null
+++ b/contrib/pgstattuple/pgstattuple--1.5--1.6.sql
@@ -0,0 +1,30 @@
+/* contrib/pgstattuple/pgstattuple--1.5--1.6.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pgstattuple UPDATE TO '1.6'" to load this file. \quit
+
+CREATE OR REPLACE FUNCTION pgstatindex_nheap(IN relname text,
+    IN blocks BIGINT,			-- length of ranges to analyze
+    OUT block BIGINT,			-- first block of a range
+    OUT num_leafs BIGINT,		-- number of leaf pages
+    OUT num_items BIGINT,		-- number of heap TIDs
+    OUT num_blocks BIGINT,		-- number of distinct blocks
+    OUT num_runs BIGINT)		-- number of continuous runs
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'pgstatindex_nheap_v1_6'
+LANGUAGE C STRICT PARALLEL SAFE;
+
+REVOKE EXECUTE ON FUNCTION pgstatindex_nheap(text, bigint) FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pgstatindex_nheap(text, bigint) TO pg_stat_scan_tables;
+
+CREATE OR REPLACE FUNCTION pgstatindex_runs(IN relname text,
+    IN blocks BIGINT,			-- length of ranges to analyze
+    OUT block BIGINT,			-- first block of a range
+    OUT run_length INT,			-- number of leaf pages
+    OUT run_count INT)		-- number of heap TIDs
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'pgstatindex_runs_v1_6'
+LANGUAGE C STRICT PARALLEL SAFE;
+
+REVOKE EXECUTE ON FUNCTION pgstatindex_runs(text, bigint) FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pgstatindex_runs(text, bigint) TO pg_stat_scan_tables;
diff --git a/contrib/pgstattuple/pgstattuple.control b/contrib/pgstattuple/pgstattuple.control
index 6af40757b27..80d06958e90 100644
--- a/contrib/pgstattuple/pgstattuple.control
+++ b/contrib/pgstattuple/pgstattuple.control
@@ -1,5 +1,5 @@
 # pgstattuple extension
 comment = 'show tuple-level statistics'
-default_version = '1.5'
+default_version = '1.6'
 module_pathname = '$libdir/pgstattuple'
 relocatable = true
-- 
2.50.1

Re: index prefetching

Reply via email to