Here's a v5 of the patch, rebased to current master and fixing a couple
compiler warnings reported by cfbot (%lu vs. UINT64_FORMAT in some debug
messages). No other changes compared to v4.
cfbot also reported a failure on Windows in pg_dump [1], but it seems
pretty strange:
[11:42:48.708] ------------------------------------- 8< -------------------------------------
[11:42:48.708] stderr:
[11:42:48.708] # Failed test 'connecting to an invalid database: matches'
The patch does nothing related to pg_dump, and the test works perfectly
fine for me (I don't have a Windows machine, but both 32-bit and 64-bit
Linux work fine).
regards
[1] https://cirrus-ci.com/task/6398095366291456
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c
index e2c9b5f069..9045c6eb7a 100644
--- a/src/backend/access/gist/gistget.c
+++ b/src/backend/access/gist/gistget.c
@@ -678,7 +678,6 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
scan->xs_hitup = so->pageData[so->curPageData].recontup;
so->curPageData++;
-
return true;
}
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 5a17112c91..0b6c920ebd 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -44,6 +44,7 @@
#include "storage/smgr.h"
#include "utils/builtins.h"
#include "utils/rel.h"
+#include "utils/spccache.h"
static void reform_and_rewrite_tuple(HeapTuple tuple,
Relation OldHeap, Relation NewHeap,
@@ -751,6 +752,9 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
PROGRESS_CLUSTER_INDEX_RELID
};
int64 ci_val[2];
+ int prefetch_target;
+
+ prefetch_target = get_tablespace_io_concurrency(OldHeap->rd_rel->reltablespace);
/* Set phase and OIDOldIndex to columns */
ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP;
@@ -759,7 +763,8 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
tableScan = NULL;
heapScan = NULL;
- indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
+ indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0,
+ prefetch_target, prefetch_target);
index_rescan(indexScan, NULL, 0, NULL, 0);
}
else
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 722927aeba..264ebe1d8e 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -126,6 +126,9 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
scan->xs_hitup = NULL;
scan->xs_hitupdesc = NULL;
+ /* set in each AM when applicable */
+ scan->xs_prefetch = NULL;
+
return scan;
}
@@ -440,8 +443,9 @@ systable_beginscan(Relation heapRelation,
elog(ERROR, "column is not in index");
}
+ /* no index prefetch for system catalogs */
sysscan->iscan = index_beginscan(heapRelation, irel,
- snapshot, nkeys, 0);
+ snapshot, nkeys, 0, 0, 0);
index_rescan(sysscan->iscan, key, nkeys, NULL, 0);
sysscan->scan = NULL;
}
@@ -696,8 +700,9 @@ systable_beginscan_ordered(Relation heapRelation,
elog(ERROR, "column is not in index");
}
+ /* no index prefetch for system catalogs */
sysscan->iscan = index_beginscan(heapRelation, indexRelation,
- snapshot, nkeys, 0);
+ snapshot, nkeys, 0, 0, 0);
index_rescan(sysscan->iscan, key, nkeys, NULL, 0);
sysscan->scan = NULL;
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index b25b03f7ab..0b8f136f04 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -54,11 +54,13 @@
#include "catalog/pg_amproc.h"
#include "catalog/pg_type.h"
#include "commands/defrem.h"
+#include "common/hashfn.h"
#include "nodes/makefuncs.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
+#include "utils/lsyscache.h"
#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
@@ -106,7 +108,10 @@ do { \
static IndexScanDesc index_beginscan_internal(Relation indexRelation,
int nkeys, int norderbys, Snapshot snapshot,
- ParallelIndexScanDesc pscan, bool temp_snap);
+ ParallelIndexScanDesc pscan, bool temp_snap,
+ int prefetch_target, int prefetch_reset);
+
+static void index_prefetch(IndexScanDesc scan, ItemPointer tid);
/* ----------------------------------------------------------------
@@ -200,18 +205,36 @@ index_insert(Relation indexRelation,
* index_beginscan - start a scan of an index with amgettuple
*
* Caller must be holding suitable locks on the heap and the index.
+ *
+ * prefetch_target determines whether prefetching is requested for this index
+ * scan. We need to be able to disable it for two reasons. Firstly, we don't
+ * want to do prefetching for IOS (where we hope most of the heap pages won't
+ * really be needed). Secondly, we must prevent an infinite loop when determining
+ * the prefetch value for the tablespace - get_tablespace_io_concurrency()
+ * does an index scan internally, which would result in an infinite loop. So
+ * we simply disable prefetching in systable_beginscan().
+ *
+ * XXX Maybe we should do prefetching even for catalogs, but then disable it
+ * when accessing TableSpaceRelationId. We still need the ability to disable
+ * this and catalogs are expected to be tiny, so prefetching is unlikely to
+ * make a difference.
+ *
+ * XXX The second reason doesn't really apply after the effective_io_concurrency
+ * lookup moved to the caller of index_beginscan.
*/
IndexScanDesc
index_beginscan(Relation heapRelation,
Relation indexRelation,
Snapshot snapshot,
- int nkeys, int norderbys)
+ int nkeys, int norderbys,
+ int prefetch_target, int prefetch_reset)
{
IndexScanDesc scan;
Assert(snapshot != InvalidSnapshot);
- scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot, NULL, false);
+ scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot, NULL, false,
+ prefetch_target, prefetch_reset);
/*
* Save additional parameters into the scandesc. Everything else was set
@@ -241,7 +264,8 @@ index_beginscan_bitmap(Relation indexRelation,
Assert(snapshot != InvalidSnapshot);
- scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot, NULL, false);
+ scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot, NULL, false,
+ 0, 0); /* no prefetch */
/*
* Save additional parameters into the scandesc. Everything else was set
@@ -258,7 +282,8 @@ index_beginscan_bitmap(Relation indexRelation,
static IndexScanDesc
index_beginscan_internal(Relation indexRelation,
int nkeys, int norderbys, Snapshot snapshot,
- ParallelIndexScanDesc pscan, bool temp_snap)
+ ParallelIndexScanDesc pscan, bool temp_snap,
+ int prefetch_target, int prefetch_reset)
{
IndexScanDesc scan;
@@ -276,12 +301,27 @@ index_beginscan_internal(Relation indexRelation,
/*
* Tell the AM to open a scan.
*/
- scan = indexRelation->rd_indam->ambeginscan(indexRelation, nkeys,
- norderbys);
+ scan = indexRelation->rd_indam->ambeginscan(indexRelation, nkeys, norderbys);
/* Initialize information for parallel scan. */
scan->parallel_scan = pscan;
scan->xs_temp_snap = temp_snap;
+ /* with prefetching enabled, initialize the necessary state */
+ if (prefetch_target > 0)
+ {
+ IndexPrefetch prefetcher = palloc0(sizeof(IndexPrefetchData));
+
+ prefetcher->queueIndex = 0;
+ prefetcher->queueStart = 0;
+ prefetcher->queueEnd = 0;
+
+ prefetcher->prefetchTarget = 0;
+ prefetcher->prefetchMaxTarget = prefetch_target;
+ prefetcher->prefetchReset = prefetch_reset;
+
+ scan->xs_prefetch = prefetcher;
+ }
+
return scan;
}
@@ -317,6 +357,20 @@ index_rescan(IndexScanDesc scan,
scan->indexRelation->rd_indam->amrescan(scan, keys, nkeys,
orderbys, norderbys);
+
+ /* If we're prefetching for this index, maybe reset some of the state. */
+ if (scan->xs_prefetch != NULL)
+ {
+ IndexPrefetch prefetcher = scan->xs_prefetch;
+
+ prefetcher->queueStart = 0;
+ prefetcher->queueEnd = 0;
+ prefetcher->queueIndex = 0;
+ prefetcher->prefetchDone = false;
+
+ prefetcher->prefetchTarget = Min(prefetcher->prefetchTarget,
+ prefetcher->prefetchReset);
+ }
}
/* ----------------
@@ -345,6 +399,19 @@ index_endscan(IndexScanDesc scan)
if (scan->xs_temp_snap)
UnregisterSnapshot(scan->xs_snapshot);
+ /* If prefetching enabled, log prefetch stats. */
+ if (scan->xs_prefetch)
+ {
+ IndexPrefetch prefetch = scan->xs_prefetch;
+
+ elog(LOG, "index prefetch stats: requests " UINT64_FORMAT " prefetches " UINT64_FORMAT " (%f) skip cached " UINT64_FORMAT " sequential " UINT64_FORMAT,
+ prefetch->countAll,
+ prefetch->countPrefetch,
+ prefetch->countPrefetch * 100.0 / prefetch->countAll,
+ prefetch->countSkipCached,
+ prefetch->countSkipSequential);
+ }
+
/* Release the scan data structure itself */
IndexScanEnd(scan);
}
@@ -487,10 +554,13 @@ index_parallelrescan(IndexScanDesc scan)
* index_beginscan_parallel - join parallel index scan
*
* Caller must be holding suitable locks on the heap and the index.
+ *
+ * XXX See index_beginscan() for more comments on prefetch_target.
*/
IndexScanDesc
index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys,
- int norderbys, ParallelIndexScanDesc pscan)
+ int norderbys, ParallelIndexScanDesc pscan,
+ int prefetch_target, int prefetch_reset)
{
Snapshot snapshot;
IndexScanDesc scan;
@@ -499,7 +569,7 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys,
snapshot = RestoreSnapshot(pscan->ps_snapshot_data);
RegisterSnapshot(snapshot);
scan = index_beginscan_internal(indexrel, nkeys, norderbys, snapshot,
- pscan, true);
+ pscan, true, prefetch_target, prefetch_reset);
/*
* Save additional parameters into the scandesc. Everything else was set
@@ -623,20 +693,74 @@ index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot)
bool
index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot *slot)
{
+ IndexPrefetch prefetch = scan->xs_prefetch;
+
for (;;)
{
+ /* with prefetching enabled, accumulate enough TIDs into the prefetch queue */
+ if (PREFETCH_ACTIVE(prefetch))
+ {
+ /*
+ * incrementally ramp up prefetch distance
+ *
+ * XXX Intentionally done first, so that with prefetching there's
+ * always at least one item in the queue.
+ */
+ prefetch->prefetchTarget = Min(prefetch->prefetchTarget + 1,
+ prefetch->prefetchMaxTarget);
+
+ /*
+ * get more TIDs while there is empty space in the queue (considering
+ * the current prefetch target)
+ */
+ while (!PREFETCH_FULL(prefetch))
+ {
+ ItemPointer tid;
+
+ /* Time to fetch the next TID from the index */
+ tid = index_getnext_tid(scan, direction);
+
+ /* If we're out of index entries, we're done */
+ if (tid == NULL)
+ {
+ prefetch->prefetchDone = true;
+ break;
+ }
+
+ Assert(ItemPointerEquals(tid, &scan->xs_heaptid));
+
+ prefetch->queueItems[PREFETCH_QUEUE_INDEX(prefetch->queueEnd)] = *tid;
+ prefetch->queueEnd++;
+
+ index_prefetch(scan, tid);
+ }
+ }
+
if (!scan->xs_heap_continue)
{
- ItemPointer tid;
+ if (PREFETCH_ENABLED(prefetch))
+ {
+ /* prefetching enabled, but reached the end and queue empty */
+ if (PREFETCH_DONE(prefetch))
+ break;
+
+ scan->xs_heaptid = prefetch->queueItems[PREFETCH_QUEUE_INDEX(prefetch->queueIndex)];
+ prefetch->queueIndex++;
+ }
+ else /* not prefetching, just do the regular work */
+ {
+ ItemPointer tid;
- /* Time to fetch the next TID from the index */
- tid = index_getnext_tid(scan, direction);
+ /* Time to fetch the next TID from the index */
+ tid = index_getnext_tid(scan, direction);
- /* If we're out of index entries, we're done */
- if (tid == NULL)
- break;
+ /* If we're out of index entries, we're done */
+ if (tid == NULL)
+ break;
+
+ Assert(ItemPointerEquals(tid, &scan->xs_heaptid));
+ }
- Assert(ItemPointerEquals(tid, &scan->xs_heaptid));
}
/*
@@ -988,3 +1112,258 @@ index_opclass_options(Relation indrel, AttrNumber attnum, Datum attoptions,
return build_local_reloptions(&relopts, attoptions, validate);
}
+
+/*
+ * Add the block to the tiny top-level queue (LRU), and check if the block
+ * is in a sequential pattern.
+ */
+static bool
+index_prefetch_is_sequential(IndexPrefetch prefetch, BlockNumber block)
+{
+ int idx;
+
+ /* If the queue is empty, just store the block and we're done. */
+ if (prefetch->blockIndex == 0)
+ {
+ prefetch->blockItems[PREFETCH_BLOCK_INDEX(prefetch->blockIndex)] = block;
+ prefetch->blockIndex++;
+ return false;
+ }
+
+ /*
+ * Otherwise, check if it's the same as the immediately preceding block (we
+ * don't want to prefetch the same block over and over).
+ */
+ if (prefetch->blockItems[PREFETCH_BLOCK_INDEX(prefetch->blockIndex - 1)] == block)
+ return true;
+
+ /* Not the same block, so add it to the queue. */
+ prefetch->blockItems[PREFETCH_BLOCK_INDEX(prefetch->blockIndex)] = block;
+ prefetch->blockIndex++;
+
+ /* check for a sequential pattern going a couple of requests back */
+ for (int i = 1; i < PREFETCH_SEQ_PATTERN_BLOCKS; i++)
+ {
+ /* not enough requests to confirm a sequential pattern */
+ if (prefetch->blockIndex < i)
+ return false;
+
+ /*
+ * index of the already requested buffer (-1 because we already
+ * incremented the index when adding the block to the queue)
+ */
+ idx = PREFETCH_BLOCK_INDEX(prefetch->blockIndex - i - 1);
+
+ /* blocks don't form a consecutive sequence, so not a sequential pattern */
+ if (prefetch->blockItems[idx] != (block - i))
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * index_prefetch_add_cache
+ * Add a block to the cache, return true if it was recently prefetched.
+ *
+ * When checking a block, we need to check if it was recently prefetched,
+ * where recently means within PREFETCH_CACHE_SIZE requests. This check
+ * needs to be very cheap, even with fairly large caches (hundreds of
+ * entries). The cache does not need to be perfect, we can accept false
+ * positives/negatives, as long as the rate is reasonably low. We also
+ * need to expire entries, so that only "recent" requests are remembered.
+ *
+ * A queue would allow expiring the requests, but checking if a block was
+ * prefetched would be expensive (linear search for longer queues). Another
+ * option would be a hash table, but that has issues with expiring entries
+ * cheaply (which usually degrades the hash table).
+ *
+ * So we use a cache that is organized as multiple small LRU caches. Each
+ * block is mapped to a particular LRU by hashing (so it's a bit like a
+ * hash table), and each LRU is tiny (e.g. 8 entries). The LRU only keeps
+ * the most recent requests (for that particular LRU).
+ *
+ * This allows quick searches and expiration, with false negatives (when
+ * a particular LRU has too many collisions).
+ *
+ * For example, imagine 128 LRU caches, each with 8 entries - that's 1024
+ * prefetch requests in total.
+ *
+ * The recency is determined using a prefetch counter, incremented every
+ * time we end up prefetching a block. The counter is uint64, so it should
+ * not wrap (125 zebibytes, would take ~4 million years at 1GB/s).
+ *
+ * To check if a block was prefetched recently, we calculate hash(block),
+ * and then linearly search the tiny LRU for an entry with the same block
+ * and a request less than PREFETCH_CACHE_SIZE requests ago.
+ *
+ * At the same time, we either update the entry (for the same block) if
+ * found, or replace the oldest/empty entry.
+ *
+ * If the block was not recently prefetched (i.e. we want to prefetch it),
+ * we increment the counter.
+ */
+static bool
+index_prefetch_add_cache(IndexPrefetch prefetch, BlockNumber block)
+{
+ PrefetchCacheEntry *entry;
+
+ /* calculate which LRU to use */
+ int lru = hash_uint32(block) % PREFETCH_LRU_COUNT;
+
+ /* entry to (maybe) use for this block request */
+ uint64 oldestRequest = PG_UINT64_MAX;
+ int oldestIndex = -1;
+
+ /*
+ * First add the block to the (tiny) top-level LRU cache and see if it's
+ * part of a sequential pattern. In this case we just ignore the block
+ * and don't prefetch it - we expect read-ahead to do a better job.
+ *
+ * XXX Maybe we should still add the block to the LRU cache below, in case
+ * we happen to access it later? That might help if we first scan a lot
+ * of the table sequentially, and then randomly. Not sure that's very
+ * likely with index access, though.
+ */
+ if (index_prefetch_is_sequential(prefetch, block))
+ {
+ prefetch->countSkipSequential++;
+ return true;
+ }
+
+ /* see if we already have prefetched this block (linear search of LRU) */
+ for (int i = 0; i < PREFETCH_LRU_SIZE; i++)
+ {
+ entry = &prefetch->prefetchCache[lru * PREFETCH_LRU_SIZE + i];
+
+ /* Is this the oldest prefetch request in this LRU? */
+ if (entry->request < oldestRequest)
+ {
+ oldestRequest = entry->request;
+ oldestIndex = i;
+ }
+
+ /* Request numbers are positive, so 0 means "unused". */
+ if (entry->request == 0)
+ continue;
+
+ /* Is this entry for the same block as the current request? */
+ if (entry->block == block)
+ {
+ bool prefetched;
+
+ /*
+ * Is the old request sufficiently recent? If yes, we treat the
+ * block as already prefetched.
+ *
+ * XXX We add the cache size to the request number in order to
+ * avoid issues with uint64 underflow.
+ */
+ prefetched = (entry->request + PREFETCH_CACHE_SIZE >= prefetch->prefetchReqNumber);
+
+ /* Update the request number. */
+ entry->request = ++prefetch->prefetchReqNumber;
+
+ prefetch->countSkipCached += (prefetched) ? 1 : 0;
+
+ return prefetched;
+ }
+ }
+
+ /*
+ * We didn't find the block in the LRU, so store it either in an empty
+ * entry, or in the "oldest" prefetch request in this LRU.
+ */
+ Assert((oldestIndex >= 0) && (oldestIndex < PREFETCH_LRU_SIZE));
+
+ entry = &prefetch->prefetchCache[lru * PREFETCH_LRU_SIZE + oldestIndex];
+
+ entry->block = block;
+ entry->request = ++prefetch->prefetchReqNumber;
+
+ /* not in the prefetch cache */
+ return false;
+}
+
+/*
+ * Do prefetching, and gradually increase the prefetch distance.
+ *
+ * XXX This is limited to a single index page (because that's where we get
+ * currPos.items from). But index tuples are typically very small, so there
+ * should be quite a bit of stuff to prefetch (especially with deduplicated
+ * indexes, etc.). It does not seem worth reworking the index access to allow
+ * more aggressive prefetching; it's best effort.
+ *
+ * XXX Some ideas how to auto-tune the prefetching, so that unnecessary
+ * prefetching does not cause significant regressions (e.g. for nestloop
+ * with inner index scan). We could track number of index pages visited
+ * and index tuples returned, to calculate avg tuples / page, and then
+ * use that to limit prefetching after switching to a new page (instead
+ * of just using prefetchMaxTarget, which can get much larger).
+ *
+ * XXX Obviously, another option is to use the planner estimates - we know
+ * how many rows we're expected to fetch (on average, assuming the estimates
+ * are reasonably accurate), so why not use that. And maybe combine it
+ * with the auto-tuning based on runtime statistics, described above.
+ *
+ * XXX The prefetching may interfere with the patch allowing us to evaluate
+ * conditions on the index tuple, in which case we may not need the heap
+ * tuple. Maybe if there's such filter, we should prefetch only pages that
+ * are not all-visible (and the same idea would also work for IOS), but
+ * it also makes the indexing a bit "aware" of the visibility stuff (which
+ * seems a bit wrong). Also, maybe we should consider the filter selectivity
+ * (if the index-only filter is expected to eliminate only a few rows, then
+ * the vm check is pointless). Maybe this could/should be auto-tuning too,
+ * i.e. we could track how many heap tuples were needed after all, and then
+ * we would consider this when deciding whether to prefetch all-visible
+ * pages or not (matters only for regular index scans, not IOS).
+ *
+ * XXX Maybe we could/should also prefetch the next index block, e.g. stored
+ * in BTScanPosData.nextPage.
+ */
+static void
+index_prefetch(IndexScanDesc scan, ItemPointer tid)
+{
+ IndexPrefetch prefetch = scan->xs_prefetch;
+ BlockNumber block;
+
+ /*
+ * No heap relation means bitmap index scan, which does prefetching at
+ * the bitmap heap scan, so no prefetch here (we can't do it anyway,
+ * without the heap).
+ *
+ * XXX But in this case we should have prefetchMaxTarget=0, because in
+ * index_beginscan_bitmap() we disable prefetching. So maybe we should
+ * just check that.
+ */
+ if (!prefetch)
+ return;
+
+ /* was it initialized correctly? */
+ // Assert(prefetch->prefetchIndex != -1);
+
+ /*
+ * If we got here, prefetching is enabled and it's a node that supports
+ * prefetching (i.e. it can't be a bitmap index scan).
+ */
+ Assert(scan->heapRelation);
+
+ prefetch->countAll++;
+
+ block = ItemPointerGetBlockNumber(tid);
+
+ /*
+ * Do not prefetch the same block over and over again.
+ *
+ * This happens e.g. for clustered or naturally correlated indexes
+ * (fkey to a sequence ID). It's not expensive (the block is in page
+ * cache already, so no I/O), but it's not free either.
+ */
+ if (!index_prefetch_add_cache(prefetch, block))
+ {
+ prefetch->countPrefetch++;
+
+ PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, block);
+ pgBufferUsage.blks_prefetches++;
+ }
+}
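As an aside, the cache maintained by index_prefetch_add_cache() above is just
an array of tiny LRUs indexed by a hash of the block number. Here is a
stripped-down, self-contained sketch of only the lookup/insert logic, not part
of the patch: the constants mirror the patch, but the Entry type, the function
name and the trivial multiplicative hash are invented for this example (the
patch uses hash_uint32() and also tracks the skip/prefetch counters).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LRU_SIZE    8                        /* entries per small LRU */
#define LRU_COUNT   128                      /* number of small LRUs */
#define CACHE_SIZE  (LRU_SIZE * LRU_COUNT)   /* "recent" horizon, in requests */

typedef struct { uint32_t block; uint64_t request; } Entry;

static Entry    cache[LRU_COUNT][LRU_SIZE];  /* request == 0 means unused */
static uint64_t req_counter;                 /* monotonically increasing */

/* Return true if 'block' was requested within the last CACHE_SIZE requests,
 * otherwise remember it (reusing the oldest or an empty slot) and return false. */
static bool
recently_prefetched(uint32_t block)
{
    Entry   *lru = cache[(block * 2654435761u) % LRU_COUNT];   /* toy hash */
    int      oldest = 0;

    for (int i = 0; i < LRU_SIZE; i++)
    {
        if (lru[i].request < lru[oldest].request)
            oldest = i;

        if (lru[i].request != 0 && lru[i].block == block)
        {
            bool    recent = (lru[i].request + CACHE_SIZE >= req_counter);

            lru[i].request = ++req_counter;     /* refresh recency */
            return recent;
        }
    }

    lru[oldest].block = block;                  /* not seen recently: remember it */
    lru[oldest].request = ++req_counter;
    return false;
}

int
main(void)
{
    printf("%d\n", recently_prefetched(42));    /* 0 - first request */
    printf("%d\n", recently_prefetched(42));    /* 1 - prefetched recently */
    return 0;
}

With 128 LRUs of 8 entries each, a lookup touches at most 8 slots no matter
how many blocks were prefetched before, and expiry falls out of the monotonic
request counter for free.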
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 8570b14f62..6ae445d62c 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -3558,6 +3558,7 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
!INSTR_TIME_IS_ZERO(usage->blk_write_time));
bool has_temp_timing = (!INSTR_TIME_IS_ZERO(usage->temp_blk_read_time) ||
!INSTR_TIME_IS_ZERO(usage->temp_blk_write_time));
+ bool has_prefetches = (usage->blks_prefetches > 0);
bool show_planning = (planning && (has_shared ||
has_local || has_temp || has_timing ||
has_temp_timing));
@@ -3655,6 +3656,23 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
appendStringInfoChar(es->str, '\n');
}
+ /* As above, show only positive counter values. */
+ if (has_prefetches)
+ {
+ ExplainIndentText(es);
+ appendStringInfoString(es->str, "Prefetches:");
+
+ if (usage->blks_prefetches > 0)
+ appendStringInfo(es->str, " blocks=%lld",
+ (long long) usage->blks_prefetches);
+
+ if (usage->blks_prefetch_rounds > 0)
+ appendStringInfo(es->str, " rounds=%lld",
+ (long long) usage->blks_prefetch_rounds);
+
+ appendStringInfoChar(es->str, '\n');
+ }
+
if (show_planning)
es->indent--;
}
diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c
index 1d82b64b89..e5ce1dbc95 100644
--- a/src/backend/executor/execIndexing.c
+++ b/src/backend/executor/execIndexing.c
@@ -765,11 +765,15 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index,
/*
* May have to restart scan from this point if a potential conflict is
* found.
+ *
+ * XXX Should this do index prefetch? Probably not worth it for unique
+ * constraints, I guess? Otherwise we should calculate prefetch_target
+ * just like in nodeIndexscan etc.
*/
retry:
conflict = false;
found_self = false;
- index_scan = index_beginscan(heap, index, &DirtySnapshot, indnkeyatts, 0);
+ index_scan = index_beginscan(heap, index, &DirtySnapshot, indnkeyatts, 0, 0, 0);
index_rescan(index_scan, scankeys, indnkeyatts, NULL, 0);
while (index_getnext_slot(index_scan, ForwardScanDirection, existing_slot))
diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c
index e776524227..c0bb732658 100644
--- a/src/backend/executor/execReplication.c
+++ b/src/backend/executor/execReplication.c
@@ -204,8 +204,13 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid,
/* Build scan key. */
skey_attoff = build_replindex_scan_key(skey, rel, idxrel, searchslot);
- /* Start an index scan. */
- scan = index_beginscan(rel, idxrel, &snap, skey_attoff, 0);
+ /* Start an index scan.
+ *
+ * XXX Should this do index prefetching? We're looking for a single tuple,
+ * probably using a PK / UNIQUE index, so it does not seem worth it. If we
+ * reconsider this, calculate prefetch_target like in nodeIndexscan.
+ */
+ scan = index_beginscan(rel, idxrel, &snap, skey_attoff, 0, 0, 0);
retry:
found = false;
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index ee78a5749d..434be59fca 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -235,6 +235,8 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
dst->local_blks_written += add->local_blks_written;
dst->temp_blks_read += add->temp_blks_read;
dst->temp_blks_written += add->temp_blks_written;
+ dst->blks_prefetch_rounds += add->blks_prefetch_rounds;
+ dst->blks_prefetches += add->blks_prefetches;
INSTR_TIME_ADD(dst->blk_read_time, add->blk_read_time);
INSTR_TIME_ADD(dst->blk_write_time, add->blk_write_time);
INSTR_TIME_ADD(dst->temp_blk_read_time, add->temp_blk_read_time);
@@ -257,6 +259,8 @@ BufferUsageAccumDiff(BufferUsage *dst,
dst->local_blks_written += add->local_blks_written - sub->local_blks_written;
dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read;
dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written;
+ dst->blks_prefetches += add->blks_prefetches - sub->blks_prefetches;
+ dst->blks_prefetch_rounds += add->blks_prefetch_rounds - sub->blks_prefetch_rounds;
INSTR_TIME_ACCUM_DIFF(dst->blk_read_time,
add->blk_read_time, sub->blk_read_time);
INSTR_TIME_ACCUM_DIFF(dst->blk_write_time,
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 0b43a9b969..3ecb8470d4 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -87,12 +87,20 @@ IndexOnlyNext(IndexOnlyScanState *node)
* We reach here if the index only scan is not parallel, or if we're
* serially executing an index only scan that was planned to be
* parallel.
+ *
+ * XXX Maybe we should enable prefetching, but prefetch only pages that
+ * are not all-visible (but checking that from the index code seems like
+ * a violation of layering etc).
+ *
+ * XXX This might lead to IOS being slower than plain index scan, if the
+ * table has a lot of pages that need recheck.
*/
scandesc = index_beginscan(node->ss.ss_currentRelation,
node->ioss_RelationDesc,
estate->es_snapshot,
node->ioss_NumScanKeys,
- node->ioss_NumOrderByKeys);
+ node->ioss_NumOrderByKeys,
+ 0, 0); /* no index prefetch for IOS */
node->ioss_ScanDesc = scandesc;
@@ -674,7 +682,8 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node,
node->ioss_RelationDesc,
node->ioss_NumScanKeys,
node->ioss_NumOrderByKeys,
- piscan);
+ piscan,
+ 0, 0); /* no index prefetch for IOS */
node->ioss_ScanDesc->xs_want_itup = true;
node->ioss_VMBuffer = InvalidBuffer;
@@ -719,7 +728,8 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node,
node->ioss_RelationDesc,
node->ioss_NumScanKeys,
node->ioss_NumOrderByKeys,
- piscan);
+ piscan,
+ 0, 0); /* no index prefetch for IOS */
node->ioss_ScanDesc->xs_want_itup = true;
/*
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index 4540c7781d..71ae6a47ce 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -43,6 +43,7 @@
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/rel.h"
+#include "utils/spccache.h"
/*
* When an ordering operator is used, tuples fetched from the index that
@@ -85,6 +86,7 @@ IndexNext(IndexScanState *node)
ScanDirection direction;
IndexScanDesc scandesc;
TupleTableSlot *slot;
+ Relation heapRel = node->ss.ss_currentRelation;
/*
* extract necessary information from index scan node
@@ -103,6 +105,22 @@ IndexNext(IndexScanState *node)
if (scandesc == NULL)
{
+ int prefetch_target;
+ int prefetch_reset;
+
+ /*
+ * Determine number of heap pages to prefetch for this index. This is
+ * essentially just effective_io_concurrency for the table (or the
+ * tablespace it's in).
+ *
+ * XXX Should this also look at plan.plan_rows and maybe cap the target
+ * to that? Pointless to prefetch more than we expect to use. Or maybe
+ * just reset to that value during prefetching, after reading the next
+ * index page (or rather after rescan)?
+ */
+ prefetch_target = get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace);
+ prefetch_reset = Min(prefetch_target, node->ss.ps.plan->plan_rows);
+
/*
* We reach here if the index scan is not parallel, or if we're
* serially executing an index scan that was planned to be parallel.
@@ -111,7 +129,9 @@ IndexNext(IndexScanState *node)
node->iss_RelationDesc,
estate->es_snapshot,
node->iss_NumScanKeys,
- node->iss_NumOrderByKeys);
+ node->iss_NumOrderByKeys,
+ prefetch_target,
+ prefetch_reset);
node->iss_ScanDesc = scandesc;
@@ -198,6 +218,23 @@ IndexNextWithReorder(IndexScanState *node)
if (scandesc == NULL)
{
+ Relation heapRel = node->ss.ss_currentRelation;
+ int prefetch_target;
+ int prefetch_reset;
+
+ /*
+ * Determine number of heap pages to prefetch for this index. This is
+ * essentially just effective_io_concurrency for the table (or the
+ * tablespace it's in).
+ *
+ * XXX Should this also look at plan.plan_rows and maybe cap the target
+ * to that? Pointless to prefetch more than we expect to use. Or maybe
+ * just reset to that value during prefetching, after reading the next
+ * index page (or rather after rescan)?
+ */
+ prefetch_target = get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace);
+ prefetch_reset = Min(prefetch_target, node->ss.ps.plan->plan_rows);
+
/*
* We reach here if the index scan is not parallel, or if we're
* serially executing an index scan that was planned to be parallel.
@@ -206,7 +243,9 @@ IndexNextWithReorder(IndexScanState *node)
node->iss_RelationDesc,
estate->es_snapshot,
node->iss_NumScanKeys,
- node->iss_NumOrderByKeys);
+ node->iss_NumOrderByKeys,
+ prefetch_target,
+ prefetch_reset);
node->iss_ScanDesc = scandesc;
@@ -1678,6 +1717,21 @@ ExecIndexScanInitializeDSM(IndexScanState *node,
{
EState *estate = node->ss.ps.state;
ParallelIndexScanDesc piscan;
+ Relation heapRel;
+ int prefetch_target;
+ int prefetch_reset;
+
+ /*
+ * Determine number of heap pages to prefetch for this index. This is
+ * essentially just effective_io_concurrency for the table (or the
+ * tablespace it's in).
+ *
+ * XXX Maybe reduce the value with parallel workers?
+ */
+ heapRel = node->ss.ss_currentRelation;
+
+ prefetch_target = get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace);
+ prefetch_reset = Min(prefetch_target, node->ss.ps.plan->plan_rows);
piscan = shm_toc_allocate(pcxt->toc, node->iss_PscanLen);
index_parallelscan_initialize(node->ss.ss_currentRelation,
@@ -1690,7 +1744,9 @@ ExecIndexScanInitializeDSM(IndexScanState *node,
node->iss_RelationDesc,
node->iss_NumScanKeys,
node->iss_NumOrderByKeys,
- piscan);
+ piscan,
+ prefetch_target,
+ prefetch_reset);
/*
* If no run-time keys to calculate or they are ready, go ahead and pass
@@ -1726,6 +1782,14 @@ ExecIndexScanInitializeWorker(IndexScanState *node,
ParallelWorkerContext *pwcxt)
{
ParallelIndexScanDesc piscan;
+ Relation heapRel;
+ int prefetch_target;
+ int prefetch_reset;
+
+ heapRel = node->ss.ss_currentRelation;
+
+ prefetch_target = get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace);
+ prefetch_reset = Min(prefetch_target, node->ss.ps.plan->plan_rows);
piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
node->iss_ScanDesc =
@@ -1733,7 +1797,9 @@ ExecIndexScanInitializeWorker(IndexScanState *node,
node->iss_RelationDesc,
node->iss_NumScanKeys,
node->iss_NumOrderByKeys,
- piscan);
+ piscan,
+ prefetch_target,
+ prefetch_reset);
/*
* If no run-time keys to calculate or they are ready, go ahead and pass
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index d27ef2985d..d65575fd10 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -1131,6 +1131,8 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd)
need_full_snapshot = true;
}
+ elog(LOG, "slot = %s need_full_snapshot = %d", cmd->slotname, need_full_snapshot);
+
ctx = CreateInitDecodingContext(cmd->plugin, NIL, need_full_snapshot,
InvalidXLogRecPtr,
XL_ROUTINE(.page_read = logical_read_xlog_page,
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index c4fcd0076e..0b02b6265d 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6218,7 +6218,7 @@ get_actual_variable_endpoint(Relation heapRel,
index_scan = index_beginscan(heapRel, indexRel,
&SnapshotNonVacuumable,
- 1, 0);
+ 1, 0, 0, 0); /* XXX maybe do prefetch? */
/* Set it up for index-only scan */
index_scan->xs_want_itup = true;
index_rescan(index_scan, scankeys, 1, NULL, 0);
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index a308795665..f3efffc4a8 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -17,6 +17,7 @@
#include "access/sdir.h"
#include "access/skey.h"
#include "nodes/tidbitmap.h"
+#include "storage/bufmgr.h"
#include "storage/lockdefs.h"
#include "utils/relcache.h"
#include "utils/snapshot.h"
@@ -152,7 +153,9 @@ extern bool index_insert(Relation indexRelation,
extern IndexScanDesc index_beginscan(Relation heapRelation,
Relation indexRelation,
Snapshot snapshot,
- int nkeys, int norderbys);
+ int nkeys, int norderbys,
+ int prefetch_target,
+ int prefetch_reset);
extern IndexScanDesc index_beginscan_bitmap(Relation indexRelation,
Snapshot snapshot,
int nkeys);
@@ -169,7 +172,9 @@ extern void index_parallelscan_initialize(Relation heapRelation,
extern void index_parallelrescan(IndexScanDesc scan);
extern IndexScanDesc index_beginscan_parallel(Relation heaprel,
Relation indexrel, int nkeys, int norderbys,
- ParallelIndexScanDesc pscan);
+ ParallelIndexScanDesc pscan,
+ int prefetch_target,
+ int prefetch_reset);
extern ItemPointer index_getnext_tid(IndexScanDesc scan,
ScanDirection direction);
struct TupleTableSlot;
@@ -230,4 +235,108 @@ extern HeapTuple systable_getnext_ordered(SysScanDesc sysscan,
ScanDirection direction);
extern void systable_endscan_ordered(SysScanDesc sysscan);
+/*
+ * XXX not sure it's the right place to define these callbacks etc.
+ */
+typedef void (*prefetcher_getrange_function) (IndexScanDesc scandesc,
+ ScanDirection direction,
+ int *start, int *end,
+ bool *reset);
+
+typedef BlockNumber (*prefetcher_getblock_function) (IndexScanDesc scandesc,
+ ScanDirection direction,
+ int index);
+
+/*
+ * Cache of recently prefetched blocks, organized as a hash table of
+ * small LRU caches. Doesn't need to be perfectly accurate, but we
+ * aim to make false positives/negatives reasonably low.
+ */
+typedef struct PrefetchCacheEntry {
+ BlockNumber block;
+ uint64 request;
+} PrefetchCacheEntry;
+
+/*
+ * Size of the cache of recently prefetched blocks - shouldn't be too
+ * small or too large. 1024 seems about right, it covers ~8MB of data.
+ * It's somewhat arbitrary, there's no particular formula saying it
+ * should not be higher/lower.
+ *
+ * The cache is structured as an array of small LRU caches, so the total
+ * size needs to be a multiple of LRU size. The LRU should be tiny to
+ * keep linear search cheap enough.
+ *
+ * XXX Maybe we could consider effective_cache_size or something?
+ */
+#define PREFETCH_LRU_SIZE 8
+#define PREFETCH_LRU_COUNT 128
+#define PREFETCH_CACHE_SIZE (PREFETCH_LRU_SIZE * PREFETCH_LRU_COUNT)
+
+/*
+ * Used to detect sequential patterns (and disable prefetching).
+ */
+#define PREFETCH_QUEUE_HISTORY 8
+#define PREFETCH_SEQ_PATTERN_BLOCKS 4
+
+
+typedef struct IndexPrefetchData
+{
+ /*
+ * XXX We need to disable this in some cases (e.g. when using index-only
+ * scans, we don't want to prefetch pages). Or maybe we should prefetch
+ * only pages that are not all-visible, that'd be even better.
+ */
+ int prefetchTarget; /* how far we should be prefetching */
+ int prefetchMaxTarget; /* maximum prefetching distance */
+ int prefetchReset; /* reset to this distance on rescan */
+ bool prefetchDone; /* did we get all TIDs from the index? */
+
+ /* runtime statistics */
+ uint64 countAll; /* all prefetch requests */
+ uint64 countPrefetch; /* actual prefetches */
+ uint64 countSkipSequential;
+ uint64 countSkipCached;
+
+ /*
+ * Queue of TIDs to prefetch.
+ *
+ * XXX Sizing for MAX_IO_CONCURRENCY may be overkill, but it seems simpler
+ * than dynamically adjusting for custom values.
+ */
+ ItemPointerData queueItems[MAX_IO_CONCURRENCY];
+ uint64 queueIndex; /* next TID to prefetch */
+ uint64 queueStart; /* first valid TID in queue */
+ uint64 queueEnd; /* first invalid (empty) TID in queue */
+
+ /*
+ * The last couple of prefetched blocks, used to check for certain access
+ * patterns and skip prefetching (e.g. for sequential access).
+ *
+ * XXX Separate from the main queue, because we only want to compare the
+ * block numbers, not the whole TID. In sequential access it's likely we
+ * read many items from each page, and we don't want to check many items
+ * (as that is much more expensive).
+ */
+ BlockNumber blockItems[PREFETCH_QUEUE_HISTORY];
+ uint64 blockIndex; /* index in blockItems (points to the first
+ * empty entry) */
+
+ /*
+ * Cache of recently prefetched blocks, organized as a hash table of
+ * small LRU caches.
+ */
+ uint64 prefetchReqNumber;
+ PrefetchCacheEntry prefetchCache[PREFETCH_CACHE_SIZE];
+
+} IndexPrefetchData;
+
+#define PREFETCH_QUEUE_INDEX(a) ((a) % (MAX_IO_CONCURRENCY))
+#define PREFETCH_QUEUE_EMPTY(p) ((p)->queueEnd == (p)->queueIndex)
+#define PREFETCH_ENABLED(p) ((p) && ((p)->prefetchMaxTarget > 0))
+#define PREFETCH_FULL(p) ((p)->queueEnd - (p)->queueIndex == (p)->prefetchTarget)
+#define PREFETCH_DONE(p) ((p) && ((p)->prefetchDone && PREFETCH_QUEUE_EMPTY(p)))
+#define PREFETCH_ACTIVE(p) (PREFETCH_ENABLED(p) && !(p)->prefetchDone)
+#define PREFETCH_BLOCK_INDEX(v) ((v) % PREFETCH_QUEUE_HISTORY)
+
#endif /* GENAM_H */
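For clarity, the queueIndex / queueStart / queueEnd fields are monotonically
increasing uint64 positions, and PREFETCH_QUEUE_INDEX() maps them into the
fixed-size array, so no explicit wraparound handling is needed. A self-contained
toy showing the same indexing scheme (the Queue struct, QUEUE_SIZE and the
push/pop names are invented for this sketch, with QUEUE_SIZE standing in for
MAX_IO_CONCURRENCY):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define QUEUE_SIZE 16           /* stands in for MAX_IO_CONCURRENCY */

typedef struct
{
    uint32_t    items[QUEUE_SIZE];
    uint64_t    head;           /* next position to consume (queueIndex) */
    uint64_t    tail;           /* first free position (queueEnd) */
    uint64_t    target;         /* current prefetch distance (prefetchTarget) */
} Queue;

#define SLOT(pos)   ((pos) % QUEUE_SIZE)                        /* PREFETCH_QUEUE_INDEX */
#define QEMPTY(q)   ((q)->tail == (q)->head)                    /* PREFETCH_QUEUE_EMPTY */
#define QFULL(q)    ((q)->tail - (q)->head == (q)->target)      /* PREFETCH_FULL */

static void
push(Queue *q, uint32_t value)
{
    assert(!QFULL(q));
    q->items[SLOT(q->tail)] = value;
    q->tail++;
}

static uint32_t
pop(Queue *q)
{
    assert(!QEMPTY(q));
    return q->items[SLOT(q->head++)];
}

int
main(void)
{
    Queue   q = {.target = 4};

    while (!QFULL(&q))              /* fill up to the current target ... */
        push(&q, 100 + (uint32_t) q.tail);
    while (!QEMPTY(&q))             /* ... then drain: prints 100 101 102 103 */
        printf("%u\n", pop(&q));
    return 0;
}

Note that PREFETCH_FULL() compares the distance between the two positions
against the current prefetch target rather than against the array size, which
is what lets the ramp-up in index_getnext_slot() start with a short queue and
grow it gradually.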
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index d03360eac0..c119fe597d 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -106,6 +106,12 @@ typedef struct IndexFetchTableData
Relation rel;
} IndexFetchTableData;
+/*
+ * Forward declaration, defined in genam.h.
+ */
+typedef struct IndexPrefetchData IndexPrefetchData;
+typedef struct IndexPrefetchData *IndexPrefetch;
+
/*
* We use the same IndexScanDescData structure for both amgettuple-based
* and amgetbitmap-based index scans. Some fields are only relevant in
@@ -162,6 +168,9 @@ typedef struct IndexScanDescData
bool *xs_orderbynulls;
bool xs_recheckorderby;
+ /* prefetching state (or NULL if disabled) */
+ IndexPrefetchData *xs_prefetch;
+
/* parallel index scan information, in shared memory */
struct ParallelIndexScanDescData *parallel_scan;
} IndexScanDescData;
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 87e5e2183b..97dd3c2c42 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -33,6 +33,8 @@ typedef struct BufferUsage
int64 local_blks_written; /* # of local disk blocks written */
int64 temp_blks_read; /* # of temp blocks read */
int64 temp_blks_written; /* # of temp blocks written */
+ int64 blks_prefetch_rounds; /* # of prefetch rounds */
+ int64 blks_prefetches; /* # of buffers prefetched */
instr_time blk_read_time; /* time spent reading blocks */
instr_time blk_write_time; /* time spent writing blocks */
instr_time temp_blk_read_time; /* time spent reading temp blocks */