[HACKERS] DISTINCT with btree skip scan

Thomas Munro Fri, 04 Jul 2014 17:19:23 -0700

Hello

As an exercise I hacked up the simplest code I could think of that would
demonstrate a faster DISTINCT based on skipping ahead to the next distinct
value in an index-only scan. Please see the attached (extremely buggy)
patch, and the example session below.  (It's against my natural instinct to
send such half-baked early hacking phase code to the list, but thought it
would make sense to demo the concept and then seek advice, warnings, cease
and desist notices etc before pressing on down that route!)  I would be
most grateful for any feedback you might have.


Best regards,

Thomas Munro

postgres=# create table foo (a int, b int, primary key (a, b));
CREATE TABLE
Time: 9.002 ms
postgres=# insert into foo select generate_series(1, 5000000) % 10,
generate_series(1, 5000000);
INSERT 0 5000000
Time: 35183.444 ms
postgres=# analyze foo;
ANALYZE
Time: 493.168 ms

postgres=# set enable_hashagg = true;
SET
Time: 0.274 ms
postgres=# explain select distinct a from foo;
┌───────────────────────────────────────────────────────────────────┐
│                            QUERY PLAN                             │
├───────────────────────────────────────────────────────────────────┤
│ HashAggregate  (cost=84624.00..84624.10 rows=10 width=4)          │
│   Group Key: a                                                    │
│   ->  Seq Scan on foo  (cost=0.00..72124.00 rows=5000000 width=4) │
│ Planning time: 0.065 ms                                           │
└───────────────────────────────────────────────────────────────────┘
(4 rows)

Time: 0.500 ms
postgres=# select distinct a from foo;
┌───┐
│ a │
├───┤
│ 6 │
│ 8 │
│ 1 │
│ 2 │
│ 3 │
│ 4 │
│ 5 │
│ 9 │
│ 7 │
│ 0 │
└───┘
(10 rows)

Time: 2017.019 ms

postgres=# set enable_hashagg = false;
SET
Time: 0.302 ms
postgres=# explain select distinct a from foo;
┌─────────────────────────────────────────────────────────────────────────────────────────────────────┐
│                                             QUERY PLAN
                           │
├─────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ Index Only Scan for distinct prefix 1 using foo_pkey on foo
 (cost=0.43..263354.20 rows=10 width=4) │
│ Planning time: 0.063 ms
                          │
└─────────────────────────────────────────────────────────────────────────────────────────────────────┘
(2 rows)

Time: 0.443 ms
postgres=# select distinct a from foo;
┌───┐
│ a │
├───┤
│ 0 │
│ 1 │
│ 2 │
│ 3 │
│ 4 │
│ 5 │
│ 6 │
│ 7 │
│ 8 │
│ 9 │
└───┘
(10 rows)

Time: 0.565 ms

diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 53cf96f..5f10d7f 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -29,6 +29,7 @@
  *		index_can_return	- does index support index-only scans?
  *		index_getprocid - get a support procedure OID
  *		index_getprocinfo - get a support procedure's lookup info
+ *		index_skip		- advance to the next key value in a scan
  *
  * NOTES
  *		This file contains the index_ routines which used
@@ -742,6 +743,37 @@ index_can_return(Relation indexRelation)
 									  PointerGetDatum(indexRelation)));
 }
 
+bool
+index_can_skip(Relation indexRelation)
+{
+	FmgrInfo   *procedure;
+
+	RELATION_CHECKS;
+
+	/* amcanskip is optional; assume FALSE if not provided by AM */
+	if (!RegProcedureIsValid(indexRelation->rd_am->amcanskip))
+		return false;
+
+	GET_REL_PROCEDURE(amcanskip);
+
+	return DatumGetBool(FunctionCall1(procedure,
+									  PointerGetDatum(indexRelation)));
+}
+
+bool
+index_skip(IndexScanDesc scan, ScanDirection direction, int prefix)
+{
+	FmgrInfo   *procedure;
+	SCAN_CHECKS;
+	GET_SCAN_PROCEDURE(amskip);
+	Datum result =
+		DatumGetPointer(FunctionCall3(procedure,
+									  PointerGetDatum(scan),
+									  Int32GetDatum(direction),
+									  Int32GetDatum(prefix)));
+	return DatumGetBool(result);
+}
+
 /* ----------------
  *		index_getprocid
  *
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 36dc6c2..2f987e4 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -434,6 +434,8 @@ btbeginscan(PG_FUNCTION_ARGS)
 	so->currPos.nextTupleOffset = 0;
 	so->markPos.nextTupleOffset = 0;
 
+	so->skipScanKey = NULL;
+
 	scan->xs_itupdesc = RelationGetDescr(rel);
 
 	scan->opaque = so;
@@ -509,6 +511,19 @@ btrescan(PG_FUNCTION_ARGS)
 }
 
 /*
+ * btskip() -- skip to the beginning of the next key prefix
+ */
+Datum
+btskip(PG_FUNCTION_ARGS)
+{
+	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+	ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
+	int			prefix = PG_GETARG_INT32(2);
+	bool result = _bt_skip(scan, dir, prefix);
+	PG_RETURN_BOOL(result);	
+}
+
+/*
  *	btendscan() -- close down a scan
  */
 Datum
@@ -1118,3 +1133,12 @@ btcanreturn(PG_FUNCTION_ARGS)
 {
 	PG_RETURN_BOOL(true);
 }
+
+/*
+ * btcanskip() -- Check whether btree indexes support skipping.
+ */
+Datum
+btcanskip(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_BOOL(true);
+}
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 203b969..b88f25a 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1082,6 +1082,84 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
 }
 
 /*
+ *  _bt_skip() -- Skip items that have the same prefix as the current one.
+ */
+bool
+_bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix)
+{
+	/* TODO:TM For now, we use _bt_search to search from the root; in theory
+	 * we should be able to do a local traversal ie from the current page, but
+	 * I don't know if it would actually be better in general  */
+
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	BTStack stack;
+	Buffer buf;
+	OffsetNumber offnum;
+	BTScanPosItem *currItem;
+
+	if (!scan->xs_want_itup)
+		elog(ERROR, "_bt_skip cannot skip if not returning tuples");
+	if (!scan->xs_itup)
+		elog(ERROR, "_bt_skip cannot skip if a tuple has not been fetched yet");
+	/*elog(NOTICE, "_bt_skip");*/
+
+	if (BTScanPosIsValid(so->currPos))
+	{
+		ReleaseBuffer(so->currPos.buf);
+		so->currPos.buf = InvalidBuffer;
+	}
+
+	/* 
+	 * TODO:TM lazily call _bt_mkscankey the first time and then just update
+	 * the values in so->skipScanKey each time after that, instead of the repeated
+	 * free/realloc
+	 */	
+	if (so->skipScanKey != NULL)
+		_bt_freeskey(so->skipScanKey);
+	so->skipScanKey = _bt_mkscankey(scan->indexRelation, scan->xs_itup);
+
+	/* TODO:TM share code with _bt_search?  Much of this copied from there
+	 * without good understanding of what is going on!  */
+
+	stack =_bt_search(scan->indexRelation, prefix, so->skipScanKey, true, &buf, BT_READ);
+	_bt_freestack(stack);
+	so->currPos.buf = buf;
+	/*
+	 * TODO:TM this doesn't work when we're scanning backwards!  (ie with an
+	 * ORDER BY ... DESC)
+	 */
+	offnum = _bt_binsrch(scan->indexRelation, buf, prefix, so->skipScanKey, true);
+	PredicateLockPage(scan->indexRelation, BufferGetBlockNumber(buf),
+					  scan->xs_snapshot);
+	if (ScanDirectionIsForward(dir))
+	{
+		so->currPos.moreLeft = false;
+		so->currPos.moreRight = true;
+	}
+	else
+	{
+		so->currPos.moreLeft = true;
+		so->currPos.moreRight = false;
+	}
+
+	if (!_bt_readpage(scan, dir, offnum))
+	{
+		if (!_bt_steppage(scan, dir))
+		{
+			/* elog(NOTICE, "reached the end"); */
+			return false;
+		}
+	}
+	LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+	currItem = &so->currPos.items[so->currPos.itemIndex];
+	scan->xs_ctup.t_self = currItem->heapTid;
+	if (scan->xs_want_itup)
+		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+	return true;
+}
+
+/*
  *	_bt_readpage() -- Load data from current index page into so->currPos
  *
  * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 0d9663c..332cf02 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1065,7 +1065,20 @@ ExplainNode(PlanState *planstate, List *ancestors,
 		case T_IndexOnlyScan:
 			{
 				IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan;
-
+				if (indexonlyscan->distinctPrefix > 0)
+				{
+					/*
+					 * TODO:TM show "for distinct prefix (a, b)" instead of
+					 * just the column count?
+					 */
+					if (es->format == EXPLAIN_FORMAT_TEXT)
+						appendStringInfo(es->str, " for distinct prefix %d",
+										 indexonlyscan->distinctPrefix);
+					else
+						ExplainPropertyInteger("Distinct Prefix", 
+											   indexonlyscan->distinctPrefix,
+											   es);
+				}
 				ExplainIndexScanDetails(indexonlyscan->indexid,
 										indexonlyscan->indexorderdir,
 										es);
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index afcd1ff..d46de44 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -74,6 +74,21 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	slot = node->ss.ss_ScanTupleSlot;
 
 	/*
+	 * Check if we need to skip to the next key prefix, because we've been
+	 * asked to implement DISTINCT.
+	 */
+	if (node->ioss_NumDistinctKeys > 0 && node->ioss_FirstTupleEmitted)
+	{
+		/*elog(NOTICE, "I WOULD LIKE TO SKIP");*/
+		if (!index_skip(scandesc, direction, node->ioss_NumDistinctKeys))
+		{
+			/* Reached end of index. */
+			/* TODO:TM is this appropriate cleanup? */
+			return ExecClearTuple(slot);
+		}
+	}
+
+	/*
 	 * OK, now that we have what we need, fetch the next tuple.
 	 */
 	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
@@ -176,6 +191,17 @@ IndexOnlyNext(IndexOnlyScanState *node)
 			PredicateLockPage(scandesc->heapRelation,
 							  ItemPointerGetBlockNumber(tid),
 							  estate->es_snapshot);
+		/*
+		 * TODO:TM Do distinct scans break SSI?  We don't visit every page
+		 * anymore!  But maybe that's OK, because we only "read" the fact that
+		 * there is *at least one* row that has this value, so a conflicting
+		 * write would be one that removes ALL rows having the same value, and
+		 * therefore would touch this page that happens to hold the arbitrary
+		 * row we looked at, in the case of a disinct skip scan.  Does this
+		 * make any sense?
+		 */
+
+		node->ioss_FirstTupleEmitted = true;
 
 		return slot;
 	}
@@ -391,6 +417,8 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
 	indexstate->ss.ps.plan = (Plan *) node;
 	indexstate->ss.ps.state = estate;
 	indexstate->ioss_HeapFetches = 0;
+	indexstate->ioss_NumDistinctKeys = node->distinctPrefix;
+	indexstate->ioss_FirstTupleEmitted = false;
 
 	/*
 	 * Miscellaneous initialization
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 8d3d5a7..2bca8fe 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -398,6 +398,7 @@ _copyIndexOnlyScan(const IndexOnlyScan *from)
 	COPY_NODE_FIELD(indexorderby);
 	COPY_NODE_FIELD(indextlist);
 	COPY_SCALAR_FIELD(indexorderdir);
+	COPY_SCALAR_FIELD(distinctPrefix);
 
 	return newnode;
 }
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index 42dcb11..3908899 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -982,6 +982,10 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 	index_only_scan = (scantype != ST_BITMAPSCAN &&
 					   check_index_only(rel, index));
 
+	/* TODO:TM -- if index_only_scan is true, AND we want DISTINCT, then
+	 * ... perhaps this is the right place to check if we can use an index
+	 * only scan in distinct mode? */
+
 	/*
 	 * 4. Generate an indexscan path if there are relevant restriction clauses
 	 * in the current clauses, OR the index ordering is potentially useful for
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index f2c9c99..7e1c625 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -1934,10 +1934,30 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 															   result_plan,
 															current_pathkeys,
 															   -1.0);
+				result_plan = (Plan *) make_unique(result_plan,
+												   parse->distinctClause);
+			}
+			else if IsA(result_plan, IndexOnlyScan)
+			{
+				/* 
+				 * The plan is appropriately sorted, and we can ask it to
+				 * produce unique values by skipping.
+				 *
+				 * TODO:TM This is a quick and dirty hack to get something
+				 * working by modfifying hte plan, but really we probably need
+				 * help planning this much earlier, possibly in
+				 * build_index_paths.
+				 */
+				IndexOnlyScan *ios = (IndexOnlyScan *) result_plan;
+				/*elog(NOTICE, "using skip to implement DISTINCT");*/
+				ios->distinctPrefix = list_length(root->distinct_pathkeys);
+			}
+			else
+			{
+				/* The plan is appropriately sorted, but contains duplicates. */
+				result_plan = (Plan *) make_unique(result_plan,
+												   parse->distinctClause);
 			}
-
-			result_plan = (Plan *) make_unique(result_plan,
-											   parse->distinctClause);
 			result_plan->plan_rows = dNumDistinctRows;
 			/* The Unique node won't change sort ordering */
 		}
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index d99158f..349097d 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -157,6 +157,8 @@ extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info,
 extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info,
 					 IndexBulkDeleteResult *stats);
 extern bool index_can_return(Relation indexRelation);
+extern bool index_can_skip(Relation indexRelation);
+extern bool index_skip(IndexScanDesc scan, ScanDirection direction, int prefix);
 extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum,
 				uint16 procnum);
 extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum,
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index ed6f697..0cc762a 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -606,6 +606,9 @@ typedef struct BTScanOpaqueData
 	/* keep these last in struct for efficiency */
 	BTScanPosData currPos;		/* current position data */
 	BTScanPosData markPos;		/* marked position, if any */
+
+	/* workspace for _bt_skip */
+	ScanKey		skipScanKey;	/* used to control skipping */
 } BTScanOpaqueData;
 
 typedef BTScanOpaqueData *BTScanOpaque;
@@ -638,6 +641,8 @@ extern Datum btbulkdelete(PG_FUNCTION_ARGS);
 extern Datum btvacuumcleanup(PG_FUNCTION_ARGS);
 extern Datum btcanreturn(PG_FUNCTION_ARGS);
 extern Datum btoptions(PG_FUNCTION_ARGS);
+extern Datum btcanskip(PG_FUNCTION_ARGS);
+extern Datum btskip(PG_FUNCTION_ARGS);
 
 /*
  * prototypes for functions in nbtinsert.c
@@ -684,6 +689,8 @@ extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
+extern bool _bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix);
+
 
 /*
  * prototypes for functions in nbtutils.c
diff --git a/src/include/catalog/pg_am.h b/src/include/catalog/pg_am.h
index 759ea70..b3d835e 100644
--- a/src/include/catalog/pg_am.h
+++ b/src/include/catalog/pg_am.h
@@ -67,6 +67,8 @@ CATALOG(pg_am,2601)
 	regproc		amcanreturn;	/* can indexscan return IndexTuples? */
 	regproc		amcostestimate; /* estimate cost of an indexscan */
 	regproc		amoptions;		/* parse AM-specific parameters */
+	regproc		amcanskip;		/* can indexscan skip to next prefix value? */
+	regproc		amskip;			/* skip to next prefix value function */
 } FormData_pg_am;
 
 /* ----------------
@@ -80,7 +82,7 @@ typedef FormData_pg_am *Form_pg_am;
  *		compiler constants for pg_am
  * ----------------
  */
-#define Natts_pg_am						30
+#define Natts_pg_am						32
 #define Anum_pg_am_amname				1
 #define Anum_pg_am_amstrategies			2
 #define Anum_pg_am_amsupport			3
@@ -111,25 +113,27 @@ typedef FormData_pg_am *Form_pg_am;
 #define Anum_pg_am_amcanreturn			28
 #define Anum_pg_am_amcostestimate		29
 #define Anum_pg_am_amoptions			30
+#define Anum_pg_am_amcanskip			31
+#define Anum_pg_am_amskip				32
 
 /* ----------------
  *		initial contents of pg_am
  * ----------------
  */
 
-DATA(insert OID = 403 (  btree		5 2 t f t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcanreturn btcostestimate btoptions ));
+DATA(insert OID = 403 (  btree		5 2 t f t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcanreturn btcostestimate btoptions btcanskip btskip ));
 DESCR("b-tree index access method");
 #define BTREE_AM_OID 403
-DATA(insert OID = 405 (  hash		1 1 f f t f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup - hashcostestimate hashoptions ));
+DATA(insert OID = 405 (  hash		1 1 f f t f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup - hashcostestimate hashoptions - - ));
 DESCR("hash index access method");
 #define HASH_AM_OID 405
-DATA(insert OID = 783 (  gist		0 8 f t f f t t f t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup - gistcostestimate gistoptions ));
+DATA(insert OID = 783 (  gist		0 8 f t f f t t f t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup - gistcostestimate gistoptions - - ));
 DESCR("GiST index access method");
 #define GIST_AM_OID 783
-DATA(insert OID = 2742 (  gin		0 6 f f f f t t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup - gincostestimate ginoptions ));
+DATA(insert OID = 2742 (  gin		0 6 f f f f t t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup - gincostestimate ginoptions - - ));
 DESCR("GIN index access method");
 #define GIN_AM_OID 2742
-DATA(insert OID = 4000 (  spgist	0 5 f f f f f t f t f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcanreturn spgcostestimate spgoptions ));
+DATA(insert OID = 4000 (  spgist	0 5 f f f f f t f t f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcanreturn spgcostestimate spgoptions - - ));
 DESCR("SP-GiST index access method");
 #define SPGIST_AM_OID 4000
 
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index f44085c..792ec5b 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -564,6 +564,10 @@ DATA(insert OID = 1268 (  btcostestimate   PGNSP PGUID 12 1 0 0 0 f f f f t f v
 DESCR("btree(internal)");
 DATA(insert OID = 2785 (  btoptions		   PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 17 "1009 16" _null_ _null_ _null_ _null_  btoptions _null_ _null_ _null_ ));
 DESCR("btree(internal)");
+DATA(insert OID = 4051 (  btcanskip	   PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 16 "2281" _null_ _null_ _null_ _null_ btcanskip _null_ _null_ _null_ ));
+DESCR("btree(internal)");
+DATA(insert OID = 4052 (  btskip	   PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 16 "2281 2281 2281" _null_ _null_ _null_ _null_	btskip _null_ _null_ _null_ ));
+DESCR("btree(internal)");
 
 DATA(insert OID = 339 (  poly_same		   PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_same _null_ _null_ _null_ ));
 DATA(insert OID = 340 (  poly_contain	   PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_contain _null_ _null_ _null_ ));
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index b271f21..9c64c81 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1280,6 +1280,7 @@ typedef struct IndexScanState
  *		ScanDesc		   index scan descriptor
  *		VMBuffer		   buffer in use for visibility map testing, if any
  *		HeapFetches		   number of tuples we were forced to fetch from heap
+ *		NumDistinctKeys	   number of keys for skip-based DISTINCT  
  * ----------------
  */
 typedef struct IndexOnlyScanState
@@ -1298,6 +1299,8 @@ typedef struct IndexOnlyScanState
 	IndexScanDesc ioss_ScanDesc;
 	Buffer		ioss_VMBuffer;
 	long		ioss_HeapFetches;
+	int         ioss_NumDistinctKeys;
+	bool		ioss_FirstTupleEmitted;
 } IndexOnlyScanState;
 
 /* ----------------
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 3b9c683..6ae992a 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -340,6 +340,7 @@ typedef struct IndexOnlyScan
 	List	   *indexorderby;	/* list of index ORDER BY exprs */
 	List	   *indextlist;		/* TargetEntry list describing index's cols */
 	ScanDirection indexorderdir;	/* forward or backward or don't care */
+	int			distinctPrefix;	/* the size of the prefix for distinct scans */
 } IndexOnlyScan;
 
 /* ----------------
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index af4f53f..ecc72f2 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -61,6 +61,8 @@ typedef struct RelationAmInfo
 	FmgrInfo	ammarkpos;
 	FmgrInfo	amrestrpos;
 	FmgrInfo	amcanreturn;
+	FmgrInfo	amcanskip;
+	FmgrInfo	amskip;
 } RelationAmInfo;

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

[HACKERS] DISTINCT with btree skip scan

Reply via email to