Re: Index Skip Scan

Jesper Pedersen Thu, 20 Jun 2019 06:21:53 -0700

Hi,

On 6/19/19 9:57 AM, Dmitry Dolgov wrote:

Here is what I was talking about, POC for an integration with index scan. About
using of create_skipscan_unique_path and suggested planner improvements, I hope
together with Jesper we can come up with something soon.

I made some minor changes, but I did move all the code increate_distinct_paths() under enable_indexskipscan to limit the overheadif skip scan isn't enabled.


Attached is v20, since the last patch should have been v19.

Best regards,
 Jesper

>From 4fd4bd601f510ccce858196c0e93d37aa2d0f20f Mon Sep 17 00:00:00 2001
From: jesperpedersen <jesper.peder...@redhat.com>
Date: Thu, 20 Jun 2019 07:42:24 -0400
Subject: [PATCH 1/2] Implementation of Index Skip Scan (see Loose Index Scan
 in the wiki [1]) on top of IndexScan and IndexOnlyScan. To make it suitable
 for both situations when there are small number of distinct values and
 significant amount of distinct values the following approach is taken -
 instead of searching from the root for every value we're searching for then
 first on the current page, and then if not found continue searching from the
 root.

Original patch and design were proposed by Thomas Munro [2], revived and
improved by Dmitry Dolgov and Jesper Pedersen.

[1] https://wiki.postgresql.org/wiki/Loose_indexscan
[2] https://www.postgresql.org/message-id/flat/CADLWmXXbTSBxP-MzJuPAYSsL_2f0iPm5VWPbCvDbVvfX93FKkw%40mail.gmail.com
---
 contrib/bloom/blutils.c                       |   1 +
 doc/src/sgml/config.sgml                      |  16 ++
 doc/src/sgml/indexam.sgml                     |  10 +
 doc/src/sgml/indices.sgml                     |  24 ++
 src/backend/access/brin/brin.c                |   1 +
 src/backend/access/gin/ginutil.c              |   1 +
 src/backend/access/gist/gist.c                |   1 +
 src/backend/access/hash/hash.c                |   1 +
 src/backend/access/index/indexam.c            |  16 ++
 src/backend/access/nbtree/nbtree.c            |  12 +
 src/backend/access/nbtree/nbtsearch.c         | 224 +++++++++++++++++-
 src/backend/access/spgist/spgutils.c          |   1 +
 src/backend/commands/explain.c                |  24 ++
 src/backend/executor/nodeIndexonlyscan.c      |  22 ++
 src/backend/executor/nodeIndexscan.c          |  22 ++
 src/backend/nodes/copyfuncs.c                 |   2 +
 src/backend/nodes/outfuncs.c                  |   3 +
 src/backend/nodes/readfuncs.c                 |   2 +
 src/backend/optimizer/path/costsize.c         |   1 +
 src/backend/optimizer/path/pathkeys.c         |  84 ++++++-
 src/backend/optimizer/plan/createplan.c       |  20 +-
 src/backend/optimizer/plan/planagg.c          |   1 +
 src/backend/optimizer/plan/planner.c          |  79 +++++-
 src/backend/optimizer/util/pathnode.c         |  40 ++++
 src/backend/optimizer/util/plancat.c          |   1 +
 src/backend/utils/misc/guc.c                  |   9 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/access/amapi.h                    |   5 +
 src/include/access/genam.h                    |   1 +
 src/include/access/nbtree.h                   |   5 +
 src/include/nodes/execnodes.h                 |   7 +
 src/include/nodes/pathnodes.h                 |   8 +
 src/include/nodes/plannodes.h                 |   2 +
 src/include/optimizer/cost.h                  |   1 +
 src/include/optimizer/pathnode.h              |   5 +
 src/include/optimizer/paths.h                 |   4 +
 src/test/regress/expected/create_index.out    |   1 +
 src/test/regress/expected/select_distinct.out | 209 ++++++++++++++++
 src/test/regress/expected/sysviews.out        |   3 +-
 src/test/regress/sql/create_index.sql         |   2 +
 src/test/regress/sql/select_distinct.sql      |  68 ++++++
 41 files changed, 918 insertions(+), 22 deletions(-)

diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c
index ee3bd56274..a88b730f2e 100644
--- a/contrib/bloom/blutils.c
+++ b/contrib/bloom/blutils.c
@@ -129,6 +129,7 @@ blhandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = blbulkdelete;
 	amroutine->amvacuumcleanup = blvacuumcleanup;
 	amroutine->amcanreturn = NULL;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = blcostestimate;
 	amroutine->amoptions = bloptions;
 	amroutine->amproperty = NULL;
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 84341a30e5..a1c8a1ea27 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -4400,6 +4400,22 @@ ANY <replaceable class="parameter">num_sync</replaceable> ( <replaceable class="
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-enable-indexskipscan" xreflabel="enable_indexskipscan">
+      <term><varname>enable_indexskipscan</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>enable_indexskipscan</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Enables or disables the query planner's use of index-skip-scan plan
+        types (see <xref linkend="indexes-index-skip-scans"/>). This parameter requires
+        that <varname>enable_indexonlyscan</varname> is <literal>on</literal>.
+        The default is <literal>on</literal>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-enable-material" xreflabel="enable_material">
       <term><varname>enable_material</varname> (<type>boolean</type>)
       <indexterm>
diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index dd54c68802..c2eb296306 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -144,6 +144,7 @@ typedef struct IndexAmRoutine
     amendscan_function amendscan;
     ammarkpos_function ammarkpos;       /* can be NULL */
     amrestrpos_function amrestrpos;     /* can be NULL */
+    amskip_function amskip;     /* can be NULL */
 
     /* interface functions to support parallel index scans */
     amestimateparallelscan_function amestimateparallelscan;    /* can be NULL */
@@ -687,6 +688,15 @@ amrestrpos (IndexScanDesc scan);
 
   <para>
 <programlisting>
+bool
+amskip (IndexScanDesc scan, ScanDirection direction, int prefix);
+</programlisting>
+  Skip past all tuples where the first 'prefix' columns have the same value as
+  the last tuple returned in the current scan.
+  </para>
+
+  <para>
+<programlisting>
 Size
 amestimateparallelscan (void);
 </programlisting>
diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml
index 95c0a1926c..592149f10e 100644
--- a/doc/src/sgml/indices.sgml
+++ b/doc/src/sgml/indices.sgml
@@ -1235,6 +1235,30 @@ SELECT target FROM tests WHERE subject = 'some-subject' AND success;
    and later will recognize such cases and allow index-only scans to be
    generated, but older versions will not.
   </para>
+
+  <sect2 id="indexes-index-skip-scans">
+    <title>Index Skip Scans</title>
+
+    <indexterm zone="indexes-index-skip-scans">
+      <primary>index</primary>
+      <secondary>index-skip scans</secondary>
+    </indexterm>
+    <indexterm zone="indexes-index-skip-scans">
+      <primary>index-skip scan</primary>
+    </indexterm>
+
+    <para>
+     In case if index scan is used to retrieve the distinct values of a column
+     efficiently, it can be not very efficient, since it requires to scan all
+     the equal values of a key. In such cases planner will consider apply index
+     skip scan approach, which is based on the idea of
+     <ulink url="https://wiki.postgresql.org/wiki/Free_Space_Map_Problems";>
+     Loose index scan</ulink>. Rather than scanning all equal values of a key,
+     as soon as a new value is found, it will search for a larger value on the
+     same index page, and if not found, restart the search by looking for a
+     larger value. This is much faster when the index has many equal keys.
+    </para>
+  </sect2>
  </sect1>
 
 
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index ae7b729edd..233ea9e5ec 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -109,6 +109,7 @@ brinhandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = brinbulkdelete;
 	amroutine->amvacuumcleanup = brinvacuumcleanup;
 	amroutine->amcanreturn = NULL;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = brincostestimate;
 	amroutine->amoptions = brinoptions;
 	amroutine->amproperty = NULL;
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index cf9699ad18..9817f34c34 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -61,6 +61,7 @@ ginhandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = ginbulkdelete;
 	amroutine->amvacuumcleanup = ginvacuumcleanup;
 	amroutine->amcanreturn = NULL;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = gincostestimate;
 	amroutine->amoptions = ginoptions;
 	amroutine->amproperty = NULL;
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 470b121e7d..328c17f13a 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -83,6 +83,7 @@ gisthandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = gistbulkdelete;
 	amroutine->amvacuumcleanup = gistvacuumcleanup;
 	amroutine->amcanreturn = gistcanreturn;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = gistcostestimate;
 	amroutine->amoptions = gistoptions;
 	amroutine->amproperty = gistproperty;
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 5cc30dac42..019e330cff 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -82,6 +82,7 @@ hashhandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = hashbulkdelete;
 	amroutine->amvacuumcleanup = hashvacuumcleanup;
 	amroutine->amcanreturn = NULL;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = hashcostestimate;
 	amroutine->amoptions = hashoptions;
 	amroutine->amproperty = NULL;
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index aefdd2916d..1c2def162c 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -33,6 +33,7 @@
  *		index_can_return	- does index support index-only scans?
  *		index_getprocid - get a support procedure OID
  *		index_getprocinfo - get a support procedure's lookup info
+ *		index_skip		- advance past duplicate key values in a scan
  *
  * NOTES
  *		This file contains the index_ routines which used
@@ -730,6 +731,21 @@ index_can_return(Relation indexRelation, int attno)
 	return indexRelation->rd_indam->amcanreturn(indexRelation, attno);
 }
 
+/* ----------------
+ *		index_skip
+ *
+ *		Skip past all tuples where the first 'prefix' columns have the
+ *		same value as the last tuple returned in the current scan.
+ * ----------------
+ */
+bool
+index_skip(IndexScanDesc scan, ScanDirection direction, int prefix)
+{
+	SCAN_CHECKS;
+
+	return scan->indexRelation->rd_indam->amskip(scan, direction, prefix);
+}
+
 /* ----------------
  *		index_getprocid
  *
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 85e54ac44b..3e50abd6b0 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -131,6 +131,7 @@ bthandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = btbulkdelete;
 	amroutine->amvacuumcleanup = btvacuumcleanup;
 	amroutine->amcanreturn = btcanreturn;
+	amroutine->amskip = btskip;
 	amroutine->amcostestimate = btcostestimate;
 	amroutine->amoptions = btoptions;
 	amroutine->amproperty = btproperty;
@@ -380,6 +381,8 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 	 */
 	so->currTuples = so->markTuples = NULL;
 
+	so->skipScanKey = NULL;
+
 	scan->xs_itupdesc = RelationGetDescr(rel);
 
 	scan->opaque = so;
@@ -447,6 +450,15 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	_bt_preprocess_array_keys(scan);
 }
 
+/*
+ * btskip() -- skip to the beginning of the next key prefix
+ */
+bool
+btskip(IndexScanDesc scan, ScanDirection direction, int prefix)
+{
+	return _bt_skip(scan, direction, prefix);
+}
+
 /*
  *	btendscan() -- close down a scan
  */
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index c655dadb96..f60534e1fb 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -37,7 +37,10 @@ static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
 static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
 static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
-
+static inline void _bt_update_skip_scankeys(IndexScanDesc scan,
+											Relation indexRel);
+static inline bool _bt_scankey_within_page(IndexScanDesc scan, BTScanInsert key,
+										Buffer buf, ScanDirection dir);
 
 /*
  *	_bt_drop_lock_and_maybe_pin()
@@ -1380,6 +1383,184 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
 	return true;
 }
 
+/*
+ *  _bt_skip() -- Skip items that have the same prefix as the most recently
+ * fetched index tuple.  The current position is set so that a subsequent call
+ * to _bt_next will fetch the first tuple that differs in the leading 'prefix'
+ * keys.
+ */
+bool
+_bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	BTStack stack;
+	Buffer buf;
+	OffsetNumber offnum;
+	BTScanPosItem *currItem;
+	Relation 	 indexRel = scan->indexRelation;
+
+	/* We want to return tuples, and we need a starting point */
+	Assert(scan->xs_want_itup);
+	Assert(scan->xs_itup);
+
+	/*
+	 * If skipScanKey is NULL then we initialize it with _bt_mkscankey,
+	 * otherwise we will just update the sk_flags / sk_argument elements
+	 * in order to eliminate repeated free/realloc.
+	 */
+	if (so->skipScanKey == NULL)
+	{
+		so->skipScanKey = _bt_mkscankey(indexRel, scan->xs_itup);
+		so->skipScanKey->keysz = prefix;
+		so->skipScanKey->scantid = NULL;
+	}
+	else
+	{
+		_bt_update_skip_scankeys(scan, indexRel);
+	}
+
+	/* Check if the next unique key can be found within the current page */
+	if (BTScanPosIsValid(so->currPos) &&
+		_bt_scankey_within_page(scan, so->skipScanKey, so->currPos.buf, dir))
+	{
+		bool keyFound = false;
+
+		LockBuffer(so->currPos.buf, BT_READ);
+		offnum = _bt_binsrch(scan->indexRelation, so->skipScanKey, so->currPos.buf);
+
+		/* Lock the page for SERIALIZABLE transactions */
+		PredicateLockPage(scan->indexRelation, BufferGetBlockNumber(so->currPos.buf),
+						  scan->xs_snapshot);
+
+		/* We know in which direction to look */
+		_bt_initialize_more_data(so, dir);
+
+		if (ScanDirectionIsForward(dir))
+		{
+			/* Move back for _bt_next */
+			offnum = OffsetNumberPrev(offnum);
+		}
+
+		/* Now read the data */
+		keyFound = _bt_readpage(scan, dir, offnum);
+		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+
+		if (keyFound)
+		{
+			/* set IndexTuple */
+			currItem = &so->currPos.items[so->currPos.itemIndex];
+			scan->xs_heaptid = currItem->heapTid;
+			if (scan->xs_want_itup)
+				scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+			return true;
+		}
+	}
+
+	if (BTScanPosIsValid(so->currPos))
+	{
+		ReleaseBuffer(so->currPos.buf);
+		so->currPos.buf = InvalidBuffer;
+	}
+
+	/*
+	 * We haven't found scan key within the current page, so let's scan from
+	 * the root. Use _bt_search and _bt_binsrch to get the buffer and offset
+	 * number
+	 */
+	so->skipScanKey->nextkey = ScanDirectionIsForward(dir);
+	stack = _bt_search(scan->indexRelation, so->skipScanKey,
+					   &buf, BT_READ, scan->xs_snapshot);
+	_bt_freestack(stack);
+	so->currPos.buf = buf;
+	if (ScanDirectionIsForward(dir))
+	{
+		offnum = _bt_binsrch(scan->indexRelation, so->skipScanKey, buf);
+	}
+	else
+	{
+		/* For backward scan finding offnum is more involved. It is wrong to
+		 * just use binary search, since we will find the last item from the
+		 * sequence of equal items, and we need the first one. Otherwise e.g.
+		 * backward cursor scan will return an incorrect value. */
+		offnum = _bt_binsrch(scan->indexRelation, so->skipScanKey, buf);
+		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+
+		/* One step back to find a previous value */
+		if (_bt_next(scan, dir))
+		{
+			_bt_update_skip_scankeys(scan, indexRel);
+
+			/* And now find the last item from the sequence for the current,
+			 * value with the intention do OffsetNumberNext. As a result we
+			 * end up on a first element from the sequence. */
+			if (_bt_scankey_within_page(scan, so->skipScanKey,
+										so->currPos.buf, dir))
+			{
+				offnum = _bt_binsrch(scan->indexRelation, so->skipScanKey, buf);
+			}
+			else
+			{
+				ReleaseBuffer(so->currPos.buf);
+				so->currPos.buf = InvalidBuffer;
+
+				stack = _bt_search(scan->indexRelation, so->skipScanKey,
+								   &buf, BT_READ, scan->xs_snapshot);
+				_bt_freestack(stack);
+				so->currPos.buf = buf;
+				offnum = _bt_binsrch(scan->indexRelation, so->skipScanKey, buf);
+			}
+		}
+		else
+		{
+			pfree(so->skipScanKey);
+			so->skipScanKey = NULL;
+			return false;
+		}
+	}
+
+	/* Lock the page for SERIALIZABLE transactions */
+	PredicateLockPage(scan->indexRelation, BufferGetBlockNumber(buf),
+					  scan->xs_snapshot);
+
+	/* We know in which direction to look */
+	_bt_initialize_more_data(so, dir);
+
+	if (ScanDirectionIsForward(dir))
+		/* Move back for _bt_next */
+		offnum = OffsetNumberPrev(offnum);
+	else
+		offnum = OffsetNumberNext(offnum);
+
+	/* Now read the data */
+	if (!_bt_readpage(scan, dir, offnum))
+	{
+		/*
+		 * There's no actually-matching data on this page.  Try to advance to
+		 * the next page.  Return false if there's no matching data at all.
+		 */
+		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+		if (!_bt_steppage(scan, dir))
+		{
+			pfree(so->skipScanKey);
+			so->skipScanKey = NULL;
+			return false;
+		}
+	}
+	else
+	{
+		/* Drop the lock, and maybe the pin, on the current page */
+		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+	}
+
+	/* And set IndexTuple */
+	currItem = &so->currPos.items[so->currPos.itemIndex];
+	scan->xs_heaptid = currItem->heapTid;
+	if (scan->xs_want_itup)
+		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+	return true;
+}
+
 /*
  *	_bt_readpage() -- Load data from current index page into so->currPos
  *
@@ -2249,3 +2430,44 @@ _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
 	so->numKilled = 0;			/* just paranoia */
 	so->markItemIndex = -1;		/* ditto */
 }
+
+static inline void
+_bt_update_skip_scankeys(IndexScanDesc scan, Relation indexRel)
+{
+	TupleDesc		itupdesc;
+	int				indnkeyatts, i;
+	BTScanOpaque 	so = (BTScanOpaque) scan->opaque;
+	ScanKey			scankeys = so->skipScanKey->scankeys;
+
+	itupdesc = RelationGetDescr(indexRel);
+	indnkeyatts = IndexRelationGetNumberOfKeyAttributes(indexRel);
+	for (i = 0; i < indnkeyatts; i++)
+	{
+		Datum datum;
+		bool null;
+		int flags;
+
+		datum = index_getattr(scan->xs_itup, i + 1, itupdesc, &null);
+		flags = (null ? SK_ISNULL : 0) |
+				(indexRel->rd_indoption[i] << SK_BT_INDOPTION_SHIFT);
+		scankeys[i].sk_flags = flags;
+		scankeys[i].sk_argument = datum;
+	}
+}
+
+static inline bool
+_bt_scankey_within_page(IndexScanDesc scan, BTScanInsert key,
+						Buffer buf, ScanDirection dir)
+{
+	OffsetNumber low, high, compare_offset;
+	Page page = BufferGetPage(buf);
+	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	int 		 compare_value = ScanDirectionIsForward(dir) ? 0 : 1;
+
+	low = P_FIRSTDATAKEY(opaque);
+	high = PageGetMaxOffsetNumber(page);
+	compare_offset = ScanDirectionIsForward(dir) ? high : low;
+
+	return _bt_compare(scan->indexRelation,
+					   key, page, compare_offset) > compare_value;
+}
diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c
index 45472db147..dc151ecf09 100644
--- a/src/backend/access/spgist/spgutils.c
+++ b/src/backend/access/spgist/spgutils.c
@@ -64,6 +64,7 @@ spghandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = spgbulkdelete;
 	amroutine->amvacuumcleanup = spgvacuumcleanup;
 	amroutine->amcanreturn = spgcanreturn;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = spgcostestimate;
 	amroutine->amoptions = spgoptions;
 	amroutine->amproperty = spgproperty;
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 92969636b7..19a504c312 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1363,6 +1363,14 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			{
 				IndexScan  *indexscan = (IndexScan *) plan;
 
+				if (indexscan->skipPrefixSize > 0)
+				{
+					if (es->format != EXPLAIN_FORMAT_TEXT)
+						ExplainPropertyInteger("Distinct Prefix", NULL,
+											   indexscan->skipPrefixSize,
+											   es);
+				}
+
 				ExplainIndexScanDetails(indexscan->indexid,
 										indexscan->indexorderdir,
 										es);
@@ -1373,6 +1381,14 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			{
 				IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan;
 
+				if (indexonlyscan->skipPrefixSize > 0)
+				{
+					if (es->format != EXPLAIN_FORMAT_TEXT)
+						ExplainPropertyInteger("Distinct Prefix", NULL,
+											   indexonlyscan->skipPrefixSize,
+											   es);
+				}
+
 				ExplainIndexScanDetails(indexonlyscan->indexid,
 										indexonlyscan->indexorderdir,
 										es);
@@ -1582,6 +1598,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
 	switch (nodeTag(plan))
 	{
 		case T_IndexScan:
+			if (((IndexScan *) plan)->skipPrefixSize > 0)
+			{
+				ExplainPropertyText("Scan mode", "Skip scan", es);
+			}
 			show_scan_qual(((IndexScan *) plan)->indexqualorig,
 						   "Index Cond", planstate, ancestors, es);
 			if (((IndexScan *) plan)->indexqualorig)
@@ -1595,6 +1615,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
 										   planstate, es);
 			break;
 		case T_IndexOnlyScan:
+			if (((IndexOnlyScan *) plan)->skipPrefixSize > 0)
+			{
+				ExplainPropertyText("Scan mode", "Skip scan", es);
+			}
 			show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
 						   "Index Cond", planstate, ancestors, es);
 			if (((IndexOnlyScan *) plan)->indexqual)
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 8a4d795d1a..15e2ff7b1b 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -115,6 +115,24 @@ IndexOnlyNext(IndexOnlyScanState *node)
 						 node->ioss_NumOrderByKeys);
 	}
 
+	/*
+	 * Check if we need to skip to the next key prefix, because we've been
+	 * asked to implement DISTINCT.
+	 */
+	if (node->ioss_SkipPrefixSize > 0 && node->ioss_FirstTupleEmitted)
+	{
+		if (!index_skip(scandesc, direction, node->ioss_SkipPrefixSize))
+		{
+			/* Reached end of index. At this point currPos is invalidated,
+			 * and we need to reset ioss_FirstTupleEmitted, since otherwise
+			 * after going backwards, reaching the end of index, and going
+			 * forward again we apply skip again. It would be incorrect and
+			 * lead to an extra skipped item. */
+			node->ioss_FirstTupleEmitted = false;
+			return ExecClearTuple(slot);
+		}
+	}
+
 	/*
 	 * OK, now that we have what we need, fetch the next tuple.
 	 */
@@ -253,6 +271,8 @@ IndexOnlyNext(IndexOnlyScanState *node)
 							  ItemPointerGetBlockNumber(tid),
 							  estate->es_snapshot);
 
+		node->ioss_FirstTupleEmitted = true;
+
 		return slot;
 	}
 
@@ -503,6 +523,8 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
 	indexstate->ss.ps.plan = (Plan *) node;
 	indexstate->ss.ps.state = estate;
 	indexstate->ss.ps.ExecProcNode = ExecIndexOnlyScan;
+	indexstate->ioss_SkipPrefixSize = node->skipPrefixSize;
+	indexstate->ioss_FirstTupleEmitted = false;
 
 	/*
 	 * Miscellaneous initialization
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index ac7aa81f67..752c077d8b 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -116,6 +116,7 @@ IndexNext(IndexScanState *node)
 								   node->iss_NumOrderByKeys);
 
 		node->iss_ScanDesc = scandesc;
+		node->iss_ScanDesc->xs_want_itup = true;
 
 		/*
 		 * If no run-time keys to calculate or they are ready, go ahead and
@@ -127,6 +128,24 @@ IndexNext(IndexScanState *node)
 						 node->iss_OrderByKeys, node->iss_NumOrderByKeys);
 	}
 
+	/*
+	 * Check if we need to skip to the next key prefix, because we've been
+	 * asked to implement DISTINCT.
+	 */
+	if (node->ioss_SkipPrefixSize > 0 && node->ioss_FirstTupleEmitted)
+	{
+		if (!index_skip(scandesc, direction, node->ioss_SkipPrefixSize))
+		{
+			/* Reached end of index. At this point currPos is invalidated,
+			 * and we need to reset ioss_FirstTupleEmitted, since otherwise
+			 * after going backwards, reaching the end of index, and going
+			 * forward again we apply skip again. It would be incorrect and
+			 * lead to an extra skipped item. */
+			node->ioss_FirstTupleEmitted = false;
+			return ExecClearTuple(slot);
+		}
+	}
+
 	/*
 	 * ok, now that we have what we need, fetch the next tuple.
 	 */
@@ -149,6 +168,7 @@ IndexNext(IndexScanState *node)
 			}
 		}
 
+		node->ioss_FirstTupleEmitted = true;
 		return slot;
 	}
 
@@ -906,6 +926,8 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
 	indexstate->ss.ps.plan = (Plan *) node;
 	indexstate->ss.ps.state = estate;
 	indexstate->ss.ps.ExecProcNode = ExecIndexScan;
+	indexstate->ioss_SkipPrefixSize = node->skipPrefixSize;
+	indexstate->ioss_FirstTupleEmitted = false;
 
 	/*
 	 * Miscellaneous initialization
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 78deade89b..98d7107caa 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -490,6 +490,7 @@ _copyIndexScan(const IndexScan *from)
 	COPY_NODE_FIELD(indexorderbyorig);
 	COPY_NODE_FIELD(indexorderbyops);
 	COPY_SCALAR_FIELD(indexorderdir);
+	COPY_SCALAR_FIELD(skipPrefixSize);
 
 	return newnode;
 }
@@ -515,6 +516,7 @@ _copyIndexOnlyScan(const IndexOnlyScan *from)
 	COPY_NODE_FIELD(indexorderby);
 	COPY_NODE_FIELD(indextlist);
 	COPY_SCALAR_FIELD(indexorderdir);
+	COPY_SCALAR_FIELD(skipPrefixSize);
 
 	return newnode;
 }
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 8400dd319e..15200f2e3a 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -559,6 +559,7 @@ _outIndexScan(StringInfo str, const IndexScan *node)
 	WRITE_NODE_FIELD(indexorderbyorig);
 	WRITE_NODE_FIELD(indexorderbyops);
 	WRITE_ENUM_FIELD(indexorderdir, ScanDirection);
+	WRITE_INT_FIELD(skipPrefixSize);
 }
 
 static void
@@ -573,6 +574,7 @@ _outIndexOnlyScan(StringInfo str, const IndexOnlyScan *node)
 	WRITE_NODE_FIELD(indexorderby);
 	WRITE_NODE_FIELD(indextlist);
 	WRITE_ENUM_FIELD(indexorderdir, ScanDirection);
+	WRITE_INT_FIELD(skipPrefixSize);
 }
 
 static void
@@ -2208,6 +2210,7 @@ _outPlannerInfo(StringInfo str, const PlannerInfo *node)
 	WRITE_NODE_FIELD(group_pathkeys);
 	WRITE_NODE_FIELD(window_pathkeys);
 	WRITE_NODE_FIELD(distinct_pathkeys);
+	WRITE_NODE_FIELD(uniq_distinct_pathkeys);
 	WRITE_NODE_FIELD(sort_pathkeys);
 	WRITE_NODE_FIELD(processed_tlist);
 	WRITE_NODE_FIELD(minmax_aggs);
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index 6c2626ee62..3652b244dd 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -1787,6 +1787,7 @@ _readIndexScan(void)
 	READ_NODE_FIELD(indexorderbyorig);
 	READ_NODE_FIELD(indexorderbyops);
 	READ_ENUM_FIELD(indexorderdir, ScanDirection);
+	READ_INT_FIELD(skipPrefixSize);
 
 	READ_DONE();
 }
@@ -1806,6 +1807,7 @@ _readIndexOnlyScan(void)
 	READ_NODE_FIELD(indexorderby);
 	READ_NODE_FIELD(indextlist);
 	READ_ENUM_FIELD(indexorderdir, ScanDirection);
+	READ_INT_FIELD(skipPrefixSize);
 
 	READ_DONE();
 }
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index a2a9b1f7be..6e0fe90e5c 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -124,6 +124,7 @@ int			max_parallel_workers_per_gather = 2;
 bool		enable_seqscan = true;
 bool		enable_indexscan = true;
 bool		enable_indexonlyscan = true;
+bool		enable_indexskipscan = true;
 bool		enable_bitmapscan = true;
 bool		enable_tidscan = true;
 bool		enable_sort = true;
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index 08b5061612..af7d9c4270 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -29,6 +29,7 @@
 #include "utils/lsyscache.h"
 
 
+static bool pathkey_is_unique(PathKey *new_pathkey, List *pathkeys);
 static bool pathkey_is_redundant(PathKey *new_pathkey, List *pathkeys);
 static bool matches_boolean_partition_clause(RestrictInfo *rinfo,
 											 RelOptInfo *partrel,
@@ -94,6 +95,30 @@ make_canonical_pathkey(PlannerInfo *root,
 	return pk;
 }
 
+/*
+ * pathkey_is_unique
+ *	   Part of pathkey_is_redundant, that is reponsible for the case, when the
+ *	   new pathkey's equivalence class is the same as that of any existing
+ *	   member of the pathkey list.
+ */
+static bool
+pathkey_is_unique(PathKey *new_pathkey, List *pathkeys)
+{
+	EquivalenceClass *new_ec = new_pathkey->pk_eclass;
+	ListCell   *lc;
+
+	/* If same EC already used in list, then redundant */
+	foreach(lc, pathkeys)
+	{
+		PathKey    *old_pathkey = (PathKey *) lfirst(lc);
+
+		if (new_ec == old_pathkey->pk_eclass)
+			return true;
+	}
+
+	return false;
+}
+
 /*
  * pathkey_is_redundant
  *	   Is a pathkey redundant with one already in the given list?
@@ -133,22 +158,12 @@ static bool
 pathkey_is_redundant(PathKey *new_pathkey, List *pathkeys)
 {
 	EquivalenceClass *new_ec = new_pathkey->pk_eclass;
-	ListCell   *lc;
 
 	/* Check for EC containing a constant --- unconditionally redundant */
 	if (EC_MUST_BE_REDUNDANT(new_ec))
 		return true;
 
-	/* If same EC already used in list, then redundant */
-	foreach(lc, pathkeys)
-	{
-		PathKey    *old_pathkey = (PathKey *) lfirst(lc);
-
-		if (new_ec == old_pathkey->pk_eclass)
-			return true;
-	}
-
-	return false;
+	return pathkey_is_unique(new_pathkey, pathkeys);
 }
 
 /*
@@ -1096,6 +1111,53 @@ make_pathkeys_for_sortclauses(PlannerInfo *root,
 	return pathkeys;
 }
 
+/*
+ * make_pathkeys_for_distinctclauses
+ *		Generate a pathkeys list for distinct clauses, that represents the sort
+ *		order specified by a list of SortGroupClauses. Similar to
+ *		make_pathkeys_for_sortclauses, but allows to specify if we need to
+ *		check the full redundancy, or just uniqueness.
+ */
+List *
+make_pathkeys_for_distinctclauses(PlannerInfo *root,
+								  List *distinctclauses,
+								  List *tlist, bool checkRedundant)
+{
+	List	   *pathkeys = NIL;
+	ListCell   *l;
+
+	foreach(l, distinctclauses)
+	{
+		SortGroupClause *sortcl = (SortGroupClause *) lfirst(l);
+		Expr	   *sortkey;
+		PathKey    *pathkey;
+
+		sortkey = (Expr *) get_sortgroupclause_expr(sortcl, tlist);
+		Assert(OidIsValid(sortcl->sortop));
+		pathkey = make_pathkey_from_sortop(root,
+										   sortkey,
+										   root->nullable_baserels,
+										   sortcl->sortop,
+										   sortcl->nulls_first,
+										   sortcl->tleSortGroupRef,
+										   true);
+
+		/* Canonical form eliminates redundant ordering keys */
+		if (checkRedundant)
+		{
+			if (!pathkey_is_redundant(pathkey, pathkeys))
+				pathkeys = lappend(pathkeys, pathkey);
+		}
+		else
+		{
+			if (!pathkey_is_unique(pathkey, pathkeys))
+				pathkeys = lappend(pathkeys, pathkey);
+		}
+	}
+	return pathkeys;
+}
+
+
 /****************************************************************************
  *		PATHKEYS AND MERGECLAUSES
  ****************************************************************************/
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 608d5adfed..e2c53a0d05 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -175,12 +175,14 @@ static IndexScan *make_indexscan(List *qptlist, List *qpqual, Index scanrelid,
 								 Oid indexid, List *indexqual, List *indexqualorig,
 								 List *indexorderby, List *indexorderbyorig,
 								 List *indexorderbyops,
-								 ScanDirection indexscandir);
+								 ScanDirection indexscandir,
+								 int skipprefix);
 static IndexOnlyScan *make_indexonlyscan(List *qptlist, List *qpqual,
 										 Index scanrelid, Oid indexid,
 										 List *indexqual, List *indexorderby,
 										 List *indextlist,
-										 ScanDirection indexscandir);
+										 ScanDirection indexscandir,
+										 int skipprefix);
 static BitmapIndexScan *make_bitmap_indexscan(Index scanrelid, Oid indexid,
 											  List *indexqual,
 											  List *indexqualorig);
@@ -2903,7 +2905,8 @@ create_indexscan_plan(PlannerInfo *root,
 												fixed_indexquals,
 												fixed_indexorderbys,
 												best_path->indexinfo->indextlist,
-												best_path->indexscandir);
+												best_path->indexscandir,
+												best_path->indexskipprefix);
 	else
 		scan_plan = (Scan *) make_indexscan(tlist,
 											qpqual,
@@ -2914,7 +2917,8 @@ create_indexscan_plan(PlannerInfo *root,
 											fixed_indexorderbys,
 											indexorderbys,
 											indexorderbyops,
-											best_path->indexscandir);
+											best_path->indexscandir,
+											best_path->indexskipprefix);
 
 	copy_generic_path_info(&scan_plan->plan, &best_path->path);
 
@@ -5150,7 +5154,8 @@ make_indexscan(List *qptlist,
 			   List *indexorderby,
 			   List *indexorderbyorig,
 			   List *indexorderbyops,
-			   ScanDirection indexscandir)
+			   ScanDirection indexscandir,
+			   int skipPrefixSize)
 {
 	IndexScan  *node = makeNode(IndexScan);
 	Plan	   *plan = &node->scan.plan;
@@ -5167,6 +5172,7 @@ make_indexscan(List *qptlist,
 	node->indexorderbyorig = indexorderbyorig;
 	node->indexorderbyops = indexorderbyops;
 	node->indexorderdir = indexscandir;
+	node->skipPrefixSize = skipPrefixSize;
 
 	return node;
 }
@@ -5179,7 +5185,8 @@ make_indexonlyscan(List *qptlist,
 				   List *indexqual,
 				   List *indexorderby,
 				   List *indextlist,
-				   ScanDirection indexscandir)
+				   ScanDirection indexscandir,
+				   int skipPrefixSize)
 {
 	IndexOnlyScan *node = makeNode(IndexOnlyScan);
 	Plan	   *plan = &node->scan.plan;
@@ -5194,6 +5201,7 @@ make_indexonlyscan(List *qptlist,
 	node->indexorderby = indexorderby;
 	node->indextlist = indextlist;
 	node->indexorderdir = indexscandir;
+	node->skipPrefixSize = skipPrefixSize;
 
 	return node;
 }
diff --git a/src/backend/optimizer/plan/planagg.c b/src/backend/optimizer/plan/planagg.c
index 9381939c82..ed52139839 100644
--- a/src/backend/optimizer/plan/planagg.c
+++ b/src/backend/optimizer/plan/planagg.c
@@ -505,6 +505,7 @@ minmax_qp_callback(PlannerInfo *root, void *extra)
 	root->group_pathkeys = NIL;
 	root->window_pathkeys = NIL;
 	root->distinct_pathkeys = NIL;
+	root->uniq_distinct_pathkeys = NIL;
 
 	root->sort_pathkeys =
 		make_pathkeys_for_sortclauses(root,
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index cb897cc7f4..663be21597 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -3615,12 +3615,21 @@ standard_qp_callback(PlannerInfo *root, void *extra)
 
 	if (parse->distinctClause &&
 		grouping_is_sortable(parse->distinctClause))
+	{
+		root->uniq_distinct_pathkeys =
+			make_pathkeys_for_distinctclauses(root,
+											  parse->distinctClause,
+											  tlist, false);
 		root->distinct_pathkeys =
-			make_pathkeys_for_sortclauses(root,
-										  parse->distinctClause,
-										  tlist);
+			make_pathkeys_for_distinctclauses(root,
+											  parse->distinctClause,
+											  tlist, true);
+	}
 	else
+	{
 		root->distinct_pathkeys = NIL;
+		root->uniq_distinct_pathkeys = NIL;
+	}
 
 	root->sort_pathkeys =
 		make_pathkeys_for_sortclauses(root,
@@ -4807,6 +4816,70 @@ create_distinct_paths(PlannerInfo *root,
 												  path,
 												  list_length(root->distinct_pathkeys),
 												  numDistinctRows));
+
+				/* Consider index skip scan as well */
+				if (enable_indexskipscan &&
+					IsA(path, IndexPath) &&
+					((IndexPath *) path)->indexinfo->amcanskip &&
+					(path->pathtype == T_IndexOnlyScan ||
+					 path->pathtype == T_IndexScan) &&
+					root->distinct_pathkeys != NIL)
+				{
+					ListCell   		*lc;
+					IndexOptInfo 	*index = NULL;
+					bool 			different_columns_order = false,
+									not_empty_qual = false;
+					int 			i = 0;
+
+					index = ((IndexPath *) path)->indexinfo;
+
+					/*
+					 * The order of columns in the index should be the same, as for
+					 * unique distincs pathkeys, otherwise we cannot use _bt_search
+					 * in the skip implementation - this can lead to a missing
+					 * records.
+					 */
+					foreach(lc, root->uniq_distinct_pathkeys)
+					{
+						PathKey *pathKey = lfirst_node(PathKey, lc);
+						EquivalenceMember *em =
+							lfirst_node(EquivalenceMember,
+										list_head(pathKey->pk_eclass->ec_members));
+						Var *var = (Var *) em->em_expr;
+
+						Assert(i < index->ncolumns);
+
+						if (index->indexkeys[i] != var->varattno)
+						{
+							different_columns_order = true;
+							break;
+						}
+
+						i++;
+					}
+
+					/*
+					 * XXX: In case of index scan quals evaluation happens after
+					 * ExecScanFetch, which means skip results could be fitered out
+					 */
+					if (path->pathtype == T_IndexScan &&
+						parse->jointree != NULL &&
+						parse->jointree->quals != NULL &&
+						((List *)parse->jointree->quals)->length != 0)
+							not_empty_qual = true;
+
+					if (!different_columns_order &&	!not_empty_qual)
+					{
+						int distinctPrefixKeys =
+							list_length(root->uniq_distinct_pathkeys);
+						add_path(distinct_rel, (Path *)
+								 create_skipscan_unique_path(root,
+															 distinct_rel,
+															 path,
+															 distinctPrefixKeys,
+															 numDistinctRows));
+					}
+				}
 			}
 		}
 
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index d884d2bb00..df9b57215f 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -2928,6 +2928,46 @@ create_upper_unique_path(PlannerInfo *root,
 	return pathnode;
 }
 
+/*
+ * create_skipscan_unique_path
+ *	  Creates a pathnode the same as an existing IndexPath except based on
+ *	  skipping duplicate values.  This may or may not be cheaper than using
+ *	  create_upper_unique_path.
+ *
+ * The input path must be an IndexPath for an index that supports amskip.
+ */
+IndexPath *
+create_skipscan_unique_path(PlannerInfo *root,
+							RelOptInfo *rel,
+							Path *basepath,
+							int distinctPrefixKeys,
+							double numGroups)
+{
+	IndexPath *pathnode = makeNode(IndexPath);
+
+	Assert(IsA(basepath, IndexPath));
+
+	/* We don't want to modify basepath, so make a copy. */
+	memcpy(pathnode, basepath, sizeof(IndexPath));
+
+	/* The size of the prefix we'll use for skipping. */
+	Assert(pathnode->indexinfo->amcanskip);
+	Assert(distinctPrefixKeys > 0);
+	/*Assert(distinctPrefixKeys <= list_length(pathnode->path.pathkeys));*/
+	pathnode->indexskipprefix = distinctPrefixKeys;
+
+	/*
+	 * The cost to skip to each distinct value should be roughly the same as
+	 * the cost of finding the first key times the number of distinct values
+	 * we expect to find.
+	 */
+	pathnode->path.startup_cost = basepath->startup_cost;
+	pathnode->path.total_cost = basepath->startup_cost * numGroups;
+	pathnode->path.rows = numGroups;
+
+	return pathnode;
+}
+
 /*
  * create_agg_path
  *	  Creates a pathnode that represents performing aggregation/grouping
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 40f497660d..8c05b3bb5c 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -269,6 +269,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			info->amoptionalkey = amroutine->amoptionalkey;
 			info->amsearcharray = amroutine->amsearcharray;
 			info->amsearchnulls = amroutine->amsearchnulls;
+			info->amcanskip = (amroutine->amskip != NULL);
 			info->amcanparallel = amroutine->amcanparallel;
 			info->amhasgettuple = (amroutine->amgettuple != NULL);
 			info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 1208eb9a68..007c8ac14e 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -912,6 +912,15 @@ static struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_indexskipscan", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enables the planner's use of index-skip-scan plans."),
+			NULL
+		},
+		&enable_indexskipscan,
+		true,
+		NULL, NULL, NULL
+	},
 	{
 		{"enable_bitmapscan", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of bitmap-scan plans."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 5ee5e09ddf..99facc8f50 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -351,6 +351,7 @@
 #enable_hashjoin = on
 #enable_indexscan = on
 #enable_indexonlyscan = on
+#enable_indexskipscan = on
 #enable_material = on
 #enable_mergejoin = on
 #enable_nestloop = on
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index 6e3db06eed..34033c5486 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -130,6 +130,10 @@ typedef void (*amrescan_function) (IndexScanDesc scan,
 typedef bool (*amgettuple_function) (IndexScanDesc scan,
 									 ScanDirection direction);
 
+/* skip past duplicates in a given prefix */
+typedef bool (*amskip_function) (IndexScanDesc scan,
+								 ScanDirection dir, int prefix);
+
 /* fetch all valid tuples */
 typedef int64 (*amgetbitmap_function) (IndexScanDesc scan,
 									   TIDBitmap *tbm);
@@ -225,6 +229,7 @@ typedef struct IndexAmRoutine
 	amendscan_function amendscan;
 	ammarkpos_function ammarkpos;	/* can be NULL */
 	amrestrpos_function amrestrpos; /* can be NULL */
+	amskip_function amskip;				/* can be NULL */
 
 	/* interface functions to support parallel index scans */
 	amestimateparallelscan_function amestimateparallelscan; /* can be NULL */
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 8c053be2ca..2e79098b85 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -173,6 +173,7 @@ extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info,
 extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info,
 												   IndexBulkDeleteResult *stats);
 extern bool index_can_return(Relation indexRelation, int attno);
+extern bool index_skip(IndexScanDesc scan, ScanDirection direction, int prefix);
 extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum,
 									uint16 procnum);
 extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum,
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index a3583f225b..247cdb8127 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -663,6 +663,9 @@ typedef struct BTScanOpaqueData
 	 */
 	int			markItemIndex;	/* itemIndex, or -1 if not valid */
 
+	/* Work space for _bt_skip */
+	BTScanInsert	skipScanKey;	/* used to control skipping */
+
 	/* keep these last in struct for efficiency */
 	BTScanPosData currPos;		/* current position data */
 	BTScanPosData markPos;		/* marked position, if any */
@@ -777,6 +780,7 @@ extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
 extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
+extern bool _bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix);
 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
 							   Snapshot snapshot);
 
@@ -801,6 +805,7 @@ extern void _bt_end_vacuum_callback(int code, Datum arg);
 extern Size BTreeShmemSize(void);
 extern void BTreeShmemInit(void);
 extern bytea *btoptions(Datum reloptions, bool validate);
+extern bool btskip(IndexScanDesc scan, ScanDirection dir, int prefix);
 extern bool btproperty(Oid index_oid, int attno,
 					   IndexAMProperty prop, const char *propname,
 					   bool *res, bool *isnull);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 99b9fa414f..08096ec68b 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1377,6 +1377,8 @@ typedef struct IndexScanState
 	ExprContext *iss_RuntimeContext;
 	Relation	iss_RelationDesc;
 	struct IndexScanDescData *iss_ScanDesc;
+	int         ioss_SkipPrefixSize;
+	bool		ioss_FirstTupleEmitted;
 
 	/* These are needed for re-checking ORDER BY expr ordering */
 	pairingheap *iss_ReorderQueue;
@@ -1406,6 +1408,9 @@ typedef struct IndexScanState
  *		TableSlot		   slot for holding tuples fetched from the table
  *		VMBuffer		   buffer in use for visibility map testing, if any
  *		PscanLen		   size of parallel index-only scan descriptor
+ *		NumDistinctKeys	   number of keys for skip-based DISTINCT
+ *		SkipPrefixSize	   number of keys for skip-based DISTINCT
+ *		FirstTupleEmitted  has the first tuple been emitted
  * ----------------
  */
 typedef struct IndexOnlyScanState
@@ -1424,6 +1429,8 @@ typedef struct IndexOnlyScanState
 	struct IndexScanDescData *ioss_ScanDesc;
 	TupleTableSlot *ioss_TableSlot;
 	Buffer		ioss_VMBuffer;
+	int         ioss_SkipPrefixSize;
+	bool		ioss_FirstTupleEmitted;
 	Size		ioss_PscanLen;
 } IndexOnlyScanState;
 
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 4b7703d478..e571c84473 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -298,6 +298,9 @@ struct PlannerInfo
 	List	   *group_pathkeys; /* groupClause pathkeys, if any */
 	List	   *window_pathkeys;	/* pathkeys of bottom window, if any */
 	List	   *distinct_pathkeys;	/* distinctClause pathkeys, if any */
+	List	   *uniq_distinct_pathkeys;	/* unique, but not necessarily not
+										   redundant distinctClause pathkeys,
+										   if any */
 	List	   *sort_pathkeys;	/* sortClause pathkeys, if any */
 
 	List	   *part_schemes;	/* Canonicalised partition schemes used in the
@@ -829,6 +832,7 @@ struct IndexOptInfo
 	bool		amsearchnulls;	/* can AM search for NULL/NOT NULL entries? */
 	bool		amhasgettuple;	/* does AM have amgettuple interface? */
 	bool		amhasgetbitmap; /* does AM have amgetbitmap interface? */
+	bool		amcanskip;		/* can AM skip duplicate values? */
 	bool		amcanparallel;	/* does AM support parallel scan? */
 	/* Rather than include amapi.h here, we declare amcostestimate like this */
 	void		(*amcostestimate) ();	/* AM's cost estimator */
@@ -1165,6 +1169,9 @@ typedef struct Path
  * we need not recompute them when considering using the same index in a
  * bitmap index/heap scan (see BitmapHeapPath).  The costs of the IndexPath
  * itself represent the costs of an IndexScan or IndexOnlyScan plan type.
+ *
+ * 'indexskipprefix' represents the number of columns to consider for skip
+ * scans.
  *----------
  */
 typedef struct IndexPath
@@ -1177,6 +1184,7 @@ typedef struct IndexPath
 	ScanDirection indexscandir;
 	Cost		indextotalcost;
 	Selectivity indexselectivity;
+	int			indexskipprefix;
 } IndexPath;
 
 /*
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 70f8b8e22b..4cdee103ec 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -405,6 +405,7 @@ typedef struct IndexScan
 	List	   *indexorderbyorig;	/* the same in original form */
 	List	   *indexorderbyops;	/* OIDs of sort ops for ORDER BY exprs */
 	ScanDirection indexorderdir;	/* forward or backward or don't care */
+	int			skipPrefixSize;	/* the size of the prefix for distinct scans */
 } IndexScan;
 
 /* ----------------
@@ -432,6 +433,7 @@ typedef struct IndexOnlyScan
 	List	   *indexorderby;	/* list of index ORDER BY exprs */
 	List	   *indextlist;		/* TargetEntry list describing index's cols */
 	ScanDirection indexorderdir;	/* forward or backward or don't care */
+	int			skipPrefixSize;	/* the size of the prefix for distinct scans */
 } IndexOnlyScan;
 
 /* ----------------
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 9b6bdbc518..ad28c7f54a 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -50,6 +50,7 @@ extern PGDLLIMPORT int max_parallel_workers_per_gather;
 extern PGDLLIMPORT bool enable_seqscan;
 extern PGDLLIMPORT bool enable_indexscan;
 extern PGDLLIMPORT bool enable_indexonlyscan;
+extern PGDLLIMPORT bool enable_indexskipscan;
 extern PGDLLIMPORT bool enable_bitmapscan;
 extern PGDLLIMPORT bool enable_tidscan;
 extern PGDLLIMPORT bool enable_sort;
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index e70d6a3f18..fa461201a7 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -202,6 +202,11 @@ extern UpperUniquePath *create_upper_unique_path(PlannerInfo *root,
 												 Path *subpath,
 												 int numCols,
 												 double numGroups);
+extern IndexPath *create_skipscan_unique_path(PlannerInfo *root,
+											  RelOptInfo *rel,
+											  Path *subpath,
+											  int numCols,
+											  double numGroups);
 extern AggPath *create_agg_path(PlannerInfo *root,
 								RelOptInfo *rel,
 								Path *subpath,
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index 7345137d1d..a782d12a50 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -209,6 +209,10 @@ extern List *build_join_pathkeys(PlannerInfo *root,
 extern List *make_pathkeys_for_sortclauses(PlannerInfo *root,
 										   List *sortclauses,
 										   List *tlist);
+extern List *make_pathkeys_for_distinctclauses(PlannerInfo *root,
+											   List *sortclauses,
+											   List *tlist,
+											   bool checkRedundant);
 extern void initialize_mergeclause_eclasses(PlannerInfo *root,
 											RestrictInfo *restrictinfo);
 extern void update_mergeclause_eclasses(PlannerInfo *root,
diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out
index 5305b53cac..056de928fe 100644
--- a/src/test/regress/expected/create_index.out
+++ b/src/test/regress/expected/create_index.out
@@ -19,6 +19,7 @@ CREATE INDEX tenk1_unique1 ON tenk1 USING btree(unique1 int4_ops);
 CREATE INDEX tenk1_unique2 ON tenk1 USING btree(unique2 int4_ops);
 CREATE INDEX tenk1_hundred ON tenk1 USING btree(hundred int4_ops);
 CREATE INDEX tenk1_thous_tenthous ON tenk1 (thousand, tenthous);
+CREATE INDEX tenk1_four ON tenk1 (four);
 CREATE INDEX tenk2_unique1 ON tenk2 USING btree(unique1 int4_ops);
 CREATE INDEX tenk2_unique2 ON tenk2 USING btree(unique2 int4_ops);
 CREATE INDEX tenk2_hundred ON tenk2 USING btree(hundred int4_ops);
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
index f3696c6d1d..e79b9e8274 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -244,3 +244,212 @@ SELECT null IS NOT DISTINCT FROM null as "yes";
  t
 (1 row)
 
+-- index only skip scan
+SELECT DISTINCT four FROM tenk1;
+ four 
+------
+    0
+    1
+    2
+    3
+(4 rows)
+
+SELECT DISTINCT four FROM tenk1 WHERE four = 1;
+ four 
+------
+    1
+(1 row)
+
+-- index skip scan
+SELECT DISTINCT ON (four) four, ten
+FROM tenk1 ORDER BY four;
+ four | ten 
+------+-----
+    0 |   0
+    1 |   9
+    2 |   0
+    3 |   1
+(4 rows)
+
+SELECT DISTINCT ON (four) four, ten
+FROM tenk1 WHERE four = 1 ORDER BY four;
+ four | ten 
+------+-----
+    1 |   9
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (four) four, ten
+FROM tenk1 ORDER BY four;
+              QUERY PLAN              
+--------------------------------------
+ Index Scan using tenk1_four on tenk1
+   Scan mode: Skip scan
+(2 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (four) four, ten
+FROM tenk1 WHERE four = 1 ORDER BY four;
+                    QUERY PLAN                     
+---------------------------------------------------
+ Result
+   ->  Unique
+         ->  Bitmap Heap Scan on tenk1
+               Recheck Cond: (four = 1)
+               ->  Bitmap Index Scan on tenk1_four
+                     Index Cond: (four = 1)
+(6 rows)
+
+-- check colums order
+SELECT DISTINCT four FROM tenk1 WHERE ten = 2;
+ four 
+------
+    0
+    2
+(2 rows)
+
+CREATE INDEX tenk1_four_ten on tenk1 (four, ten);
+SELECT DISTINCT four FROM tenk1 WHERE ten = 2;
+ four 
+------
+    0
+    2
+(2 rows)
+
+SELECT DISTINCT on (four, ten) four, ten FROM tenk1 WHERE four = 0;
+ four | ten 
+------+-----
+    0 |   0
+    0 |   2
+    0 |   4
+    0 |   6
+    0 |   8
+(5 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE ten = 2;
+                  QUERY PLAN                   
+-----------------------------------------------
+ Index Only Scan using tenk1_four_ten on tenk1
+   Scan mode: Skip scan
+   Index Cond: (ten = 2)
+(3 rows)
+
+-- test uniq_distinct_pathkeys
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT on (four, ten) four, ten FROM tenk1 WHERE four = 0;
+                  QUERY PLAN                   
+-----------------------------------------------
+ Index Only Scan using tenk1_four_ten on tenk1
+   Scan mode: Skip scan
+   Index Cond: (four = 0)
+(3 rows)
+
+DROP INDEX tenk1_four_ten;
+CREATE INDEX tenk1_ten_four on tenk1 (ten, four);
+SELECT DISTINCT four FROM tenk1 WHERE ten = 2;
+ four 
+------
+    0
+    2
+(2 rows)
+
+SELECT DISTINCT on (four, ten) four, ten FROM tenk1 WHERE ten = 2;
+ four | ten 
+------+-----
+    0 |   2
+    2 |   2
+(2 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE ten = 2;
+                     QUERY PLAN                      
+-----------------------------------------------------
+ Unique
+   ->  Index Only Scan using tenk1_ten_four on tenk1
+         Index Cond: (ten = 2)
+(3 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT on (four, ten) four, ten FROM tenk1 WHERE ten = 2;
+                     QUERY PLAN                      
+-----------------------------------------------------
+ Unique
+   ->  Index Only Scan using tenk1_ten_four on tenk1
+         Index Cond: (ten = 2)
+(3 rows)
+
+DROP INDEX tenk1_ten_four;
+-- check projection case
+SELECT DISTINCT four, four FROM tenk1 WHERE ten = 2;
+ four | four 
+------+------
+    0 |    0
+    2 |    2
+(2 rows)
+
+SELECT DISTINCT four, 1 FROM tenk1 WHERE ten = 2;
+ four | ?column? 
+------+----------
+    2 |        1
+    0 |        1
+(2 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1;
+                QUERY PLAN                 
+-------------------------------------------
+ Index Only Scan using tenk1_four on tenk1
+   Scan mode: Skip scan
+(2 rows)
+
+BEGIN;
+DECLARE c SCROLL CURSOR FOR SELECT DISTINCT four FROM tenk1;
+FETCH FROM c;
+ four 
+------
+    0
+(1 row)
+
+FETCH BACKWARD FROM c;
+ four 
+------
+(0 rows)
+
+FETCH 5 FROM c;
+ four 
+------
+    0
+    1
+    2
+    3
+(4 rows)
+
+FETCH BACKWARD 5 FROM c;
+ four 
+------
+    3
+    2
+    1
+    0
+(4 rows)
+
+FETCH 5 FROM c;
+ four 
+------
+    0
+    1
+    2
+    3
+(4 rows)
+
+FETCH BACKWARD 5 FROM c;
+ four 
+------
+    3
+    2
+    1
+    0
+(4 rows)
+
+END;
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index a1c90eb905..bd3b373515 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -78,6 +78,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_hashjoin                | on
  enable_indexonlyscan           | on
  enable_indexscan               | on
+ enable_indexskipscan           | on
  enable_material                | on
  enable_mergejoin               | on
  enable_nestloop                | on
@@ -89,7 +90,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(17 rows)
+(18 rows)
 
 -- Test that the pg_timezone_names and pg_timezone_abbrevs views are
 -- more-or-less working.  We can't test their contents in any great detail
diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql
index cd46f071bd..04760639a8 100644
--- a/src/test/regress/sql/create_index.sql
+++ b/src/test/regress/sql/create_index.sql
@@ -26,6 +26,8 @@ CREATE INDEX tenk1_hundred ON tenk1 USING btree(hundred int4_ops);
 
 CREATE INDEX tenk1_thous_tenthous ON tenk1 (thousand, tenthous);
 
+CREATE INDEX tenk1_four ON tenk1 (four);
+
 CREATE INDEX tenk2_unique1 ON tenk2 USING btree(unique1 int4_ops);
 
 CREATE INDEX tenk2_unique2 ON tenk2 USING btree(unique2 int4_ops);
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql
index a605e86449..bbe1978305 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -73,3 +73,71 @@ SELECT 1 IS NOT DISTINCT FROM 2 as "no";
 SELECT 2 IS NOT DISTINCT FROM 2 as "yes";
 SELECT 2 IS NOT DISTINCT FROM null as "no";
 SELECT null IS NOT DISTINCT FROM null as "yes";
+
+-- index only skip scan
+SELECT DISTINCT four FROM tenk1;
+SELECT DISTINCT four FROM tenk1 WHERE four = 1;
+
+-- index skip scan
+SELECT DISTINCT ON (four) four, ten
+FROM tenk1 ORDER BY four;
+SELECT DISTINCT ON (four) four, ten
+FROM tenk1 WHERE four = 1 ORDER BY four;
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (four) four, ten
+FROM tenk1 ORDER BY four;
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (four) four, ten
+FROM tenk1 WHERE four = 1 ORDER BY four;
+
+-- check colums order
+SELECT DISTINCT four FROM tenk1 WHERE ten = 2;
+
+CREATE INDEX tenk1_four_ten on tenk1 (four, ten);
+
+SELECT DISTINCT four FROM tenk1 WHERE ten = 2;
+SELECT DISTINCT on (four, ten) four, ten FROM tenk1 WHERE four = 0;
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE ten = 2;
+
+-- test uniq_distinct_pathkeys
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT on (four, ten) four, ten FROM tenk1 WHERE four = 0;
+
+DROP INDEX tenk1_four_ten;
+
+CREATE INDEX tenk1_ten_four on tenk1 (ten, four);
+
+SELECT DISTINCT four FROM tenk1 WHERE ten = 2;
+SELECT DISTINCT on (four, ten) four, ten FROM tenk1 WHERE ten = 2;
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE ten = 2;
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT on (four, ten) four, ten FROM tenk1 WHERE ten = 2;
+
+DROP INDEX tenk1_ten_four;
+
+-- check projection case
+SELECT DISTINCT four, four FROM tenk1 WHERE ten = 2;
+SELECT DISTINCT four, 1 FROM tenk1 WHERE ten = 2;
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1;
+
+BEGIN;
+DECLARE c SCROLL CURSOR FOR SELECT DISTINCT four FROM tenk1;
+
+FETCH FROM c;
+FETCH BACKWARD FROM c;
+
+FETCH 5 FROM c;
+FETCH BACKWARD 5 FROM c;
+
+FETCH 5 FROM c;
+FETCH BACKWARD 5 FROM c;
+
+END;
-- 
2.21.0

Re: Index Skip Scan

Reply via email to