Re: MDAM techniques and Index Skip Scan patch

Dmitry Dolgov Tue, 22 Mar 2022 13:01:17 -0700

> On Mon, Mar 21, 2022 at 06:34:09PM -0700, Andres Freund wrote:
>
> > I don't mind having all of the alternatives under the same CF item, only
> > one being tested seems to be only a small limitation of cfbot.
>
> IMO it's pretty clear that having "duelling" patches below one CF entry is a
> bad idea. I think they should be split, with inactive approaches marked as
> returned with feeback or whatnot.


On the other hand even for patches with dependencies (i.e. the patch A
depends on the patch B) different CF items cause a lot of confusion for
reviewers. I guess for various flavours of the same patch it would be
even worse. But I don't have a strong opinion here.

> Either way, currently this patch fails on cfbot due to a new GUC:
> https://api.cirrus-ci.com/v1/artifact/task/5134905372835840/log/src/test/recovery/tmp_check/regression.diffs
> https://cirrus-ci.com/task/5134905372835840

This seems to be easy to solve.

>From 5bae9fdf8b74e5996b606e78f8b2a5fb327e011b Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <[email protected]>
Date: Mon, 17 May 2021 11:47:07 +0200
Subject: [PATCH v41 1/6] Unique expressions

Extend PlannerInfo and Path structures with the list of relevant unique
expressions. It specifies which keys must be unique on the query
level, and allows to leverage this into on the path level. At the moment
only distinctClause makes use of such mechanism, which enables potential
use of index skip scan.

Originally proposed by David Rowley, based on the UniqueKey patch
implementation from Andy Fan, contains few bits out of previous version
from Jesper Pedersen, Floris Van Nee.
---
 src/backend/nodes/list.c                | 31 +++++++++
 src/backend/optimizer/path/Makefile     |  3 +-
 src/backend/optimizer/path/pathkeys.c   | 62 +++++++++++++++++
 src/backend/optimizer/path/uniquekeys.c | 92 +++++++++++++++++++++++++
 src/backend/optimizer/plan/planner.c    | 36 +++++++++-
 src/backend/optimizer/util/pathnode.c   | 32 ++++++---
 src/include/nodes/pathnodes.h           |  5 ++
 src/include/nodes/pg_list.h             |  1 +
 src/include/optimizer/pathnode.h        |  1 +
 src/include/optimizer/paths.h           |  9 +++
 10 files changed, 261 insertions(+), 11 deletions(-)
 create mode 100644 src/backend/optimizer/path/uniquekeys.c

diff --git a/src/backend/nodes/list.c b/src/backend/nodes/list.c
index f843f861ef..a53a50f372 100644
--- a/src/backend/nodes/list.c
+++ b/src/backend/nodes/list.c
@@ -1653,3 +1653,34 @@ list_oid_cmp(const ListCell *p1, const ListCell *p2)
 		return 1;
 	return 0;
 }
+
+/*
+ * Return true iff every entry in "members" list is also present
+ * in the "target" list.
+ */
+bool
+list_is_subset(const List *members, const List *target)
+{
+	const ListCell	*lc1, *lc2;
+
+	Assert(IsPointerList(members));
+	Assert(IsPointerList(target));
+	check_list_invariants(members);
+	check_list_invariants(target);
+
+	foreach(lc1, members)
+	{
+		bool found = false;
+		foreach(lc2, target)
+		{
+			if (equal(lfirst(lc1), lfirst(lc2)))
+			{
+				found = true;
+				break;
+			}
+		}
+		if (!found)
+			return false;
+	}
+	return true;
+}
diff --git a/src/backend/optimizer/path/Makefile b/src/backend/optimizer/path/Makefile
index 1e199ff66f..7b9820c25f 100644
--- a/src/backend/optimizer/path/Makefile
+++ b/src/backend/optimizer/path/Makefile
@@ -21,6 +21,7 @@ OBJS = \
 	joinpath.o \
 	joinrels.o \
 	pathkeys.o \
-	tidpath.o
+	tidpath.o \
+	uniquekeys.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index 86a35cdef1..e2be1fbf90 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -29,6 +29,7 @@
 #include "utils/lsyscache.h"
 
 
+static bool pathkey_is_unique(PathKey *new_pathkey, List *pathkeys);
 static bool pathkey_is_redundant(PathKey *new_pathkey, List *pathkeys);
 static bool matches_boolean_partition_clause(RestrictInfo *rinfo,
 											 RelOptInfo *partrel,
@@ -96,6 +97,29 @@ make_canonical_pathkey(PlannerInfo *root,
 	return pk;
 }
 
+/*
+ * pathkey_is_unique
+ *		Checks if the new pathkey's equivalence class is the same as that of
+ *		any existing member of the pathkey list.
+ */
+static bool
+pathkey_is_unique(PathKey *new_pathkey, List *pathkeys)
+{
+	EquivalenceClass *new_ec = new_pathkey->pk_eclass;
+	ListCell   *lc;
+
+	/* If same EC already is already in the list, then not unique */
+	foreach(lc, pathkeys)
+	{
+		PathKey    *old_pathkey = (PathKey *) lfirst(lc);
+
+		if (new_ec == old_pathkey->pk_eclass)
+			return false;
+	}
+
+	return true;
+}
+
 /*
  * pathkey_is_redundant
  *	   Is a pathkey redundant with one already in the given list?
@@ -1152,6 +1176,44 @@ make_pathkeys_for_sortclauses(PlannerInfo *root,
 	return pathkeys;
 }
 
+/*
+ * make_pathkeys_for_uniquekeyclauses
+ *		Generate a pathkeys list to be used for uniquekey clauses
+ */
+List *
+make_pathkeys_for_uniquekeys(PlannerInfo *root,
+							 List *sortclauses,
+							 List *tlist)
+{
+	List	   *pathkeys = NIL;
+	ListCell   *l;
+
+	foreach(l, sortclauses)
+	{
+		SortGroupClause *sortcl = (SortGroupClause *) lfirst(l);
+		Expr	   *sortkey;
+		PathKey    *pathkey;
+
+		sortkey = (Expr *) get_sortgroupclause_expr(sortcl, tlist);
+		Assert(OidIsValid(sortcl->sortop));
+		pathkey = make_pathkey_from_sortop(root,
+										   sortkey,
+										   root->nullable_baserels,
+										   sortcl->sortop,
+										   sortcl->nulls_first,
+										   sortcl->tleSortGroupRef,
+										   true);
+
+		if (EC_MUST_BE_REDUNDANT(pathkey->pk_eclass))
+			continue;
+
+		if (pathkey_is_unique(pathkey, pathkeys))
+			pathkeys = lappend(pathkeys, pathkey);
+	}
+
+	return pathkeys;
+}
+
 /****************************************************************************
  *		PATHKEYS AND MERGECLAUSES
  ****************************************************************************/
diff --git a/src/backend/optimizer/path/uniquekeys.c b/src/backend/optimizer/path/uniquekeys.c
new file mode 100644
index 0000000000..d2525771e3
--- /dev/null
+++ b/src/backend/optimizer/path/uniquekeys.c
@@ -0,0 +1,92 @@
+/*-------------------------------------------------------------------------
+ *
+ * uniquekeys.c
+ *	  Utilities for matching and building unique keys
+ *
+ * Portions Copyright (c) 2020, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/optimizer/path/uniquekeys.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "optimizer/paths.h"
+
+/*
+ * build_uniquekeys
+ * 		Preparing list of pathkeys keys which are considered to be unique for
+ * 		this query.
+ *
+ * For now used only for distinct clauses, where redundant keys	need to be
+ * preserved e.g. for skip scan. Justification for this function existence is
+ * future plans to make it produce actual UniqueKey list. 
+ */
+List*
+build_uniquekeys(PlannerInfo *root, List *sortclauses)
+{
+	List *result = NIL;
+	List *sortkeys;
+	ListCell *l;
+	List *exprs = NIL;
+
+	sortkeys = make_pathkeys_for_uniquekeys(root,
+											sortclauses,
+											root->processed_tlist);
+
+	/* Create a uniquekey and add it to the list */
+	foreach(l, sortkeys)
+	{
+		PathKey    *pathkey = (PathKey *) lfirst(l);
+		EquivalenceClass *ec = pathkey->pk_eclass;
+		EquivalenceMember *mem = (EquivalenceMember*) lfirst(list_head(ec->ec_members));
+		exprs = lappend(exprs, mem->em_expr);
+	}
+
+	result = lappend(result, exprs);
+
+	return result;
+}
+
+/*
+ * query_has_uniquekeys_for
+ * 		Check if the specified unique keys matching all query level unique
+ * 		keys.
+ *
+ * The main use is to verify that unique keys for some path are covering all
+ * requested query unique keys. Based on this information a path could be
+ * rejected if it satisfy uniqueness only partially.
+ */
+bool
+query_has_uniquekeys_for(PlannerInfo *root, List *path_uniquekeys,
+						 bool allow_multinulls)
+{
+	ListCell *lc;
+	ListCell *lc2;
+
+	/* root->query_uniquekeys are the requested DISTINCT clauses on query level
+	 * path_uniquekeys are the unique keys on current path. All requested
+	 * query_uniquekeys must be satisfied by the path_uniquekeys.
+	 */
+	foreach(lc, root->query_uniquekeys)
+	{
+		List *query_ukey = lfirst_node(List, lc);
+		bool satisfied = false;
+		foreach(lc2, path_uniquekeys)
+		{
+			List *ukey = lfirst_node(List, lc2);
+			if (list_length(ukey) == 0 &&
+				list_length(query_ukey) != 0)
+				continue;
+			if (list_is_subset(ukey, query_ukey))
+			{
+				satisfied = true;
+				break;
+			}
+		}
+		if (!satisfied)
+			return false;
+	}
+	return true;
+}
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index bd09f85aea..b67bff8ccc 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -3102,12 +3102,18 @@ standard_qp_callback(PlannerInfo *root, void *extra)
 
 	if (parse->distinctClause &&
 		grouping_is_sortable(parse->distinctClause))
+	{
 		root->distinct_pathkeys =
 			make_pathkeys_for_sortclauses(root,
 										  parse->distinctClause,
 										  tlist);
+		root->query_uniquekeys = build_uniquekeys(root, parse->distinctClause);
+	}
 	else
+	{
 		root->distinct_pathkeys = NIL;
+		root->query_uniquekeys = NIL;
+	}
 
 	root->sort_pathkeys =
 		make_pathkeys_for_sortclauses(root,
@@ -4493,13 +4499,19 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
 			Path	   *path = (Path *) lfirst(lc);
 
 			if (pathkeys_contained_in(needed_pathkeys, path->pathkeys))
-			{
 				add_path(distinct_rel, (Path *)
 						 create_upper_unique_path(root, distinct_rel,
 												  path,
 												  list_length(root->distinct_pathkeys),
 												  numDistinctRows));
-			}
+		}
+
+		foreach(lc, input_rel->unique_pathlist)
+		{
+			Path	   *path = (Path *) lfirst(lc);
+
+			if (query_has_uniquekeys_for(root, path->uniquekeys, false))
+				add_path(distinct_rel, path);
 		}
 
 		/* For explicit-sort case, always use the more rigorous clause */
@@ -7109,6 +7121,26 @@ apply_scanjoin_target_to_paths(PlannerInfo *root,
 		}
 	}
 
+	foreach(lc, rel->unique_pathlist)
+	{
+		Path	   *subpath = (Path *) lfirst(lc);
+
+		/* Shouldn't have any parameterized paths anymore */
+		Assert(subpath->param_info == NULL);
+
+		if (tlist_same_exprs)
+			subpath->pathtarget->sortgrouprefs =
+				scanjoin_target->sortgrouprefs;
+		else
+		{
+			Path	   *newpath;
+
+			newpath = (Path *) create_projection_path(root, rel, subpath,
+													  scanjoin_target);
+			lfirst(lc) = newpath;
+		}
+	}
+
 	/*
 	 * Now, if final scan/join target contains SRFs, insert ProjectSetPath(s)
 	 * atop each existing path.  (Note that this function doesn't look at the
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 5c32c96b71..abb77d867e 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -416,10 +416,10 @@ set_cheapest(RelOptInfo *parent_rel)
  * 'parent_rel' is the relation entry to which the path corresponds.
  * 'new_path' is a potential path for parent_rel.
  *
- * Returns nothing, but modifies parent_rel->pathlist.
+ * Returns modified pathlist.
  */
-void
-add_path(RelOptInfo *parent_rel, Path *new_path)
+static List *
+add_path_to(RelOptInfo *parent_rel, List *pathlist, Path *new_path)
 {
 	bool		accept_new = true;	/* unless we find a superior old path */
 	int			insert_at = 0;	/* where to insert new item */
@@ -440,7 +440,7 @@ add_path(RelOptInfo *parent_rel, Path *new_path)
 	 * for more than one old path to be tossed out because new_path dominates
 	 * it.
 	 */
-	foreach(p1, parent_rel->pathlist)
+	foreach(p1, pathlist)
 	{
 		Path	   *old_path = (Path *) lfirst(p1);
 		bool		remove_old = false; /* unless new proves superior */
@@ -584,8 +584,7 @@ add_path(RelOptInfo *parent_rel, Path *new_path)
 		 */
 		if (remove_old)
 		{
-			parent_rel->pathlist = foreach_delete_current(parent_rel->pathlist,
-														  p1);
+			pathlist = foreach_delete_current(pathlist, p1);
 
 			/*
 			 * Delete the data pointed-to by the deleted cell, if possible
@@ -612,8 +611,7 @@ add_path(RelOptInfo *parent_rel, Path *new_path)
 	if (accept_new)
 	{
 		/* Accept the new path: insert it at proper place in pathlist */
-		parent_rel->pathlist =
-			list_insert_nth(parent_rel->pathlist, insert_at, new_path);
+		pathlist = list_insert_nth(pathlist, insert_at, new_path);
 	}
 	else
 	{
@@ -621,6 +619,23 @@ add_path(RelOptInfo *parent_rel, Path *new_path)
 		if (!IsA(new_path, IndexPath))
 			pfree(new_path);
 	}
+
+	return pathlist;
+}
+
+void
+add_path(RelOptInfo *parent_rel, Path *new_path)
+{
+	parent_rel->pathlist = add_path_to(parent_rel,
+									   parent_rel->pathlist, new_path);
+}
+
+void
+add_unique_path(RelOptInfo *parent_rel, Path *new_path)
+{
+	parent_rel->unique_pathlist = add_path_to(parent_rel,
+											  parent_rel->unique_pathlist,
+											  new_path);
 }
 
 /*
@@ -2662,6 +2677,7 @@ create_projection_path(PlannerInfo *root,
 	pathnode->path.pathkeys = subpath->pathkeys;
 
 	pathnode->subpath = subpath;
+	pathnode->path.uniquekeys = subpath->uniquekeys;
 
 	/*
 	 * We might not need a separate Result node.  If the input plan node type
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 1f3845b3fe..056b13826a 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -293,6 +293,7 @@ struct PlannerInfo
 
 	List	   *query_pathkeys; /* desired pathkeys for query_planner() */
 
+	List	   *query_uniquekeys; /* unique keys required for the query */
 	List	   *group_pathkeys; /* groupClause pathkeys, if any */
 	List	   *window_pathkeys;	/* pathkeys of bottom window, if any */
 	List	   *distinct_pathkeys;	/* distinctClause pathkeys, if any */
@@ -695,6 +696,7 @@ typedef struct RelOptInfo
 	List	   *pathlist;		/* Path structures */
 	List	   *ppilist;		/* ParamPathInfos used in pathlist */
 	List	   *partial_pathlist;	/* partial Paths */
+	List       *unique_pathlist;    /* unique Paths */
 	struct Path *cheapest_startup_path;
 	struct Path *cheapest_total_path;
 	struct Path *cheapest_unique_path;
@@ -883,6 +885,7 @@ struct IndexOptInfo
 	bool		amsearchnulls;	/* can AM search for NULL/NOT NULL entries? */
 	bool		amhasgettuple;	/* does AM have amgettuple interface? */
 	bool		amhasgetbitmap; /* does AM have amgetbitmap interface? */
+	bool		amcanskip;		/* can AM skip duplicate values? */
 	bool		amcanparallel;	/* does AM support parallel scan? */
 	bool		amcanmarkpos;	/* does AM support mark/restore? */
 	/* Rather than include amapi.h here, we declare amcostestimate like this */
@@ -1196,6 +1199,8 @@ typedef struct Path
 
 	List	   *pathkeys;		/* sort ordering of path's output */
 	/* pathkeys is a List of PathKey nodes; see above */
+
+	List	   *uniquekeys;	/* the unique keys, or NIL if none */
 } Path;
 
 /* Macro for extracting a path's parameterization relids; beware double eval */
diff --git a/src/include/nodes/pg_list.h b/src/include/nodes/pg_list.h
index 2cb9d1371d..4ac871fd16 100644
--- a/src/include/nodes/pg_list.h
+++ b/src/include/nodes/pg_list.h
@@ -567,6 +567,7 @@ extern pg_nodiscard List *list_delete_last(List *list);
 extern pg_nodiscard List *list_delete_first_n(List *list, int n);
 extern pg_nodiscard List *list_delete_nth_cell(List *list, int n);
 extern pg_nodiscard List *list_delete_cell(List *list, ListCell *cell);
+extern bool list_is_subset(const List *members, const List *target);
 
 extern List *list_union(const List *list1, const List *list2);
 extern List *list_union_ptr(const List *list1, const List *list2);
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index 620eeda2d6..bb6d730e93 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -27,6 +27,7 @@ extern int	compare_fractional_path_costs(Path *path1, Path *path2,
 										  double fraction);
 extern void set_cheapest(RelOptInfo *parent_rel);
 extern void add_path(RelOptInfo *parent_rel, Path *new_path);
+extern void add_unique_path(RelOptInfo *parent_rel, Path *new_path);
 extern bool add_path_precheck(RelOptInfo *parent_rel,
 							  Cost startup_cost, Cost total_cost,
 							  List *pathkeys, Relids required_outer);
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index 0c3a0b90c8..3dfa21adad 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -229,6 +229,9 @@ extern List *build_join_pathkeys(PlannerInfo *root,
 extern List *make_pathkeys_for_sortclauses(PlannerInfo *root,
 										   List *sortclauses,
 										   List *tlist);
+extern List *make_pathkeys_for_uniquekeys(PlannerInfo *root,
+										  List *sortclauses,
+										  List *tlist);
 extern void initialize_mergeclause_eclasses(PlannerInfo *root,
 											RestrictInfo *restrictinfo);
 extern void update_mergeclause_eclasses(PlannerInfo *root,
@@ -255,4 +258,10 @@ extern PathKey *make_canonical_pathkey(PlannerInfo *root,
 extern void add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
 									List *live_childrels);
 
+extern bool query_has_uniquekeys_for(PlannerInfo *root,
+									 List *exprs,
+									 bool allow_multinulls);
+
+extern List *build_uniquekeys(PlannerInfo *root, List *sortclauses);
+
 #endif							/* PATHS_H */
-- 
2.32.0

>From 1f61de293ad1eef7e91971c4c26aab031ae205c0 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <[email protected]>
Date: Sat, 8 Jan 2022 17:16:49 +0100
Subject: [PATCH v41 2/6] Index skip scan

Allow IndexOnlyScan to skip duplicated tuples based on search key prefix
(a trick also known as Index Skip Scan or Loose Index Scan, see in the
wiki [1]). The idea is to avoid scanning all equal values of a key, as
soon as a new value is found, restart the search by looking for a larger
value. This approach is much faster when the index has many equal keys.

Implemented via equipping IndexPath with indexskipprefix field and
creating an extra IndexPath with such prefix if suitable unique
expressions are present. On the execution size a new index am function
amskip is introduced to provide index specific implementation for such
skipping. To simplify potential amskip implementations,
ExecSupportsBackwardScan now returns false in case if index skip scan is
used, otherwise amskip has to deal with scroll cursor and be prepared to
handle different advance/read directions. ExecSupportsBackwardScan may
seem to have too big scope, but looks like now it used only together
with cursorOptions checks for CURSOR_OPT_SCROLL.

Original patch and design were proposed by Thomas Munro [2], revived and
improved by Dmitry Dolgov and Jesper Pedersen.

[1] https://wiki.postgresql.org/wiki/Loose_indexscan
[2] https://www.postgresql.org/message-id/flat/CADLWmXXbTSBxP-MzJuPAYSsL_2f0iPm5VWPbCvDbVvfX93FKkw%40mail.gmail.com

Author: Jesper Pedersen, Dmitry Dolgov
Reviewed-by: Thomas Munro, David Rowley, Floris Van Nee, Kyotaro Horiguchi, Tomas Vondra, Peter Geoghegan
---
 contrib/bloom/blutils.c                       |   1 +
 doc/src/sgml/config.sgml                      |  15 ++
 doc/src/sgml/indexam.sgml                     |  43 ++++++
 doc/src/sgml/indices.sgml                     |  23 +++
 src/backend/access/brin/brin.c                |   1 +
 src/backend/access/gin/ginutil.c              |   1 +
 src/backend/access/gist/gist.c                |   1 +
 src/backend/access/hash/hash.c                |   1 +
 src/backend/access/index/indexam.c            |  16 ++
 src/backend/access/spgist/spgutils.c          |   1 +
 src/backend/commands/explain.c                |  23 +++
 src/backend/executor/execAmi.c                |  32 +++-
 src/backend/executor/nodeIndexonlyscan.c      |  47 +++++-
 src/backend/nodes/copyfuncs.c                 |   1 +
 src/backend/nodes/outfuncs.c                  |   1 +
 src/backend/nodes/readfuncs.c                 |   1 +
 src/backend/optimizer/path/costsize.c         |   1 +
 src/backend/optimizer/path/indxpath.c         | 140 +++++++++++++++++-
 src/backend/optimizer/path/pathkeys.c         |  54 ++++++-
 src/backend/optimizer/plan/createplan.c       |  10 +-
 src/backend/optimizer/util/pathnode.c         |  37 +++++
 src/backend/optimizer/util/plancat.c          |   1 +
 src/backend/utils/misc/guc.c                  |  10 ++
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/access/amapi.h                    |   6 +
 src/include/access/genam.h                    |   1 +
 src/include/access/sdir.h                     |   7 +
 src/include/nodes/execnodes.h                 |   2 +
 src/include/nodes/pathnodes.h                 |   4 +
 src/include/nodes/plannodes.h                 |   2 +
 src/include/optimizer/cost.h                  |   1 +
 src/include/optimizer/pathnode.h              |   4 +
 src/include/optimizer/paths.h                 |   5 +-
 src/test/regress/expected/sysviews.out        |   3 +-
 34 files changed, 478 insertions(+), 19 deletions(-)

diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c
index a434cf93ef..3b312c039d 100644
--- a/contrib/bloom/blutils.c
+++ b/contrib/bloom/blutils.c
@@ -134,6 +134,7 @@ blhandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = blbulkdelete;
 	amroutine->amvacuumcleanup = blvacuumcleanup;
 	amroutine->amcanreturn = NULL;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = blcostestimate;
 	amroutine->amoptions = bloptions;
 	amroutine->amproperty = NULL;
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 7a48973b3c..e43295861f 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -5035,6 +5035,21 @@ ANY <replaceable class="parameter">num_sync</replaceable> ( <replaceable class="
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-enable-indexskipscan" xreflabel="enable_indexskipscan">
+      <term><varname>enable_indexskipscan</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>enable_indexskipscan</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Enables or disables the query planner's use of index-skip-scan plan
+        types (see <xref linkend="indexes-index-skip-scans"/>). The default is
+        <literal>on</literal>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-enable-material" xreflabel="enable_material">
       <term><varname>enable_material</varname> (<type>boolean</type>)
       <indexterm>
diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index d4163c96e9..31081d0f8d 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -153,6 +153,7 @@ typedef struct IndexAmRoutine
     amendscan_function amendscan;
     ammarkpos_function ammarkpos;       /* can be NULL */
     amrestrpos_function amrestrpos;     /* can be NULL */
+    amskip_function amskip;             /* can be NULL */
 
     /* interface functions to support parallel index scans */
     amestimateparallelscan_function amestimateparallelscan;    /* can be NULL */
@@ -779,6 +780,48 @@ amrestrpos (IndexScanDesc scan);
 
   <para>
 <programlisting>
+bool
+amskip (IndexScanDesc scan,
+        ScanDirection direction,
+        int prefix);
+</programlisting>
+  Skip past all tuples where the first 'prefix' columns have the same value as
+  the last tuple returned in the current scan. The arguments are:
+
+   <variablelist>
+    <varlistentry>
+     <term><parameter>scan</parameter></term>
+     <listitem>
+      <para>
+       Index scan information
+      </para>
+     </listitem>
+    </varlistentry>
+
+    <varlistentry>
+     <term><parameter>direction</parameter></term>
+     <listitem>
+      <para>
+       The direction in which data is advancing.
+      </para>
+     </listitem>
+    </varlistentry>
+
+    <varlistentry>
+     <term><parameter>prefix</parameter></term>
+     <listitem>
+      <para>
+        Distinct prefix size.
+      </para>
+     </listitem>
+    </varlistentry>
+
+   </variablelist>
+
+  </para>
+
+  <para>
+<programlisting>
 Size
 amestimateparallelscan (void);
 </programlisting>
diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml
index 023157d888..ab9595d37f 100644
--- a/doc/src/sgml/indices.sgml
+++ b/doc/src/sgml/indices.sgml
@@ -1297,6 +1297,29 @@ SELECT target FROM tests WHERE subject = 'some-subject' AND success;
    and later will recognize such cases and allow index-only scans to be
    generated, but older versions will not.
   </para>
+
+  <sect2 id="indexes-index-skip-scans">
+    <title>Index Skip Scans</title>
+
+    <indexterm zone="indexes-index-skip-scans">
+      <primary>index</primary>
+      <secondary>index-skip scans</secondary>
+    </indexterm>
+    <indexterm zone="indexes-index-skip-scans">
+      <primary>index-skip scan</primary>
+    </indexterm>
+
+    <para>
+     When the rows retrieved from an index scan are then deduplicated by
+     eliminating rows matching on a prefix of index keys (e.g. when using
+     <literal>SELECT DISTINCT</literal>), the planner will consider
+     skipping groups of rows with a matching key prefix. When a row with
+     a particular prefix is found, remaining rows with the same key prefix
+     are skipped.  The larger the number of rows with the same key prefix
+     rows (i.e. the lower the number of distinct key prefixes in the index),
+     the more efficient this is.
+    </para>
+  </sect2>
  </sect1>
 
 
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 4366010768..0d04b299d3 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -119,6 +119,7 @@ brinhandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = brinbulkdelete;
 	amroutine->amvacuumcleanup = brinvacuumcleanup;
 	amroutine->amcanreturn = NULL;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = brincostestimate;
 	amroutine->amoptions = brinoptions;
 	amroutine->amproperty = NULL;
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index 3d15701a01..56292eb822 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -67,6 +67,7 @@ ginhandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = ginbulkdelete;
 	amroutine->amvacuumcleanup = ginvacuumcleanup;
 	amroutine->amcanreturn = NULL;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = gincostestimate;
 	amroutine->amoptions = ginoptions;
 	amroutine->amproperty = NULL;
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 8c6c744ab7..16a45f05bc 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -89,6 +89,7 @@ gisthandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = gistbulkdelete;
 	amroutine->amvacuumcleanup = gistvacuumcleanup;
 	amroutine->amcanreturn = gistcanreturn;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = gistcostestimate;
 	amroutine->amoptions = gistoptions;
 	amroutine->amproperty = gistproperty;
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index a259a301fa..41e5e9b594 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -86,6 +86,7 @@ hashhandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = hashbulkdelete;
 	amroutine->amvacuumcleanup = hashvacuumcleanup;
 	amroutine->amcanreturn = NULL;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = hashcostestimate;
 	amroutine->amoptions = hashoptions;
 	amroutine->amproperty = NULL;
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index fe80b8b0ba..bcf7c73467 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -33,6 +33,7 @@
  *		index_can_return	- does index support index-only scans?
  *		index_getprocid - get a support procedure OID
  *		index_getprocinfo - get a support procedure's lookup info
+ *		index_skip		- advance past duplicate key values in a scan
  *
  * NOTES
  *		This file contains the index_ routines which used
@@ -739,6 +740,21 @@ index_can_return(Relation indexRelation, int attno)
 	return indexRelation->rd_indam->amcanreturn(indexRelation, attno);
 }
 
+/* ----------------
+ *		index_skip
+ *
+ *		Skip past all tuples where the first 'prefix' columns have the
+ *		same value as the last tuple returned in the current scan.
+ * ----------------
+ */
+bool
+index_skip(IndexScanDesc scan, ScanDirection direction, int prefix)
+{
+	SCAN_CHECKS;
+
+	return scan->indexRelation->rd_indam->amskip(scan, direction, prefix);
+}
+
 /* ----------------
  *		index_getprocid
  *
diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c
index 1ae7492216..0b0dfa278f 100644
--- a/src/backend/access/spgist/spgutils.c
+++ b/src/backend/access/spgist/spgutils.c
@@ -73,6 +73,7 @@ spghandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = spgbulkdelete;
 	amroutine->amvacuumcleanup = spgvacuumcleanup;
 	amroutine->amcanreturn = spgcanreturn;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = spgcostestimate;
 	amroutine->amoptions = spgoptions;
 	amroutine->amproperty = spgproperty;
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 9f632285b6..4f5bd1d678 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -152,6 +152,7 @@ static void ExplainXMLTag(const char *tagname, int flags, ExplainState *es);
 static void ExplainIndentText(ExplainState *es);
 static void ExplainJSONLineEnding(ExplainState *es);
 static void ExplainYAMLLineStarting(ExplainState *es);
+static void ExplainIndexSkipScanKeys(int skipPrefixSize, ExplainState *es);
 static void escape_yaml(StringInfo buf, const char *str);
 
 
@@ -1120,6 +1121,21 @@ ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used)
 	return planstate_tree_walker(planstate, ExplainPreScanNode, rels_used);
 }
 
+/*
+ * ExplainIndexSkipScanKeys -
+ *	  Append information about index skip scan to es->str.
+ *
+ * Can be used to print the skip prefix size.
+ */
+static void
+ExplainIndexSkipScanKeys(int skipPrefixSize, ExplainState *es)
+{
+	if (skipPrefixSize > 0)
+	{
+		ExplainPropertyInteger("Distinct Prefix", NULL, skipPrefixSize, es);
+	}
+}
+
 /*
  * ExplainNode -
  *	  Appends a description of a plan tree to es->str
@@ -1477,6 +1493,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			{
 				IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan;
 
+
 				ExplainIndexScanDetails(indexonlyscan->indexid,
 										indexonlyscan->indexorderdir,
 										es);
@@ -1750,6 +1767,12 @@ ExplainNode(PlanState *planstate, List *ancestors,
 										   planstate, es);
 			break;
 		case T_IndexOnlyScan:
+			if (((IndexOnlyScan *) plan)->indexskipprefixsize > 0)
+			{
+				IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan;
+				ExplainPropertyBool("Skip scan", true, es);
+				ExplainIndexSkipScanKeys(indexonlyscan->indexskipprefixsize, es);
+			}
 			show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
 						   "Index Cond", planstate, ancestors, es);
 			if (((IndexOnlyScan *) plan)->recheckqual)
diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c
index b6245994f0..ced8933f44 100644
--- a/src/backend/executor/execAmi.c
+++ b/src/backend/executor/execAmi.c
@@ -64,7 +64,7 @@
 #include "utils/rel.h"
 #include "utils/syscache.h"
 
-static bool IndexSupportsBackwardScan(Oid indexid);
+static bool IndexSupportsBackwardScan(Plan *node);
 
 
 /*
@@ -555,10 +555,8 @@ ExecSupportsBackwardScan(Plan *node)
 			return false;
 
 		case T_IndexScan:
-			return IndexSupportsBackwardScan(((IndexScan *) node)->indexid);
-
 		case T_IndexOnlyScan:
-			return IndexSupportsBackwardScan(((IndexOnlyScan *) node)->indexid);
+			return IndexSupportsBackwardScan(node);
 
 		case T_SubqueryScan:
 			return ExecSupportsBackwardScan(((SubqueryScan *) node)->subplan);
@@ -598,16 +596,38 @@ ExecSupportsBackwardScan(Plan *node)
 
 /*
  * An IndexScan or IndexOnlyScan node supports backward scan only if the
- * index's AM does.
+ * index's AM does and no skip scan is used.
  */
 static bool
-IndexSupportsBackwardScan(Oid indexid)
+IndexSupportsBackwardScan(Plan *node)
 {
 	bool		result;
+	Oid			indexid = InvalidOid;
+	int         skip_prefix_size = 0;
 	HeapTuple	ht_idxrel;
 	Form_pg_class idxrelrec;
 	IndexAmRoutine *amroutine;
 
+	Assert(IsA(node, IndexScan) || IsA(node, IndexOnlyScan));
+	switch(nodeTag(node))
+	{
+		case T_IndexScan:
+			indexid = ((IndexScan *) node)->indexid;
+			break;
+
+		case T_IndexOnlyScan:
+			indexid = ((IndexOnlyScan *) node)->indexid;
+			skip_prefix_size = ((IndexOnlyScan *) node)->indexskipprefixsize;
+			break;
+
+		default:
+			elog(DEBUG2, "unrecognized node type: %d", (int) nodeTag(node));
+			break;
+	}
+
+	if (skip_prefix_size > 0)
+		return false;
+
 	/* Fetch the pg_class tuple of the index relation */
 	ht_idxrel = SearchSysCache1(RELOID, ObjectIdGetDatum(indexid));
 	if (!HeapTupleIsValid(ht_idxrel))
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index eb3ddd2943..40ad1b949b 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -41,6 +41,7 @@
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
 #include "storage/predicate.h"
+#include "storage/itemptr.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
 
@@ -62,9 +63,17 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	EState	   *estate;
 	ExprContext *econtext;
 	ScanDirection direction;
+	ScanDirection readDirection;
 	IndexScanDesc scandesc;
 	TupleTableSlot *slot;
-	ItemPointer tid;
+	ItemPointer tid = NULL;
+	IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) node->ss.ps.plan;
+
+	/*
+	 * Tells if the current position was reached via skipping. In this case
+	 * there is no nead for the index_getnext_tid
+	 */
+	bool skipped = false;
 
 	/*
 	 * extract necessary information from index scan node
@@ -72,7 +81,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	estate = node->ss.ps.state;
 	direction = estate->es_direction;
 	/* flip direction if this is an overall backward scan */
-	if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir))
+	if (ScanDirectionIsBackward(indexonlyscan->indexorderdir))
 	{
 		if (ScanDirectionIsForward(direction))
 			direction = BackwardScanDirection;
@@ -115,15 +124,43 @@ IndexOnlyNext(IndexOnlyScanState *node)
 						 node->ioss_NumOrderByKeys);
 	}
 
+	/*
+	 * Check if we need to skip to the next key prefix.
+	 */
+	if (node->ioss_SkipPrefixSize > 0 && node->ioss_FirstTupleEmitted)
+	{
+		if (!index_skip(scandesc, direction, node->ioss_SkipPrefixSize))
+		{
+			/*
+			 * Reached end of index. At this point currPos is invalidated, and
+			 * we need to reset ioss_FirstTupleEmitted, since otherwise after
+			 * going backwards, reaching the end of index, and going forward
+			 * again we apply skip again. It would be incorrect and lead to an
+			 * extra skipped item.
+			 */
+			node->ioss_FirstTupleEmitted = false;
+			return ExecClearTuple(slot);
+		}
+		else
+		{
+			skipped = true;
+			tid = &scandesc->xs_heaptid;
+		}
+	}
+
+	readDirection = skipped ? indexonlyscan->indexorderdir : direction;
+
 	/*
 	 * OK, now that we have what we need, fetch the next tuple.
 	 */
-	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
+	while (skipped || (tid = index_getnext_tid(scandesc, readDirection)) != NULL)
 	{
 		bool		tuple_from_heap = false;
 
 		CHECK_FOR_INTERRUPTS();
 
+		skipped = false;
+
 		/*
 		 * We can skip the heap fetch if the TID references a heap page on
 		 * which all tuples are known visible to everybody.  In any case,
@@ -248,6 +285,8 @@ IndexOnlyNext(IndexOnlyScanState *node)
 							  ItemPointerGetBlockNumber(tid),
 							  estate->es_snapshot);
 
+		node->ioss_FirstTupleEmitted = true;
+
 		return slot;
 	}
 
@@ -502,6 +541,8 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
 	indexstate->ss.ps.plan = (Plan *) node;
 	indexstate->ss.ps.state = estate;
 	indexstate->ss.ps.ExecProcNode = ExecIndexOnlyScan;
+	indexstate->ioss_SkipPrefixSize = node->indexskipprefixsize;
+	indexstate->ioss_FirstTupleEmitted = false;
 
 	/*
 	 * Miscellaneous initialization
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index d4f8455a2b..fe0d92ad46 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -523,6 +523,7 @@ _copyIndexOnlyScan(const IndexOnlyScan *from)
 	COPY_NODE_FIELD(indexorderby);
 	COPY_NODE_FIELD(indextlist);
 	COPY_SCALAR_FIELD(indexorderdir);
+	COPY_SCALAR_FIELD(indexskipprefixsize);
 
 	return newnode;
 }
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 6bdad462c7..06eb1a89e9 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -584,6 +584,7 @@ _outIndexOnlyScan(StringInfo str, const IndexOnlyScan *node)
 	WRITE_NODE_FIELD(indexorderby);
 	WRITE_NODE_FIELD(indextlist);
 	WRITE_ENUM_FIELD(indexorderdir, ScanDirection);
+	WRITE_INT_FIELD(indexskipprefixsize);
 }
 
 static void
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index 3f68f7c18d..169fb408d3 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -1888,6 +1888,7 @@ _readIndexOnlyScan(void)
 	READ_NODE_FIELD(indexorderby);
 	READ_NODE_FIELD(indextlist);
 	READ_ENUM_FIELD(indexorderdir, ScanDirection);
+	READ_INT_FIELD(indexskipprefixsize);
 
 	READ_DONE();
 }
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 4d9f3b4bb6..6c42ae2121 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -133,6 +133,7 @@ int			max_parallel_workers_per_gather = 2;
 bool		enable_seqscan = true;
 bool		enable_indexscan = true;
 bool		enable_indexonlyscan = true;
+bool		enable_indexskipscan = true;
 bool		enable_bitmapscan = true;
 bool		enable_tidscan = true;
 bool		enable_sort = true;
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index 0ef70ad7f1..00526f3476 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -784,6 +784,16 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel,
 	{
 		IndexPath  *ipath = (IndexPath *) lfirst(lc);
 
+		/*
+		 * To prevent unique paths from index skip scans being potentially used
+		 * when not needed scan keep them in a separate pathlist.
+		*/
+		if (ipath->indexskipprefix != 0)
+		{
+			add_unique_path(rel, (Path *) ipath);
+			continue;
+		}
+
 		if (index->amhasgettuple)
 			add_path(rel, (Path *) ipath);
 
@@ -866,12 +876,15 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 	double		loop_count;
 	List	   *orderbyclauses;
 	List	   *orderbyclausecols;
-	List	   *index_pathkeys;
+	List	   *index_pathkeys = NIL;
 	List	   *useful_pathkeys;
+	List	   *index_pathkeys_pos = NIL;
 	bool		found_lower_saop_clause;
 	bool		pathkeys_possibly_useful;
 	bool		index_is_ordered;
 	bool		index_only_scan;
+	bool		not_empty_qual = false;
+	bool		can_skip;
 	int			indexcol;
 
 	/*
@@ -989,7 +1002,8 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 	if (index_is_ordered && pathkeys_possibly_useful)
 	{
 		index_pathkeys = build_index_pathkeys(root, index,
-											  ForwardScanDirection);
+											  ForwardScanDirection,
+											  &index_pathkeys_pos);
 		useful_pathkeys = truncate_useless_pathkeys(root, rel,
 													index_pathkeys);
 		orderbyclauses = NIL;
@@ -1021,6 +1035,72 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 	index_only_scan = (scantype != ST_BITMAPSCAN &&
 					   check_index_only(rel, index));
 
+	/* Check if an index skip scan is possible. */
+	can_skip = enable_indexskipscan & index->amcanskip & index_only_scan;
+
+	if (can_skip)
+	{
+		/*
+		 * Skip scan is not supported when there are qual conditions, which are
+		 * not covered by index. The reason for that is that those conditions
+		 * are evaluated later, already after skipping was applied.
+		 *
+		 * TODO: This implementation is too restrictive, and doesn't allow e.g.
+		 * index expressions. For that we need to examine index_clauses too.
+		 */
+		if (root->parse->jointree != NULL)
+		{
+			ListCell *lc;
+
+			foreach(lc, (List *) root->parse->jointree->quals)
+			{
+				Node *expr, *qual = (Node *) lfirst(lc);
+				OpExpr *expr_op;
+				Var *var;
+				bool found = false;
+
+				if (!is_opclause(qual))
+				{
+					not_empty_qual = true;
+					break;
+				}
+
+				expr = get_leftop(qual);
+				expr_op = (OpExpr *) qual;
+
+				if (!IsA(expr, Var))
+				{
+					not_empty_qual = true;
+					break;
+				}
+
+				var = (Var *) expr;
+
+				/*
+				 * Check if the qual operator is indexable by any columns of
+				 * the index, test collation and opfamily.
+				 */
+				for (int i = 0; i < index->ncolumns; i++)
+				{
+					if (index->indexkeys[i] == var->varattno &&
+						IndexCollMatchesExprColl(index->indexcollations[i],
+												 expr_op->inputcollid) &&
+						op_in_opfamily(expr_op->opno, index->opfamily[i]))
+					{
+						found = true;
+						break;
+					}
+				}
+
+				if (!found)
+				{
+					not_empty_qual = true;
+					break;
+				}
+			}
+		}
+	}
+
 	/*
 	 * 4. Generate an indexscan path if there are relevant restriction clauses
 	 * in the current clauses, OR the index ordering is potentially useful for
@@ -1044,6 +1124,33 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 								  false);
 		result = lappend(result, ipath);
 
+		/* Consider index skip scan as well */
+		if (root->query_uniquekeys != NULL && can_skip && !not_empty_qual)
+		{
+			int numusefulkeys = list_length(useful_pathkeys);
+			int numsortkeys = list_length(root->query_pathkeys);
+
+			if (numusefulkeys == numsortkeys)
+			{
+				int prefix;
+				if (list_length(root->distinct_pathkeys) > 0)
+					prefix = find_index_prefix_for_pathkey(index_pathkeys,
+														   index_pathkeys_pos,
+														   llast_node(PathKey,
+														   root->distinct_pathkeys));
+				else
+					/*
+					 * All distinct keys are constant and optimized away.
+					 * Skipping with 1 is sufficient.
+					 */
+					prefix = 1;
+
+				result = lappend(result,
+								 create_skipscan_unique_path(root, index,
+															 (Path *) ipath, prefix));
+			}
+		}
+
 		/*
 		 * If appropriate, consider parallel index scan.  We don't allow
 		 * parallel index scan for bitmap index scans.
@@ -1082,7 +1189,8 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 	if (index_is_ordered && pathkeys_possibly_useful)
 	{
 		index_pathkeys = build_index_pathkeys(root, index,
-											  BackwardScanDirection);
+											  BackwardScanDirection,
+											  &index_pathkeys_pos);
 		useful_pathkeys = truncate_useless_pathkeys(root, rel,
 													index_pathkeys);
 		if (useful_pathkeys != NIL)
@@ -1099,6 +1207,32 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 									  false);
 			result = lappend(result, ipath);
 
+			/* Consider index skip scan as well */
+			if (root->query_uniquekeys != NULL && can_skip && !not_empty_qual)
+			{
+				int numusefulkeys = list_length(useful_pathkeys);
+				int numsortkeys = list_length(root->query_pathkeys);
+
+				if (numusefulkeys == numsortkeys)
+				{
+					int prefix;
+					if (list_length(root->distinct_pathkeys) > 0)
+						prefix = find_index_prefix_for_pathkey(index_pathkeys,
+															   index_pathkeys_pos,
+															   llast_node(PathKey,
+															   root->distinct_pathkeys));
+					else
+						/* all are distinct keys are constant and optimized away.
+						 * skipping with 1 is sufficient as all are constant anyway
+						 */
+						prefix = 1;
+
+					result = lappend(result,
+									 create_skipscan_unique_path(root, index,
+																 (Path *) ipath, prefix));
+				}
+			}
+
 			/* If appropriate, consider parallel index scan */
 			if (index->amcanparallel &&
 				rel->consider_parallel && outer_relids == NULL &&
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index e2be1fbf90..cfdff4eee9 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -523,6 +523,47 @@ get_cheapest_parallel_safe_total_inner(List *paths)
  *		NEW PATHKEY FORMATION
  ****************************************************************************/
 
+/*
+ * Find the prefix size for a specific path key in an index. For example, an
+ * index with (a,b,c) finding path key b will return prefix 2. Optionally
+ * pathkeys_positions can be provided, to specify at which position in the
+ * original pathkey list this particular key could be found (this is helpful
+ * when we deal with redundant pathkeys).
+ *
+ * Returns 0 when not found.
+ */
+int
+find_index_prefix_for_pathkey(List *index_pathkeys,
+							  List *pathkeys_positions,
+							  PathKey *target_pathkey)
+{
+	ListCell   *lc;
+	int			i;
+
+	i = 0;
+	foreach(lc, index_pathkeys)
+	{
+		PathKey    *cpathkey = (PathKey *) lfirst(lc);
+
+		if (cpathkey == target_pathkey)
+		{
+			/*
+			 * Prefix expected to start from 1, increment positions since
+			 * they're 0 based.
+			 */
+			if (pathkeys_positions != NIL &&
+				pathkeys_positions->length > i)
+				return list_nth_int(pathkeys_positions, i) + 1;
+			else
+				return i + 1;
+		}
+
+		i++;
+	}
+
+	return 0;
+}
+
 /*
  * build_index_pathkeys
  *	  Build a pathkeys list that describes the ordering induced by an index
@@ -535,7 +576,9 @@ get_cheapest_parallel_safe_total_inner(List *paths)
  * We iterate only key columns of covering indexes, since non-key columns
  * don't influence index ordering.  The result is canonical, meaning that
  * redundant pathkeys are removed; it may therefore have fewer entries than
- * there are key columns in the index.
+ * there are key columns in the index. Since by removing redundant pathkeys the
+ * information about original position is lost, return it via positions
+ * argument.
  *
  * Another reason for stopping early is that we may be able to tell that
  * an index column's sort order is uninteresting for this query.  However,
@@ -546,7 +589,8 @@ get_cheapest_parallel_safe_total_inner(List *paths)
 List *
 build_index_pathkeys(PlannerInfo *root,
 					 IndexOptInfo *index,
-					 ScanDirection scandir)
+					 ScanDirection scandir,
+					 List **positions)
 {
 	List	   *retval = NIL;
 	ListCell   *lc;
@@ -555,6 +599,8 @@ build_index_pathkeys(PlannerInfo *root,
 	if (index->sortopfamily == NULL)
 		return NIL;				/* non-orderable index */
 
+	*positions = NIL;
+
 	i = 0;
 	foreach(lc, index->indextlist)
 	{
@@ -608,7 +654,11 @@ build_index_pathkeys(PlannerInfo *root,
 			 * for this query.  Add it to list, unless it's redundant.
 			 */
 			if (!pathkey_is_redundant(cpathkey, retval))
+			{
 				retval = lappend(retval, cpathkey);
+				*positions = lappend_int(*positions,
+										 foreach_current_index(lc));
+			}
 		}
 		else
 		{
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index fa069a217c..511dda3a9f 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -191,7 +191,8 @@ static IndexOnlyScan *make_indexonlyscan(List *qptlist, List *qpqual,
 										 List *indexqual, List *recheckqual,
 										 List *indexorderby,
 										 List *indextlist,
-										 ScanDirection indexscandir);
+										 ScanDirection indexscandir,
+										 int skipprefix);
 static BitmapIndexScan *make_bitmap_indexscan(Index scanrelid, Oid indexid,
 											  List *indexqual,
 											  List *indexqualorig);
@@ -3108,7 +3109,8 @@ create_indexscan_plan(PlannerInfo *root,
 												stripped_indexquals,
 												fixed_indexorderbys,
 												indexinfo->indextlist,
-												best_path->indexscandir);
+												best_path->indexscandir,
+												best_path->indexskipprefix);
 	else
 		scan_plan = (Scan *) make_indexscan(tlist,
 											qpqual,
@@ -5482,7 +5484,8 @@ make_indexonlyscan(List *qptlist,
 				   List *recheckqual,
 				   List *indexorderby,
 				   List *indextlist,
-				   ScanDirection indexscandir)
+				   ScanDirection indexscandir,
+				   int skipPrefixSize)
 {
 	IndexOnlyScan *node = makeNode(IndexOnlyScan);
 	Plan	   *plan = &node->scan.plan;
@@ -5498,6 +5501,7 @@ make_indexonlyscan(List *qptlist,
 	node->indexorderby = indexorderby;
 	node->indextlist = indextlist;
 	node->indexorderdir = indexscandir;
+	node->indexskipprefixsize = skipPrefixSize;
 
 	return node;
 }
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index abb77d867e..0c61795389 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -3095,6 +3095,43 @@ create_upper_unique_path(PlannerInfo *root,
 	return pathnode;
 }
 
+/*
+ * create_skipscan_unique_path
+ *	  Creates a pathnode the same as an existing IndexPath except based on
+ *	  skipping duplicate values.  This may or may not be cheaper than using
+ *	  create_upper_unique_path.
+ *
+ * The input path must be an IndexPath for an index that supports amskip.
+ */
+IndexPath *
+create_skipscan_unique_path(PlannerInfo *root, IndexOptInfo *index,
+							Path *basepath, int prefix)
+{
+	IndexPath 	*pathnode = makeNode(IndexPath);
+	int 		numDistinctRows;
+	List 		*uniqExprs;
+
+	Assert(IsA(basepath, IndexPath));
+
+	/* We don't want to modify basepath, so make a copy. */
+	memcpy(pathnode, basepath, sizeof(IndexPath));
+
+	uniqExprs = linitial_node(List, root->query_uniquekeys);
+
+	Assert(prefix > 0);
+	pathnode->indexskipprefix = prefix;
+	pathnode->path.uniquekeys = root->query_uniquekeys;
+
+	numDistinctRows = estimate_num_groups(root, uniqExprs,
+										  pathnode->path.rows,
+										  NULL, NULL);
+
+	pathnode->path.total_cost = pathnode->path.startup_cost * numDistinctRows;
+	pathnode->path.rows = numDistinctRows;
+
+	return pathnode;
+}
+
 /*
  * create_agg_path
  *	  Creates a pathnode that represents performing aggregation/grouping
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index a5002ad895..1e6fb0c543 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -272,6 +272,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			info->amoptionalkey = amroutine->amoptionalkey;
 			info->amsearcharray = amroutine->amsearcharray;
 			info->amsearchnulls = amroutine->amsearchnulls;
+			info->amcanskip = (amroutine->amskip != NULL);
 			info->amcanparallel = amroutine->amcanparallel;
 			info->amhasgettuple = (amroutine->amgettuple != NULL);
 			info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 932aefc777..6410c6ede7 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1010,6 +1010,16 @@ static struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_indexskipscan", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enables the planner's use of index-skip-scan plans."),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&enable_indexskipscan,
+		true,
+		NULL, NULL, NULL
+	},
 	{
 		{"enable_bitmapscan", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of bitmap-scan plans."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 4cf5b26a36..17ff364a7a 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -370,6 +370,7 @@
 #enable_incremental_sort = on
 #enable_indexscan = on
 #enable_indexonlyscan = on
+#enable_indexskipscan = on
 #enable_material = on
 #enable_memoize = on
 #enable_mergejoin = on
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index a382551a98..cb2f48a1bc 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -173,6 +173,11 @@ typedef void (*amrescan_function) (IndexScanDesc scan,
 typedef bool (*amgettuple_function) (IndexScanDesc scan,
 									 ScanDirection direction);
 
+/* skip past duplicates in a given prefix */
+typedef bool (*amskip_function) (IndexScanDesc scan,
+								 ScanDirection dir,
+								 int prefix);
+
 /* fetch all valid tuples */
 typedef int64 (*amgetbitmap_function) (IndexScanDesc scan,
 									   TIDBitmap *tbm);
@@ -277,6 +282,7 @@ typedef struct IndexAmRoutine
 	amendscan_function amendscan;
 	ammarkpos_function ammarkpos;	/* can be NULL */
 	amrestrpos_function amrestrpos; /* can be NULL */
+	amskip_function amskip;				/* can be NULL */
 
 	/* interface functions to support parallel index scans */
 	amestimateparallelscan_function amestimateparallelscan; /* can be NULL */
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 134b20f1e6..d13d95c458 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -183,6 +183,7 @@ extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info,
 extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info,
 												   IndexBulkDeleteResult *istat);
 extern bool index_can_return(Relation indexRelation, int attno);
+extern bool index_skip(IndexScanDesc scan, ScanDirection direction, int prefix);
 extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum,
 									uint16 procnum);
 extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum,
diff --git a/src/include/access/sdir.h b/src/include/access/sdir.h
index 1ab4d5e19a..fd71629da4 100644
--- a/src/include/access/sdir.h
+++ b/src/include/access/sdir.h
@@ -55,4 +55,11 @@ typedef enum ScanDirection
 #define ScanDirectionIsForward(direction) \
 	((bool) ((direction) == ForwardScanDirection))
 
+/*
+ * ScanDirectionsAreOpposite
+ *		True iff scan directions are backward/forward or forward/backward.
+ */
+#define ScanDirectionsAreOpposite(dirA, dirB) \
+	((bool) (dirA != NoMovementScanDirection && dirA == -dirB))
+
 #endif							/* SDIR_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 44dd73fc80..cf36e6c0e6 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1533,6 +1533,8 @@ typedef struct IndexOnlyScanState
 	struct IndexScanDescData *ioss_ScanDesc;
 	TupleTableSlot *ioss_TableSlot;
 	Buffer		ioss_VMBuffer;
+	int         ioss_SkipPrefixSize;
+	bool		ioss_FirstTupleEmitted;
 	Size		ioss_PscanLen;
 } IndexOnlyScanState;
 
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 056b13826a..40997ee759 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1243,6 +1243,9 @@ typedef struct Path
  * we need not recompute them when considering using the same index in a
  * bitmap index/heap scan (see BitmapHeapPath).  The costs of the IndexPath
  * itself represent the costs of an IndexScan or IndexOnlyScan plan type.
+ *
+ * 'indexskipprefix' represents the number of columns to consider for skip
+ * scans.
  *----------
  */
 typedef struct IndexPath
@@ -1255,6 +1258,7 @@ typedef struct IndexPath
 	ScanDirection indexscandir;
 	Cost		indextotalcost;
 	Selectivity indexselectivity;
+	int			indexskipprefix;
 } IndexPath;
 
 /*
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 0b518ce6b2..6b3eefebc6 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -453,6 +453,8 @@ typedef struct IndexOnlyScan
 	List	   *indexorderby;	/* list of index ORDER BY exprs */
 	List	   *indextlist;		/* TargetEntry list describing index's cols */
 	ScanDirection indexorderdir;	/* forward or backward or don't care */
+	int			indexskipprefixsize;	/* the size of the prefix for distinct
+										 * scans */
 } IndexOnlyScan;
 
 /* ----------------
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 356a51f370..03d5816c82 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -50,6 +50,7 @@ extern PGDLLIMPORT int max_parallel_workers_per_gather;
 extern PGDLLIMPORT bool enable_seqscan;
 extern PGDLLIMPORT bool enable_indexscan;
 extern PGDLLIMPORT bool enable_indexonlyscan;
+extern PGDLLIMPORT bool enable_indexskipscan;
 extern PGDLLIMPORT bool enable_bitmapscan;
 extern PGDLLIMPORT bool enable_tidscan;
 extern PGDLLIMPORT bool enable_sort;
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index bb6d730e93..227cda4bd7 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -218,6 +218,10 @@ extern UpperUniquePath *create_upper_unique_path(PlannerInfo *root,
 												 Path *subpath,
 												 int numCols,
 												 double numGroups);
+extern IndexPath *create_skipscan_unique_path(PlannerInfo *root,
+											  IndexOptInfo *index,
+											  Path *subpath,
+											  int prefix);
 extern AggPath *create_agg_path(PlannerInfo *root,
 								RelOptInfo *rel,
 								Path *subpath,
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index 3dfa21adad..72b3bd9059 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -212,8 +212,11 @@ extern Path *get_cheapest_fractional_path_for_pathkeys(List *paths,
 													   Relids required_outer,
 													   double fraction);
 extern Path *get_cheapest_parallel_safe_total_inner(List *paths);
+extern int find_index_prefix_for_pathkey(List *index_pathkeys,
+										 List *pathkey_positions,
+										 PathKey *target_pathkey);
 extern List *build_index_pathkeys(PlannerInfo *root, IndexOptInfo *index,
-								  ScanDirection scandir);
+								  ScanDirection scandir, List **positions);
 extern List *build_partition_pathkeys(PlannerInfo *root, RelOptInfo *partrel,
 									  ScanDirection scandir, bool *partialkeys);
 extern List *build_expression_pathkey(PlannerInfo *root, Expr *expr,
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index 442eeb1e3f..1ada679d46 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -110,6 +110,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_incremental_sort        | on
  enable_indexonlyscan           | on
  enable_indexscan               | on
+ enable_indexskipscan           | on
  enable_material                | on
  enable_memoize                 | on
  enable_mergejoin               | on
@@ -122,7 +123,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(20 rows)
+(21 rows)
 
 -- Test that the pg_timezone_names and pg_timezone_abbrevs views are
 -- more-or-less working.  We can't test their contents in any great detail
-- 
2.32.0

>From 0fef4ca321421b442e964ee4ae9b5cb721452750 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <[email protected]>
Date: Thu, 20 May 2021 21:13:38 +0200
Subject: [PATCH v41 3/6] amskip implementation for Btree

Btree implementation of index am method amskip for Index Skip Scan. To
make it more robust and suitable for both situations:

* small number of distinct values (e.g. because of the planner
underestimation)

* significant amounts of distinct values

a mixed approach is implemented. Instead of restarting the search for
every value, first we check if there is a next distinct value on the
current page. Only then if no such value was found restart and search
from the tree root.

No support for backward scan is implemented in case of a scroll cursor,
instead a Material node will be put on top.

Author: Jesper Pedersen, Dmitry Dolgov
Reviewed-by: Thomas Munro, David Rowley, Floris Van Nee, Kyotaro Horiguchi, Tomas Vondra, Peter Geoghegan
---
 src/backend/access/nbtree/nbtree.c            |  12 +
 src/backend/access/nbtree/nbtsearch.c         | 215 +++++-
 src/include/access/nbtree.h                   |   5 +
 src/test/regress/expected/join.out            |   3 +
 src/test/regress/expected/select_distinct.out | 642 ++++++++++++++++++
 src/test/regress/sql/join.sql                 |   5 +
 src/test/regress/sql/select_distinct.sql      | 282 ++++++++
 7 files changed, 1163 insertions(+), 1 deletion(-)

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index c9b4964c1e..7b2aa594fb 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -125,6 +125,7 @@ bthandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = btbulkdelete;
 	amroutine->amvacuumcleanup = btvacuumcleanup;
 	amroutine->amcanreturn = btcanreturn;
+	amroutine->amskip = btskip;
 	amroutine->amcostestimate = btcostestimate;
 	amroutine->amoptions = btoptions;
 	amroutine->amproperty = btproperty;
@@ -376,6 +377,8 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 	 */
 	so->currTuples = so->markTuples = NULL;
 
+	so->skipScanKey = NULL;
+
 	scan->xs_itupdesc = RelationGetDescr(rel);
 
 	scan->opaque = so;
@@ -442,6 +445,15 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	_bt_preprocess_array_keys(scan);
 }
 
+/*
+ * btskip() -- skip to the beginning of the next key prefix
+ */
+bool
+btskip(IndexScanDesc scan, ScanDirection direction, int prefix)
+{
+	return _bt_skip(scan, direction, prefix);
+}
+
 /*
  *	btendscan() -- close down a scan
  */
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 9d82d4904d..0ad761916a 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -45,7 +45,11 @@ static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
 static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
 static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
-
+static inline void _bt_update_skip_scankeys(IndexScanDesc scan,
+											Relation indexRel);
+static inline bool _bt_scankey_within_page(IndexScanDesc scan,
+										   BTScanInsert key,
+										   Buffer buf);
 
 /*
  *	_bt_drop_lock_and_maybe_pin()
@@ -1498,6 +1502,161 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
 	return true;
 }
 
+/*
+ *  _bt_skip() -- Skip items that have the same prefix as the most recently
+ * 				  fetched index tuple.
+ *
+ * 		The current position is set so that a subsequent call to _bt_next will
+ * 		fetch the first tuple that differs in the leading 'prefix' keys.
+ *
+ * 		The current page is searched for the next unique value. If none is found
+ * 		we will do a scan from the root in order to find the next page with
+ * 		a unique value.
+ */
+bool
+_bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	BTStack stack;
+	Buffer buf;
+	OffsetNumber offnum;
+	BTScanPosItem *currItem;
+	Relation 	 indexRel = scan->indexRelation;
+	bool scanstart = !BTScanPosIsValid(so->currPos);
+
+	/* We want to return tuples, and we need a starting point */
+	Assert(scan->xs_want_itup);
+	Assert(scan->xs_itup);
+
+	if (so->numKilled > 0)
+		_bt_killitems(scan);
+
+	/* If skipScanKey is NULL then we initialize it with _bt_mkscankey */
+	if (so->skipScanKey == NULL)
+	{
+		so->skipScanKey = _bt_mkscankey(indexRel, scan->xs_itup);
+		so->skipScanKey->keysz = prefix;
+		so->skipScanKey->scantid = NULL;
+	}
+	so->skipScanKey->nextkey = ScanDirectionIsForward(dir);
+	_bt_update_skip_scankeys(scan, indexRel);
+
+	/* Check if the next unique key can be found within the current page.
+	 * Since we do not lock the current page between jumps, it's possible
+	 * that it was splitted since the last time we saw it. This is fine in
+	 * case of scanning forward, since page split to the right and we are
+	 * still on the left most page. In case of scanning backwards it's
+	 * possible to loose some pages and we need to remember the previous
+	 * page, and then follow the right link from the current page until we
+	 * find the original one.
+	 *
+	 * Since the whole idea of checking the current page is to protect
+	 * ourselves and make more performant statistic mismatch case when
+	 * there are too many distinct values for jumping, it's not clear if
+	 * the complexity of this solution in case of backward scan is
+	 * justified, so for now just avoid it.
+	 */
+	if (BufferIsValid(so->currPos.buf) && ScanDirectionIsForward(dir))
+	{
+		_bt_lockbuf(indexRel, so->currPos.buf, BT_READ);
+
+		if (_bt_scankey_within_page(scan, so->skipScanKey, so->currPos.buf))
+		{
+			bool keyFound = false;
+
+			offnum = _bt_binsrch(scan->indexRelation, so->skipScanKey, so->currPos.buf);
+
+			/* Lock the page for SERIALIZABLE transactions */
+			PredicateLockPage(scan->indexRelation, BufferGetBlockNumber(so->currPos.buf),
+							  scan->xs_snapshot);
+
+			/* We know in which direction to look */
+			_bt_initialize_more_data(so, dir);
+
+			/* Now read the data */
+			keyFound = _bt_readpage(scan, dir, offnum);
+
+			_bt_relbuf(indexRel, so->currPos.buf);
+			so->currPos.buf = InvalidBuffer;
+
+			if (keyFound)
+			{
+				/* set IndexTuple */
+				currItem = &so->currPos.items[so->currPos.itemIndex];
+				scan->xs_heaptid = currItem->heapTid;
+				scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+				return true;
+			}
+		}
+		else
+			_bt_unlockbuf(indexRel, so->currPos.buf);
+	}
+
+	if (BufferIsValid(so->currPos.buf))
+	{
+		ReleaseBuffer(so->currPos.buf);
+		so->currPos.buf = InvalidBuffer;
+	}
+
+	/*
+	 * We haven't found scan key within the current page, so let's scan from
+	 * the root. Use _bt_search and _bt_binsrch to get the buffer and offset
+	 * number
+	 */
+	stack = _bt_search(scan->indexRelation, so->skipScanKey,
+					   &buf, BT_READ, scan->xs_snapshot);
+	_bt_freestack(stack);
+	so->currPos.buf = buf;
+	offnum = _bt_binsrch(scan->indexRelation, so->skipScanKey, buf);
+
+	/* Lock the page for SERIALIZABLE transactions */
+	PredicateLockPage(scan->indexRelation, BufferGetBlockNumber(buf),
+					  scan->xs_snapshot);
+
+	/* We know in which direction to look */
+	_bt_initialize_more_data(so, dir);
+
+	/*
+	 * Simplest case is when both directions are forward, when we are already
+	 * at the next distinct key at the beginning of the series (so everything
+	 * else would be done in _bt_readpage)
+	 *
+	 * The case when both directions are backwards is also simple, but we need
+	 * to go one step back, since we need a last element from the previous
+	 * series.
+	 */
+	if (ScanDirectionIsBackward(dir) || (ScanDirectionIsForward(dir) & scanstart))
+		 offnum = OffsetNumberPrev(offnum);
+
+	/* Now read the data */
+	if (!_bt_readpage(scan, dir, offnum))
+	{
+		/*
+		 * There's no actually-matching data on this page.  Try to advance to
+		 * the next page.  Return false if there's no matching data at all.
+		 */
+		_bt_unlockbuf(indexRel, so->currPos.buf);
+		if (!_bt_steppage(scan, dir))
+		{
+			pfree(so->skipScanKey);
+			so->skipScanKey = NULL;
+			return false;
+		}
+	}
+	else
+		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+
+	/* And set IndexTuple */
+	currItem = &so->currPos.items[so->currPos.itemIndex];
+	scan->xs_heaptid = currItem->heapTid;
+	scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+	so->currPos.moreLeft = true;
+	so->currPos.moreRight = true;
+
+	return true;
+}
+
 /*
  *	_bt_readpage() -- Load data from current index page into so->currPos
  *
@@ -2494,3 +2653,57 @@ _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
 	so->numKilled = 0;			/* just paranoia */
 	so->markItemIndex = -1;		/* ditto */
 }
+
+/*
+ * _bt_update_skip_scankeys() -- set up new values for the existing scankeys
+ * based on the current index tuple
+ */
+static inline void
+_bt_update_skip_scankeys(IndexScanDesc scan, Relation indexRel)
+{
+	TupleDesc		itupdesc;
+	int			indnkeyatts,
+				i;
+	BTScanOpaque 	so = (BTScanOpaque) scan->opaque;
+	ScanKey			scankeys = so->skipScanKey->scankeys;
+
+	itupdesc = RelationGetDescr(indexRel);
+	indnkeyatts = IndexRelationGetNumberOfKeyAttributes(indexRel);
+	for (i = 0; i < indnkeyatts; i++)
+	{
+		Datum datum;
+		bool null;
+		int flags;
+
+		datum = index_getattr(scan->xs_itup, i + 1, itupdesc, &null);
+		flags = (null ? SK_ISNULL : 0) |
+				(indexRel->rd_indoption[i] << SK_BT_INDOPTION_SHIFT);
+		scankeys[i].sk_flags = flags;
+		scankeys[i].sk_argument = datum;
+	}
+}
+
+/*
+ * _bt_scankey_within_page() -- check if the provided scankey could be found
+ * within a page, specified by the buffer.
+ *
+ * Scankey nextkey will tell us if we need to find a current key or the next
+ * one, which affects whether or not it's ok to be equal to the page highkey.
+ */
+static inline bool
+_bt_scankey_within_page(IndexScanDesc scan, BTScanInsert key, Buffer buf)
+{
+	OffsetNumber low, high;
+	Page page = BufferGetPage(buf);
+	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	int high_compare = key->nextkey ? 0 : 1;
+
+	low = P_FIRSTDATAKEY(opaque);
+	high = PageGetMaxOffsetNumber(page);
+
+	if (unlikely(high < low))
+		return false;
+
+	return (_bt_compare(scan->indexRelation, key, page, low) > 0 &&
+			_bt_compare(scan->indexRelation, key, page, high) < high_compare);
+}
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 9fec6fb1a8..2c516654c2 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1064,6 +1064,9 @@ typedef struct BTScanOpaqueData
 	 */
 	int			markItemIndex;	/* itemIndex, or -1 if not valid */
 
+	/* Work space for _bt_skip */
+	BTScanInsert	skipScanKey;	/* used to control skipping */
+
 	/* keep these last in struct for efficiency */
 	BTScanPosData currPos;		/* current position data */
 	BTScanPosData markPos;		/* marked position, if any */
@@ -1229,6 +1232,7 @@ extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
 extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
+extern bool _bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix);
 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
 							   Snapshot snapshot);
 
@@ -1253,6 +1257,7 @@ extern void _bt_end_vacuum_callback(int code, Datum arg);
 extern Size BTreeShmemSize(void);
 extern void BTreeShmemInit(void);
 extern bytea *btoptions(Datum reloptions, bool validate);
+extern bool btskip(IndexScanDesc scan, ScanDirection dir, int prefix);
 extern bool btproperty(Oid index_oid, int attno,
 					   IndexAMProperty prop, const char *propname,
 					   bool *res, bool *isnull);
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index 19caebabd0..ebad6d4ae1 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -4614,6 +4614,8 @@ select d.* from d left join (select * from b group by b.id, b.c_id) s
          ->  Seq Scan on d
 (8 rows)
 
+-- disable index skip scan to prevent it interfering with the plan
+set enable_indexskipscan to off;
 -- similarly, but keying off a DISTINCT clause
 explain (costs off)
 select d.* from d left join (select distinct * from b) s
@@ -4631,6 +4633,7 @@ select d.* from d left join (select distinct * from b) s
          ->  Seq Scan on d
 (9 rows)
 
+set enable_indexskipscan to on;
 -- check join removal works when uniqueness of the join condition is enforced
 -- by a UNION
 explain (costs off)
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
index 748419cee0..36b3291a7f 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -375,3 +375,645 @@ SELECT null IS NOT DISTINCT FROM null as "yes";
  t
 (1 row)
 
+-- index only skip scan
+CREATE TABLE distinct_a (a int, b int, c int);
+INSERT INTO distinct_a (
+    SELECT five, tenthous, 10 FROM
+    generate_series(1, 5) five,
+    generate_series(1, 10000) tenthous
+);
+CREATE INDEX ON distinct_a (a, b);
+CREATE INDEX ON distinct_a ((a + 1));
+ANALYZE distinct_a;
+SELECT DISTINCT a FROM distinct_a;
+ a 
+---
+ 1
+ 2
+ 3
+ 4
+ 5
+(5 rows)
+
+SELECT DISTINCT a FROM distinct_a WHERE a = 1;
+ a 
+---
+ 1
+(1 row)
+
+SELECT DISTINCT a FROM distinct_a ORDER BY a DESC;
+ a 
+---
+ 5
+ 4
+ 3
+ 2
+ 1
+(5 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT a FROM distinct_a;
+                       QUERY PLAN                       
+--------------------------------------------------------
+ Index Only Scan using distinct_a_a_b_idx on distinct_a
+   Skip scan: true
+   Distinct Prefix: 1
+(3 rows)
+
+-- test index skip scan with a condition on a non unique field
+SELECT DISTINCT ON (a) a, b FROM distinct_a WHERE b = 2;
+ a | b 
+---+---
+ 1 | 2
+ 2 | 2
+ 3 | 2
+ 4 | 2
+ 5 | 2
+(5 rows)
+
+-- test index skip scan backwards
+SELECT DISTINCT ON (a) a, b FROM distinct_a ORDER BY a DESC, b DESC;
+ a |   b   
+---+-------
+ 5 | 10000
+ 4 | 10000
+ 3 | 10000
+ 2 | 10000
+ 1 | 10000
+(5 rows)
+
+-- test index skip scan for expressions
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT (a + 1) FROM distinct_a ORDER BY (a + 1);
+             QUERY PLAN             
+------------------------------------
+ Sort
+   Sort Key: ((a + 1))
+   ->  HashAggregate
+         Group Key: (a + 1)
+         ->  Seq Scan on distinct_a
+(5 rows)
+
+SELECT DISTINCT (a + 1) FROM distinct_a ORDER BY (a + 1);
+ ?column? 
+----------
+        2
+        3
+        4
+        5
+        6
+(5 rows)
+
+-- check colums order
+CREATE INDEX distinct_a_b_a on distinct_a (b, a);
+SELECT DISTINCT a FROM distinct_a WHERE b = 2;
+ a 
+---
+ 1
+ 2
+ 3
+ 4
+ 5
+(5 rows)
+
+SELECT DISTINCT on (a, b) a, b FROM distinct_a WHERE b = 2;
+ a | b 
+---+---
+ 1 | 2
+ 2 | 2
+ 3 | 2
+ 4 | 2
+ 5 | 2
+(5 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT a FROM distinct_a WHERE b = 2;
+                     QUERY PLAN                     
+----------------------------------------------------
+ Index Only Scan using distinct_a_b_a on distinct_a
+   Skip scan: true
+   Distinct Prefix: 2
+   Index Cond: (b = 2)
+(4 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT on (a, b) a, b FROM distinct_a WHERE b = 2;
+                     QUERY PLAN                     
+----------------------------------------------------
+ Index Only Scan using distinct_a_b_a on distinct_a
+   Skip scan: true
+   Distinct Prefix: 2
+   Index Cond: (b = 2)
+(4 rows)
+
+DROP INDEX distinct_a_b_a;
+-- test opposite scan/index directions inside a cursor
+-- forward/backward
+BEGIN;
+DECLARE c SCROLL CURSOR FOR
+SELECT DISTINCT ON (a) a,b FROM distinct_a ORDER BY a, b;
+FETCH FROM c;
+ a | b 
+---+---
+ 1 | 1
+(1 row)
+
+FETCH BACKWARD FROM c;
+ a | b 
+---+---
+(0 rows)
+
+FETCH 6 FROM c;
+ a | b 
+---+---
+ 1 | 1
+ 2 | 1
+ 3 | 1
+ 4 | 1
+ 5 | 1
+(5 rows)
+
+FETCH BACKWARD 6 FROM c;
+ a | b 
+---+---
+ 5 | 1
+ 4 | 1
+ 3 | 1
+ 2 | 1
+ 1 | 1
+(5 rows)
+
+FETCH 6 FROM c;
+ a | b 
+---+---
+ 1 | 1
+ 2 | 1
+ 3 | 1
+ 4 | 1
+ 5 | 1
+(5 rows)
+
+FETCH BACKWARD 6 FROM c;
+ a | b 
+---+---
+ 5 | 1
+ 4 | 1
+ 3 | 1
+ 2 | 1
+ 1 | 1
+(5 rows)
+
+END;
+-- backward/forward
+BEGIN;
+DECLARE c SCROLL CURSOR FOR
+SELECT DISTINCT ON (a) a,b FROM distinct_a ORDER BY a DESC, b DESC;
+FETCH FROM c;
+ a |   b   
+---+-------
+ 5 | 10000
+(1 row)
+
+FETCH BACKWARD FROM c;
+ a | b 
+---+---
+(0 rows)
+
+FETCH 6 FROM c;
+ a |   b   
+---+-------
+ 5 | 10000
+ 4 | 10000
+ 3 | 10000
+ 2 | 10000
+ 1 | 10000
+(5 rows)
+
+FETCH BACKWARD 6 FROM c;
+ a |   b   
+---+-------
+ 1 | 10000
+ 2 | 10000
+ 3 | 10000
+ 4 | 10000
+ 5 | 10000
+(5 rows)
+
+FETCH 6 FROM c;
+ a |   b   
+---+-------
+ 5 | 10000
+ 4 | 10000
+ 3 | 10000
+ 2 | 10000
+ 1 | 10000
+(5 rows)
+
+FETCH BACKWARD 6 FROM c;
+ a |   b   
+---+-------
+ 1 | 10000
+ 2 | 10000
+ 3 | 10000
+ 4 | 10000
+ 5 | 10000
+(5 rows)
+
+END;
+-- test missing values and skipping from the end
+CREATE TABLE distinct_abc(a int, b int, c int);
+CREATE INDEX ON distinct_abc(a, b, c);
+INSERT INTO distinct_abc
+	VALUES (1, 1, 1),
+		   (1, 1, 2),
+		   (1, 2, 2),
+		   (1, 2, 3),
+		   (2, 2, 1),
+		   (2, 2, 3),
+		   (3, 1, 1),
+		   (3, 1, 2),
+		   (3, 2, 2),
+		   (3, 2, 3);
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2;
+                          QUERY PLAN                          
+--------------------------------------------------------------
+ Index Only Scan using distinct_abc_a_b_c_idx on distinct_abc
+   Skip scan: true
+   Distinct Prefix: 1
+   Index Cond: (c = 2)
+(4 rows)
+
+BEGIN;
+DECLARE c SCROLL CURSOR FOR
+SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2;
+FETCH ALL FROM c;
+ a | b | c 
+---+---+---
+ 1 | 1 | 2
+ 3 | 1 | 2
+(2 rows)
+
+FETCH BACKWARD ALL FROM c;
+ a | b | c 
+---+---+---
+ 3 | 1 | 2
+ 1 | 1 | 2
+(2 rows)
+
+END;
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2
+ORDER BY a DESC, b DESC;
+                              QUERY PLAN                               
+-----------------------------------------------------------------------
+ Index Only Scan Backward using distinct_abc_a_b_c_idx on distinct_abc
+   Skip scan: true
+   Distinct Prefix: 1
+   Index Cond: (c = 2)
+(4 rows)
+
+BEGIN;
+DECLARE c SCROLL CURSOR FOR
+SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2
+ORDER BY a DESC, b DESC;
+FETCH ALL FROM c;
+ a | b | c 
+---+---+---
+ 3 | 2 | 2
+ 1 | 2 | 2
+(2 rows)
+
+FETCH BACKWARD ALL FROM c;
+ a | b | c 
+---+---+---
+ 1 | 2 | 2
+ 3 | 2 | 2
+(2 rows)
+
+END;
+DROP TABLE distinct_abc;
+-- check colums order
+SELECT DISTINCT a FROM distinct_a WHERE b = 2 AND c = 10;
+ a 
+---
+ 1
+ 2
+ 3
+ 4
+ 5
+(5 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT a FROM distinct_a WHERE b = 2 AND c = 10;
+                       QUERY PLAN                        
+---------------------------------------------------------
+ Unique
+   ->  Index Scan using distinct_a_a_b_idx on distinct_a
+         Index Cond: (b = 2)
+         Filter: (c = 10)
+(4 rows)
+
+-- check projection case
+SELECT DISTINCT a, a FROM distinct_a WHERE b = 2;
+ a | a 
+---+---
+ 1 | 1
+ 2 | 2
+ 3 | 3
+ 4 | 4
+ 5 | 5
+(5 rows)
+
+SELECT DISTINCT a, 1 FROM distinct_a WHERE b = 2;
+ a | ?column? 
+---+----------
+ 1 |        1
+ 2 |        1
+ 3 |        1
+ 4 |        1
+ 5 |        1
+(5 rows)
+
+-- test cursor forward/backward movements
+BEGIN;
+DECLARE c SCROLL CURSOR FOR SELECT DISTINCT a FROM distinct_a;
+FETCH FROM c;
+ a 
+---
+ 1
+(1 row)
+
+FETCH BACKWARD FROM c;
+ a 
+---
+(0 rows)
+
+FETCH 6 FROM c;
+ a 
+---
+ 1
+ 2
+ 3
+ 4
+ 5
+(5 rows)
+
+FETCH BACKWARD 6 FROM c;
+ a 
+---
+ 5
+ 4
+ 3
+ 2
+ 1
+(5 rows)
+
+FETCH 6 FROM c;
+ a 
+---
+ 1
+ 2
+ 3
+ 4
+ 5
+(5 rows)
+
+FETCH BACKWARD 6 FROM c;
+ a 
+---
+ 5
+ 4
+ 3
+ 2
+ 1
+(5 rows)
+
+END;
+DROP TABLE distinct_a;
+-- test tuples visibility
+CREATE TABLE distinct_visibility (a int, b int);
+INSERT INTO distinct_visibility (select a, b from generate_series(1,5) a, generate_series(1, 10000) b);
+CREATE INDEX ON distinct_visibility (a, b);
+ANALYZE distinct_visibility;
+SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a, b;
+ a | b 
+---+---
+ 1 | 1
+ 2 | 1
+ 3 | 1
+ 4 | 1
+ 5 | 1
+(5 rows)
+
+DELETE FROM distinct_visibility WHERE a = 2 and b = 1;
+SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a, b;
+ a | b 
+---+---
+ 1 | 1
+ 2 | 2
+ 3 | 1
+ 4 | 1
+ 5 | 1
+(5 rows)
+
+SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a DESC, b DESC;
+ a |   b   
+---+-------
+ 5 | 10000
+ 4 | 10000
+ 3 | 10000
+ 2 | 10000
+ 1 | 10000
+(5 rows)
+
+DELETE FROM distinct_visibility WHERE a = 2 and b = 10000;
+SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a DESC, b DESC;
+ a |   b   
+---+-------
+ 5 | 10000
+ 4 | 10000
+ 3 | 10000
+ 2 |  9999
+ 1 | 10000
+(5 rows)
+
+DROP TABLE distinct_visibility;
+-- test page boundaries
+CREATE TABLE distinct_boundaries AS
+    SELECT a, b::int2 b, (b % 2)::int2 c FROM
+        generate_series(1, 5) a,
+        generate_series(1,366) b;
+CREATE INDEX ON distinct_boundaries (a, b, c);
+ANALYZE distinct_boundaries;
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (a) a, b, c from distinct_boundaries
+WHERE b >= 1 and c = 0 ORDER BY a, b;
+                                 QUERY PLAN                                 
+----------------------------------------------------------------------------
+ Index Only Scan using distinct_boundaries_a_b_c_idx on distinct_boundaries
+   Skip scan: true
+   Distinct Prefix: 1
+   Index Cond: ((b >= 1) AND (c = 0))
+(4 rows)
+
+SELECT DISTINCT ON (a) a, b, c from distinct_boundaries
+WHERE b >= 1 and c = 0 ORDER BY a, b;
+ a | b | c 
+---+---+---
+ 1 | 2 | 0
+ 2 | 2 | 0
+ 3 | 2 | 0
+ 4 | 2 | 0
+ 5 | 2 | 0
+(5 rows)
+
+DROP TABLE distinct_boundaries;
+-- test tuple killing
+-- DESC ordering
+CREATE TABLE distinct_killed AS
+    SELECT a, b, b % 2 AS c, 10 AS d
+        FROM generate_series(1, 5) a,
+             generate_series(1,1000) b;
+CREATE INDEX ON distinct_killed (a, b, c, d);
+DELETE FROM distinct_killed where a = 3;
+BEGIN;
+    DECLARE c SCROLL CURSOR FOR
+    SELECT DISTINCT ON (a) a,b,c,d
+    FROM distinct_killed ORDER BY a DESC, b DESC;
+    FETCH FORWARD ALL FROM c;
+ a |  b   | c | d  
+---+------+---+----
+ 5 | 1000 | 0 | 10
+ 4 | 1000 | 0 | 10
+ 2 | 1000 | 0 | 10
+ 1 | 1000 | 0 | 10
+(4 rows)
+
+    FETCH BACKWARD ALL FROM c;
+ a |  b   | c | d  
+---+------+---+----
+ 1 | 1000 | 0 | 10
+ 2 | 1000 | 0 | 10
+ 4 | 1000 | 0 | 10
+ 5 | 1000 | 0 | 10
+(4 rows)
+
+COMMIT;
+DROP TABLE distinct_killed;
+-- regular ordering
+CREATE TABLE distinct_killed AS
+    SELECT a, b, b % 2 AS c, 10 AS d
+        FROM generate_series(1, 5) a,
+             generate_series(1,1000) b;
+CREATE INDEX ON distinct_killed (a, b, c, d);
+DELETE FROM distinct_killed where a = 3;
+BEGIN;
+    DECLARE c SCROLL CURSOR FOR
+    SELECT DISTINCT ON (a) a,b,c,d
+    FROM distinct_killed ORDER BY a, b;
+    FETCH FORWARD ALL FROM c;
+ a | b | c | d  
+---+---+---+----
+ 1 | 1 | 1 | 10
+ 2 | 1 | 1 | 10
+ 4 | 1 | 1 | 10
+ 5 | 1 | 1 | 10
+(4 rows)
+
+    FETCH BACKWARD ALL FROM c;
+ a | b | c | d  
+---+---+---+----
+ 5 | 1 | 1 | 10
+ 4 | 1 | 1 | 10
+ 2 | 1 | 1 | 10
+ 1 | 1 | 1 | 10
+(4 rows)
+
+COMMIT;
+DROP TABLE distinct_killed;
+-- partial delete
+CREATE TABLE distinct_killed AS
+    SELECT a, b, b % 2 AS c, 10 AS d
+        FROM generate_series(1, 5) a,
+             generate_series(1,1000) b;
+CREATE INDEX ON distinct_killed (a, b, c, d);
+DELETE FROM distinct_killed WHERE a = 3 AND b <= 999;
+BEGIN;
+    DECLARE c SCROLL CURSOR FOR
+    SELECT DISTINCT ON (a) a,b,c,d
+    FROM distinct_killed ORDER BY a DESC, b DESC;
+    FETCH FORWARD ALL FROM c;
+ a |  b   | c | d  
+---+------+---+----
+ 5 | 1000 | 0 | 10
+ 4 | 1000 | 0 | 10
+ 3 | 1000 | 0 | 10
+ 2 | 1000 | 0 | 10
+ 1 | 1000 | 0 | 10
+(5 rows)
+
+    FETCH BACKWARD ALL FROM c;
+ a |  b   | c | d  
+---+------+---+----
+ 1 | 1000 | 0 | 10
+ 2 | 1000 | 0 | 10
+ 3 | 1000 | 0 | 10
+ 4 | 1000 | 0 | 10
+ 5 | 1000 | 0 | 10
+(5 rows)
+
+COMMIT;
+DROP TABLE distinct_killed;
+-- test posting lists
+CREATE TABLE distinct_posting (a int, b int, c int);
+CREATE INDEX ON distinct_posting (a, b, c);
+INSERT INTO distinct_posting
+	VALUES (1, 1, 1),
+		   (1, 1, 2),
+		   (1, 2, 2),
+		   (1, 2, 3),
+		   (2, 2, 1),
+		   (2, 2, 3),
+		   (3, 1, 1),
+		   (3, 1, 2),
+		   (3, 2, 2),
+		   (3, 2, 3);
+INSERT INTO distinct_posting (
+    SELECT 1 as a, 1 as b, 1 AS c
+        FROM generate_series(1,1000) i
+);
+BEGIN;
+    DECLARE c SCROLL CURSOR FOR
+    SELECT DISTINCT ON (a) a,b,c FROM distinct_posting WHERE c = 2
+    ORDER BY a DESC, b DESC;
+    FETCH ALL FROM c;
+ a | b | c 
+---+---+---
+ 3 | 2 | 2
+ 1 | 2 | 2
+(2 rows)
+
+    FETCH BACKWARD ALL FROM c;
+ a | b | c 
+---+---+---
+ 1 | 2 | 2
+ 3 | 2 | 2
+(2 rows)
+
+COMMIT;
+-- test that quals are check for indexability before applied
+CREATE TABLE Indexable_quals (a text, b text, c text);
+CREATE INDEX ON indexable_quals (a, b, c);
+INSERT INTO indexable_quals VALUES ('a1', 'b', 'xxx');
+INSERT INTO indexable_quals VALUES ('a1', 'b', 'yyy');
+INSERT INTO indexable_quals VALUES ('a2', 'b', 'xxx');
+INSERT INTO indexable_quals VALUES ('a2', 'b', 'yyy');
+SELECT DISTINCT ON (a, b)  a, b
+FROM indexable_quals WHERE c LIKE '%y%' AND a LIKE 'a%' AND b = 'b';
+ a  | b 
+----+---
+ a1 | b
+ a2 | b
+(2 rows)
+
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql
index 6dd01b022e..972d58fd54 100644
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -1585,11 +1585,16 @@ explain (costs off)
 select d.* from d left join (select * from b group by b.id, b.c_id) s
   on d.a = s.id;
 
+-- disable index skip scan to prevent it interfering with the plan
+set enable_indexskipscan to off;
+
 -- similarly, but keying off a DISTINCT clause
 explain (costs off)
 select d.* from d left join (select distinct * from b) s
   on d.a = s.id;
 
+set enable_indexskipscan to on;
+
 -- check join removal works when uniqueness of the join condition is enforced
 -- by a UNION
 explain (costs off)
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql
index f27ff714f8..c9ccf4cc7d 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -174,3 +174,285 @@ SELECT 1 IS NOT DISTINCT FROM 2 as "no";
 SELECT 2 IS NOT DISTINCT FROM 2 as "yes";
 SELECT 2 IS NOT DISTINCT FROM null as "no";
 SELECT null IS NOT DISTINCT FROM null as "yes";
+
+-- index only skip scan
+CREATE TABLE distinct_a (a int, b int, c int);
+INSERT INTO distinct_a (
+    SELECT five, tenthous, 10 FROM
+    generate_series(1, 5) five,
+    generate_series(1, 10000) tenthous
+);
+CREATE INDEX ON distinct_a (a, b);
+CREATE INDEX ON distinct_a ((a + 1));
+ANALYZE distinct_a;
+
+SELECT DISTINCT a FROM distinct_a;
+SELECT DISTINCT a FROM distinct_a WHERE a = 1;
+SELECT DISTINCT a FROM distinct_a ORDER BY a DESC;
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT a FROM distinct_a;
+
+-- test index skip scan with a condition on a non unique field
+SELECT DISTINCT ON (a) a, b FROM distinct_a WHERE b = 2;
+
+-- test index skip scan backwards
+SELECT DISTINCT ON (a) a, b FROM distinct_a ORDER BY a DESC, b DESC;
+
+-- test index skip scan for expressions
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT (a + 1) FROM distinct_a ORDER BY (a + 1);
+SELECT DISTINCT (a + 1) FROM distinct_a ORDER BY (a + 1);
+
+-- check colums order
+CREATE INDEX distinct_a_b_a on distinct_a (b, a);
+
+SELECT DISTINCT a FROM distinct_a WHERE b = 2;
+SELECT DISTINCT on (a, b) a, b FROM distinct_a WHERE b = 2;
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT a FROM distinct_a WHERE b = 2;
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT on (a, b) a, b FROM distinct_a WHERE b = 2;
+
+DROP INDEX distinct_a_b_a;
+
+-- test opposite scan/index directions inside a cursor
+-- forward/backward
+BEGIN;
+DECLARE c SCROLL CURSOR FOR
+SELECT DISTINCT ON (a) a,b FROM distinct_a ORDER BY a, b;
+
+FETCH FROM c;
+FETCH BACKWARD FROM c;
+
+FETCH 6 FROM c;
+FETCH BACKWARD 6 FROM c;
+
+FETCH 6 FROM c;
+FETCH BACKWARD 6 FROM c;
+
+END;
+
+-- backward/forward
+BEGIN;
+DECLARE c SCROLL CURSOR FOR
+SELECT DISTINCT ON (a) a,b FROM distinct_a ORDER BY a DESC, b DESC;
+
+FETCH FROM c;
+FETCH BACKWARD FROM c;
+
+FETCH 6 FROM c;
+FETCH BACKWARD 6 FROM c;
+
+FETCH 6 FROM c;
+FETCH BACKWARD 6 FROM c;
+
+END;
+
+-- test missing values and skipping from the end
+CREATE TABLE distinct_abc(a int, b int, c int);
+CREATE INDEX ON distinct_abc(a, b, c);
+INSERT INTO distinct_abc
+	VALUES (1, 1, 1),
+		   (1, 1, 2),
+		   (1, 2, 2),
+		   (1, 2, 3),
+		   (2, 2, 1),
+		   (2, 2, 3),
+		   (3, 1, 1),
+		   (3, 1, 2),
+		   (3, 2, 2),
+		   (3, 2, 3);
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2;
+
+BEGIN;
+DECLARE c SCROLL CURSOR FOR
+SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2;
+
+FETCH ALL FROM c;
+FETCH BACKWARD ALL FROM c;
+
+END;
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2
+ORDER BY a DESC, b DESC;
+
+BEGIN;
+DECLARE c SCROLL CURSOR FOR
+SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2
+ORDER BY a DESC, b DESC;
+
+FETCH ALL FROM c;
+FETCH BACKWARD ALL FROM c;
+
+END;
+
+DROP TABLE distinct_abc;
+
+-- check colums order
+SELECT DISTINCT a FROM distinct_a WHERE b = 2 AND c = 10;
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT a FROM distinct_a WHERE b = 2 AND c = 10;
+
+-- check projection case
+SELECT DISTINCT a, a FROM distinct_a WHERE b = 2;
+SELECT DISTINCT a, 1 FROM distinct_a WHERE b = 2;
+
+-- test cursor forward/backward movements
+BEGIN;
+DECLARE c SCROLL CURSOR FOR SELECT DISTINCT a FROM distinct_a;
+
+FETCH FROM c;
+FETCH BACKWARD FROM c;
+
+FETCH 6 FROM c;
+FETCH BACKWARD 6 FROM c;
+
+FETCH 6 FROM c;
+FETCH BACKWARD 6 FROM c;
+
+END;
+
+DROP TABLE distinct_a;
+
+-- test tuples visibility
+CREATE TABLE distinct_visibility (a int, b int);
+INSERT INTO distinct_visibility (select a, b from generate_series(1,5) a, generate_series(1, 10000) b);
+CREATE INDEX ON distinct_visibility (a, b);
+ANALYZE distinct_visibility;
+
+SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a, b;
+DELETE FROM distinct_visibility WHERE a = 2 and b = 1;
+SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a, b;
+
+SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a DESC, b DESC;
+DELETE FROM distinct_visibility WHERE a = 2 and b = 10000;
+SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a DESC, b DESC;
+DROP TABLE distinct_visibility;
+
+-- test page boundaries
+CREATE TABLE distinct_boundaries AS
+    SELECT a, b::int2 b, (b % 2)::int2 c FROM
+        generate_series(1, 5) a,
+        generate_series(1,366) b;
+
+CREATE INDEX ON distinct_boundaries (a, b, c);
+ANALYZE distinct_boundaries;
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (a) a, b, c from distinct_boundaries
+WHERE b >= 1 and c = 0 ORDER BY a, b;
+
+SELECT DISTINCT ON (a) a, b, c from distinct_boundaries
+WHERE b >= 1 and c = 0 ORDER BY a, b;
+
+DROP TABLE distinct_boundaries;
+
+-- test tuple killing
+
+-- DESC ordering
+CREATE TABLE distinct_killed AS
+    SELECT a, b, b % 2 AS c, 10 AS d
+        FROM generate_series(1, 5) a,
+             generate_series(1,1000) b;
+
+CREATE INDEX ON distinct_killed (a, b, c, d);
+
+DELETE FROM distinct_killed where a = 3;
+
+BEGIN;
+    DECLARE c SCROLL CURSOR FOR
+    SELECT DISTINCT ON (a) a,b,c,d
+    FROM distinct_killed ORDER BY a DESC, b DESC;
+    FETCH FORWARD ALL FROM c;
+    FETCH BACKWARD ALL FROM c;
+COMMIT;
+
+DROP TABLE distinct_killed;
+
+-- regular ordering
+CREATE TABLE distinct_killed AS
+    SELECT a, b, b % 2 AS c, 10 AS d
+        FROM generate_series(1, 5) a,
+             generate_series(1,1000) b;
+
+CREATE INDEX ON distinct_killed (a, b, c, d);
+
+DELETE FROM distinct_killed where a = 3;
+
+BEGIN;
+    DECLARE c SCROLL CURSOR FOR
+    SELECT DISTINCT ON (a) a,b,c,d
+    FROM distinct_killed ORDER BY a, b;
+    FETCH FORWARD ALL FROM c;
+    FETCH BACKWARD ALL FROM c;
+COMMIT;
+
+DROP TABLE distinct_killed;
+
+-- partial delete
+CREATE TABLE distinct_killed AS
+    SELECT a, b, b % 2 AS c, 10 AS d
+        FROM generate_series(1, 5) a,
+             generate_series(1,1000) b;
+
+CREATE INDEX ON distinct_killed (a, b, c, d);
+
+DELETE FROM distinct_killed WHERE a = 3 AND b <= 999;
+
+BEGIN;
+    DECLARE c SCROLL CURSOR FOR
+    SELECT DISTINCT ON (a) a,b,c,d
+    FROM distinct_killed ORDER BY a DESC, b DESC;
+    FETCH FORWARD ALL FROM c;
+    FETCH BACKWARD ALL FROM c;
+COMMIT;
+
+DROP TABLE distinct_killed;
+
+-- test posting lists
+CREATE TABLE distinct_posting (a int, b int, c int);
+CREATE INDEX ON distinct_posting (a, b, c);
+INSERT INTO distinct_posting
+	VALUES (1, 1, 1),
+		   (1, 1, 2),
+		   (1, 2, 2),
+		   (1, 2, 3),
+		   (2, 2, 1),
+		   (2, 2, 3),
+		   (3, 1, 1),
+		   (3, 1, 2),
+		   (3, 2, 2),
+		   (3, 2, 3);
+
+INSERT INTO distinct_posting (
+    SELECT 1 as a, 1 as b, 1 AS c
+        FROM generate_series(1,1000) i
+);
+
+BEGIN;
+    DECLARE c SCROLL CURSOR FOR
+    SELECT DISTINCT ON (a) a,b,c FROM distinct_posting WHERE c = 2
+    ORDER BY a DESC, b DESC;
+    FETCH ALL FROM c;
+
+    FETCH BACKWARD ALL FROM c;
+COMMIT;
+
+-- test that quals are check for indexability before applied
+CREATE TABLE Indexable_quals (a text, b text, c text);
+CREATE INDEX ON indexable_quals (a, b, c);
+
+INSERT INTO indexable_quals VALUES ('a1', 'b', 'xxx');
+INSERT INTO indexable_quals VALUES ('a1', 'b', 'yyy');
+INSERT INTO indexable_quals VALUES ('a2', 'b', 'xxx');
+INSERT INTO indexable_quals VALUES ('a2', 'b', 'yyy');
+
+SELECT DISTINCT ON (a, b)  a, b
+FROM indexable_quals WHERE c LIKE '%y%' AND a LIKE 'a%' AND b = 'b';
-- 
2.32.0

>From eab94f627c1cbddcc8dec2f1553f28bf3007dab5 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <[email protected]>
Date: Thu, 20 May 2021 21:17:51 +0200
Subject: [PATCH v41 4/6] Extend amskip implementation for Btree

Add support for backward scan to Btree amskip implementation. This will
make index skip scan work without Material node in case of scrolling cursor.

Author: Jesper Pedersen, Dmitry Dolgov
Reviewed-by: Thomas Munro, David Rowley, Floris Van Nee, Kyotaro Horiguchi, Tomas Vondra, Peter Geoghegan
---
 src/backend/access/index/indexam.c       |   6 +-
 src/backend/access/nbtree/nbtree.c       |   5 +-
 src/backend/access/nbtree/nbtsearch.c    | 302 ++++++++++++++++++++++-
 src/backend/executor/execAmi.c           |  32 +--
 src/backend/executor/nodeIndexonlyscan.c |  57 ++++-
 src/include/access/amapi.h               |   1 +
 src/include/access/genam.h               |   3 +-
 src/include/access/nbtree.h              |   6 +-
 8 files changed, 371 insertions(+), 41 deletions(-)

diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index bcf7c73467..9fa5db27ea 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -748,11 +748,13 @@ index_can_return(Relation indexRelation, int attno)
  * ----------------
  */
 bool
-index_skip(IndexScanDesc scan, ScanDirection direction, int prefix)
+index_skip(IndexScanDesc scan, ScanDirection direction,
+		   ScanDirection indexdir, bool scanstart, int prefix)
 {
 	SCAN_CHECKS;
 
-	return scan->indexRelation->rd_indam->amskip(scan, direction, prefix);
+	return scan->indexRelation->rd_indam->amskip(scan, direction,
+												 indexdir, prefix);
 }
 
 /* ----------------
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 7b2aa594fb..6a627de65a 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -449,9 +449,10 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
  * btskip() -- skip to the beginning of the next key prefix
  */
 bool
-btskip(IndexScanDesc scan, ScanDirection direction, int prefix)
+btskip(IndexScanDesc scan, ScanDirection direction,
+	   ScanDirection indexdir, int prefix)
 {
-	return _bt_skip(scan, direction, prefix);
+	return _bt_skip(scan, direction, indexdir, prefix);
 }
 
 /*
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 0ad761916a..00742c7e21 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1509,12 +1509,31 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
  * 		The current position is set so that a subsequent call to _bt_next will
  * 		fetch the first tuple that differs in the leading 'prefix' keys.
  *
- * 		The current page is searched for the next unique value. If none is found
- * 		we will do a scan from the root in order to find the next page with
- * 		a unique value.
+ * 		There are four different kinds of skipping (depending on dir and
+ * 		indexdir, that are important to distinguish, especially in the presense
+ * 		of an index condition:
+ *
+ * 		* Advancing forward and reading forward
+ * 			simple scan
+ *
+ * 		* Advancing forward and reading backward
+ * 			scan inside a cursor fetching backward, when skipping is necessary
+ * 			right from the start
+ *
+ * 		* Advancing backward and reading forward
+ * 			scan with order by desc inside a cursor fetching forward, when
+ * 			skipping is necessary right from the start
+ *
+ * 		* Advancing backward and reading backward
+ * 			simple scan with order by desc
+ *
+ *      The current page is searched for the next unique value. If none is found
+ *      we will do a scan from the root in order to find the next page with
+ *      a unique value.
  */
 bool
-_bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix)
+_bt_skip(IndexScanDesc scan, ScanDirection dir,
+		 ScanDirection indexdir, int prefix)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	BTStack stack;
@@ -1625,11 +1644,282 @@ _bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix)
 	 * to go one step back, since we need a last element from the previous
 	 * series.
 	 */
-	if (ScanDirectionIsBackward(dir) || (ScanDirectionIsForward(dir) & scanstart))
+	if ((ScanDirectionIsBackward(dir) && ScanDirectionIsBackward(indexdir)) ||
+		(ScanDirectionIsForward(dir) && ScanDirectionIsBackward(indexdir) & scanstart))
 		 offnum = OffsetNumberPrev(offnum);
 
+	/*
+	 * Andvance backward but read forward. At this moment we are at the next
+	 * distinct key at the beginning of the series. In case if scan just
+	 * started, we can read forward without doing anything else. Otherwise
+	 * find previous distinct key and the beginning of it's series and read
+	 * forward from there. To do so, go back one step, perform binary search
+	 * to find the first item in the series and let _bt_readpage do everything
+	 * else.
+	 */
+	else if (ScanDirectionIsBackward(dir) && ScanDirectionIsForward(indexdir) && !scanstart)
+	{
+		/* Reading forward means we expect to see more data on the right */
+		so->currPos.moreRight = true;
+
+		/* One step back to find a previous value */
+		if (!_bt_readpage(scan, dir, offnum) ||
+		 	--so->currPos.itemIndex < so->currPos.firstItem)
+		{
+			_bt_unlockbuf(indexRel, so->currPos.buf);
+			if (!_bt_steppage(scan, dir))
+			{
+				pfree(so->skipScanKey);
+				so->skipScanKey = NULL;
+				return false;
+			}
+		}
+		else
+			_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+
+		currItem = &so->currPos.items[so->currPos.itemIndex];
+		scan->xs_heaptid = currItem->heapTid;
+		if (scan->xs_want_itup)
+			scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+		_bt_update_skip_scankeys(scan, indexRel);
+
+		/*
+		 * And now find the last item from the sequence for the
+		 * current, value with the intention do OffsetNumberNext. As a
+		 * result we end up on a first element from the sequence.
+		 */
+		if (_bt_scankey_within_page(scan, so->skipScanKey, so->currPos.buf))
+			offnum = _bt_binsrch(scan->indexRelation, so->skipScanKey, buf);
+		else
+		{
+			/* Before leaving current page, deal with any killed items */
+			if (so->numKilled > 0)
+				_bt_killitems(scan);
+
+			ReleaseBuffer(so->currPos.buf);
+			so->currPos.buf = InvalidBuffer;
+
+			stack = _bt_search(scan->indexRelation, so->skipScanKey,
+							   &buf, BT_READ, scan->xs_snapshot);
+			_bt_freestack(stack);
+			so->currPos.buf = buf;
+			offnum = _bt_binsrch(scan->indexRelation, so->skipScanKey, buf);
+		}
+	}
+
+	/*
+	 * Advance forward but read backward. At this moment we are at the next
+	 * distinct key at the beginning of the series. In case if scan just
+	 * started, we can go one step back and read forward without doing
+	 * anything else. Otherwise find the next distinct key and the beginning
+	 * of it's series, go one step back and read backward from there.
+	 *
+	 * An interesting situation can happen if one of distinct keys do not pass
+	 * a corresponding index condition at all. In this case reading backward
+	 * can lead to a previous distinct key being found, creating a loop. To
+	 * avoid that check the value to be returned, and jump one more time if
+	 * it's the same as at the beginning. Note that we do not check visibility
+	 * here, and dead tuples could also lead to the same situation. This has to
+	 * be checked on the caller side.
+	 */
+	else if (ScanDirectionIsForward(dir) && ScanDirectionIsBackward(indexdir) && !scanstart)
+	{
+		IndexTuple 	startItup = CopyIndexTuple(scan->xs_itup);
+		bool 		nextFound = false;
+
+		/* Reading backwards means we expect to see more data on the left */
+		so->currPos.moreLeft = true;
+
+		for (;;)
+		{
+			IndexTuple itup;
+			OffsetNumber jumpOffset;
+
+			if (nextFound)
+				break;
+
+			/*
+			 * Find a next index tuple to update scan key. It could be at
+			 * the end, so check for max offset
+			 */
+			if (!_bt_readpage(scan, ForwardScanDirection, offnum))
+			{
+				/*
+				 * There's no actually-matching data on this page.  Try to
+				 * advance to the next page. Return false if there's no
+				 * matching data at all.
+				 */
+				_bt_unlockbuf(indexRel, so->currPos.buf);
+				if (!_bt_steppage(scan, dir))
+				{
+					pfree(so->skipScanKey);
+					so->skipScanKey = NULL;
+					return false;
+				}
+			}
+			else
+				_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+
+			/* check for interrupts while we're not holding any buffer lock */
+			CHECK_FOR_INTERRUPTS();
+
+			currItem = &so->currPos.items[so->currPos.firstItem];
+			itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+			scan->xs_itup = itup;
+
+			_bt_update_skip_scankeys(scan, indexRel);
+
+			/* Before leaving current page, deal with any killed items */
+			if (so->numKilled > 0)
+				_bt_killitems(scan);
+
+			ReleaseBuffer(so->currPos.buf);
+			so->currPos.buf = InvalidBuffer;
+
+			stack = _bt_search(scan->indexRelation, so->skipScanKey,
+							   &buf, BT_READ, scan->xs_snapshot);
+			_bt_freestack(stack);
+			so->currPos.buf = buf;
+
+			/*
+			 * We need to remember the original offset after the jump,
+			 * since in case of looping this would be the next starting
+			 * point
+			 */
+			jumpOffset = offnum = _bt_binsrch(scan->indexRelation,
+											  so->skipScanKey, buf);
+			offnum = OffsetNumberPrev(offnum);
+
+			if (!_bt_readpage(scan, indexdir, offnum))
+			{
+				/*
+				 * There's no actually-matching data on this page.  Try to
+				 * advance to the next page. Return false if there's no
+				 * matching data at all.
+				 */
+				_bt_unlockbuf(indexRel, so->currPos.buf);
+				if (!_bt_steppage(scan, indexdir))
+				{
+					pfree(so->skipScanKey);
+					so->skipScanKey = NULL;
+					return false;
+				}
+				_bt_lockbuf(indexRel, so->currPos.buf, BT_READ);
+			}
+
+			currItem = &so->currPos.items[so->currPos.lastItem];
+			itup = CopyIndexTuple((IndexTuple)
+					(so->currTuples + currItem->tupleOffset));
+
+			/*
+			 * To check if we returned the same tuple, try to find a
+			 * startItup on the current page. For that we need to update
+			 * scankey to match the whole tuple and set nextkey to return
+			 * an exact tuple, not the next one. If the tuple we found in
+			 * this way is equal to what we wanted to return, it means we
+			 * are in the loop, return offnum to the original position and
+			 * jump further
+			 *
+			 * Note that to compare tids we need to keep the leaf pinned,
+			 * otherwise there is a danger of vacuum cleaning up relevant
+			 * tuples.
+			 */
+			scan->xs_itup = startItup;
+			_bt_update_skip_scankeys(scan, indexRel);
+
+			so->skipScanKey->keysz = IndexRelationGetNumberOfKeyAttributes(indexRel);
+			so->skipScanKey->nextkey = false;
+
+			if (_bt_scankey_within_page(scan, so->skipScanKey, so->currPos.buf))
+			{
+				OffsetNumber maxoff, startOffset;
+				IndexTuple verifiedItup;
+				Page page = BufferGetPage(so->currPos.buf);
+				startOffset = _bt_binsrch(scan->indexRelation,
+										  so->skipScanKey,
+										  so->currPos.buf);
+
+				maxoff = PageGetMaxOffsetNumber(page);
+
+				/* Now read the data */
+				if (_bt_readpage(scan, ForwardScanDirection, startOffset))
+				{
+					ItemPointer resultTids, verifyTids;
+					int nresult = 1,
+						nverify = 1;
+
+					currItem = &so->currPos.items[so->currPos.itemIndex];
+					verifiedItup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+					/*
+					 * We need to keep in mind that tuples we deal with
+					 * could be also posting tuples and represent a list of
+					 * tids.
+					 */
+					if (BTreeTupleIsPosting(verifiedItup))
+					{
+						nverify = BTreeTupleGetNPosting(verifiedItup);
+						verifyTids = BTreeTupleGetPosting(verifiedItup);
+						for (int i = 1; i < nverify; i++)
+							verifyTids[i] = *BTreeTupleGetPostingN(verifiedItup, i);
+					}
+					else
+						verifyTids = &verifiedItup->t_tid;
+
+					if (BTreeTupleIsPosting(itup))
+					{
+						nresult = BTreeTupleGetNPosting(itup);
+						resultTids = BTreeTupleGetPosting(itup);
+						for (int i = 1; i < nresult; i++)
+							resultTids[i] = *BTreeTupleGetPostingN(itup, i);
+					}
+					else
+						resultTids = &itup->t_tid;
+
+					/* One not equal means they're not equal. */
+					for(int i = 0; i < nverify; i++)
+					{
+						for(int j = 0; j < nresult; j++)
+						{
+							if (!ItemPointerEquals(&resultTids[j], &verifyTids[i]))
+							{
+								nextFound = true;
+								break;
+							}
+						}
+					}
+
+					if (!nextFound)
+						offnum = jumpOffset;
+				}
+
+				if ((offnum > maxoff) && (so->currPos.nextPage == P_NONE))
+				{
+					_bt_relbuf(indexRel, so->currPos.buf);
+					BTScanPosInvalidate(so->currPos);
+
+					pfree(so->skipScanKey);
+					so->skipScanKey = NULL;
+					return false;
+				}
+			}
+			else
+				/*
+				 * If startItup could be not found within the current page,
+				 * assume we found something new
+				 */
+				nextFound = true;
+
+			/* Return original scankey options */
+			so->skipScanKey->keysz = prefix;
+			so->skipScanKey->nextkey = ScanDirectionIsForward(dir);
+		}
+	}
+
 	/* Now read the data */
-	if (!_bt_readpage(scan, dir, offnum))
+	if (!_bt_readpage(scan, indexdir, offnum))
 	{
 		/*
 		 * There's no actually-matching data on this page.  Try to advance to
diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c
index ced8933f44..b6245994f0 100644
--- a/src/backend/executor/execAmi.c
+++ b/src/backend/executor/execAmi.c
@@ -64,7 +64,7 @@
 #include "utils/rel.h"
 #include "utils/syscache.h"
 
-static bool IndexSupportsBackwardScan(Plan *node);
+static bool IndexSupportsBackwardScan(Oid indexid);
 
 
 /*
@@ -555,8 +555,10 @@ ExecSupportsBackwardScan(Plan *node)
 			return false;
 
 		case T_IndexScan:
+			return IndexSupportsBackwardScan(((IndexScan *) node)->indexid);
+
 		case T_IndexOnlyScan:
-			return IndexSupportsBackwardScan(node);
+			return IndexSupportsBackwardScan(((IndexOnlyScan *) node)->indexid);
 
 		case T_SubqueryScan:
 			return ExecSupportsBackwardScan(((SubqueryScan *) node)->subplan);
@@ -596,38 +598,16 @@ ExecSupportsBackwardScan(Plan *node)
 
 /*
  * An IndexScan or IndexOnlyScan node supports backward scan only if the
- * index's AM does and no skip scan is used.
+ * index's AM does.
  */
 static bool
-IndexSupportsBackwardScan(Plan *node)
+IndexSupportsBackwardScan(Oid indexid)
 {
 	bool		result;
-	Oid			indexid = InvalidOid;
-	int         skip_prefix_size = 0;
 	HeapTuple	ht_idxrel;
 	Form_pg_class idxrelrec;
 	IndexAmRoutine *amroutine;
 
-	Assert(IsA(node, IndexScan) || IsA(node, IndexOnlyScan));
-	switch(nodeTag(node))
-	{
-		case T_IndexScan:
-			indexid = ((IndexScan *) node)->indexid;
-			break;
-
-		case T_IndexOnlyScan:
-			indexid = ((IndexOnlyScan *) node)->indexid;
-			skip_prefix_size = ((IndexOnlyScan *) node)->indexskipprefixsize;
-			break;
-
-		default:
-			elog(DEBUG2, "unrecognized node type: %d", (int) nodeTag(node));
-			break;
-	}
-
-	if (skip_prefix_size > 0)
-		return false;
-
 	/* Fetch the pg_class tuple of the index relation */
 	ht_idxrel = SearchSysCache1(RELOID, ObjectIdGetDatum(indexid));
 	if (!HeapTupleIsValid(ht_idxrel))
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 40ad1b949b..d5abac20cb 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -67,6 +67,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	IndexScanDesc scandesc;
 	TupleTableSlot *slot;
 	ItemPointer tid = NULL;
+	ItemPointerData startTid;
 	IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) node->ss.ps.plan;
 
 	/*
@@ -75,6 +76,14 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	 */
 	bool skipped = false;
 
+	/*
+	 * Index only scan must be aware that in case of skipping we can return to
+	 * the starting point due to visibility checks. In this situation we need
+	 * to jump further, and number of skipping attempts tell us how far do we
+	 * need to do so.
+	 */
+	int skipAttempts = 0;
+
 	/*
 	 * extract necessary information from index scan node
 	 */
@@ -123,13 +132,27 @@ IndexOnlyNext(IndexOnlyScanState *node)
 						 node->ioss_OrderByKeys,
 						 node->ioss_NumOrderByKeys);
 	}
+	else
+	{
+		ItemPointerCopy(&scandesc->xs_heaptid, &startTid);
+	}
 
 	/*
 	 * Check if we need to skip to the next key prefix.
+	 *
+	 * When fetching a cursor in the direction opposite to a general scan
+	 * direction, the result must be what normal fetching should have
+	 * returned, but in reversed order. In other words, return the last or
+	 * first scanned tuple in a DISTINCT set, depending on a cursor direction.
+	 * Due to that we skip also when the first tuple wasn't emitted yet, but
+	 * the directions are opposite.
 	 */
-	if (node->ioss_SkipPrefixSize > 0 && node->ioss_FirstTupleEmitted)
+	if (node->ioss_SkipPrefixSize > 0 &&
+		(node->ioss_FirstTupleEmitted ||
+		 ScanDirectionsAreOpposite(direction, indexonlyscan->indexorderdir)))
 	{
-		if (!index_skip(scandesc, direction, node->ioss_SkipPrefixSize))
+		if (!index_skip(scandesc, direction, indexonlyscan->indexorderdir,
+						!node->ioss_FirstTupleEmitted, node->ioss_SkipPrefixSize))
 		{
 			/*
 			 * Reached end of index. At this point currPos is invalidated, and
@@ -144,6 +167,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
 		else
 		{
 			skipped = true;
+			skipAttempts = 1;
 			tid = &scandesc->xs_heaptid;
 		}
 	}
@@ -161,6 +185,35 @@ IndexOnlyNext(IndexOnlyScanState *node)
 
 		skipped = false;
 
+		/*
+		 * If we already emitted first tuple, while doing index only skip scan
+		 * with advancing and reading in different directions we can return to
+		 * the same position where we started after visibility check. Recognize
+		 * such situations and skip more.
+		 */
+		if ((readDirection != direction) && node->ioss_FirstTupleEmitted &&
+			ItemPointerIsValid(&startTid) && ItemPointerEquals(&startTid, tid))
+		{
+			int i;
+			skipAttempts += 1;
+
+			for (i = 0; i < skipAttempts; i++)
+			{
+				if (!index_skip(scandesc, direction,
+								indexonlyscan->indexorderdir,
+								!node->ioss_FirstTupleEmitted,
+								node->ioss_SkipPrefixSize))
+				{
+					node->ioss_FirstTupleEmitted = false;
+					return ExecClearTuple(slot);
+				}
+			}
+
+			tid = &scandesc->xs_heaptid;
+		}
+
+		skipped = false;
+
 		/*
 		 * We can skip the heap fetch if the TID references a heap page on
 		 * which all tuples are known visible to everybody.  In any case,
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index cb2f48a1bc..58e29d054c 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -176,6 +176,7 @@ typedef bool (*amgettuple_function) (IndexScanDesc scan,
 /* skip past duplicates in a given prefix */
 typedef bool (*amskip_function) (IndexScanDesc scan,
 								 ScanDirection dir,
+								 ScanDirection indexdir,
 								 int prefix);
 
 /* fetch all valid tuples */
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index d13d95c458..5a6d904af6 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -183,7 +183,8 @@ extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info,
 extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info,
 												   IndexBulkDeleteResult *istat);
 extern bool index_can_return(Relation indexRelation, int attno);
-extern bool index_skip(IndexScanDesc scan, ScanDirection direction, int prefix);
+extern bool index_skip(IndexScanDesc scan, ScanDirection direction,
+					   ScanDirection indexdir, bool start, int prefix);
 extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum,
 									uint16 procnum);
 extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum,
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 2c516654c2..039e8d1f0d 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1232,7 +1232,8 @@ extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
 extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
-extern bool _bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix);
+extern bool _bt_skip(IndexScanDesc scan, ScanDirection dir,
+					 ScanDirection indexdir, int prefix);
 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
 							   Snapshot snapshot);
 
@@ -1257,7 +1258,8 @@ extern void _bt_end_vacuum_callback(int code, Datum arg);
 extern Size BTreeShmemSize(void);
 extern void BTreeShmemInit(void);
 extern bytea *btoptions(Datum reloptions, bool validate);
-extern bool btskip(IndexScanDesc scan, ScanDirection dir, int prefix);
+extern bool btskip(IndexScanDesc scan, ScanDirection dir,
+				   ScanDirection indexdir, int prefix);
 extern bool btproperty(Oid index_oid, int attno,
 					   IndexAMProperty prop, const char *propname,
 					   bool *res, bool *isnull);
-- 
2.32.0

>From 6fc50a6a405b8232a935a3d2459b62dec09c570c Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <[email protected]>
Date: Sat, 22 Jan 2022 21:13:42 +0100
Subject: [PATCH v41 5/6] Extend index skip scan with ScanLooseKey

Index skip scan relies on the information about key prefix that needs to
be jumped over, but it's represented in a rather limited fashion only
via the prefix size. This approach is sufficient now, but for the sake
of flexibility introduce a concept of ScanLooseKey to represent
underspecified search keys. At the moment it's inspired by the idea of
skip keys constrained in some range of keyspace, and not used in any
way.
---
 src/backend/executor/nodeIndexonlyscan.c | 14 ++++++++++----
 src/include/access/relscan.h             |  2 ++
 src/include/access/skey.h                |  8 ++++++++
 src/include/nodes/execnodes.h            |  5 ++++-
 4 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index d5abac20cb..470c364e53 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -147,12 +147,12 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	 * Due to that we skip also when the first tuple wasn't emitted yet, but
 	 * the directions are opposite.
 	 */
-	if (node->ioss_SkipPrefixSize > 0 &&
+	if (node->ioss_ScanLooseKeys != NULL &&
 		(node->ioss_FirstTupleEmitted ||
 		 ScanDirectionsAreOpposite(direction, indexonlyscan->indexorderdir)))
 	{
 		if (!index_skip(scandesc, direction, indexonlyscan->indexorderdir,
-						!node->ioss_FirstTupleEmitted, node->ioss_SkipPrefixSize))
+						!node->ioss_FirstTupleEmitted, node->ioss_NumScanLooseKeys))
 		{
 			/*
 			 * Reached end of index. At this point currPos is invalidated, and
@@ -202,7 +202,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
 				if (!index_skip(scandesc, direction,
 								indexonlyscan->indexorderdir,
 								!node->ioss_FirstTupleEmitted,
-								node->ioss_SkipPrefixSize))
+								node->ioss_NumScanLooseKeys))
 				{
 					node->ioss_FirstTupleEmitted = false;
 					return ExecClearTuple(slot);
@@ -594,7 +594,6 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
 	indexstate->ss.ps.plan = (Plan *) node;
 	indexstate->ss.ps.state = estate;
 	indexstate->ss.ps.ExecProcNode = ExecIndexOnlyScan;
-	indexstate->ioss_SkipPrefixSize = node->indexskipprefixsize;
 	indexstate->ioss_FirstTupleEmitted = false;
 
 	/*
@@ -697,6 +696,13 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
 						   NULL,	/* no ArrayKeys */
 						   NULL);
 
+	if (node->indexskipprefixsize != 0)
+	{
+		indexstate->ioss_NumScanLooseKeys = node->indexskipprefixsize;
+		indexstate->ioss_ScanLooseKeys =
+			(ScanLooseKey) palloc(node->indexskipprefixsize * sizeof(ScanLooseKeyData));
+	}
+
 	/*
 	 * If we have runtime keys, we need an ExprContext to evaluate them. The
 	 * node's standard context won't do because we want to reset that context
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 53a93ccbe7..5200a1867d 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -119,8 +119,10 @@ typedef struct IndexScanDescData
 	struct SnapshotData *xs_snapshot;	/* snapshot to see */
 	int			numberOfKeys;	/* number of index qualifier conditions */
 	int			numberOfOrderBys;	/* number of ordering operators */
+	int			numberOfLooseKeys;	/* number of loose index qualifier conditions */
 	struct ScanKeyData *keyData;	/* array of index qualifier descriptors */
 	struct ScanKeyData *orderByData;	/* array of ordering op descriptors */
+	struct ScanLooseKeyData *looseKeyData;	/* array of loose index qualifier descriptors */
 	bool		xs_want_itup;	/* caller requests index tuples */
 	bool		xs_temp_snap;	/* unregister snapshot at scan end? */
 
diff --git a/src/include/access/skey.h b/src/include/access/skey.h
index b5ab17f7d9..a711dc353d 100644
--- a/src/include/access/skey.h
+++ b/src/include/access/skey.h
@@ -74,6 +74,14 @@ typedef struct ScanKeyData
 
 typedef ScanKeyData *ScanKey;
 
+typedef struct ScanLooseKeyData
+{
+	ScanKey start;
+	ScanKey end;
+} ScanLooseKeyData;
+
+typedef ScanLooseKeyData *ScanLooseKey;
+
 /*
  * About row comparisons:
  *
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index cf36e6c0e6..fa6ee25bc6 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1508,6 +1508,8 @@ typedef struct IndexScanState
  *		NumOrderByKeys	   number of OrderByKeys
  *		RuntimeKeys		   info about Skeys that must be evaluated at runtime
  *		NumRuntimeKeys	   number of RuntimeKeys
+ *		ScanLooseKeys 	   Skey structures for loose index quals
+ *		NumScanLooseKeys   number of ScanLooseKeys
  *		RuntimeKeysReady   true if runtime Skeys have been computed
  *		RuntimeContext	   expr context for evaling runtime Skeys
  *		RelationDesc	   index relation descriptor
@@ -1527,13 +1529,14 @@ typedef struct IndexOnlyScanState
 	int			ioss_NumOrderByKeys;
 	IndexRuntimeKeyInfo *ioss_RuntimeKeys;
 	int			ioss_NumRuntimeKeys;
+	struct ScanLooseKeyData *ioss_ScanLooseKeys;
+	int			ioss_NumScanLooseKeys;
 	bool		ioss_RuntimeKeysReady;
 	ExprContext *ioss_RuntimeContext;
 	Relation	ioss_RelationDesc;
 	struct IndexScanDescData *ioss_ScanDesc;
 	TupleTableSlot *ioss_TableSlot;
 	Buffer		ioss_VMBuffer;
-	int         ioss_SkipPrefixSize;
 	bool		ioss_FirstTupleEmitted;
 	Size		ioss_PscanLen;
 } IndexOnlyScanState;
-- 
2.32.0

>From 7cdd4cc9621a5818c38b27f26c15c570ab4e83f6 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <[email protected]>
Date: Fri, 14 May 2021 19:22:06 +0200
Subject: [PATCH v41 6/6] Index skip scan for IndexScan

Introduce Skip Scan support for IndexScan, not only for IndexOnlyScan.
It works in the same way as IndexOnlyScan, but planned has to check that
the chosen index is fully covering specified distinct expressions.

Author: Jesper Pedersen, Dmitry Dolgov
Reviewed-by: Thomas Munro, David Rowley, Floris Van Nee, Kyotaro Horiguchi, Tomas Vondra, Peter Geoghegan
---
 src/backend/commands/explain.c                |  6 ++
 src/backend/executor/nodeIndexscan.c          | 56 +++++++++++++++-
 src/backend/nodes/copyfuncs.c                 |  1 +
 src/backend/nodes/outfuncs.c                  |  1 +
 src/backend/nodes/readfuncs.c                 |  1 +
 src/backend/optimizer/path/indxpath.c         | 59 ++++++++++++++++-
 src/backend/optimizer/plan/createplan.c       | 10 ++-
 src/include/nodes/execnodes.h                 |  4 ++
 src/include/nodes/plannodes.h                 |  2 +
 src/test/regress/expected/select_distinct.out | 64 ++++++++++++++++---
 src/test/regress/sql/select_distinct.sql      | 16 +++++
 11 files changed, 206 insertions(+), 14 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 4f5bd1d678..67d072b010 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1754,6 +1754,12 @@ ExplainNode(PlanState *planstate, List *ancestors,
 	switch (nodeTag(plan))
 	{
 		case T_IndexScan:
+			if (((IndexScan *) plan)->indexskipprefixsize > 0)
+			{
+				IndexScan  *indexscan = (IndexScan *) plan;
+				ExplainPropertyBool("Skip scan", true, es);
+				ExplainIndexSkipScanKeys(indexscan->indexskipprefixsize, es);
+			}
 			show_scan_qual(((IndexScan *) plan)->indexqualorig,
 						   "Index Cond", planstate, ancestors, es);
 			if (((IndexScan *) plan)->indexqualorig)
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index 90b2699a96..90aad76b95 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -85,6 +85,13 @@ IndexNext(IndexScanState *node)
 	ScanDirection direction;
 	IndexScanDesc scandesc;
 	TupleTableSlot *slot;
+	IndexScan *indexscan = (IndexScan *) node->ss.ps.plan;
+
+	/*
+	 * tells if the current position was reached via skipping. In this case
+	 * there is no nead for the index_getnext_tid
+	 */
+	bool skipped = false;
 
 	/*
 	 * extract necessary information from index scan node
@@ -92,7 +99,7 @@ IndexNext(IndexScanState *node)
 	estate = node->ss.ps.state;
 	direction = estate->es_direction;
 	/* flip direction if this is an overall backward scan */
-	if (ScanDirectionIsBackward(((IndexScan *) node->ss.ps.plan)->indexorderdir))
+	if (ScanDirectionIsBackward(indexscan->indexorderdir))
 	{
 		if (ScanDirectionIsForward(direction))
 			direction = BackwardScanDirection;
@@ -117,6 +124,12 @@ IndexNext(IndexScanState *node)
 
 		node->iss_ScanDesc = scandesc;
 
+		/* Index skip scan assumes xs_want_itup, so set it to true */
+		if (indexscan->indexskipprefixsize > 0)
+			node->iss_ScanDesc->xs_want_itup = true;
+		else
+			node->iss_ScanDesc->xs_want_itup = false;
+
 		/*
 		 * If no run-time keys to calculate or they are ready, go ahead and
 		 * pass the scankeys to the index AM.
@@ -127,12 +140,48 @@ IndexNext(IndexScanState *node)
 						 node->iss_OrderByKeys, node->iss_NumOrderByKeys);
 	}
 
+	/*
+	 * Check if we need to skip to the next key prefix, because we've been
+	 * asked to implement DISTINCT.
+	 *
+	 * When fetching a cursor in the direction opposite to a general scan
+	 * direction, the result must be what normal fetching should have
+	 * returned, but in reversed order. In other words, return the last or
+	 * first scanned tuple in a DISTINCT set, depending on a cursor direction.
+	 * Due to that we skip also when the first tuple wasn't emitted yet, but
+	 * the directions are opposite.
+	 */
+	if (node->iss_SkipPrefixSize > 0 &&
+		(node->iss_FirstTupleEmitted ||
+		 ScanDirectionsAreOpposite(direction, indexscan->indexorderdir)))
+	{
+		if (!index_skip(scandesc, direction, indexscan->indexorderdir,
+					   !node->iss_FirstTupleEmitted, node->iss_SkipPrefixSize))
+		{
+			/*
+			 * Reached end of index. At this point currPos is invalidated, and
+			 * we need to reset iss_FirstTupleEmitted, since otherwise after
+			 * going backwards, reaching the end of index, and going forward
+			 * again we apply skip again. It would be incorrect and lead to an
+			 * extra skipped item.
+			 */
+			node->iss_FirstTupleEmitted = false;
+			return ExecClearTuple(slot);
+		}
+		else
+		{
+			skipped = true;
+			index_fetch_heap(scandesc, slot);
+		}
+	}
+
 	/*
 	 * ok, now that we have what we need, fetch the next tuple.
 	 */
-	while (index_getnext_slot(scandesc, direction, slot))
+	while (skipped || index_getnext_slot(scandesc, direction, slot))
 	{
 		CHECK_FOR_INTERRUPTS();
+		skipped = false;
 
 		/*
 		 * If the index was lossy, we have to recheck the index quals using
@@ -149,6 +198,7 @@ IndexNext(IndexScanState *node)
 			}
 		}
 
+		node->iss_FirstTupleEmitted = true;
 		return slot;
 	}
 
@@ -914,6 +964,8 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
 	indexstate->ss.ps.plan = (Plan *) node;
 	indexstate->ss.ps.state = estate;
 	indexstate->ss.ps.ExecProcNode = ExecIndexScan;
+	indexstate->iss_SkipPrefixSize = node->indexskipprefixsize;
+	indexstate->iss_FirstTupleEmitted = false;
 
 	/*
 	 * Miscellaneous initialization
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index fe0d92ad46..8d0f92493f 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -497,6 +497,7 @@ _copyIndexScan(const IndexScan *from)
 	COPY_NODE_FIELD(indexorderbyorig);
 	COPY_NODE_FIELD(indexorderbyops);
 	COPY_SCALAR_FIELD(indexorderdir);
+	COPY_SCALAR_FIELD(indexskipprefixsize);
 
 	return newnode;
 }
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 06eb1a89e9..ad5c223bf8 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -569,6 +569,7 @@ _outIndexScan(StringInfo str, const IndexScan *node)
 	WRITE_NODE_FIELD(indexorderbyorig);
 	WRITE_NODE_FIELD(indexorderbyops);
 	WRITE_ENUM_FIELD(indexorderdir, ScanDirection);
+	WRITE_INT_FIELD(indexskipprefixsize);
 }
 
 static void
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index 169fb408d3..0fefbd5cc8 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -1868,6 +1868,7 @@ _readIndexScan(void)
 	READ_NODE_FIELD(indexorderbyorig);
 	READ_NODE_FIELD(indexorderbyops);
 	READ_ENUM_FIELD(indexorderdir, ScanDirection);
+	READ_INT_FIELD(indexskipprefixsize);
 
 	READ_DONE();
 }
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index 00526f3476..41444d2216 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -1036,7 +1036,7 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 					   check_index_only(rel, index));
 
 	/* Check if an index skip scan is possible. */
-	can_skip = enable_indexskipscan & index->amcanskip & index_only_scan;
+	can_skip = enable_indexskipscan & index->amcanskip;
 
 	if (can_skip)
 	{
@@ -1099,6 +1099,63 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 				}
 			}
 		}
+
+		/*
+		 * For an index scan verify that index fully covers distinct
+		 * expressions, otherwise there is not enough information for skipping
+		 */
+		if (!index_only_scan && root->query_uniquekeys != NULL)
+		{
+			ListCell *lc;
+
+			foreach(lc, root->query_uniquekeys)
+			{
+				List *uniqExprs = (List *) lfirst(lc);
+				ListCell *lc1;
+
+				foreach(lc1, uniqExprs)
+				{
+					Expr *expr = (Expr *) lfirst(lc1);
+					bool found = false;
+
+					if (!IsA(expr, Var))
+					{
+						ListCell *lc2;
+
+						foreach(lc2, index->indexprs)
+						{
+							if(equal(lfirst(lc1), lfirst(lc2)))
+							{
+								found = true;
+								break;
+							}
+						}
+					}
+					else
+					{
+						Var *var = (Var *) expr;
+
+						for (int i = 0; i < index->ncolumns; i++)
+						{
+							if (index->indexkeys[i] == var->varattno)
+							{
+								found = true;
+								break;
+							}
+						}
+					}
+
+					if (!found)
+					{
+						can_skip = false;
+						break;
+					}
+				}
+
+				if (!can_skip)
+					break;
+			}
+		}
 	}
 
 	/*
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 511dda3a9f..618d1833ce 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -185,7 +185,8 @@ static IndexScan *make_indexscan(List *qptlist, List *qpqual, Index scanrelid,
 								 Oid indexid, List *indexqual, List *indexqualorig,
 								 List *indexorderby, List *indexorderbyorig,
 								 List *indexorderbyops,
-								 ScanDirection indexscandir);
+								 ScanDirection indexscandir,
+								 int skipprefix);
 static IndexOnlyScan *make_indexonlyscan(List *qptlist, List *qpqual,
 										 Index scanrelid, Oid indexid,
 										 List *indexqual, List *recheckqual,
@@ -3121,7 +3122,8 @@ create_indexscan_plan(PlannerInfo *root,
 											fixed_indexorderbys,
 											indexorderbys,
 											indexorderbyops,
-											best_path->indexscandir);
+											best_path->indexscandir,
+											best_path->indexskipprefix);
 
 	copy_generic_path_info(&scan_plan->plan, &best_path->path);
 
@@ -5454,7 +5456,8 @@ make_indexscan(List *qptlist,
 			   List *indexorderby,
 			   List *indexorderbyorig,
 			   List *indexorderbyops,
-			   ScanDirection indexscandir)
+			   ScanDirection indexscandir,
+			   int skipPrefixSize)
 {
 	IndexScan  *node = makeNode(IndexScan);
 	Plan	   *plan = &node->scan.plan;
@@ -5471,6 +5474,7 @@ make_indexscan(List *qptlist,
 	node->indexorderbyorig = indexorderbyorig;
 	node->indexorderbyops = indexorderbyops;
 	node->indexorderdir = indexscandir;
+	node->indexskipprefixsize = skipPrefixSize;
 
 	return node;
 }
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index fa6ee25bc6..0cf02c1cd7 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1486,6 +1486,8 @@ typedef struct IndexScanState
 	ExprContext *iss_RuntimeContext;
 	Relation	iss_RelationDesc;
 	struct IndexScanDescData *iss_ScanDesc;
+	int         iss_SkipPrefixSize;
+	bool		iss_FirstTupleEmitted;
 
 	/* These are needed for re-checking ORDER BY expr ordering */
 	pairingheap *iss_ReorderQueue;
@@ -1517,6 +1519,8 @@ typedef struct IndexScanState
  *		TableSlot		   slot for holding tuples fetched from the table
  *		VMBuffer		   buffer in use for visibility map testing, if any
  *		PscanLen		   size of parallel index-only scan descriptor
+ *		SkipPrefixSize	   number of keys for skip-based DISTINCT
+ *		FirstTupleEmitted  has the first tuple been emitted
  * ----------------
  */
 typedef struct IndexOnlyScanState
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 6b3eefebc6..c0c91d4d09 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -411,6 +411,8 @@ typedef struct IndexScan
 	List	   *indexorderbyorig;	/* the same in original form */
 	List	   *indexorderbyops;	/* OIDs of sort ops for ORDER BY exprs */
 	ScanDirection indexorderdir;	/* forward or backward or don't care */
+	int			indexskipprefixsize;	/* the size of the prefix for distinct
+										 * scans */
 } IndexScan;
 
 /* ----------------
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
index 36b3291a7f..b8f747d8d2 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -445,14 +445,12 @@ SELECT DISTINCT ON (a) a, b FROM distinct_a ORDER BY a DESC, b DESC;
 -- test index skip scan for expressions
 EXPLAIN (COSTS OFF)
 SELECT DISTINCT (a + 1) FROM distinct_a ORDER BY (a + 1);
-             QUERY PLAN             
-------------------------------------
- Sort
-   Sort Key: ((a + 1))
-   ->  HashAggregate
-         Group Key: (a + 1)
-         ->  Seq Scan on distinct_a
-(5 rows)
+                     QUERY PLAN                     
+----------------------------------------------------
+ Index Scan using distinct_a_expr_idx on distinct_a
+   Skip scan: true
+   Distinct Prefix: 1
+(3 rows)
 
 SELECT DISTINCT (a + 1) FROM distinct_a ORDER BY (a + 1);
  ?column? 
@@ -693,6 +691,56 @@ FETCH BACKWARD ALL FROM c;
 
 END;
 DROP TABLE distinct_abc;
+-- index skip scan
+SELECT DISTINCT ON (a) a, b, c
+FROM distinct_a ORDER BY a;
+ a | b | c  
+---+---+----
+ 1 | 1 | 10
+ 2 | 1 | 10
+ 3 | 1 | 10
+ 4 | 1 | 10
+ 5 | 1 | 10
+(5 rows)
+
+SELECT DISTINCT ON (a) a, b, c
+FROM distinct_a WHERE a = 1 ORDER BY a;
+ a | b | c  
+---+---+----
+ 1 | 1 | 10
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (a) a, b, c
+FROM distinct_a ORDER BY a;
+                    QUERY PLAN                     
+---------------------------------------------------
+ Index Scan using distinct_a_a_b_idx on distinct_a
+   Skip scan: true
+   Distinct Prefix: 1
+(3 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (a) a, b, c
+FROM distinct_a WHERE a = 1 ORDER BY a;
+                    QUERY PLAN                     
+---------------------------------------------------
+ Index Scan using distinct_a_a_b_idx on distinct_a
+   Skip scan: true
+   Distinct Prefix: 1
+   Index Cond: (a = 1)
+(4 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT *
+FROM distinct_a;
+          QUERY PLAN          
+------------------------------
+ HashAggregate
+   Group Key: a, b, c
+   ->  Seq Scan on distinct_a
+(3 rows)
+
 -- check colums order
 SELECT DISTINCT a FROM distinct_a WHERE b = 2 AND c = 10;
  a 
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql
index c9ccf4cc7d..b0d2ee7066 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -294,6 +294,22 @@ END;
 
 DROP TABLE distinct_abc;
 
+-- index skip scan
+SELECT DISTINCT ON (a) a, b, c
+FROM distinct_a ORDER BY a;
+SELECT DISTINCT ON (a) a, b, c
+FROM distinct_a WHERE a = 1 ORDER BY a;
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (a) a, b, c
+FROM distinct_a ORDER BY a;
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (a) a, b, c
+FROM distinct_a WHERE a = 1 ORDER BY a;
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT *
+FROM distinct_a;
+
 -- check colums order
 SELECT DISTINCT a FROM distinct_a WHERE b = 2 AND c = 10;
 
-- 
2.32.0

Re: MDAM techniques and Index Skip Scan patch

Reply via email to