From 1d4fb1214b3f6b216654825985fb3ef2c5e045e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=B8=80=E6=8C=83?= <yizhi.fzh@alibaba-inc.com>
Date: Mon, 24 Feb 2020 20:16:42 +0800
Subject: [PATCH] Erase the distinctClause if the result is unique by
 definition

For a single relation, we can tell it by any one of the following
is true:
1. The pk is in the target list.
2. The uk is in the target list and the columns is not null
3. The columns in group-by clause is also in the target list

for relation join, we can tell it by:
if every relation in the jointree yield a unique result set,then
the final result is unique as well regardless the join method.
---
 src/backend/nodes/bitmapset.c                 |  18 ++
 src/backend/optimizer/path/costsize.c         |   1 +
 src/backend/optimizer/plan/analyzejoins.c     | 246 +++++++++++++++-
 src/backend/optimizer/plan/planner.c          |  28 ++
 src/backend/utils/misc/guc.c                  |  10 +
 src/include/nodes/bitmapset.h                 |   1 +
 src/include/optimizer/cost.h                  |   1 +
 src/include/optimizer/planmain.h              |   2 +
 src/test/regress/expected/aggregates.out      |  13 +-
 src/test/regress/expected/join.out            |  16 +-
 .../regress/expected/select_distinct_2.out    | 276 ++++++++++++++++++
 src/test/regress/expected/sysviews.out        |   3 +-
 src/test/regress/sql/select_distinct_2.sql    |  84 ++++++
 13 files changed, 679 insertions(+), 20 deletions(-)
 create mode 100644 src/test/regress/expected/select_distinct_2.out
 create mode 100644 src/test/regress/sql/select_distinct_2.sql

diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c
index 648cc1a7eb..5cb6924a29 100644
--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -1167,3 +1167,21 @@ bms_hash_value(const Bitmapset *a)
 	return DatumGetUInt32(hash_any((const unsigned char *) a->words,
 								   (lastword + 1) * sizeof(bitmapword)));
 }
+
+/*
+ * bms_array_free
+ *
+ * free the element in the array one by one, free the array as well at last
+ */
+void
+bms_array_free(Bitmapset **bms_array,  int len)
+{
+	int idx;
+	if (bms_array == NULL)
+		return;
+	for(idx = 0 ; idx < len; idx++)
+	{
+		bms_free(bms_array[idx]);
+	}
+	pfree(bms_array);
+}
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index b5a0033721..dde16b5d44 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -138,6 +138,7 @@ bool		enable_partitionwise_aggregate = false;
 bool		enable_parallel_append = true;
 bool		enable_parallel_hash = true;
 bool		enable_partition_pruning = true;
+bool		enable_distinct_elimination = true;
 
 typedef struct
 {
diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c
index d0ff660284..a15ba80808 100644
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@@ -22,6 +22,7 @@
  */
 #include "postgres.h"
 
+#include "access/relation.h"
 #include "nodes/nodeFuncs.h"
 #include "optimizer/clauses.h"
 #include "optimizer/joininfo.h"
@@ -30,7 +31,10 @@
 #include "optimizer/paths.h"
 #include "optimizer/planmain.h"
 #include "optimizer/tlist.h"
+#include "parser/parsetree.h"
 #include "utils/lsyscache.h"
+#include "utils/rel.h"
+#include "utils/relcache.h"
 
 /* local functions */
 static bool join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo);
@@ -801,9 +805,248 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
 		if (l == NULL)			/* had matches for all? */
 			return true;
 	}
+	return query_is_distinct_agg(query, colnos, opids);
+}
+
+/*
+ * scan_non_semi_anti_relids
+ *
+ * scan jointree to get non-semi/anti join rtindex.
+ */
+static void
+scan_non_semi_anti_relids(Node* jtnode, Relids* relids)
+{
+	if (jtnode == NULL)
+		return;
+
+	if (IsA(jtnode, RangeTblRef))
+	{
+		int			varno = ((RangeTblRef *) jtnode)->rtindex;
+
+		*relids = bms_add_member(*relids, varno);
+	}
+	else if (IsA(jtnode, FromExpr))
+	{
+		FromExpr   *f = (FromExpr *) jtnode;
+		ListCell   *l;
+
+		foreach(l, f->fromlist)
+			scan_non_semi_anti_relids(lfirst(l), relids);
+	}
+	else if (IsA(jtnode, JoinExpr))
+	{
+		JoinExpr   *j = (JoinExpr *) jtnode;
+
+		scan_non_semi_anti_relids(j->larg, relids);
+		if (j->jointype != JOIN_SEMI && j->jointype != JOIN_ANTI)
+		{
+			scan_non_semi_anti_relids(j->rarg, relids);
+		}
+	}
+	else
+		elog(ERROR, "unrecognized node type: %d",
+			 (int) nodeTag(jtnode));
+
+}
+
+/*
+ * query_distinct_through_join
+ *
+ * check if the result is unique after the join. if every relation
+ * yields a unique result, the result is unique as well
+ */
+bool
+query_distinct_through_join(PlannerInfo *root, List *colnos, List *opids)
+{
+	RangeTblEntry *rte;
+	int rt_index;
+	Relids non_semi_anti_relids = NULL, tmp = NULL;
+	ListCell *lc1, *lc2;
+
+	/* be filled with metadata and find_nonnull_var */
+	Bitmapset **non_null_var_per_table = NULL;
+	Query *query = root->parse;
+	int max_len;
+
+    /* be used for relation_has_unique_for */
+	List **non_null_expr_per_table = NULL;
+	List **non_null_opids_per_table = NULL;
+	bool ret = true;
+	int idx = 0;
+
+	if (query->hasTargetSRFs)
+		return false;
 
+	/* remove the relids for right semi/anti */
+	scan_non_semi_anti_relids((Node*)query->jointree, &non_semi_anti_relids);
+
+	rt_index = -1;
+	while ((rt_index = bms_next_member(non_semi_anti_relids, rt_index)) >= 0 )
+	{
+		List *indexlist = root->simple_rel_array[rt_index]->indexlist;
+		bool found = false;
+		rte = rt_fetch(rt_index, query->rtable);
+
+		/* for the subquery, we just handle some simple case */
+		if (rte->rtekind == RTE_SUBQUERY)
+		{
+			Query *subquery = rte->subquery;
+			List *sub_opnos = NIL;
+			List *sub_opids = NIL;
+			if (!query_supports_distinctness(subquery))
+			{
+				ret = false;
+				goto done;
+			}
+
+			forboth(lc1, colnos, lc2, opids)
+			{
+				TargetEntry *tle = get_tle_by_resno(query->targetList, lfirst_int(lc1));
+				Var *var;
+				if (!IsA(tle->expr, Var))
+					continue;
+				var =(Var*)tle->expr;
+				if (var->varno == rt_index)
+				{
+					sub_opnos = lappend_int(sub_opnos, var->varattno);
+					sub_opids = lappend_oid(sub_opids, lfirst_oid(lc2));
+				}
+			}
+
+			if (query_is_distinct_for(subquery, sub_opnos, sub_opids))
+			{
+				tmp = bms_add_member(tmp, rt_index);
+			}
+			else
+			{
+				ret = false;
+				goto done;
+			}
+		}
+		else if (rte->rtekind != RTE_RELATION)
+		{
+			ret = false;
+			goto done;
+		}
+		else
+		{
+			foreach(lc1, indexlist)
+			{
+				IndexOptInfo *ind = lfirst_node(IndexOptInfo, lc1);
+				if (ind->unique && ind->immediate && (ind->indpred == NIL || ind->predOK))
+				{
+					found = true;
+					break;
+				}
+			}
+
+			/* if any relation has no pk/uk index, we return fast */
+			if (!found)
+			{
+				ret = false;
+				goto done;
+			}
+		}
+	}
+
+	non_semi_anti_relids = bms_del_members(non_semi_anti_relids, tmp);
 	/*
-	 * Otherwise, a set-returning function in the query's targetlist can
+	 * we know all the base rte is RTE_RELATION and every one has uk/pk index
+	 */
+
+	max_len = list_length(query->rtable) + 1;
+	non_null_var_per_table = palloc0(max_len * sizeof(Bitmapset *));
+	non_null_expr_per_table = palloc0(max_len * sizeof(Bitmapset *));
+	non_null_opids_per_table = palloc0(max_len * sizeof(Bitmapset *));
+
+	/* fill non_null_var_per_table with restrictinfo */
+	foreach(lc1, find_nonnullable_vars(query->jointree->quals))
+	{
+		Var *var;
+
+		if (!IsA(lfirst(lc1), Var))
+			continue;
+		var = lfirst_node(Var, lc1);
+		if (var->varno == INNER_VAR ||
+			var->varno == OUTER_VAR ||
+			var->varno == INDEX_VAR)
+			continue;
+		non_null_var_per_table[var->varno] = bms_add_member(
+			non_null_var_per_table[var->varno], var->varattno);
+	}
+
+	rt_index = -1;
+	/* fill non_null_var_per_table with catalog */
+	while ((rt_index = bms_next_member(non_semi_anti_relids, rt_index)) >= 0 )
+	{
+		int attr_idx = 0;
+		Relation relation;
+		TupleDesc desc;
+
+		rte = rt_fetch(rt_index, query->rtable);
+		relation =  relation_open(rte->relid, AccessShareLock);
+		desc = relation->rd_att;
+
+		for(; attr_idx < desc->natts; attr_idx++)
+		{
+			if (!desc->attrs[attr_idx].attnotnull)
+				continue;
+			non_null_var_per_table[rt_index] = bms_add_member(
+				non_null_var_per_table[rt_index], attr_idx+1);
+		}
+		relation_close(relation, AccessShareLock);
+	}
+
+	/* fiter all the colnos and opids which is not null */
+	forboth(lc1, colnos, lc2, opids)
+	{
+		int colno = lfirst_int(lc1);
+		TargetEntry *tle = get_tle_by_resno(query->targetList, colno);
+		Var *var = NULL;
+		/* We don't know the varno if the target tle->expr is not a Var */
+		if (!IsA(tle->expr, Var))
+			continue;
+		var = (Var *)tle->expr;
+		if (!bms_is_member(var->varattno, non_null_var_per_table[var->varno]))
+			continue;
+		non_null_expr_per_table[var->varno] = lappend(
+			non_null_expr_per_table[var->varno], tle->expr);
+		non_null_opids_per_table[var->varno] = lappend_oid(
+			non_null_opids_per_table[var->varno], lfirst_oid(lc2));
+	}
+
+	rt_index = -1;
+	while ((rt_index = bms_next_member(non_semi_anti_relids, rt_index)) >= 0 )
+	{
+
+		/* if any one can't yield a unique result, we return fast */
+		if (!relation_has_unique_index_for(root, root->simple_rel_array[rt_index],
+										   NIL,
+										   non_null_expr_per_table[rt_index],
+										   non_null_opids_per_table[rt_index]))
+		{
+			ret = false;
+			goto done;
+		}
+	}
+done:
+	bms_array_free(non_null_var_per_table, max_len);
+	for(idx = 0; idx < max_len; idx++)
+	{
+		list_free(non_null_expr_per_table[idx]);
+		list_free(non_null_opids_per_table[idx]);
+	}
+	return ret;
+}
+
+bool
+query_is_distinct_agg(Query *query, List *colnos, List *opids)
+{
+	ListCell   *l;
+	Oid			opid;
+
+	/*
+	 * a set-returning function in the query's targetlist can
 	 * result in returning duplicate rows, despite any grouping that might
 	 * occur before tlist evaluation.  (If all tlist SRFs are within GROUP BY
 	 * columns, it would be safe because they'd be expanded before grouping.
@@ -901,7 +1144,6 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
 				return true;
 		}
 	}
-
 	/*
 	 * XXX Are there any other cases in which we can easily see the result
 	 * must be distinct?
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index d6f2153593..690fab676c 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -22,8 +22,10 @@
 #include "access/htup_details.h"
 #include "access/parallel.h"
 #include "access/sysattr.h"
+#include "access/relation.h"
 #include "access/table.h"
 #include "access/xact.h"
+#include "catalog/index.h"
 #include "catalog/pg_constraint.h"
 #include "catalog/pg_inherits.h"
 #include "catalog/pg_proc.h"
@@ -35,6 +37,7 @@
 #include "lib/bipartite_match.h"
 #include "lib/knapsack.h"
 #include "miscadmin.h"
+#include "nodes/bitmapset.h"
 #include "nodes/makefuncs.h"
 #include "nodes/nodeFuncs.h"
 #ifdef OPTIMIZER_DEBUG
@@ -64,6 +67,8 @@
 #include "utils/rel.h"
 #include "utils/selfuncs.h"
 #include "utils/syscache.h"
+#include "utils/typcache.h"
+
 
 /* GUC parameters */
 double		cursor_tuple_fraction = DEFAULT_CURSOR_TUPLE_FRACTION;
@@ -4737,6 +4742,29 @@ create_distinct_paths(PlannerInfo *root,
 	Path	   *path;
 	ListCell   *lc;
 
+	if (enable_distinct_elimination)
+	{
+		List *colnos = NIL;
+		List *opnos = NIL;
+		ListCell *lc;
+
+		Assert(parse->distinctClause != NIL);
+
+		foreach(lc, parse->distinctClause)
+		{
+			SortGroupClause *sgc = lfirst_node(SortGroupClause, lc);
+			int idx = sgc->tleSortGroupRef;
+			TargetEntry *tle = get_tle_by_resno(parse->targetList, idx);
+			if (tle->resjunk)
+				continue;
+			colnos = lappend_int(colnos, idx);
+			opnos = lappend_oid(opnos, sgc->eqop);
+		}
+
+		if (query_is_distinct_agg(parse, colnos, opnos) ||
+			query_distinct_through_join(root, colnos, opnos))
+			return input_rel;
+	}
 	/* For now, do all work in the (DISTINCT, NULL) upperrel */
 	distinct_rel = fetch_upper_rel(root, UPPERREL_DISTINCT, NULL);
 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index e44f71e991..fa798dd564 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1064,6 +1064,16 @@ static struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_distinct_elimination", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enables plan-time and run-time unique elimination."),
+		    gettext_noop("Allows the query planner to remove the uncecessary distinct clause."), 
+			GUC_EXPLAIN
+		},
+		&enable_distinct_elimination,
+		true,
+		NULL, NULL, NULL
+	},
 	{
 		{"geqo", PGC_USERSET, QUERY_TUNING_GEQO,
 			gettext_noop("Enables genetic query optimization."),
diff --git a/src/include/nodes/bitmapset.h b/src/include/nodes/bitmapset.h
index b7b18a0b68..5d6f04ffa9 100644
--- a/src/include/nodes/bitmapset.h
+++ b/src/include/nodes/bitmapset.h
@@ -117,4 +117,5 @@ extern int	bms_prev_member(const Bitmapset *a, int prevbit);
 /* support for hashtables using Bitmapsets as keys: */
 extern uint32 bms_hash_value(const Bitmapset *a);
 
+extern void bms_array_free(Bitmapset **bms_array,  int len);
 #endif							/* BITMAPSET_H */
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index cb012ba198..4fa5d32df6 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -64,6 +64,7 @@ extern PGDLLIMPORT bool enable_partitionwise_aggregate;
 extern PGDLLIMPORT bool enable_parallel_append;
 extern PGDLLIMPORT bool enable_parallel_hash;
 extern PGDLLIMPORT bool enable_partition_pruning;
+extern PGDLLIMPORT bool enable_distinct_elimination;
 extern PGDLLIMPORT int constraint_exclusion;
 
 extern double index_pages_fetched(double tuples_fetched, BlockNumber pages,
diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h
index eab486a621..ebd4f24577 100644
--- a/src/include/optimizer/planmain.h
+++ b/src/include/optimizer/planmain.h
@@ -100,6 +100,8 @@ extern List *remove_useless_joins(PlannerInfo *root, List *joinlist);
 extern void reduce_unique_semijoins(PlannerInfo *root);
 extern bool query_supports_distinctness(Query *query);
 extern bool query_is_distinct_for(Query *query, List *colnos, List *opids);
+extern bool query_is_distinct_agg(Query *query, List *colnos, List *opids);
+extern bool query_distinct_through_join(PlannerInfo *root, List *colnos, List *opids);
 extern bool innerrel_is_unique(PlannerInfo *root,
 							   Relids joinrelids, Relids outerrelids, RelOptInfo *innerrel,
 							   JoinType jointype, List *restrictlist, bool force_cache);
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index f457b5b150..6712571578 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -870,14 +870,12 @@ explain (costs off)
   select distinct max(unique2) from tenk1;
                              QUERY PLAN                              
 ---------------------------------------------------------------------
- HashAggregate
-   Group Key: $0
+ Result
    InitPlan 1 (returns $0)
      ->  Limit
            ->  Index Only Scan Backward using tenk1_unique2 on tenk1
                  Index Cond: (unique2 IS NOT NULL)
-   ->  Result
-(7 rows)
+(5 rows)
 
 select distinct max(unique2) from tenk1;
  max  
@@ -1036,7 +1034,7 @@ explain (costs off)
   select distinct min(f1), max(f1) from minmaxtest;
                                          QUERY PLAN                                          
 ---------------------------------------------------------------------------------------------
- Unique
+ Result
    InitPlan 1 (returns $0)
      ->  Limit
            ->  Merge Append
@@ -1059,10 +1057,7 @@ explain (costs off)
                  ->  Index Only Scan using minmaxtest2i on minmaxtest2 minmaxtest_8
                        Index Cond: (f1 IS NOT NULL)
                  ->  Index Only Scan Backward using minmaxtest3i on minmaxtest3 minmaxtest_9
-   ->  Sort
-         Sort Key: ($0), ($1)
-         ->  Result
-(26 rows)
+(23 rows)
 
 select distinct min(f1), max(f1) from minmaxtest;
  min | max 
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index 761376b007..3f6595d53b 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -4433,17 +4433,17 @@ select d.* from d left join (select * from b group by b.id, b.c_id) s
 explain (costs off)
 select d.* from d left join (select distinct * from b) s
   on d.a = s.id;
-              QUERY PLAN              
---------------------------------------
- Merge Right Join
-   Merge Cond: (b.id = d.a)
-   ->  Unique
-         ->  Sort
-               Sort Key: b.id, b.c_id
-               ->  Seq Scan on b
+           QUERY PLAN            
+---------------------------------
+ Merge Left Join
+   Merge Cond: (d.a = s.id)
    ->  Sort
          Sort Key: d.a
          ->  Seq Scan on d
+   ->  Sort
+         Sort Key: s.id
+         ->  Subquery Scan on s
+               ->  Seq Scan on b
 (9 rows)
 
 -- check join removal works when uniqueness of the join condition is enforced
diff --git a/src/test/regress/expected/select_distinct_2.out b/src/test/regress/expected/select_distinct_2.out
new file mode 100644
index 0000000000..2ece95a806
--- /dev/null
+++ b/src/test/regress/expected/select_distinct_2.out
@@ -0,0 +1,276 @@
+create table select_distinct_a(pk1 int, pk2 char(20),  uk1 char(20) not null,  uk2 int, e int, primary key(pk1, pk2));
+create unique index select_distinct_a_uk on select_distinct_a(uk1, uk2);
+create table select_distinct_b(a int, b char(20), pk1 char(20), pk2 int, e int, primary key(pk1, pk2));
+-- distinct erased since (pk1, pk2)
+explain (costs off) select distinct * from select_distinct_a;
+          QUERY PLAN           
+-------------------------------
+ Seq Scan on select_distinct_a
+(1 row)
+
+-- distinct can't be reased since since we required all the uk must be not null
+explain (costs off) select distinct uk1, uk2 from select_distinct_a;
+             QUERY PLAN              
+-------------------------------------
+ HashAggregate
+   Group Key: uk1, uk2
+   ->  Seq Scan on select_distinct_a
+(3 rows)
+
+-- distinct ereased since uk + not null
+explain (costs off) select distinct uk1, uk2 from select_distinct_a where uk2 is not null;
+          QUERY PLAN           
+-------------------------------
+ Seq Scan on select_distinct_a
+   Filter: (uk2 IS NOT NULL)
+(2 rows)
+
+explain (costs off) select distinct uk1, uk2 from select_distinct_a where uk2 > 1;
+          QUERY PLAN           
+-------------------------------
+ Seq Scan on select_distinct_a
+   Filter: (uk2 > 1)
+(2 rows)
+
+-- distinct erased due to group by
+explain select distinct e from select_distinct_a group by e;
+                                QUERY PLAN                                
+--------------------------------------------------------------------------
+ HashAggregate  (cost=14.88..16.88 rows=200 width=4)
+   Group Key: e
+   ->  Seq Scan on select_distinct_a  (cost=0.00..13.90 rows=390 width=4)
+(3 rows)
+
+-- distinct erased due to the restirctinfo
+explain select distinct uk1 from select_distinct_a where pk1 = 1 and pk2 = 'c';
+                                           QUERY PLAN                                            
+-------------------------------------------------------------------------------------------------
+ Index Scan using select_distinct_a_pkey on select_distinct_a  (cost=0.15..8.17 rows=1 width=84)
+   Index Cond: ((pk1 = 1) AND (pk2 = 'c'::bpchar))
+(2 rows)
+
+-- test join
+set enable_mergejoin to off;
+set enable_hashjoin to off;
+insert into select_distinct_a values(1, 'a', 'a', 0, 1), (1, 'b', 'A', 0, 2), (3, 'c', 'c', 0, 3);
+insert into select_distinct_b values(1, 'a', 'a', 0, 1), (4, 'd', 'd', 0, 4), (1, 'e', 'e', 0, 5);
+-- Cartesian join
+explain (costs off) select distinct a.uk1, a.uk2, b.pk1, b.pk2 from select_distinct_a a, select_distinct_b b where a.uk2 is not null;
+                 QUERY PLAN                  
+---------------------------------------------
+ Nested Loop
+   ->  Seq Scan on select_distinct_b b
+   ->  Materialize
+         ->  Seq Scan on select_distinct_a a
+               Filter: (uk2 IS NOT NULL)
+(5 rows)
+
+select distinct a.uk1, a.uk2, b.pk1, b.pk2 from select_distinct_a a, select_distinct_b b where a.uk2 is not null order by 1, 2, 3, 4;
+         uk1          | uk2 |         pk1          | pk2 
+----------------------+-----+----------------------+-----
+ a                    |   0 | a                    |   0
+ a                    |   0 | d                    |   0
+ a                    |   0 | e                    |   0
+ A                    |   0 | a                    |   0
+ A                    |   0 | d                    |   0
+ A                    |   0 | e                    |   0
+ c                    |   0 | a                    |   0
+ c                    |   0 | d                    |   0
+ c                    |   0 | e                    |   0
+(9 rows)
+
+-- left join
+explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a left join select_distinct_b b on (a.pk1 = b.a);
+                                    QUERY PLAN                                     
+-----------------------------------------------------------------------------------
+ Nested Loop Left Join  (cost=0.00..2310.28 rows=760 width=176)
+   Join Filter: (a.pk1 = b.a)
+   ->  Seq Scan on select_distinct_a a  (cost=0.00..13.90 rows=390 width=88)
+   ->  Materialize  (cost=0.00..15.85 rows=390 width=92)
+         ->  Seq Scan on select_distinct_b b  (cost=0.00..13.90 rows=390 width=92)
+(5 rows)
+
+select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a left join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4;;
+ pk1 |         pk2          |         pk1          | pk2 
+-----+----------------------+----------------------+-----
+   1 | a                    | a                    |   0
+   1 | a                    | e                    |   0
+   1 | b                    | a                    |   0
+   1 | b                    | e                    |   0
+   3 | c                    |                      |    
+(5 rows)
+
+-- right join
+explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a right join select_distinct_b b on (a.pk1 = b.a);
+                                                  QUERY PLAN                                                  
+--------------------------------------------------------------------------------------------------------------
+ Nested Loop Left Join  (cost=0.15..140.88 rows=760 width=176)
+   ->  Seq Scan on select_distinct_b b  (cost=0.00..13.90 rows=390 width=92)
+   ->  Index Only Scan using select_distinct_a_pkey on select_distinct_a a  (cost=0.15..0.31 rows=2 width=88)
+         Index Cond: (pk1 = b.a)
+(4 rows)
+
+select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a right join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4;
+ pk1 |         pk2          |         pk1          | pk2 
+-----+----------------------+----------------------+-----
+   1 | a                    | a                    |   0
+   1 | a                    | e                    |   0
+   1 | b                    | a                    |   0
+   1 | b                    | e                    |   0
+     |                      | d                    |   0
+(5 rows)
+
+-- full join
+explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a);
+                                    QUERY PLAN                                     
+-----------------------------------------------------------------------------------
+ Hash Full Join  (cost=10000000018.77..10000000060.26 rows=760 width=176)
+   Hash Cond: (a.pk1 = b.a)
+   ->  Seq Scan on select_distinct_a a  (cost=0.00..13.90 rows=390 width=88)
+   ->  Hash  (cost=13.90..13.90 rows=390 width=92)
+         ->  Seq Scan on select_distinct_b b  (cost=0.00..13.90 rows=390 width=92)
+(5 rows)
+
+select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4;
+ pk1 |         pk2          |         pk1          | pk2 
+-----+----------------------+----------------------+-----
+   1 | a                    | a                    |   0
+   1 | a                    | e                    |   0
+   1 | b                    | a                    |   0
+   1 | b                    | e                    |   0
+   3 | c                    |                      |    
+     |                      | d                    |   0
+(6 rows)
+
+-- distinct can't be erased since b.pk2 is missed
+explain select distinct a.pk1, a.pk2, b.pk1 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a);
+                                          QUERY PLAN                                           
+-----------------------------------------------------------------------------------------------
+ Unique  (cost=10000000096.63..10000000104.23 rows=760 width=172)
+   ->  Sort  (cost=10000000096.63..10000000098.53 rows=760 width=172)
+         Sort Key: a.pk1, a.pk2, b.pk1
+         ->  Hash Full Join  (cost=10000000018.77..10000000060.26 rows=760 width=172)
+               Hash Cond: (a.pk1 = b.a)
+               ->  Seq Scan on select_distinct_a a  (cost=0.00..13.90 rows=390 width=88)
+               ->  Hash  (cost=13.90..13.90 rows=390 width=88)
+                     ->  Seq Scan on select_distinct_b b  (cost=0.00..13.90 rows=390 width=88)
+(8 rows)
+
+-- Semi/anti join
+explain (costs off) select distinct pk1, pk2 from select_distinct_a where pk1 in (select a from select_distinct_b);
+                               QUERY PLAN                                
+-------------------------------------------------------------------------
+ Nested Loop
+   ->  HashAggregate
+         Group Key: select_distinct_b.a
+         ->  Seq Scan on select_distinct_b
+   ->  Index Only Scan using select_distinct_a_pkey on select_distinct_a
+         Index Cond: (pk1 = select_distinct_b.a)
+(6 rows)
+
+explain (costs off) select distinct pk1, pk2 from select_distinct_a where pk1 not in (select a from select_distinct_b);
+              QUERY PLAN               
+---------------------------------------
+ Seq Scan on select_distinct_a
+   Filter: (NOT (hashed SubPlan 1))
+   SubPlan 1
+     ->  Seq Scan on select_distinct_b
+(4 rows)
+
+-- we also can handle some limited subquery
+explain select distinct * from select_distinct_a a,  (select a from select_distinct_b group by a) b where a.pk1 = b.a;
+                                                QUERY PLAN                                                
+----------------------------------------------------------------------------------------------------------
+ Nested Loop  (cost=15.02..107.38 rows=390 width=184)
+   ->  HashAggregate  (cost=14.88..16.88 rows=200 width=4)
+         Group Key: select_distinct_b.a
+         ->  Seq Scan on select_distinct_b  (cost=0.00..13.90 rows=390 width=4)
+   ->  Index Scan using select_distinct_a_pkey on select_distinct_a a  (cost=0.15..0.42 rows=2 width=180)
+         Index Cond: (pk1 = select_distinct_b.a)
+(6 rows)
+
+select distinct * from select_distinct_a a,  (select a from select_distinct_b group by a) b where a.pk1 = b.a order by 1, 2, 3;
+ pk1 |         pk2          |         uk1          | uk2 | e | a 
+-----+----------------------+----------------------+-----+---+---
+   1 | a                    | a                    |   0 | 1 | 1
+   1 | b                    | A                    |   0 | 2 | 1
+(2 rows)
+
+explain select distinct * from select_distinct_a a,  (select distinct a from select_distinct_b) b where a.pk1 = b.a;
+                                                QUERY PLAN                                                
+----------------------------------------------------------------------------------------------------------
+ Nested Loop  (cost=15.02..107.38 rows=390 width=184)
+   ->  HashAggregate  (cost=14.88..16.88 rows=200 width=4)
+         Group Key: select_distinct_b.a
+         ->  Seq Scan on select_distinct_b  (cost=0.00..13.90 rows=390 width=4)
+   ->  Index Scan using select_distinct_a_pkey on select_distinct_a a  (cost=0.15..0.42 rows=2 width=180)
+         Index Cond: (pk1 = select_distinct_b.a)
+(6 rows)
+
+select distinct * from select_distinct_a a, (select distinct a from select_distinct_b) b where a.pk1 = b.a order by 1 ,2, 3;
+ pk1 |         pk2          |         uk1          | uk2 | e | a 
+-----+----------------------+----------------------+-----+---+---
+   1 | a                    | a                    |   0 | 1 | 1
+   1 | b                    | A                    |   0 | 2 | 1
+(2 rows)
+
+-- Distinct On
+-- can't erase since pk2 is missed
+explain (costs off) select distinct on(pk1) pk1, pk2 from select_distinct_a;
+                QUERY PLAN                 
+-------------------------------------------
+ Unique
+   ->  Sort
+         Sort Key: pk1
+         ->  Seq Scan on select_distinct_a
+(4 rows)
+
+-- ok to erase
+explain (costs off) select distinct on(pk1, pk2) pk1, pk2 from select_distinct_a;
+          QUERY PLAN           
+-------------------------------
+ Seq Scan on select_distinct_a
+(1 row)
+
+-- test some view.
+create view distinct_v1 as select distinct uk1, uk2 from select_distinct_a where uk2 is not null;
+explain select * from distinct_v1;
+                             QUERY PLAN                              
+---------------------------------------------------------------------
+ Seq Scan on select_distinct_a  (cost=0.00..13.90 rows=388 width=88)
+   Filter: (uk2 IS NOT NULL)
+(2 rows)
+
+alter table select_distinct_a alter column uk1 drop not null;
+explain select * from distinct_v1;
+                                QUERY PLAN                                 
+---------------------------------------------------------------------------
+ HashAggregate  (cost=15.84..17.84 rows=200 width=88)
+   Group Key: select_distinct_a.uk1, select_distinct_a.uk2
+   ->  Seq Scan on select_distinct_a  (cost=0.00..13.90 rows=388 width=88)
+         Filter: (uk2 IS NOT NULL)
+(4 rows)
+
+alter table select_distinct_a alter column uk1 set not null;
+-- test generic plan
+prepare pt as select * from distinct_v1;
+explain execute pt;
+                             QUERY PLAN                              
+---------------------------------------------------------------------
+ Seq Scan on select_distinct_a  (cost=0.00..13.90 rows=388 width=88)
+   Filter: (uk2 IS NOT NULL)
+(2 rows)
+
+alter table select_distinct_a alter column uk1 drop not null;
+explain execute pt;
+                                QUERY PLAN                                 
+---------------------------------------------------------------------------
+ HashAggregate  (cost=15.84..17.84 rows=200 width=88)
+   Group Key: select_distinct_a.uk1, select_distinct_a.uk2
+   ->  Seq Scan on select_distinct_a  (cost=0.00..13.90 rows=388 width=88)
+         Filter: (uk2 IS NOT NULL)
+(4 rows)
+
+drop view distinct_v1;
+drop table select_distinct_a;
+drop table select_distinct_b;
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index a1c90eb905..e053214f9d 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -73,6 +73,7 @@ select name, setting from pg_settings where name like 'enable%';
               name              | setting 
 --------------------------------+---------
  enable_bitmapscan              | on
+ enable_distinct_elimination    | on
  enable_gathermerge             | on
  enable_hashagg                 | on
  enable_hashjoin                | on
@@ -89,7 +90,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(17 rows)
+(18 rows)
 
 -- Test that the pg_timezone_names and pg_timezone_abbrevs views are
 -- more-or-less working.  We can't test their contents in any great detail
diff --git a/src/test/regress/sql/select_distinct_2.sql b/src/test/regress/sql/select_distinct_2.sql
new file mode 100644
index 0000000000..2fc54e7e36
--- /dev/null
+++ b/src/test/regress/sql/select_distinct_2.sql
@@ -0,0 +1,84 @@
+create table select_distinct_a(pk1 int, pk2 char(20),  uk1 char(20) not null,  uk2 int, e int, primary key(pk1, pk2));
+create unique index select_distinct_a_uk on select_distinct_a(uk1, uk2);
+create table select_distinct_b(a int, b char(20), pk1 char(20), pk2 int, e int, primary key(pk1, pk2));
+
+-- distinct erased since (pk1, pk2)
+explain (costs off) select distinct * from select_distinct_a;
+
+-- distinct can't be reased since since we required all the uk must be not null
+explain (costs off) select distinct uk1, uk2 from select_distinct_a;
+
+-- distinct ereased since uk + not null
+explain (costs off) select distinct uk1, uk2 from select_distinct_a where uk2 is not null;
+explain (costs off) select distinct uk1, uk2 from select_distinct_a where uk2 > 1;
+
+-- distinct erased due to group by
+explain select distinct e from select_distinct_a group by e;
+
+-- distinct erased due to the restirctinfo
+explain select distinct uk1 from select_distinct_a where pk1 = 1 and pk2 = 'c';
+
+-- test join
+set enable_mergejoin to off;
+set enable_hashjoin to off;
+
+insert into select_distinct_a values(1, 'a', 'a', 0, 1), (1, 'b', 'A', 0, 2), (3, 'c', 'c', 0, 3);
+insert into select_distinct_b values(1, 'a', 'a', 0, 1), (4, 'd', 'd', 0, 4), (1, 'e', 'e', 0, 5);
+
+-- Cartesian join
+explain (costs off) select distinct a.uk1, a.uk2, b.pk1, b.pk2 from select_distinct_a a, select_distinct_b b where a.uk2 is not null;
+select distinct a.uk1, a.uk2, b.pk1, b.pk2 from select_distinct_a a, select_distinct_b b where a.uk2 is not null order by 1, 2, 3, 4;
+
+
+-- left join
+explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a left join select_distinct_b b on (a.pk1 = b.a);
+select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a left join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4;;
+
+-- right join
+explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a right join select_distinct_b b on (a.pk1 = b.a);
+select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a right join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4;
+
+-- full join
+explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a);
+select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4;
+
+-- distinct can't be erased since b.pk2 is missed
+explain select distinct a.pk1, a.pk2, b.pk1 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a);
+
+
+-- Semi/anti join
+explain (costs off) select distinct pk1, pk2 from select_distinct_a where pk1 in (select a from select_distinct_b);
+explain (costs off) select distinct pk1, pk2 from select_distinct_a where pk1 not in (select a from select_distinct_b);
+
+-- we also can handle some limited subquery
+explain select distinct * from select_distinct_a a,  (select a from select_distinct_b group by a) b where a.pk1 = b.a;
+select distinct * from select_distinct_a a,  (select a from select_distinct_b group by a) b where a.pk1 = b.a order by 1, 2, 3;
+
+explain select distinct * from select_distinct_a a,  (select distinct a from select_distinct_b) b where a.pk1 = b.a;
+select distinct * from select_distinct_a a, (select distinct a from select_distinct_b) b where a.pk1 = b.a order by 1 ,2, 3;
+
+-- Distinct On
+-- can't erase since pk2 is missed
+explain (costs off) select distinct on(pk1) pk1, pk2 from select_distinct_a;
+-- ok to erase
+explain (costs off) select distinct on(pk1, pk2) pk1, pk2 from select_distinct_a;
+
+
+-- test some view.
+create view distinct_v1 as select distinct uk1, uk2 from select_distinct_a where uk2 is not null;
+explain select * from distinct_v1;
+
+alter table select_distinct_a alter column uk1 drop not null;
+explain select * from distinct_v1;
+
+alter table select_distinct_a alter column uk1 set not null;
+
+-- test generic plan
+prepare pt as select * from distinct_v1;
+explain execute pt;
+alter table select_distinct_a alter column uk1 drop not null;
+explain execute pt;
+
+drop view distinct_v1;
+drop table select_distinct_a;
+drop table select_distinct_b;
-- 
2.21.0

