From 42b07076105aab296f3a18bf0d1f84f667973d89 Mon Sep 17 00:00:00 2001
From: David Rowley <dgrowley@gmail.com>
Date: Wed, 1 Nov 2017 12:49:20 +1300
Subject: [PATCH] Support removing LEFT JOINs with DISTINCT/GROUP BY

Normally we only remove a LEFT JOIN if its unused in the query and there
is no possibility that the LEFT JOINed table will cause any row
duplication.  If there's a DISTINCT or GROUP BY clause in the query then
and none of the DISTINCT/GROUP BY expressions are part of the
to-be-removed relation, then any row duplication that the join would cause
would be removed anyway.  Of course, we're unable to apply this if any
aggregate functions are used, as those duplicates must be counted.
---
 src/backend/optimizer/plan/analyzejoins.c | 48 ++++++++++++++++++++++------
 src/test/regress/expected/join.out        | 53 +++++++++++++++++++++++++++++++
 src/test/regress/sql/join.sql             | 16 ++++++++++
 3 files changed, 107 insertions(+), 10 deletions(-)

diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c
index 511603b..9a9617b 100644
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@@ -150,10 +150,12 @@ clause_sides_match_join(RestrictInfo *rinfo, Relids outerrelids,
  *	  it will just duplicate its left input.
  *
  * This is true for a left join for which the join condition cannot match
- * more than one inner-side row.  (There are other possibly interesting
- * cases, but we don't have the infrastructure to prove them.)  We also
- * have to check that the inner side doesn't generate any variables needed
- * above the join.
+ * more than one inner-side row.  We can also allow removal of joins to
+ * relations that may match more than one inner-side row if a DISTINCT or
+ * GROUP BY clause would subsequently remove any duplicates caused by the
+ * join. (There are other possibly interesting cases, but we don't have the
+ * infrastructure to prove them.)  We also have to check that the inner side
+ * doesn't generate any variables needed above the join.
  */
 static bool
 join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo)
@@ -237,6 +239,22 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo)
 	}
 
 	/*
+	 * When a DISTINCT or GROUP BY clause is present, the unreferenced
+	 * relation's join has no ability to duplicate rows in result set, as any
+	 * duplicate rows would been removed by the DISTINCT/GROUP BY clause
+	 * anyway.  However, we're unable to apply this when aggregate functions
+	 * are present as we must aggregate any duplicated rows.  We needn't
+	 * bother checking the actual distinct/grouping exprs here, as we already
+	 * know from the above checks that no Var is present from the relation
+	 * we're trying to remove.
+	 */
+	if (root->parse->distinctClause != NIL)
+		return true;
+
+	if (root->parse->groupClause != NIL && !root->parse->hasAggs)
+		return true;
+
+	/*
 	 * Search for mergejoinable clauses that constrain the inner rel against
 	 * either the outer rel or a pseudoconstant.  If an operator is
 	 * mergejoinable then it behaves like equality for some btree opclass, so
@@ -597,15 +615,25 @@ rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel)
 	if (rel->rtekind == RTE_RELATION)
 	{
 		/*
-		 * For a plain relation, we only know how to prove uniqueness by
-		 * reference to unique indexes.  Make sure there's at least one
-		 * suitable unique index.  It must be immediately enforced, and if
-		 * it's a partial index, it must match the query.  (Keep these
-		 * conditions in sync with relation_has_unique_index_for!)
+		 * For a plain relation, we can make use of DISTINCT or GROUP BY
+		 * clauses as unique proofs. We also know how to prove uniqueness by
+		 * reference to unique indexes.
 		 */
 		ListCell   *lc;
 
-		foreach(lc, rel->indexlist)
+		if (root->parse->distinctClause != NIL)
+			return true;
+
+		if (root->parse->groupClause != NIL && !root->parse->hasAggs)
+			return true;
+
+		/*
+		 * Make sure there's at least one suitable unique index.  It must be
+		 * immediately enforced, and if it's a partial index, it must match
+		 * the query.  (Keep these conditions in sync with
+		 * relation_has_unique_index_for!)
+		 */
+		 foreach(lc, rel->indexlist)
 		{
 			IndexOptInfo *ind = (IndexOptInfo *) lfirst(lc);
 
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index f47449b..cd25b45 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -4051,6 +4051,59 @@ select d.* from d left join (select id from a union select id from b) s
  Seq Scan on d
 (1 row)
 
+-- check join removal works without a unique index on the left joined table
+-- when a DISTINCT or GROUP BY is present which would remove row duplication
+explain (costs off)
+select distinct a.*,b.* from a left join b on a.b_id = b.id left join d
+	on a.b_id = d.a;
+               QUERY PLAN                
+-----------------------------------------
+ HashAggregate
+   Group Key: a.id, a.b_id, b.id, b.c_id
+   ->  Hash Left Join
+         Hash Cond: (a.b_id = b.id)
+         ->  Seq Scan on a
+         ->  Hash
+               ->  Seq Scan on b
+(7 rows)
+
+-- as above but with GROUP BY
+explain (costs off)
+select a.id,b.id from a left join b on a.b_id = b.id left join d
+	on a.b_id = d.a group by a.id,b.id;
+             QUERY PLAN             
+------------------------------------
+ HashAggregate
+   Group Key: a.id, b.id
+   ->  Hash Left Join
+         Hash Cond: (a.b_id = b.id)
+         ->  Seq Scan on a
+         ->  Hash
+               ->  Seq Scan on b
+(7 rows)
+
+-- ensure no join removal when we must aggregate any duplicated rows
+explain (costs off)
+select a.id,b.id,count(*) from a left join b on a.b_id = b.id left join d
+	on a.b_id = d.a group by a.id,b.id;
+                   QUERY PLAN                   
+------------------------------------------------
+ HashAggregate
+   Group Key: a.id, b.id
+   ->  Merge Left Join
+         Merge Cond: (a.b_id = d.a)
+         ->  Sort
+               Sort Key: a.b_id
+               ->  Hash Left Join
+                     Hash Cond: (a.b_id = b.id)
+                     ->  Seq Scan on a
+                     ->  Hash
+                           ->  Seq Scan on b
+         ->  Sort
+               Sort Key: d.a
+               ->  Seq Scan on d
+(14 rows)
+
 -- check join removal with a cross-type comparison operator
 explain (costs off)
 select i8.* from int8_tbl i8 left join (select f1 from int4_tbl group by f1) i4
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql
index d847d53..f14e2e8 100644
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -1331,6 +1331,22 @@ explain (costs off)
 select d.* from d left join (select id from a union select id from b) s
   on d.a = s.id;
 
+-- check join removal works without a unique index on the left joined table
+-- when a DISTINCT or GROUP BY is present which would remove row duplication
+explain (costs off)
+select distinct a.*,b.* from a left join b on a.b_id = b.id left join d
+	on a.b_id = d.a;
+
+-- as above but with GROUP BY
+explain (costs off)
+select a.id,b.id from a left join b on a.b_id = b.id left join d
+	on a.b_id = d.a group by a.id,b.id;
+
+-- ensure no join removal when we must aggregate any duplicated rows
+explain (costs off)
+select a.id,b.id,count(*) from a left join b on a.b_id = b.id left join d
+	on a.b_id = d.a group by a.id,b.id;
+
 -- check join removal with a cross-type comparison operator
 explain (costs off)
 select i8.* from int8_tbl i8 left join (select f1 from int4_tbl group by f1) i4
-- 
1.9.5.msysgit.1