From 31edfa296a81431e2c68401da633c4a3df4eff98 Mon Sep 17 00:00:00 2001
From: "dgrowley@gmail.com" <dgrowley@gmail.com>
Date: Sun, 4 Mar 2018 21:32:57 +1300
Subject: [PATCH v2 2/2] Allow LEFT JOINs to be removed in more cases

Normally, LEFT JOINs to base rels can only be removed if none of the columns
from the joined relation are used in the query and if the relation has a
unique index which proves the join cannot duplicate any rows on the other side
of the join.

This commit relaxes this a little to no longer require the unique index to be
present.  Alternative proofs are used in the form of DISTINCT or GROUP BY
clauses.  The idea here being that it does not matter if the join causes row
duplicate if some DISTINCT/GROUP BY will subsequently remove those duplicate
rows again.  However, there are still cases that we need to beware of and not
allow this optimization to take place.  For example, if there are aggregate
functions in the query then these will be sensitive to these additional
duplicated rows, the case is the same for windowing functions.

On successful join removal of such a case the benefits are three-fold:

1. The join search becomes more simple.
2. No waste of effort joining the relation in the first place.
3. No waste of effort removing the additional duplicate rows caused by the
   join.
---
 src/backend/optimizer/plan/analyzejoins.c | 144 +++++++++++++++++++++-
 src/test/regress/expected/join.out        | 193 ++++++++++++++++++++++++++++++
 src/test/regress/sql/join.sql             |  71 +++++++++++
 3 files changed, 402 insertions(+), 6 deletions(-)

diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c
index 40f284b29c9..1ca6f5b125a 100644
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@@ -38,6 +38,8 @@ static bool join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo);
 static void remove_rel_from_query(PlannerInfo *root, int relid,
 					  Relids joinrelids);
 static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved);
+static bool query_distinctifies_rows(Query *query);
+static bool rel_distinctified_after_join(PlannerInfo *root, RelOptInfo *rel);
 static bool rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel);
 static bool rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel,
 					List *clause_list);
@@ -151,10 +153,12 @@ clause_sides_match_join(RestrictInfo *rinfo, Relids outerrelids,
  *	  it will just duplicate its left input.
  *
  * This is true for a left join for which the join condition cannot match
- * more than one inner-side row.  (There are other possibly interesting
- * cases, but we don't have the infrastructure to prove them.)  We also
- * have to check that the inner side doesn't generate any variables needed
- * above the join.
+ * more than one inner-side row.  We can also allow removal of joins to
+ * relations that may match more than one inner-side row if a DISTINCT or
+ * GROUP BY clause would subsequently remove any duplicates caused by the
+ * join. (There are other possibly interesting cases, but we don't have the
+ * infrastructure to prove them.)  We also have to check that the inner side
+ * doesn't generate any variables needed above the join.
  */
 static bool
 join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo)
@@ -182,9 +186,10 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo)
 	/*
 	 * Before we go to the effort of checking whether any innerrel variables
 	 * are needed above the join, make a quick check to eliminate cases in
-	 * which we will surely be unable to prove uniqueness of the innerrel.
+	 * which we will surely be unable to remove the join.
 	 */
-	if (!rel_supports_distinctness(root, innerrel))
+	if (!rel_supports_distinctness(root, innerrel) &&
+		!query_distinctifies_rows(root->parse))
 		return false;
 
 	/* Compute the relid set for the join we are considering */
@@ -302,6 +307,14 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo)
 		clause_list = lappend(clause_list, restrictinfo);
 	}
 
+	/*
+	 * Check to see if the relation cannot possibly cause any duplication in
+	 * the result set due to the query having a DISTINCT or GROUP BY clause
+	 * that would eliminate any duplicate rows produced by the join anyway.
+	 */
+	if (rel_distinctified_after_join(root, innerrel))
+		return true;
+
 	/*
 	 * Now that we have the relevant equality join clauses, try to prove the
 	 * innerrel distinct.
@@ -594,6 +607,125 @@ reduce_unique_semijoins(PlannerInfo *root)
 	}
 }
 
+/*
+ * query_distinctifies_rows
+ *		Returns true if the query has any properties that allow duplicate
+ *		results to be removed.
+ *
+ * This function is just a pre-check function to check if the query possibly
+ * has some properties that disallow duplicate rows from appearing in the
+ * result set.  More specifically we're interested in knowing if we can
+ * possibly exercise any means to have fewer duplicate rows in the result set
+ * prior to the de-duplication process.
+ *
+ * It's not critical that we always return false when this is not possible, as
+ * this function just serves as a pre-check to prevent further checks going
+ * ahead with expensive processing if we have no chance of success.
+ *
+ * If this function returns false then rel_tuples_needed must also return
+ * false.
+ */
+static bool
+query_distinctifies_rows(Query *query)
+{
+	/* a query with aggs or wfuncs needs all input rows from the join */
+	if (query->hasAggs || query->hasWindowFuncs)
+		return false;
+
+	/* no distinctification possible if there's no DISTINCT or GROUP BY */
+	if (query->distinctClause == NIL && query->groupClause == NIL)
+		return false;
+
+	/*
+	 * We shouldn't waste too much effort here as rel_distinctified_after_join
+	 * serves as the final check, but perhaps there's some other things common
+	 * enough and cheap enough to check here?
+	 */
+	return true;
+}
+
+/*
+ * rel_distinctified_after_join
+ *		Returns false if tuples from 'rel' are required as input for any part
+ *		of the plan.
+ *
+ * Our aim here is to determine if we can remove the join to 'rel' without
+ * having any affect on the final results.  If 'rel's join partner can match
+ * any single row to more than one of rel's output rows, then the rel would
+ * normally be required to maintain the correct result set.  However, if the
+ * query has a GROUP BY or DISTINCT clause then any row duplication caused
+ * will be removed again.  To complicate the matter we also need to ensure
+ * that the additional rows are not required for anything before the final
+ * resultset is returned.  For example, aggregate functions will see any
+ * duplicate rows, as will windowing functions.  We must also be careful not
+ * to change the number of times a volatile function is evaulated.
+ *
+ * We must only return true if we're completely certain that the tuples are
+ * not needed. If uncertain we'll play it safe and return false.
+ *
+ * Note: If making changes here then query_distinctifies_rows may need updated
+ */
+static bool
+rel_distinctified_after_join(PlannerInfo *root, RelOptInfo *rel)
+{
+	Query	   *query = root->parse;
+	Index		rti;
+
+	/* Aggs and window funcs are sensitive to row duplication from joins */
+	if (query->hasAggs || query->hasWindowFuncs)
+		return false;
+
+	/*
+	 * Ensure we have a DISTINCT or GROUP BY clause to remove duplicates. If
+	 * we have a GROUP BY clause then we'd better make sure there are not any
+	 * volatile functions in it.  If we have both a GROUP BY and a DISTINCT
+	 * clause and we have no aggregates then we shouldn't care too much about
+	 * volatile functions in the distinctClauses as GROUP BY will have done
+	 * all the distinctification work and all the columns in the targetlist
+	 * must be functionally dependant on the GROUP BY clauses anyway.  In the
+	 * case for DISTINCT ON the remaining targetlist items will be evaulated
+	 * after the results have been made distinct, so are only going to be
+	 * evaulated once whether we remove the join or not.
+	 */
+	if (query->groupClause == NIL)
+	{
+		if (query->distinctClause == NIL)
+			return false;
+		else if (contain_volatile_functions((Node *) query->distinctClause))
+			return false;
+	}
+	else if (contain_volatile_functions((Node *) query->groupClause))
+		return false;
+
+	/*
+	 * We must disallow the removal of the join if there are any LATERAL joins
+	 * to a volatile function using Vars from any relation at this level.  If
+	 * we removed these then it could reduce the number of times that the
+	 * volatile function is evaluated.
+	 */
+	for (rti = 1; rti < root->simple_rel_array_size; rti++)
+	{
+		RelOptInfo *brel = root->simple_rel_array[rti];
+		RangeTblEntry *rte;
+
+		/* there may be empty slots corresponding to non-baserel RTEs */
+		if (brel == NULL)
+			continue;
+
+		Assert(brel->relid == rti); /* sanity check on array */
+
+		/* ignore RTEs that are "other rels" */
+		if (brel->reloptkind != RELOPT_BASEREL)
+			continue;
+
+		rte = root->simple_rte_array[rti];
+		if (rte->rtekind == RTE_FUNCTION && brel->lateral_vars != NIL &&
+			contain_volatile_functions((Node *) rte->functions))
+			return false;
+	}
+
+	return true;		/* relation not needed */
+}
 
 /*
  * rel_supports_distinctness
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index 76f31fad103..f921a1f0d14 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -4208,6 +4208,199 @@ select 1 from (select a.id FROM a left join b on a.b_id = b.id) q,
          Filter: (a.id = i)
 (4 rows)
 
+-- check join removal works without a unique index on the left joined table
+-- when a DISTINCT or GROUP BY is present which would remove row duplication
+explain (costs off)
+select distinct a.*,b.* from a left join b on a.b_id = b.id left join d
+	on a.b_id = d.a;
+               QUERY PLAN                
+-----------------------------------------
+ HashAggregate
+   Group Key: a.id, a.b_id, b.id, b.c_id
+   ->  Hash Left Join
+         Hash Cond: (a.b_id = b.id)
+         ->  Seq Scan on a
+         ->  Hash
+               ->  Seq Scan on b
+(7 rows)
+
+-- as above but with DISTINCT ON
+explain (costs off)
+select distinct on (a.id,b.id) a.*,b.* from a left join b on a.b_id = b.id left join d
+	on a.b_id = d.a order by a.id,b.id,a.b_id;
+                QUERY PLAN                
+------------------------------------------
+ Unique
+   ->  Sort
+         Sort Key: a.id, b.id, a.b_id
+         ->  Hash Left Join
+               Hash Cond: (a.b_id = b.id)
+               ->  Seq Scan on a
+               ->  Hash
+                     ->  Seq Scan on b
+(8 rows)
+
+-- as above but with GROUP BY
+explain (costs off)
+select a.id,b.id from a left join b on a.b_id = b.id left join d
+	on a.b_id = d.a group by a.id,b.id;
+             QUERY PLAN             
+------------------------------------
+ HashAggregate
+   Group Key: a.id, b.id
+   ->  Hash Left Join
+         Hash Cond: (a.b_id = b.id)
+         ->  Seq Scan on a
+         ->  Hash
+               ->  Seq Scan on b
+(7 rows)
+
+-- also ensure the removal works with other relation types.
+explain (costs off)
+select distinct on (a.id,b.id) a.*,b.* from a
+left join b on a.b_id = b.id
+left join (values(1),(1)) d(a)
+	on a.b_id = d.a order by a.id,b.id,a.b_id;
+                QUERY PLAN                
+------------------------------------------
+ Unique
+   ->  Sort
+         Sort Key: a.id, b.id, a.b_id
+         ->  Hash Left Join
+               Hash Cond: (a.b_id = b.id)
+               ->  Seq Scan on a
+               ->  Hash
+                     ->  Seq Scan on b
+(8 rows)
+
+-- ensure no join removal when we must aggregate any duplicated rows
+explain (costs off)
+select a.id,b.id,count(*) from a left join b on a.b_id = b.id left join d
+	on a.b_id = d.a group by a.id,b.id;
+                   QUERY PLAN                   
+------------------------------------------------
+ HashAggregate
+   Group Key: a.id, b.id
+   ->  Merge Left Join
+         Merge Cond: (a.b_id = d.a)
+         ->  Sort
+               Sort Key: a.b_id
+               ->  Hash Left Join
+                     Hash Cond: (a.b_id = b.id)
+                     ->  Seq Scan on a
+                     ->  Hash
+                           ->  Seq Scan on b
+         ->  Sort
+               Sort Key: d.a
+               ->  Seq Scan on d
+(14 rows)
+
+-- removal of left joins using distinct or group by instead of a unique index
+-- matching the join condition of the left joined table must be disallowed if
+-- the query contains a lateral joined volatile function.  A reduction in the
+-- number of rows produced by the join could affect the number of evaluations
+-- of the volatile function.
+create or replace function vseries(pstart int, pend int) returns setof int as $$
+begin
+	while pstart <= pend
+	loop
+		raise notice '%', pstart;
+		return next pstart;
+		pstart := pstart + 1;
+	end loop;
+end;
+$$ volatile language plpgsql;
+-- ensure join to "d" is not removed (DISTINCT)
+explain (costs off)
+select distinct a.* from a left join d on a.id = d.a, lateral vseries(a.id, a.id);
+              QUERY PLAN               
+---------------------------------------
+ HashAggregate
+   Group Key: a.id, a.b_id
+   ->  Nested Loop
+         ->  Hash Right Join
+               Hash Cond: (d.a = a.id)
+               ->  Seq Scan on d
+               ->  Hash
+                     ->  Seq Scan on a
+         ->  Function Scan on vseries
+(9 rows)
+
+-- ensure join to "d" is not removed (GROUP BY)
+explain (costs off)
+select a.id from a left join d on a.id = d.a, lateral vseries(a.id, a.id) group by a.id;
+                     QUERY PLAN                      
+-----------------------------------------------------
+ Group
+   Group Key: a.id
+   ->  Nested Loop
+         ->  Merge Left Join
+               Merge Cond: (a.id = d.a)
+               ->  Index Only Scan using a_pkey on a
+               ->  Sort
+                     Sort Key: d.a
+                     ->  Seq Scan on d
+         ->  Function Scan on vseries
+(10 rows)
+
+-- ensure that the join to "d" is removed when the function is not volatile (DISTINCT)
+explain (costs off)
+select distinct a.* from a left join d on a.id = d.a, lateral generate_series(a.id, a.id);
+                  QUERY PLAN                  
+----------------------------------------------
+ HashAggregate
+   Group Key: a.id, a.b_id
+   ->  Nested Loop
+         ->  Seq Scan on a
+         ->  Function Scan on generate_series
+(5 rows)
+
+-- ensure that the join to "d" is removed when the function is not volatile (GROUP BY)
+explain (costs off)
+select a.id from a left join d on a.id = d.a, lateral generate_series(a.id, a.id) group by a.id;
+                  QUERY PLAN                   
+-----------------------------------------------
+ Group
+   Group Key: a.id
+   ->  Nested Loop
+         ->  Index Only Scan using a_pkey on a
+         ->  Function Scan on generate_series
+(5 rows)
+
+-- ensure join to "d" can be removed despite the volatile function in the target list.
+explain (costs off)
+select distinct on (a.id) notice(a.id) from a left join d on a.id = d.a - d.a order by a.id;
+                  QUERY PLAN                   
+-----------------------------------------------
+ Result
+   ->  Unique
+         ->  Index Only Scan using a_pkey on a
+(3 rows)
+
+-- ensure the volatile function is just executed once per a.id
+select distinct on (a.id) notice(a.id) from a left join d on a.id = d.a - d.a order by a.id;
+NOTICE:  0
+NOTICE:  1
+ notice 
+--------
+      0
+      1
+(2 rows)
+
+-- and it should be evaulated multiple times when there's no distinct on.
+select notice(a.id) from a left join d on a.id = d.a - d.a order by a.id;
+NOTICE:  0
+NOTICE:  0
+NOTICE:  0
+NOTICE:  1
+ notice 
+--------
+      0
+      0
+      0
+      1
+(4 rows)
+
 rollback;
 create temp table parent (k int primary key, pd int);
 create temp table child (k int unique, cd int);
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql
index 4be64eec293..4a53f9112de 100644
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -1398,6 +1398,77 @@ explain (costs off)
 select 1 from (select a.id FROM a left join b on a.b_id = b.id) q,
 			  lateral generate_series(1, q.id) gs(i) where q.id = gs.i;
 
+-- check join removal works without a unique index on the left joined table
+-- when a DISTINCT or GROUP BY is present which would remove row duplication
+explain (costs off)
+select distinct a.*,b.* from a left join b on a.b_id = b.id left join d
+	on a.b_id = d.a;
+
+-- as above but with DISTINCT ON
+explain (costs off)
+select distinct on (a.id,b.id) a.*,b.* from a left join b on a.b_id = b.id left join d
+	on a.b_id = d.a order by a.id,b.id,a.b_id;
+
+-- as above but with GROUP BY
+explain (costs off)
+select a.id,b.id from a left join b on a.b_id = b.id left join d
+	on a.b_id = d.a group by a.id,b.id;
+
+-- also ensure the removal works with other relation types.
+explain (costs off)
+select distinct on (a.id,b.id) a.*,b.* from a
+left join b on a.b_id = b.id
+left join (values(1),(1)) d(a)
+	on a.b_id = d.a order by a.id,b.id,a.b_id;
+
+-- ensure no join removal when we must aggregate any duplicated rows
+explain (costs off)
+select a.id,b.id,count(*) from a left join b on a.b_id = b.id left join d
+	on a.b_id = d.a group by a.id,b.id;
+
+-- removal of left joins using distinct or group by instead of a unique index
+-- matching the join condition of the left joined table must be disallowed if
+-- the query contains a lateral joined volatile function.  A reduction in the
+-- number of rows produced by the join could affect the number of evaluations
+-- of the volatile function.
+
+create or replace function vseries(pstart int, pend int) returns setof int as $$
+begin
+	while pstart <= pend
+	loop
+		raise notice '%', pstart;
+		return next pstart;
+		pstart := pstart + 1;
+	end loop;
+end;
+$$ volatile language plpgsql;
+
+-- ensure join to "d" is not removed (DISTINCT)
+explain (costs off)
+select distinct a.* from a left join d on a.id = d.a, lateral vseries(a.id, a.id);
+
+-- ensure join to "d" is not removed (GROUP BY)
+explain (costs off)
+select a.id from a left join d on a.id = d.a, lateral vseries(a.id, a.id) group by a.id;
+
+-- ensure that the join to "d" is removed when the function is not volatile (DISTINCT)
+explain (costs off)
+select distinct a.* from a left join d on a.id = d.a, lateral generate_series(a.id, a.id);
+
+-- ensure that the join to "d" is removed when the function is not volatile (GROUP BY)
+explain (costs off)
+select a.id from a left join d on a.id = d.a, lateral generate_series(a.id, a.id) group by a.id;
+
+-- ensure join to "d" can be removed despite the volatile function in the target list.
+explain (costs off)
+select distinct on (a.id) notice(a.id) from a left join d on a.id = d.a - d.a order by a.id;
+
+-- ensure the volatile function is just executed once per a.id
+select distinct on (a.id) notice(a.id) from a left join d on a.id = d.a - d.a order by a.id;
+
+-- and it should be evaulated multiple times when there's no distinct on.
+select notice(a.id) from a left join d on a.id = d.a - d.a order by a.id;
+
 rollback;
 
 create temp table parent (k int primary key, pd int);
-- 
2.16.2.windows.1