On Thu, 13 Oct 2022 at 16:47, Richard Guo <[email protected]> wrote:
> Currently in the patch the optimization is done before we check for
> presorted paths or do the explicit sort of the cheapest path. How about
> we move this optimization into the branch where we've found any
> presorted paths? Maybe something like:
I've attached a patch to that effect, but it turns out a bit more
complex than what you imagined. We still need to handle the case for
when there's no path that has the required pathkeys and we must add a
SortPath to the cheapest path. That requires adding some similar code
to add the LimitPath after the foreach loop over the pathlist is over.
I was also getting some weird plans like:
regression=# explain select distinct on (four) four,hundred from tenk1
where four=0 order by 1,2;
QUERY PLAN
----------------------------------------------------------------------
Sort (cost=0.20..0.20 rows=1 width=8)
Sort Key: hundred
-> Limit (cost=0.00..0.19 rows=1 width=8)
-> Seq Scan on tenk1 (cost=0.00..470.00 rows=2500 width=8)
Filter: (four = 0)
To stop the planner from adding that final sort, I opted to hack the
LimitPath's pathkeys to say that it's already sorted by the
PlannerInfo's sort_pathkeys. That feels slightly icky, but it does
seem a little wasteful to initialise a sort node on every execution of
the plan to sort a single tuple.
David
diff --git a/src/backend/optimizer/plan/planner.c
b/src/backend/optimizer/plan/planner.c
index 5d0fd6e072..5c4e4dc5cd 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -4780,11 +4780,53 @@ create_final_distinct_paths(PlannerInfo *root,
RelOptInfo *input_rel,
if (pathkeys_contained_in(needed_pathkeys,
path->pathkeys))
{
- add_path(distinct_rel, (Path *)
- create_upper_unique_path(root,
distinct_rel,
-
path,
-
list_length(root->distinct_pathkeys),
-
numDistinctRows));
+ /*
+ * distinct_pathkeys may have become empty if
all of the
+ * pathkeys were determined to be redundant.
If all of the
+ * pathkeys are redundant then each DISTINCT
target must only
+ * allow a single value, therefore all
resulting tuples must
+ * be identical. We can uniquify these tuples
simply by just
+ * taking the first tuple. All we do here is
add a path to do
+ * LIMIT 1 on path. When doing DISTINCT ON we
may still have
+ * a non-NIL sort_pathkeys list, so we must
still only do this
+ * with paths which are contained in
needed_pathkeys.
+ */
+ if (root->distinct_pathkeys == NIL)
+ {
+ Node *limitCount;
+ LimitPath *lpath;
+
+ limitCount = (Node *)
makeConst(INT8OID, -1, InvalidOid,
+
sizeof(int64),
+
Int64GetDatum(1), false,
+
FLOAT8PASSBYVAL);
+
+ /*
+ * If the query already has a LIMIT
clause, then we could
+ * end up with a duplicate LimitPath in
the final plan.
+ * That does not seem worth troubling
over too much.
+ */
+ lpath = create_limit_path(root,
distinct_rel, path, NULL,
+
limitCount, LIMIT_OPTION_COUNT,
+
0, 1);
+
+ /*
+ * We set the LimitPath's pathkeys to
the sort_pathkeys
+ * since there can only be a single row
after the LIMIT to
+ * save the planner adding a useless
SortPath for the
+ * ORDER BY.
+ */
+ lpath->path.pathkeys =
root->sort_pathkeys;
+ add_path(distinct_rel, (Path *) lpath);
+ }
+ else
+ {
+ add_path(distinct_rel, (Path *)
+
create_upper_unique_path(root, distinct_rel,
+
path,
+
list_length(root->distinct_pathkeys),
+
numDistinctRows));
+ }
}
}
@@ -4805,13 +4847,33 @@ create_final_distinct_paths(PlannerInfo *root,
RelOptInfo *input_rel,
path = (Path *) create_sort_path(root, distinct_rel,
path,
needed_pathkeys,
-
-1.0);
+
root->distinct_pathkeys == NIL ?
+
1.0 : -1.0);
- add_path(distinct_rel, (Path *)
- create_upper_unique_path(root, distinct_rel,
-
path,
-
list_length(root->distinct_pathkeys),
-
numDistinctRows));
+ /*
+ * As above, use a LimitPath instead of a UniquePath when all
of the
+ * distinct_pathkeys are redundant and we're only going to get a
+ * series of tuples all with the same values anyway.
+ */
+ if (root->distinct_pathkeys == NIL)
+ {
+ Node *limitCount = (Node *) makeConst(INT8OID,
-1, InvalidOid,
+
sizeof(int64),
+
Int64GetDatum(1), false,
+
FLOAT8PASSBYVAL);
+
+ add_path(distinct_rel, (Path *)
+ create_limit_path(root, distinct_rel,
path, NULL,
+
limitCount, LIMIT_OPTION_COUNT, 0, 1));
+ }
+ else
+ {
+ add_path(distinct_rel, (Path *)
+ create_upper_unique_path(root,
distinct_rel,
+
path,
+
list_length(root->distinct_pathkeys),
+
numDistinctRows));
+ }
}
/*
diff --git a/src/test/regress/expected/select_distinct.out
b/src/test/regress/expected/select_distinct.out
index 748419cee0..6ce889d87c 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -279,6 +279,60 @@ RESET max_parallel_workers_per_gather;
RESET min_parallel_table_scan_size;
RESET parallel_setup_cost;
RESET parallel_tuple_cost;
+--
+-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
+-- all of the distinct_pathkeys have been marked as redundant
+--
+-- Ensure we get a plan with a Limit 1
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE four = 0;
+ QUERY PLAN
+----------------------------
+ Limit
+ -> Seq Scan on tenk1
+ Filter: (four = 0)
+(3 rows)
+
+-- Ensure the above gives us the correct result
+SELECT DISTINCT four FROM tenk1 WHERE four = 0;
+ four
+------
+ 0
+(1 row)
+
+-- Ensure we get a plan with a Limit 1
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0;
+ QUERY PLAN
+---------------------------------------------
+ Limit
+ -> Seq Scan on tenk1
+ Filter: ((two <> 0) AND (four = 0))
+(3 rows)
+
+-- Ensure no rows are returned
+SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0;
+ four
+------
+(0 rows)
+
+-- Ensure we get a plan with a Limit 1 when the SELECT list contains constants
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
+ QUERY PLAN
+----------------------------
+ Limit
+ -> Seq Scan on tenk1
+ Filter: (four = 0)
+(3 rows)
+
+-- Ensure we only get 1 row
+SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
+ four | ?column? | ?column? | ?column?
+------+----------+----------+----------
+ 0 | 1 | 2 | 3
+(1 row)
+
--
-- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
-- very own regression file.
diff --git a/src/test/regress/expected/select_distinct_on.out
b/src/test/regress/expected/select_distinct_on.out
index 3d98990991..b2978c1114 100644
--- a/src/test/regress/expected/select_distinct_on.out
+++ b/src/test/regress/expected/select_distinct_on.out
@@ -73,3 +73,53 @@ select distinct on (1) floor(random()) as r, f1 from
int4_tbl order by 1,2;
0 | -2147483647
(1 row)
+--
+-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
+-- all of the distinct_pathkeys have been marked as redundant
+--
+-- Ensure we also get a LIMIT plan with DISTINCT ON
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (four) four,two
+ FROM tenk1 WHERE four = 0 ORDER BY 1;
+ QUERY PLAN
+----------------------------------
+ Result
+ -> Limit
+ -> Seq Scan on tenk1
+ Filter: (four = 0)
+(4 rows)
+
+-- and check the result of the above query is correct
+SELECT DISTINCT ON (four) four,two
+ FROM tenk1 WHERE four = 0 ORDER BY 1;
+ four | two
+------+-----
+ 0 | 0
+(1 row)
+
+-- Ensure a Sort -> Limit is used when the ORDER BY contains additional cols
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (four) four,two
+ FROM tenk1 WHERE four = 0 ORDER BY 1,2;
+ QUERY PLAN
+----------------------------------
+ Limit
+ -> Sort
+ Sort Key: two
+ -> Seq Scan on tenk1
+ Filter: (four = 0)
+(5 rows)
+
+-- Same again but use a column that is indexed so that we get an index scan
+-- then a limit
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (four) four,hundred
+ FROM tenk1 WHERE four = 0 ORDER BY 1,2;
+ QUERY PLAN
+-----------------------------------------------------
+ Result
+ -> Limit
+ -> Index Scan using tenk1_hundred on tenk1
+ Filter: (four = 0)
+(4 rows)
+
diff --git a/src/test/regress/sql/select_distinct.sql
b/src/test/regress/sql/select_distinct.sql
index f27ff714f8..34020adad1 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -146,6 +146,32 @@ RESET min_parallel_table_scan_size;
RESET parallel_setup_cost;
RESET parallel_tuple_cost;
+--
+-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
+-- all of the distinct_pathkeys have been marked as redundant
+--
+
+-- Ensure we get a plan with a Limit 1
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE four = 0;
+
+-- Ensure the above gives us the correct result
+SELECT DISTINCT four FROM tenk1 WHERE four = 0;
+
+-- Ensure we get a plan with a Limit 1
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0;
+
+-- Ensure no rows are returned
+SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0;
+
+-- Ensure we get a plan with a Limit 1 when the SELECT list contains constants
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
+
+-- Ensure we only get 1 row
+SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
+
--
-- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
-- very own regression file.
diff --git a/src/test/regress/sql/select_distinct_on.sql
b/src/test/regress/sql/select_distinct_on.sql
index 0920bd64b9..261e6140fe 100644
--- a/src/test/regress/sql/select_distinct_on.sql
+++ b/src/test/regress/sql/select_distinct_on.sql
@@ -17,3 +17,28 @@ SELECT DISTINCT ON (string4, ten) string4, ten, two
-- bug #5049: early 8.4.x chokes on volatile DISTINCT ON clauses
select distinct on (1) floor(random()) as r, f1 from int4_tbl order by 1,2;
+
+--
+-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
+-- all of the distinct_pathkeys have been marked as redundant
+--
+
+-- Ensure we also get a LIMIT plan with DISTINCT ON
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (four) four,two
+ FROM tenk1 WHERE four = 0 ORDER BY 1;
+
+-- and check the result of the above query is correct
+SELECT DISTINCT ON (four) four,two
+ FROM tenk1 WHERE four = 0 ORDER BY 1;
+
+-- Ensure a Sort -> Limit is used when the ORDER BY contains additional cols
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (four) four,two
+ FROM tenk1 WHERE four = 0 ORDER BY 1,2;
+
+-- Same again but use a column that is indexed so that we get an index scan
+-- then a limit
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (four) four,hundred
+ FROM tenk1 WHERE four = 0 ORDER BY 1,2;